# Interactive Vicuna chat demo using PaddleNLP's LLaMA classes.
import os
import glob
import paddle
from tqdm import tqdm
from paddlenlp.transformers import LlamaForCausalLM, LlamaConfig, LlamaTokenizer
# Glob pattern for the sharded checkpoint files expected inside ``ckpt_dir``
# (each ``?`` matches exactly one character).
pattern = 'paddle-model-?????-of-?????.pdparams'

# Settings that are identical for every supported Vicuna size.
COMMON_CONFIG = {
    "initializer_range": 0.02,
    "max_position_embeddings": 2048,
    "model_type": "llama",
    "rms_norm_eps": 1e-06,
    "vocab_size": 32000,
    "bos_token_id": 1,
    "eos_token_id": 2,
    "pad_token_id": 0,
    "use_cache": True,
    "use_recompute": False,
    "use_flash_attention": False,
}

# Settings that differ between model sizes, keyed by size name.
SIZE_OVERRIDES = {
    "7b": {
        "hidden_size": 4096,
        "intermediate_size": 11008,
        "num_attention_heads": 32,
        "num_hidden_layers": 32,
    },
    "13b": {
        "hidden_size": 5120,
        "intermediate_size": 13824,
        "num_attention_heads": 40,
        "num_hidden_layers": 40,
    },
}

# Change this single value to switch between the supported sizes; the
# checkpoint directory and the model configuration below follow it.
model_size = "13b"

# Directory holding the tokenizer files and the checkpoint shards.
ckpt_dir = f'vicuna-{model_size}-v1.1'

# Full keyword set later passed to the model configuration class.
config_dict = {**COMMON_CONFIG, **SIZE_OVERRIDES[model_size]}
# Make float16 the default dtype before the model is instantiated.
paddle.set_default_dtype('float16')

# Tokenizer files are read from the checkpoint directory.
tokenizer = LlamaTokenizer.from_pretrained(ckpt_dir)

# Instantiate the network from the explicit configuration; the checkpoint
# weights are not loaded here.
config = LlamaConfig(**config_dict)
model = LlamaForCausalLM(config)
model.eval()

# Cast the ``inv_freq`` tensor of every rotary-embedding sublayer to float32
# (presumably to keep the position frequencies out of half precision).
for sublayer_name, sublayer in model.named_sublayers():
    if 'rotary_emb' not in sublayer_name:
        continue
    sublayer.inv_freq = sublayer.inv_freq.cast(paddle.float32)

# Release cached GPU memory that is no longer referenced.
paddle.device.cuda.empty_cache()
# Load the checkpoint shard by shard so that only one shard's parameters are
# held alongside the model at any time.
# Sorting makes the load order (and the progress bar) deterministic.
shard_paths = sorted(glob.glob(os.path.join(ckpt_dir, pattern)))
if not shard_paths:
    # Without this check the script would silently chat with an untrained,
    # randomly initialized model.
    raise FileNotFoundError(
        f'No checkpoint shards matching {pattern!r} found in {ckpt_dir!r}.'
    )

for file_path in tqdm(shard_paths):
    params = paddle.load(file_path)
    # The call is deliberately kept outside an ``assert``: assert statements
    # are stripped under ``python -O``, which would skip loading altogether.
    # The second element of the result is expected to list the keys in
    # ``params`` that the model did not recognize.
    unexpected_keys = model.set_dict(params)[1]
    if unexpected_keys:
        raise RuntimeError(
            f'Load error. Unexpected keys in {file_path}: {unexpected_keys}'
        )
    # Drop the shard before loading the next one to keep peak memory low.
    del params
    paddle.device.cuda.empty_cache()
# Interactive multi-turn chat.
# The whole conversation so far is accumulated in ``prompt`` and re-encoded
# on every turn. Typing ``exit`` or closing stdin ends the session.

# Size of the model's context window; prompt tokens plus generated tokens
# must fit inside it.
max_context = config_dict["max_position_embeddings"]

prompt = ''
with paddle.no_grad():
    with paddle.amp.auto_cast(False, level='O2', dtype='float16'):
        while True:
            try:
                input_text = input('USER: ')
            except EOFError:
                # stdin was closed (Ctrl-D or end of piped input).
                break
            if input_text == 'exit':
                break

            # Append the new user turn and open the assistant turn.
            prompt += f'''USER: {input_text}\n\nASSISTANT: '''

            inputs = tokenizer(
                prompt,
                return_tensors="pd",
                return_attention_mask=True,
                return_position_ids=True,
            )

            # Tokens still available for the reply. Once the conversation
            # has filled the window this would be zero or negative, which is
            # not a meaningful generation length.
            remaining = max_context - inputs.input_ids.shape[1]
            if remaining <= 0:
                print('Conversation exceeds the model context window; '
                      'please restart the chat.')
                break

            outputs = model.generate(
                input_ids=inputs.input_ids,
                attention_mask=inputs.attention_mask,
                position_ids=inputs.position_ids,
                max_length=remaining,
                min_length=0,
                decode_strategy="sampling",
                temperature=0.8,
                top_k=40,
                top_p=0.95,
                repetition_penalty=1.1,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                use_cache=True,
                use_fast=True,
                use_fp16_decoding=True,
            )

            # ``outputs[0]`` holds the generated ids; take the first (only)
            # sequence of the batch.
            response = tokenizer.decode(outputs[0][0], skip_special_tokens=True)
            print('ASSISTANT: ' + response)

            # Close the assistant turn so the next user turn can follow it.
            prompt += f'''{response}\n\n'''

            # Free per-turn tensors before the next, longer, prompt is encoded.
            del inputs
            del outputs
            del response
            paddle.device.cuda.empty_cache()