1、使用transformers和peft加载LoRA模型
代码:
python
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoTokenizer,AutoModelForCausalLM
import torch
# Load the tokenizer and the base model, then wrap the base model with the
# LoRA adapter stored in the fine-tuning checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("/ai/loong/Qwen1.5-7B-Chat")
model = AutoModelForCausalLM.from_pretrained("/ai/loong/Qwen1.5-7B-Chat", trust_remote_code=True, device_map="auto")
model = PeftModel.from_pretrained(model, "/ai/loong/output/checkpoint-300", offload_folder='./')
model.eval()  # inference only

# device_map="auto" may place the model on an accelerator, so the tokenized
# inputs must be moved to the same device before calling generate().
inputs = tokenizer("你是谁", return_tensors="pt").to(model.device)
# Unpack the whole encoding so attention_mask is forwarded along with input_ids.
outputs = model.generate(**inputs, max_new_tokens=500)
# The returned sequences contain the prompt followed by the generated tokens.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
TextStreamer流式输出
参考:https://zhuanlan.zhihu.com/p/694576810
python
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoTokenizer,AutoModelForCausalLM,TextStreamer
import torch
# Load the tokenizer and the base model, then wrap the base model with the
# LoRA adapter stored in the fine-tuning checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained("/ai/loong/Qwen1.5-7B-Chat")
model = AutoModelForCausalLM.from_pretrained("/ai/loong/Qwen1.5-7B-Chat", trust_remote_code=True, device_map="auto")
model = PeftModel.from_pretrained(model, "/ai/loong/output/checkpoint-300", offload_folder='./')
model.eval()  # inference only; kept consistent with the other snippets

# device_map="auto" may place the model on an accelerator, so the tokenized
# inputs must be moved to the same device before calling generate().
inputs = tokenizer("听说你以前叫通义千问", return_tensors="pt").to(model.device)
streamer = TextStreamer(tokenizer)
# Despite returning the usual output, the streamer will also print the generated text to stdout.
model.generate(**inputs, streamer=streamer, max_new_tokens=20)
KV cache使用
在generate中传入use_cache=True(该参数默认即为True)
python
# Relies on `model` and `tokenizer` having been created by the loading code
# earlier in this file.
model.eval()  # inference only

# Move the tokenized inputs to the model's device; with device_map="auto" the
# model may not live on the CPU where the tokenizer creates its tensors.
inputs = tokenizer("你是谁", return_tensors="pt").to(model.device)
# use_cache=True asks generate() to reuse the cached attention key/value
# tensors from previous decoding steps. Unpacking the encoding also forwards
# attention_mask along with input_ids.
outputs = model.generate(**inputs, max_new_tokens=500, use_cache=True)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])