Gemma

1. Usage

The first step is downloading the model from HF. The download kept failing, so I switched to an HF mirror site. Downloading Gemma requires an HF token; following the usual steps gets the download working. The code below is mostly based on material shared on the Kaggle forums.

bash
huggingface-cli download --token hf_XXX --resume-download google/gemma-7b --local-dir gemma-7b-mirror

I switch between the 2B and 7B models here, depending on the situation.

python
from transformers import AutoTokenizer, AutoModelForCausalLM

# load the locally downloaded Gemma checkpoint and its tokenizer
tokenizer = AutoTokenizer.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")
Gemma = AutoModelForCausalLM.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")

def answer_the_question(question):
    input_ids = tokenizer(question, return_tensors="pt")
    generated_text = Gemma.generate(**input_ids, max_length=256)
    answer = tokenizer.decode(generated_text[0], skip_special_tokens=True)
    return answer

question = "给我写一首优美的诗歌?"  # "Write me a beautiful poem"
answer = answer_the_question(question)
print(answer)

2. RAG

Reference

python
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
##2.1 Retrieve sentence chunks relevant to the question
import os
def get_all_pdfs(directory):
    pdf_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                pdf_files.append(os.path.join(root, file))
    return pdf_files


class RAG:
    def __init__(self, num_retrieved_docs=5, pdf_folder_path='D:/Gemma/PDF'):
        pdf_files = get_all_pdfs(pdf_folder_path)
        print("Documents used", pdf_files)
        loaders = [PyPDFLoader(pdf_file) for pdf_file in pdf_files]
        all_documents = []
        for loader in loaders:
            raw_documents = loader.load()
            text_splitter = CharacterTextSplitter(
                separator="\n\n",
                chunk_size=10,
                chunk_overlap=1,
                # length_function=len,
            )
            documents = text_splitter.split_documents(raw_documents)
            all_documents.extend(documents)
        embeddings = HuggingFaceEmbeddings(model_name="D:/Projects/model/m3e-base")    
        self.db = FAISS.from_documents(all_documents, embeddings)
        self.retriever = self.db.as_retriever(search_kwargs={"k": num_retrieved_docs})

    def search(self, query):
        docs = self.retriever.get_relevant_documents(query)
        return docs
retriever = RAG()
##2.2 Answer based on the retrieved sentence chunks and the question
class Assistant:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")
        self.Gemma = AutoModelForCausalLM.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")

    def create_prompt(self, query, retrieved_info):
        prompt = f"""你是人工智能助手,需要根据Relevant information里面的相关内容回答用户的Instruction,其中相关信息如下:
        Instruction: {query}
        Relevant information: {retrieved_info}
        Output:
        """
        print(prompt)
        return prompt
    
    def reply(self, query, retrieved_info):
        prompt = self.create_prompt(query, retrieved_info)
        # encode the full prompt (instruction + retrieved context), not just the bare query
        input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
        # Generate text with a focus on factual responses
        generated_text = self.Gemma.generate(
            input_ids,
            do_sample=True,
            max_length=500,
            temperature=0.7,  # adjust per task; for code generation it can be 0.9
        )
        # Decode and return the answer
        answer = self.tokenizer.decode(generated_text[0], skip_special_tokens=True)
        return answer
chatbot = Assistant()
## 2.3 Start using RAG
def generate_reply(query):
    related_docs = retriever.search(query)
    #print('related docs', related_docs)
    reply = chatbot.reply(query, related_docs)
    return reply
reply = generate_reply("存在的不足及后续的优化工作")  # "shortcomings and planned follow-up optimization work"
for s in reply.split('\n'):
    print(s)
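
Note that related_docs is a list of LangChain Document objects, so the prompt above ends up containing their metadata as well as the text. If only the chunk text is wanted, a small helper along these lines (the function name is my own, not from the original post) can flatten the results before they reach the prompt:

python
def format_docs(docs):
    # keep only the text of each retrieved chunk and drop the metadata
    return "\n\n".join(doc.page_content for doc in docs)

query = "存在的不足及后续的优化工作"
reply = chatbot.reply(query, format_docs(retriever.search(query)))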

3. LoRA

3.1 LoRA Classification Task

Reference

Train a binary classifier on the nlp-getting-started dataset. First, load the base model:

python
from datasets import load_dataset
from transformers import AutoTokenizer,AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments,pipeline
from peft import prepare_model_for_int8_training,LoraConfig, TaskType, get_peft_model
import numpy as np
NUM_CLASSES = 2  # number of output classes
BATCH_SIZE, EPOCHS, R, LORA_ALPHA, LORA_DROPOUT = 8, 5, 64, 32, 0.1  # LoRA training hyperparameters
MODEL_PATH = "D:/Gemma/gemma-2b-int-mirror2"  # local model path
# 1. Base model, configured with a two-class classification head
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH,num_labels=NUM_CLASSES)
print(model)

Process the CSV data and encode the text column with the tokenizer:

python
#2. Process the dataset; overly long inputs are truncated (after tokenization)
dataset = load_dataset('csv', data_files='D:/Gemma/nlp-getting-started/train.csv')
dataset['test'] = dataset['train']  # the train split is simply reused as the test split here
dataset = dataset.remove_columns(['id', 'keyword', 'location'])
dataset = dataset.rename_column("target", "label")  # only the text and label columns are kept
tokenized_dataset = {}  # holds the tokenized train and test splits
for split in dataset.keys():
    tokenized_dataset[split] = dataset[split].map(lambda x: tokenizer(x["text"], truncation=True), batched=True)
print(tokenized_dataset["train"])
print(tokenized_dataset["train"][1])

Configure the LoRA parameters on top of the base model to obtain lora_model:

python
#3. LoRA configuration
model = prepare_model_for_int8_training(model)
lora_config = LoraConfig(
    r=R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    task_type=TaskType.SEQ_CLS,  # SEQ_CLS: sequence classification; TOKEN_CLS: named-entity recognition; SEQ_2_SEQ_LM: translation; CAUSAL_LM: language modeling
    target_modules='all-linear'  # 'all-linear': all linear layers; other choices can target embedding or convolution layers
)
lora_model = get_peft_model(model, lora_config)
print(lora_model)
lora_model.print_trainable_parameters()  # report the trainable parameters of the LoRA model

Configure the training arguments for lora_model:

python
#4. LoRA training setup (metric computation, etc.)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./LoAR_data/",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        load_best_model_at_end=True,
        logging_steps=10,
        report_to="none"
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

Train, then save and use the model:

python
#5. Train and evaluate
print("Evaluating the Model Before Training!")
trainer.evaluate()
print("Training the Model")
trainer.train()
print("Evaluating the trained model")
trainer.evaluate()
#6. Save and use
lora_model.save_pretrained('fine-tuned-model')
clf = pipeline("text-classification", lora_model, tokenizer=MODEL_PATH)  # pipeline backed by the LoRA fine-tuned model
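
As a quick sanity check, the pipeline can be called directly on a piece of text; the sentence below is only an illustration, and the exact label and score depend on training:

python
sample = "Forest fire near La Ronge Sask. Canada"  # illustrative input text
print(clf(sample))  # e.g. [{'label': 'LABEL_1', 'score': 0.97}]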

3.2 LoRA Chinese Language Modeling Task

Reference

First, load the base model and its config:

python
from transformers import AutoConfig,AutoTokenizer,AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model,prepare_model_for_kbit_training,PeftModel
import torch
import datasets
from tqdm import tqdm
import json
BATCH_SIZE, EPOCHS, R, LORA_ALPHA, LORA_DROPOUT = 8, 5, 64, 32, 0.1  # LoRA training hyperparameters
MODEL_PATH = "D:/Gemma/gemma-2b-int-mirror2"  # local model path
device = torch.device('cuda:0')
# 1. Base model and its config
config = AutoConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
config.is_causal = True  # ensure the model only attends to the left context when generating text
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH,device_map="auto", config=config,trust_remote_code=True)

Process the JSON data according to the model and its config:

python
#2. Process the dataset according to the model config (after tokenization), then save and reload it
def preprocess(tokenizer, config, file_path, max_seq_length, prompt_key, target_key, skip_overlength=False):
    with open(file_path, "r", encoding="utf8") as f:
        for line in tqdm(f.readlines()):
            example = json.loads(line)
            prompt_ids = tokenizer.encode(example[prompt_key], max_length=max_seq_length, truncation=True)
            target_ids = tokenizer.encode(example[target_key], max_length=max_seq_length, truncation=True)
            input_ids = prompt_ids + target_ids + [config.eos_token_id]
            if skip_overlength and len(input_ids) > max_seq_length:
                continue
            input_ids = input_ids[:max_seq_length]
            yield {
                "input_ids": input_ids,
                "seq_len": len(prompt_ids)
            }
dataset = datasets.Dataset.from_generator(lambda: preprocess(tokenizer, 
                                            config, 
                                            "D:/Gemma/try/hc3_chatgpt_zh_specific_qa.json", 
                                            max_seq_length=2000, 
                                            prompt_key="q",
                                            target_key="a",))

dataset.save_to_disk("h3c-chinese")  # save the processed dataset
train_set = datasets.load_from_disk("h3c-chinese")  # load the processed dataset
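
Before training, it is worth inspecting one processed example to confirm the generator produced sensible fields (this check is mine, not part of the original post):

python
sample = train_set[0]
print(len(sample["input_ids"]), sample["seq_len"])  # total token count vs. prompt length
print(tokenizer.decode(sample["input_ids"][:sample["seq_len"]]))  # decoded prompt portion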

Configure the LoRA parameters:

python
#3. LoRA configuration
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    task_type="CAUSAL_LM",
    target_modules='all-linear'
)
lora_model = get_peft_model(model, lora_config)
print(lora_model)
lora_model.print_trainable_parameters()  # report the trainable parameters of the LoRA model

Configure the LoRA training arguments, including the compute_metrics loss computation, and define a data_collator that pads each batch of input_ids to a uniform length before the forward pass.

python
tokenizer.pad_token_id = config.pad_token_id
def data_collator(features):  # pads and prepares each batch before the forward pass
    len_ids = [len(feature["input_ids"]) for feature in features]
    longest = max(len_ids)
    input_ids = []
    labels_list = []
    for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):
        ids = feature["input_ids"]
        seq_len = feature["seq_len"]
        labels = (
            [-100] * (seq_len) + ids[seq_len:] + [-100] * (longest - ids_l)
        )
        ids = ids + [tokenizer.pad_token_id] * (longest - ids_l)
        input_ids.append(torch.LongTensor(ids))
        labels_list.append(torch.LongTensor(labels))
    return {
        "input_ids": torch.stack(input_ids),
        "labels": torch.stack(labels_list),
    }
def compute_metrics(inputs):
    # compute the loss by running the model on the batch
    loss = model(input_ids=inputs["input_ids"], labels=inputs["labels"]).loss
    return {"loss": loss.item()}  # convert the tensor to a plain Python number
trainer = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./LoAR_data2/",
        learning_rate=2e-5,
        per_device_train_batch_size=BATCH_SIZE,
        save_strategy="epoch",
        num_train_epochs=EPOCHS,
        weight_decay=0.01,
        logging_steps=10,
        report_to="none"
    ),
    train_dataset=train_set,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
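
The walkthrough stops at the Trainer definition; mirroring the steps in section 3.1, training and saving the adapter would presumably look roughly like this (a sketch; the output directory name is an assumption):

python
trainer.train()
lora_model.save_pretrained('fine-tuned-chinese-model')  # assumed directory name, mirroring section 3.1
# to reload later: PeftModel.from_pretrained(model, 'fine-tuned-chinese-model')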