Fine-tuning the ChatGLM2-6B large model with PEFT QLoRA/LoRA on a single V100 32GB (or a lower-memory card), using torch mixed precision to speed up and stabilize training, and fixing the problem of the QLoRA loss turning into NaN!

I recently started a new job whose work involves large models, so I took some time to run the ChatGLM2-6B demo and fine-tune the model with QLoRA and LoRA.

This is a short write-up of the process that doubles as a simple tutorial; along the way I hit the pitfall of the QLoRA loss turning into NaN and training becoming unstable.

This tutorial does not cover the theory behind LoRA; please look it up yourself if you need it.

1. I have already downloaded the ChatGLM2-6B model from Hugging Face to the server, because my server cannot reach Hugging Face directly.
I put it in the folder /data/tmp/chatGLM2_6b_pretrain, which holds the model weights and the config files; just download them straight from Hugging Face.
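
If your server is offline, one convenient route (a minimal sketch, assuming the huggingface_hub package on a machine that can reach Hugging Face; copy the folder over to the server afterwards) is:

from huggingface_hub import snapshot_download

# pull all weight and config files of the chatglm2-6b repo into one local folder
snapshot_download(repo_id="THUDM/chatglm2-6b",
                  local_dir="/data/tmp/chatGLM2_6b_pretrain")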

2. Print the model structure

from transformers import AutoModel

model_name = "/data/tmp/chatGLM2_6b_pretrain"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
print(model)
ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_layer): Linear(in_features=4096, out_features=65024, bias=False)
  )
)
Note that query_key_value is not three square matrices concatenated together: it should be Wq 4096×4096, Wk 4096×256, and Wv 4096×256, because ChatGLM2 uses grouped-query (multi-query) attention.
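
A quick sanity check of that 4608 width, using ChatGLM2-6B's published attention config (32 query heads, head_dim 128, 2 key/value groups):

num_heads, head_dim, num_kv_groups = 32, 128, 2
q_width = num_heads * head_dim             # Wq: 4096 columns
kv_width = 2 * num_kv_groups * head_dim    # Wk and Wv: 256 columns each
print(q_width + kv_width)                  # 4608 = out_features of query_key_value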

3. Print the model structure after adding LoRA

from transformers import AutoTokenizer, AutoModel, AutoConfig
from peft import LoraConfig, get_peft_model, TaskType

model_name = "/data/tmp/chatGLM2_6b_pretrain"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

config = LoraConfig(
    peft_type="LORA",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    fan_in_fan_out=False,
    bias='lora_only',
    target_modules=["query_key_value"]
)

model = get_peft_model(model, config)
print(model)
PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): ChatGLMForConditionalGeneration(
      (transformer): ChatGLMModel(
        (embedding): Embedding(
          (word_embeddings): Embedding(65024, 4096)
        )
        (rotary_pos_emb): RotaryEmbedding()
        (encoder): GLMTransformer(
          (layers): ModuleList(
            (0-27): 28 x GLMBlock(
              (input_layernorm): RMSNorm()
              (self_attention): SelfAttention(
                (query_key_value): Linear(
                  in_features=4096, out_features=4608, bias=True
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=4096, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=4608, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                )
                (core_attention): CoreAttention(
                  (attention_dropout): Dropout(p=0.0, inplace=False)
                )
                (dense): Linear(in_features=4096, out_features=4096, bias=False)
              )
              (post_attention_layernorm): RMSNorm()
              (mlp): MLP(
                (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
                (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
              )
            )
          )
          (final_layernorm): RMSNorm()
        )
        (output_layer): Linear(in_features=4096, out_features=65024, bias=False)
      )
    )
  )
)

Notice that two extra linear layers, lora_A and lora_B, now appear under the query_key_value matrix; these two layers are the ones that get trained.
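
Conceptually, the patched layer computes y = Wx + (lora_alpha / r) * B(A(dropout(x))), with the pretrained W frozen and only A and B trainable. A minimal self-contained sketch of the idea (a hypothetical module for illustration; PEFT's real implementation differs in details such as weight merging):

import torch
from torch import nn

class LoraLinearSketch(nn.Module):
    # hypothetical illustration of a LoRA-wrapped Linear layer
    def __init__(self, base: nn.Linear, r=8, lora_alpha=16, p_drop=0.1):
        super().__init__()
        self.base = base
        self.base.weight.requires_grad_(False)   # freeze the pretrained weight
        self.lora_A = nn.Linear(base.in_features, r, bias=False)
        self.lora_B = nn.Linear(r, base.out_features, bias=False)
        nn.init.zeros_(self.lora_B.weight)       # so the update starts at zero
        self.dropout = nn.Dropout(p_drop)
        self.scaling = lora_alpha / r

    def forward(self, x):
        return self.base(x) + self.scaling * self.lora_B(self.lora_A(self.dropout(x)))

layer = LoraLinearSketch(nn.Linear(4096, 4608, bias=True))
print(layer(torch.randn(2, 4096)).shape)          # torch.Size([2, 4608])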

4. Prepare the dataset. We use the firefly dataset, which you can download in jsonl format from Hugging Face; split it into training and test sets ahead of time. qa_dataset.py:

# -*- coding: utf-8 -*-
from torch.utils.data import Dataset
import torch
import json
import numpy as np


class QADataset(Dataset):
    def __init__(self, data_path, tokenizer, max_source_length, max_target_length) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.max_seq_length = self.max_source_length + self.max_target_length

        self.data = []
        with open(data_path, "r", encoding='utf-8') as f:
            for line in f:
                if not line or line == "":
                    continue
                json_line = json.loads(line)
                # e.g. {'kind': 'NLI', 'input': '自然语言推理:\n前提:家里人心甘情愿地养他,还有几家想让他做女婿的\n假设:他是被家里人收养的孤儿', 'target': '中立'}
                kind = json_line["kind"]
                input = json_line["input"]
                target = json_line["target"]
                self.data.append({
                    "question": input,
                    "answer": "--**"+kind+"**--\n"+target
                })
        print("data loaded, size:", len(self.data))

    def preprocess(self, question, answer):
        prompt = self.tokenizer.build_prompt(question, None)

        a_ids = self.tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True,
                                      max_length=self.max_source_length)

        b_ids = self.tokenizer.encode(text=answer, add_special_tokens=False, truncation=True,
                                      max_length=self.max_target_length - 1)  # leave room for the eos_token appended below

        context_length = len(a_ids)
        input_ids = a_ids + b_ids + [self.tokenizer.eos_token_id]
        labels = [self.tokenizer.pad_token_id] * context_length + b_ids + [self.tokenizer.eos_token_id]

        pad_len = self.max_seq_length - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
        labels = labels + [self.tokenizer.pad_token_id] * pad_len
        labels = [(l if l != self.tokenizer.pad_token_id else -100) for l in labels]  # mask the prompt and padding out of the loss
        return input_ids, labels

    def __getitem__(self, index):
        item_data = self.data[index]

        input_ids, labels = self.preprocess(**item_data)

        return {
            "input_ids": torch.LongTensor(np.array(input_ids)),
            "labels": torch.LongTensor(np.array(labels))
        }

    def __len__(self):
        return len(self.data)


if __name__ == "__main__":
    # split the raw firefly file: first 1000 lines for test, the rest for training
    with open("/data/tmp/firefly_data/firefly-train-1.1M.jsonl", "r", encoding='utf-8') as f_read, \
         open("/data/tmp/firefly_data/firefly_train80000.jsonl", "w", encoding='utf-8') as f_trainx, \
         open("/data/tmp/firefly_data/firefly_train.jsonl", "w", encoding='utf-8') as f_train, \
         open("/data/tmp/firefly_data/firefly_test.jsonl", "w", encoding='utf-8') as f_test:
        lines = f_read.readlines()

        f_test.writelines(lines[:1000])
        f_train.writelines(lines[1000:])
        f_trainx.writelines(lines[1000:81000])
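
Before training, it is worth sanity-checking the masking: both the prompt part and the padding of labels should be -100 so they are excluded from the loss. A quick check (a sketch, reusing the paths above):

from transformers import AutoTokenizer
from qa_dataset import QADataset

tokenizer = AutoTokenizer.from_pretrained("/data/tmp/chatGLM2_6b_pretrain", trust_remote_code=True)
ds = QADataset("/data/tmp/firefly_data/firefly_test.jsonl", tokenizer, 60, 360)
sample = ds[0]
print(sample["input_ids"].shape)     # always max_source_length + max_target_length = 420
print((sample["labels"] == -100).sum().item(), "positions masked out of the loss")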

5. Train LoRA in half precision. GPU memory usage is huge: batch_size can only be 1 and memory still reaches about 30 GB, and training takes a long time. To work around the memory problem I later tried QLoRA.

train_lora.py

# -*- coding: utf-8 -*-
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from qa_dataset import QADataset
from peft import LoraConfig, get_peft_model, TaskType
from tqdm import tqdm
import torch
import os, time, sys
import numpy as np


def train(epoch, model, device, loader, optimizer, gradient_accumulation_steps, model_output_dir):
    model.train()
    time1 = time.time()
    losses = []
    train_bar = tqdm(loader, total=len(loader))
    for index, data in enumerate(train_bar):
        input_ids = data['input_ids'].to(device, dtype=torch.long)
        labels = data['labels'].to(device, dtype=torch.long)

        outputs = model(
            input_ids=input_ids,
            labels=labels,
        )
        loss = outputs.loss
        # backward pass, compute the current gradients
        loss.backward()
        losses.append(loss.item())
        # gradient accumulation
        if (index % gradient_accumulation_steps == 0 and index != 0) or index == len(loader) - 1:
            # update the network parameters
            optimizer.step()
            # clear the accumulated gradients
            optimizer.zero_grad()

        if index % 300 == 0:
            model_save_path = os.path.join(model_output_dir, "index_{}".format(index))
            if not os.path.exists(model_save_path):
                os.makedirs(model_save_path)
            model.save_pretrained(model_save_path)
        train_bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch, index, np.mean(losses)))


def validate(tokenizer, model, device, loader, max_length):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(tqdm(loader, file=sys.stdout, desc="Validation Data")):
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            labels = data['labels'].to(device, dtype=torch.long)
            generated_ids = model.generate(
                input_ids=input_ids,
                max_length=max_length,
                do_sample=False
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in
                     generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in labels]
            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals


def main():
    model_name = "/data/tmp/chatGLM2_6b_pretrain"
    train_json_path = "/data/tmp/firefly_data/firefly_train20000.jsonl"
    val_json_path = "/data/tmp/firefly_data/firefly_test.jsonl"
    max_source_length = 60
    max_target_length = 360
    epochs = 1
    batch_size = 1
    lr = 1e-4
    lora_rank = 8
    lora_alpha = 32
    gradient_accumulation_steps = 16
    model_output_dir = "output"
    # device
    device = torch.device("cuda:0")

    # load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

    # setup peft
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=lora_rank,
        lora_alpha=lora_alpha,
        lora_dropout=0.1
    )
    model = get_peft_model(model, peft_config)
    model.is_parallelizable = True
    model.model_parallel = True
    model.print_trainable_parameters()
    # cast to half precision (note: the .float() call below casts everything
    # back to fp32; drop it if you actually want to train in fp16)
    model = model.half()
    model.float()

    print("Start Load Train Data...")
    train_params = {
        "batch_size": batch_size,
        "shuffle": True,
        "num_workers": 0,
    }
    training_set = QADataset(train_json_path, tokenizer, max_source_length, max_target_length)
    training_loader = DataLoader(training_set, **train_params)
    print("Start Load Validation Data...")
    val_params = {
        "batch_size": batch_size,
        "shuffle": False,
        "num_workers": 0,
    }
    val_set = QADataset(val_json_path, tokenizer, max_source_length, max_target_length)
    val_loader = DataLoader(val_set, **val_params)

    optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
    model = model.to(device)
    print("Start Training...")
    for epoch in range(epochs):
        train(epoch, model, device, training_loader, optimizer, gradient_accumulation_steps, model_output_dir)
        # print("Save Model To ", model_output_dir)
        # model.save_pretrained(model_output_dir)
    # validation
    print("Start Validation...")
    with torch.no_grad():
        predictions, actuals = validate(tokenizer, model, device, val_loader, max_target_length)
        # save the validation results
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        val_data_path = os.path.join(model_output_dir, "predictions.csv")
        final_df.to_csv(val_data_path)
        print("Validation Data To ", val_data_path)


if __name__ == '__main__':
    main()

6. Many readers don't have a 32 GB V100 and can't train even with batch_size=1, so I also looked into QLoRA training. Compared with LoRA, QLoRA adds a quantization step: it quantizes the base model to 4-bit NormalFloat, which shrinks the base model's memory footprint dramatically and frees up more GPU memory for the LoRA parameters.
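
Rough weight-memory arithmetic shows why (a back-of-the-envelope sketch, assuming roughly 6.2B parameters):

params = 6.2e9
print(f"fp16 weights: {params * 2 / 1024**3:.1f} GB")    # ~11.5 GB
print(f"nf4  weights: {params * 0.5 / 1024**3:.1f} GB")  # ~2.9 GB, plus small quantization constants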

But 4-bit NormalFloat combined with a manual .half() cast for mixed precision makes the loss very unstable; it can turn into NaN as soon as training starts, and even float32 weights combined with float16 half precision are unstable. This is the pitfall I stepped in: with a manual model.half() the loss always became NaN. The fix was torch's official automatic mixed precision, which handles the precision casts and gradient scaling automatically so values never overflow half precision's range and turn into NaN.
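
The root cause is fp16's narrow dynamic range: anything above its maximum overflows to inf and then propagates as NaN, while very small gradients underflow to zero (which is what GradScaler guards against). The limits are easy to check:

import torch

print(torch.finfo(torch.float16).max)   # 65504.0 -- fp16 overflows above this
print(torch.finfo(torch.float32).max)   # ~3.4e38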

Why fp16 mixed precision rather than float32? In practice, mixed-precision training is 5-6x faster, and since large models can train for a very long time, the time cost is significant.

With 20k training samples:

- QLoRA in half precision finishes training in about 2 hours but is unstable; in float32 it needs 11+ hours.
- LoRA in half precision needs just under 5 hours and is fairly stable.

If QLoRA couldn't train in half precision (because of the unstable loss), float32 would just trade time for space, and QLoRA would lose every advantage beyond its lower memory footprint. Fortunately torch's official mixed precision rescues half-precision training.

train_qlora.py

# -*- coding: utf-8 -*-
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig
from qa_dataset import QADataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from tqdm import tqdm
import torch
import os, time, sys
from transformers import (
    set_seed,
    HfArgumentParser,
    TrainingArguments,
    AutoModelForCausalLM
)
import bitsandbytes as bnb
from collections import defaultdict
import numpy as np


def verify_model_dtype(model):
    """
    Inspect how parameters of each dtype are distributed in the model.
    """
    dtype2param_num = defaultdict(int)  # parameter count per dtype
    dtype2param_name = defaultdict(list)  # parameter names per dtype
    dtype2trainable_param_num = defaultdict(int)  # trainable parameter count per dtype
    dtype2trainable_param_name = defaultdict(list)  # trainable parameter names per dtype
    for name, p in model.named_parameters():
        dtype = p.dtype
        dtype2param_num[dtype] += p.numel()
        dtype2param_name[dtype].append(name)
        if p.requires_grad:
            dtype2trainable_param_num[dtype] += p.numel()
            dtype2trainable_param_name[dtype].append(name)
    # dtype distribution over all parameters
    total = 0
    print('verify all params of the model')
    for k, v in dtype2param_num.items():
        total += v
    for k, v in dtype2param_num.items():
        print(k, v, v / total)
    for k, v in dtype2trainable_param_name.items():
        print(k, v)

    print()
    # dtype distribution over trainable parameters
    print('verify trainable params the model')
    total_trainable = 0
    for k, v in dtype2trainable_param_num.items():
        total_trainable += v
    for k, v in dtype2trainable_param_num.items():
        print(k, v, v / total_trainable)
    for k, v in dtype2trainable_param_name.items():
        print(k, v)


def find_all_linear_names(model):
    """
    Find all (4-bit) linear layers so an adapter can be attached to each of them.
    """
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


def train(epoch, model, device, loader, optimizer, scaler, gradient_accumulation_steps, model_output_dir):
    model.train()
    time1 = time.time()
    losses = []
    train_bar = tqdm(loader, total=len(loader))
    for index, data in enumerate(train_bar):
        optimizer.zero_grad()
        # run the forward pass under autocast; the backward pass and optimizer
        # step happen outside the autocast context, as the torch docs recommend
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            labels = data['labels'].to(device, dtype=torch.long)

            outputs = model(
                input_ids=input_ids,
                labels=labels,
            )
            loss = outputs.loss
            losses.append(loss.item())

        # scales the loss, then calls backward() to create scaled gradients
        scaler.scale(loss).backward()

        # scaler.step() first unscales the optimizer's gradients; it skips
        # optimizer.step() if the gradients contain infs or NaNs
        scaler.step(optimizer)

        # updates the scale for the next iteration
        scaler.update()

        # gradient accumulation is disabled in this version; see the sketch
        # after the official AMP example below for combining it with GradScaler

        if index % 300 == 0:
            model_save_path = os.path.join(model_output_dir, "index_{}".format(index))
            if not os.path.exists(model_save_path):
                os.makedirs(model_save_path)
            model.save_pretrained(model_save_path)
        train_bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch, index, np.mean(losses)))


def validate(tokenizer, model, device, loader, max_length):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(tqdm(loader, file=sys.stdout, desc="Validation Data")):
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            labels = data['labels'].to(device, dtype=torch.long)
            generated_ids = model.generate(
                input_ids=input_ids,
                max_length=max_length,
                do_sample=False
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in
                     generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in labels]
            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals


def main():
    model_name = "/data/tmp/chatGLM2_6b_pretrain"
    train_json_path = "/data/tmp/firefly_data/firefly_train80000.jsonl"
    val_json_path = "/data/tmp/firefly_data/firefly_test.jsonl"
    max_source_length = 128
    max_target_length = 512
    epochs = 1
    batch_size = 16
    lr = 1e-4
    lora_rank = 32
    lora_alpha = 32
    gradient_accumulation_steps = 16
    model_output_dir = "output"
    # device
    device = torch.device("cuda:0")
    lora_dropout = 0.05

    # load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # load the model quantized to 4-bit NormalFloat
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=0,
        load_in_4bit=True,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        ),
    )

    # casts all the non int8 modules to full precision (fp32) for stability
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
    print(f'memory footprint of model: {model.get_memory_footprint()/(1024*1024*1024)} GB')

    # find all linear layers that should get an adapter
    target_modules = find_all_linear_names(model)
    print("linear layers:", target_modules)
    # initialize the LoRA config
    peft_config = LoraConfig(
        r=lora_rank,
        lora_alpha=lora_alpha,
        # target_modules=target_modules,
        target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, peft_config)

    model.print_trainable_parameters()
    # do NOT cast to half precision manually: without torch's automatic mixed
    # precision and gradient scaling, a manual half() makes the loss go NaN,
    # and combining a manual half() with torch amp raises an error
    # model = model.half()
    # model.float()
    model.config.torch_dtype = torch.float32
    # inspect the dtype distribution of the model's parameters
    verify_model_dtype(model)

    print(model)

    print("Start Load Train Data...")
    train_params = {
        "batch_size": batch_size,
        "shuffle": True,
        "num_workers": 0,
    }
    training_set = QADataset(train_json_path, tokenizer, max_source_length, max_target_length)
    training_loader = DataLoader(training_set, **train_params)
    print("Start Load Validation Data...")
    val_params = {
        "batch_size": batch_size,
        "shuffle": False,
        "num_workers": 0,
    }
    val_set = QADataset(val_json_path, tokenizer, max_source_length, max_target_length)
    val_loader = DataLoader(val_set, **val_params)

    scaler = torch.cuda.amp.GradScaler()
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
    model = model.to(device)
    print("Start Training...")
    for epoch in range(epochs):
        train(epoch, model, device, training_loader, optimizer, scaler, gradient_accumulation_steps, model_output_dir)
        print("Save Model To ", model_output_dir)
        model.save_pretrained(model_output_dir)
    # validation
    print("Start Validation...")
    with torch.no_grad():
        predictions, actuals = validate(tokenizer, model, device, val_loader, max_target_length)
        # save the validation results
        final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
        val_data_path = os.path.join(model_output_dir, "predictions.csv")
        final_df.to_csv(val_data_path)
        print("Validation Data To ", val_data_path)


if __name__ == '__main__':
    main()

The official torch automatic mixed-precision example:

# Creates model and optimizer in default precision
model = Net().cuda()
optimizer = optim.SGD(model.parameters(), ...)

# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()

        # Runs the forward pass with autocasting.
        with autocast(device_type='cuda', dtype=torch.float16):
            output = model(input)
            loss = loss_fn(output, target)

        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same dtype autocast chose for corresponding forward ops.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()
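
Since train_qlora.py above disables gradient accumulation, here is how it can be combined with GradScaler (a sketch following the gradient-accumulation recipe in the torch AMP docs; model, loader, optimizer, device and gradient_accumulation_steps are the names from the training script):

import torch

scaler = torch.cuda.amp.GradScaler()
for index, data in enumerate(loader):
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        outputs = model(input_ids=data["input_ids"].to(device),
                        labels=data["labels"].to(device))
        # normalize so the accumulated gradients match one large-batch step
        loss = outputs.loss / gradient_accumulation_steps
    scaler.scale(loss).backward()
    if (index + 1) % gradient_accumulation_steps == 0 or index == len(loader) - 1:
        scaler.step(optimizer)    # skipped automatically if gradients contain inf/NaN
        scaler.update()
        optimizer.zero_grad()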

QLoRA with batch=16, lora_rank=32, and target_modules=["query_key_value","dense","dense_h_to_4h","dense_4h_to_h"] (LoRA on all of the model's weight matrices) stays under 29 GB of GPU memory,
whereas LoRA with batch=1 and lora_rank=8, adapting only the query_key_value matrix, already uses 30 GB and runs more slowly.

If you hit the error ValueError: Attempting to unscale FP16 gradients., it's because the official docs say you shouldn't also call .half() manually on the model or the data:

model = model.half()

With the official mixed-precision training, the model's loss does indeed decrease stably.

7. QLoRA + torch fp16 mixed-precision training is stable. Model outputs:
Predictions from ChatGLM2-6B after QLoRA:

[Round 1]

问:在上海的苹果代工厂,较低的基本工资让工人们形成了“软强制”的加班默契。加班能多拿两三千,“自愿”加班成为常态。律师提示,加班后虽能获得一时不错的报酬,但过重的工作负荷会透支身体,可能对今后劳动权利造成不利影响。
输出摘要:

答: --**Summary**--
苹果代工厂员工调查:为何争着“自愿”加班
[Round 1]

问:上联:把酒邀春,春日三人醉
下联:

答: --**Couplet**--
梳妆佩玉,玉王点一娇

Actual labels:

--**Summary**--
苹果代工厂员工调查:为何争着“自愿”加班
--**Couplet**--
梳妆佩玉,玉王点一娇

8. Load the model with QLoRA for inference (model_test.py); QLoRA inference likewise uses very little GPU memory.

from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig
from peft import PeftConfig, PeftModel, LoraConfig, get_peft_model, TaskType
import torch
from transformers import (
    set_seed,
    HfArgumentParser,
    TrainingArguments,
    AutoModelForCausalLM
)

device = torch.device("cuda:0")

model_name = "/data/tmp/chatGLM2_6b_pretrain"
lora_dir = "output"

# load the 4-bit quantized base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
    ),
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# attach the trained LoRA weights on top of the quantized base model
config = PeftConfig.from_pretrained(lora_dir)
model = PeftModel.from_pretrained(model, lora_dir)

model = model.to(device)
model.eval()

while True:
    text = input("Question: ")
    response, history = model.chat(tokenizer, text, history=[])
    print("Answer:", response)
邓紫棋在北京鸟巢开演唱会,唱了音乐《 画》 。 请找出这段话中的实体
Answer: --NER--
北京鸟巢,邓紫棋
