I recently changed jobs, and my new work will involve large language models, so I took some time to run the ChatGLM2-6B demo and fine-tune the model with LoRA and QLoRA.
This post is a quick write-up of the process and doubles as a simple tutorial; it also covers a pitfall I hit where the QLoRA loss turned into NaN and training became unstable.
This tutorial does not cover the theory behind LoRA; please look that up separately if you need it.
1. I downloaded the ChatGLM2-6B model from Hugging Face to my server in advance, because the server cannot reach Hugging Face directly.
I put it under /data/tmp/chatGLM2_6b_pretrain; the folder contains the model weights and the configuration files. If your machine has network access, you can simply download it from Hugging Face.
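If your machine does have access, here is a minimal sketch of pulling the checkpoint with huggingface_hub (the repo id THUDM/chatglm2-6b and a reasonably recent huggingface_hub version are assumptions):

from huggingface_hub import snapshot_download

# Download the ChatGLM2-6B checkpoint into the local path used throughout this post.
snapshot_download(
    repo_id="THUDM/chatglm2-6b",
    local_dir="/data/tmp/chatGLM2_6b_pretrain",
)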
2. Print the model structure
from transformers import AutoModel

model_name = "/data/tmp/chatGLM2_6b_pretrain"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
print(model)
ChatGLMForConditionalGeneration(
(transformer): ChatGLMModel(
(embedding): Embedding(
(word_embeddings): Embedding(65024, 4096)
)
(rotary_pos_emb): RotaryEmbedding()
(encoder): GLMTransformer(
(layers): ModuleList(
(0-27): 28 x GLMBlock(
(input_layernorm): RMSNorm()
(self_attention): SelfAttention(
(query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
(core_attention): CoreAttention(
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(dense): Linear(in_features=4096, out_features=4096, bias=False)
)
(post_attention_layernorm): RMSNorm()
(mlp): MLP(
(dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
(dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
)
)
)
(final_layernorm): RMSNorm()
)
(output_layer): Linear(in_features=4096, out_features=65024, bias=False)
)
)
Note that query_key_value is not three 4096x4096 square matrices concatenated together: because ChatGLM2 uses grouped/multi-query attention, it is roughly Wq 4096x4096, Wk 4096x256 and Wv 4096x256, which is why out_features is 4608.
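A quick sanity check of that shape (the head counts below are taken from the ChatGLM2-6B config; treat them as assumptions if your checkpoint differs):

# query_key_value packs the Q, K and V projections into one Linear layer.
num_attention_heads = 32   # query heads
multi_query_group_num = 2  # shared key/value groups (multi-query attention)
head_dim = 128             # kv_channels

q_width = num_attention_heads * head_dim      # 4096
kv_width = multi_query_group_num * head_dim   # 256 for K and 256 for V
print(q_width + 2 * kv_width)                 # 4608 = out_features of query_key_value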
3. Print the model structure after adding LoRA
from transformers import AutoTokenizer, AutoModel, AutoConfig
from peft import LoraConfig, get_peft_model, TaskType

model_name = "/data/tmp/chatGLM2_6b_pretrain"
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

config = LoraConfig(
    peft_type="LORA",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    fan_in_fan_out=False,
    bias='lora_only',
    target_modules=["query_key_value"]
)

model = get_peft_model(model, config)
print(model)
PeftModelForCausalLM(
(base_model): LoraModel(
(model): ChatGLMForConditionalGeneration(
(transformer): ChatGLMModel(
(embedding): Embedding(
(word_embeddings): Embedding(65024, 4096)
)
(rotary_pos_emb): RotaryEmbedding()
(encoder): GLMTransformer(
(layers): ModuleList(
(0-27): 28 x GLMBlock(
(input_layernorm): RMSNorm()
(self_attention): SelfAttention(
(query_key_value): Linear(
in_features=4096, out_features=4608, bias=True
(lora_dropout): ModuleDict(
(default): Dropout(p=0.1, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=4608, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
)
(core_attention): CoreAttention(
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(dense): Linear(in_features=4096, out_features=4096, bias=False)
)
(post_attention_layernorm): RMSNorm()
(mlp): MLP(
(dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
(dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
)
)
)
(final_layernorm): RMSNorm()
)
(output_layer): Linear(in_features=4096, out_features=65024, bias=False)
)
)
)
)
You can see that two extra linear layers, lora_A and lora_B, are now attached under the query_key_value matrix; these two layers are the only ones that will be trained.
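Conceptually, the adapted layer computes the frozen projection plus a low-rank correction scaled by lora_alpha / r. A minimal sketch of that forward pass (shapes match the printed module; the tensors here are illustrative, not the real weights):

import torch

x = torch.randn(1, 4096)      # one token's hidden state
W = torch.randn(4608, 4096)   # frozen query_key_value weight (bias omitted)
A = torch.randn(8, 4096)      # lora_A, trainable, r = 8
B = torch.zeros(4608, 8)      # lora_B, trainable, initialized to zero so the update starts at 0
alpha, r = 16, 8              # lora_alpha and r from the config above

y = x @ W.T + (alpha / r) * ((x @ A.T) @ B.T)   # frozen projection + scaled low-rank update
print(y.shape)                                  # torch.Size([1, 4608])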
4. Prepare the dataset. I use the firefly dataset, which you can download from Hugging Face in jsonl format; split it into a training set and a test set in advance. qa_dataset.py:
# -*- coding: utf-8 -*-
from torch.utils.data import Dataset
import torch
import json
import numpy as np


class QADataset(Dataset):
    def __init__(self, data_path, tokenizer, max_source_length, max_target_length) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        self.max_seq_length = self.max_source_length + self.max_target_length

        self.data = []
        with open(data_path, "r", encoding='utf-8') as f:
            for line in f:
                if not line or line == "":
                    continue
                json_line = json.loads(line)
                # e.g. {'kind': 'NLI', 'input': '自然语言推理:\n前提:家里人心甘情愿地养他,还有几家想让他做女婿的\n假设:他是被家里人收养的孤儿', 'target': '中立'}
                kind = json_line["kind"]
                input = json_line["input"]
                target = json_line["target"]
                self.data.append({
                    "question": input,
                    "answer": "--**"+kind+"**--\n"+target
                })
        print("data load , size:", len(self.data))

    def preprocess(self, question, answer):
        prompt = self.tokenizer.build_prompt(question, None)

        a_ids = self.tokenizer.encode(text=prompt, add_special_tokens=True, truncation=True,
                                      max_length=self.max_source_length)

        b_ids = self.tokenizer.encode(text=answer, add_special_tokens=False, truncation=True,
                                      max_length=self.max_target_length - 1)  # leave room for the appended eos_token

        context_length = len(a_ids)
        input_ids = a_ids + b_ids + [self.tokenizer.eos_token_id]
        labels = [self.tokenizer.pad_token_id] * context_length + b_ids + [self.tokenizer.eos_token_id]

        pad_len = self.max_seq_length - len(input_ids)
        input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_len
        labels = labels + [self.tokenizer.pad_token_id] * pad_len
        labels = [(l if l != self.tokenizer.pad_token_id else -100) for l in labels]
        return input_ids, labels

    def __getitem__(self, index):
        item_data = self.data[index]

        input_ids, labels = self.preprocess(**item_data)

        return {
            "input_ids": torch.LongTensor(np.array(input_ids)),
            "labels": torch.LongTensor(np.array(labels))
        }

    def __len__(self):
        return len(self.data)


if __name__ == "__main__":
    # Split the raw firefly jsonl: first 1000 lines become the test set, the rest the training sets.
    with open("/data/tmp/firefly_data/firefly-train-1.1M.jsonl", "r", encoding='utf-8') as f_read, \
         open("/data/tmp/firefly_data/firefly_train80000.jsonl", "w", encoding='utf-8') as f_trainx, \
         open("/data/tmp/firefly_data/firefly_train.jsonl", "w", encoding='utf-8') as f_train, \
         open("/data/tmp/firefly_data/firefly_test.jsonl", "w", encoding='utf-8') as f_test:
        lines = f_read.readlines()

        f_test.writelines(lines[:1000])
        f_train.writelines(lines[1000:])
        f_trainx.writelines(lines[1000:81000])
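A quick, illustrative way to sanity-check the preprocessing (the path and lengths below follow the LoRA training script in the next step; adjust them to your own setup):

from transformers import AutoTokenizer
from qa_dataset import QADataset

tokenizer = AutoTokenizer.from_pretrained("/data/tmp/chatGLM2_6b_pretrain", trust_remote_code=True)
ds = QADataset("/data/tmp/firefly_data/firefly_test.jsonl", tokenizer,
               max_source_length=60, max_target_length=360)

sample = ds[0]
print(sample["input_ids"].shape)                # torch.Size([420]) = max_source_length + max_target_length
print((sample["labels"] == -100).sum().item())  # prompt and padding positions excluded from the loss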
5. Train with LoRA. Even at half precision the memory footprint is large: batch_size can only be 1, GPU memory usage already reaches about 30 GB, and training takes a long time. To address this memory problem I later tried QLoRA.
train_lora.py:
# -*- coding: utf-8 -*-
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from qa_dataset import QADataset
from peft import LoraConfig, get_peft_model, TaskType
from tqdm import tqdm
import torch
import os, time, sys
import numpy as np


def train(epoch, model, device, loader, optimizer, gradient_accumulation_steps, model_output_dir):
    model.train()
    time1 = time.time()
    losses = []
    train_bar = tqdm(loader, total=len(loader))
    for index, data in enumerate(train_bar):
        input_ids = data['input_ids'].to(device, dtype=torch.long)
        labels = data['labels'].to(device, dtype=torch.long)

        outputs = model(
            input_ids=input_ids,
            labels=labels,
        )
        loss = outputs.loss
        # backward pass, compute the gradients for this step
        loss.backward()
        losses.append(loss.item())
        # gradient accumulation
        if (index % gradient_accumulation_steps == 0 and index != 0) or index == len(loader) - 1:
            # update the parameters
            optimizer.step()
            # clear the accumulated gradients
            optimizer.zero_grad()

        if index % 300 == 0:
            model_save_path = os.path.join(model_output_dir, "index_{}".format(index))
            if not os.path.exists(model_save_path):
                os.makedirs(model_save_path)
            model.save_pretrained(model_save_path)
        train_bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch, index, np.mean(losses)))


def validate(tokenizer, model, device, loader, max_length):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(tqdm(loader, file=sys.stdout, desc="Validation Data")):
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            labels = data['labels'].to(device, dtype=torch.long)
            generated_ids = model.generate(
                input_ids=input_ids,
                max_length=max_length,
                do_sample=False,
                temperature=0
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in
                     generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in labels]
            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals


def main():
    model_name = "/data/tmp/chatGLM2_6b_pretrain"
    train_json_path = "/data/tmp/firefly_data/firefly_train20000.jsonl"
    val_json_path = "/data/tmp/firefly_data/firefly_test.jsonl"
    max_source_length = 60
    max_target_length = 360
    epochs = 1
    batch_size = 1
    lr = 1e-4
    lora_rank = 8
    lora_alpha = 32
    gradient_accumulation_steps = 16
    model_output_dir = "output"
    # device
    device = torch.device("cuda:0")

    # load the tokenizer and the model
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

    # setup peft
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=lora_rank,
        lora_alpha=lora_alpha,
        lora_dropout=0.1
    )
    model = get_peft_model(model, peft_config)
    model.is_parallelizable = True
    model.model_parallel = True
    model.print_trainable_parameters()
    # cast to half precision (note: the following .float() call casts the model back to fp32)
    model = model.half()
    model.float()

    print("Start Load Train Data...")
    train_params = {
        "batch_size": batch_size,
        "shuffle": True,
        "num_workers": 0,
    }
    training_set = QADataset(train_json_path, tokenizer, max_source_length, max_target_length)
    training_loader = DataLoader(training_set, **train_params)
    print("Start Load Validation Data...")
    val_params = {
        "batch_size": batch_size,
        "shuffle": False,
        "num_workers": 0,
    }
    val_set = QADataset(val_json_path, tokenizer, max_source_length, max_target_length)
    val_loader = DataLoader(val_set, **val_params)

    optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
    model = model.to(device)
    print("Start Training...")
    for epoch in range(epochs):
        train(epoch, model, device, training_loader, optimizer, gradient_accumulation_steps, model_output_dir)
        # print("Save Model To ", model_output_dir)
        # model.save_pretrained(model_output_dir)
        # validation
        print("Start Validation...")
        with torch.no_grad():
            predictions, actuals = validate(tokenizer, model, device, val_loader, max_target_length)
            # save the validation results
            final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
            val_data_path = os.path.join(model_output_dir, "predictions.csv")
            final_df.to_csv(val_data_path)
            print("Validation Data To ", val_data_path)


if __name__ == '__main__':
    main()
6. Many people don't have a 32 GB V100 and cannot train even with batch_size = 1, so I looked into QLoRA. Compared with LoRA, QLoRA adds a quantization step: the base model is quantized to 4-bit NormalFloat (NF4), which drastically lowers the base model's memory footprint and leaves much more GPU memory for the LoRA parameters and activations.
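As a rough back-of-envelope check of why the 4-bit base model frees so much memory (assuming roughly 6.2B parameters for ChatGLM2-6B and ignoring quantization constants, activations, and optimizer state):

params = 6.2e9                                            # approximate parameter count of ChatGLM2-6B
print(f"fp16 weights: {params * 2 / 1024**3:.1f} GB")     # ~11.5 GB at 2 bytes per weight
print(f"nf4 weights:  {params * 0.5 / 1024**3:.1f} GB")   # ~2.9 GB at 4 bits per weight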
However, 4-bit NormalFloat combined with a manual half-precision cast makes the loss very unstable; it can turn into NaN almost as soon as training starts, and even float32 weights combined with a manual float16 cast were unstable. This was the pitfall: with a manual model.half() the loss always ended up as NaN. Switching to PyTorch's official automatic mixed precision fixed it: it handles the mixed-precision casts and gradient scaling automatically, so values no longer overflow or underflow the fp16 range and turn into NaN.
Why use fp16 mixed precision instead of plain float32? In practice, mixed-precision training was 5-6x faster, and large models already take a long time to train, so the time cost matters a lot.
On 20k training samples:
QLoRA at half precision finishes in about 2 hours but is unstable; with float32 it takes 11+ hours.
LoRA at half precision takes just under 5 hours and is fairly stable.
If that were the whole story, QLoRA could not use half precision (the loss is unstable), and using float32 just trades time for space, so QLoRA would lose every advantage except the memory savings. Fortunately, PyTorch's official mixed precision rescues half-precision training.
train_qlora.py:
# -*- coding: utf-8 -*-
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig
from qa_dataset import QADataset
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from tqdm import tqdm
import torch
import os, time, sys
from transformers import (
    set_seed,
    HfArgumentParser,
    TrainingArguments,
    AutoModelForCausalLM
)
import bitsandbytes as bnb
from collections import defaultdict
import numpy as np
import os


def verify_model_dtype(model):
    """
    Report how the model's parameters are distributed across dtypes.
    """
    dtype2param_num = defaultdict(int)             # parameter count per dtype
    dtype2param_name = defaultdict(list)           # parameter names per dtype
    dtype2trainable_param_num = defaultdict(int)   # trainable parameter count per dtype
    dtype2trainable_param_name = defaultdict(list) # trainable parameter names per dtype
    for name, p in model.named_parameters():
        dtype = p.dtype
        dtype2param_num[dtype] += p.numel()
        dtype2param_name[dtype].append(name)
        if p.requires_grad:
            dtype2trainable_param_num[dtype] += p.numel()
            dtype2trainable_param_name[dtype].append(name)
    # distribution of all parameters across dtypes
    total = 0
    print('verify all params of the model')
    for k, v in dtype2param_num.items():
        total += v
    for k, v in dtype2param_num.items():
        print(k, v, v / total)
    for k, v in dtype2trainable_param_name.items():
        print(k, v)

    print()
    # distribution of trainable parameters across dtypes
    print('verify trainable params the model')
    total_trainable = 0
    for k, v in dtype2trainable_param_num.items():
        total_trainable += v
    for k, v in dtype2trainable_param_num.items():
        print(k, v, v / total_trainable)
    for k, v in dtype2trainable_param_num.items():
        print(k, v)


def find_all_linear_names(model):
    """
    Find all fully connected layers so an adapter can be attached to each of them.
    """
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


def train(epoch, model, device, loader, optimizer, scaler, gradient_accumulation_steps, model_output_dir):
    model.train()
    time1 = time.time()
    losses = []
    train_bar = tqdm(loader, total=len(loader))
    for index, data in enumerate(train_bar):
        optimizer.zero_grad()
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            labels = data['labels'].to(device, dtype=torch.long)

            outputs = model(
                input_ids=input_ids,
                labels=labels,
            )
            loss = outputs.loss
            losses.append(loss.item())

        scaler.scale(loss).backward()
        # Unscales the gradients of optimizer's assigned params in-place
        # scaler.unscale_(optimizer)

        # Since the gradients of optimizer's assigned params are unscaled, clips as usual:
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

        # optimizer's gradients are already unscaled, so scaler.step does not unscale them,
        # although it still skips optimizer.step() if the gradients contain infs or NaNs.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()

        # # backward pass, compute the gradients
        # loss.backward()
        # optimizer.step()
        # gradient accumulation
        # if (index % gradient_accumulation_steps == 0 and index != 0) or index == len(loader) - 1:
        #     # update the parameters
        #     # optimizer.step()
        #     scaler.step(optimizer)
        #     scaler.update()
        #     # clear the accumulated gradients
        #     optimizer.zero_grad()

        if index % 300 == 0:
            model_save_path = os.path.join(model_output_dir, "index_{}".format(index))
            if not os.path.exists(model_save_path):
                os.makedirs(model_save_path)
            model.save_pretrained(model_save_path)
        train_bar.set_description("epoch:{} idx:{} loss:{:.6f}".format(epoch, index, np.mean(losses)))
        # print the loss every 100 steps
        # if index % 100 == 0 or index == len(loader) - 1:
        #     time2 = time.time()
        #     tqdm.write(
        #         f"{index}, epoch: {epoch} -loss: {str(loss)} ; each step's time spent: {(str(float(time2 - time1) / float(index + 0.0001)))}")


def validate(tokenizer, model, device, loader, max_length):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(tqdm(loader, file=sys.stdout, desc="Validation Data")):
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            labels = data['labels'].to(device, dtype=torch.long)
            generated_ids = model.generate(
                input_ids=input_ids,
                max_length=max_length,
                do_sample=False,
                temperature=0
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in
                     generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in labels]
            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals


def main():
    model_name = "/data/tmp/chatGLM2_6b_pretrain"
    train_json_path = "/data/tmp/firefly_data/firefly_train80000.jsonl"
    val_json_path = "/data/tmp/firefly_data/firefly_test.jsonl"
    max_source_length = 128
    max_target_length = 512
    epochs = 1
    batch_size = 16
    lr = 1e-4
    lora_rank = 32
    lora_alpha = 32
    gradient_accumulation_steps = 16
    model_output_dir = "output"
    # device
    device = torch.device("cuda:0")
    lora_dropout = 0.05

    # load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
    # load the model quantized to 4-bit NF4
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=0,
        load_in_4bit=True,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        ),
    )

    # casts all the non int8 modules to full precision (fp32) for stability
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
    print(f'memory footprint of model: {model.get_memory_footprint()/(1024*1024*1024)} GB')

    # find all fully connected layers that should get an adapter
    target_modules = find_all_linear_names(model)
    print("linear layers:", target_modules)
    # initialize the LoRA config
    peft_config = LoraConfig(
        r=lora_rank,
        lora_alpha=lora_alpha,
        # target_modules=target_modules,
        target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
        lora_dropout=lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, peft_config)


    # model.is_parallelizable = True
    # model.model_parallel = True
    model.print_trainable_parameters()
    # cast to half precision
    # model = model.half()  # You shouldn't call half manually on the model or the data. Without torch's
    #                       # automatic mixed precision and gradient scaling, a manual half() makes the
    #                       # loss turn into NaN; with the official mixed precision you must not call
    #                       # half() manually, otherwise it raises an error.
    # model.float()
    model.config.torch_dtype = torch.float32
    # report how the model's parameters are distributed across dtypes
    verify_model_dtype(model)

    print(model)

    print("Start Load Train Data...")
    train_params = {
        "batch_size": batch_size,
        "shuffle": True,
        "num_workers": 0,
    }
    training_set = QADataset(train_json_path, tokenizer, max_source_length, max_target_length)
    training_loader = DataLoader(training_set, **train_params)
    print("Start Load Validation Data...")
    val_params = {
        "batch_size": batch_size,
        "shuffle": False,
        "num_workers": 0,
    }
    val_set = QADataset(val_json_path, tokenizer, max_source_length, max_target_length)
    val_loader = DataLoader(val_set, **val_params)


    scaler = torch.cuda.amp.GradScaler()
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
    model = model.to(device)
    print("Start Training...")
    for epoch in range(epochs):
        train(epoch, model, device, training_loader, optimizer, scaler, gradient_accumulation_steps, model_output_dir)
        print("Save Model To ", model_output_dir)
        model.save_pretrained(model_output_dir)
        # validation
        print("Start Validation...")
        with torch.no_grad():
            predictions, actuals = validate(tokenizer, model, device, val_loader, max_target_length)
            # save the validation results
            final_df = pd.DataFrame({"Generated Text": predictions, "Actual Text": actuals})
            val_data_path = os.path.join(model_output_dir, "predictions.csv")
            final_df.to_csv(val_data_path)
            print("Validation Data To ", val_data_path)


if __name__ == '__main__':
    main()
PyTorch's official automatic mixed precision example:
# Creates model and optimizer in default precision
model = Net().cuda()
optimizer = optim.SGD(model.parameters(), ...)

# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer.zero_grad()

        # Runs the forward pass with autocasting.
        with autocast(device_type='cuda', dtype=torch.float16):
            output = model(input)
            loss = loss_fn(output, target)

        # Scales loss. Calls backward() on scaled loss to create scaled gradients.
        # Backward passes under autocast are not recommended.
        # Backward ops run in the same dtype autocast chose for corresponding forward ops.
        scaler.scale(loss).backward()

        # scaler.step() first unscales the gradients of the optimizer's assigned params.
        # If these gradients do not contain infs or NaNs, optimizer.step() is then called,
        # otherwise, optimizer.step() is skipped.
        scaler.step(optimizer)

        # Updates the scale for next iteration.
        scaler.update()
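The gradient-accumulation branch in train_qlora.py above is commented out; for reference, here is a sketch of how accumulation is usually combined with GradScaler (following the pattern in the PyTorch AMP examples; accumulation_steps and the other names mirror the snippet above):

accumulation_steps = 16
for epoch in epochs:
    for i, (input, target) in enumerate(data):
        with autocast(device_type='cuda', dtype=torch.float16):
            output = model(input)
            # Normalize so the accumulated gradient matches a single large-batch step.
            loss = loss_fn(output, target) / accumulation_steps
        scaler.scale(loss).backward()
        if (i + 1) % accumulation_steps == 0:
            scaler.step(optimizer)    # still skipped if the gradients contain infs or NaNs
            scaler.update()
            optimizer.zero_grad()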
QLoRA with batch_size = 16, lora_rank = 32 and target_modules=["query_key_value","dense","dense_h_to_4h","dense_4h_to_h"] (i.e. LoRA on every linear projection in the model) uses just under 29 GB of GPU memory.
LoRA with batch_size = 1 and lora_rank = 8, adapting only the query_key_value matrix, already uses 30 GB, and it is noticeably slower.
If you keep a manual half-precision cast while using the official mixed precision, you get the error: ValueError: Attempting to unscale FP16 gradients.
The official guidance is that you should not call .half() on the model or the data yourself, so drop this line:
model = model.half()
With the official mixed-precision training, the model's loss does decrease stably.
7. Training with QLoRA + torch fp16 mixed precision is stable. Model outputs:
Predictions of ChatGLM2-6B after QLoRA fine-tuning:
[Round 1]
问:在上海的苹果代工厂,较低的基本工资让工人们形成了“软强制”的加班默契。加班能多拿两三千,“自愿”加班成为常态。律师提示,加班后虽能获得一时不错的报酬,但过重的工作负荷会透支身体,可能对今后劳动权利造成不利影响。
输出摘要:
答: --**Summary**--
苹果代工厂员工调查:为何争着“自愿”加班
[Round 1]
问:上联:把酒邀春,春日三人醉
下联:
答: --**Couplet**--
梳妆佩玉,玉王点一娇
actual label
--**Summary**--
苹果代工厂员工调查:为何争着“自愿”加班
--**Couplet**--
梳妆佩玉,玉王点一娇
8. Load the model with QLoRA for inference (model_test.py). QLoRA inference likewise uses very little GPU memory.
from transformers import AutoTokenizer, AutoModel, AutoConfig, BitsAndBytesConfig
from peft import PeftConfig, PeftModel, LoraConfig, get_peft_model, TaskType
import torch
from transformers import (
    set_seed,
    HfArgumentParser,
    TrainingArguments,
    AutoModelForCausalLM
)

device = torch.device("cuda:0")

model_name = "/data/tmp/chatGLM2_6b_pretrain"
lora_dir = "output"

# load the base model quantized to 4-bit NF4
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=0,
    load_in_4bit=True,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
    ),
)

# peft_config = LoraConfig(
#     r=lora_rank,
#     lora_alpha=lora_alpha,
#     # target_modules=target_modules,
#     target_modules=["query_key_value","dense_h_to_4h"],
#     lora_dropout=lora_dropout,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# model = get_peft_model(model, peft_config)

# model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# attach the trained LoRA adapter
config = PeftConfig.from_pretrained(lora_dir)
model = PeftModel.from_pretrained(model, lora_dir)

model = model.to(device)
model.eval()

while True:
    text = input("问题:")
    response, history = model.chat(tokenizer, text, history=[])
    print("回答:", response)
邓紫棋在北京鸟巢开演唱会,唱了音乐《 画》 。 请找出这段话中的实体
回答: --NER--
北京鸟巢,邓紫棋