目录
1.数据预处理
导入所需库
import jieba
import matplotlib.pyplot as plt
import numpy as np
import paddle
from paddle.io import DataLoader, TensorDataset
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from sklearn.model_selection import train_test_split
训练集格式 标签ID+\t+标签+\t+原文标题
# NOTE(review): this cell duplicates the load cell that appears after the
# helper definitions. Executed top-to-bottom it would raise NameError,
# because remove_stopwords and convert are defined further down —
# presumably a pasted duplicate; confirm and keep only one copy.
contents = []
datas = []
labels = []
with open('data/data126283/data/Train.txt', mode='r', encoding='utf-8') as f:
    contents = f.read().split('\n')
for item in contents:
    if item == '':
        continue
    labels.append(item.split('\t')[0])  # column 0: numeric label id (string)
    # last column: title text — segment, strip stopwords, keep as one string
    datas.append(remove_stopwords(jieba.cut(item.split('\t')[-1])))
datas = convert(datas)  # token-id sequences, fixed length
去除停用词
# Load the stopword list (one word per line).
stop = []
with open('stop.txt', mode='r', encoding='utf-8') as f:
    stop = f.read().split('\n')
# A set gives the same O(1) membership test as the original dict whose
# values were all True, without the dummy values.
stop_word = set(stop)


def remove_stopwords(datas):
    """Drop stopwords from an iterable of tokens and re-join with spaces.

    datas: iterable of tokens (e.g. the generator returned by jieba.cut).
    Returns a single space-separated string of the surviving tokens.
    """
    return ' '.join(text for text in datas if text not in stop_word)
进行中文分词、转换为token序列
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')


def convert(datas, max_seq_length=40):
    """Tokenize each text into a fixed-length sequence of token ids.

    Sequences longer than max_seq_length are truncated; shorter ones are
    right-padded with the tokenizer's pad token id. Returns a list of
    equal-length id lists.
    """
    pad_id = tokenizer.pad_token_id  # hoisted out of the loop
    encoded = []
    for sentence in datas:
        ids = tokenizer(sentence, max_seq_len=max_seq_length)['input_ids']
        ids = ids[:max_seq_length]                      # truncate
        ids = ids + [pad_id] * (max_seq_length - len(ids))  # pad
        encoded.append(ids)
    return encoded
导入数据,进行预处理,数据集在最后
# Load the training file. Each non-empty line is:
#   label_id \t label_name \t title
contents = []
datas = []
labels = []
with open('data/data126283/data/Train.txt', mode='r', encoding='utf-8') as f:
    contents = f.read().split('\n')
for item in contents:
    if item == '':
        continue
    fields = item.split('\t')  # split once instead of twice per line
    # Parse the label id as int up front: the training/eval loops below
    # build int64 tensors from these labels.
    labels.append(int(fields[0]))
    # Segment the title, drop stopwords, keep as one space-joined string.
    datas.append(remove_stopwords(jieba.cut(fields[-1])))
datas = convert(datas)  # fixed-length token-id sequences
2.加载模型
加载预训练模型,冻结大部分参数
# Load the pretrained model with a 14-way classification head directly,
# instead of allocating the default head and then swapping in a new
# Linear(768, 14) — same resulting architecture, one step.
model = BertForSequenceClassification.from_pretrained('bert-base-chinese',
                                                      num_classes=14)
for name, param in model.named_parameters():
    # Freeze everything except the classifier head, the pooler and the
    # last encoder layer (training only the classifier performed poorly).
    if ("classifier" not in name
            and 'bert.pooler.dense' not in name
            and 'bert.encoder.layers.11' not in name):
        param.stop_gradient = True
ps:如果只保留classifier用来训练,效果欠佳。
设置超参数,学习率设为0.001(也可在0.01~0.1范围内尝试更大的初始值)
# Training hyper-parameters.
epochs = 2            # full passes over the training set
batch_size = 4096     # 1024 * 4
learning_rate = 0.001  # Adam step size
损失函数和优化器
# Multi-class cross-entropy over the 14 category logits.
criterion = paddle.nn.CrossEntropyLoss()
# Adam over all parameters; frozen ones carry stop_gradient and are not updated.
optimizer = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=model.parameters())
3.批训练
划分训练集和测试集
# 80/20 train/test split with a fixed seed for reproducibility.
datas = np.array(datas)
labels = np.array(labels)
x_train, x_test, y_train, y_test = train_test_split(datas, labels, random_state=42, test_size=0.2)
train_dataset = TensorDataset([x_train, y_train])
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
迭代分批训练,可视化损失函数
# Mini-batch training; every batch loss is recorded for plotting afterwards.
total_loss = []
for epoch in range(epochs):
    for batch_data, batch_label in train_loader:
        batch_label = paddle.to_tensor(batch_label, dtype='int64')
        batch_data = paddle.to_tensor(batch_data, dtype='int64')
        outputs = model(batch_data)          # logits, shape (batch, 14)
        loss = criterion(outputs, batch_label)
        print(epoch, loss.numpy()[0])
        total_loss.append(loss.numpy()[0])
        optimizer.clear_grad()               # reset grads before new backward pass
        loss.backward()
        optimizer.step()
# Persist weights and optimizer state for later resume/inference.
paddle.save({'model': model.state_dict()}, 'model.param')
paddle.save({'optimizer': optimizer.state_dict()}, 'optimizer.param')
# Loss curve over all recorded batches.
plt.plot(range(len(total_loss)), total_loss)
plt.show()
4.准确率
在测试集上如法炮制,查看准确率
# Evaluate on the held-out split: per-batch loss and per-batch accuracy.
total_loss = []
x_test = np.array(x_test)
y_test = np.array(y_test)
test_dataset = TensorDataset([x_test, y_test])
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)
with paddle.no_grad():  # no gradients needed at eval time
    for batch_data, batch_label in test_loader:
        batch_label = paddle.to_tensor(batch_label, dtype='int64')
        batch_data = paddle.to_tensor(batch_data, dtype='int64')
        outputs = model(batch_data)
        loss = criterion(outputs, batch_label)
        print(loss)
        outputs = paddle.argmax(outputs, axis=1)  # predicted class ids
        total_loss.append(loss.numpy()[0])
        # NOTE(review): accuracy is computed and printed per batch only;
        # there is no aggregate over the whole test set. model.eval() is
        # also never called — confirm whether dropout should be disabled.
        score = 0
        for predict, label in zip(outputs, batch_label):
            if predict == label:
                score += 1
        print(score / len(batch_label))
plt.plot(range(len(total_loss)), total_loss)
plt.show()
最后在验证集上输出要求的类别
# Class-id -> class-name mapping (index order must match training labels).
arr = ['财经', '彩票', '房产', '股票', '家居', '教育', '科技', '社会',
       '时尚', '时政', '体育', '星座', '游戏', '娱乐']
evals = []
with open('data/data126283/data/Test.txt', mode='r', encoding='utf-8') as f:
    contents = f.read().split('\n')
for item in contents:
    if item == '':
        continue
    # Apply the SAME preprocessing as training (jieba segmentation +
    # stopword removal); the original fed raw titles here, so inference
    # inputs did not match the training distribution.
    evals.append(remove_stopwords(jieba.cut(item)))
evals = convert(evals)
evals = np.array(evals)
with paddle.no_grad():
    for i in range(0, len(evals), 2048):
        # (removed the original `i = min(len(evals), i)` — a no-op, since
        # range() already guarantees i < len(evals))
        batch_data = paddle.to_tensor(evals[i:i + 2048], dtype='int64')
        predict = paddle.argmax(model(batch_data), axis=1)
        print(i, len(predict))
        names = [arr[int(p)] for p in predict]  # map ids to class names
        # mode='a' appends: delete result.txt before re-running, or old
        # predictions accumulate. The trailing newline keeps batches from
        # fusing the last line of one batch with the first of the next.
        with open('result.txt', mode='a', encoding='utf-8') as f:
            f.write('\n'.join(names))
            f.write('\n')
ps:注意最后的f.write('\n'),否则除第一次,每次打印少一行,很坑
最后损失函数收敛在0.2或0.1左右比较正常,四舍五入差不多90%的准确率,当然如果你解冻更多参数,自然可以更加精确,也要看运行环境的配置,建议不要使用免费平台配置,否则训练速度会非常慢。
欢迎提出问题