Data Preparation
The aclImdb v1 dataset is a large movie-review dataset for binary sentiment classification. It contains substantially more data than earlier benchmark datasets: 25,000 movie reviews for training, 25,000 for testing, plus additional unlabeled data.
Data Preprocessing and Data Loading
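Assuming the archive has been downloaded and extracted to `../aclImdb` (the path the loader below expects), a quick sanity check of the directory layout might look like the following sketch; it is only an illustration, not part of the pipeline:

```python
import os

# Count the review files in each split/label folder of the extracted dataset.
# Path assumption: the archive was unpacked to ../aclImdb, as in the loader below.
base = os.path.join("..", "aclImdb")
for split in ("train", "test"):
    for label in ("pos", "neg"):
        folder = os.path.join(base, split, label)
        n = len([f for f in os.listdir(folder) if f.endswith(".txt")])
        print(split, label, n)  # 12,500 files per folder: 25,000 reviews per split
```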
```python
import os
import re

from torch.utils.data import DataLoader, Dataset


def tokenization(content):
    # Strip HTML tags such as <br />
    content = re.sub("<.*?>", " ", content)
    # Remove punctuation and control characters
    filters = ['\t', '\n', '\x97', '\x96', '#', '%', r'\$', '&', r'\.', r'\?', '!', ',']
    content = re.sub("|".join(filters), " ", content)
    tokens = [i.strip().lower() for i in content.split()]
    return tokens


def collate_fn(batch):
    """
    :param batch: ([tokens, label], [tokens, label], ...)
    :return: a tuple of token lists and a tuple of labels
    """
    content, label = list(zip(*batch))
    return content, label


class ImdbDataset(Dataset):
    def __init__(self, train=True):
        self.train_data_path = '..\\aclImdb\\train\\'
        self.test_data_path = '..\\aclImdb\\test\\'
        data_path = self.train_data_path if train else self.test_data_path
        # Collect the paths of all review files
        temp_data_path = [os.path.join(data_path, "pos"), os.path.join(data_path, "neg")]
        print(temp_data_path)
        self.total_file_path = []  # paths of all review files
        for path in temp_data_path:
            file_name_list = os.listdir(path)
            file_path_list = [os.path.join(path, i) for i in file_name_list if i.endswith(".txt")]
            self.total_file_path.extend(file_path_list)

    def __len__(self):
        return len(self.total_file_path)

    def __getitem__(self, index):
        file_path = self.total_file_path[index]
        # The label is encoded in the parent directory name (pos/neg)
        labelstr = file_path.split("\\")[-2]
        label = 0 if labelstr == "neg" else 1
        # Read and tokenize the review text
        content = open(file_path, encoding="utf-8").read()
        tokens = tokenization(content)
        return tokens, label


def get_data(train=True):
    imdb_dataset = ImdbDataset(train)
    data_loader = DataLoader(imdb_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
    return data_loader
```
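A quick peek at what the loader yields (a sketch; the output varies from run to run because `shuffle=True`):

```python
# Inspect one batch from the training loader
for idx, (content, label) in enumerate(get_data(train=True)):
    print(content)  # tuple of 2 token lists (batch_size=2)
    print(label)    # tuple of 2 labels, 0 = neg, 1 = pos
    break
```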
Text Serialization
Map each word in the text to a corresponding number and store the mapping in a dictionary, i.e. sentence ---> list of numbers.
Approach:
- Tokenize the sentence (tokenization)
- Store the words in a dictionary, count how often each appears, and filter them by frequency
- Convert text to a number sequence
- Convert a number sequence back to text
Words that are not in the vocabulary are replaced with a special token (UNK).
To keep all sequences in a batch the same length, padding (PAD) is used.
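Before wrapping this in a class, here is the idea in miniature, using a hand-built toy vocabulary (the words and indices are arbitrary):

```python
# Toy vocabulary: special tokens first, then words "seen" in the corpus
vocab = {"UNK": 0, "PAD": 1, "movie": 2, "great": 3, "bad": 4}

def to_sequence(tokens, max_len):
    # Unknown words map to UNK; short sentences are padded with PAD
    seq = [vocab.get(t, vocab["UNK"]) for t in tokens[:max_len]]
    seq += [vocab["PAD"]] * (max_len - len(seq))
    return seq

print(to_sequence(["great", "movie", "wow"], max_len=5))  # [3, 2, 0, 1, 1]
```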
```python
"""
Build a vocabulary, convert sentences to index sequences, and convert sequences back to sentences.
"""


class Word2Sequence:
    UNK_TAG = "UNK"
    PAD_TAG = "PAD"
    UNK = 0
    PAD = 1

    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        self.count = {}

    def __len__(self):
        # Vocabulary size, used later for nn.Embedding(len(ws), ...)
        return len(self.dict)

    def fit(self, sentence):
        # Accumulate word counts from a single tokenized sentence
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min=5, max=None, max_features=None):
        """
        :param min: drop words that appear fewer than min times
        :param max: drop words that appear more than max times
        :param max_features: keep at most this many words
        :return:
        """
        # Drop words with frequency below min
        self.count = {word: value for word, value in self.count.items() if value >= min}
        # Drop words with frequency above max
        if max is not None:
            self.count = {word: value for word, value in self.count.items() if value <= max}
        # Keep only the max_features most frequent words
        if max_features is not None:
            temp = sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_features]
            self.count = dict(temp)
        # Map each remaining word to an index
        for word in self.count:
            self.dict[word] = len(self.dict)
        # Build the reverse mapping: index -> word
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None):
        """
        Convert a sentence into an index sequence.
        :param sentence: [word1, word2, ...]
        :param max_len: pad or truncate the sentence to this length
        :return:
        """
        if max_len is not None:
            if max_len > len(sentence):
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))  # pad
            if max_len < len(sentence):
                sentence = sentence[:max_len]  # truncate
        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, indices):
        # Convert an index sequence back into words
        return [self.inverse_dict.get(idx) for idx in indices]


if __name__ == '__main__':
    ws = Word2Sequence()
    ws.fit(["i", "am", "your", "father"])
    ws.fit(["i", "am", "my", "own", "man"])
    ws.build_vocab(min=0)
    print(ws.dict)
    seq = ws.transform(["i", "love", "man"], max_len=10)
    print(seq)
    tokens = ws.inverse_transform(seq)
    print(tokens)
```
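The model code in the next section imports `ws` and `max_len` from a `lib` module that is not shown here. A minimal sketch of what such a module could contain, assuming the `Word2Sequence` class above lives in a module named `word_sequence` and the vocabulary is cached with pickle (the file name, `max_len=200`, and the `build_vocab` arguments are all assumptions):

```python
# lib.py -- a sketch; the real project may build the vocabulary in a separate script
import os
import pickle

from word_sequence import Word2Sequence  # assumed module name for the class above
from dataset import tokenization

max_len = 200  # assumed padding/truncation length

if os.path.exists("./ws.pkl"):
    ws = pickle.load(open("./ws.pkl", "rb"))
else:
    ws = Word2Sequence()
    # Fit the vocabulary on every review in the training set
    for folder in ("../aclImdb/train/pos", "../aclImdb/train/neg"):
        for name in os.listdir(folder):
            if name.endswith(".txt"):
                tokens = tokenization(open(os.path.join(folder, name), encoding="utf-8").read())
                ws.fit(tokens)
    ws.build_vocab(min=10, max_features=10000)  # assumed filtering thresholds
    pickle.dump(ws, open("./ws.pkl", "wb"))
```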
Model Construction (Simple Fully-Connected Network)
Pay attention to how the word embedding is used!
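As a warm-up, a standalone look at what `nn.Embedding` does to the input shape (the vocabulary size and dimensions here are arbitrary):

```python
import torch
import torch.nn as nn

embedding = nn.Embedding(num_embeddings=1000, embedding_dim=100)  # vocab size 1000, 100-d vectors
x = torch.randint(0, 1000, (2, 20))  # [batch_size=2, max_len=20] word indices
out = embedding(x)
print(out.shape)                     # torch.Size([2, 20, 100])
```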
```python
"""
Define the model.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F

from lib import ws, max_len
from dataset import get_data


class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        # One 100-dimensional embedding vector per word in the vocabulary
        self.embedding = nn.Embedding(len(ws), 100)
        self.fc = nn.Linear(100 * max_len, 2)

    def forward(self, input):
        """
        :param input: [batch_size, max_len]
        :return:
        """
        x = self.embedding(input)        # [batch_size, max_len, 100]
        x = x.view([-1, 100 * max_len])  # flatten to [batch_size, max_len * 100]
        output = self.fc(x)
        return F.log_softmax(output, dim=-1)


model = MyModel()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def train(epoch):
    for idx, (input, target) in enumerate(get_data(train=True)):
        # Zero gradients, forward pass, NLL loss, backward pass, parameter update
        optimizer.zero_grad()
        output = model(input)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        print(loss.item())


if __name__ == '__main__':
    for i in range(1):
        train(epoch=i)
```
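Note that the `collate_fn` defined in the dataset code returns raw token lists, while `MyModel.forward` expects a `LongTensor` of word indices with shape `[batch_size, max_len]`. One way to bridge the gap, sketched here using `ws` and `max_len` from `lib`, is to serialize inside `collate_fn`:

```python
import torch
from lib import ws, max_len

def collate_fn(batch):
    """
    :param batch: ([tokens, label], [tokens, label], ...)
    :return: input tensor [batch_size, max_len], target tensor [batch_size]
    """
    content, label = list(zip(*batch))
    # Convert each token list to a fixed-length index sequence, then to tensors
    content = torch.LongTensor([ws.transform(tokens, max_len=max_len) for tokens in content])
    label = torch.LongTensor(label)
    return content, label
```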