🚀 Day04 - NLP项目实战
📖 导读：
第四天，进入项目实战阶段。
🏗️ 项目流程
1. 数据加载
python
import pandas as pd

# Both splits ship as tab-separated files; tell pandas about the delimiter.
train_df = pd.read_csv("train.tsv", sep="\t")
test_df = pd.read_csv("test.tsv", sep="\t")
2. 数据清洗
python
import re

# Compiled once at module load; clean_text is applied per document, so the
# pattern lookup is hoisted out of the hot path.
_NON_WORD_RE = re.compile(r"[^\w\s]")


def clean_text(text):
    """Normalize a raw document for downstream feature extraction.

    Removes every character that is neither a word character nor
    whitespace (punctuation, symbols), then lowercases the result.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercased string.
    """
    return _NON_WORD_RE.sub("", text).lower()
🔧 特征工程
3.1 文本特征
python
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams + bigrams, vocabulary capped at the 10,000 most frequent terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train = tfidf.fit_transform(train_df["text"])
3.2 长度特征
python
# Surface statistics per document: character count and whitespace-token count.
train_df["length"] = [len(doc) for doc in train_df["text"]]
train_df["word_count"] = [len(doc.split()) for doc in train_df["text"]]
🧠 模型构建
4.1 简单RNN
python
class TextRNN(nn.Module):
    """GRU-based binary text classifier.

    Embeds token ids, encodes them with a single-layer GRU, and projects
    the final hidden state to one logit.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a (batch, seq_len) id tensor to a (batch, 1) logit tensor."""
        # hidden is (num_layers, batch, hidden_size); take the last layer.
        _, hidden = self.rnn(self.embedding(x))
        return self.fc(hidden[-1])
4.2 Attention分类
python
class AttentionClassifier(nn.Module):
    """BiLSTM + attention binary classifier.

    Embeds token ids, encodes them with a bidirectional LSTM, pools the
    encoder outputs through an `Attention` module (defined elsewhere in
    the project), and projects the pooled vector to a single logit.

    NOTE(review): the LSTM is not batch_first, so it expects input laid
    out as (seq_len, batch, embed) — unlike TextRNN. Confirm callers feed
    that layout.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        # Both directions are concatenated, hence 2 * hidden_size features.
        self.attention = Attention(hidden_size * 2)
        self.fc = nn.Linear(hidden_size * 2, 1)

    def forward(self, x):
        """Return a logit tensor for the given token ids."""
        encoded, _ = self.lstm(self.embedding(x))
        # Attention presumably returns (context, weights); weights unused.
        context, _ = self.attention(encoded)
        return self.fc(context)
🚀 训练与评估
5.1 训练循环
python
# Standard supervised loop: 10 full passes over the training dataloader.
for _ in range(10):
    model.train()  # enable dropout / batch-norm training behavior
    for batch in dataloader:
        optimizer.zero_grad()  # clear gradients from the previous step
        loss = criterion(model(batch.x), batch.y)
        loss.backward()
        optimizer.step()
5.2 评估
python
from sklearn.metrics import classification_report

import torch

model.eval()  # disable dropout / use running batch-norm statistics
# Inference: no gradient tracking needed.
with torch.no_grad():
    logits = model(test_x)
# BUG FIX: the model emits one raw logit per example, but
# classification_report expects discrete class labels. Convert the logits
# to hard 0/1 predictions by thresholding the sigmoid probability at 0.5.
preds = (torch.sigmoid(logits) > 0.5).long().view(-1)
print(classification_report(test_y, preds))
📝 总结
Day04完成项目实战全流程。