🚀 Day01 - NLP Fundamentals
📖 Overview:
Day 1 of NLP study: the core skills of basic text processing.
🗺️ 1. Text Preprocessing
1.1 jieba Tokenizer Basics
```python
import jieba

# Precise mode (recommended): keeps words as whole as possible
text = "我爱自然语言处理技术"
words = jieba.lcut(text)
print(words)
# ['我', '爱', '自然语言', '处理', '技术']
```
💡 Code notes:
jieba.lcut() returns a plain list (whereas jieba.cut() returns a generator); precise mode tries to keep each word intact, with no overlapping cuts.
```python
# Full mode: emits every dictionary word found anywhere in the sentence
words_full = jieba.lcut(text, cut_all=True)
# ['我', '爱', '自然', '自然语言', '语言', '处理', '技术']

# Search-engine mode: precise mode plus extra cuts of long words
words_search = jieba.lcut_for_search(text)
# ['我', '爱', '自然', '语言', '自然语言', '处理', '技术']
```
1.2 Traditional Chinese and Custom Dictionaries
```python
# Switch to the larger dictionary that covers traditional Chinese
jieba.set_dictionary('dict.txt.big')

# Add individual words
jieba.add_word("传智教育")
jieba.add_word("黑马程序员", freq=10, tag='n')

# Load a whole user dictionary from a file
jieba.load_userdict("my_dict.txt")

text = "传智教育旗下有黑马程序员"
print(jieba.lcut(text))
# ['传智教育', '旗下', '有', '黑马程序员']
```
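For reference, a user-dictionary file such as the my_dict.txt above holds one entry per line: the word, then an optional frequency and an optional POS tag, separated by spaces (the entries below are illustrative):

```text
传智教育 5 nt
黑马程序员 10 n
自然语言处理 3 n
```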
1.3 Part-of-Speech Tagging
```python
import jieba.posseg as pseg

text = "我爱自然语言处理"
result = pseg.lcut(text)
for word, flag in result:
    print(f"{word}: {flag}")
# 我: r (pronoun)
# 爱: v (verb)
# 自然语言: n (noun)
# 处理: vn (verbal noun)
```
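POS tags make it easy to filter by word class; a minimal sketch that keeps only noun-like tokens (tags starting with 'n'):

```python
import jieba.posseg as pseg

# keep tokens whose tag begins with 'n' (n, nr, ns, nt, ...)
nouns = [word for word, flag in pseg.lcut("我爱自然语言处理") if flag.startswith('n')]
print(nouns)  # e.g. ['自然语言']
```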
💻 2. Text Vectorization
2.1 One-Hot Encoding
```python
from tensorflow.keras.preprocessing.text import Tokenizer

vocabs = ["苹果", "香蕉", "橙子", "葡萄"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(vocabs)
print(tokenizer.word_index)
# {'苹果': 1, '香蕉': 2, '橙子': 3, '葡萄': 4}

# Each row marks which words occur in a text; Keras reserves index 0,
# so the matrix has len(word_index) + 1 columns
one_hot = tokenizer.texts_to_matrix(vocabs, mode='binary')
print(one_hot)
```
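The same idea without a framework, as a minimal sketch: map each word to an index and set exactly one position to 1.

```python
vocabs = ["苹果", "香蕉", "橙子", "葡萄"]
word_to_idx = {w: i for i, w in enumerate(vocabs)}

def one_hot(word):
    vec = [0] * len(vocabs)     # all zeros...
    vec[word_to_idx[word]] = 1  # ...except the word's own position
    return vec

print(one_hot("香蕉"))  # [0, 1, 0, 0]
```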
2.2 Word2Vec
```python
import fasttext

# Train word vectors; the fasttext library implements word2vec-style
# CBOW / skip-gram training (plus subword n-grams)
model = fasttext.train_unsupervised(
    'corpus.txt',
    model='cbow',
    dim=100,
    epoch=5
)

# Look up a word vector
vec = model.get_word_vector('苹果')
print(vec.shape)  # (100,)

# Find the most similar words
similar = model.get_nearest_neighbors('苹果')
print(similar)
```
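get_nearest_neighbors ranks words by cosine similarity; a sketch computing it directly (assumes the model trained above):

```python
import numpy as np

def cosine(a, b):
    # cosine similarity: dot product of the normalized vectors
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

v1 = model.get_word_vector('苹果')
v2 = model.get_word_vector('香蕉')
print(cosine(v1, v2))  # in [-1, 1]; closer to 1 means more similar
```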
2.3 Word Embeddings
```python
import torch
import torch.nn as nn

# Trainable lookup table: 10,000 word ids -> 128-dimensional vectors
embedding = nn.Embedding(num_embeddings=10000, embedding_dim=128)
word_indices = torch.tensor([1, 5, 10])
vectors = embedding(word_indices)
print(vectors.shape)  # (3, 128)
```
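Under the hood, nn.Embedding is nothing more than row indexing into a learnable weight matrix, which the sketch below makes explicit:

```python
print(embedding.weight.shape)  # torch.Size([10000, 128])

# looking up id 5 returns row 5 of the weight matrix
row = embedding(torch.tensor(5))
print(torch.allclose(row, embedding.weight[5]))  # True
```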
📊 3. Exploratory Text Analysis
3.1 Label Distribution
```python
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

train_df = pd.read_csv('train.tsv', sep='\t')
print(train_df['label'].value_counts())
sns.countplot(x='label', data=train_df)
plt.show()
```
3.2 Sentence-Length Distribution
```python
# Character length of every sentence
train_df['length'] = train_df['text'].apply(len)
print(train_df['length'].describe())
sns.boxplot(x='label', y='length', data=train_df)
plt.show()
```
3.3 Word Cloud
```python
from wordcloud import WordCloud

# text: the corpus as one big string; a Chinese font (here simhei.ttf)
# is required, otherwise Chinese characters render as empty boxes
wc = WordCloud(font_path='simhei.ttf').generate(text)
plt.imshow(wc)
plt.axis('off')
plt.show()
```
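generate() tokenizes on whitespace and punctuation, which does not segment Chinese; a common pattern (a sketch, assuming the train_df from 3.1) tokenizes with jieba first and feeds word frequencies to the cloud:

```python
from collections import Counter
import jieba

# count jieba tokens over the whole corpus
counter = Counter()
for sentence in train_df['text']:
    counter.update(jieba.lcut(sentence))

wc = WordCloud(font_path='simhei.ttf').generate_from_frequencies(counter)
plt.imshow(wc)
plt.axis('off')
plt.show()
```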
🧠 4. Recurrent Neural Networks
4.1 RNN
```python
import torch
import torch.nn as nn

rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=2, batch_first=True)
x = torch.randn(3, 5, 10)   # (batch, seq_len, input_size)
h0 = torch.zeros(2, 3, 20)  # (num_layers, batch, hidden_size)
output, hn = rnn(x, h0)
print(f"output: {output.shape}")  # (3, 5, 20)
```
4.2 LSTM
```python
# Without batch_first, inputs are (seq_len, batch, input_size)
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2, bidirectional=True)
x = torch.randn(5, 3, 10)   # (seq_len, batch, input_size)
h0 = torch.zeros(4, 3, 20)  # (num_layers * num_directions, batch, hidden_size)
c0 = torch.zeros(4, 3, 20)
output, (hn, cn) = lstm(x, (h0, c0))
print(f"output: {output.shape}")  # (5, 3, 40) -- 40 = 2 directions * hidden_size 20
```
4.3 GRU
```python
# Reuses x and h0 from the LSTM example above
gru = nn.GRU(input_size=10, hidden_size=20, num_layers=2, bidirectional=True)
output, hn = gru(x, h0)
print(f"output: {output.shape}")  # (5, 3, 40)
```
🎯 5. Attention Mechanism
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    """Additive (Bahdanau-style) attention over encoder outputs."""

    def __init__(self, hidden_size):
        super().__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        # hidden: (batch, hidden); encoder_outputs: (seq_len, batch, hidden)
        seq_len = encoder_outputs.size(0)
        hidden = hidden.unsqueeze(0).repeat(seq_len, 1, 1)
        # energy: (seq_len, batch, hidden)
        energy = torch.tanh(self.attn(torch.cat([hidden, encoder_outputs], dim=2)))
        energy = energy.permute(1, 2, 0)                            # (batch, hidden, seq_len)
        v = self.v.repeat(encoder_outputs.size(1), 1).unsqueeze(1)  # (batch, 1, hidden)
        scores = torch.bmm(v, energy).squeeze(1)                    # (batch, seq_len)
        attention_weights = F.softmax(scores, dim=1)
        # context: weighted sum of encoder outputs -> (batch, hidden)
        context = torch.bmm(attention_weights.unsqueeze(1),
                            encoder_outputs.permute(1, 0, 2)).squeeze(1)
        return context, attention_weights
```
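A quick shape check for the class above (seq_len=5, batch=3, hidden_size=20):

```python
attn = Attention(hidden_size=20)
encoder_outputs = torch.randn(5, 3, 20)  # (seq_len, batch, hidden)
decoder_hidden = torch.randn(3, 20)      # (batch, hidden)
context, weights = attn(decoder_hidden, encoder_outputs)
print(context.shape)  # torch.Size([3, 20])
print(weights.shape)  # torch.Size([3, 5])
```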
🔥 6. Transformer
6.1 Input Processing
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
inputs = tokenizer("我爱自然语言处理", return_tensors="pt")
print(inputs)  # dict of input_ids, token_type_ids, attention_mask
```
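The input_ids can be mapped back to tokens; bert-base-chinese tokenizes character by character and wraps the sequence in special tokens:

```python
print(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]))
# ['[CLS]', '我', '爱', '自', '然', '语', '言', '处', '理', '[SEP]']
```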
6.2 A Complete Transformer
```python
# Skeleton only: Encoder and Decoder are assumed to be defined elsewhere
# (token embedding + positional encoding + stacked attention/FFN layers)
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, embed_size=256, num_layers=6, heads=8):
        super().__init__()
        self.encoder = Encoder(embed_size, heads, num_layers)
        self.decoder = Decoder(embed_size, heads, num_layers)
        self.fc_out = nn.Linear(embed_size, trg_vocab)

    def forward(self, src, trg):
        enc_out = self.encoder(src)
        dec_out = self.decoder(trg, enc_out)
        return self.fc_out(dec_out)  # per-position logits over the target vocabulary
```
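PyTorch also ships a ready-made nn.Transformer that bundles the encoder and decoder stacks; a minimal shape sketch:

```python
transformer = nn.Transformer(d_model=256, nhead=8,
                             num_encoder_layers=6, num_decoder_layers=6)
src = torch.randn(10, 32, 256)  # (src_seq_len, batch, d_model)
tgt = torch.randn(20, 32, 256)  # (tgt_seq_len, batch, d_model)
out = transformer(src, tgt)
print(out.shape)  # torch.Size([20, 32, 256])
```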
🚀 7. Transfer Learning
7.1 FastText
```python
import fasttext

# Note: the Python API spells the n-gram option wordNgrams (camelCase)
model = fasttext.train_supervised('train.txt', lr=0.1, epoch=5, wordNgrams=2)
pred, prob = model.predict('这个产品太好了!')
print(pred, prob)
```
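train_supervised expects one sample per line, with the label carrying a __label__ prefix and the text whitespace-tokenized (so Chinese should be pre-segmented, e.g. with jieba). An illustrative train.txt:

```text
__label__positive 这个 产品 太 好 了
__label__negative 质量 很 差 不 推荐
```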
7.2 Hugging Face
```python
from transformers import pipeline

# Without a model argument, pipeline downloads a default English
# sentiment model; pass model=... to use one for another language
classifier = pipeline("sentiment-analysis")
result = classifier("I love this!")
print(result)  # [{'label': 'POSITIVE', 'score': ...}]
```
7.3 Fine-Tuning BERT
```python
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
training_args = TrainingArguments(output_dir="./results", num_train_epochs=3,
                                  per_device_train_batch_size=16)
# train_data: a tokenized dataset (see the sketch below)
trainer = Trainer(model=model, args=training_args, train_dataset=train_data)
trainer.train()
```
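train_data is not built above; Trainer needs a dataset of tokenized examples. A sketch using the datasets library, assuming a hypothetical train.csv with text and label columns:

```python
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
raw = load_dataset('csv', data_files={'train': 'train.csv'})  # hypothetical file

def tokenize(batch):
    # pad/truncate every text to a fixed length so batches stack cleanly
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=128)

train_data = raw['train'].map(tokenize, batched=True)
```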
📝 Summary
Day01 covered:
- jieba word segmentation
- text vectorization
- exploratory text analysis
- RNN / LSTM / GRU
- the attention mechanism
- Transformer basics
- transfer learning