机器学习导论习题解答

在文档聚类中,可以通过考虑上下文来减少词的二义性,例如考虑词对,如"cocktail party"与"party elections"。请讨论如何实现这一方法。

参考实现(Python):
import re
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
# ========== Matplotlib configuration: CJK-capable fonts ==========
# The axis labels further down contain Chinese text. List several CJK-capable
# font families in priority order so they render on Windows (Microsoft YaHei,
# SimHei), macOS (PingFang SC, Arial Unicode MS) and Linux (Noto Sans CJK SC,
# WenQuanYi Micro Hei) instead of only on Windows. DejaVu Sans stays last as
# the generic fallback.
plt.rcParams['font.sans-serif'] = [
    'Microsoft YaHei',
    'SimHei',
    'PingFang SC',
    'Arial Unicode MS',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'DejaVu Sans',
]
# Render the minus sign as an ASCII hyphen so it is not drawn as an empty box
# when the selected font lacks the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
# =================================================================

# ---------------------- 1. Initial configuration and data preparation ----------------------
# Fetch the NLTK resources this script needs (a real download only happens on
# the first run).
import nltk

# Backs stopwords.words('english').
nltk.download('stopwords')
# Tokenizer model used by word_tokenize in older NLTK releases.
nltk.download('punkt')
# Newer NLTK releases make word_tokenize look for 'punkt_tab' instead of
# 'punkt'; without it the first tokenize call raises LookupError. Requesting
# both keeps the script working across NLTK versions.
nltk.download('punkt_tab')

# Sample corpus: every sentence contains the word "party", used in different
# contexts (social gatherings vs. political parties).
documents = [
    "We had a great cocktail party at the beach with friends",
    "The cocktail party had delicious drinks and music",
    "The political party won the elections by a large margin",
    "Party elections are held every four years in the country",
    "I attended a birthday party with family last weekend",
    "The ruling party announced new policies after the elections",
    "Beach party with cocktails is my favorite summer activity",
    "Elections campaign for the party started last month"
]

# English stop-word list as a set, for fast membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

# ---------------------- 2. 文本预处理函数 ----------------------
def preprocess_text(text):
    """
    Normalise a raw document into a list of content-word tokens.

    Steps: lowercase, replace every non-letter character with a space,
    tokenize with NLTK's ``word_tokenize``, then drop English stop words
    and blank tokens.

    Non-letter characters are replaced with a space rather than deleted, so
    words joined by punctuation remain separate tokens ("cocktail-party" ->
    "cocktail", "party") instead of being fused into one unseen token
    ("cocktailparty").

    Args:
        text: Raw document string.

    Returns:
        List of lowercase alphabetic tokens with stop words removed.
    """
    lowered = text.lower()
    # Substitute a space (not the empty string) so punctuation acts as a
    # word boundary; runs of spaces are harmless to the tokenizer.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', lowered)
    return [
        token
        for token in word_tokenize(letters_only)
        if token.strip() and token not in stop_words
    ]

# Run the preprocessing pipeline over the whole corpus (one token list per document).
processed_docs = list(map(preprocess_text, documents))
# The TF-IDF step needs plain strings, so glue each token list back together
# with single spaces.
tfidf_docs = [' '.join(doc_tokens) for doc_tokens in processed_docs]

# ---------------------- 3. Local context: unigram + bigram TF-IDF ----------------------
# ngram_range=(1, 2) puts both single words and adjacent word pairs into the
# vocabulary, so a pair such as "cocktail party" becomes a feature of its own.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# Fit on the cleaned corpus, then convert the result to a dense array so it
# can be concatenated with other dense features later.
tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_docs)
tfidf_features = tfidf_matrix.toarray()
print(f"2-gram TF-IDF特征维度: {tfidf_features.shape}")

# ---------------------- 4. Global semantic context: Word2Vec ----------------------
# Train word embeddings on the tokenised corpus.
#
# seed + workers=1 make training repeatable: with several worker threads the
# update order depends on OS scheduling, so embeddings (and therefore the
# clustering below, which itself is pinned with random_state=42) would change
# from run to run. The corpus is tiny, so a single thread costs nothing.
# NOTE(review): full cross-process reproducibility may additionally require a
# fixed PYTHONHASHSEED — confirm against the gensim documentation.
w2v_model = Word2Vec(
    sentences=processed_docs,  # training corpus: one token list per document
    vector_size=100,           # embedding dimensionality
    window=5,                  # context window (5 words on each side)
    min_count=1,               # keep every word, even if it occurs once
    workers=1,                 # single thread for deterministic training
    seed=42,                   # fixed RNG seed
)

def get_doc_vector(tokens, model, vector_size=100):
    """
    Embed a tokenised document as the mean of its word vectors.

    Args:
        tokens: Iterable of token strings.
        model: Object whose ``wv`` attribute supports ``in`` and ``[]``
            lookups by token (here, a trained Word2Vec model).
        vector_size: Length of the zero vector returned when none of the
            tokens is found in ``model.wv``.

    Returns:
        The element-wise mean of the vectors of all known tokens, or
        ``np.zeros(vector_size)`` when no token is known.
    """
    embeddings = model.wv
    known_vectors = [embeddings[word] for word in tokens if word in embeddings]
    # Guard clause: there is nothing to average, so fall back to all zeros.
    if len(known_vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

# Embed every document and stack the results into a 2-D array
# (one row per document).
doc_embeddings = [get_doc_vector(doc_tokens, w2v_model) for doc_tokens in processed_docs]
w2v_doc_vectors = np.array(doc_embeddings)
print(f"Word2Vec文档向量维度: {w2v_doc_vectors.shape}")

# ---------------------- 5. Feature fusion: merge both context views ----------------------
# Put the TF-IDF columns (local word-pair context) and the Word2Vec columns
# (corpus-level context) side by side; both arrays have one row per document.
# NOTE(review): the two blocks are concatenated without rescaling — confirm
# their magnitudes are comparable, otherwise one block will dominate the
# distances used for clustering.
combined_features = np.concatenate([tfidf_features, w2v_doc_vectors], axis=1)
print(f"融合后特征维度: {combined_features.shape}")

# ---------------------- 6. Document clustering (KMeans) ----------------------
# Two clusters are expected for the sample corpus: social gathering vs. political party.
n_clusters = 2
# n_init is set explicitly because its default differs between scikit-learn
# releases (and older ones emit a FutureWarning when it is omitted); ten
# restarts keep the result stable regardless of the installed version.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# Fit the model and get one cluster id per document.
cluster_labels = kmeans.fit_predict(combined_features)

# Report the assignment of every document.
print("\n=== 聚类结果 ===")
for doc_number, (doc, label) in enumerate(zip(documents, cluster_labels), start=1):
    # Only append an ellipsis when the text was actually cut at 50 characters.
    preview = doc if len(doc) <= 50 else f"{doc[:50]}..."
    print(f"文档{doc_number}: {preview} | 聚类标签: {label}")

# Silhouette coefficient: closer to 1 means tighter, better-separated clusters.
sil_score = silhouette_score(combined_features, cluster_labels)
print(f"\n聚类轮廓系数: {sil_score:.4f}")

# ---------------------- 7. Visualise the clustering (PCA projection) ----------------------
# Project the fused features onto two principal components for a 2-D scatter plot.
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(combined_features)

plt.figure(figsize=(10, 6))
colors = ['red', 'blue']
# KMeans numbers its clusters arbitrarily, so a fixed mapping such as
# "cluster 0 = social gathering, cluster 1 = political party" can attach the
# wrong meaning to a group. Use neutral names in the legend; the Doc
# annotations below show which documents each cluster contains.
labels = [f'Cluster {i}' for i in range(n_clusters)]
for i in range(n_clusters):
    mask = cluster_labels == i
    plt.scatter(
        reduced_features[mask, 0],
        reduced_features[mask, 1],
        # Cycle through the palette so a larger n_clusters cannot raise IndexError.
        c=colors[i % len(colors)],
        label=labels[i],
        alpha=0.7,
    )

# Tag every point with its 1-based document number.
for i, (x, y) in enumerate(reduced_features):
    plt.annotate(f"Doc{i+1}", (x, y), fontsize=8)

plt.title('Document Clustering Results (Reducing "party" Ambiguity with Context)')

plt.xlabel('PCA维度1')
plt.ylabel('PCA维度2')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# ---------------------- 8. Sanity check: "party" relative to its two context words ----------------------
print("\n=== 验证party的上下文向量差异 ===")
# The same "party" vector is used in both expressions; only the subtracted
# context word differs.
party_vector = w2v_model.wv['party']
party_cocktail = party_vector - w2v_model.wv['cocktail']
party_elections = party_vector - w2v_model.wv['elections']
# Cosine similarity of the two difference vectors (lower = more dissimilar).
# NOTE(review): the message below says "party+cocktail" / "party+elections"
# while the vectors are differences — confirm which was intended.
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity([party_cocktail], [party_elections])
similarity = similarity_matrix[0][0]
print(f"'party+cocktail'与'party+elections'的向量相似度: {similarity:.4f}")
相关推荐
星辰AI9 小时前
大模型对抗攻击与防御:保护 AI 系统安全
人工智能·ai·语言模型
weixin_550083159 小时前
PyTorch 实战:从零搭建手写数字识别系统(CNN 卷积神经网络)
人工智能·pytorch·cnn
Night_Elf9 小时前
AES-256加密+本地存储:国内本地密码管理器如何使用
人工智能·自动化
秋99 小时前
window中部署小龙虾OpenClaw
人工智能
星辰AI9 小时前
Stable Diffusion 实战教程:从安装到图像生成
人工智能·ai·语言模型
用户5191495848459 小时前
WordPress WPMasterToolkit 插件漏洞检测与利用工具
人工智能·aigc
AI医影跨模态组学9 小时前
Radiol Artif Intell 中山大学肿瘤防治中心放疗科:基于连续MRI的深度学习模型预测局部晚期鼻咽癌患者生存期
人工智能·深度学习·论文·医学·医学影像·影像组学
金智维科技官方9 小时前
圆桌对话:从流程自动化到智能流程,AI落地的下一站在哪里?
大数据·人工智能·ai·自动化·智能体
码字小学妹9 小时前
Google I/O 2026:Gemini 3.5 Flash 发布 + 国内 API 接入教程
人工智能
yezannnnnn9 小时前
Claude code 5 小时额度卡住?多账户错峰激活让你一天平滑使用不断额
人工智能·claude·vibecoding