# Language: Python (pasted from a chat code block; the "copy code" UI label that followed was not code).
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
# Load the competition training set.
file_path = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
df_train = pd.read_csv(file_path)

# Missing misconceptions get the explicit placeholder label 'NA' so they can
# be label-encoded like any other class.
df_train['Misconception'] = df_train['Misconception'].fillna('NA')

# Build one text field per row. Missing pieces are replaced by '' first:
# `str + NaN` evaluates to NaN, which would make the whole combined text
# missing and break text vectorization further down.
train_text = df_train[['QuestionText', 'MC_Answer', 'StudentExplanation']].fillna('').astype(str)
df_train['CombinedText'] = (
    train_text['QuestionText'] + " " + train_text['MC_Answer'] + " " + train_text['StudentExplanation']
)
# --- 1.1. Load GloVe word vectors ---
# Builds `word_embeddings`, a dict mapping each token to a float32 vector of
# length `embedding_dim`. If the GloVe file is absent, a single all-zero
# "dummy" entry is used so the rest of the script can still run.
print("--- 正在加载 GloVe 词向量 ---")
glove_file_path = '/kaggle/input/dataword/glove.6B.100d.txt'
word_embeddings = {}
embedding_dim = 100
try:
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Expect exactly one token followed by `embedding_dim` numbers.
            # Blank lines (which would fail on values[0]) and rows with a
            # different field count (e.g. a token containing whitespace, or
            # a file of another dimensionality) are skipped so every stored
            # vector has the same length and can be averaged later.
            if len(values) != embedding_dim + 1:
                continue
            word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    print(f"已加载 {len(word_embeddings)} 个词的 GloVe 词向量 (维度: {embedding_dim})。")
except FileNotFoundError:
    print(f"错误: GloVe 文件 '{glove_file_path}' 未找到。请确保文件已上传或路径正确。")
    print("将跳过词嵌入,使用一个简化的特征提取器进行演示。")
    # Fallback: one zero vector with the same dtype as real GloVe vectors.
    word_embeddings = {"dummy": np.zeros(embedding_dim, dtype='float32')}
# --- 1.2. Word-embedding feature extractor ---
def get_embedding_features(texts, word_embeddings, embedding_dim):
    """Convert a list of texts into a matrix of averaged word embeddings.

    Each text is lower-cased and split on whitespace; the row for that text
    is the mean of the vectors of all tokens found in ``word_embeddings``.

    Args:
        texts: Sized sequence of texts. Entries that are ``None`` or a float
            NaN are treated as empty text; other non-string entries are
            converted with ``str()`` before tokenizing.
        word_embeddings: Mapping from token to a 1-D vector of length
            ``embedding_dim``.
        embedding_dim: Length of every embedding vector.

    Returns:
        A NumPy array of shape ``(len(texts), embedding_dim)``. Rows for
        texts with no known token are left as zeros.
    """
    features_matrix = np.zeros((len(texts), embedding_dim))
    for i, text in enumerate(texts):
        if isinstance(text, str):
            tokens = text.lower().split()
        elif text is None or (isinstance(text, float) and text != text):
            # Missing value (None / NaN): no tokens, the row stays zero.
            tokens = []
        else:
            tokens = str(text).lower().split()

        word_vectors = [word_embeddings[tok] for tok in tokens if tok in word_embeddings]
        if word_vectors:
            features_matrix[i] = np.mean(word_vectors, axis=0)
    return features_matrix
# --- 1.3. Extract TF-IDF and word-embedding features and concatenate them ---
print("--- 正在使用 TF-IDF 和词嵌入提取文本特征并拼接 ---")
# Both extractors read the same string-typed view of the combined text.
train_texts = df_train['CombinedText'].astype(str)
tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_embeddings = get_embedding_features(train_texts.tolist(), word_embeddings, embedding_dim)
# Dense float32 matrix: TF-IDF columns first, then the averaged embedding columns.
X_text = np.hstack((X_tfidf.toarray().astype(np.float32), X_embeddings.astype(np.float32)))
print(f"组合后的文本特征矩阵形状: {X_text.shape}")
# Feed the 2-D feature matrix to the model directly. It cannot be stored in
# the single 'CombinedText' column (one column cannot hold a multi-column
# matrix), so the text column is left as it is.
X = X_text
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Replace the 'Category' labels with integer codes; the fitted encoder is
# kept so the codes can be mapped back to the original labels.
le_category = LabelEncoder()
df_train['Category'] = le_category.fit_transform(df_train['Category'])

# Same treatment for 'Misconception'.
le_misconception = LabelEncoder()
df_train['Misconception'] = le_misconception.fit_transform(df_train['Misconception'])

# Two targets per row, in this column order: Category, Misconception.
Y = df_train[['Category', 'Misconception']]

# One random forest per target column, trained on the shared features X.
classifier = MultiOutputClassifier(RandomForestClassifier())
classifier.fit(X, Y)
# --- Build test features with the transformers fitted on the training data, then predict ---
testfile_path = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'
df_test = pd.read_csv(testfile_path)

# Same text construction as for training; missing pieces become '' so the
# combined text is never NaN.
test_text = df_test[['QuestionText', 'MC_Answer', 'StudentExplanation']].fillna('').astype(str)
df_test['CombinedText'] = (
    test_text['QuestionText'] + " " + test_text['MC_Answer'] + " " + test_text['StudentExplanation']
)

# Reuse the vectorizer fitted on the training text (transform only). Fitting
# a new vectorizer on the test set would produce a different vocabulary, so
# the columns would no longer match the features the classifier was trained on.
X_tfidf_test = tfidf_vectorizer.transform(df_test['CombinedText'])
X_embeddings_test = get_embedding_features(df_test['CombinedText'].tolist(), word_embeddings, embedding_dim)
X_test = np.hstack((X_tfidf_test.toarray().astype(np.float32), X_embeddings_test.astype(np.float32)))
print(f"组合后的文本特征矩阵形状: {X_test.shape}")

# Pass the 2-D feature matrix directly; it cannot be stored in one DataFrame column.
# Column 0 of `predictions` is the encoded Category and column 1 the encoded
# Misconception; le_category / le_misconception .inverse_transform recover the labels.
predictions = classifier.predict(X_test)
predictions