最高准确率 0.69 auc 0.758
加特征过程:
| 特征 | 准确率 | auc |
|---------------------------------|-------|-------|
| age、category(gender,occupation) | 0.56 | 0.58 |
| cat(age,gender,occupation) | 0.576 | 0.593 |
| add all feature | 0.598 | 0.627 |
| add 发布时间(month) | 0.615 | 0.653 |
| add cat(发布时间) | 0.625 | 0.662 |
| add title | 0.63 | 0.68 |
| add 人均、物均打分 | 0.68 | 0.75 |
| add 年龄+性别+职业两两组合 | 0.69 | 0.758 |
import sys
import re
import nltk
# Show the directories NLTK searches for its data packages, which helps
# diagnose a missing tokenizer or stop-word corpus before it is needed below.
print(nltk.data.path)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score ,accuracy_score,classification_report
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# Marker printed once all imports above have completed.
print("..")
_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a set, loading the corpus once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_title(title):
    """Reduce a movie title to a space-separated string of lowercase keywords.

    Steps: drop a parenthesised run of digits (e.g. "(1995)"), remove every
    character that is not an ASCII letter or whitespace, lowercase, tokenize
    with NLTK and discard English stop words.

    Args:
        title: The raw title string.

    Returns:
        The remaining tokens joined by single spaces (possibly empty).
    """
    # Remove a parenthesised year such as "(1995)".
    title = re.sub(r'\(\d+\)', '', title)
    # Keep letters and whitespace only.
    title = re.sub(r'[^a-zA-Z\s]', '', title)
    tokens = word_tokenize(title.lower())
    # The stop-word set is cached: this function is applied once per row, and
    # re-reading the corpus on every call dominated its cost.
    stop_words = _english_stop_words()
    return ' '.join(token for token in tokens if token not in stop_words)
# Read the three ml-100k tables. None of the files has a header row, so the
# column names are supplied explicitly.
u_data = pd.read_csv(
    "d://d/ml-100k/u.data",
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)
u_user = pd.read_csv(
    "d://d/ml-100k/u.user",
    sep='|',
    names=['user_id', 'age', 'gender', 'occupation', 'zip code'],
)
u_item = pd.read_csv(
    "d://d/ml-100k/u.item",
    encoding='latin-1',
    sep='|',
    names=[
        'item_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
        'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ],
)

# One row per rating, enriched with the rater's profile and the item's metadata.
data = u_data.merge(u_user, on='user_id').merge(u_item, on='item_id')

# Replace the textual gender / occupation values with integer codes.
for tag in ('gender', 'occupation'):
    codes, _ = pd.factorize(data[tag])
    data[tag] = codes
# Cleaned title text, later turned into TF-IDF features.
data['title_processed'] = data['title'].apply(preprocess_title)

# Mean rating per user and per item, truncated to an integer bucket.
# NOTE: these means are taken over every row of `data`, i.e. before any
# train/test split, so each row's own rating contributes to its feature.
data['user_avg_rating'] = data.groupby('user_id')['rating'].transform('mean').astype(int)
data['item_avg_rating'] = data.groupby('item_id')['rating'].transform('mean').astype(int)

# NOTE(review): true division keeps one distinct value per age; floor division
# would be needed for 5-year buckets -- confirm the intent.
data['age2'] = data['age'] / 5

# Pairwise cross features. These columns are one-hot encoded downstream, so
# each must identify a distinct (left, right) pair. The previous product of
# the two codes did not: any pair containing code 0 collapsed to 0 and
# unrelated pairs collided (4 * 6 == 8 * 3). ngroup() assigns one integer id
# per distinct pair instead.
_cross_features = {
    'age_gender': ['age2', 'gender'],
    'gender_occupation': ['gender', 'occupation'],
    'age_occupation': ['age2', 'occupation'],
}
for _cross_name, _cross_columns in _cross_features.items():
    data[_cross_name] = data.groupby(_cross_columns, sort=False).ngroup()
# TF-IDF vectorizer with the vocabulary capped at 100 terms to keep the
# feature space small.
tfidf = TfidfVectorizer(max_features=100)

# Fit on the cleaned titles; the result is a sparse row-per-rating matrix.
title_tfidf = tfidf.fit_transform(data['title_processed'])

# Densify into a DataFrame that shares data's index so it can be concatenated
# column-wise with the other features.
title_columns = ['title_' + str(i) for i in range(title_tfidf.shape[1])]
title_features = pd.DataFrame(
    title_tfidf.toarray(),
    index=data.index,
    columns=title_columns,
)
# Age of the film, in 30-day "months", at the moment it was rated.
#
# Computed with vectorized pandas operations: the result keeps data's index
# (no reliance on a fresh 0..n-1 index lining up), and the deprecated
# datetime.utcfromtimestamp is no longer used.
_raw_release = data['release_date']
# Entries without a '-' (e.g. missing values) are treated as unknown dates.
_has_date = _raw_release.astype(str).str.contains('-', regex=False)
release_date = pd.to_datetime(_raw_release.where(_has_date), format="%d-%b-%Y")
# Unknown release dates are imputed with the mean release date.
release_date = release_date.fillna(release_date.mean())

# Rating timestamps are seconds since the epoch; unit='s' yields naive UTC
# datetimes, matching what utcfromtimestamp produced.
cur_date = pd.to_datetime(data['timestamp'], unit='s')

diff_mons = ((cur_date - release_date).dt.days / 30).abs()
print(diff_mons.value_counts(normalize=False))
data['diff_mons'] = diff_mons.astype(int)
# Assemble the feature matrix: demographics, genre flags, engineered columns
# and finally the TF-IDF title columns.
demographic_columns = ['age', 'gender', 'occupation']
genre_columns = [
    'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
engineered_columns = [
    'diff_mons', 'user_avg_rating', 'item_avg_rating',
    'age_gender', 'gender_occupation', 'age_occupation',
]
X = pd.concat(
    [data[demographic_columns + genre_columns + engineered_columns], title_features],
    axis=1,
)

# Binary target: 1 when the rating is 4 or higher, otherwise 0.
Y = (data['rating'] >= 4).astype(int)

# Sanity checks before training.
print(data['age'].value_counts(normalize=False))
print(X.isnull().sum())  # missing values per column
print(X.describe())  # distribution of each feature (e.g. is the max age plausible)
def _train_only_avg_rating(key, train_rows, rows, fallback):
    """Integer-truncated mean training rating per `key`, looked up for `rows`.

    Keys that never occur in `train_rows` fall back to `fallback`.
    """
    per_key_mean = train_rows.groupby(key)['rating'].mean()
    return rows[key].map(per_key_mean).fillna(fallback).astype(int)


x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=40)

# The average-rating columns in X were computed over every row, held-out rows
# included, so the test labels leaked into the test features and inflated the
# reported metrics. Recompute both columns from the training rows only.
x_train = x_train.copy()
x_test = x_test.copy()
_train_rows = data.loc[x_train.index, ['user_id', 'item_id', 'rating']]
_test_rows = data.loc[x_test.index, ['user_id', 'item_id']]
_fallback_rating = _train_rows['rating'].mean()
for _key, _column in (('user_id', 'user_avg_rating'), ('item_id', 'item_avg_rating')):
    x_train[_column] = _train_only_avg_rating(_key, _train_rows, _train_rows, _fallback_rating)
    x_test[_column] = _train_only_avg_rating(_key, _train_rows, _test_rows, _fallback_rating)

# One-hot encode the categorical / bucketed columns; every other column
# (genre flags, TF-IDF values) is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(handle_unknown='ignore'),
            [
                'gender', 'occupation', 'age', 'diff_mons', 'user_avg_rating',
                'item_avg_rating', 'age_gender', 'gender_occupation', 'age_occupation',
            ],
        ),
    ],
    remainder='passthrough',
)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])
pipeline.fit(x_train, y_train)

y_pred = pipeline.predict(x_test)
y_prob = pipeline.predict_proba(x_test)[:, 1]  # probability of the positive class

print('准确率是:', accuracy_score(y_true=y_test, y_pred=y_pred))
print('auc是', roc_auc_score(y_true=y_test, y_score=y_prob))
print('分类报告是:', classification_report(y_true=y_test, y_pred=y_pred))