基于MATLAB实现协同过滤电影推荐系统,包含用户-用户和物品-物品两种协同过滤方法,并整合了数据预处理、模型训练和推荐生成全流程:
一、核心代码实现
matlab
%% 数据加载与预处理
function [Y, R, num_users, num_movies] = load_data()
%LOAD_DATA Load the rating data stored in ex8_movies.mat.
%   [Y, R, num_users, num_movies] = LOAD_DATA() reads the variables Y and R
%   from ex8_movies.mat (expected to hold the MovieLens 100k ratings) and
%   returns:
%     Y          - rating matrix as double, one row per movie and one
%                  column per user (num_movies x num_users)
%     R          - indicator matrix as double, same size as Y;
%                  R(i,j) == 1 marks an entry that has been rated
%     num_users  - number of columns of Y
%     num_movies - number of rows of Y
%
%   Mean normalisation is intentionally NOT done here: its results are not
%   part of this function's outputs, so callers that need them should call
%   normalizeRatings(Y, R) themselves.

    % Load into a struct so no unexpected variables from the MAT-file
    % leak into (or shadow names in) this function's workspace.
    data = load('ex8_movies.mat');

    Y = double(data.Y);
    R = double(data.R);

    % Rows are movies, columns are users.
    [num_movies, num_users] = size(Y);
end
function [Ynorm, Ymean] = normalizeRatings(Y, R)
%NORMALIZERATINGS Subtract each row's mean rating from its rated entries.
%   [Ynorm, Ymean] = NORMALIZERATINGS(Y, R)
%     Y     - rating matrix
%     R     - indicator matrix of the same size; R(i,j) == 1 marks a
%             rated entry
%     Ynorm - same size as Y; rated entries hold Y(i,j) - Ymean(i),
%             every other entry is 0
%     Ymean - column vector with one entry per row of Y: the mean of that
%             row's rated entries, or 0 when the row has no rated entry

    m = size(Y, 1);
    Ymean = zeros(m, 1);
    Ynorm = zeros(size(Y));

    for i = 1:m
        idx = R(i, :) == 1;
        % A row without any rating would give mean([]) = NaN, which then
        % propagates into every prediction that adds Ymean back. Leave
        % such rows at their zero defaults instead.
        if any(idx)
            Ymean(i) = mean(Y(i, idx));
            Ynorm(i, idx) = Y(i, idx) - Ymean(i);
        end
    end
end
二、相似度计算模块
1. 用户-用户相似度(余弦相似度)
matlab
function W = user_similarity(Y, num_users, R)
%USER_SIMILARITY Pairwise cosine similarity between users.
%   W = USER_SIMILARITY(Y, num_users)
%   W = USER_SIMILARITY(Y, num_users, R)
%     Y         - rating matrix, one row per movie and one column per user
%     num_users - number of users (columns of Y) to compare
%     R         - optional indicator matrix, same size as Y, non-zero
%                 where a rating exists. When omitted, every non-zero
%                 entry of Y is treated as a rating.
%     W         - num_users x num_users symmetric matrix. W(i,j) is the
%                 cosine similarity of users i and j computed over the
%                 movies both have rated; it is 0 when they share no
%                 movie, and the diagonal is left at 0.

    % The indicator matrix was previously read from an undefined variable;
    % accept it as an optional argument and derive it from Y otherwise.
    if nargin < 3 || isempty(R)
        R = (Y ~= 0);
    end
    rated = (R ~= 0);

    W = zeros(num_users, num_users);
    for i = 1:num_users
        for j = i+1:num_users
            % Movies rated by both users (users are columns of Y).
            common = rated(:, i) & rated(:, j);
            if any(common)
                a = Y(common, i);
                b = Y(common, j);
                denom = sqrt(sum(a.^2)) * sqrt(sum(b.^2));
                % Guard against 0/0 when all shared ratings are zero.
                if denom > 0
                    W(i, j) = sum(a .* b) / denom;
                    W(j, i) = W(i, j);
                end
            end
        end
    end
end
2. 物品-物品相似度(皮尔逊相关系数)
matlab
function sim = item_similarity(Y, R)
%ITEM_SIMILARITY Pairwise Pearson correlation between items (movies).
%   sim = ITEM_SIMILARITY(Y, R)
%     Y   - rating matrix, one row per movie and one column per user
%     R   - indicator matrix, same size as Y, non-zero where rated
%     sim - num_movies x num_movies symmetric matrix. sim(i,j) is the
%           Pearson correlation of movies i and j over the users who rated
%           both. It is 0 when 10 or fewer users rated both, or when
%           either movie's shared ratings have zero variance. The diagonal
%           is left at 0.

    num_items = size(Y, 1);
    rated = (R ~= 0);
    min_common = 10;   % pairs need MORE than this many shared raters

    % Item-item similarity is square in the number of items.
    sim = zeros(num_items, num_items);

    for i = 1:num_items
        for j = i+1:num_items
            % Users who rated both movies (movies are rows of Y).
            idx = rated(i, :) & rated(j, :);
            if sum(idx) > min_common
                a = Y(i, idx) - mean(Y(i, idx));
                b = Y(j, idx) - mean(Y(j, idx));
                denom = sqrt(sum(a.^2) * sum(b.^2));
                % Constant rating vectors have undefined correlation;
                % keep the similarity at 0 rather than storing NaN.
                if denom > 0
                    sim(i, j) = sum(a .* b) / denom;
                    sim(j, i) = sim(i, j);
                end
            end
        end
    end
end
三、预测与推荐生成
1. 用户-用户协同过滤
matlab
function predictions = user_based_cf(Y, W, user_idx, K, R)
%USER_BASED_CF Predict a user's unrated movies from the K most similar users.
%   predictions = USER_BASED_CF(Y, W, user_idx, K)
%   predictions = USER_BASED_CF(Y, W, user_idx, K, R)
%     Y           - rating matrix, one row per movie, one column per user
%     W           - user-user similarity matrix (num_users x num_users)
%     user_idx    - column index of the target user
%     K           - number of nearest neighbours to use
%     R           - optional indicator matrix, same size as Y, non-zero
%                   where rated. When omitted, non-zero entries of Y are
%                   treated as ratings.
%     predictions - num_movies x 1 vector. For movies the target user has
%                   not rated it holds the similarity-weighted average of
%                   the ratings given by those neighbours who rated the
%                   movie; it is 0 for movies the user already rated and
%                   for movies no usable neighbour has rated.

    % The indicator matrix was previously read from an undefined variable.
    if nargin < 5 || isempty(R)
        R = (Y ~= 0);
    end
    rated = (R ~= 0);
    num_movies = size(Y, 1);

    % Exclude the target user explicitly: the diagonal of W is not
    % guaranteed to be the largest entry, so "skip the first sorted
    % element" could drop the best neighbour instead of the user itself.
    w = W(user_idx, :);
    w(user_idx) = -Inf;
    [~, order] = sort(w, 'descend');
    K = max(0, min(K, numel(order) - 1));
    neighbors = order(1:K);
    w_nb = W(user_idx, neighbors);          % 1 x K similarities

    predictions = zeros(num_movies, 1);
    for i = 1:num_movies
        if ~rated(i, user_idx)
            % Only neighbours who actually rated movie i may vote;
            % otherwise missing ratings would count as rating 0.
            voters = rated(i, neighbors);
            denom = sum(abs(w_nb(voters)));
            if denom > 0
                predictions(i) = sum(w_nb(voters) .* Y(i, neighbors(voters))) / denom;
            end
        end
    end
end
2. 物品-物品协同过滤
matlab
function predictions = item_based_cf(Y, sim, user_idx, K, R)
%ITEM_BASED_CF Predict a user's unrated movies from similar rated movies.
%   predictions = ITEM_BASED_CF(Y, sim, user_idx, K)
%   predictions = ITEM_BASED_CF(Y, sim, user_idx, K, R)
%     Y           - rating matrix, one row per movie, one column per user
%     sim         - item-item similarity matrix (num_movies x num_movies)
%     user_idx    - column index of the target user
%     K           - number of most similar rated movies to use
%     R           - optional indicator matrix, same size as Y, non-zero
%                   where rated. When omitted, non-zero entries of Y are
%                   treated as ratings.
%     predictions - num_movies x 1 vector. For each movie the user has not
%                   rated, it is the similarity-weighted average of the
%                   user's own ratings on the (up to) K rated movies most
%                   similar to it, using only positive similarities. It is
%                   0 for movies already rated and for movies with no
%                   positively similar rated movie.

    % The indicator matrix was previously read from an undefined variable.
    if nargin < 5 || isempty(R)
        R = (Y ~= 0);
    end
    num_movies = size(Y, 1);
    predictions = zeros(num_movies, 1);

    rated = (R(:, user_idx) ~= 0);
    rated_movies = find(rated);
    if isempty(rated_movies)
        return;   % nothing to base a prediction on
    end
    user_ratings = Y(rated_movies, user_idx);
    k = max(0, min(K, numel(rated_movies)));

    for i = 1:num_movies
        if ~rated(i)
            % Rank only the movies this user rated: unrated movies carry
            % no rating to average.
            s = sim(i, rated_movies);
            [~, order] = sort(s, 'descend');
            top = order(1:k);

            % Negative correlations are dropped so the result stays a
            % convex combination of the user's actual ratings.
            keep = s(top) > 0;
            weights = s(top(keep));
            denom = sum(weights);
            if denom > 0
                r = user_ratings(top(keep));
                predictions(i) = (weights(:)' * r(:)) / denom;
            end
        end
    end
end
四、模型训练与评估
1. 正则化代价函数(带梯度计算)
matlab
function [J, grad] = cofiCostFunc(params, Y, R, num_users, num_movies, num_features, lambda)
%COFICOSTFUNC Regularised collaborative-filtering cost and its gradient.
%   [J, grad] = COFICOSTFUNC(params, Y, R, num_users, num_movies, ...
%                            num_features, lambda)
%     params       - column vector [X(:); Theta(:)]
%     Y            - rating matrix (num_movies x num_users)
%     R            - indicator matrix, same size as Y; only entries with
%                    R == 1 contribute to the error term
%     num_users    - number of rows of Theta
%     num_movies   - number of rows of X
%     num_features - number of columns of X and Theta
%     lambda       - regularisation strength
%     J            - scalar cost: squared error over rated entries scaled
%                    by 1/(2*size(Y,1)), plus (lambda/2) times the squared
%                    entries of X and Theta
%     grad         - gradient of J, laid out like params:
%                    [X_grad(:); Theta_grad(:)]

    split = num_movies * num_features;
    X = reshape(params(1:split), num_movies, num_features);
    Theta = reshape(params(split+1:end), num_users, num_features);

    scale = size(Y, 1);

    % Prediction error on rated entries only (num_movies x num_users).
    E = (X * Theta' - Y) .* R;

    J = 0.5 * sum(sum(E.^2)) / scale + ...
        (lambda/2) * sum(sum(Theta.^2)) + ...
        (lambda/2) * sum(sum(X.^2));

    % dJ/dX is (movies x users)*(users x features); dJ/dTheta is
    % (users x movies)*(movies x features). The transposes were previously
    % on the wrong factors, which does not even conform dimensionally.
    X_grad = (E * Theta) / scale + lambda * X;
    Theta_grad = (E' * X) / scale + lambda * Theta;

    grad = [X_grad(:); Theta_grad(:)];
end
2. 模型训练(使用fmincg优化器)
matlab
function [X, Theta] = train_model(Ynorm, R, num_features, lambda)
%TRAIN_MODEL Fit the factor matrices X and Theta by minimising cofiCostFunc.
%   [X, Theta] = TRAIN_MODEL(Ynorm, R, num_features, lambda)
%     Ynorm        - rating matrix to fit; its rows are taken as movies
%                    and its columns as users
%     R            - indicator matrix passed through to cofiCostFunc
%     num_features - number of columns of X and Theta
%     lambda       - regularisation strength passed to cofiCostFunc
%     X            - size(Ynorm,1) x num_features matrix
%     Theta        - size(Ynorm,2) x num_features matrix
%
%   Both matrices start from standard-normal random values and are
%   optimised jointly with fmincg (defined outside this function) using
%   the options 'GradObj' = 'on' and 'MaxIter' = 100.

    [nMovies, nUsers] = size(Ynorm);

    % Random starting point; X is drawn before Theta.
    X = randn(nMovies, num_features);
    Theta = randn(nUsers, num_features);
    startPoint = [X(:); Theta(:)];

    % Objective over the single stacked parameter vector.
    objective = @(t)(cofiCostFunc(t, Ynorm, R, nUsers, nMovies, ...
        num_features, lambda));

    opts = optimset('GradObj', 'on', 'MaxIter', 100);
    [fitted, ~] = fmincg(objective, startPoint, opts);

    % Unstack: the first nMovies*num_features values belong to X.
    boundary = nMovies * num_features;
    X = reshape(fitted(1:boundary), nMovies, num_features);
    Theta = reshape(fitted(boundary+1:end), nUsers, num_features);
end
五、完整推荐流程
matlab
%% Main program flow
[Y, R, num_users, num_movies] = load_data();
% Capture Ymean as well: it is needed below to undo the normalisation.
[Ynorm, Ymean] = normalizeRatings(Y, R);
% A row without ratings may yield a NaN mean; treat it as 0 so it cannot
% poison the predictions or the sort order.
Ymean(isnan(Ymean)) = 0;

% User-user collaborative filtering
W = user_similarity(Y, num_users);
user_idx = 1;   % recommend for user 1
K = 20;         % number of nearest neighbours
user_preds = user_based_cf(Y, W, user_idx, K);

% Item-item collaborative filtering
sim = item_similarity(Y, R);
item_preds = item_based_cf(Y, sim, user_idx, K);

% Model training (learned latent features)
lambda = 10;
num_features = 10;
[X, Theta] = train_model(Ynorm, R, num_features, lambda);
% X*Theta' is num_movies x num_users; Ymean (num_movies x 1) is added to
% every column.
learned_ratings = X * Theta' + Ymean;

% Build the recommendation list. Users are COLUMNS of learned_ratings, so
% the target user's predictions are one column, not one row.
[~, sorted_idx] = sort(learned_ratings(:, user_idx), 'descend');
num_top = min(10, numel(sorted_idx));
top_movies = sorted_idx(1:num_top);

% Show the recommendations
movieList = loadMovieList();
disp('Top 10 Recommendations:');
for i = 1:num_top
    fprintf('%d. %s (Predicted Rating: %.1f)\n', i, movieList{top_movies(i)}, ...
        learned_ratings(top_movies(i), user_idx));
end
参考代码 电影推荐系统,运用协同过滤算法 www.youwenfan.com/contentcso/96071.html
六、性能评估
matlab
%% 交叉验证评估
function rmse = evaluate_model(Y, R, num_users, num_movies, num_features, lambda)
%EVALUATE_MODEL 5-fold cross-validated RMSE of the factorisation model.
%   rmse = EVALUATE_MODEL(Y, R, num_users, num_movies, num_features, lambda)
%     Y            - rating matrix (movies x users)
%     R            - indicator matrix, same size as Y, 1 where rated
%     num_users    - unused; kept so existing callers keep working
%     num_movies   - unused; kept so existing callers keep working
%     num_features - number of latent features passed to train_model
%     lambda       - regularisation strength passed to train_model
%     rmse         - mean over the folds of the root-mean-square error on
%                    that fold's held-out ratings
%
%   The folds partition the individual observed ratings, not whole users:
%   a user whose entire column is held out has no learned parameters, so
%   nothing could be predicted for them. Held-out ratings are hidden from
%   training by clearing their entries in the indicator matrix. Y is
%   passed to train_model as given (no mean normalisation is applied).

    num_folds = 5;
    observed = find(R == 1);              % linear indices of all ratings
    cv = cvpartition(numel(observed), 'KFold', num_folds);

    rmse = 0;
    for i = 1:cv.NumTestSets
        % Linear indices into Y/R of this fold's held-out ratings.
        test_entries = observed(cv.test(i));

        % Hide them from training.
        R_train = R;
        R_train(test_entries) = 0;

        [X, Theta] = train_model(Y, R_train, num_features, lambda);

        % Full prediction matrix, same shape as Y, so the same linear
        % indices address both.
        pred = X * Theta';
        err = pred(test_entries) - Y(test_entries);
        rmse = rmse + sqrt(mean(err.^2));
    end
    rmse = rmse / cv.NumTestSets;
end
七、可视化分析
matlab
%% Rating heat map
% Y is num_movies x num_users, and imagesc draws columns along the x axis,
% so x is the user and y is the movie.
figure;
imagesc(Y);
colormap(jet);
colorbar;
title('用户-电影评分矩阵');
xlabel('用户ID');
ylabel('电影ID');
%% Prediction error distribution
% Residuals on the rated entries only; expects learned_ratings, Y and R
% (all num_movies x num_users) to be in the workspace from the main script.
residuals = learned_ratings(R == 1) - Y(R == 1);
figure;
histogram(residuals, 30);
title('预测误差分布');
xlabel('误差值');
ylabel('频数');
八、扩展应用场景
- 实时推荐:结合 Redis 缓存最近邻计算结果,将响应时间控制在 50 ms 以内
- 多模态融合:引入电影类型标签数据,使用 SVD++ 算法增强特征表达
- 增量学习:设计在线更新机制,处理新用户/新电影的持续流入