1. 读取数据
首先,我们从 Excel 文件中读取训练集和测试集:
2. 训练集划分
我们将 80% 的数据用于训练,20% 用于验证。
3. 训练多个模型
我们选取 8 种常见分类模型,并存储预测结果。
Matlab
for i = 1:length(modelNames)
switch modelNames{i}
case 'Multinomial Logistic Regression'
B = mnrfit(X_train, Y_train, 'model', 'nominal');
predProb = mnrval(B, X_test);
[~, predictions{i}] = max(predProb, [], 2);
predProb_val = mnrval(B, X_val);
[~, val_predictions] = max(predProb_val, [], 2);
case 'Decision Tree'
model = fitctree(X_train, Y_train);
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val);
case 'Random Forest'
model = fitcensemble(X_train, Y_train, 'Method', 'Bag');
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val);
case 'SVM'
model = fitcecoc(X_train, Y_train);
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val);
case 'KNN'
model = fitcknn(X_train, Y_train);
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val);
case 'BP Neural Network'
model = feedforwardnet(15);
model = train(model, X_train', full(ind2vec(Y_train')));
nnOutput = model(X_test')';
[~, predictions{i}] = max(nnOutput, [], 2);
nnOutput_val = model(X_val')';
[~, val_predictions] = max(nnOutput_val, [], 2);
case 'GBM'
model = fitcensemble(X_train, Y_train, 'Method', 'AdaBoostM2');
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val);
case 'AdaBoost'
model = fitcensemble(X_train, Y_train, 'Method', 'AdaBoostM2');
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val);
end
4. 结果输出
(1)存储预测结果
将预测结果保存为 Excel 文件。
(2)模型评估
使用 混淆矩阵 和 精度计算 评估模型。
5. 结果分析
- 逻辑回归:适用于线性可分数据,易解释但精度有限。
- 决策树:易于理解但容易过拟合。
- 随机森林:更稳定,适用于高维数据。
- SVM:适用于小样本数据,但计算成本较高。
- KNN:简单但计算量大,适用于小规模数据。
- BP 神经网络:可处理复杂关系,但训练时间长。
- GBM 和 AdaBoost:适用于非线性关系,但超参数调优重要。
总结
本文使用 MATLAB 实现了 8 种分类模型,并进行了训练、预测和评估。通过混淆矩阵和精度可视化,帮助选择最优模型。
完整代码:
Matlab
% 导入训练集
trainData = readtable('train.xlsx');
X = table2array(trainData(:, 1:end-1)); % 特征
Y = trainData{:, end}; % 类别(多分类)
% 导入测试集 (没有标签的测试集)
testData = readtable('test.xlsx');
X_test = table2array(testData(:, 1:end)); % 测试集特征
% 分割数据集:80% 用于训练,20% 用于验证
cv = cvpartition(size(X, 1), 'HoldOut', 0.2);
X_train = X(training(cv), :); % 80%的训练数据
Y_train = Y(training(cv), :);
X_val = X(test(cv), :); % 20%的验证数据
Y_val = Y(test(cv), :);
% 确保 Y_train 和 Y_val 是数值类型
if iscategorical(Y_train)
Y_train = double(Y_train); % 转换为数值型
end
if iscategorical(Y_val)
Y_val = double(Y_val); % 转换为数值型
end
% 确保 X_train 和 X_test 的列数一致
disp(size(X_train)); % 打印 X_train 的大小
disp(size(X_test)); % 打印 X_test 的大小
% 定义模型名称
modelNames = {'Multinomial Logistic Regression', 'Decision Tree', 'Random Forest', ...
'SVM', 'KNN', 'BP Neural Network', 'GBM', 'AdaBoost'};
models = cell(length(modelNames), 1); % 用于存储模型
predictions = cell(length(modelNames), 1); % 用于存储测试集的预测结果
% 循环遍历每个模型
for i = 1:length(modelNames)
switch modelNames{i}
case 'Multinomial Logistic Regression'
% 多分类逻辑回归
B = mnrfit(X_train, Y_train, 'model', 'nominal');
predProb = mnrval(B, X_test); % 预测类别概率
[~, predictions{i}] = max(predProb, [], 2); % 返回概率最大类别
% 在验证集上进行预测
predProb_val = mnrval(B, X_val); % 在验证集上进行预测
[~, val_predictions] = max(predProb_val, [], 2);
case 'Decision Tree'
% 决策树
model = fitctree(X_train, Y_train);
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val); % 在验证集上进行预测
case 'Random Forest'
% 随机森林
model = fitcensemble(X_train, Y_train, 'Method', 'Bag');
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val); % 在验证集上进行预测
case 'SVM'
% SVM 用于多分类
model = fitcecoc(X_train, Y_train); % 使用 fitcecoc 来处理多分类
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val); % 在验证集上进行预测
case 'KNN'
% K-近邻
model = fitcknn(X_train, Y_train);
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val); % 在验证集上进行预测
case 'BP Neural Network'
% BP 神经网络
model = feedforwardnet(15); % 10为隐藏层神经元个数
model = train(model, X_train', full(ind2vec(Y_train'))); % ind2vec用于多类
nnOutput = model(X_test')'; % 输出神经网络预测结果
[~, predictions{i}] = max(nnOutput, [], 2); % 选择概率最高的类
nnOutput_val = model(X_val')'; % 在验证集上进行预测
[~, val_predictions] = max(nnOutput_val, [], 2); % 选择概率最高的类
case 'GBM'
% 梯度提升(GBM) - 使用适合多分类的 AdaBoostM2
model = fitcensemble(X_train, Y_train, 'Method', 'AdaBoostM2');
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val); % 在验证集上进行预测
case 'AdaBoost'
% AdaBoost
model = fitcensemble(X_train, Y_train, 'Method', 'AdaBoostM2');
predictions{i} = predict(model, X_test);
val_predictions = predict(model, X_val); % 在验证集上进行预测
end
% 输出每个模型的预测结果为 Excel 文件
results = table(predictions{i}, 'VariableNames', {'Predictions'});
writetable(results, ['XXX_' modelNames{i} '_trainend.xlsx']);
% 将预测类别标签添加到测试集最后一列
testResults = testData;
testResults.Predictions = predictions{i}; % 添加预测结果列
writetable(testResults, ['XXX_' modelNames{i} '_test_with_predictions.xlsx']);
% 在验证集上评估模型的精度
figure;
cm = confusionchart(Y_val, val_predictions); % 使用Y_val进行混淆矩阵计算
title(['Confusion Matrix - ' modelNames{i} ' (Validation Set)']);
% 计算并显示精度
accuracy = sum(diag(cm.NormalizedValues)) / sum(cm.NormalizedValues(:));
fprintf('%s Validation Accuracy: %.2f%%\n', modelNames{i}, accuracy * 100);
% 可视化精度
figure;
bar(accuracy);
title(['Validation Accuracy of ' modelNames{i}]);
xlabel('Model');
ylabel('Accuracy');
xticks(1);
xticklabels({modelNames{i}});
end
产生:

Multinomial Logistic Regression Validation Accuracy: 92.16%
Decision Tree Validation Accuracy: 80.39%
Random Forest Validation Accuracy: 90.20%
SVM Validation Accuracy: 94.12%
KNN Validation Accuracy: 92.16%
BP Neural Network Validation Accuracy: 88.24%
GBM Validation Accuracy: 84.31%
AdaBoost Validation Accuracy: 84.31%