sklearn(全称 scikit-learn,本文统一使用简称)是Python里最实用的机器学习库,今天我教大家怎么简单地使用它。
安装
python
# 安装的时候用全名
# pip install scikit-learn
# 导入的时候用简称
import sklearn
from sklearn.linear_model import LogisticRegression
安装好后,接下来分几个方向介绍使用方法。
一、数据准备
python
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# 1. Load a built-in dataset (handy for practice).
iris = datasets.load_iris()
X = iris.data    # features
y = iris.target  # labels

# 2. Split into training and test sets (always do this). A fixed
#    random_state seeds the shuffle so the result is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Standardize the features (many algorithms need this). The scaler is
#    fitted on the training split only; the same fitted parameters are then
#    applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
二、分类算法
python
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
# All classifiers are used the same way: construct, fit, score.
classifiers = {
    '逻辑回归': LogisticRegression(random_state=42),
    '决策树': DecisionTreeClassifier(random_state=42),
    '随机森林': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
}

for name, estimator in classifiers.items():
    # 1. Train on the standardized training split.
    estimator.fit(X_train_scaled, y_train)
    # 2. Predict on the standardized test split and evaluate.
    score = estimator.score(X_test_scaled, y_test)
    print(f"{name}: 准确率 {score:.3f}")
三、回归算法
python
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# Load a regression dataset. `datasets.load_boston()` was removed in
# scikit-learn 1.2, so the bundled diabetes dataset is used instead
# (it ships with the library; nothing is downloaded).
diabetes = datasets.load_diabetes()
X_reg, y_reg = diabetes.data, diabetes.target

# Regression-specific names are used on purpose: reusing X / y / X_train /
# y_train here would overwrite the iris classification split, which the
# classification examples elsewhere in this file still depend on.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Regression models to compare.
regressors = {
    '线性回归': LinearRegression(),
    '岭回归': Ridge(alpha=1.0),
    'Lasso回归': Lasso(alpha=0.1),
    '随机森林回归': RandomForestRegressor(n_estimators=100, random_state=42),
}

for name, reg in regressors.items():
    reg.fit(X_train_reg, y_train_reg)
    # score() is the R^2 score here; the closer to 1 the better. The local is
    # called `r2` so it cannot shadow sklearn.metrics.r2_score.
    r2 = reg.score(X_test_reg, y_test_reg)
    print(f"{name}: R²分数 {r2:.3f}")
四、聚类算法
python
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
# Unsupervised learning: there is no y, only the features.
X = iris.data

# KMeans (the most commonly used): fit, then read the cluster label
# assigned to each sample.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_

# Evaluate the clustering with the silhouette coefficient.
score = silhouette_score(X, labels)
print(f"KMeans轮廓系数: {score:.3f}")

# Other clustering algorithms (only constructed here, not fitted).
dbscan = DBSCAN(eps=0.5, min_samples=5)
hierarchical = AgglomerativeClustering(n_clusters=3)
五、模型评估
python
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report,
mean_squared_error, r2_score
)
# Evaluate a classification task: fit on the standardized training split,
# then predict the standardized test split.
clf = LogisticRegression(random_state=42)
y_pred = clf.fit(X_train_scaled, y_train).predict(X_test_scaled)

print("准确率:", accuracy_score(y_test, y_pred))

# The remaining headline metrics are all macro-averaged across classes.
macro_metrics = (
    ("精确率:", precision_score),
    ("召回率:", recall_score),
    ("F1分数:", f1_score),
)
for label, metric in macro_metrics:
    print(label, metric(y_test, y_pred, average='macro'))

# Confusion matrix (shows which samples were misclassified as what).
print("\n混淆矩阵:")
print(confusion_matrix(y_test, y_pred))

# Detailed per-class report.
print("\n分类报告:")
print(classification_report(y_test, y_pred))
六、特征工程
python
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# 1. Feature selection: keep the K most important features (K = 2 here).
selector = SelectKBest(score_func=f_classif, k=2)
X_selected = selector.fit(X_train, y_train).transform(X_train)

# 2. Dimensionality reduction with PCA, down to 2 dimensions.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

# 3. Pipeline: chain several steps together so they are fitted and applied
#    as one estimator.
steps = [
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(k=2)),
    ('classifier', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

score = pipeline.score(X_test, y_test)
print(f"管道模型准确率: {score:.3f}")
七、找最优参数
python
from sklearn.model_selection import GridSearchCV
# Hyperparameters to tune and the candidate values for each.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

# Grid search over every combination in param_grid.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,       # 5-fold cross-validation
    n_jobs=-1,  # use all CPU cores
)
grid_search.fit(X_train, y_train)

print("最佳参数:", grid_search.best_params_)
print("最佳分数:", grid_search.best_score_)

# Take the model trained with the best parameters as the final model.
best_model = grid_search.best_estimator_
八、交叉验证
python
from sklearn.model_selection import cross_val_score, KFold
# Simple cross-validation: one call returns the score of every fold.
scores = cross_val_score(
    estimator=RandomForestClassifier(random_state=42),
    X=X,
    y=y,
    scoring='accuracy',
    cv=5,  # 5 folds
)
print(f"交叉验证平均准确率: {scores.mean():.3f} (±{scores.std():.3f})")

# Custom cross-validation strategy: iterate over the folds manually.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_rows, val_rows in kf.split(X):
    X_train_fold = X[train_rows]
    X_val_fold = X[val_rows]
    y_train_fold = y[train_rows]
    y_val_fold = y[val_rows]
    # Train and evaluate on each fold here.
九、保存和加载模型
python
import joblib
import pickle
# Train the model that will be persisted.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Save the model with joblib (the option recommended for sklearn) ...
joblib.dump(model, 'model.joblib')

# ... or, alternatively, with pickle.
with open('model.pkl', 'wb') as pkl_file:
    pickle.dump(model, pkl_file)

# Load the model back from the joblib file and predict with it.
loaded_model = joblib.load('model.joblib')
predictions = loaded_model.predict(X_test)
上面学会后,就可以基于下面的流程模板自己玩一下。
机器学习工作流程模板
python
def ml_workflow(X, y, *, test_size=0.2, random_state=42):
    """Run a standard machine-learning workflow with a random forest.

    Splits the data, standardizes the features, trains a
    ``RandomForestClassifier`` with 100 trees, and prints the accuracy on
    the training and test splits.

    Args:
        X: Feature matrix.
        y: Target labels.
        test_size: Passed to ``train_test_split``; the share of the data
            held out for testing. Defaults to 0.2.
        random_state: Seed passed to both ``train_test_split`` and the
            classifier. Defaults to 42.

    Returns:
        A ``(model, scaler)`` tuple: the fitted classifier and the fitted
        ``StandardScaler``. The model was trained on scaled features, so
        new data must go through ``scaler.transform`` before prediction.
    """
    # 1. Split the data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # 2. Preprocess: fit the scaler on the training split only, then apply
    #    the same transformation to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 3. Train the model.
    model = RandomForestClassifier(n_estimators=100, random_state=random_state)
    model.fit(X_train_scaled, y_train)

    # 4. Evaluate on both splits; printing them side by side makes a
    #    train/test accuracy gap easy to see.
    train_score = model.score(X_train_scaled, y_train)
    test_score = model.score(X_test_scaled, y_test)
    print(f"训练集准确率: {train_score:.3f}")
    print(f"测试集准确率: {test_score:.3f}")

    return model, scaler
sklearn的设计哲学是"一致性"——监督学习模型都有fit、predict、score方法,预处理、聚类等其他组件也遵循类似的fit/transform接口,用起来都一个套路。把上面这些掌握了,80%的机器学习任务都能应付。剩下的20%,知道去哪里查文档就行。