scikit-learn中三个经典算法的实现示例

1. 线性回归

```python

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Generate synthetic data following y = 4 + 3x + Gaussian noise.
np.random.seed(42)  # fixed seed so the generated data is reproducible
X = 2 * np.random.rand(100, 1)  # 100 samples, 1 feature
y = 4 + 3 * X + np.random.randn(100, 1)  # 2-D target, same shape as X

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create and fit the model.
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Predict on the held-out set.
y_pred = lin_reg.predict(X_test)

# Evaluate the model.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("线性回归结果:")
# y was passed as a 2-D array, so coef_ is 2-D and intercept_ is 1-D;
# index down to the single scalar before formatting.
print(f"系数 (斜率): {lin_reg.coef_[0][0]:.4f}")
print(f"截距: {lin_reg.intercept_[0]:.4f}")
print(f"均方误差 (MSE): {mse:.4f}")
print(f"R²分数: {r2:.4f}")

# Visualize the raw data together with the fitted regression line.
# NOTE: the labels below are Chinese; if they render as empty boxes,
# configure matplotlib with a font that contains CJK glyphs.
plt.figure(figsize=(10, 6))
plt.scatter(X, y, alpha=0.6, label='原始数据')
plt.plot(X, lin_reg.predict(X), color='red', linewidth=2, label='回归线')
plt.xlabel('X')
plt.ylabel('y')
plt.title('线性回归')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

```

2. 决策树(分类)

```python

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import tree
import matplotlib.pyplot as plt

# Load the iris dataset.
iris = load_iris()
X = iris.data
y = iris.target
feature_names = iris.feature_names
class_names = iris.target_names

# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create and fit the decision tree classifier.
dt_clf = DecisionTreeClassifier(
    max_depth=3,       # cap the depth of the tree
    random_state=42,
    criterion='gini',  # split quality measured by Gini impurity
)
dt_clf.fit(X_train, y_train)

# Predict on the held-out set.
y_pred = dt_clf.predict(X_test)

# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)

print("决策树分类结果:")
print(f"准确率: {accuracy:.4f}")
print("\n分类报告:")
print(classification_report(y_test, y_pred, target_names=class_names))
print("\n混淆矩阵:")
print(confusion_matrix(y_test, y_pred))

# Visualize the fitted tree.
plt.figure(figsize=(12, 8))
tree.plot_tree(
    dt_clf,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
)
plt.title("决策树结构")
plt.show()

# Report the importance the fitted tree assigns to each feature.
print("\n特征重要性:")
for name, importance in zip(feature_names, dt_clf.feature_importances_):
    print(f"{name}: {importance:.4f}")

```

3. K-Means聚类

```python

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np

# Generate synthetic data around 4 cluster centers.
np.random.seed(42)
# y_true (the generating cluster of each sample) is not used below;
# the clustering itself is unsupervised.
X, y_true = make_blobs(
    n_samples=300,
    centers=4,
    cluster_std=0.60,
    random_state=42,
)

# Sweep K and record inertia (for the elbow method) and silhouette score.
# The sweep starts at 2 because the silhouette score needs at least
# 2 clusters; keep that lower bound if K_range is changed.
inertia = []
silhouette_scores = []
K_range = range(2, 10)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X, kmeans.labels_))

# Choose K: fixed to 4, matching the elbow plot and the known number of
# centers used to generate the data.
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans.fit(X)

y_pred = kmeans.labels_
centroids = kmeans.cluster_centers_

# Compute evaluation metrics for the final model.
inertia_score = kmeans.inertia_
silhouette_avg = silhouette_score(X, y_pred)

print(f"K-Means聚类结果 (K={optimal_k}):")
print(f"轮廓系数: {silhouette_avg:.4f}")
print(f"惯性 (Inertia): {inertia_score:.4f}")
print(f"聚类中心:\n{centroids}")

# Visualization: elbow curve, silhouette curve, and the clustering result.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Elbow plot.
axes[0].plot(K_range, inertia, 'bo-')
axes[0].set_xlabel('K值')
axes[0].set_ylabel('惯性 (Inertia)')
axes[0].set_title('肘部法则')
axes[0].grid(True, alpha=0.3)

# Silhouette plot; reuse K_range so the x values always match the sweep.
axes[1].plot(K_range, silhouette_scores, 'ro-')
axes[1].set_xlabel('K值')
axes[1].set_ylabel('轮廓系数')
axes[1].set_title('轮廓系数')
axes[1].grid(True, alpha=0.3)

# Scatter plot of the samples colored by assigned cluster, plus centroids.
scatter = axes[2].scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis', alpha=0.6)
axes[2].scatter(
    centroids[:, 0], centroids[:, 1],
    c='red', marker='X', s=200, label='聚类中心',
)
axes[2].set_xlabel('特征1')
axes[2].set_ylabel('特征2')
axes[2].set_title(f'K-Means聚类结果 (K={optimal_k})')
axes[2].legend()
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Assign new samples to the nearest learned cluster.
new_samples = np.array([[0, 0], [8, 3]])
predicted_clusters = kmeans.predict(new_samples)

print("\n新样本的预测聚类:")
for sample, cluster in zip(new_samples, predicted_clusters):
    print(f"样本 {sample} -> 聚类 {cluster}")

```

4. 综合对比示例

```python

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

# Generate a single-feature regression dataset.
np.random.seed(42)
X, y = make_regression(n_samples=200, n_features=1, noise=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Models to compare: linear regression vs. a depth-limited decision tree.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(max_depth=3, random_state=42),
}

results = {}

# Evenly spaced inputs across the full feature range, used to draw each
# model's prediction curve. Identical for every model, so built once.
X_range = np.linspace(X.min(), X.max(), 300).reshape(-1, 1)

plt.figure(figsize=(12, 5))

# enumerate starts at 1 because plt.subplot indices are 1-based.
for idx, (name, model) in enumerate(models.items(), 1):
    # Fit the model.
    model.fit(X_train, y_train)

    # Predict on the held-out set.
    y_pred = model.predict(X_test)

    # Evaluate.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}

    # Plot train/test points and the model's prediction curve.
    plt.subplot(1, 2, idx)
    plt.scatter(X_train, y_train, alpha=0.5, label='训练数据')
    plt.scatter(X_test, y_test, alpha=0.5, label='测试数据')

    y_range = model.predict(X_range)
    plt.plot(X_range, y_range, 'r-', linewidth=2, label='模型预测')

    plt.title(f'{name}\nMSE: {mse:.2f}, R²: {r2:.2f}')
    plt.xlabel('X')
    plt.ylabel('y')
    plt.legend()
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print the metrics of each model, separated by a blank line.
print("模型比较结果:")
for name, metrics in results.items():
    print(f"{name}:")
    print(f" MSE: {metrics['MSE']:.4f}")
    print(f" R²: {metrics['R2']:.4f}")
    print()

```

这些示例展示了:

  1. **线性回归**:回归问题的基本实现和评估

  2. **决策树**:分类问题的实现,包括可视化树结构和特征重要性

  3. **K-Means**:聚类算法的实现,包含最佳K值选择和结果可视化

  4. **综合对比**:比较不同算法的性能

每个示例都包含:

  • 数据准备

  • 模型创建和训练

  • 预测和评估

  • 结果可视化

  • 关键参数的说明

相关推荐
Ai财富密码1 天前
AI for Coding:如何构建基于 SDD 的多人协作流水线?
开发语言·人工智能·python
hui函数1 天前
python全栈入门到实战【基础篇 01】Python初识:定位、优势与发展历程
开发语言·python
没那么特别的特别1 天前
【蓝桥杯】Python基础知识梳理
开发语言·python
福楠1 天前
模拟实现string类
c语言·开发语言·c++·算法
代码游侠1 天前
应用——C语言基础知识1
服务器·c语言·开发语言·笔记
AAA简单玩转程序设计1 天前
Python 效率飞升术:3基础进阶小工具,少写 100 行循环
python
CC.GG1 天前
【Qt】常用控件----按钮类控件
开发语言·数据库·qt
梨落秋霜1 天前
Python入门篇【序列切片】
开发语言·python