感知机：乳腺癌分类实现 & K 均值聚类：从零实现

一、感知机（Perceptron）：乳腺癌分类实现

目标：

理解感知机算法的工作原理。
从零开始逐步实现感知机。
使用威斯康星乳腺癌数据集训练模型。
评估模型性能。

导入依赖库

python 复制代码

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns  # 用于混淆矩阵可视化

步骤 1：数据加载与预处理

python 复制代码

# Load the Wisconsin breast cancer dataset.
data = load_breast_cancer()
X = data.data  # feature matrix, shape (569, 30): 569 samples, 30 features
y = data.target  # labels: 0 = malignant, 1 = benign

# 1. Convert labels to +1 / -1: the perceptron's sign activation works with
#    bipolar labels rather than 0/1 (benign -> +1, malignant -> -1).
y = np.where(y == 1, 1, -1)

# 2. Split into train (80%) and test (20%) BEFORE scaling, so that no
#    statistic of the test set leaks into the training pipeline.
#    stratify=y keeps the class ratio identical in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Standardise features (the perceptron is sensitive to feature scale).
#    The scaler is fitted on the training split only; the test split is
#    transformed with the training mean / standard deviation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later code referring to X_scaled keeps working; it is the
# full dataset transformed with the training-set statistics.
X_scaled = scaler.transform(X)

print(f"训练集形状：X_train={X_train.shape}, y_train={y_train.shape}")
print(f"测试集形状：X_test={X_test.shape}, y_test={y_test.shape}")

问题 6：实现权重初始化函数 `initialize_weights(n_features)`

python 复制代码

def initialize_weights(n_features):
    """Create the initial perceptron weight vector.

    Args:
        n_features: Number of input features.

    Returns:
        A NumPy array of shape (n_features,) filled with zeros, i.e. one
        weight per feature.
    """
    return np.zeros(shape=n_features)

步骤 2：设置超参数与初始化

python 复制代码

# 超参数设置
learning_rate = 0.01  # 学习率（步长）
epochs = 50  # 迭代次数
n_features = X_train.shape[1]  # 特征数量（30）

# 初始化权重和偏置
weights = initialize_weights(n_features)  # 权重向量（30维）
bias = 0.0  # 偏置项（初始为0）

问题 7：实现感知机训练函数 `train_perceptron()`

python 复制代码

def train_perceptron(X, y, weights, bias, learning_rate, epochs):
    """Train a perceptron with the mistake-driven learning rule.

    For every misclassified sample the parameters are moved towards it:
        w = w + lr * y * x
        b = b + lr * y
    Samples are visited one at a time, in order, `epochs` times.

    Args:
        X: Training data, shape (n_samples, n_features).
        y: Training labels, shape (n_samples,), each +1 or -1.
        weights: Initial weight vector, shape (n_features,). It is copied,
            so the caller's array is never modified.
        bias: Initial bias (float).
        learning_rate: Step size of each update.
        epochs: Number of full passes over the training data.

    Returns:
        A tuple (weights, bias, loss_history):
            weights: Trained weight vector (a new float array).
            bias: Trained bias.
            loss_history: List with the number of misclassified samples
                seen in each epoch.
    """
    # Work on a private float copy: the in-place `+=` below would otherwise
    # overwrite the caller's initial weights and would fail on integer arrays.
    weights = np.array(weights, dtype=float)
    n_samples = X.shape[0]
    loss_history = []

    for epoch in range(epochs):
        misclassified = 0  # mistakes made during this pass

        for i in range(n_samples):
            x_i = X[i]
            y_i = y[i]

            # Sign activation on the linear score; a score of exactly 0 is
            # treated as class -1.
            linear_output = np.dot(x_i, weights) + bias
            y_pred = 1 if linear_output > 0 else -1

            # Only misclassified samples trigger an update.
            if y_pred != y_i:
                weights += learning_rate * y_i * x_i
                bias += learning_rate * y_i
                misclassified += 1

        loss_history.append(misclassified)

        # Progress report every 10 epochs.
        if (epoch + 1) % 10 == 0:
            # Guard against an empty training set (avoids ZeroDivisionError).
            mis_rate = misclassified / n_samples if n_samples else 0.0
            print(f"Epoch {epoch+1}/{epochs} | 误分类样本数：{misclassified} | 剩余误分类率：{mis_rate:.4f}")

    return weights, bias, loss_history

# 执行训练
trained_weights, trained_bias, loss_history = train_perceptron(
    X_train, y_train, weights, bias, learning_rate, epochs
)

问题 8：实现预测函数 `predict()`（使用符号激活函数）

python 复制代码

def predict(X, weights, bias):
    """Predict bipolar class labels with a trained perceptron.

    Args:
        X: Data to classify, shape (n_samples, n_features).
        weights: Trained weight vector.
        bias: Trained bias.

    Returns:
        Array of shape (n_samples,) containing +1 where w.x + b > 0 and
        -1 otherwise (a score of exactly 0 maps to -1).
    """
    is_positive = (np.dot(X, weights) + bias) > 0
    return np.where(is_positive, 1, -1)

# 对测试集进行预测
y_pred = predict(X_test, trained_weights, trained_bias)

步骤 3：计算准确率与混淆矩阵

python 复制代码

# 1. Accuracy on the test set.
accuracy = accuracy_score(y_test, y_pred)
print(f"\n测试集准确率：{accuracy:.4f}")

# 2. Confusion matrix. Labels are mapped back to 0/1 for readability:
#    -1 -> 0 (malignant), +1 -> 1 (benign). labels=[0, 1] forces a 2x2 matrix
#    even if one class is missing, so the 4-way unpacking below cannot fail.
y_test_mapped = np.where(y_test == -1, 0, 1)
y_pred_mapped = np.where(y_pred == -1, 0, 1)
cm = confusion_matrix(y_test_mapped, y_pred_mapped, labels=[0, 1])

# Heat-map of the confusion matrix (rows = true label, columns = prediction).
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,  # show the counts
    fmt="d",  # integer format
    cmap="Blues",
    xticklabels=["恶性（0）", "良性（1）"],
    yticklabels=["恶性（0）", "良性（1）"]
)
plt.xlabel("预测标签")
plt.ylabel("真实标签")
plt.title("感知机乳腺癌分类混淆矩阵")
plt.show()

# Break-down of the matrix. The positive class is 1 (benign), so:
#   FP = true malignant predicted benign, FN = true benign predicted malignant.
tn, fp, fn, tp = cm.ravel()
print(f"\n混淆矩阵细节：")
print(f"真阴性（TN）：{tn}（恶性预测为恶性）")
print(f"假阳性（FP）：{fp}（恶性预测为良性）")
print(f"假阴性（FN）：{fn}（良性预测为恶性）")
print(f"真阳性（TP）：{tp}（良性预测为良性）")

问题 11：改变学习率，观察对收敛的影响

python 复制代码

# 测试不同学习率（0.001, 0.01, 0.1）
learning_rates = [0.001, 0.01, 0.1]
epochs = 50
loss_histories = []

for lr in learning_rates:
    # 重新初始化权重和偏置
    init_weights = initialize_weights(n_features)
    init_bias = 0.0
    
    # 训练
    _, _, loss_history = train_perceptron(
        X_train, y_train, init_weights, init_bias, lr, epochs
    )
    loss_histories.append(loss_history)
    print(f"\n学习率 {lr} 训练完成")

# 可视化不同学习率的收敛曲线（误分类数随轮次变化）
plt.figure(figsize=(8, 5))
for i, lr in enumerate(learning_rates):
    plt.plot(range(1, epochs+1), loss_histories[i], label=f"学习率={lr}")

plt.xlabel("训练轮次（Epoch）")
plt.ylabel("误分类样本数")
plt.title("不同学习率对感知机收敛的影响")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# 结论：
# - 注意：本实验中权重和偏置均从 0 初始化，此时学习率只会把 (w, b) 整体缩放一个正的常数，
#   sign(w·x + b) 不变，因此三条误分类曲线（除浮点舍入外）完全重合——学习率并不影响收敛。
# - 只有在非零初始化（或使用带间隔、学习率衰减等变体）时，学习率才会改变收敛过程：
#   过小（如 0.001）收敛慢，过大（如 0.1）可能震荡；若数据未归一化，震荡会更明显。

问题 12：增加迭代次数，观察准确率是否提升

python 复制代码

# 测试不同迭代次数（50, 100, 200）
epoch_list = [50, 100, 200]
learning_rate = 0.01
test_accuracies = []

for epochs in epoch_list:
    # 重新初始化
    init_weights = initialize_weights(n_features)
    init_bias = 0.0
    
    # 训练
    trained_w, trained_b, _ = train_perceptron(
        X_train, y_train, init_weights, init_bias, learning_rate, epochs
    )
    
    # 预测并计算准确率
    y_pred = predict(X_test, trained_w, trained_b)
    acc = accuracy_score(y_test, y_pred)
    test_accuracies.append(acc)
    print(f"迭代次数 {epochs} | 测试集准确率：{acc:.4f}")

# 可视化迭代次数与准确率的关系
plt.figure(figsize=(6, 4))
plt.bar(epoch_list, test_accuracies, width=10, alpha=0.7, color="skyblue")
plt.xlabel("迭代次数（Epoch）")
plt.ylabel("测试集准确率")
plt.title("迭代次数对感知机准确率的影响")
plt.ylim(0.9, 1.0)  # 限定y轴范围，更清晰展示差异
plt.grid(axis="y", alpha=0.3)
plt.show()

# 结论：
# - 迭代次数从50增加到100：准确率可能提升（模型未完全收敛时）
# - 迭代次数从100增加到200：准确率趋于稳定（模型已收敛，继续迭代无明显提升）

二、K 均值聚类（K-Means Clustering）：从零实现

目标：

使用 Python 和 NumPy 从零实现 K 均值聚类算法。
逐步理解算法的每个核心步骤。

导入依赖库

python 复制代码

import numpy as np
import matplotlib.pyplot as plt

翻译后的问题及详细解答

问题 1：实现数据生成函数 `generate_data(n, k, seed)`

python 复制代码

def generate_data(n, k, seed):
    """Generate a 2-D synthetic dataset of k Gaussian blobs.

    The global NumPy random generator is seeded with `seed`, k cluster
    centres are drawn uniformly from [-10, 10) in each coordinate, and n
    samples with N(0, 1) noise are drawn around every centre.

    Args:
        n: Number of samples per cluster.
        k: Number of clusters.
        seed: Random seed (makes the output reproducible).

    Returns:
        Array of shape (n * k, 2); rows are grouped cluster by cluster.
    """
    np.random.seed(seed)
    centers = np.random.uniform(low=-10, high=10, size=(k, 2))

    # One (n, 2) blob per centre, drawn in centre order.
    blobs = [
        center + np.random.normal(loc=0, scale=1.0, size=(n, 2))
        for center in centers
    ]
    return np.vstack(blobs)

# 生成数据：每个聚类50个样本，共3个聚类，种子=42
n_per_cluster = 50
k_clusters = 3
X = generate_data(n=n_per_cluster, k=k_clusters, seed=42)
print(f"合成数据集形状：{X.shape}")  # 输出：(150, 2)

# 可视化原始数据（无标签）
plt.figure(figsize=(6, 4))
plt.scatter(X[:, 0], X[:, 1], s=50, alpha=0.7, color="gray")
plt.title("原始合成数据集（无聚类标签）")
plt.xlabel("特征1")
plt.ylabel("特征2")
plt.grid(alpha=0.3)
plt.show()

问题 2：实现聚类中心初始化函数 `initialize_centroids(X, k)`

python 复制代码

def initialize_centroids(X, k):
    """Pick k distinct samples of X as the initial cluster centres.

    Args:
        X: Input data, shape (n_samples, n_features).
        k: Number of clusters.

    Returns:
        Array of shape (k, n_features) holding the chosen samples.
    """
    # Sampling without replacement guarantees k different row indices.
    picks = np.random.choice(X.shape[0], size=k, replace=False)
    return X[picks]

# 测试初始化
initial_centroids = initialize_centroids(X, k=k_clusters)
print(f"初始聚类中心：\n{initial_centroids}")

问题 3：实现距离计算函数 `compute_distances(X, centroids)`

python 复制代码

def compute_distances(X, centroids):
    """Compute the Euclidean distance from every sample to every centroid.

    d(x, c) = sqrt(sum_i (x_i - c_i)^2)

    The computation is a single broadcast operation instead of a Python
    double loop, and array-likes (e.g. nested lists) are accepted.

    Args:
        X: Input data, shape (n_samples, n_features).
        centroids: Cluster centres, shape (k, n_features).

    Returns:
        Float array of shape (n_samples, k); entry [i, j] is the distance
        between sample i and centroid j.
    """
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    # Treat 1-D inputs as single-feature data.
    if X.ndim == 1:
        X = X[:, np.newaxis]
    if centroids.ndim == 1:
        centroids = centroids[:, np.newaxis]

    # (n_samples, 1, d) - (1, k, d) -> (n_samples, k, d) difference tensor.
    diffs = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return np.linalg.norm(diffs, axis=2)

# 测试距离计算
distances = compute_distances(X, initial_centroids)
print(f"距离矩阵形状：{distances.shape}")  # 输出：(150, 3)
print(f"前5个样本到3个中心的距离：\n{distances[:5]}")

问题 4：实现聚类分配函数 `assign_clusters(X, centroids)`

python 复制代码

def assign_clusters(X, centroids):
    """Assign every sample to its nearest cluster centre.

    Args:
        X: Input data, shape (n_samples, n_features).
        centroids: Cluster centres, shape (k, n_features).

    Returns:
        Integer array of shape (n_samples,) with values in 0..k-1: the
        column index of the smallest distance in each row.
    """
    return np.argmin(compute_distances(X, centroids), axis=1)

# 测试聚类分配
initial_labels = assign_clusters(X, initial_centroids)
print(f"聚类标签形状：{initial_labels.shape}")
print(f"前10个样本的聚类标签：{initial_labels[:10]}")

问题 5：实现聚类中心更新函数 `update_centroids(X, labels, k)`

python 复制代码

def update_centroids(X, labels, k):
    """Recompute each cluster centre as the mean of its assigned samples.

    A cluster with no assigned samples would otherwise produce a NaN centre
    (mean of an empty slice), and a NaN centre captures every sample at the
    next argmin-based assignment. Such a cluster is therefore re-seeded with
    a randomly chosen sample of X.

    Args:
        X: Input data, shape (n_samples, n_features).
        labels: Cluster index of every sample, shape (n_samples,).
        k: Number of clusters.

    Returns:
        Array of shape (k, n_features) with the updated centres.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    n_samples, n_features = X.shape
    new_centroids = np.zeros((k, n_features))

    for j in range(k):
        members = X[labels == j]
        if members.shape[0] > 0:
            # Column-wise mean of the members is the new centre.
            new_centroids[j] = members.mean(axis=0)
        elif n_samples > 0:
            # Empty cluster: re-seed from the data instead of emitting NaN.
            new_centroids[j] = X[np.random.randint(n_samples)]
        # With no data at all the centre stays at the zero vector.

    return new_centroids

# 测试中心更新
new_centroids = update_centroids(X, initial_labels, k=k_clusters)
print(f"更新后的聚类中心：\n{new_centroids}")

问题 6：整合所有函数为 `k_means(X, k, max_iters=100, tol=1e-4)`

python 复制代码

def k_means(X, k, max_iters=100, tol=1e-4):
    """Run K-means clustering (Lloyd's algorithm).

    Steps: initialise centres, then repeat (assign samples to the nearest
    centre, move each centre to the mean of its samples) until the largest
    centre displacement is <= tol or max_iters iterations have run.

    Args:
        X: Input data, shape (n_samples, n_features).
        k: Number of clusters.
        max_iters: Maximum number of iterations (default 100).
        tol: Convergence threshold on the largest centre shift (default 1e-4).

    Returns:
        A tuple (centroids, labels, centroids_history):
            centroids: Final centres, shape (k, n_features); always equal
                to the last entry of centroids_history.
            labels: Cluster index of every sample with respect to the
                returned centres, shape (n_samples,).
            centroids_history: List of centre arrays, starting with the
                initial centres followed by one entry per iteration.
    """
    # 1. Initialise the centres and start the trajectory record.
    centroids = initialize_centroids(X, k)
    centroids_history = [centroids.copy()]

    for iteration in range(max_iters):
        # 2. Assign every sample to its nearest centre.
        labels = assign_clusters(X, centroids)

        # 3. Move the centres.
        new_centroids = update_centroids(X, labels, k)

        # 4. Largest Euclidean displacement of any centre in this step.
        centroid_shift = np.max(np.linalg.norm(new_centroids - centroids, axis=1))

        centroids_history.append(new_centroids.copy())

        # Adopt the new centres before the convergence check so that the
        # returned centres always match the last history entry.
        centroids = new_centroids

        if (iteration + 1) % 10 == 0:
            print(f"Iteration {iteration+1}/{max_iters} | 中心最大变化量：{centroid_shift:.6f}")

        if centroid_shift <= tol:
            print(f"算法收敛（中心变化量≤{tol}），迭代次数：{iteration+1}")
            break

    # Final assignment against the returned centres. This also defines
    # `labels` when max_iters is 0 and the loop body never ran.
    labels = assign_clusters(X, centroids)
    return centroids, labels, centroids_history

# 运行K均值聚类
final_centroids, final_labels, centroids_history = k_means(
    X, k=k_clusters, max_iters=100, tol=1e-4
)
print(f"\n最终聚类中心：\n{final_centroids}")

问题 7：可视化最终聚类结果

python 复制代码

# 可视化最终聚类结果
plt.figure(figsize=(8, 6))

# 绘制每个聚类的样本
colors = ["red", "blue", "green"]  # 3个聚类的颜色
for j in range(k_clusters):
    cluster_samples = X[final_labels == j]
    plt.scatter(
        cluster_samples[:, 0], cluster_samples[:, 1],
        s=60, alpha=0.7, color=colors[j], label=f"聚类{j+1}"
    )

# 绘制最终聚类中心（黑色星号标记）
plt.scatter(
    final_centroids[:, 0], final_centroids[:, 1],
    s=200, color="black", marker="*", label="聚类中心"
)

plt.title("K均值聚类最终结果（k=3）")
plt.xlabel("特征1")
plt.ylabel("特征2")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# 可选：可视化聚类中心的迭代过程
plt.figure(figsize=(8, 6))
# 绘制所有样本
plt.scatter(X[:, 0], X[:, 1], s=50, alpha=0.5, color="gray")

# 绘制中心更新轨迹（用虚线连接）
for j in range(k_clusters):
    # 提取第j个中心的所有迭代位置
    centroid_trajectory = np.array([history[j] for history in centroids_history])
    plt.plot(
        centroid_trajectory[:, 0], centroid_trajectory[:, 1],
        color=colors[j], linestyle="--", linewidth=2, marker="o", markersize=4
    )

# 绘制最终中心
plt.scatter(
    final_centroids[:, 0], final_centroids[:, 1],
    s=200, color="black", marker="*", label="最终中心"
)

plt.title("聚类中心迭代轨迹")
plt.xlabel("特征1")
plt.ylabel("特征2")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

K 均值聚类折线图（中心迭代轨迹）的意义

观察收敛过程：中心轨迹的虚线逐渐趋于稳定，说明算法在逐步收敛，最终停留在聚类的 "质心" 位置。
验证算法有效性：若中心轨迹无明显震荡，且最终分散在不同聚类区域，说明聚类结果合理。
诊断异常情况：若中心轨迹震荡剧烈或无法稳定，可能是 k 值选择不当（如 k=2 时 3 个真实聚类被合并）或初始中心随机导致局部最优，可通过多次运行取最优结果解决。

感知机（乳腺癌分类）& K 均值聚类（从零实现）：完整原理 + 可运行代码

一、核心原理详细解析

（一）感知机（Perceptron）原理

感知机是二分类的线性判别模型，是神经网络和支持向量机的基础，核心思想是 "通过误分类样本驱动权重更新，找到能分离两类数据的超平面"。

1. 数学模型

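输入：样本特征向量 $X = (x_1, x_2, \dots, x_d)$（$d$ 为特征数）
权重：$W = (w_1, w_2, \dots, w_d)$（每个特征的重要性系数）
偏置：$b$（平移超平面位置的参数，使分离超平面不必经过原点）
线性输出：$f(X) = W \cdot X + b = \sum_{i=1}^{d} w_i x_i + b$（点积运算）
激活函数：符号函数（双极输出，适配二分类）$y_{pred} = \operatorname{sign}(f(X)) = \begin{cases} 1, & f(X) > 0 \\ -1, & f(X) \le 0 \end{cases}$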

2. 学习目标

找到最优权重 W∗ 和偏置 b∗，使得超平面 W∗⋅X+b∗=0 能将两类样本完全分离（仅适用于线性可分数据）。

3. 学习规则（误分类驱动更新）

感知机的损失函数定义为 "误分类样本到超平面的距离之和"，最小化损失等价于减少误分类样本。更新规则如下：

若样本 $(X_i, y_i)$ 误分类（$y_{pred} \ne y_i$）：
- 权重更新：$W = W + \eta \cdot y_i \cdot X_i$（$\eta$ 为学习率，控制步长）
- 偏置更新：$b = b + \eta \cdot y_i$
直观理解：将超平面向误分类样本方向移动，逐步修正分离边界。

4. 关键特性

仅适用于线性可分数据（否则会震荡不收敛）；
特征归一化是必要的（否则尺度大的特征会主导权重）；
学习率 η 控制每次更新的步长（过小收敛慢，过大可能震荡）；但若权重和偏置均从 0 初始化，η 只会整体缩放 (W, b)，不改变预测符号，因此不影响收敛过程。

（二）K 均值聚类（K-Means）原理

K-Means 是无监督学习的聚类算法，核心思想是 "通过迭代优化，将数据分成 K 个紧凑且互不重叠的簇"，目标是最小化簇内样本的离散程度。

1. 核心目标

最小化 "簇内平方和（Within-Cluster Sum of Squares, WCSS）"：$\mathrm{WCSS} = \sum_{j=1}^{K} \sum_{X_i \in C_j} \lVert X_i - \mu_j \rVert^2$

$C_j$：第 $j$ 个簇的样本集合；$\mu_j$：第 $j$ 个簇的中心（均值向量）；
含义：每个样本到其簇中心的欧氏距离平方和，值越小说明簇内越紧凑。

2. 迭代步骤（EM 算法思想）

K-Means 通过 "期望（E）- 最大化（M）" 循环优化：

E 步（分配簇）：固定簇中心，计算每个样本到所有中心的距离，将样本分配给最近的簇（最小距离原则）；
M 步（更新中心）：固定样本簇标签，计算每个簇的均值向量，作为新的簇中心（最小化 WCSS）；
重复 E-M 步，直到簇中心变化量小于阈值（收敛）或达到最大迭代次数。

3. 关键细节

初始中心选择：从数据中随机选 K 个样本（避免初始中心重叠）；
距离度量：默认欧氏距离（适用于连续特征，对尺度敏感，需归一化）；
K 值选择：需手动指定（常用肘部法则：WCSS 随 K 增大下降，拐点处的 K 为最优）；
收敛条件：簇中心的最大变化量≤tol（如 1e-4），说明中心稳定，聚类结果可靠。

二、完整可运行代码（含详细注释）

python 复制代码

# ==============================================
# 依赖库导入（一次性导入所有需要的库）
# ==============================================
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns

# ==============================================
# 第一部分：感知机（Perceptron）- 乳腺癌分类
# ==============================================
print("="*50)
print("第一部分：感知机（乳腺癌分类）")
print("="*50)

# --------------------------
# 1. 原理回顾
# --------------------------
print("\n【感知机核心原理】")
print("1. 模型：y = sign(W·X + b)，sign为符号激活函数（+1/-1）")
print("2. 学习规则：误分类样本驱动权重更新（W = W + η·y·X，b = b + η·y）")
print("3. 目标：找到分离两类数据的超平面 W·X + b = 0")
print("4. 前提：数据线性可分（否则不收敛）")

# --------------------------
# 2. 数据加载与预处理
# --------------------------
# Load the Wisconsin breast cancer dataset (569 samples, 30 features, binary).
data = load_breast_cancer()
X = data.data  # feature matrix (569, 30)
y = data.target  # labels (569,): 0 = malignant, 1 = benign

# Map labels 0/1 -> -1/+1 to match the perceptron's sign activation.
y = np.where(y == 1, 1, -1)

# Split into train (80%) and test (20%) BEFORE scaling so that no test-set
# statistic leaks into training; stratify keeps the class ratio in both parts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise features: the scaler is fitted on the training split only and
# the test split is transformed with the training mean / standard deviation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any code referring to X_scaled keeps working; it is the full
# dataset transformed with the training-set statistics.
X_scaled = scaler.transform(X)

print(f"\n【数据信息】")
print(f"训练集：X_train={X_train.shape}, y_train={y_train.shape}")
print(f"测试集：X_test={X_test.shape}, y_test={y_test.shape}")
print(f"特征数：{X_train.shape[1]}，标签分布：训练集(1/-1)={np.sum(y_train==1)}/{np.sum(y_train==-1)}")

# --------------------------
# 3. 核心函数实现
# --------------------------
def initialize_weights(n_features):
    """Build the starting weight vector for the perceptron.

    Starting from zeros means no feature is favoured initially.

    Args:
        n_features: Number of input features.

    Returns:
        Zero-filled NumPy array of shape (n_features,).
    """
    return np.zeros(shape=n_features)

def train_perceptron(X, y, weights, bias, learning_rate, epochs):
    """Train a perceptron, updating the parameters sample by sample.

    Each sample is classified with the current parameters; when the
    prediction is wrong the parameters are corrected:
        w = w + lr * y * x
        b = b + lr * y

    Args:
        X: Training data, shape (n_samples, n_features).
        y: Training labels, shape (n_samples,), each +1 or -1.
        weights: Initial weights, shape (n_features,). A copy is trained,
            so the caller's array is left untouched.
        bias: Initial bias (float).
        learning_rate: Step size of each update.
        epochs: Number of full passes over the training data.

    Returns:
        A tuple (weights, bias, loss_history):
            weights: Trained weight vector (a new float array).
            bias: Trained bias.
            loss_history: Number of misclassified samples in every epoch.
    """
    # Private float copy: avoids overwriting the caller's initial weights via
    # the in-place `+=` and makes integer-typed initial weights usable.
    weights = np.array(weights, dtype=float)
    n_samples = X.shape[0]
    loss_history = []

    for epoch in range(epochs):
        misclassified = 0  # mistakes made during this pass

        for i in range(n_samples):
            x_i = X[i]
            y_i = y[i]

            # Linear score followed by the sign activation (0 maps to -1).
            linear_output = np.dot(x_i, weights) + bias
            y_pred = 1 if linear_output > 0 else -1

            # A wrong prediction triggers the parameter update.
            if y_pred != y_i:
                weights += learning_rate * y_i * x_i
                bias += learning_rate * y_i
                misclassified += 1

        loss_history.append(misclassified)

        # Progress report every 10 epochs.
        if (epoch + 1) % 10 == 0:
            # Guard against an empty training set (avoids ZeroDivisionError).
            mis_rate = misclassified / n_samples if n_samples else 0.0
            print(f"Epoch {epoch+1:2d}/{epochs} | 误分类数：{misclassified:3d} | 误分类率：{mis_rate:.4f}")

    return weights, bias, loss_history

def predict(X, weights, bias):
    """Classify samples with a trained perceptron.

    Pipeline: features -> linear score w.x + b -> sign activation -> label.

    Args:
        X: Data to classify, shape (n_samples, n_features).
        weights: Trained weight vector.
        bias: Trained bias.

    Returns:
        Array of shape (n_samples,) with +1 where the score is > 0 and -1
        otherwise.
    """
    scores = np.dot(X, weights)
    scores = scores + bias
    return np.where(scores > 0, 1, -1)

# --------------------------
# 4. 模型训练与评估
# --------------------------
# 超参数设置
learning_rate = 0.01
epochs = 50
n_features = X_train.shape[1]

# 初始化权重和偏置
init_weights = initialize_weights(n_features)
init_bias = 0.0

# 训练模型
print("\n【开始训练】")
trained_weights, trained_bias, loss_history = train_perceptron(
    X_train, y_train, init_weights, init_bias, learning_rate, epochs
)

# 测试集预测
y_pred = predict(X_test, trained_weights, trained_bias)

# 计算准确率（将-1/1映射回0/1，方便理解）
y_test_mapped = np.where(y_test == -1, 0, 1)
y_pred_mapped = np.where(y_pred == -1, 0, 1)
accuracy = accuracy_score(y_test_mapped, y_pred_mapped)
print(f"\n【测试集评估】")
print(f"准确率：{accuracy:.4f}")

# 混淆矩阵计算与可视化
cm = confusion_matrix(y_test_mapped, y_pred_mapped)
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm, annot=True, fmt="d", cmap="Blues",
    xticklabels=["恶性（0）", "良性（1）"],
    yticklabels=["恶性（0）", "良性（1）"]
)
plt.xlabel("预测标签")
plt.ylabel("真实标签")
plt.title("感知机乳腺癌分类混淆矩阵")
plt.show()

# 收敛曲线可视化（误分类数随轮次变化）
plt.figure(figsize=(8, 4))
plt.plot(range(1, epochs+1), loss_history, color="orange", linewidth=2)
plt.xlabel("训练轮次（Epoch）")
plt.ylabel("误分类样本数")
plt.title("感知机收敛曲线（学习率=0.01）")
plt.grid(alpha=0.3)
plt.show()

# --------------------------
# 5. 超参数影响分析（学习率+迭代次数）
# --------------------------
print("\n【超参数影响分析】")

# （1）不同学习率对收敛的影响
print("\n1. 不同学习率的收敛对比：")
learning_rates = [0.001, 0.01, 0.1]
lr_loss_histories = []

for lr in learning_rates:
    init_w = initialize_weights(n_features)
    init_b = 0.0
    _, _, loss_hist = train_perceptron(
        X_train, y_train, init_w, init_b, lr, epochs=50
    )
    lr_loss_histories.append(loss_hist)
    print(f"学习率={lr}：最终误分类数={loss_hist[-1]}")

# 可视化学习率影响
plt.figure(figsize=(8, 4))
for i, lr in enumerate(learning_rates):
    plt.plot(range(1, 51), lr_loss_histories[i], label=f"学习率={lr}")
plt.xlabel("训练轮次")
plt.ylabel("误分类样本数")
plt.title("不同学习率对感知机收敛的影响")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# （2）不同迭代次数对准确率的影响
print("\n2. 不同迭代次数的准确率对比：")
epoch_list = [50, 100, 200]
acc_list = []

for epochs in epoch_list:
    init_w = initialize_weights(n_features)
    init_b = 0.0
    w, b, _ = train_perceptron(
        X_train, y_train, init_w, init_b, learning_rate=0.01, epochs=epochs
    )
    y_pred = predict(X_test, w, b)
    y_pred_mapped = np.where(y_pred == -1, 0, 1)
    acc = accuracy_score(y_test_mapped, y_pred_mapped)
    acc_list.append(acc)
    print(f"迭代次数={epochs}：准确率={acc:.4f}")

# 可视化迭代次数影响
plt.figure(figsize=(6, 4))
plt.bar(epoch_list, acc_list, width=10, color="skyblue", alpha=0.7)
plt.xlabel("迭代次数（Epoch）")
plt.ylabel("测试集准确率")
plt.title("迭代次数对感知机准确率的影响")
plt.ylim(0.9, 1.0)
plt.grid(axis="y", alpha=0.3)
plt.show()

# ==============================================
# 第二部分：K均值聚类（K-Means）- 合成数据聚类
# ==============================================
print("\n" + "="*50)
print("第二部分：K均值聚类（K-Means）")
print("="*50)

# --------------------------
# 1. 原理回顾
# --------------------------
print("\n【K-Means核心原理】")
print("1. 目标：最小化簇内平方和（WCSS），使簇内样本紧凑、簇间分离")
print("2. 迭代步骤：初始化中心→分配簇（E步）→更新中心（M步）→收敛")
print("3. 收敛条件：簇中心变化量≤tol 或 达到最大迭代次数")
print("4. 关键：K值需手动指定，初始中心随机选择")

# --------------------------
# 2. 核心函数实现
# --------------------------
def generate_data(n, k, seed):
    """Create a 2-D synthetic dataset made of k Gaussian clusters.

    Seeds the global NumPy generator, draws k centres uniformly from
    [-10, 10) per coordinate, then draws n samples with N(0, 1) noise
    around each centre.

    Args:
        n: Samples per cluster.
        k: Number of clusters.
        seed: Random seed for reproducibility.

    Returns:
        Array of shape (n * k, 2), clusters stacked one after another.
    """
    np.random.seed(seed)
    centers = np.random.uniform(low=-10, high=10, size=(k, 2))

    # Draw the blobs in centre order so the random stream is consumed
    # centre by centre.
    blobs = [
        center + np.random.normal(loc=0, scale=1.0, size=(n, 2))
        for center in centers
    ]
    stacked = np.vstack(blobs)
    return stacked

def initialize_centroids(X, k):
    """Choose k distinct rows of X as the starting cluster centres.

    Args:
        X: Input data, shape (n_samples, n_features).
        k: Number of clusters.

    Returns:
        Array of shape (k, n_features) with the selected samples.
    """
    row_count = X.shape[0]
    # replace=False: no row index is drawn twice.
    chosen = np.random.choice(row_count, size=k, replace=False)
    centroids = X[chosen]
    return centroids

def compute_distances(X, centroids):
    """Return the Euclidean distance between every sample and every centre.

    d(x, c) = sqrt(sum_i (x_i - c_i)^2)

    Implemented as one broadcast operation rather than a Python double
    loop; array-likes such as nested lists are accepted.

    Args:
        X: Input data, shape (n_samples, n_features).
        centroids: Cluster centres, shape (k, n_features).

    Returns:
        Float array of shape (n_samples, k); entry [i, j] is the distance
        from sample i to centre j.
    """
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)

    # Treat 1-D inputs as single-feature data.
    if X.ndim == 1:
        X = X[:, np.newaxis]
    if centroids.ndim == 1:
        centroids = centroids[:, np.newaxis]

    # (n_samples, 1, d) - (1, k, d) -> (n_samples, k, d) difference tensor.
    diffs = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    return np.linalg.norm(diffs, axis=2)

def assign_clusters(X, centroids):
    """Label each sample with the index of its closest centre (E-step).

    Args:
        X: Input data, shape (n_samples, n_features).
        centroids: Cluster centres, shape (k, n_features).

    Returns:
        Integer array of shape (n_samples,) with values in 0..k-1.
    """
    # Row-wise argmin over the (n_samples, k) distance matrix.
    return np.argmin(compute_distances(X, centroids), axis=1)

def update_centroids(X, labels, k):
    """Move every cluster centre to the mean of its samples (M-step).

    The mean is the point that minimises the within-cluster sum of squares.
    A cluster with no samples would yield a NaN centre (mean of an empty
    slice), and a NaN centre captures every sample at the next argmin-based
    assignment; such a cluster is re-seeded with a random sample of X.

    Args:
        X: Input data, shape (n_samples, n_features).
        labels: Cluster index of every sample, shape (n_samples,).
        k: Number of clusters.

    Returns:
        Array of shape (k, n_features) with the updated centres.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    n_samples, n_features = X.shape
    new_centroids = np.zeros((k, n_features))

    for j in range(k):
        members = X[labels == j]
        if members.shape[0] > 0:
            # Column-wise mean of the members is the new centre.
            new_centroids[j] = members.mean(axis=0)
        elif n_samples > 0:
            # Empty cluster: re-seed from the data instead of emitting NaN.
            new_centroids[j] = X[np.random.randint(n_samples)]
        # With no data at all the centre stays at the zero vector.

    return new_centroids

def k_means(X, k, max_iters=100, tol=1e-4):
    """Run the full K-means algorithm.

    Alternates the assignment step and the centre-update step until the
    largest centre displacement is <= tol or max_iters iterations have run.

    Args:
        X: Input data, shape (n_samples, n_features).
        k: Number of clusters.
        max_iters: Maximum number of iterations.
        tol: Convergence threshold on the largest centre shift.

    Returns:
        A tuple (centroids, labels, centroids_history):
            centroids: Final centres, shape (k, n_features); always equal
                to the last entry of centroids_history.
            labels: Cluster index of every sample with respect to the
                returned centres, shape (n_samples,).
            centroids_history: Initial centres followed by the centres
                after each iteration.
    """
    # 1. Initialise the centres and start the trajectory record.
    centroids = initialize_centroids(X, k)
    centroids_history = [centroids.copy()]

    for iteration in range(max_iters):
        # 2. E-step: assign every sample to its nearest centre.
        labels = assign_clusters(X, centroids)

        # 3. M-step: move the centres.
        new_centroids = update_centroids(X, labels, k)

        # 4. Largest Euclidean displacement of any centre in this step.
        centroid_shift = np.max(np.linalg.norm(new_centroids - centroids, axis=1))

        centroids_history.append(new_centroids.copy())

        # Adopt the new centres before the convergence check so that the
        # returned centres always match the last history entry.
        centroids = new_centroids

        if (iteration + 1) % 10 == 0:
            print(f"Iteration {iteration+1:2d}/{max_iters} | 中心最大变化量：{centroid_shift:.6f}")

        if centroid_shift <= tol:
            print(f"算法收敛（中心变化量≤{tol}），迭代次数：{iteration+1}")
            break

    # Final assignment against the returned centres. This also defines
    # `labels` when max_iters is 0 and the loop body never ran.
    labels = assign_clusters(X, centroids)
    return centroids, labels, centroids_history

# --------------------------
# 3. 模型运行与可视化
# --------------------------
# 生成合成数据：3个簇，每个簇50个样本，种子=42
n_per_cluster = 50
k_clusters = 3
X_cluster = generate_data(n=n_per_cluster, k=k_clusters, seed=42)
print(f"\n【合成数据信息】")
print(f"数据形状：{X_cluster.shape}（{n_per_cluster*k_clusters}个样本，2个特征）")

# 可视化原始数据（无标签）
plt.figure(figsize=(6, 4))
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], s=50, alpha=0.7, color="gray")
plt.title("原始合成数据（无簇标签）")
plt.xlabel("特征1")
plt.ylabel("特征2")
plt.grid(alpha=0.3)
plt.show()

# 运行K-Means算法
print("\n【开始K-Means聚类】")
final_centroids, final_labels, centroids_history = k_means(
    X_cluster, k=k_clusters, max_iters=100, tol=1e-4
)

# 可视化最终聚类结果
plt.figure(figsize=(8, 6))
colors = ["red", "blue", "green"]  # 3个簇的颜色

# 绘制每个簇的样本
for j in range(k_clusters):
    cluster_samples = X_cluster[final_labels == j]
    plt.scatter(
        cluster_samples[:, 0], cluster_samples[:, 1],
        s=60, alpha=0.7, color=colors[j], label=f"簇{j+1}"
    )

# 绘制最终簇中心（黑色星号）
plt.scatter(
    final_centroids[:, 0], final_centroids[:, 1],
    s=200, color="black", marker="*", label="簇中心"
)

plt.title(f"K-Means聚类最终结果（k={k_clusters}）")
plt.xlabel("特征1")
plt.ylabel("特征2")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# 可视化簇中心迭代轨迹
plt.figure(figsize=(8, 6))
# 绘制所有样本（灰色背景）
plt.scatter(X_cluster[:, 0], X_cluster[:, 1], s=50, alpha=0.3, color="gray")

# 绘制每个中心的迭代轨迹（虚线连接）
for j in range(k_clusters):
    # 提取第j个中心的所有迭代位置
    centroid_trajectory = np.array([hist[j] for hist in centroids_history])
    plt.plot(
        centroid_trajectory[:, 0], centroid_trajectory[:, 1],
        color=colors[j], linestyle="--", linewidth=2, marker="o", markersize=4,
        label=f"簇{j+1}中心轨迹"
    )

# 绘制最终中心（黑色星号）
plt.scatter(
    final_centroids[:, 0], final_centroids[:, 1],
    s=200, color="black", marker="*", label="最终簇中心"
)

plt.title("K-Means簇中心迭代轨迹")
plt.xlabel("特征1")
plt.ylabel("特征2")
plt.legend()
plt.grid(alpha=0.3)
plt.show()

print("\n【聚类完成】")
print(f"最终簇中心：\n{final_centroids}")

三、代码运行说明

1. 依赖库安装

运行前需安装以下库（终端执行）：

python 复制代码

pip install numpy matplotlib scikit-learn seaborn

2. 核心输出

感知机部分：训练收敛曲线、测试集准确率、混淆矩阵、不同学习率 / 迭代次数的对比图；
K-Means 部分：原始数据图、最终聚类结果图、簇中心迭代轨迹图。

3. 关键结论

感知机：乳腺癌数据集（使用全部 30 个特征时）近似线性可分，测试集准确率可达 95% 以上；由于权重和偏置从 0 初始化，不同学习率只会缩放参数而不改变预测，收敛曲线基本一致，不存在"最优学习率"；
K-Means：合成数据聚类效果清晰，簇中心轨迹逐渐稳定，验证了算法的收敛性。

感知机：乳腺癌分类实现 & K 均值聚类：从零实现