01-编程基础与数学基石：概率与统计

概率与统计：AI处理不确定性的数学工具

一、为什么AI需要概率与统计？

1.1 现实世界充满不确定性

python 复制代码

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pandas as pd

# Section banner: a title framed by two 60-character rules.
print("=" * 60, "概率与统计在AI中的应用场景", "=" * 60, sep="\n")

# AI technique -> one-line description of how it relies on probability/statistics.
applications = {
    "朴素贝叶斯": "用条件概率做文本分类",
    "隐马尔可夫模型": "序列标注（词性标注、语音识别）",
    "变分自编码器(VAE)": "生成模型，学习数据分布",
    "贝叶斯神经网络": "量化模型预测的不确定性",
    "强化学习": "用概率处理随机策略",
    "异常检测": "用统计判断什么是异常",
}

# One entry per technique: a blank line, the name, then the indented description.
for app, desc in applications.items():
    print(f"\n📌 {app}:\n   {desc}")

# Intuitive example: why a classifier should report probabilities at all.
print(
    "\n🎲 示例：分类器的不确定性",
    "   模型预测: 这张图片90%是猫，10%是狗",
    "   → 概率让我们知道模型有多确定！",
    sep="\n",
)

二、概率分布：描述随机变量的行为

2.1 均匀分布：所有结果等可能

python 复制代码

# 均匀分布：每个值出现的概率相同
np.random.seed(42)

uniform_data = np.random.uniform(0, 1, 10000)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# 1. 均匀分布
axes[0, 0].hist(uniform_data, bins=50, density=True, alpha=0.7, color='blue')
x = np.linspace(0, 1, 100)
axes[0, 0].plot(x, [1]*len(x), 'r-', linewidth=2, label='理论概率密度')
axes[0, 0].set_title('均匀分布 U(0,1)\n所有值等可能')
axes[0, 0].set_xlabel('x')
axes[0, 0].set_ylabel('概率密度')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# AI应用：随机初始化神经网络权重
print("\n📊 均匀分布AI应用:")
print("   神经网络权重初始化: np.random.uniform(-0.1, 0.1, size=(784, 256))")

2.2 正态分布：自然界最常见

python 复制代码

# 正态分布（高斯分布）：均值μ，标准差σ
means = [0, 0, 0, -2]
stds = [0.5, 1, 2, 0.7]
colors = ['blue', 'red', 'green', 'purple']
labels = [f'N(0, 0.5²)', f'N(0, 1²)', f'N(0, 2²)', f'N(-2, 0.7²)']

x = np.linspace(-6, 6, 1000)

for mu, sigma, color, label in zip(means, stds, colors, labels):
    y = stats.norm.pdf(x, mu, sigma)
    axes[0, 1].plot(x, y, color=color, linewidth=2, label=label)

axes[0, 1].set_title('正态分布家族\n均值μ决定中心，标准差σ决定宽度')
axes[0, 1].set_xlabel('x')
axes[0, 1].set_ylabel('概率密度')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 正态分布的68-95-99.7法则
axes[0, 2].text(0.1, 0.5, 
                '68-95-99.7 法则:\n'
                '• μ±σ: 68% 的数据\n'
                '• μ±2σ: 95% 的数据\n'
                '• μ±3σ: 99.7% 的数据',
                transform=axes[0, 2].transAxes,
                fontsize=12, bbox=dict(boxstyle='round', facecolor='lightyellow'))

# 演示正态分布
normal_data = np.random.randn(10000)  # 标准正态分布
axes[0, 2].hist(normal_data, bins=50, density=True, alpha=0.6, color='skyblue')
x_norm = np.linspace(-4, 4, 100)
axes[0, 2].plot(x_norm, stats.norm.pdf(x_norm), 'r-', linewidth=2)
axes[0, 2].set_title('标准正态分布 N(0,1)\nAI中用于初始化、噪声添加')
axes[0, 2].set_xlabel('x')
axes[0, 2].set_ylabel('密度')

print("\n📊 正态分布AI应用:")
print("   1. 权重初始化: 用正态分布初始化神经网络")
print("   2. 添加噪声: 训练时加高斯噪声增强鲁棒性")
print("   3. 误差分布: 很多自然现象误差服从正态分布")

2.3 伯努利分布：抛硬币

python 复制代码

# Bernoulli distribution: exactly two outcomes (0 / 1)
p = 0.7  # probability of success
bernoulli_data = np.random.binomial(1, p, 1000)

# Visualization
# minlength=2 guarantees one bin for 0 and one for 1 even if a sample happens
# to contain a single outcome; otherwise bar() would receive mismatched lengths.
counts = np.bincount(bernoulli_data, minlength=2)
# Normalize by the actual sample size instead of repeating the literal 1000.
axes[1, 0].bar([0, 1], counts / bernoulli_data.size, color=['red', 'green'], alpha=0.7)
axes[1, 0].set_xticks([0, 1])
axes[1, 0].set_xticklabels(['失败(0)', '成功(1)'])
axes[1, 0].set_ylim(0, 1)
axes[1, 0].set_ylabel('概率')
axes[1, 0].set_title(f'伯努利分布 B({p})\n每次试验只有两种结果')
axes[1, 0].grid(True, alpha=0.3, axis='y')

# Theoretical probabilities as dashed guides.
# xmin/xmax are fractions of the axes width, roughly spanning each bar.
axes[1, 0].axhline(y=1-p, xmin=0, xmax=0.4, color='red', linestyle='--', alpha=0.5)
axes[1, 0].axhline(y=p, xmin=0.6, xmax=1, color='green', linestyle='--', alpha=0.5)

# The 2x3 grid has two panels that no section draws into; hide them so the
# figure does not show empty frames, then tidy the spacing of this figure.
axes[1, 1].axis('off')
axes[1, 2].axis('off')
fig.tight_layout()

# AI应用：二分类问题
print("\n📊 伯努利分布AI应用:")
print("   二分类问题: 是/否、猫/狗、垃圾邮件/正常邮件")
print("   输出层用Sigmoid: 输出[0,1]之间的概率")

# 演示：用伯努利分布做二分类
def binary_classifier_demo():
    """Print sigmoid probabilities and thresholded labels for five fixed scores.

    The scores are passed through the logistic sigmoid and then compared
    against 0.5 to obtain a boolean class prediction per score. Nothing is
    returned; the two result lines are written to stdout.
    """
    raw_scores = np.array([0.1, 0.3, 0.6, 0.8, 0.9])
    # Logistic sigmoid: squashes each raw score into a probability.
    probs = 1 / (1 + np.exp(-raw_scores))
    is_positive = probs > 0.5

    print(f"\n   分类器输出概率: {probs}")
    print(f"   预测类别: {is_positive}")
    
binary_classifier_demo()

三、条件概率与贝叶斯公式

3.1 条件概率：在已知信息下的概率

python 复制代码

# Conditional probability: P(A|B) = P(A∩B) / P(B)
# Meaning: the probability that A occurs given that B has occurred.

# Example: disease screening.
# Assume a disease has a prevalence of 1% and the test is 99% accurate.
# Question: if the test is positive, what is the probability of really being ill?

# Prior probabilities
P_disease = 0.01      # probability of having the disease
P_healthy = 0.99      # probability of being healthy

# Test accuracy
P_positive_given_disease = 0.99   # true positive rate (sensitivity)
P_negative_given_healthy = 0.99   # true negative rate (specificity)
P_positive_given_healthy = 0.01   # false positive rate

# Posterior via Bayes' rule. The denominator is P(positive), expanded over the
# two ways a positive result can arise: a diseased or a healthy patient.
P_disease_given_positive = (P_positive_given_disease * P_disease) / \
                           (P_positive_given_disease * P_disease + 
                            P_positive_given_healthy * P_healthy)

print("\n" + "=" * 60)
print("贝叶斯公式实例：疾病检测")
print("=" * 60)
print(f"患病率: {P_disease*100}%")
print(f"检测准确率: {P_positive_given_disease*100}%")
print(f"检测呈阳性时真正患病的概率: {P_disease_given_positive*100:.2f}%")
print("\n⚠️ 即使检测准确率99%，由于患病率低，阳性结果只有约50%是真的！")

# Visualize the Bayesian update
fig, ax = plt.subplots(figsize=(10, 6))

# Posterior P(disease | positive) over a range of priors (prevalences).
# The test's rates are read from P_positive_given_disease and
# P_positive_given_healthy instead of repeating 0.99 / 0.01 as literals, so
# the curve and the highlighted example below always describe the same test.
priors = np.linspace(0.001, 0.1, 100)
true_positive_mass = P_positive_given_disease * priors
false_positive_mass = P_positive_given_healthy * (1 - priors)
posteriors = true_positive_mass / (true_positive_mass + false_positive_mass)

ax.plot(priors, posteriors, 'b-', linewidth=2)
# Reference diagonal: where the posterior would lie if the test carried no information.
ax.plot([0, 0.1], [0, 0.1], 'r--', alpha=0.5, label='y=x（无信息增益）')
ax.set_xlabel('先验概率（患病率）')
ax.set_ylabel('后验概率（检测阳性后患病概率）')
ax.set_title('贝叶斯更新：检测信息如何改变信念')
ax.legend()
ax.grid(True, alpha=0.3)

# Highlight the worked example (1% prevalence) on the curve
ax.plot(P_disease, P_disease_given_positive, 'ro', markersize=10)
ax.annotate(f'患病率={P_disease*100}%\n后验={P_disease_given_positive*100:.1f}%',
            xy=(P_disease, P_disease_given_positive),
            xytext=(0.04, 0.3),
            arrowprops=dict(arrowstyle='->'))

plt.tight_layout()
plt.show()

3.2 朴素贝叶斯分类器

python 复制代码

# 实现一个简单的朴素贝叶斯分类器
class NaiveBayesClassifier:
    """Gaussian naive Bayes classifier for continuous features.

    "Naive" refers to the modelling assumption that features are conditionally
    independent given the class, so the class-conditional likelihood is a
    product of per-feature terms. Each term is a univariate normal density
    whose mean and standard deviation are estimated per class in fit().

    Posteriors are evaluated in log space. Multiplying raw densities, as a
    direct implementation would, underflows to 0 for samples far from every
    class (or with many features), which turns the normalisation into 0/0.
    """

    def __init__(self):
        # class label -> prior probability P(y)
        self.class_probs = {}
        # class label -> {feature index: (mean, std)} describing P(x_i | y)
        self.feature_probs = {}
        # sorted array of distinct class labels; None until fit() has run
        self.classes = None

    def fit(self, X, y):
        """Estimate class priors and per-class feature statistics.

        Args:
            X: array-like of shape (n_samples, n_features).
            y: array-like of shape (n_samples,) holding class labels.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if X is not 2-D, y is not 1-D, their lengths differ,
                or there are no samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-D with one label per row of X")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty dataset")

        # Start from a clean slate so a second fit() does not keep classes
        # that only existed in the previous training set.
        self.classes = np.unique(y)
        self.class_probs = {}
        self.feature_probs = {}

        for c in self.classes:
            X_c = X[y == c]
            self.class_probs[c] = X_c.shape[0] / n_samples
            means = X_c.mean(axis=0)
            stds = X_c.std(axis=0)
            self.feature_probs[c] = {
                i: (means[i], stds[i]) for i in range(n_features)
            }

        return self

    def _gaussian_pdf(self, x, mean, std):
        """Return the normal density N(x; mean, std) for a scalar x.

        A zero standard deviation is treated as a point mass: 1 when x equals
        the mean, 0 otherwise.
        """
        if std == 0:
            return 1 if x == mean else 0
        return (1 / (np.sqrt(2 * np.pi) * std)) * \
               np.exp(-((x - mean) ** 2) / (2 * std ** 2))

    def _gaussian_log_pdf(self, x, mean, std):
        """Return log N(x; mean, std) element-wise for an array x.

        Mirrors _gaussian_pdf, including the point-mass convention for
        std == 0 (log 1 = 0 at the mean, log 0 = -inf elsewhere).
        """
        if std == 0:
            return np.where(x == mean, 0.0, -np.inf)
        return (-0.5 * np.log(2 * np.pi) - np.log(std)
                - ((x - mean) ** 2) / (2 * std ** 2))

    def predict_proba(self, X):
        """Return posterior probabilities P(y | x) for each row of X.

        Args:
            X: array-like of shape (n_samples, n_features).

        Returns:
            Array of shape (n_samples, n_classes); columns follow the order of
            self.classes and every row sums to 1. A sample that has zero
            likelihood under every class (only possible with zero-variance
            features) falls back to the class priors.

        Raises:
            RuntimeError: if called before fit().
            ValueError: if X is not 2-D or its feature count differs from the
                training data.
        """
        if self.classes is None:
            raise RuntimeError("fit() must be called before predict_proba()")
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")

        n_samples = X.shape[0]
        priors = np.array([self.class_probs[c] for c in self.classes])
        log_joint = np.empty((n_samples, len(self.classes)))

        for j, c in enumerate(self.classes):
            params = self.feature_probs[c]
            if X.shape[1] != len(params):
                raise ValueError(
                    f"X has {X.shape[1]} features, but the model was fitted "
                    f"with {len(params)}"
                )
            # log P(y) + sum_i log P(x_i | y)
            total = np.full(n_samples, np.log(priors[j]))
            for k, (mean, std) in params.items():
                total = total + self._gaussian_log_pdf(X[:, k], mean, std)
            log_joint[:, j] = total

        # Log-sum-exp normalisation: shifting each row by its maximum keeps
        # exp() in range, and the shift cancels in the ratio.
        row_max = log_joint.max(axis=1, keepdims=True)
        impossible = np.isneginf(row_max[:, 0])
        row_max[impossible] = 0.0  # avoid (-inf) - (-inf) = NaN below
        weights = np.exp(log_joint - row_max)
        weights[impossible] = priors
        return weights / weights.sum(axis=1, keepdims=True)

    def predict(self, X):
        """Return the most probable class label for each row of X."""
        probas = self.predict_proba(X)
        return self.classes[np.argmax(probas, axis=1)]

# 生成测试数据
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate a synthetic two-class dataset
X, y = make_classification(n_samples=500, n_features=4, n_classes=2,
                           n_clusters_per_class=1, random_state=42)
# Seed the split as well: the dataset is already seeded, and without a fixed
# random_state here the reported accuracy would change on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# Train the naive Bayes classifier
nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)

# Predict labels and class probabilities on the held-out set
y_pred = nb.predict(X_test)
accuracy = np.mean(y_pred == y_test)
y_proba = nb.predict_proba(X_test)

print("\n" + "=" * 60)
print("朴素贝叶斯分类器")
print("=" * 60)
print(f"准确率: {accuracy*100:.2f}%")
print(f"\n先验概率 P(y):")
for c, prob in nb.class_probs.items():
    print(f"  类别{c}: {prob:.3f}")

# 可视化预测不确定性
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# 预测概率分布
axes[0].hist(y_proba[y_test==0, 0], bins=20, alpha=0.5, label='真实类别0', color='blue')
axes[0].hist(y_proba[y_test==1, 0], bins=20, alpha=0.5, label='真实类别1', color='red')
axes[0].set_xlabel('预测为类别0的概率')
axes[0].set_ylabel('频数')
axes[0].set_title('朴素贝叶斯预测概率分布')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# 混淆矩阵
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
axes[1].imshow(cm, cmap='Blues')
for i in range(2):
    for j in range(2):
        axes[1].text(j, i, cm[i, j], ha='center', va='center', fontsize=16)
axes[1].set_xlabel('预测类别')
axes[1].set_ylabel('真实类别')
axes[1].set_title('混淆矩阵')
axes[1].set_xticks([0, 1])
axes[1].set_yticks([0, 1])

plt.tight_layout()
plt.show()

四、期望、方差、标准差

4.1 基本概念

python 复制代码

# 期望（均值）：数据的中心
# 方差：数据的离散程度
# 标准差：方差的平方根，更直观

# 生成不同方差的数据
np.random.seed(42)
data_low_var = np.random.normal(0, 1, 1000)      # 低方差
data_high_var = np.random.normal(0, 3, 1000)     # 高方差

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. 不同方差的分布对比
axes[0, 0].hist(data_low_var, bins=50, alpha=0.5, label=f'低方差 (σ=1)', color='blue')
axes[0, 0].hist(data_high_var, bins=50, alpha=0.5, label=f'高方差 (σ=3)', color='red')
axes[0, 0].set_xlabel('值')
axes[0, 0].set_ylabel('频数')
axes[0, 0].set_title('不同方差的数据分布')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. 期望的几何意义
x = np.linspace(-4, 4, 100)
y = stats.norm.pdf(x, 0, 1)
axes[0, 1].plot(x, y, 'b-', linewidth=2)
axes[0, 1].axvline(x=0, color='red', linestyle='--', linewidth=2, label='期望 μ=0')
axes[0, 1].fill_between(x, y, where=(x >= -1) & (x <= 1), alpha=0.3, color='green')
axes[0, 1].set_xlabel('x')
axes[0, 1].set_ylabel('概率密度')
axes[0, 1].set_title('期望是分布的中心')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. 方差的计算公式可视化
axes[1, 0].text(0.1, 0.5,
                '方差公式:\n\n'
                'Var(X) = E[(X - μ)²]\n\n'
                '     = E[X²] - (E[X])²\n\n'
                '标准差: σ = √Var(X)',
                transform=axes[1, 0].transAxes,
                fontsize=14, bbox=dict(boxstyle='round', facecolor='lightblue'))

# 4. AI中的应用
axes[1, 1].axis('off')
applications_text = """
🎯 在AI中的核心应用:

1. 特征标准化:
   X_scaled = (X - μ) / σ
   使不同特征在同一尺度

2. Batch Normalization:
   在训练过程中标准化激活值
   加速收敛，提高稳定性

3. 模型评估:
   均值 ± 标准差 表示预测不确定性

4. 初始化策略:
   Xavier初始化: Var(W) = 2/(n_in + n_out)
   防止梯度消失/爆炸
"""
axes[1, 1].text(0.1, 0.5, applications_text, transform=axes[1, 1].transAxes,
                fontsize=11, verticalalignment='center',
                bbox=dict(boxstyle='round', facecolor='lightyellow'))

plt.suptitle('期望、方差、标准差：描述数据的核心指标', fontsize=14)
plt.tight_layout()
plt.show()

# 演示：特征标准化
print("\n📊 特征标准化演示:")
data = np.random.randn(100, 3) * [1, 10, 100] + [0, 5, -50]
print(f"原始数据统计:")
print(f"  均值: {data.mean(axis=0)}")
print(f"  标准差: {data.std(axis=0)}")

# 标准化
data_scaled = (data - data.mean(axis=0)) / data.std(axis=0)
print(f"\n标准化后统计:")
print(f"  均值: {data_scaled.mean(axis=0)}")
print(f"  标准差: {data_scaled.std(axis=0)}")
print("✅ 所有特征现在在同一尺度！")

五、最大似然估计（MLE）

5.1 MLE的基本思想

最大似然估计：找到最有可能产生观测数据的参数

python 复制代码

# MLE直观理解：抛硬币
# 我们抛了10次硬币，得到7次正面
# 问：硬币正面概率p最可能是多少？

def likelihood(p, n_heads, n_tosses):
    """Binomial likelihood of seeing n_heads heads in n_tosses tosses.

    Evaluates C(n_tosses, n_heads) * p^n_heads * (1 - p)^(n_tosses - n_heads),
    i.e. the probability of the observed data for a coin whose head
    probability is p.
    """
    from scipy.special import comb

    n_tails = n_tosses - n_heads
    arrangements = comb(n_tosses, n_heads)
    heads_term = p ** n_heads
    tails_term = (1 - p) ** n_tails
    return arrangements * heads_term * tails_term

n_tosses = 10
n_heads = 7
# 101 evenly spaced candidates give a step of exactly 0.01, so the analytic
# maximiser n_heads / n_tosses = 0.7 is itself a grid point. With 100 points
# the step is 1/99 and the grid search lands on 0.697, which disagrees with
# the closed-form value of 0.70 printed alongside it.
p_values = np.linspace(0, 1, 101)
likelihoods = [likelihood(p, n_heads, n_tosses) for p in p_values]

# Grid-search MLE: the candidate p with the highest likelihood
mle_p = p_values[np.argmax(likelihoods)]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# 似然函数
axes[0].plot(p_values, likelihoods, 'b-', linewidth=2)
axes[0].axvline(x=mle_p, color='red', linestyle='--', label=f'MLE: p={mle_p:.3f}')
axes[0].set_xlabel('硬币正面概率 p')
axes[0].set_ylabel('似然 L(p|data)')
axes[0].set_title(f'抛{n_tosses}次，{n_heads}次正面\nMLE估计: p={mle_p:.3f}')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# 不同数据量下的MLE
axes[1].text(0.1, 0.5,
             f'MLE结果:\n\n'
             f'p̂ = {n_heads}/{n_tosses} = {n_heads/n_tosses:.2f}\n\n'
             f'这正是直观感受！\n'
             f'MLE给出的答案符合常识。',
             transform=axes[1].transAxes,
             fontsize=14, bbox=dict(boxstyle='round', facecolor='lightgreen'))

plt.suptitle('最大似然估计(MLE)：找最可能的参数', fontsize=14)
plt.tight_layout()
plt.show()

print(f"\n📊 最大似然估计结果:")
print(f"   抛硬币{n_tosses}次，{n_heads}次正面")
print(f"   MLE估计的p = {n_heads/n_tosses:.2f}")
print(f"   这符合直觉：正面比例就是概率估计！")

5.2 MLE在线性回归中的应用

python 复制代码

# 线性回归的MLE视角
# 假设：误差服从正态分布

np.random.seed(42)
X = np.linspace(0, 10, 50)
true_w, true_b = 2, 1
y_true = true_w * X + true_b
# Observations: the true line plus Gaussian noise with standard deviation 2.
y_obs = y_true + np.random.normal(0, 2, X.shape[0])

# MLE under Gaussian noise: minimising the negative log-likelihood is the same
# as minimising the squared error, whose solution satisfies the normal
# equations w = (X^T X)^(-1) X^T y.
# lstsq solves that least-squares problem directly, without forming and
# inverting X^T X, which is the numerically preferred route.

X_design = np.column_stack([np.ones_like(X), X])  # leading column of ones = bias term
w_mle = np.linalg.lstsq(X_design, y_obs, rcond=None)[0]
w_est, b_est = w_mle[1], w_mle[0]

# Residuals of the fitted line
y_pred = X_design @ w_mle
residuals = y_obs - y_pred

# Noise standard deviation estimated from the residuals (np.std divides by n)
sigma_est = np.std(residuals)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. 拟合结果
axes[0, 0].scatter(X, y_obs, alpha=0.6, label='观测数据')
axes[0, 0].plot(X, y_true, 'g-', linewidth=2, label='真实线')
axes[0, 0].plot(X, y_pred, 'r--', linewidth=2, label=f'MLE拟合 (w={w_est:.2f}, b={b_est:.2f})')
axes[0, 0].set_xlabel('X')
axes[0, 0].set_ylabel('y')
axes[0, 0].set_title('线性回归：MLE拟合结果')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. 残差分布
axes[0, 1].hist(residuals, bins=20, density=True, alpha=0.7, color='skyblue')
x_norm = np.linspace(-6, 6, 100)
axes[0, 1].plot(x_norm, stats.norm.pdf(x_norm, 0, sigma_est), 
                'r-', linewidth=2, label=f'N(0, {sigma_est:.2f}²)')
axes[0, 1].set_xlabel('残差')
axes[0, 1].set_ylabel('密度')
axes[0, 1].set_title('残差分布（应近似正态）')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Log-likelihood surface (3D)
noise_sigma = 2  # noise standard deviation used when y_obs was generated
w_range = np.linspace(1, 3, 50)
b_range = np.linspace(-1, 3, 50)
W, B = np.meshgrid(w_range, b_range)

# Gaussian log-likelihood at every grid point, computed by broadcasting:
# (n_b, n_w, 1) parameters against the (n_samples,) data give one residual
# vector per (w, b) pair. LogLikelihood[i, j] therefore belongs to
# (W[i, j], B[i, j]), which is the layout plot_surface expects. Filling the
# array at [j, i] from W[i, j], B[i, j] would draw the surface transposed.
residuals_grid = y_obs - (W[..., np.newaxis] * X + B[..., np.newaxis])
LogLikelihood = (-0.5 * len(X) * np.log(2 * np.pi * noise_sigma**2)
                 - np.sum(residuals_grid**2, axis=-1) / (2 * noise_sigma**2))

# Log-likelihood at the fitted parameters, so the marker sits on the surface
# at the estimate instead of at the height of the best grid cell.
mle_log_lik = (-0.5 * len(X) * np.log(2 * np.pi * noise_sigma**2)
               - np.sum((y_obs - (w_est * X + b_est))**2) / (2 * noise_sigma**2))

# Drop the flat 2D axes occupying this slot before adding the 3D one;
# otherwise its empty frame and ticks show through behind the surface.
axes[1, 0].remove()
ax3d = fig.add_subplot(2, 2, 3, projection='3d')
ax3d.plot_surface(W, B, LogLikelihood, cmap='viridis', alpha=0.8)
ax3d.scatter(w_est, b_est, mle_log_lik, color='red', s=100)
ax3d.set_xlabel('权重 w')
ax3d.set_ylabel('偏置 b')
ax3d.set_zlabel('对数似然')
ax3d.set_title('似然函数曲面\n红点是MLE估计')

# 4. MLE推导总结
axes[1, 1].axis('off')
mle_summary = f"""
📐 线性回归的MLE推导:

假设: y = wx + b + ε, ε ~ N(0, σ²)

似然函数: L = Π P(y_i|x_i)

对数似然: log L = -n/2·log(2πσ²) - Σ(y_i - ŷ_i)²/(2σ²)

最大化log L ⇔ 最小化 Σ(y_i - ŷ_i)²

结论: MLE等价于最小二乘法！

估计结果:
  ŵ = {w_est:.3f}
  b̂ = {b_est:.3f}
  σ̂ = {sigma_est:.3f}
"""
axes[1, 1].text(0.1, 0.5, mle_summary, transform=axes[1, 1].transAxes,
                fontsize=11, verticalalignment='center',
                bbox=dict(boxstyle='round', facecolor='lightyellow'))

plt.suptitle('最大似然估计在线性回归中的应用', fontsize=14)
plt.tight_layout()
plt.show()

六、实战：完整的贝叶斯推理系统

python 复制代码

# 实现一个贝叶斯垃圾邮件分类器
class BayesianSpamClassifier:
    """Bag-of-words naive Bayes spam filter with Laplace smoothing.

    Training counts how often each word occurs in spam and in ham emails.
    Prediction combines the class prior with the smoothed per-word
    likelihoods in log space and converts the resulting log-odds into a
    spam probability.
    """

    def __init__(self):
        # word -> number of occurrences in spam / ham training emails
        self.spam_word_counts = {}
        self.ham_word_counts = {}
        # total number of word tokens seen in each class
        self.spam_total = 0
        self.ham_total = 0
        # every distinct word seen in either class; its size is the |V| of
        # Laplace smoothing and must be shared by both classes
        self.vocabulary = set()
        # number of training emails per class, accumulated across fit() calls
        self.spam_emails = 0
        self.ham_emails = 0
        # class priors; uninformative until fit() has seen data
        self.p_spam = 0.5
        self.p_ham = 0.5

    def tokenize(self, text):
        """Lower-case the text and split it into word tokens."""
        import re
        return re.findall(r'\b\w+\b', text.lower())

    def fit(self, emails, labels):
        """Update word counts and class priors from labelled emails.

        May be called repeatedly; counts accumulate and the priors are
        recomputed from every email seen so far, so both always describe the
        same data.

        Args:
            emails: sequence of email texts.
            labels: sequence of the same length; 1 marks spam, anything else ham.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if emails and labels differ in length.
        """
        if len(emails) != len(labels):
            raise ValueError("emails and labels must have the same length")

        for email, label in zip(emails, labels):
            words = self.tokenize(email)
            self.vocabulary.update(words)

            if label == 1:  # spam
                self.spam_emails += 1
                self.spam_total += len(words)
                counts = self.spam_word_counts
            else:  # ham
                self.ham_emails += 1
                self.ham_total += len(words)
                counts = self.ham_word_counts

            for word in words:
                counts[word] = counts.get(word, 0) + 1

        # Priors from all emails seen so far; left untouched when there are none.
        n_emails = self.spam_emails + self.ham_emails
        if n_emails:
            self.p_spam = self.spam_emails / n_emails
            self.p_ham = 1 - self.p_spam

        return self

    def predict_proba(self, email):
        """Return the probability that the email is spam, as a float in [0, 1].

        With no training vocabulary the class prior is returned. If only one
        class was ever observed, its prior of 1 decides the result.
        """
        # A zero prior makes the posterior zero regardless of the words, and
        # handling it here avoids taking log(0).
        if self.p_spam <= 0:
            return 0.0
        if self.p_ham <= 0:
            return 1.0

        # Work with log-odds log P(spam|email) - log P(ham|email) to avoid
        # underflow from multiplying many small word probabilities.
        log_odds = np.log(self.p_spam) - np.log(self.p_ham)

        vocab_size = len(self.vocabulary)
        if vocab_size:
            # Laplace smoothing: (count + 1) / (class tokens + |V|), with the
            # same |V| for both classes. Using each class's own vocabulary
            # size would bias unseen words toward the class with fewer
            # distinct words.
            spam_denominator = self.spam_total + vocab_size
            ham_denominator = self.ham_total + vocab_size
            for word in self.tokenize(email):
                p_word_given_spam = (self.spam_word_counts.get(word, 0) + 1) / spam_denominator
                p_word_given_ham = (self.ham_word_counts.get(word, 0) + 1) / ham_denominator
                log_odds += np.log(p_word_given_spam) - np.log(p_word_given_ham)

        # Numerically stable logistic function: exp() only ever receives a
        # non-positive argument, so it cannot overflow on long emails.
        if log_odds >= 0:
            return float(1 / (1 + np.exp(-log_odds)))
        odds = np.exp(log_odds)
        return float(odds / (1 + odds))

    def predict(self, email, threshold=0.5):
        """Return 1 (spam) if the spam probability exceeds threshold, else 0."""
        prob = self.predict_proba(email)
        return 1 if prob > threshold else 0

# 创建示例数据
emails = [
    "Win a free prize now click here",  # 垃圾
    "Meeting at 3pm tomorrow",          # 正常
    "Get rich quick scheme",             # 垃圾
    "Project deadline extended",         # 正常
    "Limited time offer discount",       # 垃圾
    "Lunch break schedule",              # 正常
    "Congratulations you won",           # 垃圾
    "Weekly team sync",                  # 正常
]
labels = [1, 0, 1, 0, 1, 0, 1, 0]

# 训练分类器
spam_filter = BayesianSpamClassifier()
spam_filter.fit(emails, labels)

# 测试
test_emails = [
    "Click here to win free money",
    "Team meeting tomorrow at 2",
    "You are the lucky winner",
    "Project status update"
]

print("\n" + "=" * 60)
print("贝叶斯垃圾邮件过滤器")
print("=" * 60)

for email in test_emails:
    prob = spam_filter.predict_proba(email)
    prediction = spam_filter.predict(email)
    status = "🚨 垃圾邮件" if prediction == 1 else "✅ 正常邮件"
    print(f"\n邮件: {email}")
    print(f"  垃圾邮件概率: {prob:.3f}")
    print(f"  判定: {status}")

七、学习检查清单

基础概念（必须掌握）

理解概率的直观含义
掌握常见分布（均匀、正态、伯努利）
理解条件概率
知道贝叶斯公式
掌握期望、方差、标准差的计算和意义

核心应用（重要）

能解释朴素贝叶斯分类器的原理
理解MLE的基本思想
知道特征标准化的必要性
理解正态分布在AI中的普遍性

八、总结

概率与统计在AI中的核心价值：

概念	AI应用	解决的问题
概率分布	生成模型、初始化	描述数据的不确定性
条件概率	贝叶斯分类	在已知信息下做推理
贝叶斯公式	贝叶斯推理	更新信念（后验概率）
期望/方差	标准化、BN	数据预处理、稳定训练
MLE	参数估计	找到使观测数据似然最大的模型参数

关键公式记忆：

复制代码

贝叶斯公式：P(A|B) = P(B|A)P(A) / P(B)

标准化：X_scaled = (X - μ) / σ

正态分布：X ~ N(μ, σ²)

记住：

概率处理不确定性
统计从数据中学习
贝叶斯更新我们的信念
MLE是参数估计的基石