机器学习入门

1. Pandas的使用

python 复制代码

# Pandas练习

# 导入pandas库
import pandas as pd

# Column-oriented sample data: name, age, gender, city and score,
# one list per column and one position per student.
data_dict = dict(
    name=['张三', '李四', '王五'],
    age=[20, 21, 22],
    gender=['男', '女', '男'],
    city=['北京', '上海', '广州'],
    score=[80, 90, 85],
)

# Build the DataFrame; each dictionary key becomes a column.
df = pd.DataFrame(data=data_dict)

# Display the DataFrame.
print(df)
#     name  age    gender city  score
# 0   张三   20      男    北京    80
# 1   李四   21      女    上海    90
# 2   王五   22      男    广州    85


# Load the student records from the CSV file. Only the read itself is
# guarded, so the FileNotFoundError handler can only be triggered by the
# missing file and never by the analysis that follows.
try:
    csv_df = pd.read_csv('./students.csv')
except FileNotFoundError:
    print("文件未找到")
else:
    # Show the whole table.
    print(csv_df)
    #     name   score city  age
    # 0  Augus     80  北京   18
    # 1   Lucy     90  上海   60
    # 2   Lili     91  广州   34
    # 3   Jack     92  北京   22
    # 4  Poney     92  上海   45

    # Show the first 3 rows.
    print(csv_df.head(3))
    #    name    score city  age
    # 0  Augus     80  北京   18
    # 1   Lucy     90  上海   60
    # 2   Lili     91  广州   34

    # Summary statistics of the numeric columns.
    print(csv_df.describe())
    #           score        age
    # count   5.00000   5.000000
    # mean   89.00000  35.800000
    # std     5.09902  17.181385
    # min    80.00000  18.000000
    # 25%    90.00000  22.000000
    # 50%    91.00000  34.000000
    # 75%    92.00000  45.000000
    # max    92.00000  60.000000

    # Select a single column (returns a Series).
    names = csv_df['name']
    print(names)
    # 0    Augus
    # 1     Lucy
    # 2     Lili
    # 3     Jack
    # 4    Poney

    # Select several columns (returns a DataFrame).
    names_scores = csv_df[['name', 'score']]
    print(names_scores)
    #     name  score
    # 0  Augus     80
    # 1   Lucy     90
    # 2   Lili     91
    # 3   Jack     92
    # 4  Poney     92

    # Rows whose score is at least 90.
    high_scores = csv_df[csv_df['score'] >= 90]
    print(high_scores)
    #    name  score city  age
    # 1  Lucy     90  上海   60
    # 2  Lili     91  广州   34
    # 3  Jack     92  北京   22
    # 4 Poney     92  上海   45

    # Rows whose city is 北京.
    beijing_students = csv_df[csv_df['city'] == '北京']
    print(beijing_students)
    #    name    score  city    age
    # 0  Augus     80   北京     18
    # 3  Jack      92   北京     22

    # Rows whose age is at most 30.
    young_students = csv_df[csv_df['age'] <= 30]
    print(young_students)
    #    name  score city  age
    # 0  Augus  80   北京   18
    # 3  Jack   92   北京   22

    # Mean age.
    avg_age = csv_df['age'].mean()
    print(avg_age)
    # 35.8

    # Mean score.
    avg_score = csv_df['score'].mean()
    print(avg_score)
    # 89.0

    # Name of the top-scoring student. idxmax() returns the label of the
    # first row holding the maximum, so a tie (Jack and Poney both score 92
    # in the sample above) resolves to the earlier row, and no full sort of
    # the frame is needed.
    top_student = csv_df.loc[csv_df['score'].idxmax()]
    print(top_student['name'])
    # Jack

2. 线性回归

python 复制代码

# 线性回归

# 第1步：导入必要的库
# 导入数据处理模块
import numpy as np  
# 导入数据分析模块
import pandas as pd

# 导入将数据分为训练集和测试集的模块
from sklearn.model_selection import train_test_split
# 导入线性回归模型
from sklearn.linear_model import LinearRegression
# Import the evaluation metrics: mean absolute error, mean squared error and R^2
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Step 2: load and prepare the data

# Alternative: load the California housing data bundled with scikit-learn
# from sklearn.datasets import fetch_california_housing
# housing = fetch_california_housing()
# Convert to a pandas DataFrame for easier inspection
# df = pd.DataFrame(housing.data, columns=housing.feature_names)
# Add the PRICE column as the value to predict
# df['PRICE'] = housing.target

# Load the local data (CSV file)
df = pd.read_csv('./california_housing_dataset.csv')
# Use the MedHouseVal column as the target variable
df['PRICE'] = df['MedHouseVal']
# Show the first five rows
print(df.head())
#    MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude  MedHouseVal  PRICE
# 0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88    -122.23        4.526  4.526
# 1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86    -122.22        3.585  3.585
# 2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85    -122.24        3.521  3.521
# 3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85    -122.25        3.413  3.413
# 4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85    -122.25        3.422  3.422

# Show column names, non-null counts and dtypes
df.info()
#      Column       Non-Null Count  Dtype
# ---  ------       --------------  -----
#  0   MedInc       20640 non-null  float64
#  1   HouseAge     20640 non-null  float64
#  2   AveRooms     20640 non-null  float64
#  3   AveBedrms    20640 non-null  float64
#  4   Population   20640 non-null  float64
#  5   AveOccup     20640 non-null  float64
#  6   Latitude     20640 non-null  float64
#  7   Longitude    20640 non-null  float64
#  8   MedHouseVal  20640 non-null  float64
#  9   PRICE        20640 non-null  float64

# Feature matrix X: every column except the target. PRICE is a copy of
# MedHouseVal, so BOTH must be dropped. Leaving MedHouseVal in X leaks the
# target into the features and produces a meaningless perfect fit
# (MSE 0.00, R^2 1.00).
X = df.drop(columns=['PRICE', 'MedHouseVal'])
print(X)
#        MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  Longitude
# 0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88    -122.23
# 1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86    -122.22
# 2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85    -122.24
# 3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85    -122.25
# 4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85    -122.25
# ...       ...       ...       ...        ...         ...       ...       ...        ...


# Target vector y (the PRICE column)
y = df['PRICE']
print(y)
# 0   4.526
# 1   3.585
# 2   3.521
# 3   3.413
# 4   3.422
# ... ...

# Step 3: split the data into a training set and a test set

# 80% training / 20% test. random_state=42 fixes the shuffling so that the
# split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Shape of the training feature matrix
print(X_train.shape)
# (16512, 8)  => 16512 samples, 8 features

# Shape of the test feature matrix
print(X_test.shape)
# (4128, 8)  => 4128 samples, 8 features

# Step 4: create and train the model

# Create the linear regression model
model = LinearRegression()
# Fit it on the training set
model.fit(X_train, y_train)
# print(f"\n模型系数(w): {model.coef_}")
# print(f"\n模型截距(b): {model.intercept_}")

# Step 5: evaluate the model on the test set

# NOTE(review): the numeric outputs shown from here on are approximate values
# for the standard California housing data; re-run against the local CSV to
# confirm them.

# Predict on the held-out test features
y_pred = model.predict(X_test)
# Mean squared error: lower is better
mse = mean_squared_error(y_test, y_pred)
print(f"\n均方误差(MSE): {mse:.2f}")
# 均方误差(MSE): 0.56

# Coefficient of determination: higher is better. It is at most 1 and can be
# negative when the model is worse than always predicting the mean.
r2 = r2_score(y_test, y_pred)
print(f"\n决定系数(R^2): {r2:.2f}")
# 决定系数(R^2): 0.58

# Step 6: inspect the fitted coefficients

# Coefficient table for the scikit-learn built-in data
# feature_importance = pd.DataFrame({
#   'Feature':housing.feature_names,
#   'Coefficient': model.coef_
# })

# One coefficient per feature column. The features are not standardised, so
# the magnitudes reflect each feature's units and are not directly comparable
# as "importance".
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_,
})
print(feature_importance)

#       Feature  Coefficient
# 0      MedInc     0.448675
# 1    HouseAge     0.009724
# 2    AveRooms    -0.123323
# 3   AveBedrms     0.783145
# 4  Population    -0.000002
# 5    AveOccup    -0.003526
# 6    Latitude    -0.419792
# 7   Longitude    -0.433708

# Step 7: additional evaluation metrics

# Root mean squared error: lower is better (same units as the target)
rmse = np.sqrt(mse)
print(f"\n均方根误差(RMSE): {rmse:.2f}")
# 均方根误差(RMSE): 0.75

# Mean absolute error: lower is better. mean_absolute_error comes from
# sklearn.metrics and must be imported alongside the other metrics.
mae = mean_absolute_error(y_test, y_pred)
print(f"\n平均绝对误差(MAE): {mae:.2f}")
# 平均绝对误差(MAE): 0.53

3. 分类算法

python 复制代码

# 分类算法

# 第1步：导入必要的库和数据集

# 导入load_iris函数,用于加载鸢尾花数据集
from sklearn.datasets import load_iris
# 导入训练测试集划分函数
from sklearn.model_selection import train_test_split

# 导入支持向量机分类器
from sklearn.svm import SVC
# 导入决策树分类器
from sklearn.tree import DecisionTreeClassifier
# 导入K-近邻分类器
from sklearn.neighbors import KNeighborsClassifier
# 导入逻辑回归模型
from sklearn.linear_model import LogisticRegression

# 导入准确率评估函数
from sklearn.metrics import accuracy_score

# Load the iris dataset.
iris = load_iris()
# X: iris.data holds the features of every sample (sepal length, sepal width,
#    petal length, petal width).
# y: iris.target holds the matching class labels (0, 1, 2 stand for the three
#    iris species).
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing. stratify=y keeps the class
# distribution of both parts the same as in the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 2: initialise the classifiers

# Decision tree (random_state=42 fixes the random seed so results repeat)
tree = DecisionTreeClassifier(random_state=42)
# Support vector machine (kernel='linear' selects the linear kernel)
svm = SVC(kernel='linear')
# K-nearest neighbours (n_neighbors=3 votes among the 3 closest samples)
knn = KNeighborsClassifier(n_neighbors=3)
# Logistic regression (max_iter=200 raises the iteration cap to help the
# solver converge)
lr_reg = LogisticRegression(max_iter=200)


# Step 3: map a display name to each classifier; insertion order is the
# order in which the models are later trained and reported.
models = dict([
    ('Logistic Regression', lr_reg),
    ('K-Nearest Neighbors', knn),
    ('Support Vector Machine', svm),
    ('Decision Tree', tree),
])

# Step 4: train and evaluate every model in turn
for name, model in models.items():
    # Fit on the training set, then predict the test set; fit() returns the
    # estimator itself, so the two calls can be chained.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    # Fraction of test samples whose label was predicted correctly
    accuracy = accuracy_score(y_test, y_pred)
    # Report the model name with its accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")
    # Logistic Regression Accuracy: 0.9667
    # K-Nearest Neighbors Accuracy: 1.0000
    # Support Vector Machine Accuracy: 1.0000
    # Decision Tree Accuracy: 0.9333

# Step 5: classify one new sample with the trained decision tree
# The sample is a flower with sepal length 5.1cm, sepal width 3.5cm,
# petal length 1.4cm and petal width 0.2cm.
# Note: the input must be a 2D array even for a single sample.
new_flower = [[5.1, 3.5, 1.4, 0.2]]
# predict() returns an array of class labels (0, 1, 2), one per input row
predicted_class = tree.predict(new_flower)
# Map the label array to class names and take the name of the only sample
class_name = iris.target_names[predicted_class][0]

# Report the prediction
print(f"\n新样本{new_flower}被决策树预测为 ：{class_name}（类别为{predicted_class[0]}）")
# 新样本[[5.1, 3.5, 1.4, 0.2]]被决策树预测为 ：setosa（类别为0）

4. 模型评估陷阱

python 复制代码

# 导入生成模拟分类数据集的函数
from sklearn.datasets import make_classification
# 导入划分数据集的函数
from sklearn.model_selection import train_test_split
# 导入逻辑回归模型
from sklearn.linear_model import LogisticRegression
# 导入评估模型的函数(混淆矩阵、分类报告、准确率)
from sklearn.metrics import confusion_matrix,classification_report, accuracy_score

# Generate a synthetic, heavily imbalanced binary classification dataset
X, y = make_classification(
    n_samples=1000,          # 1000 samples in total
    n_classes=2,             # binary problem
    weights=[0.95, 0.05],    # class 0 gets 95% of the samples, class 1 gets 5%
    n_clusters_per_class=1,  # one cluster per class
    n_features=10,           # 10 features per sample
    n_informative=5,         # only 5 of them carry signal
    n_redundant=0,           # no redundant features (linear combinations of the informative ones)
    random_state=42,         # fixed seed so the data is reproducible
)

# 70% training / 30% test; stratify=y keeps the class ratio identical in
# both parts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Create the logistic regression model and fit it on the training set
# (fit() returns the estimator itself)
model = LogisticRegression().fit(X_train, y_train)
# Predict the labels of the test set
y_pred = model.predict(X_test)

# Accuracy of the model.
# Caution: on an imbalanced dataset accuracy can mislead, because predicting
# the majority class for every sample already scores high.
acc = accuracy_score(y_test, y_pred)
print(f"{acc:.4f}")
# 0.9800

# Confusion matrix: rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred)
print(cm)
# [[282   0]
#  [  6  12]]

# Per-class precision / recall / F1 report
cr = classification_report(
    y_test,
    y_pred,
    target_names=['类别0','类别1'],
)
print(cr)
#                  precision    recall  f1-score   support
#
#          类别0       0.98      1.00      0.99       282
#          类别1       1.00      0.67      0.80        18
#
#     accuracy                            0.98       300
#    macro avg        0.99      0.83      0.89       300
# weighted avg        0.98      0.98      0.98       300