提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档
文章目录
前言
决策树
一、决策树
单棵决策树 DecisionTreeRegressor
python
# 决策树回归案例 - 糖尿病病情进展预测
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Load the diabetes regression dataset bundled with scikit-learn and wrap the
# feature matrix in a DataFrame so the columns keep their names.
diabetes = load_diabetes()
X = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
y = diabetes.target

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and fit the decision tree.
dt_reg = DecisionTreeRegressor(
    max_depth=3,  # cap the tree depth to limit overfitting
    min_samples_split=10,  # a node needs at least this many samples to split
    random_state=42,
)
dt_reg.fit(X_train, y_train)

# Predict on the held-out split and report the scores. The original printed
# only the header and never used y_pred or the imported metric functions.
y_pred = dt_reg.predict(X_test)
print("决策树回归结果:")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R2: {r2_score(y_test, y_pred):.2f}")
随机森林RandomForestRegressor
python
# 随机森林回归案例 - 空气质量预测
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
# Download the air-quality dataset from OpenML (requires network access) and
# drop every row that contains a missing value.
# NOTE(review): assumes an OpenML dataset named 'pm25' (version 1) exists and
# that its frame has a column literally named 'target' -- confirm before use.
data = fetch_openml(name='pm25', version=1, as_frame=True)
df = data.frame.dropna()
X = df.drop(columns=['target'])
y = df['target']
# NOTE(review): RandomForestRegressor needs numeric features; verify that X has
# no string/categorical columns, or encode them before fitting.

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build and fit the random forest.
rf_reg = RandomForestRegressor(
    n_estimators=200,  # number of trees in the forest
    max_depth=10,  # maximum depth of each individual tree
    min_samples_leaf=5,  # minimum number of samples required at a leaf
    random_state=42,
    n_jobs=-1,  # use every available CPU core
)
rf_reg.fit(X_train, y_train)

# Predict on the held-out split and report the scores. The original printed
# only the header and never used y_pred or the imported metric functions.
y_pred = rf_reg.predict(X_test)
print("\n随机森林回归结果:")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R2: {r2_score(y_test, y_pred):.2f}")
XGBRegressor(一般用于房价预测)
python
# XGBoost回归案例 - 波士顿房价预测
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
# Download the Boston housing dataset from OpenML (requires network access).
# 'MEDV' is the regression target; every other column is a feature.
data = fetch_openml(name='boston', version=1, as_frame=True)
df = data.frame
X = df.drop(columns=['MEDV'])
y = df['MEDV']

# Integer-encode every non-numeric feature column. 'category' is selected in
# addition to 'object' because fetch_openml(as_frame=True) can deliver nominal
# features as pandas categoricals, which XGBoost rejects unless
# enable_categorical is turned on.
for col in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Hold out 20% of the samples; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the XGBoost model. early_stopping_rounds is set on the estimator:
# passing it to fit() was deprecated in XGBoost 1.6 and removed in 2.0.
xgb_reg = XGBRegressor(
    n_estimators=500,  # upper bound on the number of boosting rounds
    learning_rate=0.05,  # shrinkage applied to each boosting step
    max_depth=6,  # maximum depth of each tree
    subsample=0.8,  # fraction of rows sampled per tree
    colsample_bytree=0.8,  # fraction of features sampled per tree
    random_state=42,
    n_jobs=-1,  # use every available CPU core
    early_stopping_rounds=20,  # stop once the eval metric stalls for 20 rounds
)
# NOTE: early stopping is monitored on the test split itself, so the scores
# printed below are optimistic; use a separate validation split for an
# unbiased estimate.
xgb_reg.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=False,
)

# Predict on the held-out split and report the scores. The original printed
# only the header and never used y_pred or the imported metric functions.
y_pred = xgb_reg.predict(X_test)
print("\nXGBoost回归结果:")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"R2: {r2_score(y_test, y_pred):.2f}")
总结
黑箱:一般是指由于利益考量而不公开内部细节,或者参数多达上亿个、人类难以直接理解的模型,比如XGBRegressor。