All of XGBoost's parameters (quoted from the scikit-learn wrapper docstrings):
```python
"""Scikit-Learn Wrapper interface for XGBoost."""
__estimator_doc = f"""
n_estimators : {Optional[int]}
Number of gradient boosted trees. Equivalent to number of boosting
rounds.
"""
__model_doc = f"""
max_depth : {Optional[int]}
Maximum tree depth for base learners.
max_leaves : {Optional[int]}
Maximum number of leaves; 0 indicates no limit.
max_bin : {Optional[int]}
If using histogram-based algorithm, maximum number of bins per feature
grow_policy : {Optional[str]}
Tree growing policy.
- depthwise: Favors splitting at nodes closest to the node,
- lossguide: Favors splitting at nodes with highest loss change.
learning_rate : {Optional[float]}
Boosting learning rate (xgb's "eta")
verbosity : {Optional[int]}
The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
objective : {SklObjective}
Specify the learning task and the corresponding learning objective or a custom
objective function to be used.
For custom objective, see :doc:`/tutorials/custom_metric_obj` and
:ref:`custom-obj-metric` for more information, along with the end note for
function signatures.
booster: {Optional[str]}
Specify which booster to use: ``gbtree``, ``gblinear`` or ``dart``.
tree_method : {Optional[str]}
Specify which tree method to use. Default to auto. If this parameter is set to
default, XGBoost will choose the most conservative option available. It's
recommended to study this option from the parameters document :doc:`tree method
</treemethod>`
n_jobs : {Optional[int]}
Number of parallel threads used to run xgboost. When used with other
Scikit-Learn algorithms like grid search, you may choose which algorithm to
parallelize and balance the threads. Creating thread contention will
significantly slow down both algorithms.
gamma : {Optional[float]}
(min_split_loss) Minimum loss reduction required to make a further partition on
a leaf node of the tree.
min_child_weight : {Optional[float]}
Minimum sum of instance weight(hessian) needed in a child.
max_delta_step : {Optional[float]}
Maximum delta step we allow each tree's weight estimation to be.
subsample : {Optional[float]}
Subsample ratio of the training instance.
sampling_method : {Optional[str]}
Sampling method. Used only by the GPU version of ``hist`` tree method.
- ``uniform``: Select random training instances uniformly.
- ``gradient_based``: Select random training instances with higher probability
when the gradient and hessian are larger. (cf. CatBoost)
colsample_bytree : {Optional[float]}
Subsample ratio of columns when constructing each tree.
colsample_bylevel : {Optional[float]}
Subsample ratio of columns for each level.
colsample_bynode : {Optional[float]}
Subsample ratio of columns for each split.
reg_alpha : {Optional[float]}
L1 regularization term on weights (xgb's alpha).
reg_lambda : {Optional[float]}
L2 regularization term on weights (xgb's lambda).
scale_pos_weight : {Optional[float]}
Balancing of positive and negative weights.
base_score : {Optional[float]}
The initial prediction score of all instances, global bias.
random_state : {Optional[Union[np.random.RandomState, np.random.Generator, int]]}
Random number seed.
.. note::
Using gblinear booster with shotgun updater is nondeterministic as
it uses Hogwild algorithm.
missing : float
Value in the data which needs to be present as a missing value. Default to
:py:data:`numpy.nan`.
num_parallel_tree: {Optional[int]}
Used for boosting random forest.
monotone_constraints : {Optional[Union[Dict[str, int], str]]}
Constraint of variable monotonicity. See :doc:`tutorial </tutorials/monotonic>`
for more information.
interaction_constraints : {Optional[Union[str, List[Tuple[str]]]]}
Constraints for interaction representing permitted interactions. The
constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2,
3, 4]]``, where each inner list is a group of indices of features that are
allowed to interact with each other. See :doc:`tutorial
</tutorials/feature_interaction_constraint>` for more information
importance_type: {Optional[str]}
The feature importance type for the feature_importances\\_ property:
* For tree model, it's either "gain", "weight", "cover", "total_gain" or
"total_cover".
* For linear model, only "weight" is defined and it's the normalized
coefficients without bias.
device : {Optional[str]}
.. versionadded:: 2.0.0
Device ordinal, available options are `cpu`, `cuda`, and `gpu`.
validate_parameters : {Optional[bool]}
Give warnings for unknown parameter.
enable_categorical : bool
See the same parameter of :py:class:`DMatrix` for details.
feature_types : {Optional[FeatureTypes]}
.. versionadded:: 1.7.0
Used for specifying feature types without constructing a dataframe. See
:py:class:`DMatrix` for details.
max_cat_to_onehot : {Optional[int]}
.. versionadded:: 1.6.0
.. note:: This parameter is experimental
A threshold for deciding whether XGBoost should use one-hot encoding based split
for categorical data. When number of categories is lesser than the threshold
then one-hot encoding is chosen, otherwise the categories will be partitioned
into children nodes. Also, `enable_categorical` needs to be set to have
categorical feature support. See :doc:`Categorical Data
</tutorials/categorical>` and :ref:`cat-param` for details.
max_cat_threshold : {Optional[int]}
.. versionadded:: 1.7.0
.. note:: This parameter is experimental
Maximum number of categories considered for each split. Used only by
partition-based splits for preventing over-fitting. Also, `enable_categorical`
needs to be set to have categorical feature support. See :doc:`Categorical Data
</tutorials/categorical>` and :ref:`cat-param` for details.
multi_strategy : {Optional[str]}
.. versionadded:: 2.0.0
.. note:: This parameter is working-in-progress.
The strategy used for training multi-target models, including multi-target
regression and multi-class classification. See :doc:`/tutorials/multioutput` for
more information.
- ``one_output_per_tree``: One model for each target.
- ``multi_output_tree``: Use multi-target trees.
eval_metric : {Optional[Union[str, List[str], Callable]]}
.. versionadded:: 1.6.0
Metric used for monitoring the training result and early stopping. It can be a
string or list of strings as names of predefined metric in XGBoost (See
doc/parameter.rst), one of the metrics in :py:mod:`sklearn.metrics`, or any
other user defined metric that looks like `sklearn.metrics`.
If custom objective is also provided, then custom metric should implement the
corresponding reverse link function.
Unlike the `scoring` parameter commonly used in scikit-learn, when a callable
object is provided, it's assumed to be a cost function and by default XGBoost
will minimize the result during early stopping.
For advanced usage on Early stopping like directly choosing to maximize instead
of minimize, see :py:obj:`xgboost.callback.EarlyStopping`.
See :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more
information.
.. code-block:: python
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_absolute_error
X, y = load_diabetes(return_X_y=True)
reg = xgb.XGBRegressor(
tree_method="hist",
eval_metric=mean_absolute_error,
)
reg.fit(X, y, eval_set=[(X, y)])
early_stopping_rounds : {Optional[int]}
.. versionadded:: 1.6.0
- Activates early stopping. Validation metric needs to improve at least once in
every **early_stopping_rounds** round(s) to continue training. Requires at
least one item in **eval_set** in :py:meth:`fit`.
- If early stopping occurs, the model will have two additional attributes:
:py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the
:py:meth:`predict` and :py:meth:`apply` methods to determine the optimal
number of trees during inference. If users want to access the full model
(including trees built after early stopping), they can specify the
`iteration_range` in these inference methods. In addition, other utilities
like model plotting can also use the entire model.
- If you prefer to discard the trees after `best_iteration`, consider using the
callback function :py:class:`xgboost.callback.EarlyStopping`.
- If there's more than one item in **eval_set**, the last entry will be used for
early stopping. If there's more than one metric in **eval_metric**, the last
metric will be used for early stopping.
callbacks : {Optional[List[TrainingCallback]]}
List of callback functions that are applied at end of each iteration.
It is possible to use predefined callbacks by using
:ref:`Callback API <callback_api>`.
.. note::
States in callback are not preserved during training, which means callback
objects can not be reused for multiple training sessions without
reinitialization or deepcopy.
.. code-block:: python
for params in parameters_grid:
# be sure to (re)initialize the callbacks before each run
callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]
reg = xgboost.XGBRegressor(**params, callbacks=callbacks)
reg.fit(X, y)
kwargs : {Optional[Any]}
Keyword arguments for XGBoost Booster object. Full documentation of parameters
can be found :doc:`here </parameter>`.
Attempting to set a parameter via the constructor args and \\*\\*kwargs
dict simultaneously will result in a TypeError.
.. note:: \\*\\*kwargs unsupported by scikit-learn
\\*\\*kwargs is unsupported by scikit-learn. We do not guarantee
that parameters passed via this argument will interact properly
with scikit-learn.
"""
__custom_obj_note = """
.. note:: Custom objective function
A custom objective function can be provided for the ``objective``
parameter. In this case, it should have the signature ``objective(y_true,
y_pred) -> [grad, hess]`` or ``objective(y_true, y_pred, *, sample_weight)
-> [grad, hess]``:
y_true: array_like of shape [n_samples]
The target values
y_pred: array_like of shape [n_samples]
The predicted values
sample_weight :
Optional sample weights.
grad: array_like of shape [n_samples]
The value of the gradient for each sample point.
hess: array_like of shape [n_samples]
The value of the second derivative for each sample point
"""
Annotated walkthrough with recommended settings (Claude-assisted notes)
A Complete Guide to XGBoost Parameters
I. Tree Parameters
1. n_estimators
Meaning: total number of trees trained (the number of boosting rounds)
Property | Value |
---|---|
Type | int |
Default | 100 |
Range | 1-10000 |
Suggested | 100-500 (up to 1000 if time allows) |
Recommended settings:
- Small datasets (<100K rows): 100-300
- Medium datasets (100K-1M rows): 300-600 (your case)
- Large datasets (>1M rows): 500-1000
Usage:

```python
model = xgb.XGBClassifier(n_estimators=300)
```
2. max_depth
Meaning: maximum depth of each tree
Property | Value |
---|---|
Type | int |
Default | 6 |
Range | 1-15 |
Suggested | 3-10 |
Recommended settings:
- Shallow trees (against overfitting): 3-5
- Medium trees (imbalanced data): 6-8 (your case) ✅
- Deep trees (high-dimensional data): 8-12
Notes:
- The deeper the tree, the more complex the model and the easier it overfits
- For imbalanced data, 6-8 is enough to capture non-linearity without going overboard
- 6-8 is also a good range for time-series data

```python
model = xgb.XGBClassifier(max_depth=8)
```
3. max_leaves
Meaning: maximum number of leaf nodes per tree
Property | Value |
---|---|
Type | int |
Default | 0 (no limit) |
Range | 0-unlimited |
Suggested | 0 (keep the default) or 1-1000 |
Notes:
- Only effective with grow_policy='lossguide'
- 0 = no limit; tree size is then governed by max_depth
- Setting it takes precedence over max_depth
- For imbalanced data, keeping the default (0) is recommended

```python
model = xgb.XGBClassifier(max_leaves=0)  # keep the default
```
4. max_bin
Meaning: maximum number of bins used to discretize each feature (histogram method)
Property | Value |
---|---|
Type | int |
Default | 256 |
Range | 2-255 (or higher) |
Suggested | 256 (or 128-512) |
Notes:
- Only used when tree_method='hist'
- Larger values give finer split candidates but use more memory
- 256 is usually good enough for time-series data

```python
model = xgb.XGBClassifier(max_bin=256)
```
5. grow_policy
Meaning: tree growth policy
Property | Value |
---|---|
Type | str |
Default | 'depthwise' |
Options | 'depthwise', 'lossguide' |
Notes:
- 'depthwise': grows level by level, producing balanced trees
- 'lossguide': grows at the node with the largest loss reduction, producing unbalanced trees
For your case:

```python
# Recommended (imbalanced data)
model = xgb.XGBClassifier(grow_policy='depthwise')
```
II. Learning Parameters
6. learning_rate (eta)
Meaning: shrinkage applied to each tree's contribution
Property | Value |
---|---|
Type | float |
Default | 0.3 |
Range | 0.0-1.0 |
Suggested | 0.01-0.3 |
Recommended settings:
- High learning rate: 0.1-0.3 (fast but less stable)
- Medium learning rate: 0.05-0.1 (balanced)
- Low learning rate: 0.01-0.05 (stable but slow)
Key point: learning_rate and n_estimators trade off against each other, so their product (the total amount of shrinkage) should stay roughly constant when you tune them.
- Your case: 0.1 × 300 = 30, roughly equivalent to 0.05 × 600 = 30

```python
# Recommended for your case
model = xgb.XGBClassifier(learning_rate=0.1, n_estimators=300)
```
7. verbosity
Meaning: logging verbosity
Property | Value |
---|---|
Type | int |
Default | 1 |
Options | 0-3 |
Notes:
- 0: silent
- 1: warnings
- 2: info messages
- 3: debug messages

```python
model = xgb.XGBClassifier(verbosity=0)  # recommended: no log output
```
III. Regularization
8. gamma
Meaning: minimum loss reduction required to make a split
Property | Value |
---|---|
Type | float |
Default | 0 |
Range | 0-∞ |
Suggested | 0-2 |
Notes:
- The larger gamma is, the stricter splitting becomes and the shallower the trees
- Helps prevent overfitting
Recommended settings:
- No regularization: 0
- Light regularization: 0.1-0.3
- Moderate regularization: 0.3-1.0 (your case)
- Strong regularization: 1.0-3.0

```python
model = xgb.XGBClassifier(gamma=0.3)
```
9. min_child_weight
Meaning: minimum sum of instance weight (hessian) required in a child node
Property | Value |
---|---|
Type | float |
Default | 1 |
Range | 0-∞ |
Suggested | 1-10 |
Notes:
- For classification, the weight is roughly the instance count
- Larger values produce shallower, more conservative trees
- Imbalanced data: use small values (2-3) so minority-class signal is kept
Recommended settings:
- Imbalanced data: 1-3 (your case) ✅
- Balanced data: 1-5
- Against overfitting: 5-10

```python
model = xgb.XGBClassifier(min_child_weight=3)
```
10. max_delta_step
Meaning: maximum delta step allowed for each tree's weight estimate
Property | Value |
---|---|
Type | float |
Default | 0 (no limit) |
Range | 0-∞ |
Suggested | 0, or 1-5 |
Notes:
- 0 = no limit
- Mainly useful for imbalanced binary classification
- Usually left at 0 for multi-class problems

```python
model = xgb.XGBClassifier(max_delta_step=0)  # keep the default
```
IV. Sampling Parameters
11. subsample
Meaning: fraction of training samples used per tree
Property | Value |
---|---|
Type | float |
Default | 1.0 |
Range | 0.0-1.0 |
Suggested | 0.5-1.0 |
Recommended settings:
- High sampling (keep more information): 0.8-1.0
- Medium sampling (guard against overfitting): 0.7-0.9 (your case)
- Low sampling (strong regularization): 0.5-0.7

```python
model = xgb.XGBClassifier(subsample=0.8)
```
12. sampling_method
Meaning: how rows are sampled
Property | Value |
---|---|
Type | str |
Default | 'uniform' |
Options | 'uniform', 'gradient_based' |
Notes:
- 'uniform': sample rows uniformly at random
- 'gradient_based': sample rows with probability weighted by their gradient/hessian magnitude (important rows are sampled preferentially)

```python
model = xgb.XGBClassifier(sampling_method='uniform')
```
13. colsample_bytree
Meaning: fraction of columns sampled when building each tree
Property | Value |
---|---|
Type | float |
Default | 1.0 |
Range | 0.0-1.0 |
Suggested | 0.5-1.0 |
Recommended settings:
- Many features (>100): 0.5-0.7
- Medium feature count (50-100): 0.7-0.9 (your case)
- Few features (<50): 0.8-1.0

```python
model = xgb.XGBClassifier(colsample_bytree=0.8)
```
14. colsample_bylevel
Meaning: fraction of columns sampled at each tree level
Property | Value |
---|---|
Type | float |
Default | 1.0 |
Range | 0.0-1.0 |
Suggested | 0.5-1.0 |
Notes:
- Applied at every depth level during splitting
- Usually left at 1.0 (no extra sampling)
- Or set it equal to colsample_bytree

```python
model = xgb.XGBClassifier(colsample_bylevel=1.0)
```
15. colsample_bynode
Meaning: fraction of columns sampled at each split (node)
Property | Value |
---|---|
Type | float |
Default | 1.0 |
Range | 0.0-1.0 |
Suggested | 0.5-1.0 |
Notes:
- Applied at every split; the finest-grained column sampling
- Adds diversity and helps prevent overfitting
- 0.7-0.9 is reasonable for time series

```python
model = xgb.XGBClassifier(colsample_bynode=1.0)  # or 0.8 for more diversity
```
V. Regularization Coefficients
16. reg_alpha
Meaning: L1 regularization coefficient
Property | Value |
---|---|
Type | float |
Default | 0 |
Range | 0-∞ |
Suggested | 0-2 |
Notes:
- L1 encourages sparsity (some feature weights shrink to 0)
- Works well as implicit feature selection
Recommended settings:
- No penalty: 0
- Light: 0.1-0.5
- Moderate: 0.5-1.0 (your case)
- Strong: 1.0-2.0

```python
model = xgb.XGBClassifier(reg_alpha=0.5)
```
17. reg_lambda
Meaning: L2 regularization coefficient
Property | Value |
---|---|
Type | float |
Default | 1 |
Range | 0-∞ |
Suggested | 0.5-2 |
Notes:
- L2 keeps weights from growing too large
- Smoother than L1
Recommended settings:
- Light: 0.5-1.0
- Moderate: 1.0-2.0 (your case)
- Strong: 2.0-5.0

```python
model = xgb.XGBClassifier(reg_lambda=1.0)
```
18. scale_pos_weight
Meaning: weight applied to positive-class samples (binary classification)
Property | Value |
---|---|
Type | float |
Default | 1 |
Range | >0 |
Suggested | sum(negative_cases)/sum(positive_cases) |
Notes:
- Only used for binary classification
- For multi-class problems, use sample_weight instead
- Value = ratio of negative to positive samples
For your case (a computation sketch follows below):

```python
# Not applicable; you use SMOTE plus sample weights instead
model = xgb.XGBClassifier(scale_pos_weight=1)
```
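For a binary problem, the ratio above can be computed directly from the label vector. A minimal sketch, assuming `y` is a 0/1 NumPy array (the toy labels are illustrative):

```python
import numpy as np
import xgboost as xgb

y = np.array([0, 0, 0, 0, 0, 0, 1, 1])        # toy imbalanced binary labels
spw = (y == 0).sum() / (y == 1).sum()          # negatives / positives = 3.0
clf = xgb.XGBClassifier(scale_pos_weight=spw)
```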
VI. Other Important Parameters
19. base_score
Meaning: initial prediction score (global bias)
Property | Value |
---|---|
Type | float |
Default | 0.5 |
Range | 0-1 |
Notes:
- Binary classification: 0.5 is usually fine
- Multi-class: handled automatically; leave it at 0.5

```python
model = xgb.XGBClassifier(base_score=0.5)
```
20. random_state
Meaning: random seed (reproducibility)
Property | Value |
---|---|
Type | int |
Default | 0 |
Range | 0 to 2³¹-1 |
Notes:
- Fixing this value makes results reproducible
- Recommended to always set it

```python
model = xgb.XGBClassifier(random_state=42)
```
21. n_jobs
Meaning: number of parallel threads
Property | Value |
---|---|
Type | int |
Default | 1 |
Range | -1 (all cores) or 1-N |
Notes:
- -1: use all CPU cores (recommended)
- 1: single core
- N: use N cores

```python
model = xgb.XGBClassifier(n_jobs=-1)  # recommended
```
VII. Model-Related Parameters
22. objective
Meaning: learning objective (loss function)
Property | Value |
---|---|
Type | str |
Default | None |
Common values | see the table below |
Common options:
Task | objective | Notes |
---|---|---|
Binary classification | 'binary:logistic' | outputs probabilities |
Binary classification | 'binary:logitraw' | outputs raw scores |
Multi-class | 'multi:softprob' | outputs probabilities (your case) ✅ |
Multi-class | 'multi:softmax' | outputs class labels |
Regression | 'reg:squarederror' | squared error |
Ranking | 'rank:ndcg' | NDCG ranking |

```python
model = xgb.XGBClassifier(objective='multi:softprob')
```
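To see the practical difference between the two multi-class objectives: with 'multi:softprob' the wrapper's `predict_proba` returns one probability column per class, while `predict` still returns labels. A small sketch on random toy data (the shapes are the point, not the accuracy):

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(90, 4)
y = np.repeat([0, 1, 2], 30)                   # three classes
clf = xgb.XGBClassifier(objective='multi:softprob', n_estimators=5).fit(X, y)
print(clf.predict_proba(X).shape)              # (90, 3): one column per class
print(clf.predict(X)[:5])                      # class labels
```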
23. booster
Meaning: which base learner to use
Property | Value |
---|---|
Type | str |
Default | 'gbtree' |
Options | 'gbtree', 'gblinear', 'dart' |
Notes:
- 'gbtree': gradient-boosted trees (the standard choice, recommended)
- 'gblinear': linear models
- 'dart': trees with dropout regularization

```python
model = xgb.XGBClassifier(booster='gbtree')
```
24. tree_method
Meaning: tree construction algorithm
Property | Value |
---|---|
Type | str |
Default | 'auto' |
Options | 'auto', 'exact', 'approx', 'hist', 'gpu_hist' |
Notes (in XGBoost 2.0+, 'gpu_hist' is deprecated; use tree_method='hist' with device='cuda'):
Method | Speed | Memory | Use case |
---|---|---|---|
'auto' | - | - | chosen automatically |
'exact' | slow | high | small datasets |
'approx' | fast | medium | medium datasets |
'hist' | very fast | low | large datasets (your case) ✅ |
'gpu_hist' | fastest | low | when a GPU is available |

```python
model = xgb.XGBClassifier(tree_method='hist')
```
25. eval_metric
Meaning: evaluation metric
Property | Value |
---|---|
Type | str or list |
Default | chosen automatically from the objective |
Common values | see the table below |
Common options:
Metric | Meaning | Task |
---|---|---|
'error' | classification error rate | classification |
'merror' | multi-class error rate | multi-class (your case) ✅ |
'mlogloss' | multi-class log loss | multi-class |
'logloss' | binary log loss | binary |
'auc' | AUC | binary |
'aucpr' | AUC-PR | binary |
'map' | mean average precision | ranking |
'ndcg' | NDCG | ranking |

```python
model = xgb.XGBClassifier(eval_metric='mlogloss')
```
VIII. Advanced Parameters
26. early_stopping_rounds
Meaning: number of rounds for early stopping
Property | Value |
---|---|
Type | int |
Default | None |
Suggested | 10-50 |
Notes:
- Training stops if the validation metric does not improve for N consecutive rounds
- Helps prevent overfitting
- Since XGBoost 1.6 this is a constructor parameter (passing it to fit() was removed in 2.0)

```python
# eval_set must still be supplied to fit()
model = xgb.XGBClassifier(early_stopping_rounds=20)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
```
27. callbacks
Meaning: training callbacks
Property | Value |
---|---|
Type | list |
Default | None |
Notes:
- Run custom logic during training
- e.g. printing progress, scheduling the learning rate, saving the best model
- In the scikit-learn wrapper, callbacks are passed to the constructor

```python
callbacks = [
    xgb.callback.EarlyStopping(rounds=20),
    xgb.callback.LearningRateScheduler(custom_rates),  # custom_rates: per-round learning rates
]
model = xgb.XGBClassifier(callbacks=callbacks)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
```
28. importance_type
Meaning: how feature importance is computed
Property | Value |
---|---|
Type | str |
Default | 'gain' |
Options | 'gain', 'weight', 'cover', 'total_gain', 'total_cover' |
Notes:
- 'gain': average gain of the splits that use the feature
- 'cover': number of samples affected by the feature's splits
- 'weight': number of times the feature is used to split

```python
model = xgb.XGBClassifier(importance_type='gain')
```
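Once the model is fitted, the chosen importance type is what `feature_importances_` reports. A minimal sketch on synthetic data (the dataset and variable names are illustrative):

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = (X[:, 0] + 0.1 * rng.normal(size=200) > 0).astype(int)

clf = xgb.XGBClassifier(n_estimators=20, importance_type='gain')
clf.fit(X, y)
print(clf.feature_importances_)                               # normalized 'gain' importances
print(clf.get_booster().get_score(importance_type='weight'))  # raw split counts per feature
```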
IX. Categorical Feature Parameters
29. enable_categorical
Meaning: whether native categorical-feature support is enabled
Property | Value |
---|---|
Type | bool |
Default | False |
Notes:
- XGBoost 1.5+ supports categorical features natively
- No one-hot encoding required

```python
model = xgb.XGBClassifier(enable_categorical=False)  # your data is all numeric
```
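When categorical features are present, the natural way to use this flag is with a pandas DataFrame whose categorical columns carry the `category` dtype. A minimal sketch on a toy frame (column names and values are assumptions for the example):

```python
import pandas as pd
import xgboost as xgb

X = pd.DataFrame({
    "amount": [1.0, 2.5, 0.3, 4.1, 2.2, 0.9],
    "city": pd.Categorical(["a", "b", "a", "c", "b", "c"]),  # category dtype, no one-hot needed
})
y = [0, 1, 0, 1, 1, 0]

clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, n_estimators=10)
clf.fit(X, y)
```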
30. max_cat_to_onehot
Meaning: use one-hot-style splits when a feature has fewer categories than this value
Property | Value |
---|---|
Type | int |
Default | 1 |
Range | 1-∞ |

```python
model = xgb.XGBClassifier(max_cat_to_onehot=1)
```
31. max_cat_threshold
Meaning: maximum number of categories considered per split for categorical features
Property | Value |
---|---|
Type | int |
Default | 64 |

```python
model = xgb.XGBClassifier(max_cat_threshold=64)
```
X. Multi-Output Parameters
32. multi_strategy
Meaning: strategy for training multi-target / multi-class models
Property | Value |
---|---|
Type | str |
Default | 'one_output_per_tree' |
Options | 'one_output_per_tree', 'multi_output_tree' |
Notes:
- 'one_output_per_tree': standard approach, one model per target (recommended)
- 'multi_output_tree': multi-target trees (experimental)

```python
model = xgb.XGBClassifier(multi_strategy='one_output_per_tree')
```
XI. Constraint Parameters
33. monotone_constraints
Meaning: monotonicity constraints
Property | Value |
---|---|
Type | dict or str |
Default | None |
Notes:
- Forces a monotonic relationship between a feature and the target
- 1: monotonically increasing, -1: monotonically decreasing, 0: unconstrained

```python
# feature 0 monotonically increasing, feature 1 unconstrained
model = xgb.XGBClassifier(
    monotone_constraints="(1,0)"
)
```
34. interaction_constraints
Meaning: interaction constraints
Property | Value |
---|---|
Type | list of lists |
Default | None |
Notes:
- Restricts which features are allowed to interact with each other

```python
# features 0 and 1 may interact; feature 2 stays on its own
model = xgb.XGBClassifier(
    interaction_constraints=[[0, 1], [2]]
)
```
Recommended Parameter Configurations
Configuration 1: baseline (fast to run)

```python
params = {
    'n_estimators': 200,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 1,
    'gamma': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'tree_method': 'hist',
    'random_state': 42,
    'n_jobs': -1
}
```
Configuration 2: recommended (your case) ✅

```python
params = {
    'n_estimators': 300,
    'max_depth': 8,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'colsample_bynode': 0.8,
    'min_child_weight': 3,
    'gamma': 0.3,
    'reg_alpha': 0.5,
    'reg_lambda': 1.0,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'tree_method': 'hist',
    'random_state': 42,
    'n_jobs': -1
}
```
Configuration 3: aggressive (best performance)

```python
params = {
    'n_estimators': 500,
    'max_depth': 10,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'colsample_bynode': 0.7,
    'min_child_weight': 2,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 0.5,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'tree_method': 'hist',
    'random_state': 42,
    'n_jobs': -1
}
```
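Any of the three dictionaries above can be unpacked straight into the estimator. A minimal sketch using whichever `params` dict was defined last; the synthetic three-class data and the added early_stopping_rounds value are assumptions for the example:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(42)
X = rng.normal(size=(1000, 10))
y = rng.integers(0, 3, size=1000)              # three classes, matching multi:softprob
X_train, X_val = X[:800], X[800:]
y_train, y_val = y[:800], y[800:]

model = xgb.XGBClassifier(**params, early_stopping_rounds=20)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
print(model.best_iteration, model.best_score)
```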
Quick Reference Table
Parameter | Type | Default | Recommended range | Your case |
---|---|---|---|---|
n_estimators | int | 100 | 200-600 | 300 ✅ |
max_depth | int | 6 | 6-10 | 8 ✅ |
learning_rate | float | 0.3 | 0.01-0.3 | 0.1 ✅ |
subsample | float | 1.0 | 0.7-1.0 | 0.8 ✅ |
colsample_bytree | float | 1.0 | 0.7-1.0 | 0.8 ✅ |
min_child_weight | float | 1 | 1-10 | 3 ✅ |
gamma | float | 0 | 0-2 | 0.3 ✅ |
reg_alpha | float | 0 | 0-2 | 0.5 ✅ |
reg_lambda | float | 1 | 0.5-2 | 1.0 ✅ |
objective | str | - | see table | 'multi:softprob' ✅ |
tree_method | str | 'auto' | 'hist' | 'hist' ✅ |
eval_metric | str | auto | 'mlogloss' | 'mlogloss' ✅ |
random_state | int | 0 | any | 42 ✅ |
n_jobs | int | 1 | -1 | -1 ✅ |
Final Key Points
A practical three-step approach to parameter tuning (a GridSearchCV sketch follows this list):
GridSearchCV - exhaustive search (small parameter spaces)
Bayesian optimization - smart search (large parameter spaces)
Manual tuning - fine-tune one parameter at a time
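A minimal GridSearchCV sketch for step 1, on synthetic data; the grid values and the `TimeSeriesSplit` choice are assumptions for the example (for time-series data an ordinary shuffled CV would leak future information):

```python
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 8))
y = rng.integers(0, 3, size=500)

grid = {
    "max_depth": [6, 8],
    "learning_rate": [0.05, 0.1],
    "min_child_weight": [1, 3],
}
search = GridSearchCV(
    xgb.XGBClassifier(n_estimators=100, tree_method="hist", objective="multi:softprob"),
    param_grid=grid,
    scoring="f1_macro",
    cv=TimeSeriesSplit(n_splits=3),   # respects temporal order
    n_jobs=-1,
)
search.fit(X, y)
print(search.best_params_, search.best_score_)
```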
Solutions to common problems
For the five typical issues you are most likely to hit:
Out of memory → use the hist method or reduce the number of trees
Overfitting → increase the regularization parameters
Underfitting → decrease the regularization parameters
Training too slow → use GPU training (gpu_hist, or device='cuda' in 2.0+) or raise the learning rate
Poor minority-class performance → decrease min_child_weight and gamma (this is the key one!)
Key takeaway
For your imbalanced data (60% vs 20% vs 20%), the easiest trap to fall into is over-regularizing:

```python
# ❌ Wrong: regularization too strong
'min_child_weight': 5   # too large, prunes away minority-class signal
'gamma': 0.5            # too large, blocks useful splits
'reg_alpha': 1.0        # too large, weakens the features
'reg_lambda': 2.0       # too large, over-constrains the model

# ✅ Better: moderate regularization
'min_child_weight': 3   # still captures the minority classes
'gamma': 0.3            # allows useful splits
'reg_alpha': 0.5        # moderate regularization
'reg_lambda': 1.0       # moderate regularization
```
This is why I keep stressing that smaller regularization values matter so much for your data!
The approach below is still rough; the code is being revised and optimized:

```python
"""
XGBoost时间序列不平衡多分类完整方案 - 修复版本
处理常见错误:内存溢出、采样参数错误、特征类型问题
"""
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
accuracy_score, precision_recall_fscore_support,
classification_report, confusion_matrix, f1_score,
balanced_accuracy_score, precision_recall_curve
)
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
warnings.filterwarnings('ignore')
# ============================================================================
# 1. Data loading and preprocessing
# ============================================================================
def load_and_prepare_data(csv_path, threshold=0.0015):
"""加载数据并生成标签"""
df = pd.read_csv(csv_path)
df['future_ret'] = (df['close'].shift(-1) - df['close']) / df['close']
df['label'] = 0
df.loc[df['future_ret'] > threshold, 'label'] = 1
df.loc[df['future_ret'] < -threshold, 'label'] = 2
    # Drop the last row (its future return is unknown) and rows with missing prices.
    df = df.dropna(subset=['future_ret', 'close'])
    print(f"✅ Label distribution of the raw data:")
    print(df['label'].value_counts())
    print(f"\n✅ Label distribution of the raw data (percent):")
    print(df['label'].value_counts(normalize=True) * 100)
return df
def extract_features(df):
"""特征工程"""
for window in [5, 10, 20]:
df[f'momentum_{window}'] = (df['close'] - df['close'].shift(window)) / df['close'].shift(window)
df[f'roc_{window}'] = df['close'].pct_change(window)
for window in [5, 10, 20]:
df[f'volatility_{window}'] = df['close'].pct_change().rolling(window).std()
df[f'high_low_ratio_{window}'] = (
(df['high'] - df['low']).rolling(window).mean() / df['close']
)
df['volume_ma_ratio'] = df['volume'] / df['volume'].rolling(20).mean()
df['volume_trend'] = df['volume'].pct_change().rolling(10).mean()
for window in [20, 30]:
high_low_range = df['high'].rolling(window).max() - df['low'].rolling(window).min()
df[f'price_position_{window}'] = (
(df['close'] - df['low'].rolling(window).min()) / (high_low_range + 1e-10)
)
for s, f in [(5, 20), (10, 30)]:
df[f'ma_cross_{s}_{f}'] = (
df['close'].rolling(s).mean() - df['close'].rolling(f).mean()
) / df['close']
df['daily_return'] = df['close'].pct_change()
df['return_ma_5'] = df['daily_return'].rolling(5).mean()
df['return_std_10'] = df['daily_return'].rolling(10).std()
df = df.dropna()
features = [col for col in df.columns
if col not in ['datetime', 'symbol', 'future_ret', 'label',
'open', 'high', 'low', 'close']]
return df, features
# ============================================================================
# 2. Sample-weight computation
# ============================================================================
def calculate_sample_weights(y_train, method='extreme'):
"""计算样本权重"""
class_counts = Counter(y_train)
if method == 'extreme':
weights = {0: 1.0, 1: 3.0, 2: 3.0}
elif method == 'moderate':
weights = {0: 1.0, 1: 2.0, 2: 2.0}
else:
n_samples = len(y_train)
weights = {label: n_samples / (len(class_counts) * count)
for label, count in class_counts.items()}
sample_weights = np.array([weights[label] for label in y_train], dtype=np.float32)
return sample_weights, weights
# ============================================================================
# 3. Resampling strategies (key fixes)
# ============================================================================
class ResamplingStrategy:
"""重采样策略管理器"""
@staticmethod
    def smote_undersample(X_train, y_train, name="SMOTE + undersampling"):
        """SMOTE + undersampling strategy - fixed version."""
        print(f"\n{'='*70}")
        print(f"Applying: {name}")
        print(f"{'='*70}")
        print(f"Original training-set distribution: {Counter(y_train)}")
        try:
            # Step 1: SMOTE-oversample the minority classes.
            # Fix: use sensible target counts to avoid blowing up memory.
            n_class_0 = sum(y_train == 0)
            target_1 = int(n_class_0 * 0.45)  # target: 45% of the majority count
            target_2 = int(n_class_0 * 0.45)
            print(f"SMOTE targets: class 1={target_1}, class 2={target_2}")
            smote = SMOTE(
                k_neighbors=5,
                random_state=42,
                sampling_strategy={1: target_1, 2: target_2},
            )
            X_temp, y_temp = smote.fit_resample(X_train, y_train)
            print(f"After SMOTE: {Counter(y_temp)}")
            gc.collect()  # free memory
            # Step 2: undersample the majority class.
            n_total_after_smote = len(y_temp)
            # RandomUnderSampler cannot increase a class, so cap the target at the current count.
            target_0 = min(int(n_class_0), int(n_total_after_smote * 0.55))  # aim for ~55% of the total
            print(f"Undersampling target: class 0={target_0}")
            under = RandomUnderSampler(
                sampling_strategy={0: target_0},
                random_state=42
            )
            X_resampled, y_resampled = under.fit_resample(X_temp, y_temp)
            print(f"After undersampling: {Counter(y_resampled)}")
            # A Counter cannot be divided by an int; report class fractions via a dict instead.
            dist = {k: v / len(y_resampled) for k, v in Counter(y_resampled).items()}
            print(f"Post-resampling class fractions: {dist}")
            del X_temp, y_temp  # drop temporaries to free memory
            gc.collect()
            return X_resampled, y_resampled, name
        except Exception as e:
            print(f"❌ SMOTE + undersampling failed: {str(e)}")
            return X_train, y_train, name
@staticmethod
    def smote_oversample(X_train, y_train, name="SMOTE + oversampling"):
        """SMOTE + oversampling strategy - fixed version."""
        print(f"\n{'='*70}")
        print(f"Applying: {name}")
        print(f"{'='*70}")
        print(f"Original training-set distribution: {Counter(y_train)}")
        try:
            # Step 1: SMOTE-oversample the minority classes towards balance.
            n_class_0 = sum(y_train == 0)
            target_1 = int(n_class_0 * 0.6)  # up to 60% of the majority count
            target_2 = int(n_class_0 * 0.6)
            print(f"SMOTE targets: class 1={target_1}, class 2={target_2}")
            smote = SMOTE(
                k_neighbors=5,
                random_state=42,
                sampling_strategy={1: target_1, 2: target_2},
            )
            X_temp, y_temp = smote.fit_resample(X_train, y_train)
            print(f"After SMOTE: {Counter(y_temp)}")
            gc.collect()
            # Step 2: RandomOverSampler to push class 0 towards full balance (~33%).
            n_total = len(y_temp)
            # RandomOverSampler cannot shrink a class, so never target below the existing count.
            target_0_ros = max(int(n_class_0), int(n_total / 3))
            print(f"Oversampling target: class 0={target_0_ros}")
            ros = RandomOverSampler(
                sampling_strategy={0: target_0_ros},
                random_state=42
            )
            X_resampled, y_resampled = ros.fit_resample(X_temp, y_temp)
            print(f"After oversampling: {Counter(y_resampled)}")
            dist = {k: v / len(y_resampled) for k, v in Counter(y_resampled).items()}
            print(f"Post-resampling class fractions: {dist}")
            del X_temp, y_temp
            gc.collect()
            return X_resampled, y_resampled, name
        except Exception as e:
            print(f"❌ SMOTE + oversampling failed: {str(e)}")
            return X_train, y_train, name
@staticmethod
    def no_resampling(X_train, y_train, name="No resampling (class weights only)"):
        """Skip resampling entirely."""
        print(f"\n{'='*70}")
        print(f"Applying: {name}")
        print(f"{'='*70}")
        print(f"Training-set distribution: {Counter(y_train)}")
        return X_train, y_train, name
# ============================================================================
# 4. XGBoost model
# ============================================================================
class XGBoostImbalancedModel:
"""针对不平衡数据的XGBoost模型"""
def __init__(self, weight_method='extreme'):
self.model = None
self.weight_method = weight_method
self.scaler = StandardScaler()
self.thresholds = {1: 0.33, 2: 0.33}
def get_params(self):
return {
            'n_estimators': 300,  # fewer trees
'max_depth': 8,
'learning_rate': 0.05,
'subsample': 0.8,
'colsample_bytree': 0.8,
'min_child_weight': 5,
'gamma': 0.5,
'reg_alpha': 1.0,
'reg_lambda': 2.0,
'objective': 'multi:softprob',
'eval_metric': 'mlogloss',
'random_state': 42,
'n_jobs': -1,
            'tree_method': 'hist'  # histogram algorithm for speed
}
def fit(self, X_train, y_train):
"""训练模型"""
print(f"\n正在训练XGBoost...")
sample_weights, weights_dict = calculate_sample_weights(y_train, self.weight_method)
print(f"样本权重: {weights_dict}")
try:
self.model = xgb.XGBClassifier(**self.get_params())
self.model.fit(
X_train, y_train,
sample_weight=sample_weights,
verbose=False
)
print("✅ 模型训练完成")
except Exception as e:
print(f"❌ 模型训练出错: {str(e)}")
raise
def predict_proba(self, X_test):
"""概率预测"""
return self.model.predict_proba(X_test)
def predict(self, X_test, use_optimized_threshold=False):
"""预测"""
if use_optimized_threshold:
y_proba = self.predict_proba(X_test)
return self._apply_thresholds(y_proba)
else:
return self.model.predict(X_test)
def _apply_thresholds(self, y_proba):
"""应用优化的阈值"""
predictions = []
for proba in y_proba:
if proba[1] >= self.thresholds[1]:
predictions.append(1)
elif proba[2] >= self.thresholds[2]:
predictions.append(2)
else:
predictions.append(0)
return np.array(predictions)
def optimize_thresholds(self, X_val, y_val):
"""在原始验证集上优化阈值"""
y_proba = self.predict_proba(X_val)
for target_class in [1, 2]:
y_binary = (y_val == target_class).astype(int)
scores = y_proba[:, target_class]
precision, recall, thresholds = precision_recall_curve(y_binary, scores)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.33
self.thresholds[target_class] = best_threshold
print(f"类别{target_class}优化阈值: {best_threshold:.4f}")
# ============================================================================
# 5. Model evaluation
# ============================================================================
def evaluate_model(y_true, y_pred, model_name, display=True):
"""评估模型"""
accuracy = accuracy_score(y_true, y_pred)
balanced_acc = balanced_accuracy_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average='macro')
f1_weighted = f1_score(y_true, y_pred, average='weighted')
precision, recall, f1, support = precision_recall_fscore_support(
y_true, y_pred, average=None
)
metrics = {
'model_name': model_name,
'accuracy': accuracy,
'balanced_accuracy': balanced_acc,
'f1_macro': f1_macro,
'f1_weighted': f1_weighted,
'f1_0': f1[0],
'f1_1': f1[1],
'f1_2': f1[2],
'precision_0': precision[0],
'precision_1': precision[1],
'precision_2': precision[2],
'recall_0': recall[0],
'recall_1': recall[1],
'recall_2': recall[2],
'confusion_matrix': confusion_matrix(y_true, y_pred)
}
if display:
print(f"\n{'='*70}")
print(f"模型: {model_name}")
print(f"{'='*70}")
print(f"准确率: {accuracy:.4f}")
print(f"平衡准确率: {balanced_acc:.4f}")
print(f"F1-宏平均: {f1_macro:.4f}")
print(f"F1-加权: {f1_weighted:.4f}")
print(f"\n{'类别':<8} {'精确率':<12} {'召回率':<12} {'F1分数':<12}")
print("-" * 50)
for i in range(3):
print(f"{i:<8} {precision[i]:<12.4f} {recall[i]:<12.4f} {f1[i]:<12.4f}")
print(f"\n混淆矩阵:\n{metrics['confusion_matrix']}")
return metrics
# ============================================================================
# 6. Comparative analysis - fixed version
# ============================================================================
def compare_strategies(X_train, y_train, X_val, y_val, X_test, y_test, features):
"""对比三种策略"""
results_list = []
strategies = [
        ('SMOTE + undersampling', ResamplingStrategy.smote_undersample),
        ('SMOTE + oversampling', ResamplingStrategy.smote_oversample),
        ('No resampling', ResamplingStrategy.no_resampling)
]
for strategy_name, strategy_func in strategies:
print(f"\n\n{'#'*70}")
print(f"正在处理策略: {strategy_name}")
print(f"{'#'*70}")
try:
            # Resample (training set only)
X_train_resampled, y_train_resampled, display_name = strategy_func(
X_train, y_train
)
            # Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
print(f"\n特征已缩放")
print(f"训练数据: {X_train_scaled.shape}")
print(f"验证数据: {X_val_scaled.shape}")
print(f"测试数据: {X_test_scaled.shape}")
# 训练模型
model = XGBoostImbalancedModel(weight_method='extreme')
model.fit(X_train_scaled, y_train_resampled)
# 优化阈值
print(f"\n优化阈值(基于原始验证集)...")
model.optimize_thresholds(X_val_scaled, y_val)
# 预测(在原始未采样的测试集上)
print(f"\n在原始未采样的测试集上进行预测和评估...")
# 默认阈值
y_pred_default = model.predict(X_test_scaled, use_optimized_threshold=False)
metrics_default = evaluate_model(
y_test, y_pred_default,
f"{display_name} (默认阈值)"
)
results_list.append(metrics_default)
            # Optimized thresholds
            y_pred_optimized = model.predict(X_test_scaled, use_optimized_threshold=True)
            metrics_optimized = evaluate_model(
                y_test, y_pred_optimized,
                f"{display_name} (optimized thresholds)"
            )
            results_list.append(metrics_optimized)
            gc.collect()  # free memory
        except Exception as e:
            print(f"❌ Strategy {strategy_name} failed: {str(e)}")
continue
return results_list
# ============================================================================
# 7. Visual comparison
# ============================================================================
def visualize_comparison(results_list):
"""可视化对比"""
df_results = pd.DataFrame(results_list)
print(f"\n{'='*70}")
print("📊 详细对比")
print(f"{'='*70}")
print(df_results[['model_name', 'precision_1', 'recall_1', 'f1_1',
'precision_2', 'recall_2', 'f1_2']].to_string(index=False))
    # Plot the comparison
    fig, axes = plt.subplots(2, 3, figsize=(16, 10))
    fig.suptitle('SMOTE + undersampling vs SMOTE + oversampling vs no resampling', fontsize=14, fontweight='bold')
    strategy_names = ['SMOTE + undersampling', 'SMOTE + oversampling', 'No resampling']
    x_pos = np.arange(len(strategy_names))
    # Class-1 metrics
for col_idx, (col_name, ax_idx) in enumerate([
('precision_1', (0, 0)), ('recall_1', (0, 1)), ('f1_1', (0, 2))
]):
values = []
for s in strategy_names:
            mask = df_results['model_name'].str.contains(s, regex=False)
values.append(df_results[mask][col_name].mean())
ax = axes[ax_idx[0], ax_idx[1]]
ax.bar(x_pos, values, color=['orange', 'green', 'blue'])
ax.set_ylabel(col_name.replace('_', ' ').title(), fontweight='bold')
        ax.set_title(f'Class 1 - {col_name.split("_")[0].title()}', fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(strategy_names, rotation=15, ha='right')
ax.set_ylim([0, 1])
for i, v in enumerate(values):
ax.text(i, v+0.02, f'{v:.3f}', ha='center', fontweight='bold')
    # Class-2 metrics
for col_idx, (col_name, ax_idx) in enumerate([
('precision_2', (1, 0)), ('recall_2', (1, 1)), ('f1_2', (1, 2))
]):
values = []
for s in strategy_names:
            mask = df_results['model_name'].str.contains(s, regex=False)
values.append(df_results[mask][col_name].mean())
ax = axes[ax_idx[0], ax_idx[1]]
ax.bar(x_pos, values, color=['orange', 'green', 'blue'])
ax.set_ylabel(col_name.replace('_', ' ').title(), fontweight='bold')
        ax.set_title(f'Class 2 - {col_name.split("_")[0].title()}', fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(strategy_names, rotation=15, ha='right')
ax.set_ylim([0, 1])
for i, v in enumerate(values):
ax.text(i, v+0.02, f'{v:.3f}', ha='center', fontweight='bold')
plt.tight_layout()
plt.savefig('resampling_comparison.png', dpi=150, bbox_inches='tight')
print("\n✅ 对比图已保存: resampling_comparison.png")
return df_results
# ============================================================================
# 8. Main program
# ============================================================================
def main():
print("="*70)
print("XGBoost时间序列不平衡分类 - 修复版本")
print("="*70)
    # 1. Load the data
    print("\nStep 1: load the data")
print("="*70)
df = load_and_prepare_data('../btc2019.csv', threshold=0.0015)
df, features = extract_features(df)
print(f"\n总样本数: {len(df)}")
print(f"特征数: {len(features)}")
# 2. 时间序列划分
print("\n第2步: 时间序列划分")
print("="*70)
X = df[features].values.astype(np.float32) # 转为float32节省内存
y = df['label'].values.astype(np.int8) # 转为int8节省内存
train_idx = int(len(X) * 0.8)
val_idx = int(len(X) * 0.9)
X_train, X_val, X_test = X[:train_idx], X[train_idx:val_idx], X[val_idx:]
y_train, y_val, y_test = y[:train_idx], y[train_idx:val_idx], y[val_idx:]
print(f"原始训练集: {len(X_train)} (分布: {Counter(y_train)})")
print(f"原始验证集: {len(X_val)} (分布: {Counter(y_val)})")
print(f"原始测试集: {len(X_test)} (分布: {Counter(y_test)})")
    # 3. Compare the three strategies
    print("\nStep 3: compare the three resampling strategies")
    print("="*70)
results_list = compare_strategies(
X_train, y_train,
X_val, y_val,
X_test, y_test,
features
)
    # 4. Visual comparison
    print("\nStep 4: visual comparison")
    print("="*70)
df_results = visualize_comparison(results_list)
    # 5. Summary
    print("\nStep 5: summary")
    print("="*70)
df_results.to_csv('resampling_comparison_results.csv', index=False)
print("\n✅ 详细结果已保存: resampling_comparison_results.csv")
print("\n" + "="*70)
print("✅ 完成!查看 resampling_comparison_results.csv 获取完整结果")
print("="*70)
if __name__ == '__main__':
    main()
```