Kaggle-Predict Calorie Expenditure-(回归+xgb+cat+lgb+模型融合)

Predict Calorie Expenditure

题意:

给出每个人的基本信息,预测运动后的卡路里消耗值。

数据处理:

1.构造出人体机能、运动相关的特征值。

2.对所有特征值重新进行两两组合(相乘),注意维度爆炸

3.对连续特征进行分箱,转换为离散特征

建立模型:

1.xgb模型,lgb模型,cat模型

2.使用stack堆叠融合,使用3折交叉验证

3.对xgb、lgb、cat进行K折交叉验证,最终和stack进行结果融合。

代码:
python 复制代码
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge

def init():
    """Set up process-wide logging, warning and pandas display settings.

    Sets the ``TF_CPP_MIN_LOG_LEVEL`` environment variable to ``'3'``,
    silences Python warnings, and raises four pandas display limits to 1000
    so that wide/long frames are printed without truncation.
    """
    # Only emit error-level logs.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    # Suppress warning output.
    warnings.simplefilter('ignore')

    display_options = {
        'display.width': 1000,
        'display.max_colwidth': 1000,
        'display.max_rows': 1000,
        'display.max_columns': 1000,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)


def show_dataframe(df):
    """Print a quick exploratory summary of ``df`` to stdout.

    Sections printed, each followed by a 100-dash separator: column dtypes,
    the first 10 rows, ``describe()`` statistics, the number of duplicated
    rows, the per-column missing-value counts, and the ``info()`` report.

    Args:
        df: The pandas DataFrame to summarise.
    """
    import io  # local import so the function does not rely on file-level imports

    separator = "\n" + "-" * 100

    # DataFrame.info() writes its report to a buffer and returns None, so
    # str(df.info()) would print the literal "None". Capture the text instead.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    info_text = info_buffer.getvalue().rstrip("\n")

    print("查看特征值和特征值类型\n" + str(df.dtypes) + separator)
    # The heading announces 10 rows, so request 10 explicitly (head() defaults to 5).
    print("查看前10行信息\n" + str(df.head(10)) + separator)
    print("查看每个特征值的各种数据统计信息\n" + str(df.describe()) + separator)
    print("输出重复行的个数\n" + str(df.duplicated().sum()) + separator)
    print("查看每列的缺失值个数\n" + str(df.isnull().sum()) + separator)
    print("查看缺失值的具体信息\n" + info_text + separator)
    # To inspect the value distribution of a single column X, additionally
    # print df['X'].value_counts().


def show_relation(data, colx, coly):
    """Plot the relationship between one feature and the target column.

    A box plot is drawn when the feature column has dtype ``object`` or
    ``category``, or has fewer than 20 distinct values; otherwise a scatter
    plot is drawn. The figure is shown immediately.

    Args:
        data: DataFrame holding both columns.
        colx: Name of the feature column (x axis).
        coly: Name of the target column (y axis).
    """
    feature = data[colx]
    looks_discrete = (
        feature.dtype == 'object'
        or feature.dtype == 'category'
        or len(feature.unique()) < 20
    )

    if not looks_discrete:
        plt.scatter(feature, data[coly])
    else:
        seaborn.boxplot(x=colx, y=coly, data=data)

    plt.xlabel(colx)
    plt.ylabel(coly)
    plt.show()

def rmsle_scorer(y_true, y_pred):
    """Return the negated root mean squared logarithmic error (RMSLE).

    Custom scoring function: the value is negated because GridSearchCV
    maximises its score, so a smaller error must map to a larger score.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        ``-sqrt(mean((log(y_pred + 1) - log(y_true + 1)) ** 2))`` computed
        after clamping both inputs to a minimum of ``1e-15``.
    """
    floor = 1e-15  # tiny positive lower bound (guards against taking the log of 0)
    clipped_true = np.clip(y_true, floor, None)
    clipped_pred = np.clip(y_pred, floor, None)

    residual = np.log(clipped_pred + 1) - np.log(clipped_true + 1)
    return -np.sqrt(np.mean(np.square(residual)))

if __name__ == '__main__':
    # End-to-end pipeline: load data -> feature engineering -> K-fold training
    # of CatBoost / XGBoost / LightGBM -> stacking ensemble -> blended
    # submission file. The target is modelled in log1p space and mapped back
    # with expm1 before blending.
    init()

    df_train = pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv')
    df_test = pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv')

    # Optional EDA: plot every column against the target.
    # for col in df_train.columns:
    #     show_relation(df_train, col, 'Calories')

    # ------------------------------------------------------------------
    # Feature engineering (train and test rows are processed together so
    # both receive identical transformations).
    # ------------------------------------------------------------------
    df_all = pd.concat([df_train.drop(['id', 'Calories'], axis=1), df_test.drop(['id'], axis=1)], axis=0)

    df_all['Sex'] = df_all['Sex'].map({'male': 0, 'female': 1})
    # After concat the index contains duplicates; reset it so the label-based
    # .loc assignments below align row by row.
    df_all = df_all.reset_index(drop=True)

    # Body mass index. NOTE(review): the /100 assumes Height is in
    # centimetres — confirm against the dataset description.
    df_all['BMI'] = df_all['Weight'] / (df_all['Height'] / 100) ** 2

    # Basal metabolic rate via the Harris-Benedict formula (separate
    # coefficients per sex). Initialise as float so the float results below
    # do not have to be written into an integer column.
    df_all['BMR'] = 0.0
    df_all.loc[df_all['Sex'] == 0, 'BMR'] = 88.362 + (13.397 * df_all['Weight']) + (4.799 * df_all['Height']) - (5.677 * df_all['Age'])
    df_all.loc[df_all['Sex'] == 1, 'BMR'] = 447.593 + (9.247 * df_all['Weight']) + (3.098 * df_all['Height']) - (4.330 * df_all['Age'])

    # Standardising the numeric features was tried and left disabled:
    # numeric_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']
    # scaler = StandardScaler()
    # df_all[numeric_features] = scaler.fit_transform(df_all[numeric_features])

    # Exercise-intensity features.
    df_all['Max_HR'] = 220 - df_all['Age']  # estimated maximum heart rate
    df_all['HR_Reserve_Ratio'] = df_all['Heart_Rate'] / df_all['Max_HR']

    # Interaction features.
    df_all['Weight_Duration'] = df_all['Weight'] * df_all['Duration']
    df_all['Sex_Weight'] = df_all['Sex'] * df_all['Weight']

    # Workload-style feature.
    df_all['workload'] = df_all['Weight'] * df_all['Duration'] * df_all['Heart_Rate'] / 1000

    # Physiological interaction term.
    df_all['age_heart_ratio'] = df_all['Age'] / df_all['Heart_Rate']

    # The data has no timestamp; this is simply Duration rescaled by 1/60/24.
    df_all['hour_of_day'] = df_all['Duration'] / 60 / 24

    # Pairwise product of every pair of columns that exist at this point.
    # The column list is snapshotted first so the newly created product
    # columns are not themselves combined again (dimension explosion).
    base_cols = list(df_all.columns)
    for i, feature_1 in enumerate(base_cols):
        for feature_2 in base_cols[i + 1:]:
            df_all[f'{feature_1}_x_{feature_2}'] = df_all[feature_1] * df_all[feature_2]

    # Robust scaling was tried and left disabled:
    # scaler = RobustScaler()
    # df_all = scaler.fit_transform(df_all)

    # Discretise selected continuous columns into 10 equal-width bins.
    now_col = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp', 'BMI']
    for col in now_col:
        df_all[col + "_box"] = pd.cut(df_all[col], bins=10, labels=False, right=False)

    # Split the combined frame back into train / test parts.
    n_train = df_train.shape[0]
    X_train = df_all[:n_train]
    Y_train = np.log1p(df_train['Calories'])  # train on log1p(target)
    x_test = df_all[n_train:]

    # ------------------------------------------------------------------
    # Model definitions
    # ------------------------------------------------------------------
    # XGBoost
    model_xgb = XGBRegressor(
        random_state=42,
        n_estimators=8000,
        objective='reg:squarederror',
        eval_metric='rmse',
        device='cuda',
        learning_rate=0.05,
        max_depth=8,
        colsample_bytree=0.75,
        subsample=0.9,
        # reg_lambda = 1,
        # reg_alpha = 0.5,
        early_stopping_rounds=500,
    )
    # LightGBM
    model_lgb = lightgbm.LGBMRegressor(
        n_estimators=3000,  # more iterations, relying on early stopping
        learning_rate=0.03,  # smaller learning rate
        num_leaves=15,  # limit model complexity
        min_child_samples=25,  # larger minimum number of samples per leaf
        reg_alpha=0.1,  # L1 regularisation
        reg_lambda=0.1,  # L2 regularisation
        objective='regression_l1',  # MAE loss
        early_stopping_rounds=500,
    )
    # CatBoost
    model_cat = CatBoostRegressor(
        iterations=3500,
        learning_rate=0.02,
        depth=12,
        loss_function='RMSE',
        l2_leaf_reg=3,
        random_seed=42,
        eval_metric='RMSE',
        early_stopping_rounds=200,
        verbose=1000,
        task_type='GPU',
    )

    # Stacking ensemble. The base models reuse the hyper-parameters above but
    # without early stopping, because StackingRegressor fits them without an
    # eval_set.
    def params_without_early_stopping(model):
        """Return the model's parameters minus ``early_stopping_rounds``."""
        return {k: v for k, v in model.get_params().items() if k != 'early_stopping_rounds'}

    base_models = [
        ('xgb', XGBRegressor(**params_without_early_stopping(model_xgb))),
        ('lgb', LGBMRegressor(**params_without_early_stopping(model_lgb))),
        ('cat', CatBoostRegressor(**params_without_early_stopping(model_cat))),
    ]
    meta_model = RidgeCV()
    model_stack = StackingRegressor(
        estimators=base_models,
        final_estimator=meta_model,
        cv=3,  # 3-fold CV to generate the meta-features
        passthrough=False,  # the meta-model does not see the original features
        verbose=1
    )

    # ------------------------------------------------------------------
    # K-fold training of the three individual models
    # ------------------------------------------------------------------
    FOLDS = 20
    KF = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    cat_features = ['Sex']
    # Out-of-fold predictions (train) and accumulated test predictions.
    oof_cat = np.zeros(len(df_train))
    pred_cat = np.zeros(len(df_test))
    oof_xgb = np.zeros(len(df_train))
    pred_xgb = np.zeros(len(df_test))
    oof_lgb = np.zeros(len(df_train))
    pred_lgb = np.zeros(len(df_test))

    for i, (train_idx, valid_idx) in enumerate(KF.split(X_train, Y_train)):
        print('#' * 15, i + 1, '#' * 15)
        # Split this fold.
        x_train, y_train = X_train.iloc[train_idx], Y_train.iloc[train_idx]
        x_valid, y_valid = X_train.iloc[valid_idx], Y_train.iloc[valid_idx]

        # Fit each model, using the validation fold for early stopping.
        model_cat.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], cat_features=cat_features,
                      use_best_model=True, verbose=0)
        model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=0)
        model_lgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)])

        # CatBoost predictions.
        oof_cat[valid_idx] = model_cat.predict(x_valid)
        pred_cat += model_cat.predict(x_test)
        # XGBoost predictions.
        oof_xgb[valid_idx] = model_xgb.predict(x_valid)
        pred_xgb += model_xgb.predict(x_test)
        # LightGBM predictions.
        oof_lgb[valid_idx] = model_lgb.predict(x_valid)
        pred_lgb += model_lgb.predict(x_test)

        # Per-fold RMSE in log1p space (i.e. RMSLE on the original scale).
        cat_rmse = mean_squared_error(y_valid, oof_cat[valid_idx]) ** 0.5
        xgb_rmse = mean_squared_error(y_valid, oof_xgb[valid_idx]) ** 0.5
        lgb_rmse = mean_squared_error(y_valid, oof_lgb[valid_idx]) ** 0.5

        print(
            f'FOLD {i + 1} CATBOOST_RMSE = {cat_rmse:.4f} <=> XGB_RMSE = {xgb_rmse:.4f} <=> LGB_RMSE = {lgb_rmse:.4f}')

    # ------------------------------------------------------------------
    # Prediction and blending
    # ------------------------------------------------------------------
    # Average the test predictions accumulated over the folds.
    pred_cat /= FOLDS
    pred_xgb /= FOLDS
    pred_lgb /= FOLDS

    # The stack must be fitted before predicting, and it must predict on the
    # engineered test features (x_test), not on the raw df_test frame, which
    # still contains 'id' and the string-valued 'Sex' column and lacks every
    # engineered feature.
    model_stack.fit(X_train, Y_train)
    pred_stack = model_stack.predict(x_test)

    # Blend on the original scale. NOTE(review): pred_lgb is not part of this
    # blend; LightGBM only contributes through the stack — confirm intended.
    pred_all = np.expm1(pred_xgb) * 0.1 + np.expm1(pred_stack) * 0.80 + np.expm1(pred_cat) * 0.1

    submission = pd.DataFrame({
        'id': df_test['id'],
        'Calories': pred_all
    })
    # Clamp predictions to the range [1, 20 * Duration] per row.
    submission['Calories'] = np.clip(submission['Calories'], a_min=1, a_max=20*df_test['Duration'])
    submission.to_csv('/kaggle/working/submission.csv', index=False)
相关推荐
EMA2 分钟前
MaxKB 技术解析文档
人工智能
湘美书院--湘美谈教育2 分钟前
湘美谈教育AI赋能系列经验集锦:学好唐诗宋词的点滴心得体会
大数据·人工智能·深度学习·神经网络·机器学习
迦蓝叶8 分钟前
【开源自荐】JAiRouter:一个轻量级 AI 模型服务网关的开源实践
java·人工智能·spring·开源·llm-gateway·mass
Java知识技术分享16 分钟前
opencode安装ui-ux-pro-max和frontend-ui-ux技能
人工智能·ui·个人开发·ai编程·ux
苏映视官方账号22 分钟前
精品案例丨方寸之间,“微” 毫毕现 —— 圆刀机高精度检测工艺优化实例
人工智能·数码相机·视觉检测·制造
Cloud_Shy61824 分钟前
解读《Effective Python 3rd Edition》:从练气到老魔(第六章 Item 40 - 43)
android·开发语言·人工智能·笔记·python·学习方法
Sammyyyyy24 分钟前
月之暗面 Kimi Code 0.4.0 发布,终端 AI 编码助手全面采用 TypeScript,实现毫秒级启动
前端·javascript·人工智能·ai·typescript·servbay
装不满的克莱因瓶24 分钟前
掌握生成对抗网络(GAN)的优化目标与评估指标——从博弈函数到生成质量衡量体系
人工智能·python·深度学习·算法·机器学习
whyfail27 分钟前
小米 MiMo Code 开源:能免费用 2.5 模型的 AI 编程 Agent
人工智能
慕木沐33 分钟前
【Spring AI + Google ADK 】流式输出时 outputKey 状态缓存失败的问题
人工智能·spring·缓存