6 回归集成:xgb、lgb、cat

这段代码拷贝自 Kaggle(Rohlik Orders Forecasting Challenge),从中可以学习以下几点:

  1. 如何使用三个树模型模块化训练;

  2. 文本特征如何做,如何挖掘;

  3. 时间特征的处理;

  4. 模型权重集成;

    import pandas as pd
    import math
    import numpy as np
    import joblib
    import optuna

    from lightgbm import LGBMRegressor
    from catboost import CatBoostRegressor
    from xgboost import XGBRegressor
    from sklearn.preprocessing import *
    from sklearn.metrics import *
    from sklearn.model_selection import *

    from sklearn.decomposition import TruncatedSVD
    from sklearn.feature_extraction.text import TfidfVectorizer

    import datetime
    import gc
    from sklearn.base import clone

    pd.set_option('display.max_columns', None)

    import warnings
    warnings.filterwarnings("ignore")

    # Competition inputs: d_s is the sample-submission frame, te_d the test set,
    # tr_d the training set.
    d_s = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/solution_example.csv')

    te_d = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/test.csv')

    tr_d = pd.read_csv('/kaggle/input/rohlik-orders-forecasting-challenge/train.csv')

    # The row identifier is removed from both frames.
    tr_d = tr_d.drop(columns='id')
    te_d = te_d.drop(columns='id')

    # Missing holiday names become the literal string 'None'. Assign the result
    # back instead of calling fillna(inplace=True) on a column selection: the
    # chained in-place form is deprecated and does not update the parent frame
    # once pandas Copy-on-Write is active.
    tr_d['holiday_name'] = tr_d['holiday_name'].fillna('None')
    te_d['holiday_name'] = te_d['holiday_name'].fillna('None')

    def Process_Date(Df, group_max=None):
        """Add calendar-derived feature columns to ``Df`` and return it.

        ``Df`` is modified in place (the same object is returned). It must
        contain the columns ``date``, ``holiday`` and ``shops_closed``.

        Columns written: ``year``, ``day``, ``month``, ``month_name``,
        ``day_of_week``, ``week`` (ISO week number), sine/cosine encodings of
        year, month, day and ``group``, the pseudo-week bucket ``group``, and
        the aggregates ``total_holidays_month`` / ``total_shops_closed_week``.

        Args:
            Df: frame to enrich; ``date`` is converted with ``pd.to_datetime``.
            group_max: divisor used for ``group_sin`` / ``group_cos``. When
                ``None`` (default) the frame's own ``group`` maximum is used,
                which is the original behaviour. Pass the same value for train
                and test frames to keep the two encodings on the same scale.

        Returns:
            The same ``Df`` object with the additional columns.
        """
        Df['date'] = pd.to_datetime(Df['date'])

        dates = Df['date'].dt
        Df['year'] = dates.year
        Df['day'] = dates.day
        Df['month'] = dates.month
        Df['month_name'] = dates.month_name()
        Df['day_of_week'] = dates.day_name()
        Df['week'] = dates.isocalendar().week

        two_pi = 2 * np.pi
        # NOTE(review): for integer years sin(2*pi*year) is ~0 and
        # cos(2*pi*year) is ~1 for every row, so these two columns carry no
        # information. Kept because downstream code selects them by name.
        Df['year_sin'] = np.sin(two_pi * Df['year'])
        Df['year_cos'] = np.cos(two_pi * Df['year'])
        Df['month_sin'] = np.sin(two_pi * Df['month'] / 12)
        Df['month_cos'] = np.cos(two_pi * Df['month'] / 12)
        Df['day_sin'] = np.sin(two_pi * Df['day'] / 31)
        Df['day_cos'] = np.cos(two_pi * Df['day'] / 31)

        # Bucket id: 48 buckets per year counted from 2020, 4 per month, plus
        # the whole-week offset within the month (day // 7).
        Df['group'] = (Df['year'] - 2020) * 48 + Df['month'] * 4 + Df['day'] // 7

        Df['total_holidays_month'] = Df.groupby(['year', 'month'])['holiday'].transform('sum')
        Df['total_shops_closed_week'] = Df.groupby(['year', 'week'])['shops_closed'].transform('sum')

        scale = Df['group'].max() if group_max is None else group_max
        Df['group_sin'] = np.sin(two_pi * Df['group'] / scale)
        Df['group_cos'] = np.cos(two_pi * Df['group'] / scale)

        return Df

    tr_d = Process_Date(tr_d)
    te_d = Process_Date(te_d)

    # Keep only the modelling columns plus the target. The explicit .copy()
    # makes tr_d an independent frame, so the column assignments below do not
    # write into a selection of the original frame.
    tr_d = tr_d[['warehouse', 'date', 'holiday_name', 'holiday', 'shops_closed',
    'winter_school_holidays', 'school_holidays', 'year', 'day', 'month',
    'month_name', 'day_of_week', 'week', 'year_sin', 'year_cos',
    'month_sin', 'month_cos', 'day_sin', 'day_cos', 'group',
    'total_holidays_month', 'total_shops_closed_week',
    'group_sin', 'group_cos',
    'orders']].copy()

    le_month = LabelEncoder()
    le_week = LabelEncoder()
    le_war = LabelEncoder()

    # Each encoder is fitted on the training column and then reused on the
    # test column, so both frames share one label-to-integer mapping.
    for column, encoder in (('month_name', le_month),
                            ('day_of_week', le_week),
                            ('warehouse', le_war)):
        tr_d[column] = encoder.fit_transform(tr_d[column])
        te_d[column] = encoder.transform(te_d[column])

    def apply_tfidf_svd(df, text_column, max_features=1000, n_components=10,
                        vectorizer=None, svd=None, fit=True):
        """Append TF-IDF + TruncatedSVD components of a text column to ``df``.

        The text in ``df[text_column]`` is vectorised with ``TfidfVectorizer``
        and reduced with ``TruncatedSVD``; the resulting components are added
        as columns named ``<text_column>_tfidf_<i>``. The input frame is not
        modified; its index is reset in the returned frame.

        Args:
            df: input frame containing ``text_column``.
            text_column: name of the text column to embed.
            max_features: ``max_features`` for a newly created vectorizer.
            n_components: number of SVD components for a newly created SVD.
            vectorizer: optional ``TfidfVectorizer`` to use instead of
                creating one. With ``fit=True`` it is fitted here.
            svd: optional ``TruncatedSVD`` to use instead of creating one.
                With ``fit=True`` it is fitted here.
            fit: when ``True`` (default) fit both transformers on ``df``;
                when ``False`` only transform with the supplied, already
                fitted ``vectorizer`` and ``svd``.

        Returns:
            A new frame: ``df`` with a fresh index plus the component columns.

        Raises:
            ValueError: if ``fit`` is ``False`` and ``vectorizer`` or ``svd``
                is missing.
        """
        if not fit and (vectorizer is None or svd is None):
            raise ValueError("fit=False requires a fitted vectorizer and svd")

        if vectorizer is None:
            vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
        if svd is None:
            svd = TruncatedSVD(n_components)

        if fit:
            vectors = vectorizer.fit_transform(df[text_column])
            x_sv = svd.fit_transform(vectors)
        else:
            vectors = vectorizer.transform(df[text_column])
            x_sv = svd.transform(vectors)

        tfidf_df = pd.DataFrame(x_sv)
        tfidf_df.columns = [f"{text_column}_tfidf_{i}" for i in tfidf_df.columns.to_list()]

        # Reset the index so the positional concat lines up row for row.
        df = df.reset_index(drop=True)

        return pd.concat([df, tfidf_df], axis="columns")

    # Fit the text transformers on the training frame only and reuse them for
    # the test frame. Fitting a separate vectorizer/SVD per frame would give
    # columns with the same names but unrelated meaning in train and test.
    holiday_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    holiday_svd = TruncatedSVD(10)

    tr_d = apply_tfidf_svd(tr_d, 'holiday_name',
                           vectorizer=holiday_vectorizer, svd=holiday_svd)
    te_d = apply_tfidf_svd(te_d, 'holiday_name',
                           vectorizer=holiday_vectorizer, svd=holiday_svd, fit=False)

    # The raw date and holiday text columns are removed from both frames
    # before modelling.
    for frame in (tr_d, te_d):
        frame.drop(columns=['date', 'holiday_name'], inplace=True)

    print(f"Shape Of Train Data is {tr_d.shape}")
    print(f"Shape Of Test Data is {te_d.shape}")

    %%time

    # Target is the 'orders' column; every other training column is a feature.
    y = tr_d['orders']
    X = tr_d.drop(columns='orders')

    def cross_validate(model, n_splits=15):
        """Run grouped K-fold CV for ``model`` and return averaged test predictions.

        Reads the module-level ``X`` and ``y`` (training features and target)
        and ``te_d`` (test features). Folds come from ``GroupKFold`` with
        ``X['group']`` as the grouping key. For every fold a fresh clone of
        ``model`` is fitted on the training part with the validation part
        passed as ``eval_set``, scored with MAPE on the validation part, and
        used to predict ``te_d``. The mean and standard deviation of the fold
        scores are printed.

        Args:
            model: unfitted estimator that supports ``clone``,
                ``fit(X, y, eval_set=...)`` and ``predict``.
            n_splits: number of folds.

        Returns:
            Array of length ``len(te_d)``: the mean of the per-fold test
            predictions.
        """
        scores = []
        test_preds = np.zeros(len(te_d))

        kfold = GroupKFold(n_splits=n_splits)

        for train_index, valid_index in kfold.split(X, y, groups=X['group']):
            X_train, y_train = X.iloc[train_index], y.iloc[train_index]
            X_val, y_val = X.iloc[valid_index], y.iloc[valid_index]

            # Clone so every fold starts from an unfitted copy of the estimator.
            m = clone(model)
            m.fit(X_train, y_train, eval_set=[(X_val, y_val)])

            scores.append(mean_absolute_percentage_error(y_val, m.predict(X_val)))

            # Each fold contributes 1/n_splits of the final test prediction.
            test_preds += m.predict(te_d) / n_splits

            gc.collect()

        scores = np.array(scores)
        print(f" MAPE mean: {scores.mean():.7f} (+- {scores.std():.7f})")

        return test_preds

    %%time

    # CatBoost run, seeded with its own value.
    SEED = 2375

    cat = CatBoostRegressor(
        verbose=0,
        learning_rate=0.01,
        iterations=2000,
        random_state=SEED,
    )
    cat_test_preds = cross_validate(cat)

    # SEED is rebound here; code that reads SEED after this point sees 1023.
    SEED = 1023
    xgb = XGBRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        verbosity=0,
        random_state=SEED,
    )
    xgb_test_preds = cross_validate(xgb)

    %%time

    # LightGBM run with library defaults apart from logging and the seed;
    # SEED is the module-level value assigned earlier in the script.
    lgb = LGBMRegressor(verbose=-1, random_state=SEED)
    lgb_test_preds = cross_validate(lgb)

    %%time

    # Blend weights applied to each model's averaged test predictions.
    weights = {
        'cat_test_preds': 0.45,
        'lgb_test_preds': 0.45,
        'xgb_test_preds': 0.1,
    }

    cat_test_preds_weighted = cat_test_preds * weights['cat_test_preds']
    lgb_test_preds_weighted = lgb_test_preds * weights['lgb_test_preds']
    xgb_test_preds_weighted = xgb_test_preds * weights['xgb_test_preds']

    ensemble_preds = cat_test_preds_weighted + lgb_test_preds_weighted + xgb_test_preds_weighted

    # Only 'orders' is overwritten in the sample-submission frame; the former
    # `d_s['id'] = d_s['id']` self-assignment was a no-op and has been removed.
    # NOTE(review): this assumes d_s rows are in the same order as te_d rows,
    # since the test ids were dropped earlier. Confirm against the input files.
    d_s['orders'] = ensemble_preds

    d_s.to_csv('Submission.csv', index=False)

    print(d_s.head())

相关推荐
澪-sl6 分钟前
基于CNN的人脸关键点检测
人工智能·深度学习·神经网络·计算机视觉·cnn·视觉检测·卷积神经网络
羊小猪~~21 分钟前
数据库学习笔记(十七)--触发器的使用
数据库·人工智能·后端·sql·深度学习·mysql·考研
摸爬滚打李上进39 分钟前
重生学AI第十六集:线性层nn.Linear
人工智能·pytorch·python·神经网络·机器学习
HuashuiMu花水木40 分钟前
PyTorch笔记1----------Tensor(张量):基本概念、创建、属性、算数运算
人工智能·pytorch·笔记
lishaoan771 小时前
使用tensorflow的线性回归的例子(四)
人工智能·tensorflow·线性回归
AI让世界更懂你1 小时前
【ACL系列论文写作指北15-如何进行reveiw】-公平、公正、公开
人工智能·自然语言处理
牛客企业服务2 小时前
2025年AI面试推荐榜单,数字化招聘转型优选
人工智能·python·算法·面试·职场和发展·金融·求职招聘
视觉语言导航2 小时前
RAL-2025 | 清华大学数字孪生驱动的机器人视觉导航!VR-Robo:面向视觉机器人导航与运动的现实-模拟-现实框架
人工智能·深度学习·机器人·具身智能
**梯度已爆炸**3 小时前
自然语言处理入门
人工智能·自然语言处理
ctrlworks3 小时前
楼宇自控核心功能:实时监控设备运行,快速诊断故障,赋能设备寿命延长
人工智能·ba系统厂商·楼宇自控系统厂家·ibms系统厂家·建筑管理系统厂家·能耗监测系统厂家