6 回归集成：xgb、lgb、cat

这份代码拷贝自 Kaggle（Rohlik Orders Forecasting Challenge），主要可以学习以下几点：

如何模块化地训练三个树模型（XGBoost、LightGBM、CatBoost）；
文本特征如何构造与挖掘（TF-IDF + SVD）；
时间特征的处理；
模型的加权集成。

import pandas as pd
import math
import numpy as np
import joblib
import optuna

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import *
from sklearn.metrics import *
from sklearn.model_selection import *

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

import datetime
import gc
from sklearn.base import clone

# Show every column when a DataFrame is printed.
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

# Directory holding the competition files.
DATA_DIR = '/kaggle/input/rohlik-orders-forecasting-challenge'

d_s = pd.read_csv(DATA_DIR + '/solution_example.csv')

te_d = pd.read_csv(DATA_DIR + '/test.csv')

tr_d = pd.read_csv(DATA_DIR + '/train.csv')

# The row identifier is not used as a model feature.
tr_d.drop('id', axis=1, inplace=True)
te_d.drop('id', axis=1, inplace=True)

# Assign the filled column back instead of calling fillna(inplace=True) on
# the selected column: the chained in-place form operates on a temporary
# object under pandas Copy-on-Write and would leave the NaNs in the frame.
tr_d['holiday_name'] = tr_d['holiday_name'].fillna('None')
te_d['holiday_name'] = te_d['holiday_name'].fillna('None')

def Process_Date(Df, group_period=None):
    """Add calendar-derived feature columns to ``Df`` in place and return it.

    ``Df`` must contain the columns ``date`` (anything ``pd.to_datetime``
    accepts), ``holiday`` and ``shops_closed``.

    Args:
        Df: DataFrame to extend. It is modified in place.
        group_period: Period used for the ``group_sin`` / ``group_cos``
            encoding. When ``None`` (the default) the maximum ``group``
            value found in ``Df`` is used, which means two frames processed
            separately get different periods; pass the same value for both
            to make their encodings comparable.

    Returns:
        The same DataFrame object, with the new columns appended.
    """
    Df['date'] = pd.to_datetime(Df['date'])
    dates = Df['date'].dt

    # Plain calendar components.
    Df['year'] = dates.year
    Df['day'] = dates.day
    Df['month'] = dates.month
    Df['month_name'] = dates.month_name()
    Df['day_of_week'] = dates.day_name()
    # ISO week number; the first days of January can belong to the last ISO
    # week of the previous year.
    Df['week'] = dates.isocalendar().week

    two_pi = 2 * np.pi

    # NOTE(review): ``year`` is an integer, so sin(2*pi*year) is ~0 and
    # cos(2*pi*year) is ~1 for every row. The columns are kept so that the
    # feature set stays unchanged.
    Df['year_sin'] = np.sin(two_pi * Df['year'])
    Df['year_cos'] = np.cos(two_pi * Df['year'])
    Df['month_sin'] = np.sin(two_pi * Df['month'] / 12)
    Df['month_cos'] = np.cos(two_pi * Df['month'] / 12)
    Df['day_sin'] = np.sin(two_pi * Df['day'] / 31)
    Df['day_cos'] = np.cos(two_pi * Df['day'] / 31)

    # Coarse time bucket built from year, month and the 7-day slice of the
    # month; also read later as the grouping key for cross-validation.
    Df['group'] = (Df['year'] - 2020) * 48 + Df['month'] * 4 + Df['day'] // 7

    # Aggregates computed over the rows present in this frame only.
    Df['total_holidays_month'] = Df.groupby(['year', 'month'])['holiday'].transform('sum')
    Df['total_shops_closed_week'] = Df.groupby(['year', 'week'])['shops_closed'].transform('sum')

    period = Df['group'].max() if group_period is None else group_period
    Df['group_sin'] = np.sin(two_pi * Df['group'] / period)
    Df['group_cos'] = np.cos(two_pi * Df['group'] / period)

    return Df
# Derive the calendar features for both splits.
tr_d = Process_Date(tr_d)
te_d = Process_Date(te_d)

# Keep only the columns used for modelling, with the target last.
tr_d = tr_d[[
    'warehouse', 'date', 'holiday_name', 'holiday', 'shops_closed',
    'winter_school_holidays', 'school_holidays',
    'year', 'day', 'month', 'month_name', 'day_of_week', 'week',
    'year_sin', 'year_cos', 'month_sin', 'month_cos', 'day_sin', 'day_cos',
    'group', 'total_holidays_month', 'total_shops_closed_week',
    'group_sin', 'group_cos',
    'orders',
]]

# One label encoder per categorical column.
le_month, le_week, le_war = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Each encoder is fitted on the training column and then applied to the
# matching test column.
for column, encoder in (
    ('month_name', le_month),
    ('day_of_week', le_week),
    ('warehouse', le_war),
):
    tr_d[column] = encoder.fit_transform(tr_d[column])
    te_d[column] = encoder.transform(te_d[column])

def _fit_tfidf_svd(texts, max_features=1000, n_components=10):
    """Fit a TF-IDF vectorizer and a truncated SVD on ``texts``; return both."""
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    svd = TruncatedSVD(n_components)
    svd.fit(vectorizer.fit_transform(texts))
    return vectorizer, svd


def apply_tfidf_svd(df, text_column, max_features=1000, n_components=10, fitted=None):
    """Append SVD-reduced TF-IDF features of ``text_column`` to ``df``.

    Args:
        df: DataFrame containing ``text_column``.
        text_column: Name of the column holding the text to vectorize.
        max_features: Vocabulary size limit passed to ``TfidfVectorizer``.
            Ignored when ``fitted`` is given.
        n_components: Number of SVD components. Ignored when ``fitted`` is
            given.
        fitted: Optional ``(vectorizer, svd)`` pair that has already been
            fitted. When ``None`` (the default) a new pair is fitted on
            ``df[text_column]`` itself.

    Returns:
        A new DataFrame: ``df`` with its index reset, plus one column per
        SVD component named ``"<text_column>_tfidf_<i>"``.
    """
    if fitted is None:
        fitted = _fit_tfidf_svd(df[text_column], max_features, n_components)
    vectorizer, svd = fitted

    reduced = svd.transform(vectorizer.transform(df[text_column]))

    tfidf_df = pd.DataFrame(reduced)
    tfidf_df.columns = [
        text_column + "_tfidf_" + str(component) for component in tfidf_df.columns.to_list()
    ]

    # Reset the index so the positional concat lines up row by row.
    df = df.reset_index(drop=True)

    return pd.concat([df, tfidf_df], axis="columns")


# Fit the text pipeline once, on the training split, and reuse it for the
# test split. Fitting a separate vectorizer/SVD per split would give each
# split its own vocabulary and components, so the resulting columns would
# not describe the same thing in train and test.
holiday_text_model = _fit_tfidf_svd(tr_d['holiday_name'])
tr_d = apply_tfidf_svd(tr_d, 'holiday_name', fitted=holiday_text_model)
te_d = apply_tfidf_svd(te_d, 'holiday_name', fitted=holiday_text_model)

# The raw date and holiday name have been replaced by derived features.
tr_d.drop(['date', 'holiday_name'], axis=1, inplace=True)
te_d.drop(['date', 'holiday_name'], axis=1, inplace=True)

print(f"Shape Of Train Data is {tr_d.shape}")
print(f"Shape Of Test Data is {te_d.shape}")

# The notebook's "%%time" cell magic was dropped here: it is IPython-only
# syntax and a syntax error in a plain Python file.

# Feature matrix and target used by the cross-validation below.
X = tr_d.drop('orders', axis=1)
y = tr_d['orders']

def cross_validate(model, n_splits=15):
    """Run grouped K-fold CV for ``model`` and return averaged test predictions.

    Reads the module-level ``X`` (features), ``y`` (target) and ``te_d``
    (test features). Folds are formed with ``GroupKFold`` on ``X['group']``.
    For every fold a fresh clone of ``model`` is fitted, scored with MAPE on
    the validation part, and used to predict the test set.

    Args:
        model: Unfitted estimator; its ``fit`` must accept ``eval_set``.
        n_splits: Number of folds.

    Returns:
        Array of length ``len(te_d)``: the mean of the per-fold test
        predictions. The mean and standard deviation of the fold MAPE scores
        are printed, not returned.
    """
    fold_scores = []
    test_preds = np.zeros(len(te_d))

    # Select the test columns by the training column names so both frames
    # are presented to the model in the same order; a missing feature fails
    # here with a KeyError.
    test_features = te_d[X.columns]

    groups = X['group']
    kfold = GroupKFold(n_splits=n_splits)

    for train_index, valid_index in kfold.split(X, y, groups=groups):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_val, y_val = X.iloc[valid_index], y.iloc[valid_index]

        # Clone so every fold starts from an unfitted copy of the estimator.
        fold_model = clone(model)
        fold_model.fit(X_train, y_train, eval_set=[(X_val, y_val)])

        fold_scores.append(
            mean_absolute_percentage_error(y_val, fold_model.predict(X_val))
        )

        # Accumulate this fold's share of the averaged test prediction.
        test_preds += fold_model.predict(test_features) / n_splits

        gc.collect()

    fold_scores = np.asarray(fold_scores)
    print(f" MAPE mean: {fold_scores.mean():.7f} (+- {fold_scores.std():.7f})")

    return test_preds
# The notebook's "%%time" cell magics were dropped from this section: they
# are IPython-only syntax and a syntax error in a plain Python file.

# CatBoost
SEED = 2375

cat = CatBoostRegressor(
    verbose=0,
    learning_rate=0.01,
    iterations=2000,
    random_state=SEED,
)
cat_test_preds = cross_validate(cat)

# XGBoost
SEED = 1023

xgb = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    verbosity=0,
    random_state=SEED,
)
xgb_test_preds = cross_validate(xgb)

# LightGBM uses the seed that was last assigned above (1023).
lgb = LGBMRegressor(
    verbose=-1,
    random_state=SEED,
)
lgb_test_preds = cross_validate(lgb)

# The notebook's "%%time" cell magic was dropped here: it is IPython-only
# syntax and a syntax error in a plain Python file.

# Blend weights for the three models' averaged test predictions.
weights = {
    'cat_test_preds': 0.45,
    'lgb_test_preds': 0.45,
    'xgb_test_preds': 0.1,
}

# A weighted average only stays on the scale of the target when the weights
# sum to one; fail early if they are edited inconsistently.
if not math.isclose(sum(weights.values()), 1.0):
    raise ValueError(f"Ensemble weights must sum to 1, got {sum(weights.values())}")

cat_test_preds_weighted = cat_test_preds * weights['cat_test_preds']
lgb_test_preds_weighted = lgb_test_preds * weights['lgb_test_preds']
xgb_test_preds_weighted = xgb_test_preds * weights['xgb_test_preds']

ensemble_preds = cat_test_preds_weighted + lgb_test_preds_weighted + xgb_test_preds_weighted

# NOTE(review): assigned positionally, so this assumes the sample
# submission rows are in the same order as the test rows - confirm.
d_s['orders'] = ensemble_preds

d_s.to_csv('Submission.csv', index=False)

print(d_s.head())