Kaggle-Predicting Optimal Fertilizers-(多分类+xgboost+同一特征值多样性)

Predicting Optimal Fertilizers

题意:

给出土壤的特性,预测出3种最佳的肥料

数据处理:

1.有数字型和类别型,类别不能随意换成数字,独热编码。cat可以直接处理category类型。

2.构造一些相关土壤特性特征

3.由于label是category类型,但是xgb不可以处理category类型,因此需要先编码,最后求出结果之后再解码。

建立模型:

1.catboost交叉验证、xgboost交叉验证

代码:
python 复制代码
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor, CatBoostClassifier
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from catboost import Pool, CatBoostClassifier


def init():
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 仅输出错误日志
    warnings.simplefilter('ignore')  # 忽略警告日志
    pd.set_option('display.width', 1000)
    pd.set_option('display.max_colwidth', 1000)
    pd.set_option("display.max_rows", 1000)
    pd.set_option("display.max_columns", 1000)


def show_dataframe(df):
    print("查看特征值和特征值类型\n" + str(df.dtypes) + "\n" + "-" * 100)
    print("查看前10行信息\n" + str(df.head()) + "\n" + "-" * 100)
    print("查看每个特征值的各种数据统计信息\n" + str(df.describe()) + "\n" + "-" * 100)
    print("输出重复行的个数\n" + str(df.duplicated().sum()) + "\n" + "-" * 100)
    print("查看每列的缺失值个数\n" + str(df.isnull().sum()) + "\n" + "-" * 100)
    print("查看缺失值的具体信息\n" + str(df.info()) + "\n" + "-" * 100)
    # print("输出X所有值出现的是什么,还有对应出现的次数\n" + str(df['X'].value_counts()) + "\n" + "-" * 100)


def show_relation(data, colx, coly):
    if data[colx].dtype == 'object' or data[colx].dtype == 'category' or len(data[colx].unique()) < 20:
        seaborn.boxplot(x=colx, y=coly, data=data)
    else:
        plt.scatter(data[colx], data[coly])
    plt.xlabel(colx)
    plt.ylabel(coly)
    plt.show()


def mapk(actual, predicted, k=3):
    def apk(a, p, k):
        score = 0.0
        for i in range(min(k, len(p))):
            if p[i] == a:
                score += 1.0 / (i + 1)
                break
        return score

    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])


if __name__ == '__main__':
    init()

    df_train = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
    df_test = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')
    df_train_additional = pd.read_csv('/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv')
    pd.concat([df_train, df_train_additional], ignore_index=True)

    print("Start Feature enggering" + "-" * 70 + "\n")
    df_all = pd.concat([df_train.drop(['id', 'Fertilizer Name'], axis=1), df_test.drop(['id'], axis=1)], axis=0)

    df_all['Temp_Humidity_Interaction'] = df_all['Temparature'] * df_all['Humidity']
    df_all['N_P_Ratio'] = df_all['Nitrogen'] / (df_all['Phosphorous'].replace(0, 1e-6))
    df_all['K_P_Ratio'] = df_all['Potassium'] / (df_all['Phosphorous'].replace(0, 1e-6))
    df_all['Soil_Crop_Combination'] = df_all['Soil Type'].astype(str) + '_' + df_all['Crop Type'].astype(str)

    df_all['P_to_K'] = df_all['Phosphorous'] / (df_all['Potassium'] + 1e-5)
    df_all['Total_NPK'] = df_all['Nitrogen'] + df_all['Phosphorous'] + df_all['Potassium']
    df_all['Climate_Index'] = (df_all['Temparature'] + df_all['Humidity']) / 2
    df_all['Water_Stress'] = df_all['Humidity'] - df_all['Moisture']

    original_numerical_cols = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
    for col in original_numerical_cols:
        df_all[f'{col}_Binned'] = df_all[col].astype(str)

    numerical_features = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous',
                          'Temp_Humidity_Interaction', 'N_P_Ratio', 'K_P_Ratio']
    categorical_features = ['Soil Type', 'Crop Type', 'Soil_Crop_Combination']
    categorical_features.extend([f'{col}_Binned' for col in original_numerical_cols])

    poly_features_to_transform = original_numerical_cols
    poly = PolynomialFeatures(degree=2, include_bias=False)
    df_all_transformers = poly.fit_transform(df_all[poly_features_to_transform])

    poly_feature_names = poly.get_feature_names_out(poly_features_to_transform)
    df_all = df_all.drop(columns=poly_features_to_transform)
    df_all = pd.concat([df_all, pd.DataFrame(df_all_transformers, columns=poly_feature_names,index=df_all.index)], axis=1)

    numerical_features = df_all.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = df_all.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

    all_features_ordered = numerical_features + categorical_features
    df_all = df_all[all_features_ordered]

    all_categories_union = {}
    for col in categorical_features:
        if col in df_all.columns:
            all_categories_union[col] = pd.concat([
                df_all[col],
            ], axis=0).astype(str).unique()
        else:
            print(f"Warning: Categorical column '{col}' not found after feature engineering. Skipping conversion.")

    for col in categorical_features:
        if col in df_all.columns:
            df_all[col] = pd.Categorical(df_all[col], categories=all_categories_union[col])

    le = LabelEncoder()
    X_train = df_all[:df_train.shape[0]]
    Y_train = df_train['Fertilizer Name']
    Y_train = le.fit_transform(Y_train)
    X_test = df_all[df_train.shape[0]:]

    print("Training model" + "-" * 70 + "\n")
    model_xgb = XGBClassifier(
        max_depth=8,  # 降低树深度
        colsample_bytree=0.5,  # 控制特征采样比例
        subsample=0.7,  # 控制数据采样比例
        n_estimators=3000,  # 减少迭代轮数
        learning_rate=0.03,  # 降低学习率
        gamma=0.5,  # 增加分裂难度
        max_delta_step=2,  # 限制权重更新步长
        reg_alpha=5,  # 增强L1正则化
        reg_lambda=3,  # 增强L2正则化
        early_stopping_rounds=100,  # 更早停止训练
        objective='multi:softprob',
        random_state=13,
        enable_categorical=True,
        tree_method='hist',
        device='cuda'
    )


    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    pred_xgb = np.zeros((X_test.shape[0], len(le.classes_)))

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, Y_train)):
        print(f"\nFold {fold + 1}/{kfold.n_splits}")

        x_fold_train, x_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = Y_train[train_idx], Y_train[val_idx]

        model_xgb.fit(
            x_fold_train, y_fold_train,
            eval_set = [(x_fold_val, y_fold_val)],
            verbose = 100,
        )

        pred_xgb += model_xgb.predict_proba(X_test) / kfold.n_splits

    pred_top3_xgb = np.argsort(pred_xgb, axis=1)[:, -3:][:, ::-1]
    top3_label = []
    for row in pred_top3_xgb:
        converted = [le.classes_[i] for i in row]
        top3_label.append(converted)

    submission = pd.DataFrame({
        'id': df_test['id'],
        'Fertilizer Name': [' '.join(preds) for preds in top3_label],
    })
    submission.to_csv('/kaggle/working/submission.csv', index=False)
#xgb0.35642
相关推荐
2501_9247306126 分钟前
智慧城管复杂人流场景下识别准确率↑32%:陌讯多模态感知引擎实战解析
大数据·人工智能·算法·计算机视觉·目标跟踪·视觉检测·边缘计算
CONDIMENTTTT40 分钟前
[机器学习]05-基于Fisher线性判别的鸢尾花数据集分类
人工智能·分类·数据挖掘
归辞...1 小时前
「iOS」————分类与扩展
ios·分类·cocoa
Kingfar_11 小时前
智能移动终端导航APP用户体验研究案例分享
人工智能·算法·人机交互·ux·用户界面·用户体验
攻城狮7号1 小时前
小米开源大模型 MiDashengLM-7B:不仅是“听懂”,更能“理解”声音
人工智能·midashenglm-7b·小米开源大模型·声音理解大模型
程序边界2 小时前
AI鉴伪技术:守护数字时代的真实性防线
人工智能
bryant_meng2 小时前
【DeepID】《Deep Learning Face Representation from Predicting 10,000 Classes》
人工智能·深度学习·人脸识别·verification·identification
申耀的科技观察2 小时前
【观察】亚信科技:AI大模型交付引领,三大新引擎重构业务增长逻辑
大数据·人工智能·科技·重构
CONDIMENTTTT2 小时前
[机器学习]04-基于K近邻(KNN)的鸢尾花数据集分类
人工智能·机器学习
Cprsensors3 小时前
压力传感器选型铁三角:介质·安全·精度
人工智能·机器人·无人机