Kaggle-Predicting Optimal Fertilizers-(多分类+xgboost+同一特征值多样性)

Predicting Optimal Fertilizers

题意:

给出土壤的特性,预测出3种最佳的肥料

数据处理:

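1.特征分为数值型和类别型。类别特征不能随意映射成数字(会引入并不存在的大小顺序),可以使用独热编码;CatBoost 可以直接处理 category 类型,XGBoost 在设置 enable_categorical=True 后也可以直接处理 category 类型(下面的代码采用这种方式)。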

2.构造一些相关土壤特性特征

3.label(肥料名称)是类别型字符串,而 XGBoost 要求类别标签为整数,因此需要先用 LabelEncoder 编码,得到预测结果之后再解码回肥料名称。

建立模型:

1.catboost交叉验证、xgboost交叉验证(下面的代码只给出了 xgboost 的 10 折分层交叉验证部分)

代码:
python 复制代码
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor, CatBoostClassifier
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from catboost import Pool, CatBoostClassifier


def init():
    """Configure process-wide logging, warning and pandas display settings.

    Sets the ``TF_CPP_MIN_LOG_LEVEL`` environment variable to ``'3'``,
    installs an "ignore" filter for all Python warnings, and raises the four
    pandas display limits below to 1000 so wide frames print untruncated.
    """
    # Original author's note: only error logs should be emitted.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    # Original author's note: suppress warning output.
    warnings.simplefilter('ignore')

    display_options = (
        'display.width',
        'display.max_colwidth',
        'display.max_rows',
        'display.max_columns',
    )
    for option_name in display_options:
        pd.set_option(option_name, 1000)


def show_dataframe(df):
    """Print a quick exploratory summary of ``df`` to stdout.

    Sections, each followed by a 100-dash separator line: column dtypes,
    the first 10 rows, ``describe()`` statistics, the number of duplicated
    rows, per-column null counts, and the ``DataFrame.info()`` report.

    Args:
        df: the pandas DataFrame to summarise.

    Returns:
        None. All output goes to stdout.
    """
    # Local import so this function stays self-contained.
    import io

    # DataFrame.info() writes to a stream and returns None, so wrapping it in
    # str() printed the report out of order followed by the text "None".
    # Capture the report in a buffer and embed it under its heading instead.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    info_text = info_buffer.getvalue().rstrip("\n")

    print("查看特征值和特征值类型\n" + str(df.dtypes) + "\n" + "-" * 100)
    # The heading announces the first 10 rows; head() alone only returns 5.
    print("查看前10行信息\n" + str(df.head(10)) + "\n" + "-" * 100)
    print("查看每个特征值的各种数据统计信息\n" + str(df.describe()) + "\n" + "-" * 100)
    print("输出重复行的个数\n" + str(df.duplicated().sum()) + "\n" + "-" * 100)
    print("查看每列的缺失值个数\n" + str(df.isnull().sum()) + "\n" + "-" * 100)
    print("查看缺失值的具体信息\n" + info_text + "\n" + "-" * 100)
    # To list the distinct values of a column X with their frequencies:
    # print("输出X所有值出现的是什么,还有对应出现的次数\n" + str(df['X'].value_counts()) + "\n" + "-" * 100)


def show_relation(data, colx, coly):
    """Plot ``coly`` against ``colx`` and display the figure.

    A box plot is drawn when ``colx`` has dtype ``object`` or ``category`` or
    has fewer than 20 distinct values; otherwise a scatter plot is drawn.

    Args:
        data: DataFrame holding both columns.
        colx: name of the column placed on the x axis.
        coly: name of the column placed on the y axis.
    """
    x_values = data[colx]
    treat_as_discrete = (
        x_values.dtype == 'object'
        or x_values.dtype == 'category'
        or len(x_values.unique()) < 20
    )

    if not treat_as_discrete:
        plt.scatter(x_values, data[coly])
    else:
        seaborn.boxplot(x=colx, y=coly, data=data)

    plt.xlabel(colx)
    plt.ylabel(coly)
    plt.show()


def mapk(actual, predicted, k=3):
    """Return the mean average precision at ``k`` for single-label targets.

    Each entry of ``actual`` is one true label and the matching entry of
    ``predicted`` is a ranked sequence of guesses. A row scores ``1 / rank``
    for the first guess equal to the true label within the first ``k``
    positions, and 0.0 when there is none; the row scores are averaged.

    Args:
        actual: iterable of true labels.
        predicted: iterable of ranked prediction sequences, one per label.
        k: number of leading predictions considered per row.

    Returns:
        The mean of the per-row scores, as computed by ``np.mean``.
    """
    def row_score(truth, ranked, limit):
        # Only the first `limit` guesses (or fewer, for short rows) count.
        cutoff = min(limit, len(ranked))
        reciprocal_ranks = (
            1.0 / (position + 1)
            for position in range(cutoff)
            if ranked[position] == truth
        )
        # The generator is lazy, so the scan stops at the first hit.
        return next(reciprocal_ranks, 0.0)

    per_row = [row_score(truth, ranked, k) for truth, ranked in zip(actual, predicted)]
    return np.mean(per_row)


if __name__ == '__main__':
    # End-to-end script: load the data, engineer features, train an XGBoost
    # classifier with 10-fold stratified cross-validation, average the test
    # probabilities over the folds and write the top-3 labels per row.
    init()

    df_train = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
    df_test = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')
    df_train_additional = pd.read_csv('/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv')

    # The result of this concat used to be discarded, so the additional rows
    # never reached the model. Assign it back to df_train.
    # NOTE(review): assumes the additional file has the same column names as
    # the competition data once surrounding whitespace is stripped - confirm.
    df_train_additional = df_train_additional.rename(columns=str.strip)
    required_cols = [col for col in df_train.columns if col != 'id']
    missing_cols = [col for col in required_cols if col not in df_train_additional.columns]
    if missing_cols:
        # Fail loudly rather than silently training on NaN-filled columns.
        raise KeyError(f"Additional training data is missing columns: {missing_cols}")
    df_train = pd.concat([df_train, df_train_additional[required_cols]], ignore_index=True)
    n_train = df_train.shape[0]

    print("Start Feature enggering" + "-" * 70 + "\n")
    # Stack train and test so every engineered feature and category set is
    # built once over both. ignore_index gives the stacked frame a unique
    # index, which keeps the column-wise concat further down unambiguous.
    df_all = pd.concat(
        [df_train.drop(['id', 'Fertilizer Name'], axis=1), df_test.drop(['id'], axis=1)],
        axis=0,
        ignore_index=True,
    )

    # Interaction and ratio features; zero denominators are replaced or
    # offset by a small constant to avoid division by zero.
    df_all['Temp_Humidity_Interaction'] = df_all['Temparature'] * df_all['Humidity']
    df_all['N_P_Ratio'] = df_all['Nitrogen'] / (df_all['Phosphorous'].replace(0, 1e-6))
    df_all['K_P_Ratio'] = df_all['Potassium'] / (df_all['Phosphorous'].replace(0, 1e-6))
    df_all['Soil_Crop_Combination'] = df_all['Soil Type'].astype(str) + '_' + df_all['Crop Type'].astype(str)

    df_all['P_to_K'] = df_all['Phosphorous'] / (df_all['Potassium'] + 1e-5)
    df_all['Total_NPK'] = df_all['Nitrogen'] + df_all['Phosphorous'] + df_all['Potassium']
    df_all['Climate_Index'] = (df_all['Temparature'] + df_all['Humidity']) / 2
    df_all['Water_Stress'] = df_all['Humidity'] - df_all['Moisture']

    # Keep a string copy of every raw numeric column; these become
    # categorical features below, alongside the numeric versions.
    original_numerical_cols = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
    for col in original_numerical_cols:
        df_all[f'{col}_Binned'] = df_all[col].astype(str)

    # Replace the raw numeric columns with their degree-2 polynomial
    # expansion (the expansion itself contains the degree-1 terms).
    poly_features_to_transform = original_numerical_cols
    poly = PolynomialFeatures(degree=2, include_bias=False)
    df_all_transformers = poly.fit_transform(df_all[poly_features_to_transform])

    poly_feature_names = poly.get_feature_names_out(poly_features_to_transform)
    df_all = df_all.drop(columns=poly_features_to_transform)
    df_all = pd.concat(
        [df_all, pd.DataFrame(df_all_transformers, columns=poly_feature_names, index=df_all.index)],
        axis=1,
    )

    # Derive the feature groups from the final dtypes (the earlier hand-written
    # lists were overwritten here and have been removed as dead code).
    numerical_features = df_all.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = df_all.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

    all_features_ordered = numerical_features + categorical_features
    df_all = df_all[all_features_ordered]

    # Convert each non-numeric column to a categorical whose category set is
    # taken over train and test together, in order of first appearance.
    for col in categorical_features:
        df_all[col] = pd.Categorical(df_all[col], categories=df_all[col].astype(str).unique())

    # The target is encoded to integer class ids for training and decoded
    # back to fertilizer names when the submission is built.
    le = LabelEncoder()
    X_train = df_all.iloc[:n_train]
    Y_train = le.fit_transform(df_train['Fertilizer Name'])
    X_test = df_all.iloc[n_train:]

    print("Training model" + "-" * 70 + "\n")
    # The per-parameter notes are translated from the original author's
    # comments and describe tuning relative to an earlier configuration.
    model_xgb = XGBClassifier(
        max_depth=8,  # lower tree depth
        colsample_bytree=0.5,  # feature sampling ratio
        subsample=0.7,  # row sampling ratio
        n_estimators=3000,  # fewer boosting rounds
        learning_rate=0.03,  # lower learning rate
        gamma=0.5,  # make splits harder
        max_delta_step=2,  # cap the weight update step
        reg_alpha=5,  # stronger L1 regularisation
        reg_lambda=3,  # stronger L2 regularisation
        early_stopping_rounds=100,  # stop training earlier
        objective='multi:softprob',
        random_state=13,
        enable_categorical=True,
        tree_method='hist',
        device='cuda'
    )

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    pred_xgb = np.zeros((X_test.shape[0], len(le.classes_)))
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, Y_train)):
        print(f"\nFold {fold + 1}/{kfold.n_splits}")

        x_fold_train, x_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = Y_train[train_idx], Y_train[val_idx]

        model_xgb.fit(
            x_fold_train, y_fold_train,
            eval_set=[(x_fold_val, y_fold_val)],
            verbose=100,
        )

        # Report MAP@3 on the held-out fold with the mapk helper defined
        # above, which was previously never called.
        val_proba = model_xgb.predict_proba(x_fold_val)
        val_top3 = np.argsort(val_proba, axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(y_fold_val, val_top3, k=3)
        fold_scores.append(fold_score)
        print(f"Fold {fold + 1} MAP@3: {fold_score:.5f}")

        # Average the test-set class probabilities over all folds.
        pred_xgb += model_xgb.predict_proba(X_test) / kfold.n_splits

    print(f"\nMean CV MAP@3: {np.mean(fold_scores):.5f}")

    # Take the three highest-probability classes per row, best first, and
    # decode them back to fertilizer names.
    pred_top3_xgb = np.argsort(pred_xgb, axis=1)[:, -3:][:, ::-1]
    top3_label = [[le.classes_[i] for i in row] for row in pred_top3_xgb]

    submission = pd.DataFrame({
        'id': df_test['id'],
        'Fertilizer Name': [' '.join(preds) for preds in top3_label],
    })
    submission.to_csv('/kaggle/working/submission.csv', index=False)
# xgb 0.35642 - score noted by the original author (presumably the leaderboard
# MAP@3; recorded before the additional training data was actually merged in).
相关推荐
测试员周周4 小时前
【Appium 系列】第16节-WebView-H5上下文切换 — 混合应用的自动化难点
运维·开发语言·人工智能·功能测试·appium·自动化·测试用例
K姐研究社6 小时前
怎么用AI制作电商口播视频,开拍APP一键生成
人工智能·音视频
LaughingZhu6 小时前
Product Hunt 每日热榜 | 2026-05-21
前端·人工智能·经验分享·chatgpt·html
传说故事7 小时前
【论文阅读】MotuBrain: An Advanced World Action Model for Robot Control
论文阅读·人工智能·具身智能·wam
北京耐用通信7 小时前
全域适配工业场景耐达讯自动化Modbus TCP 转 PROFIBUS 网关轻松实现以太网与现场总线互通
网络·人工智能·网络协议·自动化·信息与通信
火山引擎开发者社区7 小时前
TRAE × 火山引擎 Supabase:为你的 AI 应用装上“数据引擎”
人工智能
小a彤7 小时前
GE 在 CANN 五层架构中的位置
人工智能·深度学习·transformer
前端若水8 小时前
会话管理:创建、切换、删除对话历史
前端·人工智能·python·react.js
Upsy-Daisy8 小时前
AI Agent 项目学习笔记(八):Tool Calling 工具调用机制总览
人工智能·笔记·学习
企学宝8 小时前
企学宝5月专题课程丨《OpenClaw AI 智能体实战营:从零基础部署到全场景自动化落地》
人工智能·ai·企业培训