Kaggle-Predicting Optimal Fertilizers-(多分类+xgboost+同一特征值多样性)

Predicting Optimal Fertilizers

题意:

给出土壤的特性,预测出3种最佳的肥料

数据处理:

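1.特征分为数值型和类别型。类别型特征不能随意映射成整数(会引入并不存在的大小关系),可以做独热编码;CatBoost 以及开启 enable_categorical 的 XGBoost 可以直接处理 category 类型,因此下面的代码把类别列转换为 pandas 的 category 类型。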

2.构造一些相关土壤特性特征

3.标签(Fertilizer Name)是字符串类别,而 XGBoost 要求标签为整数编码,因此训练前先用 LabelEncoder 编码,预测出结果之后再解码回肥料名称。

建立模型:

1.catboost交叉验证、xgboost交叉验证

代码:
python 复制代码
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor, CatBoostClassifier
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from catboost import Pool, CatBoostClassifier


def init():
    """Apply process-wide settings used by the rest of the script.

    Silences Python warnings, sets the ``TF_CPP_MIN_LOG_LEVEL`` environment
    variable to ``'3'``, and raises the pandas display limits so that wide
    or long frames are printed without truncation.
    """
    warnings.simplefilter('ignore')  # suppress all Python warnings
    # Log-level hint read by TensorFlow's native backend, if it is ever loaded.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    display_limits = {
        'display.width': 1000,
        'display.max_colwidth': 1000,
        "display.max_rows": 1000,
        "display.max_columns": 1000,
    }
    for option_name, limit in display_limits.items():
        pd.set_option(option_name, limit)


def show_dataframe(df):
    """Print a quick exploratory summary of ``df`` to stdout.

    The sections are, in order: column dtypes, the first 10 rows,
    ``describe()`` statistics, the number of duplicated rows, the per-column
    null counts, and the ``DataFrame.info()`` report. Each section is
    followed by a 100-character dashed separator.

    Args:
        df: The pandas DataFrame to summarise.
    """
    import io  # local import so this function does not depend on file-level imports

    separator = "\n" + "-" * 100

    # DataFrame.info() writes to a stream and returns None, so capture its
    # text in a buffer; otherwise the report is printed before its heading
    # and the literal "None" is appended.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    info_text = info_buffer.getvalue().rstrip("\n")

    print("查看特征值和特征值类型\n" + str(df.dtypes) + separator)
    # The heading announces 10 rows, so request 10 (head() defaults to 5).
    print("查看前10行信息\n" + str(df.head(10)) + separator)
    print("查看每个特征值的各种数据统计信息\n" + str(df.describe()) + separator)
    print("输出重复行的个数\n" + str(df.duplicated().sum()) + separator)
    print("查看每列的缺失值个数\n" + str(df.isnull().sum()) + separator)
    print("查看缺失值的具体信息\n" + info_text + separator)
    # To inspect the value frequencies of a single column, print
    # df['<column>'].value_counts() in the same way.


def show_relation(data, colx, coly):
    """Plot ``coly`` against ``colx`` and display the figure.

    A box plot is drawn when the x column has dtype ``object`` or
    ``category``, or has fewer than 20 unique values; otherwise a scatter
    plot is drawn.

    Args:
        data: Frame holding both columns.
        colx: Name of the column placed on the x axis.
        coly: Name of the column placed on the y axis.
    """
    x_values = data[colx]
    looks_discrete = (
        x_values.dtype == 'object'
        or x_values.dtype == 'category'
        or len(x_values.unique()) < 20
    )

    if not looks_discrete:
        plt.scatter(x_values, data[coly])
    else:
        seaborn.boxplot(x=colx, y=coly, data=data)

    plt.xlabel(colx)
    plt.ylabel(coly)
    plt.show()


def _reciprocal_rank(truth, ranking, k):
    """Return 1/rank of the first hit of ``truth`` in the top ``k``, else 0.0."""
    depth = min(k, len(ranking))
    hit_rank = next(
        (pos + 1 for pos in range(depth) if ranking[pos] == truth),
        None,
    )
    if hit_rank is None:
        return 0.0
    return 1.0 / hit_rank


def mapk(actual, predicted, k=3):
    """Mean average precision at ``k`` for single-label targets.

    Args:
        actual: Iterable with one true label per sample.
        predicted: Iterable with one ranked sequence of predicted labels per
            sample, best guess first.
        k: Number of leading predictions taken into account.

    Returns:
        The mean over all samples of ``1 / rank`` of the true label within
        the first ``k`` predictions (``0`` when it is not among them).
    """
    per_sample = [
        _reciprocal_rank(truth, ranking, k)
        for truth, ranking in zip(actual, predicted)
    ]
    return np.mean(per_sample)


if __name__ == '__main__':
    # End-to-end pipeline: load data, engineer features, train XGBoost with
    # stratified 10-fold cross-validation, average the test probabilities
    # over the folds and write the three most probable labels per row.
    init()

    df_train = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
    df_test = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')
    df_train_additional = pd.read_csv('/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv')

    # pd.concat returns a new frame, so the result must be assigned;
    # otherwise the additional data is read but never used for training.
    # NOTE(review): assumes the additional file uses the same column names as
    # train.csv (apart from 'id'); fail loudly if it does not.
    train_columns = [col for col in df_train.columns if col != 'id']
    missing_columns = [col for col in train_columns if col not in df_train_additional.columns]
    if missing_columns:
        raise ValueError(f"Additional training data lacks columns: {missing_columns}")
    df_train = pd.concat([df_train, df_train_additional[train_columns]], ignore_index=True)

    print("Start Feature enggering" + "-" * 70 + "\n")
    # Stack train and test features so both receive identical transformations.
    # ignore_index avoids duplicate index labels in the combined frame.
    df_all = pd.concat(
        [df_train.drop(['id', 'Fertilizer Name'], axis=1), df_test.drop(['id'], axis=1)],
        axis=0,
        ignore_index=True,
    )

    # Hand-crafted interaction and ratio features.
    df_all['Temp_Humidity_Interaction'] = df_all['Temparature'] * df_all['Humidity']
    # Zero phosphorous is replaced by a tiny value to avoid division by zero.
    df_all['N_P_Ratio'] = df_all['Nitrogen'] / (df_all['Phosphorous'].replace(0, 1e-6))
    df_all['K_P_Ratio'] = df_all['Potassium'] / (df_all['Phosphorous'].replace(0, 1e-6))
    df_all['Soil_Crop_Combination'] = df_all['Soil Type'].astype(str) + '_' + df_all['Crop Type'].astype(str)

    df_all['P_to_K'] = df_all['Phosphorous'] / (df_all['Potassium'] + 1e-5)
    df_all['Total_NPK'] = df_all['Nitrogen'] + df_all['Phosphorous'] + df_all['Potassium']
    df_all['Climate_Index'] = (df_all['Temparature'] + df_all['Humidity']) / 2
    df_all['Water_Stress'] = df_all['Humidity'] - df_all['Moisture']

    # Each raw numeric column is also exposed as a string-valued copy, which
    # is later turned into a categorical feature.
    original_numerical_cols = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
    for col in original_numerical_cols:
        df_all[f'{col}_Binned'] = df_all[col].astype(str)

    # Degree-2 polynomial expansion of the raw numeric columns. The output
    # includes the degree-1 terms, so the originals are dropped before the
    # expanded columns are appended.
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_values = poly.fit_transform(df_all[original_numerical_cols])
    poly_feature_names = poly.get_feature_names_out(original_numerical_cols)
    df_all = df_all.drop(columns=original_numerical_cols)
    df_all = pd.concat(
        [df_all, pd.DataFrame(poly_values, columns=poly_feature_names, index=df_all.index)],
        axis=1,
    )

    # Derive the final feature lists from the dtypes actually present.
    numerical_features = df_all.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = df_all.select_dtypes(exclude=['int64', 'float64']).columns.tolist()
    df_all = df_all[numerical_features + categorical_features]

    # Convert to pandas categoricals whose category set is computed over
    # train and test together, so both splits share the same categories.
    for col in categorical_features:
        df_all[col] = pd.Categorical(df_all[col], categories=df_all[col].astype(str).unique())

    # Labels are strings; encode them as integers for training and decode the
    # predictions at the end.
    le = LabelEncoder()
    n_train = df_train.shape[0]
    X_train = df_all[:n_train]
    Y_train = le.fit_transform(df_train['Fertilizer Name'])
    X_test = df_all[n_train:]

    print("Training model" + "-" * 70 + "\n")
    model_xgb = XGBClassifier(
        max_depth=8,  # maximum tree depth
        colsample_bytree=0.5,  # fraction of features sampled per tree
        subsample=0.7,  # fraction of rows sampled per tree
        n_estimators=3000,  # upper bound on boosting rounds
        learning_rate=0.03,
        gamma=0.5,  # minimum loss reduction required to split
        max_delta_step=2,  # cap on each leaf-weight update
        reg_alpha=5,  # L1 regularisation strength
        reg_lambda=3,  # L2 regularisation strength
        early_stopping_rounds=100,  # stop once the eval metric stalls this long
        objective='multi:softprob',
        random_state=13,
        enable_categorical=True,  # consume pandas categorical columns directly
        tree_method='hist',
        device='cuda'
    )

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    pred_xgb = np.zeros((X_test.shape[0], len(le.classes_)))
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, Y_train)):
        print(f"\nFold {fold + 1}/{kfold.n_splits}")

        x_fold_train, x_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = Y_train[train_idx], Y_train[val_idx]

        model_xgb.fit(
            x_fold_train, y_fold_train,
            eval_set=[(x_fold_val, y_fold_val)],
            verbose=100,
        )

        # Report MAP@3 on the held-out fold. The same fold drives early
        # stopping, so treat this number as an optimistic estimate.
        val_proba = model_xgb.predict_proba(x_fold_val)
        val_top3 = np.argsort(val_proba, axis=1)[:, -3:][:, ::-1]
        fold_score = mapk(y_fold_val, val_top3, k=3)
        fold_scores.append(fold_score)
        print(f"Fold {fold + 1} validation MAP@3: {fold_score:.5f}")

        # Average the test-set probabilities over all folds.
        pred_xgb += model_xgb.predict_proba(X_test) / kfold.n_splits

    print(f"\nMean validation MAP@3: {np.mean(fold_scores):.5f}")

    # argsort is ascending: take the last three columns and reverse them so
    # the most probable class comes first, then map indices back to labels.
    pred_top3_xgb = np.argsort(pred_xgb, axis=1)[:, -3:][:, ::-1]
    top3_label = le.classes_[pred_top3_xgb]

    submission = pd.DataFrame({
        'id': df_test['id'],
        'Fertilizer Name': [' '.join(preds) for preds in top3_label],
    })
    submission.to_csv('/kaggle/working/submission.csv', index=False)
#xgb0.35642
相关推荐
冬奇Lab22 分钟前
Workflow 系列(04):Multi-Agent 协调——编排器边界、并发控制与上下文隔离
人工智能·工作流引擎
冬奇Lab32 分钟前
每日一个开源项目(第147篇):HyperGraphRAG - 用超图表示 N 元关系,RAG 的第三代范式
人工智能·开源·graphql
甲维斯1 小时前
Github + 阿里云oss实现类似codex的自动更新!
人工智能
阿里云大数据AI技术3 小时前
光轮智能 × 阿里云:共建 Physical AI 云上数据、评测与持续学习基础设施
人工智能·机器学习
机器之心3 小时前
实锤了:Claude Code偷查用户,时区、中国AI实验室全是关键词
人工智能·openai
网易云信3 小时前
Cursor点燃个人开发者,企业级AI为何频频受挫?Agent工厂从提效工具到AI员工的跃迁
人工智能·开源
网易云信3 小时前
解锁触手可及的温暖:网易智企 x Wander Puffs AI 云游泡芙
人工智能
转转技术团队3 小时前
从 PRD 到可验证代码:AI 需求开发闭环实践
人工智能