Kaggle-Predicting Optimal Fertilizers-(多分类+xgboost+同一特征值多样性)

Predicting Optimal Fertilizers

题意:

给出土壤的特性,预测出3种最佳的肥料

数据处理:

1.特征分为数值型和类别型两种。类别型特征不能随意映射成数字,可以使用独热编码;CatBoost 可以直接处理 category 类型,XGBoost 在设置 enable_categorical=True 后也可以直接处理(下面的代码采用这种方式)。

2.构造一些与土壤特性相关的特征(比值特征、交互特征、多项式特征,以及把数值特征转成类别型的副本)。

3.由于 label 是字符串形式的类别,而 XGBoost 的分类目标需要整数编码,因此需要先用 LabelEncoder 对 label 编码,预测出结果之后再解码回肥料名称。

建立模型:

1.catboost交叉验证、xgboost交叉验证(下面的代码只展示了 xgboost 的 10 折分层交叉验证部分)

代码:
python 复制代码
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from catboost import CatBoostRegressor, CatBoostClassifier
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
import lightgbm
from mlxtend.regressor import StackingCVRegressor
from sklearn import clone
from sklearn.ensemble import VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, make_scorer, mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from xgboost import XGBRegressor, XGBClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from catboost import Pool, CatBoostClassifier


def init():
    """Apply process-wide logging and pandas display settings.

    Sets the ``TF_CPP_MIN_LOG_LEVEL`` environment variable to ``'3'``
    (the original author's note: only error logs are emitted), installs an
    "ignore" filter for Python warnings, and raises four pandas display
    limits to 1000 so wide frames are printed without truncation.
    """
    os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})
    warnings.simplefilter('ignore')

    display_options = (
        'display.width',
        'display.max_colwidth',
        'display.max_rows',
        'display.max_columns',
    )
    for option_name in display_options:
        pd.set_option(option_name, 1000)


def show_dataframe(df):
    """Print a quick exploratory summary of a DataFrame to stdout.

    Six sections are printed, each followed by a line of 100 dashes:
    column dtypes, the first 10 rows, ``describe()`` statistics, the number
    of duplicated rows, per-column null counts, and the ``info()`` report.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. All output goes to stdout.
    """
    import io  # local import so this block does not depend on file-level imports

    separator = "-" * 100

    # DataFrame.info() writes its report to a stream and returns None, so
    # str(df.info()) would embed the text "None". Capture the report instead.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    info_text = info_buffer.getvalue().rstrip("\n")

    print("查看特征值和特征值类型\n" + str(df.dtypes) + "\n" + separator)
    # The heading announces 10 rows, so request 10 explicitly (head() defaults to 5).
    print("查看前10行信息\n" + str(df.head(10)) + "\n" + separator)
    print("查看每个特征值的各种数据统计信息\n" + str(df.describe()) + "\n" + separator)
    print("输出重复行的个数\n" + str(df.duplicated().sum()) + "\n" + separator)
    print("查看每列的缺失值个数\n" + str(df.isnull().sum()) + "\n" + separator)
    print("查看缺失值的具体信息\n" + info_text + "\n" + separator)
    # To inspect the distinct values of one column and how often each occurs,
    # print df['X'].value_counts() for the column of interest.


def show_relation(data, colx, coly):
    """Plot ``coly`` against ``colx`` and display the figure.

    A box plot is drawn when ``data[colx]`` has dtype ``object`` or
    ``category``, or has fewer than 20 distinct values; otherwise a scatter
    plot is drawn. Both axes are labelled with the column names.

    Args:
        data: DataFrame holding both columns.
        colx: Name of the column shown on the x axis.
        coly: Name of the column shown on the y axis.
    """
    x_values = data[colx]
    treat_as_discrete = (
        x_values.dtype == 'object'
        or x_values.dtype == 'category'
        or len(x_values.unique()) < 20
    )

    if not treat_as_discrete:
        plt.scatter(x_values, data[coly])
    else:
        seaborn.boxplot(x=colx, y=coly, data=data)

    plt.xlabel(colx)
    plt.ylabel(coly)
    plt.show()


def mapk(actual, predicted, k=3):
    """Average the reciprocal rank of the true label within the top ``k``.

    For each pair, the first ``min(k, len(p))`` entries of the prediction
    are scanned in order; the first entry equal to the true label at
    zero-based position ``i`` scores ``1 / (i + 1)``, and a miss scores 0.
    With exactly one true label per row this is the MAP@k value.

    Args:
        actual: Iterable of true labels, one per row.
        predicted: Iterable of indexable ranked predictions, one per row.
        k: Number of leading predictions considered per row.

    Returns:
        The mean of the per-row scores, as computed by ``np.mean``.
    """
    def apk(a, p, k):
        cutoff = min(k, len(p))
        # Lazy generator: evaluation stops at the first matching position.
        reciprocal_ranks = (
            1.0 / (position + 1)
            for position in range(cutoff)
            if p[position] == a
        )
        return next(reciprocal_ranks, 0.0)

    row_scores = [apk(truth, ranked, k) for truth, ranked in zip(actual, predicted)]
    return np.mean(row_scores)


if __name__ == '__main__':
    # End-to-end script: load data, engineer features, train XGBoost with
    # 10-fold stratified CV, and write a top-3 submission file.
    init()

    df_train = pd.read_csv('/kaggle/input/playground-series-s5e6/train.csv')
    df_test = pd.read_csv('/kaggle/input/playground-series-s5e6/test.csv')
    df_train_additional = pd.read_csv('/kaggle/input/fertilizer-prediction/Fertilizer Prediction.csv')

    # pd.concat returns a new frame; the result must be assigned, otherwise
    # the additional training data is silently ignored.
    # NOTE(review): assumes the additional file uses the same column names as
    # train.csv apart from 'id' — if not, the merge is skipped with a warning.
    shared_cols = [c for c in df_train.columns if c != 'id']
    absent_cols = [c for c in shared_cols if c not in df_train_additional.columns]
    if absent_cols:
        print(f"Warning: additional training data lacks columns {absent_cols}; it will not be merged.")
    else:
        df_train = pd.concat([df_train, df_train_additional[shared_cols]], ignore_index=True)

    print("Start Feature enggering" + "-" * 70 + "\n")
    n_train = df_train.shape[0]
    # Stack train and test features so every transformation sees both.
    # ignore_index gives a unique index, which keeps the later column-wise
    # concat from having to align duplicate labels.
    df_all = pd.concat(
        [df_train.drop(['id', 'Fertilizer Name'], axis=1), df_test.drop(['id'], axis=1)],
        axis=0,
        ignore_index=True,
    )

    # Interaction and ratio features; zero denominators are replaced or
    # offset by a small constant to avoid division by zero.
    df_all['Temp_Humidity_Interaction'] = df_all['Temparature'] * df_all['Humidity']
    df_all['N_P_Ratio'] = df_all['Nitrogen'] / (df_all['Phosphorous'].replace(0, 1e-6))
    df_all['K_P_Ratio'] = df_all['Potassium'] / (df_all['Phosphorous'].replace(0, 1e-6))
    df_all['Soil_Crop_Combination'] = df_all['Soil Type'].astype(str) + '_' + df_all['Crop Type'].astype(str)

    df_all['P_to_K'] = df_all['Phosphorous'] / (df_all['Potassium'] + 1e-5)
    df_all['Total_NPK'] = df_all['Nitrogen'] + df_all['Phosphorous'] + df_all['Potassium']
    df_all['Climate_Index'] = (df_all['Temparature'] + df_all['Humidity']) / 2
    df_all['Water_Stress'] = df_all['Humidity'] - df_all['Moisture']

    # Keep a string copy of each raw numeric column so it can also be used
    # as a categorical feature.
    original_numerical_cols = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
    for col in original_numerical_cols:
        df_all[f'{col}_Binned'] = df_all[col].astype(str)

    # Degree-2 polynomial expansion of the raw numeric columns. The output
    # already contains the degree-1 terms under their original names, so the
    # originals are dropped before the expanded columns are appended.
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_values = poly.fit_transform(df_all[original_numerical_cols])
    poly_feature_names = poly.get_feature_names_out(original_numerical_cols)
    df_all = df_all.drop(columns=original_numerical_cols)
    df_all = pd.concat(
        [df_all, pd.DataFrame(poly_values, columns=poly_feature_names, index=df_all.index)],
        axis=1,
    )

    # Order columns as numeric first, then everything else (categorical).
    numerical_features = df_all.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = df_all.select_dtypes(exclude=['int64', 'float64']).columns.tolist()
    df_all = df_all[numerical_features + categorical_features]

    # Convert to pandas Categorical with categories taken from train and
    # test together, so both splits share one category set.
    for col in categorical_features:
        df_all[col] = pd.Categorical(df_all[col], categories=df_all[col].astype(str).unique())

    # Encode the string target as integers; decoded again for the submission.
    le = LabelEncoder()
    X_train = df_all[:n_train]
    Y_train = le.fit_transform(df_train['Fertilizer Name'])
    X_test = df_all[n_train:]

    print("Training model" + "-" * 70 + "\n")
    model_xgb = XGBClassifier(
        max_depth=8,  # maximum tree depth
        colsample_bytree=0.5,  # fraction of features sampled per tree
        subsample=0.7,  # fraction of rows sampled per tree
        n_estimators=3000,  # upper bound on boosting rounds; early stopping may end sooner
        learning_rate=0.03,  # shrinkage applied to each boosting step
        gamma=0.5,  # minimum loss reduction required to split
        max_delta_step=2,  # cap on each tree's weight update step
        reg_alpha=5,  # L1 regularisation strength
        reg_lambda=3,  # L2 regularisation strength
        early_stopping_rounds=100,  # stop after 100 rounds without eval-set improvement
        objective='multi:softprob',
        random_state=13,
        enable_categorical=True,
        tree_method='hist',
        device='cuda'
    )

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # Test-set class probabilities, averaged over the folds.
    pred_xgb = np.zeros((X_test.shape[0], len(le.classes_)))

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train, Y_train)):
        print(f"\nFold {fold + 1}/{kfold.n_splits}")

        x_fold_train, x_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = Y_train[train_idx], Y_train[val_idx]

        # The held-out fold is the evaluation set that drives early stopping.
        model_xgb.fit(
            x_fold_train, y_fold_train,
            eval_set=[(x_fold_val, y_fold_val)],
            verbose=100,
        )

        pred_xgb += model_xgb.predict_proba(X_test) / kfold.n_splits

    # Indices of the three most probable classes per row, best first.
    pred_top3_xgb = np.argsort(pred_xgb, axis=1)[:, -3:][:, ::-1]
    top3_label = [[le.classes_[i] for i in row] for row in pred_top3_xgb]

    submission = pd.DataFrame({
        'id': df_test['id'],
        'Fertilizer Name': [' '.join(preds) for preds in top3_label],
    })
    submission.to_csv('/kaggle/working/submission.csv', index=False)
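# XGBoost result reported by the author: 0.35642 (the metric is not stated here; presumably the competition score)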
相关推荐
和光同尘@8 分钟前
66. 加一 (编程基础0到1)(Leetcode)
数据结构·人工智能·算法·leetcode·职场和发展
飞哥数智坊20 分钟前
放弃 Cursor 后,我又试了 CodeBuddy,感觉国产又行了
人工智能·codebuddy
新智元40 分钟前
世界首富换人!81 岁硅谷狂人 4000 亿身价碾压马斯克,33 岁华裔才女逆袭
人工智能·openai
lingling00943 分钟前
分子生物学ELN系统:如何通过衍因科技实现实验室效率革命
人工智能
机器之心1 小时前
交互扩展时代来临:创智复旦字节重磅发布AgentGym-RL,昇腾加持,开创智能体训练新范式
人工智能·openai
max5006001 小时前
实时多模态电力交易决策系统:设计与实现
图像处理·人工智能·深度学习·算法·音视频
男孩李1 小时前
浅谈代理流程自动化 (APA)
运维·人工智能·自动化
君名余曰正则1 小时前
机器学习06——支持向量机(SVM核心思想与求解、核函数、软间隔与正则化、支持向量回归、核方法)
人工智能·机器学习·支持向量机
sjr20012 小时前
从huggingface下载模型时有哪些文件?
人工智能·机器学习
moz与京2 小时前
【面试向】热门技术话题(上)
人工智能·物联网·机器学习·面试·web3·区块链·元宇宙