Competition in Practice: Tianchi Financial Risk Control Classification

Background

1. Financial risk control classification is a good problem to tackle as a machine learning competition.

2. How to carry out the data processing.

Code

Data Analysis

python
#!/usr/bin/env python
# coding: utf-8

import os
import gc
import numpy as np
import pandas as pd
import warnings
import lightgbm as lgb
import catboost as cbt
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kstest

warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
# plt.ion()
python
# ## Load data
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, 'data')

train_data_file = os.path.join(DATA_PATH, "train.csv")
train_data = pd.read_csv(train_data_file)

test_data_file = os.path.join(DATA_PATH, "testA.csv")
test_data = pd.read_csv(test_data_file)

target = train_data['isDefault']
train_data = train_data.drop(['isDefault'], axis=1)

data = pd.concat([train_data, test_data])


objectList = [i for i in train_data.columns if train_data[i].dtype == 'O']
classList = [i for i in train_data.select_dtypes(exclude=['object']).columns if len(train_data[i].unique()) <= 10]
numericalList = [i for i in train_data.select_dtypes(exclude=['object']).columns if i not in classList]
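To double-check what ended up in each group, a quick inspection can print the three lists (the exact contents depend on the columns in the Tianchi files):

python
# Quick sanity check of the column grouping (output depends on the dataset version)
print(f"object columns ({len(objectList)}):", objectList)
print(f"low-cardinality numeric columns ({len(classList)}):", classList)
print(f"continuous numeric columns: {len(numericalList)}")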

Group the variables by type, then handle each group's missing values accordingly.

python
# ## Variable classification and missing-value handling
info = pd.DataFrame(data.isnull().sum())
info = info[info[0] != 0]
miss_fea = info.index

miss_objectList = [i for i in miss_fea if i in objectList]
miss_classList = [i for i in miss_fea if i in classList]
miss_numericalList = [i for i in miss_fea if i in numericalList]

# Fill missing values
data['employmentLength'] = data['employmentLength'].fillna(0)
data['n11'] = data['n11'].fillna(0)
data['n12'] = data['n12'].fillna(0)
data['employmentTitle'] = data['employmentTitle'].fillna(data['employmentTitle'].mode()[0])
data['postCode'] = data['postCode'].fillna(data['postCode'].mode()[0])
data['dti'] = data['dti'].fillna(data['dti'].mean())
data['pubRecBankruptcies'] = data['pubRecBankruptcies'].fillna(data['pubRecBankruptcies'].mean())
data['revolUtil'] = data['revolUtil'].fillna(data['revolUtil'].mean())
data['title'] = data['title'].fillna(data['title'].mode()[0])

NoNameList = [i for i in miss_numericalList if i.startswith("n")]
for i in NoNameList:
    data[i] = data[i].fillna(data[i].mode()[0])

# ## Handle object-type columns
data['employmentLength'] = data['employmentLength'].replace({'10+ years': '10 years', '< 1 year': '0 years', '0': '0 years'})
data['employmentLength'] = data['employmentLength'].apply(lambda s: int(str(s).split()[0]) if pd.notnull(s) else s)

data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data = data.drop(['issueDate'], axis=1)

le = LabelEncoder()
data['grade'] = le.fit_transform(data['grade'])
data['subGrade'] = le.fit_transform(data['subGrade'])

# Drop columns that are not needed
dropList = ['id', 'ficoRangeHigh', 'applicationType', 'policyCode', 'n3', 'n11', 'n12', 'n13']
data.drop(dropList, axis=1, inplace=True)


train_data = data[:800000].copy()
# Re-attach the target to the training rows
train_data['isDefault'] = target
test_data = data[800000:].copy()
print("Divide data.")
python
# # ## Outlier handling
# percentile = pd.DataFrame()
# numList = [i for i in train_data.columns if i not in classList]

# # Normality test (Kolmogorov-Smirnov)
# for i in numList:
#     print(kstest(data[i], 'norm', (data[i].mean(), data[i].std())))

# # Remove outliers using the 3-sigma rule on standardized columns
# stdsc = StandardScaler()
# for i in numList:
#     new_i = "zheng_" + i
#     train_data[new_i] = stdsc.fit_transform(train_data[i].values.reshape(-1, 1))
#     data_std = np.std(train_data[new_i])
#     data_mean = np.mean(train_data[new_i])
#     outliers_cut_off = data_std * 3
#     lower_rule = data_mean - outliers_cut_off
#     upper_rule = data_mean + outliers_cut_off
#     train_data = train_data[(train_data[new_i] < upper_rule) & (train_data[new_i] > lower_rule)]
# train_data = train_data.iloc[:, :38]
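For reference, kstest returns a test statistic and a p-value; a p-value below 0.05 rejects the normality assumption, which is why the 3-sigma cut above is only a rough heuristic. A minimal single-column example (loanAmnt is assumed to be one of the continuous columns):

python
# Interpret the KS normality test for one continuous column (column name is an assumption)
from scipy.stats import kstest

col = 'loanAmnt'
stat, p_value = kstest(data[col], 'norm', args=(data[col].mean(), data[col].std()))
print(f"{col}: statistic={stat:.4f}, p-value={p_value:.4g}")
print("normality rejected" if p_value < 0.05 else "normality not rejected")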

Save the data. In some cases the data volume is large, so persisting these intermediate results helps with the subsequent steps.

python
FEATURE_PATH = os.path.join(BASE_DIR, 'feature')
feature_train_data = os.path.join(FEATURE_PATH, 'train_data.csv')
feature_test_data = os.path.join(FEATURE_PATH, 'test_data.csv')
train_data.to_csv(feature_train_data, index=False)
test_data.to_csv(feature_test_data, index=False)
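These files can then be read back at the start of the modeling stage, for example:

python
# Reload the saved feature files (same paths as above)
train_data = pd.read_csv(feature_train_data)
test_data = pd.read_csv(feature_test_data)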

Model Building

python
# Define the model-training function
def train_model(x_train, y_train, test_data, params, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
    oof = np.zeros(len(x_train))
    predictions = np.zeros((len(test_data), n_splits))

    for fold_, (train_idx, valid_idx) in enumerate(skf.split(x_train, y_train)):
        print(f"\nFold {fold_ + 1}")
        x_tr, x_val = x_train.iloc[train_idx], x_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        train_set = lgb.Dataset(x_tr, label=y_tr)
        val_set = lgb.Dataset(x_val, label=y_val)

        # NOTE: verbose_eval / early_stopping_rounds are lgb.train arguments in LightGBM 3.x;
        # LightGBM >= 4.0 expects the lgb.log_evaluation / lgb.early_stopping callbacks instead.
        clf = lgb.train(params, train_set, num_boost_round=5000, valid_sets=[val_set],
                        verbose_eval=250, early_stopping_rounds=50)

        oof[valid_idx] = clf.predict(x_val, num_iteration=clf.best_iteration)
        predictions[:, fold_] = clf.predict(test_data, num_iteration=clf.best_iteration)
    
    print("\n\nCV AUC: {:<0.4f}".format(roc_auc_score(y_train, oof)))

    return oof, predictions

# Train the model and generate predictions (inputs sketched below)
oof, predictions = train_model(x_train_gbdt, y_train_gbdt, x_test_gbdt, default_params)
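The call above assumes that x_train_gbdt, y_train_gbdt, x_test_gbdt, and default_params already exist, but the original code does not show how they are built. A minimal sketch, assuming they come straight from the feature data prepared earlier and using illustrative, untuned LightGBM parameters:

python
# Hypothetical preparation of the inputs used by train_model (an assumption, not from the original post)
x_train_gbdt = train_data.drop(['isDefault'], axis=1)
y_train_gbdt = train_data['isDefault']
x_test_gbdt = test_data

# Illustrative LightGBM parameters for a binary AUC objective (values are assumptions, not tuned)
default_params = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 63,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 2019,
    'verbose': -1,
}

The per-fold test predictions can then be averaged, e.g. predictions.mean(axis=1), to produce the final submission scores.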
