竞赛实战--天池金融风控分类问题

背景

1、金融风控分类问题是机器学习竞赛中一个比较好的选题

2、如何进行数据处理

代码

数据分析部分

python 复制代码
#!/usr/bin/env python
# coding: utf-8

import os
import gc
import numpy as np
import pandas as pd
import warnings
import lightgbm as lgb
import catboost as cbt
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kstest

# Global notebook-style settings: silence every warning and let pandas
# print all columns of a frame instead of truncating them.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None
# plt.ion()
python 复制代码
# ## Load data
# Input CSVs are read from a `data` directory next to this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, 'data')

train_data_file = os.path.join(DATA_PATH, "train.csv")
test_data_file = os.path.join(DATA_PATH, "testA.csv")
train_data = pd.read_csv(train_data_file)
test_data = pd.read_csv(test_data_file)

# Separate the label column from the training features.
target = train_data['isDefault']
train_data = train_data.drop(columns=['isDefault'])

# Stack train on top of test so preprocessing is applied to both at once.
data = pd.concat([train_data, test_data])

# Group the training columns by kind:
#   objectList    - object-dtype columns
#   classList     - non-object columns with at most 10 distinct values (NaN counted)
#   numericalList - the remaining non-object columns
objectList = [col for col in train_data.columns if train_data[col].dtype == 'O']
non_object_cols = train_data.select_dtypes(exclude=['object']).columns
classList = [col for col in non_object_cols if train_data[col].nunique(dropna=False) <= 10]
numericalList = [col for col in non_object_cols if col not in classList]

对不同类型变量进行分类分组处理

python 复制代码
# ## Variable grouping and missing-value handling
# Count missing values per column over the combined train + test frame and
# keep only the columns that actually contain missing values.
info = pd.DataFrame(data.isnull().sum())
info = info[info[0] != 0]
miss_fea = info.index

miss_objectList = [i for i in miss_fea if i in objectList]
miss_classList = [i for i in miss_fea if i in classList]
miss_numericalList = [i for i in miss_fea if i in numericalList]

# Fill missing values
data['employmentLength'] = data['employmentLength'].fillna(0)
data['n11'] = data['n11'].fillna(0)
data['n12'] = data['n12'].fillna(0)
data['employmentTitle'] = data['employmentTitle'].fillna(data['employmentTitle'].mode()[0])
data['postCode'] = data['postCode'].fillna(data['postCode'].mode()[0])
# Fill `dti` with its own mean (the previous code used the mean of `postCode`).
data['dti'] = data['dti'].fillna(data['dti'].mean())
data['pubRecBankruptcies'] = data['pubRecBankruptcies'].fillna(data['pubRecBankruptcies'].mean())
data['revolUtil'] = data['revolUtil'].fillna(data['revolUtil'].mean())
data['title'] = data['title'].fillna(data['title'].mode()[0])

# Remaining numerical columns whose name starts with "n" are filled with their mode.
NoNameList = [i for i in miss_numericalList if i.startswith("n")]
for i in NoNameList:
    data[i] = data[i].fillna(data[i].mode()[0])

# ## Object-column processing
# Normalise the textual employment length, then keep only the leading integer.
# Assign the result back instead of calling replace(inplace=True) on a column
# selection: the chained in-place form is not guaranteed to modify `data`.
data['employmentLength'] = data['employmentLength'].replace(
    {'10+ years': '10 years', '< 1 year': '0 years', '0': '0 years'}
)
data['employmentLength'] = data['employmentLength'].apply(
    lambda s: int(str(s).split()[0]) if pd.notnull(s) else s
)

# Keep the last four characters of the credit-line string as an integer
# (presumably the year - confirm against the raw data format).
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data = data.drop(['issueDate'], axis=1)

le = LabelEncoder()
data['grade'] = le.fit_transform(data['grade'])
data['subGrade'] = le.fit_transform(data['subGrade'])

# Drop columns that are not used as features
dropList = ['id', 'ficoRangeHigh', 'applicationType', 'policyCode', 'n3', 'n11', 'n12', 'n13']
data = data.drop(dropList, axis=1)

# Split the combined frame back into train and test. `data` was built as
# train rows followed by test rows, so the first len(target) rows are the
# training set; deriving the boundary avoids a hard-coded row count.
n_train = len(target)
train_data = data.iloc[:n_train].copy()
# Re-attach the label positionally; .copy() above makes this a plain column
# assignment on an independent frame rather than on a slice of `data`.
train_data['isDefault'] = target.to_numpy()
test_data = data.iloc[n_train:].copy()
print("Divide data.")
python 复制代码
# # ## Outlier handling (disabled: everything in this section is commented out)
# percentile = pd.DataFrame()
# numList = [i for i in train_data.columns if i not in classList]

# # Normality check (Kolmogorov-Smirnov test against a fitted normal)
# for i in numList:
#     print(kstest(data[i], 'norm', (data[i].mean(), data[i].std())))

# # Outlier removal: standardise each column and keep rows within 3 std of the mean
# stdsc = StandardScaler()
# for i in numList:
#     new_i = "zheng_" + i
#     train_data[new_i] = stdsc.fit_transform(train_data[i].values.reshape(-1, 1))
#     data_std = np.std(train_data[new_i])
#     data_mean = np.mean(train_data[new_i])
#     outliers_cut_off = data_std * 3
#     lower_rule = data_mean - outliers_cut_off
#     upper_rule = data_mean + outliers_cut_off
#     train_data = train_data[(train_data[new_i] < upper_rule) & (train_data[new_i] > lower_rule)]
# train_data = train_data.iloc[:, :38]

保存数据:在数据体量较大的情况下,保存中间数据有助于后续处理。

python 复制代码
# Persist the processed train/test frames as CSV files under a `feature`
# directory next to this script.
FEATURE_PATH = os.path.join(BASE_DIR, 'feature')
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing.
os.makedirs(FEATURE_PATH, exist_ok=True)
feature_train_data = os.path.join(FEATURE_PATH, 'train_data.csv')
feature_test_data = os.path.join(FEATURE_PATH, 'test_data.csv')
# index=False: do not write the row index as an extra column.
train_data.to_csv(feature_train_data, index=False)
test_data.to_csv(feature_test_data, index=False)

模型搭建部分

python 复制代码
# Model training function
def train_model(x_train, y_train, test_data, params, n_splits=5):
    """Train LightGBM with stratified K-fold cross-validation.

    For every fold a booster is trained on the training part, early-stopped
    on the validation part, and then used to predict both the validation
    rows (out-of-fold predictions) and the full test set.

    Args:
        x_train: Training features; indexed with ``.iloc``, so a pandas
            DataFrame is expected.
        y_train: Training labels; indexed with ``.iloc``, so a pandas
            Series is expected.
        test_data: Test features passed to ``Booster.predict``.
        params: LightGBM parameter dict forwarded to ``lgb.train``.
        n_splits: Number of cross-validation folds.

    Returns:
        A tuple ``(oof, predictions)`` where ``oof`` is a 1-D array with one
        out-of-fold prediction per training row and ``predictions`` is an
        array of shape ``(len(test_data), n_splits)`` holding one column of
        test predictions per fold.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2019)
    oof = np.zeros(len(x_train))
    predictions = np.zeros((len(test_data), n_splits))

    for fold_, (train_idx, valid_idx) in enumerate(skf.split(x_train, y_train)):
        print(f"\nFold {fold_ + 1}")
        x_tr, x_val = x_train.iloc[train_idx], x_train.iloc[valid_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[valid_idx]

        train_set = lgb.Dataset(x_tr, label=y_tr)
        # Tie the validation set to the training set so both share the same
        # feature binning.
        val_set = lgb.Dataset(x_val, label=y_val, reference=train_set)

        # The `early_stopping_rounds` / `verbose_eval` keyword arguments were
        # removed from lgb.train() in LightGBM 4.x; the callbacks below are
        # the supported equivalent. Fresh callbacks are built for each fold.
        fold_callbacks = [
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=250),
        ]
        clf = lgb.train(
            params,
            train_set,
            num_boost_round=5000,
            valid_sets=[val_set],
            callbacks=fold_callbacks,
        )

        oof[valid_idx] = clf.predict(x_val, num_iteration=clf.best_iteration)
        predictions[:, fold_] = clf.predict(test_data, num_iteration=clf.best_iteration)

    print("\n\nCV AUC: {:<0.4f}".format(roc_auc_score(y_train, oof)))

    return oof, predictions

# Train the model and generate predictions
# NOTE(review): x_train_gbdt, y_train_gbdt, x_test_bgdt and default_params are
# not defined in the visible part of this file - confirm they are defined
# before this call, otherwise it raises NameError.
# NOTE(review): `x_test_bgdt` looks like a typo for `x_test_gbdt` - verify
# against wherever the variable is defined.
oof, predictions = train_model(x_train_gbdt, y_train_gbdt, x_test_bgdt, default_params)

参考资料

相关推荐
CV学术叫叫兽39 分钟前
一站式学习:害虫识别与分类图像分割
学习·分类·数据挖掘
罗小罗同学2 小时前
医工交叉入门书籍分享:Transformer模型在机器学习领域的应用|个人观点·24-11-22
深度学习·机器学习·transformer
孤独且没人爱的纸鹤2 小时前
【深度学习】:从人工神经网络的基础原理到循环神经网络的先进技术,跨越智能算法的关键发展阶段及其未来趋势,探索技术进步与应用挑战
人工智能·python·深度学习·机器学习·ai
羊小猪~~2 小时前
tensorflow案例7--数据增强与测试集, 训练集, 验证集的构建
人工智能·python·深度学习·机器学习·cnn·tensorflow·neo4j
Sxiaocai2 小时前
使用TensorFlow实现简化版 GoogLeNet 模型进行 MNIST 图像分类
分类·tensorflow·neo4j
zhangfeng11332 小时前
pytorch 的交叉熵函数,多分类,二分类
人工智能·pytorch·分类
YRr YRr3 小时前
如何使用 PyTorch 实现图像分类数据集的加载和处理
pytorch·深度学习·分类
不去幼儿园4 小时前
【MARL】深入理解多智能体近端策略优化(MAPPO)算法与调参
人工智能·python·算法·机器学习·强化学习
无脑敲代码,bug漫天飞5 小时前
COR 损失函数
人工智能·机器学习
HPC_fac130520678166 小时前
以科学计算为切入点:剖析英伟达服务器过热难题
服务器·人工智能·深度学习·机器学习·计算机视觉·数据挖掘·gpu算力