模型训练通过pandas一次分组计算多个标签的KS
- 模型训练通常使用一组特征 X 和一个标签 y,但评估模型时经常需要在多个不同的 y 标签上观察效果并进行综合评估,例如信贷领域中需要观察模型在不同 MOB(month on book,账龄)下的表现
- 下面使用 pandas 按分组一次性计算模型在多个 y 标签上的 KS
准备数据
-
生成模拟数据
import numpy as np
import pandas as pd
from sklearn import metrics

# Fix the global NumPy seed so the simulated data is reproducible.
np.random.seed(2025)


def gen_data(labels=('y1', 'y2', 'y3'), nums=1000):
    """Generate a simulated scoring data set with several binary labels.

    Args:
        labels: Names of the label columns to create. The default is an
            immutable tuple so it cannot be mutated between calls.
        nums: Number of rows to generate.

    Returns:
        A DataFrame with a ``month`` column (integers 1-5, used as the
        grouping key), a ``pred`` column (uniform random scores in [0, 1))
        and one column per label holding 0, 1 or NaN (missing label).
    """
    df = pd.DataFrame({
        # Simulated month used for grouping.
        'month': np.random.choice(range(1, 6), nums),
        'pred': np.random.rand(nums),
    })
    for y in labels:
        # NaN stands for a label that is not available for that row.
        df[y] = np.random.choice([0, 1, np.nan], nums)
    return df


df = gen_data(nums=10)
print(df)
# Output:
#
#    month      pred   y1   y2   y3
# 0      3  0.964238  1.0  1.0  1.0
# 1      5  0.800984  1.0  NaN  NaN
# 2      1  0.455205  NaN  0.0  0.0
# 3      4  0.801058  1.0  0.0  0.0
# 4      4  0.041718  NaN  1.0  NaN
# 5      5  0.769458  0.0  1.0  0.0
# 6      1  0.003171  0.0  1.0  1.0
# 7      1  0.292809  NaN  0.0  0.0
# 8      3  0.610914  NaN  0.0  NaN
# 9      2  0.913027  1.0  1.0  0.0
-
创建计算单列ks的函数,注意处理模型打分或者标签为空的情况
# Compute the KS of a single label.
def calc_one_ks(y_true, y_pred):
    """Compute the KS statistic of one label against the model score.

    Rows where either the label or the score is missing are ignored.

    Args:
        y_true: Label values (a Series or any 1-D sequence); may contain NaN.
        y_pred: Model scores, positionally aligned with ``y_true``; may
            contain NaN.

    Returns:
        ``max(|TPR - FPR|)`` over the ROC curve, or ``None`` when KS is
        undefined: no row has both a label and a score, or the remaining
        labels contain only one class.
    """
    # Drop both indexes so the two inputs are paired by position, then keep
    # only the rows where label and score are both present.
    pair = pd.concat(
        [
            pd.Series(y_true).reset_index(drop=True),
            pd.Series(y_pred).reset_index(drop=True),
        ],
        axis=1,
        keys=['y_true', 'y_pred'],
    ).dropna()
    # With a single class either TPR or FPR is undefined (roc_curve would
    # warn and return NaN), so report "no KS" the same way as for no data.
    if pair.empty or pair['y_true'].nunique() < 2:
        return None
    # roc_curve returns (fpr, tpr, thresholds), in that order.
    fpr, tpr, _ = metrics.roc_curve(pair['y_true'], pair['y_pred'])
    return abs(tpr - fpr).max()
-
使用 pandas 的 groupby 和 apply 分组计算多个标签的 KS 值,同时输出每组的样本数量以及主标签的正样本数和正样本率
def get_ks(s: pd.DataFrame, labels, y_main=None, score='pred'):
    """Summarise one group: row count, main-label positives and KS per label.

    Args:
        s: The rows of one group; must contain the score column and every
            label column.
        labels: Label column names to compute KS for.
        y_main: Label used for the positive count and rate; defaults to
            ``labels[0]``.
        score: Name of the model score column.

    Returns:
        A Series with ``cnt`` (all rows of the group, including rows whose
        main label is missing), ``pos_cnt`` (sum of the main label column),
        ``pos_pct`` (``pos_cnt / cnt`` rounded to 4 decimals, NaN for an
        empty group) and one ``<label>_ks`` entry per label.
    """
    cnt = len(s)
    y_main = y_main or labels[0]
    pos_cnt = s[y_main].sum()
    # Guard the empty group explicitly instead of relying on a 0/0 warning.
    pos_pct = round(pos_cnt / cnt, 4) if cnt else float('nan')
    res = dict(cnt=cnt, pos_cnt=pos_cnt, pos_pct=pos_pct)
    for y in labels:
        res[f'{y}_ks'] = calc_one_ks(s[y], s[score])
    return pd.Series(res)
完整代码
python
import numpy as np
import pandas as pd
from sklearn import metrics
# Fix the global NumPy seed so the simulated data is reproducible.
np.random.seed(2025)


def gen_data(labels=('y1', 'y2', 'y3'), nums=1000):
    """Generate a simulated scoring data set with several binary labels.

    Args:
        labels: Names of the label columns to create. The default is an
            immutable tuple so it cannot be mutated between calls.
        nums: Number of rows to generate.

    Returns:
        A DataFrame with a ``month`` column (integers 1-5, used as the
        grouping key), a ``pred`` column (uniform random scores in [0, 1))
        and one column per label holding 0, 1 or NaN (missing label).
    """
    df = pd.DataFrame({
        # Simulated month used for grouping.
        'month': np.random.choice(range(1, 6), nums),
        'pred': np.random.rand(nums),
    })
    for y in labels:
        # NaN stands for a label that is not available for that row.
        df[y] = np.random.choice([0, 1, np.nan], nums)
    return df
# Compute the KS of a single label.
def calc_one_ks(y_true, y_pred):
    """Compute the KS statistic of one label against the model score.

    Rows where either the label or the score is missing are ignored.

    Args:
        y_true: Label values (a Series or any 1-D sequence); may contain NaN.
        y_pred: Model scores, positionally aligned with ``y_true``; may
            contain NaN.

    Returns:
        ``max(|TPR - FPR|)`` over the ROC curve, or ``None`` when KS is
        undefined: no row has both a label and a score, or the remaining
        labels contain only one class.
    """
    # Drop both indexes so the two inputs are paired by position, then keep
    # only the rows where label and score are both present.
    pair = pd.concat(
        [
            pd.Series(y_true).reset_index(drop=True),
            pd.Series(y_pred).reset_index(drop=True),
        ],
        axis=1,
        keys=['y_true', 'y_pred'],
    ).dropna()
    # With a single class either TPR or FPR is undefined (roc_curve would
    # warn and return NaN), so report "no KS" the same way as for no data.
    if pair.empty or pair['y_true'].nunique() < 2:
        return None
    # roc_curve returns (fpr, tpr, thresholds), in that order.
    fpr, tpr, _ = metrics.roc_curve(pair['y_true'], pair['y_pred'])
    return abs(tpr - fpr).max()
def get_ks(s: pd.DataFrame, labels, y_main=None, score='pred'):
    """Summarise one group: row count, main-label positives and KS per label.

    Args:
        s: The rows of one group; must contain the score column and every
            label column.
        labels: Label column names to compute KS for.
        y_main: Label used for the positive count and rate; defaults to
            ``labels[0]``.
        score: Name of the model score column.

    Returns:
        A Series with ``cnt`` (all rows of the group, including rows whose
        main label is missing), ``pos_cnt`` (sum of the main label column),
        ``pos_pct`` (``pos_cnt / cnt`` rounded to 4 decimals, NaN for an
        empty group) and one ``<label>_ks`` entry per label.
    """
    cnt = len(s)
    y_main = y_main or labels[0]
    pos_cnt = s[y_main].sum()
    # Guard the empty group explicitly instead of relying on a 0/0 warning.
    pos_pct = round(pos_cnt / cnt, 4) if cnt else float('nan')
    res = dict(cnt=cnt, pos_cnt=pos_cnt, pos_pct=pos_pct)
    for y in labels:
        res[f'{y}_ks'] = calc_one_ks(s[y], s[score])
    return pd.Series(res)
if __name__ == '__main__':
    labels = ['y1', 'y2', 'y3']
    df = gen_data(labels)
    g_cols = ['month']
    # Select the columns get_ks reads (labels + score) explicitly: letting
    # groupby.apply operate on the grouping column itself is deprecated in
    # recent pandas versions, and get_ks never uses that column anyway.
    used_cols = labels + ['pred']
    df_res = df.groupby(g_cols)[used_cols].apply(get_ks, labels=labels)
    print(df_res)
    # Output:
    #          cnt  pos_cnt  pos_pct     y1_ks     y2_ks     y3_ks
    # month
    # 1      212.0     55.0   0.2594  0.206674  0.270109  0.101449
    # 2      183.0     66.0   0.3607  0.138675  0.088418  0.135717
    # 3      188.0     63.0   0.3351  0.139406  0.120635  0.125490
    # 4      196.0     63.0   0.3214  0.104762  0.192693  0.148323
    # 5      221.0     80.0   0.3620  0.120833  0.092691  0.073430
注意:直接使用 pandas 计算的方式仅适用于样本量较小的情况;如果样本量较大,可以改用 SQL 或 Spark 进行计算,后续会再更新相应的实现