路过了学校花店
荒野到海边
有一种浪漫的爱
是浪费时间
徘徊到繁华世界
才发现你背影
平凡得特别
绕过了城外边界
还是没告别
爱错过了太久
反而错得完美无缺
幸福兜了一个圈
🎵 林宥嘉《兜圈》
Python
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Demo data: three uniform-random feature columns plus a random 0/1 target,
# 1000 rows each. The features are drawn first, in order, then the target.
data = {
    name: np.random.rand(1000)
    for name in ('feature1', 'feature2', 'feature3')
}
data['target'] = np.random.randint(0, 2, 1000)
df = pd.DataFrame(data)
# Automatically choose the best number of bins for one feature.
def find_best_bins(df, feature, target, max_bins=10):
    """Return the equal-width bin count in ``2..max_bins`` with the best CV AUC.

    For every candidate count the feature is cut with ``pd.cut`` into integer
    bin codes, a fresh ``LogisticRegression`` is scored on that single column
    with 5-fold cross-validated ROC AUC, and the count with the highest mean
    score is kept. Only a strictly better score replaces the current best, so
    ties resolve to the smaller bin count.

    Args:
        df: Frame holding both the feature and the target column. It is only
            read; no column is added to it.
        feature: Name of the column to bin (must be something ``pd.cut``
            accepts).
        target: Name of the label column used for scoring.
        max_bins: Largest bin count to try, inclusive.

    Returns:
        The winning bin count. Falls back to 2 when ``max_bins < 2`` or when
        no candidate yields a mean score greater than ``-inf``.
    """
    best_bins = 2
    best_score = -np.inf
    for bins in range(2, max_bins + 1):
        # Keep the bin codes in a standalone one-column frame instead of
        # writing a scratch 'bin' column into the caller's DataFrame.
        codes = pd.cut(df[feature], bins=bins, labels=False).to_frame(name='bin')
        model = LogisticRegression()
        # Cross-validated score of the binned feature on its own.
        scores = cross_val_score(model, codes, df[target], scoring='roc_auc', cv=5)
        mean_score = scores.mean()
        if mean_score > best_score:
            best_score = mean_score
            best_bins = bins
    return best_bins
# Compute Weight of Evidence (WoE) per bin and the total Information Value (IV).
def calculate_woe_iv(df, feature, target, bins):
    """Return the total Information Value of ``feature`` against ``target``.

    The feature is cut with ``pd.cut(df[feature], bins=bins)``. For each bin
    the row count and the sum of ``target`` are taken; the sum is treated as
    the number of positives (this assumes ``target`` is coded 0/1) and the
    remainder as negatives. Each bin's share of all positives and of all
    negatives is smoothed with a small epsilon, WoE is the log of their ratio,
    and IV is the sum over bins of ``(pos_share - neg_share) * WoE``.

    Args:
        df: Frame holding the feature and target columns. It is only read; no
            column is added to it.
        feature: Name of the column to bin.
        target: Name of the label column.
        bins: Forwarded unchanged to ``pd.cut`` as its ``bins`` argument.

    Returns:
        The summed IV across all bins.
    """
    epsilon = 1e-6  # smoothing so an empty class never divides by or logs zero

    # Bin into a standalone Series rather than a scratch 'bin' column on the
    # caller's DataFrame.
    binned_feature = pd.cut(df[feature], bins=bins)

    # Per-bin row count and positive count. observed=False is stated
    # explicitly so bins with no rows are kept regardless of the pandas
    # default; the epsilon smoothing below copes with their zero counts.
    binned = df[target].groupby(binned_feature, observed=False).agg(['count', 'sum'])
    binned.columns = ['total', 'positive']
    binned['negative'] = binned['total'] - binned['positive']

    # Share of all positives / negatives that falls in each bin (smoothed).
    binned['positive_ratio'] = (binned['positive'] + epsilon) / (binned['positive'].sum() + epsilon)
    binned['negative_ratio'] = (binned['negative'] + epsilon) / (binned['negative'].sum() + epsilon)

    # WoE per bin and that bin's IV contribution.
    binned['woe'] = np.log(binned['positive_ratio'] / binned['negative_ratio'])
    binned['iv'] = (binned['positive_ratio'] - binned['negative_ratio']) * binned['woe']

    # Total IV.
    return binned['iv'].sum()
# Bin every feature column of a DataFrame, choosing the best bin count for each.
def binning_dataframe(df, target, max_bins=10):
    """Bin each non-target column and report the chosen bin counts and IVs.

    For every column other than ``target`` this asks ``find_best_bins`` for a
    bin count, replaces the column in the output copy with the integer bin
    codes from ``pd.cut(..., labels=False)``, and records the Information
    Value returned by ``calculate_woe_iv`` for that bin count.

    Args:
        df: Input frame containing the feature columns and the target column.
            It is not modified.
        target: Name of the label column; it is copied through unchanged.
        max_bins: Upper bound on the bin count, passed to ``find_best_bins``.

    Returns:
        A tuple ``(binned_df, bin_info, iv_info)``: the binned copy of ``df``,
        a dict mapping feature name to chosen bin count, and a dict mapping
        feature name to IV.
    """
    binned_df = df.copy()
    # The helpers get their own copy so any scratch column they might add
    # never lands on the caller's frame (where a later call would mistake it
    # for a feature).
    scratch = df.copy()
    bin_info = {}
    iv_info = {}
    # Snapshot the column list up front so the loop sees only real columns.
    for feature in list(df.columns):
        if feature == target:
            continue
        best_bins = find_best_bins(scratch, feature, target, max_bins)
        bin_info[feature] = best_bins
        binned_df[feature] = pd.cut(df[feature], bins=best_bins, labels=False)
        # IV of the feature at the chosen bin count.
        iv_info[feature] = calculate_woe_iv(scratch, feature, target, best_bins)
    return binned_df, bin_info, iv_info
# Run the binning pipeline on the demo frame, then print the chosen bin
# counts, the per-feature IV values and the first rows of the binned frame.
binned_df, bin_info, iv_info = binning_dataframe(df, 'target', max_bins=10)
print("分箱信息:", bin_info, sep="\n")
print("\nIV 信息:", iv_info, sep="\n")
print("\n分箱后的 DataFrame:", binned_df.head(), sep="\n")