import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from scipy.stats import chi2_contingency, ks_2samp
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
import lightgbm as lgb
from lifelines import CoxPHFitter
from lifelines.statistics import logrank_test
# 1. WOE / IV binning helper
def woe_iv_calc(df, feat_name, target="churn", bins=10):
    """Compute per-bin Weight of Evidence and the total Information Value.

    The feature is split into at most ``bins`` equal-frequency buckets with
    ``pd.qcut``; duplicate quantile edges are dropped, so a heavily tied
    feature can end up with fewer buckets than requested.

    Args:
        df: DataFrame holding at least the ``feat_name`` and ``target`` columns.
        feat_name: Name of the numeric feature column to bin.
        target: Name of the binary label column (1 = bad, 0 = good).
        bins: Requested number of quantile buckets.

    Returns:
        A tuple ``(woe_map, total_iv, bin_stats)``: ``woe_map`` maps each bin
        interval to its WOE, ``total_iv`` is the IV rounded to 4 decimals, and
        ``bin_stats`` is the per-bin detail table.

    Raises:
        ValueError: If, once rows with a missing feature value are excluded,
            the target does not contain both classes (WOE is undefined then).
    """
    eps = 1e-8  # stand-in for an empty class share so the log stays finite

    data = df[[feat_name, target]].copy()
    data["bin"] = pd.qcut(data[feat_name], q=bins, duplicates="drop")
    # Rows with a missing feature land in no bin. Exclude them from the class
    # totals as well, otherwise the per-bin shares would not sum to one.
    data = data[data["bin"].notna()]

    total_good = (data[target] == 0).sum()
    total_bad = (data[target] == 1).sum()
    if total_good == 0 or total_bad == 0:
        raise ValueError(
            f"target column {target!r} must contain both classes (0 and 1) "
            "among rows with a non-missing feature value"
        )

    # observed=False is spelled out so the result does not depend on the
    # pandas version's default for categorical group keys.
    bin_stats = (
        data.groupby("bin", observed=False)
        .agg(bad_cnt=(target, "sum"), all_cnt=(target, "count"))
        .reset_index()
    )
    bin_stats["good_cnt"] = bin_stats["all_cnt"] - bin_stats["bad_cnt"]
    bin_stats["bad_pct"] = (bin_stats["bad_cnt"] / total_bad).replace(0, eps)
    bin_stats["good_pct"] = (bin_stats["good_cnt"] / total_good).replace(0, eps)
    bin_stats["woe"] = np.log(bin_stats["bad_pct"] / bin_stats["good_pct"])
    bin_stats["iv"] = (bin_stats["bad_pct"] - bin_stats["good_pct"]) * bin_stats["woe"]

    total_iv = float(bin_stats["iv"].sum())
    woe_map = dict(zip(bin_stats["bin"], bin_stats["woe"]))
    return woe_map, round(total_iv, 4), bin_stats
# 2. PSI (Population Stability Index) helper
def calculate_psi(base_arr, target_arr, bins=10):
    """Compute the Population Stability Index between two numeric samples.

    Bin edges are the quantiles of ``base_arr``. The outermost edges are
    widened to -inf/+inf so that target values outside the baseline range are
    counted in the first/last bin instead of being dropped.

    Args:
        base_arr: Baseline sample (list, ndarray or Series of numbers).
        target_arr: Sample to compare against the baseline.
        bins: Requested number of quantile bins; duplicate edges are dropped.

    Returns:
        A tuple ``(total_psi, psi_detail)``: the PSI rounded to 4 decimals and
        a DataFrame indexed by bin interval with the columns ``base_pct``,
        ``target_pct`` and ``psi_bin``.

    Raises:
        ValueError: If either sample has no non-missing values, or the
            baseline yields fewer than two distinct bin edges.
    """
    eps = 1e-8  # floor for empty bins so the log ratio stays finite

    base = pd.Series(np.asarray(base_arr, dtype=float)).dropna()
    target = pd.Series(np.asarray(target_arr, dtype=float)).dropna()
    if base.empty or target.empty:
        raise ValueError(
            "base_arr and target_arr must each contain at least one non-missing value"
        )

    # retbins returns the numeric edges directly, so this works for any
    # array-like input (qcut only returns a Series, with a `.cat` accessor,
    # when it is given a Series).
    _, edges = pd.qcut(base, q=bins, duplicates="drop", retbins=True)
    if len(edges) < 2:
        raise ValueError("base_arr does not have enough distinct values to form a bin")
    edges = np.asarray(edges, dtype=float).copy()
    edges[0], edges[-1] = -np.inf, np.inf

    base_pct = pd.cut(base, bins=edges).value_counts(normalize=True).sort_index()
    target_pct = pd.cut(target, bins=edges).value_counts(normalize=True).sort_index()

    psi_detail = pd.DataFrame({"base_pct": base_pct, "target_pct": target_pct})
    # Empty bins show up as 0 (not NaN) in value_counts, so clip as well as fill.
    psi_detail = psi_detail.fillna(eps).clip(lower=eps)
    psi_detail["psi_bin"] = (
        psi_detail["target_pct"] - psi_detail["base_pct"]
    ) * np.log(psi_detail["target_pct"] / psi_detail["base_pct"])

    total_psi = float(psi_detail["psi_bin"].sum())
    return round(total_psi, 4), psi_detail
# 3. Profit-optimal threshold search
def get_opt_threshold(df_prob_label, cost_send=2.5, revenue_save=18, n_thresholds=200):
    """Scan probability thresholds and pick the one with the highest net profit.

    For each threshold, every row with ``prob >= threshold`` counts as
    contacted. Net profit is ``revenue_save`` times the sum of ``y_true`` among
    contacted rows, minus ``cost_send`` times the number of contacted rows.

    Args:
        df_prob_label: DataFrame with a ``prob`` column (predicted probability)
            and a ``y_true`` column (1 = actually churned, 0 = stayed).
        cost_send: Cost of contacting one user.
        revenue_save: Revenue credited per contacted user who truly churned.
        n_thresholds: Number of evenly spaced thresholds scanned over [0, 1].

    Returns:
        A tuple ``(best, profit_df)``: ``best`` is the row (as a Series) of the
        first threshold reaching the maximum net profit, and ``profit_df``
        holds one row per threshold with the columns ``threshold``,
        ``send_user``, ``recall_success`` and ``net_profit``.

    Raises:
        KeyError: If ``prob`` or ``y_true`` is missing from ``df_prob_label``.
        ValueError: If ``n_thresholds`` is smaller than 1.
    """
    missing = {"prob", "y_true"} - set(df_prob_label.columns)
    if missing:
        raise KeyError(f"df_prob_label is missing required column(s): {sorted(missing)}")
    if n_thresholds < 1:
        raise ValueError("n_thresholds must be at least 1")

    # Work on plain arrays: the input is only read, so no defensive copy of
    # the DataFrame is needed and no per-threshold DataFrame slice is built.
    probs = df_prob_label["prob"].to_numpy()
    labels = df_prob_label["y_true"].to_numpy()

    profit_record = []
    for thr in np.linspace(0, 1, n_thresholds):
        contacted = probs >= thr
        send_num = int(contacted.sum())
        real_churn_send = labels[contacted].sum()
        profit_record.append({
            "threshold": thr,
            "send_user": send_num,
            "recall_success": real_churn_send,
            "net_profit": real_churn_send * revenue_save - send_num * cost_send,
        })

    profit_df = pd.DataFrame(profit_record)
    # idxmax returns the first maximum, i.e. the lowest threshold among ties.
    best = profit_df.loc[profit_df["net_profit"].idxmax()]
    return best, profit_df
# 4. A/B win-back experiment: chi-square test helper
def ab_chi2_test(exp_total, exp_survive, ctrl_total, ctrl_survive, alpha=0.05):
    """Compare retention between an experiment and a control group.

    Builds the 2x2 table ``[[survive, churn], [survive, churn]]`` and runs
    ``scipy.stats.chi2_contingency`` on it with its default settings.

    Args:
        exp_total: Number of users in the experiment group.
        exp_survive: Number of retained users in the experiment group.
        ctrl_total: Number of users in the control group.
        ctrl_survive: Number of retained users in the control group.
        alpha: Significance level the p-value is compared against.

    Returns:
        A dict with the keys ``exp_retention``, ``ctrl_retention``, ``lift``
        (experiment minus control retention rate), ``chi2``, ``p_value`` and
        ``significant`` (plain ``bool``, True when ``p_value < alpha``).

    Raises:
        ValueError: If a group total is not positive, if a survivor count is
            outside ``[0, total]``, or if ``chi2_contingency`` rejects the
            table (for example an all-zero row or column).
    """
    groups = (
        ("exp", exp_total, exp_survive),
        ("ctrl", ctrl_total, ctrl_survive),
    )
    for label, total, survive in groups:
        if total <= 0:
            raise ValueError(f"{label}_total must be positive, got {total}")
        if not 0 <= survive <= total:
            raise ValueError(
                f"{label}_survive must be between 0 and {label}_total, got {survive}"
            )

    exp_churn = exp_total - exp_survive
    ctrl_churn = ctrl_total - ctrl_survive
    table = [[exp_survive, exp_churn], [ctrl_survive, ctrl_churn]]
    chi2, p, _, _ = chi2_contingency(table)

    exp_rate = exp_survive / exp_total
    ctrl_rate = ctrl_survive / ctrl_total
    # Convert numpy scalars to native types so callers can use `is True`
    # checks or serialise the result without surprises.
    return {
        "exp_retention": exp_rate,
        "ctrl_retention": ctrl_rate,
        "lift": exp_rate - ctrl_rate,
        "chi2": float(chi2),
        "p_value": float(p),
        "significant": bool(p < alpha),
    }
# Step 1: simulate a short-video user dataset
np.random.seed(42)  # fixed seed so every run produces the same synthetic users
n = 10000

# Features are drawn one column at a time. The order of the draws is part of
# the behaviour: each call consumes the seeded global random stream, so
# reordering them would change every downstream number.
df = pd.DataFrame({"user_id": np.arange(n)})
df["avg_play_min"] = np.random.gamma(3, 2, n)
df["login_gap_days"] = np.random.poisson(3, n)
df["week_view_cnt"] = np.random.poisson(12, n)
df["is_pay"] = np.random.binomial(1, 0.25, n)
df["comment_total"] = np.random.poisson(2, n)
df["follow_author"] = np.random.poisson(5, n)
df["last3_login_times"] = np.random.poisson(4, n)
df["week_share_cnt"] = np.random.poisson(1, n)
df["vip_days"] = np.random.exponential(10, n)

# Bernoulli churn label: a linear score over four features is squashed
# through the logistic function and used as the per-user churn probability.
logit = (
    -0.4 * df["avg_play_min"]
    + 0.7 * df["login_gap_days"]
    - 1.3 * df["is_pay"]
    - 0.2 * df["last3_login_times"]
)
p_churn = 1 / (1 + np.exp(-logit))
df["churn"] = np.random.binomial(1, p_churn, n)

print("==== 样本流失分布 ====")
print(df["churn"].value_counts(normalize=True).round(3))
print("-" * 60)
# Step 2: WOE / IV feature-screening demo on a single feature
feat_test = "login_gap_days"
woe_map, iv_val, bin_detail = woe_iv_calc(
    df=df, feat_name=feat_test, target="churn", bins=8
)
print(f"\n【WOE IV分析】特征 {feat_test} IV值 = {iv_val}")
# Show only the columns needed to read the binning result.
print(bin_detail.loc[:, ["bin", "bad_cnt", "good_cnt", "woe", "iv"]].round(4))
print("-" * 60)
# Step 3: train/test split and LightGBM model
feat_cols = [
    "avg_play_min", "login_gap_days", "week_view_cnt", "is_pay",
    "comment_total", "follow_author", "last3_login_times",
    "week_share_cnt", "vip_days",
]
X = df[feat_cols]
y = df["churn"]
# Stratify so both splits keep the overall churn rate.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
lgb_train = lgb.Dataset(X_train, label=y_train)
# NOTE(review): the test split doubles as the early-stopping validation set,
# so the metrics reported on it later are optimistic. Kept as-is to preserve
# the script's results; a separate validation split would be cleaner.
lgb_valid = lgb.Dataset(X_test, label=y_test, reference=lgb_train)
params = {
    "objective": "binary",
    "metric": ["auc"],
    "learning_rate": 0.05,
    "num_leaves": 31,
    # Re-weight the positive class by the negative/positive count ratio.
    "scale_pos_weight": len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
    "seed": 42,
    "verbosity": -1,
}
# Early stopping is configured through a callback: the `early_stopping_rounds`
# keyword of lgb.train was removed in LightGBM 4.0.
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=300,
    valid_sets=[lgb_valid],
    callbacks=[lgb.early_stopping(stopping_rounds=30)],
)
y_pred_proba = model.predict(X_test, num_iteration=model.best_iteration)
res_df = pd.DataFrame({"y_true": y_test.values, "prob": y_pred_proba})
# Step 4: model evaluation -- AUC plus a two-sample KS test on the scores
auc = roc_auc_score(res_df["y_true"], res_df["prob"])
prob_churn = res_df.loc[res_df["y_true"] == 1, "prob"]
prob_stay = res_df.loc[res_df["y_true"] == 0, "prob"]
ks_stat, ks_p = ks_2samp(prob_churn, prob_stay)
print(f"\n【模型评估】AUC={auc:.4f}, KS={ks_stat:.4f}, KS_p={ks_p:.2e}")

feat_import = pd.DataFrame({"feature": feat_cols, "imp": model.feature_importance()})
feat_import = feat_import.sort_values("imp", ascending=False)
print("特征重要性TOP5:")
print(feat_import.head())
print("-" * 60)

# Step 5: PSI drift check -- training-set scores are the baseline and the
# test-set scores are the distribution being compared.
train_prob_base = model.predict(X_train, num_iteration=model.best_iteration)
psi_val, psi_detail = calculate_psi(train_prob_base, y_pred_proba, bins=10)
print(f"\n【PSI漂移检测】整体PSI={psi_val}")
# Cut-offs 0.25 / 0.1 checked from most to least severe.
print(
    "告警:分布严重漂移,需重训练模型" if psi_val >= 0.25
    else "提醒:轻度漂移,持续观测" if psi_val >= 0.1
    else "模型分布稳定"
)
print("-" * 60)

# Step 6: pick the contact threshold that maximises net profit
best_thr_info, profit_df = get_opt_threshold(res_df, cost_send=2.5, revenue_save=18)
print(f"\n【最优分层阈值】阈值={best_thr_info['threshold']:.3f}")
print(f"触达用户数:{best_thr_info['send_user']:.0f}, 预估挽回:{best_thr_info['recall_success']:.0f}, 净收益:{best_thr_info['net_profit']:.2f}")
# Tiering: map a churn probability to a risk label
def risk_tag(prob, threshold=None, mid_ratio=0.6):
    """Return the risk-tier label for one churn probability.

    Args:
        prob: Predicted churn probability.
        threshold: Cut-off for the high-risk tier. When omitted, the
            module-level ``best_thr_info["threshold"]`` is used, so the
            function can still be passed directly to ``Series.apply``.
        mid_ratio: Fraction of ``threshold`` at which the medium tier starts.

    Returns:
        ``"高危召回"`` when ``prob >= threshold``, ``"中危观察"`` when
        ``prob >= threshold * mid_ratio``, otherwise ``"低危留存"``.
    """
    if threshold is None:
        # Resolved at call time, so the global only has to exist by then.
        threshold = best_thr_info["threshold"]
    if prob >= threshold:
        return "高危召回"
    if prob >= threshold * mid_ratio:
        return "中危观察"
    return "低危留存"
# Label every scored test user with a risk tier, then summarise the tiers.
res_df["risk_level"] = res_df["prob"].map(risk_tag)
print("\n分层用户分布:")
print(res_df["risk_level"].value_counts())
print("各分层真实流失率:")
print(res_df.groupby("risk_level")["y_true"].mean().round(3))
print("-" * 60)

# Step 7: chi-square test for the win-back A/B experiment
# (positional order: exp_total, exp_survive, ctrl_total, ctrl_survive)
ab_result = ab_chi2_test(2000, 860, 2000, 620)
print("\n【召回A/B实验结果】")
print(f"实验组留存率:{ab_result['exp_retention']:.2%}")
print(f"对照组留存率:{ab_result['ctrl_retention']:.2%}")
print(f"留存提升:{ab_result['lift']:.2%}, P值={ab_result['p_value']:.2e}")
print(
    "结论:活动统计显著有效,可持续投放"
    if ab_result["significant"]
    else "结论:无显著提升,活动无效"
)
print("-" * 60)
# Step 8: Cox survival analysis (time until churn)
surv_covariates = ["avg_play_min", "login_gap_days", "is_pay"]
surv_df = df[surv_covariates + ["churn"]].copy()
# Synthetic observation time in days (integers from 7 to 59).
surv_df["T"] = np.random.randint(7, 60, size=len(surv_df))
surv_df["E"] = surv_df["churn"]

cph = CoxPHFitter()
# Fit on the covariates plus duration/event only. The raw `churn` column is
# identical to the event indicator `E`; leaving it in would feed the label to
# the model as a covariate.
cph.fit(surv_df[surv_covariates + ["T", "E"]], duration_col="T", event_col="E")
print("\n【Cox生存模型 HR风险比】")
print(cph.summary[["coef", "exp(coef)", "p"]].round(4))

# Survival probability for a single user: pass a one-row DataFrame whose
# columns match the fitted covariates, and request the time points explicitly
# rather than relying on them being present in the fitted timeline.
sample = surv_df.iloc[[0]][surv_covariates]
surv_curve = cph.predict_survival_function(sample, times=[7, 14, 30])
print("\n该用户7/14/30天留存概率:")
print(surv_curve.round(3))

# Log-rank test: do paying and non-paying users churn at different speeds?
pay1 = surv_df[surv_df["is_pay"] == 1]
pay0 = surv_df[surv_df["is_pay"] == 0]
lr_res = logrank_test(
    pay1["T"], pay0["T"],
    event_observed_A=pay1["E"], event_observed_B=pay0["E"],
)
print(f"\nLogRank检验P值={lr_res.p_value:.2e}")
if lr_res.p_value < 0.05:
    print("付费与非付费用户流失速度显著不同")
print("-" * 60)
# Step 9: visualisation -- ROC curve plus observed churn rate per risk tier
plt.rcParams["font.sans-serif"] = ["SimHei"]  # font used for the Chinese labels
plt.rcParams["axes.unicode_minus"] = False
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# ROC curve with the chance diagonal for reference
fpr, tpr, _ = roc_curve(res_df["y_true"], res_df["prob"])
ax1.plot(fpr, tpr, c="#E63946", label=f"AUC={auc:.4f}")
ax1.plot([0, 1], [0, 1], "--", c="gray")
ax1.set_title("ROC-AUC曲线")
ax1.set_xlabel("假正率FPR")
ax1.set_ylabel("真正率TPR")
ax1.legend()

# Churn rate per tier. groupby sorts the labels by code point, which is not
# the low -> medium -> high order, so reorder explicitly and bind each colour
# to its tier instead of to a bar position.
tier_colors = {
    "低危留存": "#457B9D",
    "中危观察": "#F1A208",
    "高危召回": "#E63946",
}
level_rate = res_df.groupby("risk_level")["y_true"].mean()
# Keep only tiers that actually occur, so a missing tier adds no empty bar.
level_rate = level_rate.reindex([tier for tier in tier_colors if tier in level_rate.index])
ax2.bar(
    level_rate.index,
    level_rate.values,
    color=[tier_colors[tier] for tier in level_rate.index],
)
ax2.set_title("各风险层级真实流失率")
ax2.set_ylabel("流失率")
for i, v in enumerate(level_rate.values):
    # Value label slightly above each bar.
    ax2.text(i, v + 0.02, f"{v:.2f}", ha="center")
plt.tight_layout()
plt.show()
# Step 10: export the high-risk user list for the operations team
df["all_prob"] = model.predict(df[feat_cols], num_iteration=model.best_iteration)
df["risk"] = df["all_prob"].map(risk_tag)
# Keep only the high-risk tier and the three columns needed downstream.
high_risk = df.loc[df["risk"] == "高危召回", ["user_id", "all_prob", "risk"]].round(3)
high_risk.to_csv("流失高危召回名单.csv", index=False, encoding="utf-8-sig")
print(f"\n高危待召回用户总量:{len(high_risk)},名单已导出:流失高危召回名单.csv")