什么是RFM分析:R(Recency)最近一次消费距今的时间、F(Frequency)消费频率、M(Monetary)消费金额;最近买过、经常买、花得多的用户,是最有价值的用户。

数据集:https://www.kaggle.com/code/ekrembayar/rfm-analysis-online-retail-ii(Kaggle RFM 分析在线零售 II)
一、数据清洗
1.初探数据



2.处理重复值


3.处理缺失值

Customer ID作为重要的销售信息,缺失占比超过20%;缺失该字段的记录无法做用户分析,因此将这些行删除。

Description为描述产品信息的列,不是重要的列,将缺失的内容,统一填充为"Unknown"
# Replace missing product descriptions with the placeholder 'Unknown'.
description_filled = df['Description'].fillna('Unknown')
df['Description'] = description_filled
4.处理异常值
过滤销售数量和销售价格<=0的行


IQR(四分位距法)是数据清洗中常用的异常值识别与剔除方法,其核心原理基于数据的四分位数构建合理取值范围。该方法首先计算数据的下四分位数 Q1 与上四分位数 Q3,并通过 IQR = Q3 − Q1 得到中间 50% 数据的分布范围;再以 Q1 − 1.5×IQR 作为正常数据的下边界、Q3 + 1.5×IQR 作为上边界,将超出该范围的数据判定为异常值。IQR 方法基于分位数统计,不受极端值强烈影响,无需假设数据服从正态分布,具有较强的稳健性,适用于各类数值型数据的异常值检测与预处理,是数据分析与预处理中最常用的稳健异常处理手段之一。

常用的异常值检测的方法:


5.数据转换与特征工程
核心主要为日期处理,目的是将字符串格式的日期转为标准的 datetime 时间类型,以便实现正确的时间排序、筛选、年月季度提取、时间差计算等时间相关分析,同时统一日期格式、消除脏数据带来的分析错误。
# Parse the raw values once, then derive every calendar field from the same
# datetime accessor.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
invoice_dt = df['InvoiceDate'].dt
df['Date'] = invoice_dt.date
df['Year'] = invoice_dt.year
df['Month'] = invoice_dt.month
df['Day'] = invoice_dt.day
df['Hour'] = invoice_dt.hour
df['Weekday'] = invoice_dt.dayofweek  # 0 = Monday, 6 = Sunday
df['Weekday_Name'] = invoice_dt.day_name()
完整的清洗代码:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# ============================================================
# 1. 读取原始数据
# ============================================================
def load_data(file_path):
    """Load the raw transaction CSV into a DataFrame.

    Args:
        file_path: Location of the CSV file; it is decoded as latin1.

    Returns:
        The DataFrame returned by ``pd.read_csv``, unmodified.
    """
    print("=" * 60)
    print("步骤1:加载数据")
    print("=" * 60)
    df = pd.read_csv(file_path, encoding='latin1')
    print(f"✅ 成功读取CSV文件: {file_path}")
    print(f"原始数据形状: {df.shape}")
    print(f"原始列名: {df.columns.tolist()}")
    return df
# ============================================================
# 2. 数据探索(了解数据质量)
# ============================================================
def explore_data(df):
    """Print a data-quality overview of ``df`` without modifying it.

    Reports row/column counts, dtypes, per-column missing counts and
    percentages, the number of fully duplicated rows, and ``describe()`` for
    the numeric columns (skipped when there are none).

    Args:
        df: DataFrame to inspect.

    Returns:
        The same ``df`` object, so the call can be chained in a pipeline.
    """
    print("\n" + "=" * 60)
    print("步骤2:数据探索")
    print("=" * 60)

    # 2.1 Basic shape
    print("\n【2.1 基本信息】")
    print(f"行数: {df.shape[0]:,}")
    print(f"列数: {df.shape[1]}")

    # 2.2 Column dtypes
    print("\n【2.2 数据类型】")
    print(df.dtypes)

    # 2.3 Missing values; only columns that actually have gaps are shown
    print("\n【2.3 缺失值统计】")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_df = pd.DataFrame({
        '缺失数量': missing,
        '缺失占比(%)': missing_pct.round(2),
    })
    print(missing_df[missing_df['缺失数量'] > 0])

    # 2.4 Fully duplicated rows
    print("\n【2.4 重复值统计】")
    dup_count = df.duplicated().sum()
    print(f"完全重复的行数: {dup_count} ({dup_count/len(df)*100:.2f}%)")

    # 2.5 Summary statistics for numeric columns
    print("\n【2.5 数值列统计】")
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        print(df[numeric_cols].describe())

    # Return the input frame itself, not the missing-value summary.
    return df
# ============================================================
# 3. 标准化列名(重命名为统一格式)
# ============================================================
def standardize_columns(df):
    """Rename the raw dataset columns to the names used by later steps.

    ``Invoice`` -> ``InvoiceNo``, ``Price`` -> ``UnitPrice`` and
    ``Customer ID`` -> ``CustomerID``; the other mapped names are unchanged.
    Columns that are not in the mapping are left as they are.

    Args:
        df: DataFrame with the raw column names.

    Returns:
        A new DataFrame with renamed columns; ``df`` itself is not modified.
    """
    print("\n" + "=" * 60)
    print("步骤3:标准化列名")
    print("=" * 60)

    # Raw column name -> standardized column name
    column_mapping = {
        'Invoice': 'InvoiceNo',
        'StockCode': 'StockCode',
        'Description': 'Description',
        'Quantity': 'Quantity',
        'InvoiceDate': 'InvoiceDate',
        'Price': 'UnitPrice',
        'Customer ID': 'CustomerID',
        'Country': 'Country',
    }

    # Capture the real incoming names so the log reflects the actual input
    # instead of a hard-coded list.
    original_columns = df.columns.tolist()
    df = df.rename(columns=column_mapping)
    print(f"原始列名: {original_columns}")
    print(f"新列名: {df.columns.tolist()}")
    return df
# ============================================================
# 4. 处理重复值
# ============================================================
def remove_duplicates(df):
    """Drop rows that are exact duplicates of an earlier row.

    Args:
        df: DataFrame to de-duplicate.

    Returns:
        A new DataFrame containing the first occurrence of each distinct row;
        the original index labels are kept.
    """
    print("\n" + "=" * 60)
    print("步骤4:处理重复值")
    print("=" * 60)
    before = len(df)
    df = df.drop_duplicates()
    after = len(df)
    print(f"删除前: {before:,} 行")
    print(f"删除后: {after:,} 行")
    print(f"删除了: {before - after:,} 行重复数据")
    return df
# ============================================================
# 5. 处理缺失值
# ============================================================
def handle_missing_values(df):
    """Drop rows without a customer and fill missing descriptions.

    Rows whose ``CustomerID`` is missing are removed; missing ``Description``
    values are replaced with ``'Unknown'``. Any remaining per-column missing
    counts are printed.

    Args:
        df: DataFrame with ``CustomerID`` and ``Description`` columns.

    Returns:
        A new, independent DataFrame; the caller's ``df`` is not modified.
    """
    print("\n" + "=" * 60)
    print("步骤5:处理缺失值")
    print("=" * 60)

    # 5.1 Drop rows without a CustomerID. The explicit copy makes the result
    # independent, so the column assignment below is not a chained assignment
    # on a filtered slice.
    before = len(df)
    df = df.dropna(subset=['CustomerID']).copy()
    after = len(df)
    print(f"\n【5.1 删除CustomerID缺失】")
    print(f"删除前: {before:,} 行")
    print(f"删除后: {after:,} 行")
    print(f"删除了: {before - after:,} 行 (缺失的CustomerID)")

    # 5.2 Fill missing descriptions with a placeholder
    before_missing = df['Description'].isnull().sum()
    df['Description'] = df['Description'].fillna('Unknown')
    after_missing = df['Description'].isnull().sum()
    print(f"\n【5.2 填充Description缺失】")
    print(f"填充前缺失: {before_missing:,} 行")
    print(f"填充后缺失: {after_missing:,} 行")

    # 5.3 Report anything still missing in the other columns
    print(f"\n【5.3 其他列缺失检查】")
    other_missing = df.isnull().sum()
    if other_missing.sum() > 0:
        print(other_missing[other_missing > 0])
    else:
        print("✅ 所有列已无缺失值")
    return df
# ============================================================
# 6. 处理异常值
# ============================================================
def handle_outliers(df):
    """Remove cancelled and non-positive rows, then report IQR outliers.

    Rows are dropped when ``InvoiceNo`` starts with ``'C'``, when ``Quantity``
    is <= 0, or when ``UnitPrice`` is <= 0. Afterwards the 1.5*IQR outlier
    count and range of ``Quantity`` and ``UnitPrice`` are printed; those
    outliers are only reported, never removed.

    Args:
        df: DataFrame with ``InvoiceNo``, ``Quantity`` and ``UnitPrice``.

    Returns:
        A new, independent DataFrame holding the rows that passed all filters.
    """
    print("\n" + "=" * 60)
    print("步骤6:处理异常值")
    print("=" * 60)

    # 6.1 Drop cancelled orders (InvoiceNo starts with 'C')
    before = len(df)
    df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]
    after = len(df)
    print(f"\n【6.1 过滤取消订单】")
    print(f"删除前: {before:,} 行")
    print(f"删除后: {after:,} 行")
    print(f"删除了: {before - after:,} 行取消订单")

    # 6.2 Drop rows with Quantity <= 0
    before = len(df)
    df = df[df['Quantity'] > 0]
    after = len(df)
    print(f"\n【6.2 过滤数量<=0】")
    print(f"删除前: {before:,} 行")
    print(f"删除后: {after:,} 行")
    print(f"删除了: {before - after:,} 行")

    # 6.3 Drop rows with UnitPrice <= 0. Copy so callers receive an
    # independent frame that is safe to add columns to.
    before = len(df)
    df = df[df['UnitPrice'] > 0].copy()
    after = len(df)
    print(f"\n【6.3 过滤单价<=0】")
    print(f"删除前: {before:,} 行")
    print(f"删除后: {after:,} 行")
    print(f"删除了: {before - after:,} 行")

    # 6.4 IQR outlier detection (report only, nothing is removed)
    print(f"\n【6.4 IQR异常值检测(仅展示,不删除)】")
    total_rows = len(df)
    for col in ['Quantity', 'UnitPrice']:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        outliers = df[(df[col] < lower) | (df[col] > upper)]
        # Guard the percentage: every row may have been filtered out above.
        share = len(outliers) / total_rows * 100 if total_rows else 0.0
        print(f" {col}: 检测到 {len(outliers):,} 个异常值 ({share:.2f}%)")
        print(f" 正常范围: [{lower:.2f}, {upper:.2f}]")
    return df
# ============================================================
# 7. 数据类型转换
# ============================================================
def convert_data_types(df):
    """Convert key columns to analysis-friendly dtypes, in place.

    ``InvoiceDate`` is parsed with ``pd.to_datetime``, ``CustomerID`` is cast
    to the nullable ``Int64`` dtype and ``Country`` to ``category``.

    Args:
        df: DataFrame with ``InvoiceDate``, ``CustomerID`` and ``Country``.
            The columns are overwritten on this object.

    Returns:
        The same ``df`` object, after conversion.
    """
    print("\n" + "=" * 60)
    print("步骤7:数据类型转换")
    print("=" * 60)

    # 7.1 Dates
    print("\n【7.1 日期转换】")
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
    print(f"日期范围: {df['InvoiceDate'].min()} 到 {df['InvoiceDate'].max()}")
    print(f"InvoiceDate 类型: {df['InvoiceDate'].dtype}")

    # 7.2 CustomerID as integer
    print("\n【7.2 CustomerID转换】")
    df['CustomerID'] = df['CustomerID'].astype('Int64')
    print(f"CustomerID 类型: {df['CustomerID'].dtype}")
    print(f"CustomerID 范围: {df['CustomerID'].min()} - {df['CustomerID'].max()}")

    # 7.3 Other dtype optimizations
    print("\n【7.3 其他类型优化】")
    df['Country'] = df['Country'].astype('category')
    print(f"Country 已转换为 category 类型")
    return df
# ============================================================
# 8. 特征工程
# ============================================================
def create_features(df):
    """Add the line total and calendar features to ``df``, in place.

    Adds ``TotalAmount`` (``Quantity * UnitPrice``), ``Year``, ``Month``,
    ``Day``, ``Hour``, ``Weekday`` (0 = Monday ... 6 = Sunday),
    ``Weekday_Name`` and ``IsWeekend`` (1 when ``Weekday`` is 5 or 6, else 0).

    Args:
        df: DataFrame with ``Quantity``, ``UnitPrice`` and a datetime
            ``InvoiceDate`` column. New columns are written onto this object.

    Returns:
        The same ``df`` object with the added columns.
    """
    print("\n" + "=" * 60)
    print("步骤8:特征工程")
    print("=" * 60)

    # 8.1 Line total
    df['TotalAmount'] = df['Quantity'] * df['UnitPrice']
    print(f"\n【8.1 总金额】")
    print(f"总金额范围: £{df['TotalAmount'].min():.2f} - £{df['TotalAmount'].max():.2f}")
    print(f"总销售额: £{df['TotalAmount'].sum():,.2f}")
    print(f"平均每笔交易: £{df['TotalAmount'].mean():.2f}")

    # 8.2 Calendar features derived from InvoiceDate
    print(f"\n【8.2 日期特征】")
    invoice_dt = df['InvoiceDate'].dt
    df['Year'] = invoice_dt.year
    df['Month'] = invoice_dt.month
    df['Day'] = invoice_dt.day
    df['Hour'] = invoice_dt.hour
    df['Weekday'] = invoice_dt.dayofweek
    df['Weekday_Name'] = invoice_dt.day_name()
    print(f"已添加: Year, Month, Day, Hour, Weekday, Weekday_Name")

    # 8.3 Weekend flag
    df['IsWeekend'] = df['Weekday'].isin([5, 6]).astype(int)
    print(f"已添加: IsWeekend")
    return df
# ============================================================
# 9. 最终验证与统计
# ============================================================
def final_validation(df):
    """Print a final quality report for the cleaned data.

    Reports the shape, distinct customers/invoices/products/countries,
    remaining missing values, dtypes, numeric summary statistics, the ten most
    frequent countries and the covered date range.

    Args:
        df: Cleaned DataFrame with ``CustomerID``, ``InvoiceNo``,
            ``StockCode``, ``Country`` and a datetime ``InvoiceDate``.

    Returns:
        The same ``df`` object, unmodified.
    """
    print("\n" + "=" * 60)
    print("步骤9:最终验证")
    print("=" * 60)

    print("\n【9.1 数据概览】")
    print(f"最终数据形状: {df.shape}")
    print(f"用户数: {df['CustomerID'].nunique():,}")
    print(f"订单数: {df['InvoiceNo'].nunique():,}")
    print(f"商品数: {df['StockCode'].nunique():,}")
    print(f"国家数: {df['Country'].nunique():,}")

    print("\n【9.2 缺失值检查】")
    missing = df.isnull().sum()
    if missing.sum() == 0:
        print("✅ 无缺失值")
    else:
        print(missing[missing > 0])

    print("\n【9.3 数据类型检查】")
    print(df.dtypes)

    print("\n【9.4 数值列统计】")
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # describe() raises on a column-less frame, so skip it when nothing is
    # numeric (same guard as explore_data).
    if numeric_cols:
        print(df[numeric_cols].describe())

    print("\n【9.5 国家分布(前10)】")
    print(df['Country'].value_counts().head(10))

    print("\n【9.6 数据时间范围】")
    print(f"最早交易: {df['InvoiceDate'].min()}")
    print(f"最晚交易: {df['InvoiceDate'].max()}")
    print(f"时间跨度: {(df['InvoiceDate'].max() - df['InvoiceDate'].min()).days} 天")
    return df
# ============================================================
# 10. 保存清洗后的数据
# ============================================================
def save_cleaned_data(df, output_path):
    """Write the cleaned DataFrame to CSV without the index column.

    When ``output_path`` is a filesystem path, missing parent directories are
    created first so a long cleaning run does not fail at the final step.
    Other targets accepted by ``DataFrame.to_csv`` (e.g. file objects) are
    passed through untouched.

    Args:
        df: DataFrame to persist.
        output_path: Destination path (or other ``to_csv`` target).

    Returns:
        The same ``df`` object.
    """
    import os
    from pathlib import Path

    print("\n" + "=" * 60)
    print("步骤10:保存数据")
    print("=" * 60)

    if isinstance(output_path, (str, os.PathLike)):
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # Save as CSV
    df.to_csv(output_path, index=False)
    print(f"✅ 已保存到: {output_path}")
    return df
# ============================================================
# 主函数:执行完整清洗流程
# ============================================================
def main(
    input_file=r"H:\大论文初稿\online_retail_II\online_retail_II.csv",
    output_file=r"H:\大论文初稿\online_retail_II\online_retail_II_cleaned.csv",
):
    """Run the complete cleaning pipeline from raw CSV to cleaned CSV.

    Steps, in order: load, explore, standardize column names, drop
    duplicates, handle missing values, handle outliers, convert dtypes,
    create features, validate, save.

    Args:
        input_file: Path of the raw CSV. Defaults to the original hard-coded
            location; pass your own path instead of editing the source.
        output_file: Path the cleaned CSV is written to.

    Returns:
        The cleaned DataFrame.
    """
    print("\n" + "=" * 60)
    print("数据清洗流程开始")
    print("=" * 60)

    # Each step returns the frame that the next step consumes.
    df = load_data(input_file)
    df = explore_data(df)  # returns the frame itself, not the summary table
    df = standardize_columns(df)
    df = remove_duplicates(df)
    df = handle_missing_values(df)
    df = handle_outliers(df)
    df = convert_data_types(df)
    df = create_features(df)
    df = final_validation(df)
    df = save_cleaned_data(df, output_file)

    print("\n" + "=" * 60)
    print("✅ 数据清洗完成!")
    print("=" * 60)
    return df


# ============================================================
# Script entry point
# ============================================================
if __name__ == "__main__":
    cleaned_df = main()
二、RFM分析
1.计算R、F、M原始值


2.查看R、F、M数据分布

3.自定义分箱
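使用分位数分箱(pd.qcut)时,如果某一列重复值很多(例如大量用户只购买过 1 次),多个分位点会落在同一个数值上;duplicates='drop' 合并重复边界后,区间数量少于标签数量,pd.qcut 会抛出 ValueError,因此改用自定义分箱。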
# Score R/F/M on a 1-4 scale: try quartile binning first and fall back to
# fixed thresholds when pd.qcut cannot build four distinct bins.
try:
    rfm['R_Score'] = pd.qcut(rfm['Recency'], q=4, labels=[4, 3, 2, 1], duplicates='drop')
    rfm['F_Score'] = pd.qcut(rfm['Frequency'], q=4, labels=[1, 2, 3, 4], duplicates='drop')
    rfm['M_Score'] = pd.qcut(rfm['Monetary'], q=4, labels=[1, 2, 3, 4], duplicates='drop')
    print("✅ 分位数分箱成功")
except ValueError as e:
    print(f"⚠️ 分位数分箱失败: {e}")
    print("切换到自定义分箱...")
    # Custom bins. The last edge of every scale is open-ended: with a finite
    # cap (365 days / 1000 orders / 100000 spend) pd.cut returns NaN above the
    # cap, and the dropna below would silently delete exactly the
    # longest-inactive and highest-spending customers.
    open_end = float('inf')
    r_bins = [0, 30, 90, 180, open_end]
    r_labels = [4, 3, 2, 1]
    rfm['R_Score'] = pd.cut(rfm['Recency'], bins=r_bins, labels=r_labels, right=False)
    # Frequency uses right-closed bins: (0,1], (1,3], (3,8], (8,inf).
    # With left-closed bins the first bin [0,1) could never contain a customer
    # (every customer has at least one invoice), so score 1 was unreachable.
    f_bins = [0, 1, 3, 8, open_end]
    f_labels = [1, 2, 3, 4]
    rfm['F_Score'] = pd.cut(rfm['Frequency'], bins=f_bins, labels=f_labels, right=True)
    m_bins = [0, 100, 500, 2000, open_end]
    m_labels = [1, 2, 3, 4]
    rfm['M_Score'] = pd.cut(rfm['Monetary'], bins=m_bins, labels=m_labels, right=False)
    print("✅ 自定义分箱成功")

# Drop rows that still could not be scored; copy so the integer casts below
# write to an independent frame rather than a filtered slice.
rfm = rfm.dropna(subset=['R_Score', 'F_Score', 'M_Score']).copy()

# Convert the categorical scores to plain integers
rfm['R_Score'] = rfm['R_Score'].astype(int)
rfm['F_Score'] = rfm['F_Score'].astype(int)
rfm['M_Score'] = rfm['M_Score'].astype(int)

print(f"\n得分分布:")
print(f" R_Score: {rfm['R_Score'].value_counts().sort_index().to_dict()}")
print(f" F_Score: {rfm['F_Score'].value_counts().sort_index().to_dict()}")
print(f" M_Score: {rfm['M_Score'].value_counts().sort_index().to_dict()}")
4.GMV分析
GMV:一定时期内所有订单的总金额
单品GMV = 单价 × 数量
# 单个订单的GMV
订单GMV = Σ(该订单中所有商品的 单价 × 数量)
# 总GMV
总GMV = Σ(所有订单的 订单GMV) = Σ(所有商品的 单价 × 数量)

在RFM分析中,GMV贡献用于衡量不同用户层级对总销售额的贡献占比。
# 步骤1:计算各层级的GMV
# Step 1: total GMV (sum of Monetary) per customer segment
segment_gmv = rfm.groupby('Segment')['Monetary'].sum()
# Step 2: each segment's share of the overall GMV, in percent
segment_gmv_ratio = segment_gmv / segment_gmv.sum() * 100
print("各层级GMV贡献:")
for segment, gmv in segment_gmv.items():
    ratio = segment_gmv_ratio[segment]
    print(f" {segment}: £{gmv:,.2f} ({ratio:.1f}%)")
5.完整RFM分析代码
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# 设置中文显示
# Configure matplotlib for Chinese labels and a drawable minus sign.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# ============================================================
# 1. Load the cleaned data
# ============================================================
df = pd.read_csv(r"H:\大论文初稿\online_retail_II\online_retail_II_cleaned.csv")
# CSV stores dates as text; parse them back into datetimes
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
# Show the column names (the customer column should be CustomerID, not "Customer ID")
print("列名列表:", list(df.columns))
# ============================================================
# 2. 计算R、F、M原始值
# ============================================================
# Reference date for recency: one day after the latest invoice in the data.
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)
print(f"快照日期: {snapshot_date.date()}")
# One row per CustomerID (no space in the column name), via named aggregation:
#   Recency   - days between the snapshot date and the customer's last invoice
#   Frequency - number of distinct invoices
#   Monetary  - sum of TotalAmount
rfm = df.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda dates: (snapshot_date - dates.max()).days),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary=('TotalAmount', 'sum'),
)
print(f"RFM数据形状: {rfm.shape}")
print(f"用户数: {len(rfm):,}")
# ============================================================
# 3. RFM打分
# ============================================================
# Score R/F/M on a 1-4 scale: try quartile binning first and fall back to
# fixed thresholds when pd.qcut cannot build four distinct bins.
try:
    rfm['R_Score'] = pd.qcut(rfm['Recency'], q=4, labels=[4, 3, 2, 1], duplicates='drop')
    rfm['F_Score'] = pd.qcut(rfm['Frequency'], q=4, labels=[1, 2, 3, 4], duplicates='drop')
    rfm['M_Score'] = pd.qcut(rfm['Monetary'], q=4, labels=[1, 2, 3, 4], duplicates='drop')
    print("✅ 分位数分箱成功")
except ValueError as e:
    print(f"⚠️ 分位数分箱失败: {e}")
    print("切换到自定义分箱...")
    # Custom bins. The last edge of every scale is open-ended: with a finite
    # cap (365 days / 1000 orders / 100000 spend) pd.cut returns NaN above the
    # cap, and the dropna below would silently delete exactly the
    # longest-inactive and highest-spending customers.
    open_end = float('inf')
    r_bins = [0, 30, 90, 180, open_end]
    r_labels = [4, 3, 2, 1]
    rfm['R_Score'] = pd.cut(rfm['Recency'], bins=r_bins, labels=r_labels, right=False)
    # Frequency uses right-closed bins: (0,1], (1,3], (3,8], (8,inf).
    # With left-closed bins the first bin [0,1) could never contain a customer
    # (every customer has at least one invoice), so score 1 was unreachable.
    f_bins = [0, 1, 3, 8, open_end]
    f_labels = [1, 2, 3, 4]
    rfm['F_Score'] = pd.cut(rfm['Frequency'], bins=f_bins, labels=f_labels, right=True)
    m_bins = [0, 100, 500, 2000, open_end]
    m_labels = [1, 2, 3, 4]
    rfm['M_Score'] = pd.cut(rfm['Monetary'], bins=m_bins, labels=m_labels, right=False)
    print("✅ 自定义分箱成功")

# Drop rows that still could not be scored; copy so the integer casts below
# write to an independent frame rather than a filtered slice.
rfm = rfm.dropna(subset=['R_Score', 'F_Score', 'M_Score']).copy()

# Convert the categorical scores to plain integers
rfm['R_Score'] = rfm['R_Score'].astype(int)
rfm['F_Score'] = rfm['F_Score'].astype(int)
rfm['M_Score'] = rfm['M_Score'].astype(int)

print(f"\n得分分布:")
print(f" R_Score: {rfm['R_Score'].value_counts().sort_index().to_dict()}")
print(f" F_Score: {rfm['F_Score'].value_counts().sort_index().to_dict()}")
print(f" M_Score: {rfm['M_Score'].value_counts().sort_index().to_dict()}")
# ============================================================
# 4. 用户分层
# ============================================================
def rfm_segment(row):
    """Map one customer's R/F/M scores to a segment label.

    Rules, checked in order:
      * R >= 3, F >= 3 and M >= 3  -> '高价值用户'
      * R <= 2 and F >= 3          -> '需唤醒用户'
      * R >= 3 and F <= 2          -> '潜力用户'
      * anything else              -> '流失风险用户'

    NOTE(review): customers with R >= 3 and F >= 3 but M <= 2 (recent and
    frequent, low spend) match none of the first three rules and are labelled
    '流失风险用户' - confirm this is the intended treatment.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing ``R_Score``,
            ``F_Score`` and ``M_Score``.

    Returns:
        The segment label as a string.
    """
    recent = row['R_Score'] >= 3
    frequent = row['F_Score'] >= 3

    if recent and frequent and row['M_Score'] >= 3:
        return '高价值用户'
    if row['R_Score'] <= 2 and frequent:
        return '需唤醒用户'
    if recent and row['F_Score'] <= 2:
        return '潜力用户'
    return '流失风险用户'
# Label every customer with a segment based on their R/F/M scores.
rfm['Segment'] = rfm.apply(rfm_segment, axis=1)
# ============================================================
# 5. GMV contribution analysis
# ============================================================
print("\n" + "=" * 60)
print("GMV贡献分析")
print("=" * 60)
# Per-segment statistics of Monetary: customer count, total, mean, median
segment_stats = (
    rfm.groupby('Segment')['Monetary']
    .agg(['count', 'sum', 'mean', 'median'])
    .round(2)
)
segment_stats.columns = ['用户数', 'GMV总额', '人均GMV', '中位数GMV']
segment_stats = segment_stats.sort_values('GMV总额', ascending=False)
# Shares of customers and of GMV, in percent
segment_stats['用户占比(%)'] = (
    segment_stats['用户数'].div(segment_stats['用户数'].sum()).mul(100).round(2)
)
segment_stats['GMV占比(%)'] = (
    segment_stats['GMV总额'].div(segment_stats['GMV总额'].sum()).mul(100).round(2)
)
print("\n【各层级GMV贡献统计】")
print(segment_stats)
# How many times the overall average a high-value customer spends
if '高价值用户' in segment_stats.index:
    high_value_gmv = segment_stats.loc['高价值用户', '人均GMV']
else:
    high_value_gmv = 0
avg_gmv = rfm['Monetary'].mean()
if avg_gmv > 0:
    multiplier = high_value_gmv / avg_gmv
else:
    multiplier = 0
print(f"\n【贡献分析】")
print(f" 高价值用户人均消费是平均水平的 {multiplier:.1f} 倍")
# Pareto check: GMV share of the top 20% of customers by spend
top_20_pct = rfm['Monetary'].nlargest(int(len(rfm) * 0.2)).sum()
top_20_ratio = top_20_pct / rfm['Monetary'].sum() * 100
print(f" 前20%的用户贡献了 {top_20_ratio:.1f}% 的GMV")
# ============================================================
# 6. 可视化
# ============================================================
fig = plt.figure(figsize=(16, 10))

# Chart 1: share of customers vs. share of GMV per segment (grouped bars)
ax1 = fig.add_subplot(2, 2, 1)
segments = segment_stats.index.tolist()
user_pct = segment_stats['用户占比(%)'].values
gmv_pct = segment_stats['GMV占比(%)'].values
x = np.arange(len(segments))
width = 0.35
bars1 = ax1.bar(x - width/2, user_pct, width, label='用户占比 (%)', color='steelblue', alpha=0.8)
bars2 = ax1.bar(x + width/2, gmv_pct, width, label='GMV贡献 (%)', color='coral', alpha=0.8)
ax1.set_xlabel('用户层级')
ax1.set_ylabel('百分比 (%)')
ax1.set_title('用户占比 vs GMV贡献对比', fontsize=14)
ax1.set_xticks(x)
ax1.set_xticklabels(segments, rotation=45, ha='right')
ax1.legend()
# Value labels just above each bar
for bar, val in zip(bars1, user_pct):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f'{val:.1f}%', ha='center', va='bottom', fontsize=9)
for bar, val in zip(bars2, gmv_pct):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f'{val:.1f}%', ha='center', va='bottom', fontsize=9)

# Chart 2: GMV share per segment (pie)
ax2 = fig.add_subplot(2, 2, 2)
colors = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#96ceb4']
ax2.pie(segment_stats['GMV占比(%)'], labels=segment_stats.index, autopct='%1.1f%%', colors=colors)
ax2.set_title('各层级GMV贡献占比', fontsize=14)

# Chart 3: average GMV per customer in each segment
ax3 = fig.add_subplot(2, 2, 3)
avg_gmv_values = segment_stats['人均GMV']
# Plot at the numeric positions and fix the ticks before labelling them, as
# chart 1 does; set_xticklabels is only meant to be used after set_xticks.
bars = ax3.bar(x, avg_gmv_values, color=colors, alpha=0.8)
ax3.set_xlabel('用户层级')
ax3.set_ylabel('人均GMV (£)')
ax3.set_title('各层级人均GMV对比', fontsize=14)
ax3.set_xticks(x)
ax3.set_xticklabels(segments, rotation=45, ha='right')
for bar, val in zip(bars, avg_gmv_values):
    ax3.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 5, f'£{val:.0f}', ha='center', va='bottom', fontsize=9)

# Chart 4: cumulative GMV share, customers ordered from highest to lowest spend
ax4 = fig.add_subplot(2, 2, 4)
rfm_sorted = rfm.sort_values('Monetary', ascending=False)
rfm_sorted['cumulative_gmv'] = rfm_sorted['Monetary'].cumsum()
rfm_sorted['cumulative_pct'] = rfm_sorted['cumulative_gmv'] / rfm_sorted['Monetary'].sum() * 100
rfm_sorted['user_pct'] = (np.arange(len(rfm_sorted)) + 1) / len(rfm_sorted) * 100
ax4.plot(rfm_sorted['user_pct'], rfm_sorted['cumulative_pct'], 'b-', linewidth=2, label='实际曲线')
ax4.plot([0, 100], [0, 100], 'r--', linewidth=1, label='完全平等线')
# Shade the gap between the actual curve and the equality diagonal
ax4.fill_between(rfm_sorted['user_pct'], rfm_sorted['cumulative_pct'], rfm_sorted['user_pct'], alpha=0.3)
ax4.set_xlabel('用户累计百分比 (%)')
ax4.set_ylabel('GMV累计百分比 (%)')
ax4.set_title('累积GMV贡献曲线(洛伦兹曲线)', fontsize=14)
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('GMV_contribution_analysis.png', dpi=150, bbox_inches='tight')
plt.show()
print("\n✅ 图表已保存: GMV_contribution_analysis.png")
# ============================================================
# 7. 业务建议
# ============================================================
print("\n" + "=" * 60)
print("业务建议报告")
print("=" * 60)
# Report the segments in a fixed order, skipping any that have no customers.
for segment in ['高价值用户', '需唤醒用户', '潜力用户', '流失风险用户']:
    if segment in segment_stats.index:
        seg_data = rfm[rfm['Segment'] == segment]
        print(f"\n【{segment}】")
        print(f" ├─ 用户数: {len(seg_data):,}人 ({segment_stats.loc[segment, '用户占比(%)']:.1f}%)")
        print(f" ├─ GMV贡献: £{seg_data['Monetary'].sum():,.2f} ({segment_stats.loc[segment, 'GMV占比(%)']:.1f}%)")
        print(f" ├─ 人均GMV: £{seg_data['Monetary'].mean():.2f}")
        print(f" └─ 平均购买次数: {seg_data['Frequency'].mean():.1f}次")
# ============================================================
# 8. 保存结果
# ============================================================
# Write the per-customer RFM table and the per-segment summary to the working
# directory; the row index is written too (to_csv default).
rfm.to_csv(path_or_buf='RFM_analysis_complete.csv')
segment_stats.to_csv(path_or_buf='GMV_contribution_stats.csv')
print("\n✅ 结果已保存")
print(" - RFM_analysis_complete.csv")
print(" - GMV_contribution_stats.csv")

