文章目录
-
- 一、数据挖掘与用户行为分析概述
- 二、用户行为分析模型构建流程
-
- [2.1 数据准备与预处理](#21-数据准备与预处理)
- [2.2 探索性数据分析](#22-探索性数据分析)
- 三、核心用户行为分析模型
-
- [3.1 用户分群模型(聚类分析)](#31-用户分群模型聚类分析)
- [3.2 用户价值预测模型(回归分析)](#32-用户价值预测模型回归分析)
- [3.3 用户流失预测模型(分类)](#33-用户流失预测模型分类)
- 四、模型应用与业务洞察
- 五、模型部署与监控
一、数据挖掘与用户行为分析概述
数据挖掘是从大量数据中提取隐含的、先前未知的、有价值的信息和知识的过程。在用户行为分析领域,数据挖掘技术可以帮助企业:
- 理解用户行为模式
- 预测用户未来行为
- 发现用户群体特征
- 优化产品和服务策略
用户行为数据通常包括:浏览记录、点击流、购买历史、搜索查询、停留时长等。
二、用户行为分析模型构建流程
2.1 数据准备与预处理
python
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
# 模拟用户行为数据生成
def generate_user_behavior_data(n_users=1000, n_days=30):
    """Generate a simulated user-behavior dataset.

    Args:
        n_users: number of distinct users to simulate.
        n_days: number of consecutive daily records per user.

    Returns:
        pd.DataFrame with one row per user per day, containing the user's
        demographic attributes and that day's behavior metrics.
    """
    np.random.seed(42)  # fixed seed so repeated runs produce identical data
    # Generate user IDs
    user_ids = [f'user_{i}' for i in range(n_users)]
    data = []
    for user_id in user_ids:
        # Registration date: 30-365 days in the past.
        reg_date = datetime.now() - timedelta(days=np.random.randint(30, 365))
        # FIX: demographic attributes are per-user constants. The original
        # re-sampled them inside the daily loop, so one user could carry
        # several different genders / age groups across rows.
        age_group = np.random.choice(['18-25', '26-35', '36-45', '46-55', '56+'],
                                     p=[0.2, 0.35, 0.25, 0.15, 0.05])
        gender = np.random.choice(['M', 'F'], p=[0.48, 0.52])
        # Generate n_days of daily behavior records.
        for day_offset in range(n_days):
            date = reg_date + timedelta(days=day_offset)
            # Daily behavior metrics.
            sessions = np.random.poisson(3)                    # session count
            pageviews = np.random.poisson(15)                  # page views
            avg_session_duration = np.random.uniform(30, 600)  # seconds
            bounce_rate = np.random.beta(2, 5)                 # in [0, 1)
            # Conversion on ~15% of user-days; revenue only when converted.
            conversion = 1 if np.random.random() < 0.15 else 0
            revenue = np.random.exponential(100) if conversion else 0
            data.append({
                'user_id': user_id,
                'date': date,
                'age_group': age_group,
                'gender': gender,
                'sessions': sessions,
                'pageviews': pageviews,
                'avg_session_duration': avg_session_duration,
                'bounce_rate': bounce_rate,
                'conversion': conversion,
                'revenue': revenue
            })
    return pd.DataFrame(data)
# Script step: build the simulated dataset used by all later sections.
df = generate_user_behavior_data()
print(f"数据形状: {df.shape}")
print(f"数据示例:\n{df.head()}")
# 数据预处理
def preprocess_data(df):
    """Encode categorical columns, scale numeric metrics, add time features.

    Returns a new DataFrame; the input is left untouched.
    """
    out = df.copy()
    # Categorical encodings: label-encode gender, ordinal-encode age group.
    le = LabelEncoder()
    out['gender_encoded'] = le.fit_transform(out['gender'])
    age_levels = {'18-25': 0, '26-35': 1, '36-45': 2, '46-55': 3, '56+': 4}
    out['age_group_encoded'] = out['age_group'].map(age_levels)
    # Standardize the behavioral metrics in place (zero mean, unit variance).
    metric_cols = ['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']
    out[metric_cols] = StandardScaler().fit_transform(out[metric_cols])
    # Calendar-derived features from the record date.
    out['day_of_week'] = out['date'].dt.dayofweek
    out['month'] = out['date'].dt.month
    out['is_weekend'] = out['day_of_week'].isin([5, 6]).astype(int)
    return out
# Script step: run preprocessing once and keep the result for the models below.
df_processed = preprocess_data(df)
print(f"\n预处理后的数据形状: {df_processed.shape}")
2.2 探索性数据分析
python
def exploratory_data_analysis(df):
    """Print summary statistics, draw four diagnostic plots, report KPIs."""
    # 1. Basic descriptive statistics.
    print("基本统计信息:")
    print(df.describe())
    # 2. Four-panel visualization.
    fig, ((ax_cv, ax_corr), (ax_age, ax_dur)) = plt.subplots(2, 2, figsize=(12, 10))
    # Distribution of per-user conversion rates.
    per_user_cv = df.groupby('user_id')['conversion'].mean()
    ax_cv.hist(per_user_cv, bins=20, edgecolor='black', alpha=0.7)
    ax_cv.set_title('用户转化率分布')
    ax_cv.set_xlabel('转化率')
    ax_cv.set_ylabel('用户数')
    # Correlation heat map of the behavioral metrics.
    metric_corr = df[['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']].corr()
    heat = ax_corr.imshow(metric_corr, cmap='coolwarm', aspect='auto')
    ax_corr.set_title('行为指标相关性热图')
    plt.colorbar(heat, ax=ax_corr)
    # Conversion rate broken down by age group.
    cv_by_age = df.groupby('age_group')['conversion'].mean()
    ax_age.bar(cv_by_age.index, cv_by_age.values, color='skyblue')
    ax_age.set_title('不同年龄组转化率')
    ax_age.set_xlabel('年龄组')
    ax_age.set_ylabel('转化率')
    # Distribution of average session duration.
    ax_dur.hist(df['avg_session_duration'], bins=30, edgecolor='black', alpha=0.7)
    ax_dur.set_title('平均会话时长分布')
    ax_dur.set_xlabel('会话时长(秒)')
    ax_dur.set_ylabel('频次')
    plt.tight_layout()
    plt.show()
    # 3. Headline KPIs.
    print(f"\n关键指标:")
    print(f"整体转化率: {df['conversion'].mean():.2%}")
    print(f"总用户数: {df['user_id'].nunique()}")
    print(f"平均页面浏览数: {df['pageviews'].mean():.2f}")
    print(f"平均会话时长: {df['avg_session_duration'].mean():.2f}秒")
exploratory_data_analysis(df)
三、核心用户行为分析模型
3.1 用户分群模型(聚类分析)
python
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
def user_segmentation_model(df, n_clusters=4):
    """Cluster users on their (scaled) behavioral metrics with K-Means.

    Args:
        df: preprocessed DataFrame containing the behavioral feature columns.
        n_clusters: kept for backward compatibility; the effective cluster
            count is selected automatically via the silhouette score.

    Returns:
        (segmented, model): a copy of ``df`` with an added 'user_segment'
        column, and the fitted final KMeans estimator.
    """
    # FIX: the original read/wrote the module-level ``df_processed`` instead
    # of its ``df`` parameter, and mutated that global in place. Work on a
    # local copy of the argument instead.
    segmented = df.copy()
    features = ['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']
    X = segmented[features]
    # Sweep candidate cluster counts for the elbow / silhouette diagnostics.
    inertia = []
    silhouette_scores = []
    k_range = range(2, 8)
    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X)
        inertia.append(kmeans.inertia_)
        # Silhouette score is undefined when only one label is produced.
        if len(np.unique(kmeans.labels_)) > 1:
            silhouette_scores.append(silhouette_score(X, kmeans.labels_))
        else:
            silhouette_scores.append(0)
    # Diagnostic plots: elbow curve and silhouette curve.
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].plot(k_range, inertia, marker='o')
    axes[0].set_title('肘部法则')
    axes[0].set_xlabel('聚类数')
    axes[0].set_ylabel('内聚度')
    axes[1].plot(k_range, silhouette_scores, marker='o', color='orange')
    axes[1].set_title('轮廓系数')
    axes[1].set_xlabel('聚类数')
    axes[1].set_ylabel('轮廓系数')
    plt.tight_layout()
    plt.show()
    # Pick the k with the best silhouette score.
    optimal_k = k_range[np.argmax(silhouette_scores)]
    print(f"最佳聚类数: {optimal_k}")
    # Fit the final model and attach the segment labels.
    kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    segmented['user_segment'] = kmeans_final.fit_predict(X)
    # Per-segment behavioral profile.
    segment_analysis = segmented.groupby('user_segment').agg({
        'sessions': 'mean',
        'pageviews': 'mean',
        'avg_session_duration': 'mean',
        'bounce_rate': 'mean',
        'conversion': 'mean',
        'user_id': 'nunique'
    }).round(3)
    segment_analysis.columns = ['平均会话数', '平均页面浏览', '平均会话时长',
                                '平均跳出率', '转化率', '用户数']
    print("\n用户分群分析:")
    print(segment_analysis)
    # 2-D PCA projection to visualize the clusters.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)
    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1],
                          c=segmented['user_segment'],
                          cmap='viridis', alpha=0.6)
    plt.colorbar(scatter, label='用户分群')
    plt.title('用户分群可视化(PCA降维)')
    plt.xlabel('主成分1')
    plt.ylabel('主成分2')
    plt.show()
    return segmented, kmeans_final
df_segmented, kmeans_model = user_segmentation_model(df_processed)
3.2 用户价值预测模型(回归分析)
python
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
def user_value_prediction_model(df):
    """Train a random-forest regressor predicting per-row user revenue.

    Prints evaluation metrics, feature importances and a scatter plot, then
    returns the fitted model and the importance DataFrame.
    """
    # Feature matrix and regression target.
    predictors = ['sessions', 'pageviews', 'avg_session_duration',
                  'bounce_rate', 'gender_encoded', 'age_group_encoded',
                  'day_of_week', 'is_weekend']
    X = df[predictors]
    y = df['revenue']
    # Hold out 20% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    # Fit the random-forest regressor.
    regressor = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42
    )
    regressor.fit(X_train, y_train)
    predicted = regressor.predict(X_test)
    # Hold-out evaluation metrics.
    mse = mean_squared_error(y_test, predicted)
    print("模型评估指标:")
    print(f"均方误差(MSE): {mse:.2f}")
    print(f"均方根误差(RMSE): {np.sqrt(mse):.2f}")
    print(f"平均绝对误差(MAE): {mean_absolute_error(y_test, predicted):.2f}")
    print(f"R²分数: {r2_score(y_test, predicted):.2f}")
    # Rank features by importance.
    feature_importance = pd.DataFrame({
        'feature': predictors,
        'importance': regressor.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\n特征重要性排序:")
    print(feature_importance)
    # Predicted-vs-actual scatter with the identity line.
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, predicted, alpha=0.6)
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
    plt.xlabel('实际价值')
    plt.ylabel('预测价值')
    plt.title('用户价值预测 vs 实际值')
    plt.show()
    # 5-fold cross-validated R² on the full data.
    cv_scores = cross_val_score(regressor, X, y, cv=5, scoring='r2')
    print(f"\n交叉验证R²分数: {cv_scores.mean():.2f} (±{cv_scores.std():.2f})")
    return regressor, feature_importance
rf_model, feature_importance = user_value_prediction_model(df_processed)
3.3 用户流失预测模型(分类)
python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
def churn_prediction_model(df):
    """Train a random-forest classifier for user churn.

    A user is labelled churned when their last recorded activity is more
    than 3 days before the latest date in the dataset.

    Returns:
        (clf_model, df_with_churn): the fitted classifier and the row-level
        DataFrame with the 'is_churned' label attached.

    NOTE(review): row-level behavioral features cover the whole observation
    window, so there is some label leakage — confirm acceptable for a demo.
    """
    # FIX: work on a copy — the original added a 'date_str' column to the
    # caller's DataFrame as a side effect.
    data = df.copy()
    data['date_str'] = data['date'].dt.date
    # Last active day per user.
    user_last_activity = data.groupby('user_id')['date_str'].max().reset_index()
    # Treat the most recent date in the data as the analysis date.
    analysis_date = data['date_str'].max()
    churn_threshold = analysis_date - timedelta(days=3)
    user_last_activity['is_churned'] = (
        user_last_activity['date_str'] < churn_threshold
    ).astype(int)
    # Attach the per-user churn label to every row.
    df_with_churn = pd.merge(
        data,
        user_last_activity[['user_id', 'is_churned']],
        on='user_id',
        how='left'
    )
    # Feature matrix and classification target.
    feature_cols = ['sessions', 'pageviews', 'avg_session_duration',
                    'bounce_rate', 'gender_encoded', 'age_group_encoded',
                    'day_of_week', 'is_weekend']
    X = df_with_churn[feature_cols]
    y = df_with_churn['is_churned']
    # Stratified hold-out split preserves the churn ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # class_weight='balanced' compensates for label imbalance.
    clf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=8,
        class_weight='balanced',
        random_state=42
    )
    clf_model.fit(X_train, y_train)
    # Hard predictions and churn probabilities.
    y_pred = clf_model.predict(X_test)
    y_pred_proba = clf_model.predict_proba(X_test)[:, 1]
    # Evaluation report.
    print("分类报告:")
    print(classification_report(y_test, y_pred))
    print(f"ROC AUC分数: {roc_auc_score(y_test, y_pred_proba):.3f}")
    cm = confusion_matrix(y_test, y_pred)
    print("\n混淆矩阵:")
    print(cm)
    # ROC curve.
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, label=f'ROC曲线 (AUC = {roc_auc_score(y_test, y_pred_proba):.3f})')
    plt.plot([0, 1], [0, 1], 'k--', label='随机猜测')
    plt.xlabel('假正率')
    plt.ylabel('真正率')
    plt.title('ROC曲线 - 用户流失预测')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    # Feature-importance ranking.
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': clf_model.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\n流失预测特征重要性:")
    print(feature_importance)
    # Horizontal bar chart, most important feature on top.
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['feature'][::-1],
             feature_importance['importance'][::-1])
    plt.xlabel('重要性')
    plt.title('用户流失预测特征重要性')
    plt.tight_layout()
    plt.show()
    return clf_model, df_with_churn
clf_model, df_with_churn = churn_prediction_model(df_processed)
四、模型应用与业务洞察
python
def generate_business_insights(df_segmented, df_with_churn, clf_model):
    """Build and print a five-point business-insight report.

    Args:
        df_segmented: DataFrame with 'user_segment' plus behavior columns.
        df_with_churn: DataFrame with a per-row 'is_churned' label.
        clf_model: unused by this function; kept for caller compatibility.

    Returns:
        list[str]: the insight lines.  (FIX: the original printed the report
        but implicitly returned None, making it unusable programmatically.)
    """
    insights = []
    # 1. Segment-level value profile.
    segment_stats = df_segmented.groupby('user_segment').agg({
        'conversion': 'mean',
        'revenue': 'mean',
        'user_id': 'nunique'
    }).round(3)
    high_value_segment = segment_stats['revenue'].idxmax()
    insights.append(f"1. 最高价值用户群体: 分群{high_value_segment}, "
                    f"平均收入: ${segment_stats.loc[high_value_segment, 'revenue']:.2f}")
    # 2. Churn risk: share of users flagged as churned.
    high_risk_users = df_with_churn[
        df_with_churn['is_churned'] == 1
    ]['user_id'].nunique()
    total_users = df_with_churn['user_id'].nunique()
    churn_rate = high_risk_users / total_users
    insights.append(f"2. 用户流失率: {churn_rate:.2%} ({high_risk_users}/{total_users}用户)")
    # 3. Strongest conversion correlate among the behavior metrics.
    conversion_correlation = df_segmented[['sessions', 'pageviews',
                                           'avg_session_duration', 'conversion']].corr()
    strongest_predictor = conversion_correlation['conversion'].abs().sort_values(
        ascending=False
    ).index[1]  # index[0] is conversion's self-correlation
    insights.append(f"3. 转化最强预测因子: {strongest_predictor}, "
                    f"相关性: {conversion_correlation.loc[strongest_predictor, 'conversion']:.3f}")
    # 4. Segment to target for optimization.
    low_conversion_segment = segment_stats['conversion'].idxmin()
    insights.append(f"4. 优化建议: 重点关注分群{low_conversion_segment}的用户, "
                    f"当前转化率仅{segment_stats.loc[low_conversion_segment, 'conversion']:.2%}")
    # 5. Best hour of day for marketing, by mean conversion rate.
    avg_session_by_hour = df_segmented.groupby(
        df_segmented['date'].dt.hour
    )['conversion'].mean()
    best_hour = avg_session_by_hour.idxmax()
    insights.append(f"5. 最佳营销时机: 每天{best_hour}:00, "
                    f"此时转化率最高: {avg_session_by_hour.max():.2%}")
    print("业务洞察报告:")
    print("="*50)
    # FIX: dropped the unused enumerate counter from the original loop.
    for insight in insights:
        print(insight)
    print("="*50)
    return insights
generate_business_insights(df_segmented, df_with_churn, clf_model)
五、模型部署与监控
python
class UserBehaviorModelPipeline:
    """End-to-end pipeline: preprocessing + segmentation/value/churn models."""

    def __init__(self):
        self.models = {}           # model name -> fitted estimator
        self.scaler = StandardScaler()
        self.label_encoders = {}   # column name -> fitted LabelEncoder
        # Ordinal mapping for age buckets, shared by training and inference.
        self.age_mapping = {'18-25': 0, '26-35': 1, '36-45': 2, '46-55': 3, '56+': 4}

    def train_pipeline(self, df):
        """Fit all three models on a raw behavior DataFrame."""
        df_processed = self._preprocess_data(df)
        # Train the user-segmentation model.
        print("训练用户分群模型...")
        kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
        features = ['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']
        self.models['segmentation'] = kmeans.fit(df_processed[features])
        # Train the user-value regression model.
        print("训练用户价值预测模型...")
        rf_regressor = RandomForestRegressor(n_estimators=50, random_state=42)
        X_reg = df_processed[features + ['gender_encoded', 'age_group_encoded']]
        y_reg = df_processed['revenue']
        self.models['value_prediction'] = rf_regressor.fit(X_reg, y_reg)
        # Train the churn classifier.
        print("训练用户流失预测模型...")
        rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42)
        X_clf = df_processed[features + ['gender_encoded', 'age_group_encoded']]
        # Proxy churn label for demo purposes: below-median session count.
        y_clf = (df_processed['sessions'] < df_processed['sessions'].median()).astype(int)
        self.models['churn_prediction'] = rf_classifier.fit(X_clf, y_clf)
        print("模型训练完成!")

    def _preprocess_data(self, df):
        """Encode categoricals and standardize numeric features (fits transformers)."""
        df_processed = df.copy()
        # Encode gender and remember the encoder for inference.
        if 'gender' in df.columns:
            le_gender = LabelEncoder()
            df_processed['gender_encoded'] = le_gender.fit_transform(df_processed['gender'])
            self.label_encoders['gender'] = le_gender
        # FIX: the original never created 'age_group_encoded', so
        # train_pipeline raised KeyError on raw data that only carries the
        # 'age_group' column (as the generated df does).
        if 'age_group' in df.columns and 'age_group_encoded' not in df_processed.columns:
            df_processed['age_group_encoded'] = df_processed['age_group'].map(self.age_mapping)
        # Standardize the numeric behavior metrics.
        numerical_cols = ['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']
        df_processed[numerical_cols] = self.scaler.fit_transform(df_processed[numerical_cols])
        return df_processed

    def predict(self, new_data):
        """Score new users; returns a DataFrame with segment/value/churn columns."""
        new_data_processed = new_data.copy()
        if 'gender' in new_data.columns:
            new_data_processed['gender_encoded'] = self.label_encoders['gender'].transform(
                new_data['gender']
            )
        # Accept either a raw 'age_group' column or a precomputed encoding.
        if 'age_group' in new_data.columns and 'age_group_encoded' not in new_data_processed.columns:
            new_data_processed['age_group_encoded'] = new_data_processed['age_group'].map(self.age_mapping)
        numerical_cols = ['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']
        # Reuse the scaler fitted during training (no re-fit at inference).
        new_data_processed[numerical_cols] = self.scaler.transform(
            new_data[numerical_cols]
        )
        features = numerical_cols + ['gender_encoded', 'age_group_encoded']
        predictions = {
            'user_segment': self.models['segmentation'].predict(
                new_data_processed[numerical_cols]
            ),
            'predicted_value': self.models['value_prediction'].predict(
                new_data_processed[features]
            ),
            'churn_probability': self.models['churn_prediction'].predict_proba(
                new_data_processed[features]
            )[:, 1]
        }
        return pd.DataFrame(predictions)
# Script step: build and train the full pipeline on the generated data.
pipeline = UserBehaviorModelPipeline()
pipeline.train_pipeline(df)
# Simulated new-user records; note age_group is supplied pre-encoded here.
new_users = pd.DataFrame({
    'user_id': ['new_user_1', 'new_user_2', 'new_user_3'],
    'sessions': [5, 2, 8],
    'pageviews': [25, 10, 40],
    'avg_session_duration': [300, 120, 450],
    'bounce_rate': [0.3, 0.6, 0.2],
    'gender': ['M', 'F', 'M'],
    'age_group_encoded': [1, 2, 1]
})
# Score the new users with all three models.
predictions = pipeline.predict(new_users)
print("\n新用户预测结果:")
print(predictions)