【数据挖掘】用户行为分析中的应用与模型构建

文章目录

一、数据挖掘与用户行为分析概述

数据挖掘是从大量数据中提取隐含的、先前未知的、有价值的信息和知识的过程。在用户行为分析领域,数据挖掘技术可以帮助企业:

  1. 理解用户行为模式
  2. 预测用户未来行为
  3. 发现用户群体特征
  4. 优化产品和服务策略

用户行为数据通常包括:浏览记录、点击流、购买历史、搜索查询、停留时长等。

二、用户行为分析模型构建流程

2.1 数据准备与预处理

python
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 模拟用户行为数据生成
def generate_user_behavior_data(n_users=1000, n_days=30):
    """Generate a simulated user-behavior dataset.

    Parameters
    ----------
    n_users : int
        Number of distinct users to simulate.
    n_days : int
        Number of consecutive daily records generated per user.

    Returns
    -------
    pd.DataFrame
        One row per user per day with demographic attributes,
        engagement metrics and conversion/revenue outcomes.
    """
    np.random.seed(42)

    # Stable user identifiers.
    user_ids = [f'user_{i}' for i in range(n_users)]

    data = []
    for user_id in user_ids:
        # Registration time for this user (30-365 days ago).
        reg_date = datetime.now() - timedelta(days=np.random.randint(30, 365))

        # Fix: demographic attributes are per-user constants, so draw them
        # once per user. The original sampled them inside the day loop,
        # letting the same user's age group and gender change every day.
        age_group = np.random.choice(['18-25', '26-35', '36-45', '46-55', '56+'], 
                                    p=[0.2, 0.35, 0.25, 0.15, 0.05])
        gender = np.random.choice(['M', 'F'], p=[0.48, 0.52])

        # Generate n_days of daily behavior records for this user.
        for day_offset in range(n_days):
            date = reg_date + timedelta(days=day_offset)

            # Daily engagement metrics.
            sessions = np.random.poisson(3)  # session count
            pageviews = np.random.poisson(15)  # page views
            avg_session_duration = np.random.uniform(30, 600)  # seconds
            bounce_rate = np.random.beta(2, 5)  # bounce rate in [0, 1)

            # Conversion outcome: ~15% daily conversion probability;
            # revenue accrues only on converting days.
            conversion = 1 if np.random.random() < 0.15 else 0
            revenue = np.random.exponential(100) if conversion else 0

            data.append({
                'user_id': user_id,
                'date': date,
                'age_group': age_group,
                'gender': gender,
                'sessions': sessions,
                'pageviews': pageviews,
                'avg_session_duration': avg_session_duration,
                'bounce_rate': bounce_rate,
                'conversion': conversion,
                'revenue': revenue
            })

    return pd.DataFrame(data)

# Generate the simulated dataset (defaults: 1000 users x 30 days) and
# print its shape plus a preview of the first rows.
df = generate_user_behavior_data()
print(f"数据形状: {df.shape}")
print(f"数据示例:\n{df.head()}")

# 数据预处理
def preprocess_data(df):
    """Encode categorical fields, standardize numeric metrics, and derive
    calendar features from the raw behavior table.

    Returns a transformed copy; the input frame is left untouched.
    """
    out = df.copy()

    # Categorical encodings: label-encode gender, ordinal-map age buckets.
    gender_encoder = LabelEncoder()
    out['gender_encoded'] = gender_encoder.fit_transform(out['gender'])

    age_mapping = {'18-25': 0, '26-35': 1, '36-45': 2, '46-55': 3, '56+': 4}
    out['age_group_encoded'] = out['age_group'].map(age_mapping)

    # Standardize the engagement metrics in place (zero mean, unit variance).
    numerical_cols = ['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']
    out[numerical_cols] = StandardScaler().fit_transform(out[numerical_cols])

    # Calendar-derived features.
    out['day_of_week'] = out['date'].dt.dayofweek
    out['month'] = out['date'].dt.month
    out['is_weekend'] = out['day_of_week'].isin([5, 6]).astype(int)

    return out

# Run preprocessing and report the resulting (wider) shape.
df_processed = preprocess_data(df)
print(f"\n预处理后的数据形状: {df_processed.shape}")

2.2 探索性数据分析

python
def exploratory_data_analysis(df):
    """Run a quick EDA pass: summary statistics, four diagnostic plots in a
    2x2 grid, and a handful of headline metrics printed to stdout."""

    # 1. Summary statistics.
    print("基本统计信息:")
    print(df.describe())

    # 2. Diagnostic plots — name each axis handle for readability.
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    ax_cr, ax_corr = axes[0, 0], axes[0, 1]
    ax_age, ax_dur = axes[1, 0], axes[1, 1]

    # Per-user conversion-rate histogram.
    per_user_cr = df.groupby('user_id')['conversion'].mean()
    ax_cr.hist(per_user_cr, bins=20, edgecolor='black', alpha=0.7)
    ax_cr.set_title('用户转化率分布')
    ax_cr.set_xlabel('转化率')
    ax_cr.set_ylabel('用户数')

    # Correlation heatmap of the engagement metrics.
    corr = df[['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']].corr()
    heatmap = ax_corr.imshow(corr, cmap='coolwarm', aspect='auto')
    ax_corr.set_title('行为指标相关性热图')
    plt.colorbar(heatmap, ax=ax_corr)

    # Conversion rate per age bucket.
    cr_by_age = df.groupby('age_group')['conversion'].mean()
    ax_age.bar(cr_by_age.index, cr_by_age.values, color='skyblue')
    ax_age.set_title('不同年龄组转化率')
    ax_age.set_xlabel('年龄组')
    ax_age.set_ylabel('转化率')

    # Session-duration histogram.
    ax_dur.hist(df['avg_session_duration'], bins=30, edgecolor='black', alpha=0.7)
    ax_dur.set_title('平均会话时长分布')
    ax_dur.set_xlabel('会话时长(秒)')
    ax_dur.set_ylabel('频次')

    plt.tight_layout()
    plt.show()

    # 3. Headline metrics.
    print(f"\n关键指标:")
    print(f"整体转化率: {df['conversion'].mean():.2%}")
    print(f"总用户数: {df['user_id'].nunique()}")
    print(f"平均页面浏览数: {df['pageviews'].mean():.2f}")
    print(f"平均会话时长: {df['avg_session_duration'].mean():.2f}秒")

# Run the EDA on the raw (unscaled) dataset, not the standardized one.
exploratory_data_analysis(df)

三、核心用户行为分析模型

3.1 用户分群模型(聚类分析)

python
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

def user_segmentation_model(df, n_clusters=4):
    """Segment users with K-Means on their (scaled) engagement metrics.

    Fix: the original body ignored the ``df`` argument and read/mutated the
    module-level ``df_processed`` instead; it now operates on the frame it
    is given (which it annotates in place with a 'user_segment' column).

    Parameters
    ----------
    df : pd.DataFrame
        Preprocessed behavior table (output of ``preprocess_data``).
    n_clusters : int
        Retained for interface compatibility but unused: the cluster count
        actually fitted is chosen by the silhouette score over k = 2..7.

    Returns
    -------
    (pd.DataFrame, KMeans)
        The input frame with an added 'user_segment' column, and the
        fitted final K-Means model.
    """

    # Features used for clustering.
    features = ['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']
    X = df[features]

    # Scan k with the elbow (inertia) and silhouette criteria.
    inertia = []
    silhouette_scores = []
    k_range = range(2, 8)

    for k in k_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X)
        inertia.append(kmeans.inertia_)

        # Silhouette is undefined when all points land in one cluster.
        if len(np.unique(kmeans.labels_)) > 1:
            silhouette_scores.append(silhouette_score(X, kmeans.labels_))
        else:
            silhouette_scores.append(0)

    # Visualize both model-selection criteria.
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    axes[0].plot(k_range, inertia, marker='o')
    axes[0].set_title('肘部法则')
    axes[0].set_xlabel('聚类数')
    axes[0].set_ylabel('内聚度')

    axes[1].plot(k_range, silhouette_scores, marker='o', color='orange')
    axes[1].set_title('轮廓系数')
    axes[1].set_xlabel('聚类数')
    axes[1].set_ylabel('轮廓系数')

    plt.tight_layout()
    plt.show()

    # Pick the k with the best silhouette score.
    optimal_k = k_range[np.argmax(silhouette_scores)]
    print(f"最佳聚类数: {optimal_k}")

    # Fit the final model and attach segment labels to the input frame.
    kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    df['user_segment'] = kmeans_final.fit_predict(X)

    # Per-segment behavior profile.
    segment_analysis = df.groupby('user_segment').agg({
        'sessions': 'mean',
        'pageviews': 'mean',
        'avg_session_duration': 'mean',
        'bounce_rate': 'mean',
        'conversion': 'mean',
        'user_id': 'nunique'
    }).round(3)

    segment_analysis.columns = ['平均会话数', '平均页面浏览', '平均会话时长', 
                               '平均跳出率', '转化率', '用户数']

    print("\n用户分群分析:")
    print(segment_analysis)

    # 2-D PCA projection for visualization only.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X)

    plt.figure(figsize=(10, 6))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], 
                         c=df['user_segment'], 
                         cmap='viridis', alpha=0.6)
    plt.colorbar(scatter, label='用户分群')
    plt.title('用户分群可视化(PCA降维)')
    plt.xlabel('主成分1')
    plt.ylabel('主成分2')
    plt.show()

    return df, kmeans_final

# Segment the preprocessed users; the returned frame carries 'user_segment'.
df_segmented, kmeans_model = user_segmentation_model(df_processed)

3.2 用户价值预测模型(回归分析)

python
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def user_value_prediction_model(df):
    """Fit a random-forest regressor predicting per-row revenue, report
    error metrics and feature importances, plot predicted vs. actual, and
    run a 5-fold cross-validation stability check.

    Returns the fitted regressor and the importance table.
    """

    # Feature matrix and revenue target.
    feature_cols = ['sessions', 'pageviews', 'avg_session_duration', 
                   'bounce_rate', 'gender_encoded', 'age_group_encoded',
                   'day_of_week', 'is_weekend']

    X = df[feature_cols]
    y = df['revenue']

    # Hold out 20% for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the forest.
    regressor = RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        random_state=42
    )
    regressor.fit(X_train, y_train)

    y_pred = regressor.predict(X_test)

    # Error metrics on the held-out split.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("模型评估指标:")
    print(f"均方误差(MSE): {mse:.2f}")
    print(f"均方根误差(RMSE): {rmse:.2f}")
    print(f"平均绝对误差(MAE): {mae:.2f}")
    print(f"R²分数: {r2:.2f}")

    # Importance ranking, highest first.
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': regressor.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n特征重要性排序:")
    print(feature_importance)

    # Predicted-vs-actual scatter with the identity reference line.
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.6)
    plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--', lw=2)
    plt.xlabel('实际价值')
    plt.ylabel('预测价值')
    plt.title('用户价值预测 vs 实际值')
    plt.show()

    # 5-fold CV over the full data as a stability check.
    cv_scores = cross_val_score(regressor, X, y, cv=5, scoring='r2')
    print(f"\n交叉验证R²分数: {cv_scores.mean():.2f} (±{cv_scores.std():.2f})")

    return regressor, feature_importance

# Train the revenue-prediction model on the preprocessed table.
rf_model, feature_importance = user_value_prediction_model(df_processed)

3.3 用户流失预测模型(分类)

python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

def churn_prediction_model(df):
    """Train a random-forest classifier that predicts user churn.

    A user is labelled churned when their last recorded activity is more
    than 3 days before the most recent date in the data (the original
    comment claimed "3 consecutive inactive days", which is not what the
    code computes).

    Fix: the function previously wrote a helper 'date_str' column into the
    caller's frame; it now works on a copy and leaves the input untouched.

    Returns
    -------
    (RandomForestClassifier, pd.DataFrame)
        The fitted classifier and the per-row table carrying 'is_churned'.
    """
    # Work on a copy so the caller's frame is not mutated.
    df = df.copy()

    # Churn label: last activity > 3 days before the analysis date.
    df['date_str'] = df['date'].dt.date
    user_last_activity = df.groupby('user_id')['date_str'].max().reset_index()

    # Use the most recent day present in the data as the analysis date.
    analysis_date = df['date_str'].max()
    churn_threshold = analysis_date - timedelta(days=3)

    user_last_activity['is_churned'] = (
        user_last_activity['date_str'] < churn_threshold
    ).astype(int)

    # Attach the per-user label to every row of that user.
    df_with_churn = pd.merge(
        df,
        user_last_activity[['user_id', 'is_churned']],
        on='user_id',
        how='left'
    )

    # NOTE(review): rows from one user can land in both train and test, so
    # the reported scores are optimistic; a per-user split would be stricter.
    feature_cols = ['sessions', 'pageviews', 'avg_session_duration', 
                   'bounce_rate', 'gender_encoded', 'age_group_encoded',
                   'day_of_week', 'is_weekend']

    X = df_with_churn[feature_cols]
    y = df_with_churn['is_churned']

    # Stratified 80/20 split preserves the churn ratio in both halves.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # class_weight='balanced' compensates for label imbalance.
    clf_model = RandomForestClassifier(
        n_estimators=100,
        max_depth=8,
        class_weight='balanced',
        random_state=42
    )

    clf_model.fit(X_train, y_train)

    # Hard predictions and churn probabilities.
    y_pred = clf_model.predict(X_test)
    y_pred_proba = clf_model.predict_proba(X_test)[:, 1]

    print("分类报告:")
    print(classification_report(y_test, y_pred))

    print(f"ROC AUC分数: {roc_auc_score(y_test, y_pred_proba):.3f}")

    # Confusion matrix.
    cm = confusion_matrix(y_test, y_pred)
    print("\n混淆矩阵:")
    print(cm)

    # ROC curve.
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

    plt.figure(figsize=(10, 6))
    plt.plot(fpr, tpr, label=f'ROC曲线 (AUC = {roc_auc_score(y_test, y_pred_proba):.3f})')
    plt.plot([0, 1], [0, 1], 'k--', label='随机猜测')
    plt.xlabel('假正率')
    plt.ylabel('真正率')
    plt.title('ROC曲线 - 用户流失预测')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    # Feature importances, highest first.
    feature_importance = pd.DataFrame({
        'feature': feature_cols,
        'importance': clf_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n流失预测特征重要性:")
    print(feature_importance)

    # Horizontal bar chart of importances.
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['feature'][::-1], 
             feature_importance['importance'][::-1])
    plt.xlabel('重要性')
    plt.title('用户流失预测特征重要性')
    plt.tight_layout()
    plt.show()

    return clf_model, df_with_churn

# Train the churn model; df_with_churn carries the per-row 'is_churned' label.
clf_model, df_with_churn = churn_prediction_model(df_processed)

四、模型应用与业务洞察

python
def generate_business_insights(df_segmented, df_with_churn, clf_model):
    """Print a short business-insight report derived from the segmented
    behavior table and the churn-labelled table.

    Parameters
    ----------
    df_segmented : pd.DataFrame
        Behavior rows carrying a 'user_segment' column.
    df_with_churn : pd.DataFrame
        Behavior rows carrying an 'is_churned' column.
    clf_model : object
        Unused here; kept for interface compatibility with the caller.
    """

    insights = []

    # 1. Segment profiles: mean conversion, mean revenue, audience size.
    segment_stats = df_segmented.groupby('user_segment').agg({
        'conversion': 'mean',
        'revenue': 'mean',
        'user_id': 'nunique'
    }).round(3)

    high_value_segment = segment_stats['revenue'].idxmax()
    insights.append(f"1. 最高价值用户群体: 分群{high_value_segment}, "
                   f"平均收入: ${segment_stats.loc[high_value_segment, 'revenue']:.2f}")

    # 2. Churn rate across distinct users.
    high_risk_users = df_with_churn[
        df_with_churn['is_churned'] == 1
    ]['user_id'].nunique()

    total_users = df_with_churn['user_id'].nunique()
    churn_rate = high_risk_users / total_users

    insights.append(f"2. 用户流失率: {churn_rate:.2%} ({high_risk_users}/{total_users}用户)")

    # 3. Strongest (absolute) correlate of conversion.
    conversion_correlation = df_segmented[['sessions', 'pageviews', 
                                          'avg_session_duration', 'conversion']].corr()

    strongest_predictor = conversion_correlation['conversion'].abs().sort_values(
        ascending=False
    ).index[1]  # index[0] is conversion's self-correlation of 1.0

    insights.append(f"3. 转化最强预测因子: {strongest_predictor}, "
                   f"相关性: {conversion_correlation.loc[strongest_predictor, 'conversion']:.3f}")

    # 4. Segment with the weakest conversion rate.
    low_conversion_segment = segment_stats['conversion'].idxmin()
    insights.append(f"4. 优化建议: 重点关注分群{low_conversion_segment}的用户, "
                   f"当前转化率仅{segment_stats.loc[low_conversion_segment, 'conversion']:.2%}")

    # 5. Hour of day with the best conversion rate.
    avg_session_by_hour = df_segmented.groupby(
        df_segmented['date'].dt.hour
    )['conversion'].mean()

    best_hour = avg_session_by_hour.idxmax()
    insights.append(f"5. 最佳营销时机: 每天{best_hour}:00, "
                   f"此时转化率最高: {avg_session_by_hour.max():.2%}")

    print("业务洞察报告:")
    print("="*50)
    # Fix: the insights are already numbered, so the unused enumerate
    # counter from the original loop is dropped.
    for insight in insights:
        print(insight)
    print("="*50)

# Print the consolidated insight report for the fitted models.
generate_business_insights(df_segmented, df_with_churn, clf_model)

五、模型部署与监控

python
class UserBehaviorModelPipeline:
    """End-to-end pipeline bundling preprocessing with the segmentation,
    value-prediction and churn models.

    Fix: ``_preprocess_data`` now derives 'age_group_encoded' from the raw
    'age_group' column. Previously ``train_pipeline`` selected that column
    although nothing ever created it, so training on the raw frame raised
    a KeyError.
    """

    # Ordinal age encoding shared by training and prediction (mirrors the
    # age_mapping used in the standalone preprocess_data()).
    AGE_MAPPING = {'18-25': 0, '26-35': 1, '36-45': 2, '46-55': 3, '56+': 4}

    def __init__(self):
        self.models = {}                 # model name -> fitted estimator
        self.scaler = StandardScaler()   # fitted during train_pipeline
        self.label_encoders = {}         # column name -> fitted LabelEncoder

    def train_pipeline(self, df):
        """Train all three models on a raw behavior frame."""
        # Shared preprocessing step.
        df_processed = self._preprocess_data(df)

        # Segmentation model over the scaled engagement metrics.
        print("训练用户分群模型...")
        kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
        features = ['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']
        self.models['segmentation'] = kmeans.fit(df_processed[features])

        # Revenue regression.
        print("训练用户价值预测模型...")
        rf_regressor = RandomForestRegressor(n_estimators=50, random_state=42)
        X_reg = df_processed[features + ['gender_encoded', 'age_group_encoded']]
        y_reg = df_processed['revenue']
        self.models['value_prediction'] = rf_regressor.fit(X_reg, y_reg)

        # Churn classification with a simulated low-activity proxy label.
        print("训练用户流失预测模型...")
        rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42)
        X_clf = df_processed[features + ['gender_encoded', 'age_group_encoded']]
        y_clf = (df_processed['sessions'] < df_processed['sessions'].median()).astype(int)
        self.models['churn_prediction'] = rf_classifier.fit(X_clf, y_clf)

        print("模型训练完成!")

    def _preprocess_data(self, df):
        """Encode categoricals and fit-scale the numeric metrics."""
        df_processed = df.copy()

        # Label-encode gender and remember the encoder for predict().
        if 'gender' in df.columns:
            le_gender = LabelEncoder()
            df_processed['gender_encoded'] = le_gender.fit_transform(df_processed['gender'])
            self.label_encoders['gender'] = le_gender

        # Fix: derive the ordinal age encoding the downstream models expect.
        if 'age_group' in df.columns:
            df_processed['age_group_encoded'] = df_processed['age_group'].map(self.AGE_MAPPING)

        # Fit-scale the numeric engagement metrics.
        numerical_cols = ['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']
        df_processed[numerical_cols] = self.scaler.fit_transform(df_processed[numerical_cols])

        return df_processed

    def predict(self, new_data):
        """Score new users with every trained model; returns a frame with
        segment, predicted value and churn probability per row."""
        new_data_processed = new_data.copy()

        # Reuse the encoder fitted during training.
        if 'gender' in new_data.columns:
            new_data_processed['gender_encoded'] = self.label_encoders['gender'].transform(
                new_data['gender']
            )

        # Accept either a pre-encoded column or raw 'age_group' labels.
        if 'age_group_encoded' not in new_data_processed.columns and 'age_group' in new_data.columns:
            new_data_processed['age_group_encoded'] = new_data['age_group'].map(self.AGE_MAPPING)

        # Apply the scaler fitted at training time (no refit).
        numerical_cols = ['sessions', 'pageviews', 'avg_session_duration', 'bounce_rate']
        new_data_processed[numerical_cols] = self.scaler.transform(
            new_data[numerical_cols]
        )

        features = numerical_cols + ['gender_encoded', 'age_group_encoded']

        predictions = {
            'user_segment': self.models['segmentation'].predict(
                new_data_processed[numerical_cols]
            ),
            'predicted_value': self.models['value_prediction'].predict(
                new_data_processed[features]
            ),
            'churn_probability': self.models['churn_prediction'].predict_proba(
                new_data_processed[features]
            )[:, 1]
        }

        return pd.DataFrame(predictions)

# Build and train the full pipeline on the raw simulated data.
pipeline = UserBehaviorModelPipeline()
pipeline.train_pipeline(df)

# Hand-crafted new users for scoring; 'age_group_encoded' is supplied
# pre-encoded while 'gender' is raw and encoded inside predict().
new_users = pd.DataFrame({
    'user_id': ['new_user_1', 'new_user_2', 'new_user_3'],
    'sessions': [5, 2, 8],
    'pageviews': [25, 10, 40],
    'avg_session_duration': [300, 120, 450],
    'bounce_rate': [0.3, 0.6, 0.2],
    'gender': ['M', 'F', 'M'],
    'age_group_encoded': [1, 2, 1]
})

# Score the new users and display segment / value / churn predictions.
predictions = pipeline.predict(new_users)
print("\n新用户预测结果:")
print(predictions)
相关推荐
艾莉丝努力练剑1 小时前
【Python基础:语法第二课】Python 流程控制详解:条件语句 + 循环语句 + 人生重开模拟器实战
人工智能·爬虫·python·pycharm
智链RFID1 小时前
信创RFID:涉密数据共享的“安全密钥”
网络·人工智能·安全
lisw051 小时前
社区数据仓库的可持续连接性!
大数据·数据仓库·人工智能·机器学习
大模型真好玩1 小时前
Chatbox支持接入LangGraph智能体?一切都靠Trae Solo!
人工智能·agent·trae
智海观潮1 小时前
AIGC、Agent、MCP、A2A和AG-UI促进AI从基础能力到协同生态演进
人工智能·chatgpt·aigc·mcp
L***一1 小时前
数字化时代中专生职业能力提升路径探析:聚焦数据分析类认证
数据挖掘·数据分析
棒棒的皮皮1 小时前
【OpenCV】Python图像处理之开发环境搭建
人工智能·python·opencv·计算机视觉
mingo_敏1 小时前
OpenCV中Blob检测的全面解析与实战技巧
人工智能·opencv·计算机视觉