Preface
I spent two years building recommender systems at a short-video company, so I know this "traffic allocation" game reasonably well. Many creators complain that the platform "throttles" their reach, and newcomers feel they get "no support"; in reality, both complaints trace back to a fairly complex set of algorithms working behind the scenes.
In this post I want to walk through the core mechanics of content platforms from a technical angle: how the recommendation algorithm decides what makes it onto the trending feed, how creator incentives are calculated, how new users are cold-started, and how comment sections are ranked. Hopefully it helps you understand how these platforms actually operate.
Architecture Overview
┌─────────────────────────────────────────────────────────────────────────────┐
│ 内容平台核心系统架构 │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ 用户端 │ │
│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │
│ │ │ 浏览 │ │ 创作 │ │ 互动 │ │ 消费 │ │ │
│ │ │ Feed │ │ 发布 │ │ 点赞评论 │ │ 打赏 │ │ │
│ │ └────┬────┘ └────┬────┘ └────┬────┘ └────┬────┘ │ │
│ └────────┼────────────┼────────────┼────────────┼──────────────────────┘ │
│ │ │ │ │ │
│ ▼ ▼ ▼ ▼ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ 数据采集层 │ │
│ │ - 曝光日志 - 点击日志 - 播放日志 - 互动日志 - 创作日志 │ │
│ └─────────────────────────────┬───────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ 实时计算层 (Flink/Spark) │ │
│ │ │ │
│ │ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
│ │ │ 用户画像 │ │ 内容理解 │ │ 行为统计 │ │ 实时特征 │ │ │
│ │ │ 更新 │ │ 标签提取 │ │ 聚合 │ │ 计算 │ │ │
│ │ └───────────┘ └───────────┘ └───────────┘ └───────────┘ │ │
│ └─────────────────────────────┬───────────────────────────────────────┘ │
│ │ │
│ ┌───────────────────────┼───────────────────────┐ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ │
│ │ 推荐系统 │ │ 激励系统 │ │ 风控系统 │ │
│ │ │ │ │ │ │ │
│ │ - 召回 │ │ - 流量池 │ │ - 反作弊 │ │
│ │ - 排序 │ │ - 收益 │ │ - 审核 │ │
│ │ - 重排 │ │ - 任务 │ │ - 限流 │ │
│ └───────────┘ └───────────┘ └───────────┘ │
│ │ │ │ │
│ └───────────────────────┼───────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────────────────────┐ │
│ │ 存储层 │ │
│ │ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ │ │
│ │ │ MySQL │ │ Redis │ │ HBase │ │ ES │ │ │
│ │ │ 业务数据 │ │ 热数据 │ │ 用户行为 │ │ 内容索引 │ │ │
│ │ └───────────┘ └───────────┘ └───────────┘ └───────────┘ │ │
│ └─────────────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────────┘
Content Recommendation Algorithm
Overall Recommendation Pipeline
用户请求推荐
│
▼
┌─────────────────────────────────────────────────────────────────┐
│ 召回层 (Recall) │
│ │
│ 从海量内容中快速筛选出候选集(几百到几千条) │
│ │
│ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │
│ │ 协同过滤 │ │ 内容召回 │ │ 热门召回 │ │ 关注召回 │ │ 向量召回 │ │
│ │ (I2I) │ │ (Tag) │ │ (Hot) │ │ (Follow)│ │ (ANN) │ │
│ └────┬────┘ └────┬────┘ └────┬────┘ └────┬────┘ └────┬────┘ │
│ └───────────┴───────────┴───────────┴───────────┘ │
│ │ │
└──────────────────────────────┼──────────────────────────────────┘
│ 候选集(~1000条)
▼
┌─────────────────────────────────────────────────────────────────┐
│ 粗排层 (Pre-Ranking) │
│ │
│ 使用轻量级模型快速过滤,减少精排压力 │
│ │
│ - 简单的LR/浅层NN模型 │
│ - 过滤明显不匹配的内容 │
│ - 输出 ~200条 │
│ │
└──────────────────────────────┼──────────────────────────────────┘
│ 粗排结果(~200条)
▼
┌─────────────────────────────────────────────────────────────────┐
│ 精排层 (Ranking) │
│ │
│ 使用复杂深度学习模型精确预估 │
│ │
│ ┌──────────────────────────────────────────────────────────┐ │
│ │ 多目标预估 │ │
│ │ - 点击率 (CTR) │ │
│ │ - 完播率 (Finish Rate) │ │
│ │ - 点赞率 (Like Rate) │ │
│ │ - 评论率 (Comment Rate) │ │
│ │ - 转发率 (Share Rate) │ │
│ │ - 关注率 (Follow Rate) │ │
│ └──────────────────────────────────────────────────────────┘ │
│ │
│ 综合得分 = f(CTR, 完播, 点赞, 评论, 转发, 关注, ...) │
│ │
└──────────────────────────────┼──────────────────────────────────┘
│ 精排结果(~50条)
▼
┌─────────────────────────────────────────────────────────────────┐
│ 重排层 (Re-Ranking) │
│ │
│ 业务规则干预,优化整体体验 │
│ │
│ - 多样性打散(同作者、同类型间隔) │
│ - 新内容/新作者扶持插入 │
│ - 广告插入 │
│ - 运营位/活动内容插入 │
│ - 去重(已看过的内容) │
│ │
└──────────────────────────────┼──────────────────────────────────┘
│
▼
最终推荐列表
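One note on the pre-ranking stage: the diagram describes it as a lightweight LR or shallow NN that cuts roughly 1000 recall candidates down to about 200, but it never reappears in the code below. Here is a minimal sketch of what such a pre-ranker could look like; the `PreRanker` class, its feature layout, and the sigmoid scoring are my own illustrative assumptions, not the platform's actual model.

```python
import numpy as np

class PreRanker:
    """Lightweight pre-ranking: score recall candidates with a tiny linear
    model and keep only the top-K for the expensive ranking model."""

    def __init__(self, weights, bias=0.0):
        # One weight per cheap feature (e.g. recall score, item CTR, freshness).
        self.weights = np.asarray(weights, dtype=np.float32)
        self.bias = bias

    def score(self, feature_matrix):
        # feature_matrix: [num_candidates, num_features]
        logits = feature_matrix @ self.weights + self.bias
        return 1.0 / (1.0 + np.exp(-logits))   # sigmoid -> rough click probability

    def top_k(self, candidates, feature_matrix, k=200):
        scores = self.score(feature_matrix)
        keep = np.argsort(-scores)[:k]          # highest scores first
        return [(candidates[i], float(scores[i])) for i in keep]
```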
Recall Implementation
python
"""
多路召回系统实现
"""
import numpy as np
from collections import defaultdict
import redis
import faiss
class RecallSystem:
"""多路召回系统"""
def __init__(self, redis_client, faiss_index):
self.redis = redis_client
self.faiss_index = faiss_index
def recall(self, user_id, user_profile, context, num_recall=1000):
"""
多路召回
Args:
user_id: 用户ID
user_profile: 用户画像
context: 上下文(时间、地点、设备等)
num_recall: 总召回数量
Returns:
候选内容列表,带召回来源标记
"""
candidates = {}
# 1. 协同过滤召回 (ItemCF/UserCF)
cf_items = self.collaborative_filtering_recall(
user_id, num=300
)
for item_id, score in cf_items:
candidates[item_id] = {
'score': score,
'source': 'cf'
}
# 2. 内容召回(基于用户兴趣标签)
tag_items = self.tag_based_recall(
user_profile['interest_tags'], num=200
)
for item_id, score in tag_items:
if item_id in candidates:
candidates[item_id]['score'] += score * 0.5
else:
candidates[item_id] = {'score': score, 'source': 'tag'}
# 3. 热门召回
hot_items = self.hot_recall(
category=user_profile.get('prefer_category'),
num=100
)
for item_id, score in hot_items:
if item_id not in candidates:
candidates[item_id] = {'score': score, 'source': 'hot'}
# 4. 关注作者召回
follow_items = self.follow_recall(user_id, num=200)
for item_id, score in follow_items:
if item_id in candidates:
candidates[item_id]['score'] += score * 0.8 # 关注的权重高
else:
candidates[item_id] = {'score': score, 'source': 'follow'}
# 5. 向量召回(Embedding相似)
if user_profile.get('embedding') is not None:
ann_items = self.ann_recall(
user_profile['embedding'], num=300
)
for item_id, score in ann_items:
if item_id in candidates:
candidates[item_id]['score'] += score * 0.6
else:
candidates[item_id] = {'score': score, 'source': 'ann'}
# 6. 新内容探索召回(给新内容曝光机会)
explore_items = self.explore_recall(num=100)
for item_id, score in explore_items:
if item_id not in candidates:
candidates[item_id] = {'score': score, 'source': 'explore'}
# 按分数排序,取top
sorted_items = sorted(
candidates.items(),
key=lambda x: x[1]['score'],
reverse=True
)[:num_recall]
return sorted_items
def collaborative_filtering_recall(self, user_id, num=300):
"""
Item-based协同过滤召回
基于用户历史行为,找相似物品
"""
# 获取用户最近交互的物品
recent_items = self.redis.lrange(
f'user:{user_id}:recent_items', 0, 50
)
if not recent_items:
return []
candidates = defaultdict(float)
for item_id in recent_items:
# 获取该物品的相似物品
similar_items = self.redis.zrevrange(
f'item:{item_id}:similar', 0, 50, withscores=True
)
for sim_item_id, sim_score in similar_items:
if sim_item_id not in recent_items: # 过滤已交互
candidates[sim_item_id] += sim_score
# 排序返回
sorted_items = sorted(
candidates.items(),
key=lambda x: x[1],
reverse=True
)[:num]
return sorted_items
def ann_recall(self, user_embedding, num=300):
"""
向量近似最近邻召回 (ANN)
使用Faiss进行高效向量检索
"""
user_vec = np.array([user_embedding]).astype('float32')
# Faiss搜索
distances, indices = self.faiss_index.search(user_vec, num)
results = []
for i, (dist, idx) in enumerate(zip(distances[0], indices[0])):
if idx >= 0: # 有效索引
item_id = self.index_to_item_id(idx)
score = 1.0 / (1.0 + dist) # 距离转相似度
results.append((item_id, score))
return results
def hot_recall(self, category=None, num=100):
"""
热门召回
基于时间衰减的热度分数
"""
if category:
key = f'hot:category:{category}'
else:
key = 'hot:global'
hot_items = self.redis.zrevrange(key, 0, num - 1, withscores=True)
return [(item_id, score) for item_id, score in hot_items]
def explore_recall(self, num=100):
"""
探索召回 - 给新内容曝光机会
从最近发布的内容中随机采样
"""
# 获取最近24小时的新内容
new_items = self.redis.zrevrange(
'items:recent_24h', 0, 1000
)
if len(new_items) <= num:
return [(item_id, 0.5) for item_id in new_items]
# 随机采样
sampled = np.random.choice(new_items, num, replace=False)
return [(item_id, 0.5) for item_id in sampled]
class ItemSimilarityCalculator:
"""
物品相似度计算(离线任务)
用于协同过滤召回
"""
def calculate_item_similarity(self, interaction_data):
"""
计算物品相似度
使用Item-CF公式:
sim(i,j) = |N(i) ∩ N(j)| / sqrt(|N(i)| * |N(j)|)
其中N(i)表示喜欢物品i的用户集合
"""
# 构建物品-用户倒排表
item_users = defaultdict(set)
user_items = defaultdict(set)
for user_id, item_id, score in interaction_data:
if score > 0: # 正向交互
item_users[item_id].add(user_id)
user_items[user_id].add(item_id)
# 计算物品相似度
item_sim = defaultdict(dict)
for user_id, items in user_items.items():
items = list(items)
# 用户交互的物品两两计算
for i in range(len(items)):
for j in range(i + 1, len(items)):
item_i, item_j = items[i], items[j]
# 累加共现次数
if item_j not in item_sim[item_i]:
item_sim[item_i][item_j] = 0
item_sim[item_i][item_j] += 1
if item_i not in item_sim[item_j]:
item_sim[item_j][item_i] = 0
item_sim[item_j][item_i] += 1
# 归一化
for item_i in item_sim:
ni = len(item_users[item_i])
for item_j in item_sim[item_i]:
nj = len(item_users[item_j])
# 余弦相似度
item_sim[item_i][item_j] /= np.sqrt(ni * nj)
return item_sim
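The offline `ItemSimilarityCalculator` and the online `collaborative_filtering_recall` meet in Redis: the online path reads the `item:{item_id}:similar` sorted sets, but the code that writes them is not shown. A plausible glue step is sketched below, assuming the same key convention; the function name and the top-K cutoff are my own choices.

```python
def export_similarity_to_redis(redis_client, item_sim, top_k=50):
    """Write each item's top-K neighbors into the sorted sets that
    collaborative_filtering_recall() reads online."""
    pipe = redis_client.pipeline()
    for item_id, neighbors in item_sim.items():
        top = sorted(neighbors.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
        key = f'item:{item_id}:similar'
        pipe.delete(key)            # drop the previous neighbor list
        pipe.zadd(key, dict(top))   # member -> similarity score
    pipe.execute()
```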
Ranking Model
python
"""
精排模型实现 - 多目标深度学习模型
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
class MultiTaskRankingModel(nn.Module):
"""
多目标排序模型
同时预估多个目标:CTR、完播率、点赞、评论、转发、关注
使用MMoE (Multi-gate Mixture-of-Experts) 架构
"""
def __init__(self,
user_feature_dim,
item_feature_dim,
context_feature_dim,
embedding_dim=64,
num_experts=6,
num_tasks=6):
super().__init__()
self.num_experts = num_experts
self.num_tasks = num_tasks
# 特征嵌入层
self.user_embedding = nn.Linear(user_feature_dim, embedding_dim)
self.item_embedding = nn.Linear(item_feature_dim, embedding_dim)
self.context_embedding = nn.Linear(context_feature_dim, embedding_dim)
# 特征交叉层
input_dim = embedding_dim * 3
# Expert网络(共享)
self.experts = nn.ModuleList([
nn.Sequential(
nn.Linear(input_dim, 256),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(256, 128),
nn.ReLU(),
) for _ in range(num_experts)
])
# 每个任务的Gate网络
self.gates = nn.ModuleList([
nn.Sequential(
nn.Linear(input_dim, num_experts),
nn.Softmax(dim=-1)
) for _ in range(num_tasks)
])
# 每个任务的Tower网络
self.towers = nn.ModuleList([
nn.Sequential(
nn.Linear(128, 64),
nn.ReLU(),
nn.Dropout(0.1),
nn.Linear(64, 1),
nn.Sigmoid()
) for _ in range(num_tasks)
])
# 任务名称
self.task_names = ['ctr', 'finish_rate', 'like', 'comment', 'share', 'follow']
def forward(self, user_features, item_features, context_features):
"""
前向传播
Returns:
dict: 各任务的预估值
"""
# 特征嵌入
user_emb = F.relu(self.user_embedding(user_features))
item_emb = F.relu(self.item_embedding(item_features))
context_emb = F.relu(self.context_embedding(context_features))
# 拼接
x = torch.cat([user_emb, item_emb, context_emb], dim=-1)
# Expert输出
expert_outputs = [expert(x) for expert in self.experts]
expert_outputs = torch.stack(expert_outputs, dim=1) # [batch, num_experts, dim]
# 各任务输出
task_outputs = {}
for i, task_name in enumerate(self.task_names):
# Gate选择Expert
gate_weights = self.gates[i](x) # [batch, num_experts]
gate_weights = gate_weights.unsqueeze(-1) # [batch, num_experts, 1]
# 加权求和
gated_output = torch.sum(expert_outputs * gate_weights, dim=1) # [batch, dim]
# Tower输出
task_outputs[task_name] = self.towers[i](gated_output).squeeze(-1)
return task_outputs
def compute_final_score(self, task_outputs, weights=None):
"""
计算最终排序分数
融合多个目标的预估值
"""
if weights is None:
# 默认权重(可根据业务调整)
weights = {
'ctr': 1.0,
'finish_rate': 1.5, # 完播率权重高
'like': 0.8,
'comment': 1.0, # 评论价值高
'share': 1.2, # 转发价值高
'follow': 0.5
}
score = sum(
task_outputs[task] * weight
for task, weight in weights.items()
)
return score
class RankingLoss(nn.Module):
"""
多任务排序损失函数
"""
def __init__(self, task_weights=None):
super().__init__()
if task_weights is None:
task_weights = {
'ctr': 1.0,
'finish_rate': 1.0,
'like': 0.8,
'comment': 0.8,
'share': 0.8,
'follow': 0.5
}
self.task_weights = task_weights
self.bce = nn.BCELoss()
def forward(self, predictions, labels):
"""
计算多任务损失
"""
total_loss = 0
task_losses = {}
for task_name, weight in self.task_weights.items():
if task_name in labels and labels[task_name] is not None:
task_loss = self.bce(predictions[task_name], labels[task_name])
task_losses[task_name] = task_loss.item()
total_loss += weight * task_loss
return total_loss, task_losses
class FeatureExtractor:
"""
特征提取器
"""
def extract_user_features(self, user_profile):
"""
用户特征提取
"""
features = []
# 基础属性
features.extend([
user_profile.get('age', 0) / 100,
user_profile.get('gender', 0),
user_profile.get('city_level', 3) / 5,
])
# 行为统计
features.extend([
np.log1p(user_profile.get('total_watch_time', 0)) / 10,
np.log1p(user_profile.get('total_likes', 0)) / 10,
np.log1p(user_profile.get('total_comments', 0)) / 10,
user_profile.get('avg_watch_duration', 0) / 60,
user_profile.get('active_days', 0) / 30,
])
# 兴趣分布(假设有20个类别)
interest_dist = user_profile.get('interest_distribution', [0] * 20)
features.extend(interest_dist)
return np.array(features, dtype=np.float32)
def extract_item_features(self, item_info):
"""
内容特征提取
"""
features = []
# 基础属性
features.extend([
item_info.get('duration', 0) / 300, # 视频时长
item_info.get('is_original', 0), # 是否原创
np.log1p(item_info.get('author_fans', 0)) / 15, # 作者粉丝数
])
# 实时统计(带时间衰减)
features.extend([
np.log1p(item_info.get('play_count', 0)) / 15,
np.log1p(item_info.get('like_count', 0)) / 12,
np.log1p(item_info.get('comment_count', 0)) / 10,
item_info.get('like_rate', 0), # 点赞率
item_info.get('finish_rate', 0), # 完播率
])
# 内容类别(one-hot或embedding)
category_emb = item_info.get('category_embedding', [0] * 16)
features.extend(category_emb)
# 内容质量分
features.append(item_info.get('quality_score', 0.5))
return np.array(features, dtype=np.float32)
def extract_context_features(self, context):
"""
上下文特征提取
"""
features = []
# 时间特征
hour = context.get('hour', 12)
features.extend([
np.sin(2 * np.pi * hour / 24),
np.cos(2 * np.pi * hour / 24),
])
weekday = context.get('weekday', 0)
features.extend([
np.sin(2 * np.pi * weekday / 7),
np.cos(2 * np.pi * weekday / 7),
])
# 设备信息
features.extend([
context.get('is_wifi', 0),
context.get('device_type', 0) / 3, # 手机/平板/PC
])
# 页面位置
features.append(context.get('position', 0) / 100)
# 刷新次数
features.append(np.log1p(context.get('refresh_count', 0)) / 5)
return np.array(features, dtype=np.float32)
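To show how `MultiTaskRankingModel`, `RankingLoss`, and the feature extractor fit together, here is a minimal training-step sketch on random tensors. The feature dimensions match the `FeatureExtractor` layout above (user 3+5+20=28, item 3+5+16+1=25, context 8); the batch size, optimizer, and learning rate are placeholders rather than production settings.

```python
import torch

model = MultiTaskRankingModel(user_feature_dim=28, item_feature_dim=25, context_feature_dim=8)
criterion = RankingLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

batch = 256
user_f = torch.randn(batch, 28)
item_f = torch.randn(batch, 25)
ctx_f = torch.randn(batch, 8)
labels = {name: torch.randint(0, 2, (batch,)).float()
          for name in ['ctr', 'finish_rate', 'like', 'comment', 'share', 'follow']}

preds = model(user_f, item_f, ctx_f)              # dict of per-task probabilities
loss, per_task_losses = criterion(preds, labels)  # weighted BCE across tasks
loss.backward()
optimizer.step()
optimizer.zero_grad()
```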
Re-Ranking and Diversity
python
"""
重排层实现 - 多样性打散与业务规则
"""
class ReRanker:
"""
重排器
在精排结果基础上进行业务规则调整
"""
def __init__(self, config):
self.config = config
def rerank(self, ranked_items, user_id, context):
"""
重排主函数
Args:
ranked_items: 精排结果列表 [(item_id, score, item_info), ...]
user_id: 用户ID
context: 上下文
Returns:
重排后的列表
"""
# 1. 去重(过滤已看过的内容)
items = self.filter_seen(ranked_items, user_id)
# 2. 多样性打散
items = self.diversity_rerank(items)
# 3. 新内容扶持插入
items = self.insert_new_content(items, user_id)
# 4. 运营位插入
items = self.insert_operation_content(items, context)
# 5. 广告插入
items = self.insert_ads(items, user_id, context)
return items
def diversity_rerank(self, items,
author_gap=3, # 同作者间隔
category_gap=2): # 同类别间隔
"""
多样性打散
使用MMR (Maximal Marginal Relevance) 思想
同时考虑相关性和多样性
"""
if not items:
return items
result = []
remaining = list(items)
# 记录最近选择的作者和类别
recent_authors = []
recent_categories = []
while remaining and len(result) < len(items):
best_idx = -1
best_score = -float('inf')
for i, (item_id, score, info) in enumerate(remaining):
# 基础分数
adjusted_score = score
author_id = info.get('author_id')
category = info.get('category')
# 作者惩罚
if author_id in recent_authors[-author_gap:]:
author_penalty = 0.5 ** (author_gap - recent_authors[-author_gap:].index(author_id))
adjusted_score *= author_penalty
# 类别惩罚
if category in recent_categories[-category_gap:]:
category_penalty = 0.7 ** (category_gap - recent_categories[-category_gap:].index(category))
adjusted_score *= category_penalty
if adjusted_score > best_score:
best_score = adjusted_score
best_idx = i
if best_idx >= 0:
selected = remaining.pop(best_idx)
result.append(selected)
# 更新最近选择记录
recent_authors.append(selected[2].get('author_id'))
recent_categories.append(selected[2].get('category'))
return result
def insert_new_content(self, items, user_id,
insert_ratio=0.1, # 新内容比例
insert_positions=[3, 8, 15]): # 插入位置
"""
新内容扶持插入
给新发布的内容曝光机会
"""
# 获取可插入的新内容
new_contents = self.get_new_content_for_user(user_id)
if not new_contents:
return items
result = list(items)
inserted = 0
for pos in insert_positions:
if inserted >= len(new_contents):
break
actual_pos = pos + inserted
if actual_pos < len(result):
result.insert(actual_pos, new_contents[inserted])
inserted += 1
return result
def insert_ads(self, items, user_id, context,
ad_positions=[5, 12, 20]):
"""
广告插入
根据用户和上下文选择合适的广告
"""
# 获取广告候选
ads = self.get_ads_for_user(user_id, context)
if not ads:
return items
result = list(items)
for i, pos in enumerate(ad_positions):
if i >= len(ads):
break
actual_pos = pos + i # 考虑之前插入的偏移
if actual_pos < len(result):
ad_item = ads[i]
ad_item['is_ad'] = True
result.insert(actual_pos, ad_item)
return result
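A quick standalone check of the diversity pass makes the greedy penalty logic easier to see. This is a toy run under assumptions (an empty config, hand-made items); the other rerank steps are skipped because they depend on the platform's content and ad services.

```python
# Item IDs, scores, and metadata are made up; only diversity_rerank is exercised.
rr = ReRanker(config={})
items = [
    ('v1', 0.95, {'author_id': 'A', 'category': 'food'}),
    ('v2', 0.94, {'author_id': 'A', 'category': 'food'}),
    ('v3', 0.90, {'author_id': 'B', 'category': 'travel'}),
    ('v4', 0.80, {'author_id': 'C', 'category': 'food'}),
]
print([item_id for item_id, _, _ in rr.diversity_rerank(items)])
# -> ['v1', 'v3', 'v4', 'v2']: v2 drops to the end because it repeats v1's author and category
```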
Creator Incentive Algorithms
Traffic Pool Mechanism
python
"""
流量池机制 - 内容分发的核心逻辑
新内容发布后,进入不同级别的流量池
根据表现决定是否进入更大的流量池
"""
import json
import time
class TrafficPoolSystem:
"""
流量池系统
阶梯式流量分配:
初级池(300-500播放) → 中级池(3000-5000) → 高级池(1-10万) → 热门池(100万+)
"""
# 流量池配置
POOLS = {
'initial': {
'name': '初级池',
'target_play': 500,
'time_limit_hours': 24,
'promotion_threshold': { # 晋级阈值
'like_rate': 0.03, # 点赞率 > 3%
'comment_rate': 0.005, # 评论率 > 0.5%
'finish_rate': 0.3, # 完播率 > 30%
'share_rate': 0.001, # 转发率 > 0.1%
}
},
'middle': {
'name': '中级池',
'target_play': 5000,
'time_limit_hours': 48,
'promotion_threshold': {
'like_rate': 0.04,
'comment_rate': 0.008,
'finish_rate': 0.35,
'share_rate': 0.002,
}
},
'advanced': {
'name': '高级池',
'target_play': 100000,
'time_limit_hours': 72,
'promotion_threshold': {
'like_rate': 0.05,
'comment_rate': 0.01,
'finish_rate': 0.4,
'share_rate': 0.005,
}
},
'hot': {
'name': '热门池',
'target_play': None, # 无上限
'time_limit_hours': 168, # 7天
'promotion_threshold': None
}
}
def __init__(self, db, redis_client):
self.db = db
self.redis = redis_client
def on_content_publish(self, content_id, author_id):
"""
内容发布时初始化
"""
# 设置初始流量池
content_status = {
'content_id': content_id,
'author_id': author_id,
'current_pool': 'initial',
'publish_time': time.time(),
'pool_enter_time': time.time(),
'stats': {
'play_count': 0,
'like_count': 0,
'comment_count': 0,
'share_count': 0,
'total_watch_time': 0,
'finish_count': 0,
}
}
        # Store in Redis (hash fields cannot hold a nested dict, so serialize stats first)
        pool_status = dict(content_status, stats=json.dumps(content_status['stats']))
        self.redis.hset(
            f'content:{content_id}:pool_status',
            mapping=pool_status
        )
# 加入初级池队列
self.redis.zadd(
'pool:initial:contents',
{content_id: time.time()}
)
# 作者新内容计数
self.redis.incr(f'author:{author_id}:content_count_24h')
self.redis.expire(f'author:{author_id}:content_count_24h', 86400)
def evaluate_content(self, content_id):
"""
评估内容表现,决定是否晋级/淘汰
"""
status = self.get_content_status(content_id)
if not status:
return
current_pool = status['current_pool']
pool_config = self.POOLS[current_pool]
# 计算指标
stats = status['stats']
play_count = max(stats['play_count'], 1) # 避免除零
metrics = {
'like_rate': stats['like_count'] / play_count,
'comment_rate': stats['comment_count'] / play_count,
'finish_rate': stats['finish_count'] / play_count,
'share_rate': stats['share_count'] / play_count,
}
# 检查是否达到目标播放量
target_reached = (
pool_config['target_play'] is None or
stats['play_count'] >= pool_config['target_play']
)
# 检查是否超时
time_in_pool = time.time() - status['pool_enter_time']
timeout = time_in_pool > pool_config['time_limit_hours'] * 3600
# 决策
if pool_config['promotion_threshold']:
# 检查是否满足晋级条件
can_promote = all(
metrics[key] >= threshold
for key, threshold in pool_config['promotion_threshold'].items()
)
if target_reached and can_promote:
# 晋级到下一个流量池
self.promote_content(content_id, current_pool)
elif timeout:
# 超时淘汰(停止推荐)
self.demote_content(content_id, current_pool)
def promote_content(self, content_id, from_pool):
"""
晋级到更高流量池
"""
pool_order = ['initial', 'middle', 'advanced', 'hot']
current_idx = pool_order.index(from_pool)
if current_idx < len(pool_order) - 1:
next_pool = pool_order[current_idx + 1]
# 从当前池移除
self.redis.zrem(f'pool:{from_pool}:contents', content_id)
# 加入新池
self.redis.zadd(
f'pool:{next_pool}:contents',
{content_id: time.time()}
)
# 更新状态
self.redis.hset(
f'content:{content_id}:pool_status',
'current_pool', next_pool
)
self.redis.hset(
f'content:{content_id}:pool_status',
'pool_enter_time', time.time()
)
print(f"Content {content_id} promoted: {from_pool} -> {next_pool}")
def demote_content(self, content_id, from_pool):
"""
淘汰(停止推荐但不删除)
"""
self.redis.zrem(f'pool:{from_pool}:contents', content_id)
self.redis.hset(
f'content:{content_id}:pool_status',
'current_pool', 'archived'
)
print(f"Content {content_id} archived from {from_pool}")
def get_pool_weights(self):
"""
获取各流量池在召回时的权重
高级池内容获得更多曝光机会
"""
return {
'initial': 0.15, # 15%流量给新内容
'middle': 0.25, # 25%
'advanced': 0.35, # 35%
'hot': 0.25, # 25%给热门
}
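`get_pool_weights` only returns proportions; at recall time something has to turn them into actual candidate quotas per pool. Below is a sketch of that step, reusing the `pool:{name}:contents` key convention from above; the sampling function itself is my assumption rather than part of the original system.

```python
import random

def sample_pool_candidates(redis_client, pool_system, total=500):
    """Split a recall quota across traffic pools by weight and sample
    random members from each pool's content set."""
    candidates = []
    for pool_name, weight in pool_system.get_pool_weights().items():
        quota = int(total * weight)
        members = redis_client.zrange(f'pool:{pool_name}:contents', 0, -1)
        if members and quota > 0:
            picked = random.sample(list(members), min(quota, len(members)))
            candidates.extend((content_id, pool_name) for content_id in picked)
    return candidates
```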
Creator Revenue Calculation
python
"""
创作者收益计算系统
收益来源:
1. 播放分成(CPM)
2. 互动奖励
3. 创作激励基金
4. 直播打赏
5. 付费内容
"""
class CreatorRevenueSystem:
"""
创作者收益系统
"""
# 基础配置
BASE_CPM = {
'tier1': 20.0, # 优质创作者 ¥20/千次播放
'tier2': 10.0, # 中等创作者
'tier3': 5.0, # 普通创作者
'new': 3.0, # 新创作者
}
# 互动奖励
INTERACTION_REWARD = {
'like': 0.01, # 点赞 ¥0.01
'comment': 0.05, # 评论 ¥0.05
'share': 0.1, # 转发 ¥0.1
'follow': 0.2, # 新增粉丝 ¥0.2
'favorite': 0.02, # 收藏 ¥0.02
}
# 质量系数
QUALITY_MULTIPLIER = {
'original': 1.5, # 原创加成
'high_quality': 1.3, # 优质内容
'vertical': 1.2, # 垂直领域
'normal': 1.0,
}
def __init__(self, db):
self.db = db
def calculate_content_revenue(self, content_id, period='daily'):
"""
计算单个内容的收益
"""
# 获取内容信息
content = self.db.get_content(content_id)
author = self.db.get_author(content['author_id'])
# 获取统计数据
stats = self.db.get_content_stats(content_id, period)
revenue = {
'content_id': content_id,
'period': period,
'breakdown': {},
'total': 0
}
# 1. 播放收益 (CPM)
author_tier = self.get_author_tier(author)
base_cpm = self.BASE_CPM[author_tier]
# 有效播放(去除刷量、过短播放等)
valid_plays = self.filter_valid_plays(stats['plays'])
play_revenue = (valid_plays / 1000) * base_cpm
revenue['breakdown']['play'] = play_revenue
# 2. 质量加成
quality_type = self.assess_content_quality(content)
quality_multiplier = self.QUALITY_MULTIPLIER[quality_type]
# 3. 互动收益
interaction_revenue = 0
for action, unit_reward in self.INTERACTION_REWARD.items():
count = stats.get(f'{action}_count', 0)
interaction_revenue += count * unit_reward
revenue['breakdown']['interaction'] = interaction_revenue
# 4. 完播加成
finish_rate = stats.get('finish_rate', 0)
finish_bonus = 0
if finish_rate > 0.5:
finish_bonus = play_revenue * 0.2 # 完播率>50%,额外20%
elif finish_rate > 0.3:
finish_bonus = play_revenue * 0.1
revenue['breakdown']['finish_bonus'] = finish_bonus
# 5. 计算总收益
base_revenue = play_revenue + interaction_revenue + finish_bonus
total_revenue = base_revenue * quality_multiplier
revenue['breakdown']['quality_multiplier'] = quality_multiplier
revenue['total'] = round(total_revenue, 2)
return revenue
def calculate_author_daily_revenue(self, author_id, date):
"""
计算作者单日总收益
"""
# 获取该作者当日所有内容的收益
contents = self.db.get_author_contents_with_plays(author_id, date)
total_revenue = 0
content_revenues = []
for content in contents:
content_rev = self.calculate_content_revenue(
content['content_id'],
period='daily'
)
total_revenue += content_rev['total']
content_revenues.append(content_rev)
# 创作激励基金(额外奖励)
incentive = self.calculate_incentive_bonus(author_id, date)
return {
'author_id': author_id,
'date': date,
'content_revenues': content_revenues,
'content_total': total_revenue,
'incentive_bonus': incentive,
'grand_total': total_revenue + incentive
}
def calculate_incentive_bonus(self, author_id, date):
"""
创作激励基金计算
基于多维度评估:
- 内容质量
- 更新频率
- 粉丝增长
- 领域贡献
"""
author = self.db.get_author(author_id)
monthly_stats = self.db.get_author_monthly_stats(author_id)
bonus = 0
# 1. 活跃创作者奖励(月更新>=15条)
if monthly_stats['content_count'] >= 15:
bonus += 100 # ¥100
# 2. 粉丝增长奖励
fan_growth = monthly_stats['fan_growth']
if fan_growth >= 10000:
bonus += 500
elif fan_growth >= 1000:
bonus += 100
elif fan_growth >= 100:
bonus += 20
# 3. 优质内容奖励(当月有爆款)
if monthly_stats['max_play_count'] >= 1000000:
bonus += 1000
elif monthly_stats['max_play_count'] >= 100000:
bonus += 200
# 4. 新人创作者扶持
if author['days_since_join'] <= 30:
bonus *= 1.5 # 新人加成50%
return round(bonus, 2)
def filter_valid_plays(self, plays_data):
"""
过滤无效播放
排除:
- 播放时长过短(<3秒)
- 同一用户重复播放
- 机器刷量
- 异常IP
"""
valid_count = 0
for play in plays_data:
# 播放时长检查
if play['duration'] < 3:
continue
# 重复播放检查
if play.get('is_repeat', False):
continue
# 风控标记检查
if play.get('risk_flag', False):
continue
valid_count += 1
return valid_count
def get_author_tier(self, author):
"""
获取作者等级
基于综合表现评定
"""
score = 0
# 粉丝数
fans = author.get('fan_count', 0)
if fans >= 1000000:
score += 40
elif fans >= 100000:
score += 30
elif fans >= 10000:
score += 20
elif fans >= 1000:
score += 10
# 内容质量分
quality_score = author.get('avg_quality_score', 0)
score += quality_score * 30
# 活跃度
active_score = author.get('active_score', 0)
score += active_score * 20
# 账号时长
days = author.get('days_since_join', 0)
if days >= 365:
score += 10
elif days >= 90:
score += 5
# 分级
if score >= 80:
return 'tier1'
elif score >= 50:
return 'tier2'
elif score >= 20:
return 'tier3'
else:
return 'new'
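A worked example with the constants above (the input numbers are made up): a tier2 creator (¥10 CPM) whose video gets 50,000 valid plays earns 50 × 10 = ¥500 from playback; 2,000 likes, 300 comments, 100 shares, 50 new followers, and 400 favorites add 20 + 15 + 10 + 10 + 8 = ¥63 in interaction rewards; a 55% finish rate triggers the 20% playback bonus of ¥100; and if the video counts as original (multiplier 1.5), the total comes to (500 + 63 + 100) × 1.5 = ¥994.50.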
Task System
python
"""
创作者任务系统
通过任务引导创作者行为
"""
from datetime import datetime
class CreatorTaskSystem:
"""
创作者任务系统
"""
# 任务定义
TASKS = {
# 日常任务
'daily_publish': {
'name': '每日发布',
'description': '今日发布1条视频',
'type': 'daily',
'target': 1,
'reward': {'coins': 10, 'exp': 20},
'metric': 'publish_count'
},
'daily_interact': {
'name': '积极互动',
'description': '回复5条评论',
'type': 'daily',
'target': 5,
'reward': {'coins': 5, 'exp': 10},
'metric': 'reply_count'
},
# 周任务
'weekly_quality': {
'name': '优质创作',
'description': '本周发布的视频平均完播率>30%',
'type': 'weekly',
'target': 0.3,
'reward': {'coins': 100, 'exp': 200},
'metric': 'avg_finish_rate'
},
'weekly_growth': {
'name': '粉丝增长',
'description': '本周新增100粉丝',
'type': 'weekly',
'target': 100,
'reward': {'coins': 50, 'exp': 100},
'metric': 'fan_growth'
},
# 成就任务
'first_1k_fans': {
'name': '初露锋芒',
'description': '粉丝数达到1000',
'type': 'achievement',
'target': 1000,
'reward': {'coins': 500, 'exp': 1000, 'badge': 'rising_star'},
'metric': 'total_fans'
},
'first_viral': {
'name': '一鸣惊人',
'description': '单条视频播放量达到10万',
'type': 'achievement',
'target': 100000,
'reward': {'coins': 1000, 'exp': 2000, 'badge': 'viral_creator'},
'metric': 'max_play_count'
},
# 活动任务
'spring_festival': {
'name': '新春创作',
'description': '发布带#新年#话题的视频',
'type': 'event',
'target': 1,
'reward': {'coins': 200, 'exp': 500, 'traffic_boost': 1.5},
'metric': 'topic_publish',
'start_time': '2024-02-01',
'end_time': '2024-02-15',
'extra_params': {'topic': '新年'}
},
}
def __init__(self, db, redis_client):
self.db = db
self.redis = redis_client
def get_author_tasks(self, author_id):
"""
获取作者可用任务列表
"""
tasks = []
author = self.db.get_author(author_id)
for task_id, task_config in self.TASKS.items():
# 检查任务是否可用
if not self.is_task_available(task_id, task_config, author):
continue
# 获取任务进度
progress = self.get_task_progress(author_id, task_id, task_config)
tasks.append({
'task_id': task_id,
'name': task_config['name'],
'description': task_config['description'],
'type': task_config['type'],
'target': task_config['target'],
'current': progress['current'],
'progress': progress['percentage'],
'completed': progress['completed'],
'reward': task_config['reward'],
})
return tasks
def is_task_available(self, task_id, task_config, author):
"""
检查任务是否对该作者可用
"""
# 活动任务检查时间
if task_config['type'] == 'event':
now = datetime.now()
start = datetime.strptime(task_config['start_time'], '%Y-%m-%d')
end = datetime.strptime(task_config['end_time'], '%Y-%m-%d')
if not (start <= now <= end):
return False
# 成就任务检查是否已完成
if task_config['type'] == 'achievement':
if self.is_achievement_completed(author['id'], task_id):
return False
return True
def get_task_progress(self, author_id, task_id, task_config):
"""
获取任务进度
"""
metric = task_config['metric']
target = task_config['target']
# 根据任务类型获取对应时间范围的数据
if task_config['type'] == 'daily':
current = self.get_daily_metric(author_id, metric)
elif task_config['type'] == 'weekly':
current = self.get_weekly_metric(author_id, metric)
elif task_config['type'] == 'achievement':
current = self.get_total_metric(author_id, metric)
elif task_config['type'] == 'event':
current = self.get_event_metric(
author_id, metric,
task_config.get('extra_params', {})
)
else:
current = 0
percentage = min(current / target, 1.0) if target > 0 else 0
completed = current >= target
return {
'current': current,
'percentage': percentage,
'completed': completed
}
def complete_task(self, author_id, task_id):
"""
完成任务,发放奖励
"""
task_config = self.TASKS.get(task_id)
if not task_config:
return {'success': False, 'error': 'Task not found'}
# 检查是否已完成
progress = self.get_task_progress(author_id, task_id, task_config)
if not progress['completed']:
return {'success': False, 'error': 'Task not completed'}
# 检查是否已领取
if self.is_reward_claimed(author_id, task_id, task_config['type']):
return {'success': False, 'error': 'Reward already claimed'}
# 发放奖励
reward = task_config['reward']
if 'coins' in reward:
self.db.add_author_coins(author_id, reward['coins'])
if 'exp' in reward:
self.db.add_author_exp(author_id, reward['exp'])
if 'badge' in reward:
self.db.grant_badge(author_id, reward['badge'])
if 'traffic_boost' in reward:
# 流量加成,影响后续发布内容的初始流量
self.redis.setex(
f'author:{author_id}:traffic_boost',
86400 * 7, # 7天有效
reward['traffic_boost']
)
# 记录领取
self.mark_reward_claimed(author_id, task_id, task_config['type'])
return {
'success': True,
'reward': reward
}
New User Incentive Mechanisms
New User Cold Start
python
"""
新用户冷启动系统
解决新用户没有行为数据,推荐困难的问题
"""
import json
import numpy as np
class NewUserColdStart:
"""
新用户冷启动
"""
def __init__(self, db, redis_client, recommend_service):
self.db = db
self.redis = redis_client
self.recommend = recommend_service
def on_user_register(self, user_id, register_info):
"""
用户注册时初始化
"""
# 1. 基础画像初始化
initial_profile = self.build_initial_profile(register_info)
# 2. 设置新用户标记
self.redis.setex(
f'user:{user_id}:is_new',
86400 * 7, # 7天内算新用户
1
)
# 3. 初始化新用户任务
self.init_new_user_tasks(user_id)
# 4. 发放新用户奖励
self.grant_new_user_rewards(user_id)
return initial_profile
def build_initial_profile(self, register_info):
"""
构建初始用户画像
基于注册信息推断用户兴趣
"""
profile = {
'interest_tags': [],
'interest_distribution': [0.05] * 20, # 均匀分布
'prefer_duration': 'medium', # 默认中等时长
'active_hours': list(range(18, 23)), # 默认活跃时段
}
# 基于年龄推断
age = register_info.get('age')
if age:
if age < 18:
profile['interest_tags'].extend(['游戏', '动漫', '校园'])
elif age < 25:
profile['interest_tags'].extend(['搞笑', '音乐', '美食'])
elif age < 35:
profile['interest_tags'].extend(['科技', '职场', '生活'])
else:
profile['interest_tags'].extend(['新闻', '健康', '旅行'])
# 基于性别推断
gender = register_info.get('gender')
if gender == 'female':
profile['interest_tags'].extend(['美妆', '穿搭', '萌宠'])
elif gender == 'male':
profile['interest_tags'].extend(['数码', '汽车', '体育'])
# 基于来源渠道推断
channel = register_info.get('channel')
channel_interests = {
'game_ad': ['游戏', '电竞'],
'beauty_ad': ['美妆', '护肤'],
'tech_blog': ['科技', '数码'],
}
if channel in channel_interests:
profile['interest_tags'].extend(channel_interests[channel])
return profile
def get_cold_start_recommendations(self, user_id, num=20):
"""
新用户推荐策略
混合多种策略,快速探索用户兴趣
"""
recommendations = []
# 1. 热门内容(40%)- 保底,大众喜欢的
hot_items = self.recommend.get_hot_items(num=int(num * 0.4))
for item in hot_items:
item['source'] = 'hot'
recommendations.extend(hot_items)
# 2. 基于初始画像(30%)- 猜你喜欢
profile = self.db.get_user_profile(user_id)
if profile and profile.get('interest_tags'):
profile_items = self.recommend.get_items_by_tags(
profile['interest_tags'],
num=int(num * 0.3)
)
for item in profile_items:
item['source'] = 'profile_guess'
recommendations.extend(profile_items)
# 3. 探索性内容(20%)- 多样化,帮助发现兴趣
explore_items = self.recommend.get_diverse_items(num=int(num * 0.2))
for item in explore_items:
item['source'] = 'explore'
recommendations.extend(explore_items)
# 4. 新创作者内容(10%)- 给新UP主曝光
new_creator_items = self.recommend.get_new_creator_items(num=int(num * 0.1))
for item in new_creator_items:
item['source'] = 'new_creator'
recommendations.extend(new_creator_items)
# 打散混合
np.random.shuffle(recommendations)
return recommendations
def update_profile_from_behavior(self, user_id, behavior):
"""
根据用户行为更新画像
新用户的每一次行为都非常重要
"""
profile = self.db.get_user_profile(user_id)
content = self.db.get_content(behavior['content_id'])
content_tags = content.get('tags', [])
# 正向行为:加强兴趣
if behavior['action'] in ['like', 'follow', 'share', 'favorite']:
for tag in content_tags:
if tag not in profile['interest_tags']:
profile['interest_tags'].append(tag)
# 更新兴趣分布
tag_idx = self.get_tag_index(tag)
if tag_idx >= 0:
profile['interest_distribution'][tag_idx] += 0.1
# 完播:中等程度加强
elif behavior['action'] == 'finish':
for tag in content_tags:
tag_idx = self.get_tag_index(tag)
if tag_idx >= 0:
profile['interest_distribution'][tag_idx] += 0.05
# 快速划走:减弱兴趣
elif behavior['action'] == 'skip' and behavior.get('duration', 0) < 3:
for tag in content_tags:
tag_idx = self.get_tag_index(tag)
if tag_idx >= 0:
profile['interest_distribution'][tag_idx] -= 0.05
# 归一化
total = sum(profile['interest_distribution'])
if total > 0:
profile['interest_distribution'] = [
x / total for x in profile['interest_distribution']
]
# 保存更新
self.db.update_user_profile(user_id, profile)
def init_new_user_tasks(self, user_id):
"""
初始化新用户任务
"""
tasks = [
{
'task_id': 'complete_profile',
'name': '完善资料',
'description': '设置头像和昵称',
'reward': {'coins': 50}
},
{
'task_id': 'first_like',
'name': '首次点赞',
'description': '给喜欢的视频点个赞',
'reward': {'coins': 10}
},
{
'task_id': 'first_follow',
'name': '关注创作者',
'description': '关注你喜欢的创作者',
'reward': {'coins': 20}
},
{
'task_id': 'watch_5_videos',
'name': '探索内容',
'description': '观看5个视频',
'reward': {'coins': 30}
},
{
'task_id': 'first_comment',
'name': '首次评论',
'description': '发表你的第一条评论',
'reward': {'coins': 20}
},
]
for task in tasks:
self.redis.hset(
f'user:{user_id}:new_user_tasks',
task['task_id'],
json.dumps(task)
)
def grant_new_user_rewards(self, user_id):
"""
发放新用户奖励
"""
rewards = {
'coins': 100, # 金币
'vip_trial_days': 3, # VIP试用
'ad_free_hours': 24, # 免广告时长
}
self.db.add_user_coins(user_id, rewards['coins'])
self.db.grant_vip_trial(user_id, rewards['vip_trial_days'])
self.redis.setex(
f'user:{user_id}:ad_free',
rewards['ad_free_hours'] * 3600,
1
)
return rewards
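As a concrete example of `build_initial_profile`: a 22-year-old female user who arrives through a beauty ad would start with interest tags ['搞笑', '音乐', '美食', '美妆', '穿搭', '萌宠', '美妆', '护肤']. Note that the method extends the list without deduplicating, so 美妆 appears twice; the duplicates are harmless here but worth knowing about if these tags feed directly into recall.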
New Creator Support
python
"""
新创作者扶持计划
"""
class NewCreatorSupport:
"""
新创作者扶持系统
目标:让新创作者更容易获得初始曝光和粉丝
"""
# 新手期定义
NEW_CREATOR_PERIOD_DAYS = 30
NEW_CREATOR_MAX_CONTENTS = 20
def __init__(self, db, redis_client, traffic_pool):
self.db = db
self.redis = redis_client
self.traffic_pool = traffic_pool
def is_new_creator(self, author_id):
"""
判断是否是新创作者
"""
author = self.db.get_author(author_id)
days_since_join = author.get('days_since_join', 0)
total_contents = author.get('content_count', 0)
return (days_since_join <= self.NEW_CREATOR_PERIOD_DAYS and
total_contents <= self.NEW_CREATOR_MAX_CONTENTS)
def get_traffic_boost(self, author_id, content_id):
"""
获取新创作者流量加成
Returns:
float: 流量加成系数(1.0表示无加成)
"""
if not self.is_new_creator(author_id):
return 1.0
author = self.db.get_author(author_id)
content_count = author.get('content_count', 0)
# 前5条内容获得更大加成
if content_count <= 5:
base_boost = 2.0
elif content_count <= 10:
base_boost = 1.5
else:
base_boost = 1.2
# 根据内容质量调整
content = self.db.get_content(content_id)
quality_score = content.get('quality_score', 0.5)
if quality_score >= 0.8:
quality_boost = 1.3
elif quality_score >= 0.6:
quality_boost = 1.1
else:
quality_boost = 1.0
return base_boost * quality_boost
def on_new_creator_content_publish(self, author_id, content_id):
"""
新创作者发布内容时的特殊处理
"""
if not self.is_new_creator(author_id):
return
# 1. 获取流量加成
boost = self.get_traffic_boost(author_id, content_id)
# 2. 设置初始流量池为中级(跳过初级池)
if boost >= 1.5:
# 高加成内容直接进入中级池
self.traffic_pool.set_initial_pool(content_id, 'middle')
# 3. 记录新手内容标记(召回时优先)
self.redis.setex(
f'content:{content_id}:new_creator_boost',
86400 * 3, # 3天内有效
boost
)
# 4. 检查是否达成新手里程碑
self.check_new_creator_milestones(author_id)
def check_new_creator_milestones(self, author_id):
"""
检查新手里程碑
"""
author = self.db.get_author(author_id)
milestones = []
# 首次发布
if author['content_count'] == 1:
milestones.append({
'type': 'first_publish',
'reward': {'coins': 100, 'exp': 200}
})
# 首个100播放
max_plays = author.get('max_content_plays', 0)
if max_plays >= 100 and not self.milestone_achieved(author_id, 'first_100_plays'):
milestones.append({
'type': 'first_100_plays',
'reward': {'coins': 50, 'exp': 100}
})
# 首个粉丝
if author['fan_count'] >= 1 and not self.milestone_achieved(author_id, 'first_fan'):
milestones.append({
'type': 'first_fan',
'reward': {'coins': 30, 'exp': 50}
})
# 发放里程碑奖励
for milestone in milestones:
self.grant_milestone_reward(author_id, milestone)
def get_new_creator_guide(self, author_id):
"""
获取新手引导内容
"""
return {
'tips': [
{
'title': '内容质量是关键',
'content': '高质量的内容更容易获得推荐,建议每个视频都认真制作'
},
{
'title': '保持更新频率',
'content': '建议每周至少发布2-3条内容,保持账号活跃'
},
{
'title': '积极互动',
'content': '回复评论、与粉丝互动,有助于提升账号权重'
},
{
'title': '找准定位',
'content': '专注某一领域的内容,更容易被系统识别和推荐'
},
],
'recommended_tools': [
'视频剪辑教程',
'封面设计指南',
'热门话题推荐',
],
'support_policies': [
'新手期内容获得额外流量扶持',
'完成新手任务可获得奖励',
'参加新手训练营可获得1对1指导',
]
}
Comment System Design
Comment Ranking Algorithm
python
"""
评论排序系统
决定哪些评论显示在前面
"""
import time
import numpy as np
class CommentRankingSystem:
"""
评论排序系统
综合考虑:热度、质量、时效性、相关性
"""
def rank_comments(self, content_id, user_id=None, sort_type='hot'):
"""
评论排序
Args:
content_id: 内容ID
user_id: 当前用户ID(用于个性化)
sort_type: 排序类型 - hot/new/top
"""
# 获取所有评论
comments = self.db.get_content_comments(content_id)
if sort_type == 'new':
# 按时间倒序
return sorted(comments, key=lambda x: x['create_time'], reverse=True)
elif sort_type == 'top':
# 按点赞数倒序
return sorted(comments, key=lambda x: x['like_count'], reverse=True)
else: # hot - 默认热门排序
# 计算热度分数
for comment in comments:
comment['hot_score'] = self.calculate_hot_score(comment, user_id)
return sorted(comments, key=lambda x: x['hot_score'], reverse=True)
def calculate_hot_score(self, comment, user_id=None):
"""
计算评论热度分数
公式: score = (log(likes + 1) + log(replies + 1) × 0.5) × time_decay × quality × personal_boost
"""
# 1. 互动分
like_score = np.log1p(comment.get('like_count', 0))
reply_score = np.log1p(comment.get('reply_count', 0)) * 0.5
interaction_score = like_score + reply_score
# 2. 时间衰减
age_hours = (time.time() - comment['create_time']) / 3600
time_decay = 1.0 / (1.0 + age_hours / 24) # 24小时衰减到0.5
# 3. 质量分
quality_score = self.assess_comment_quality(comment)
# 4. 个性化加成
personal_boost = 1.0
if user_id:
personal_boost = self.get_personal_boost(comment, user_id)
# 5. 作者评论加成
author_boost = 1.0
if comment.get('is_author_comment'):
author_boost = 2.0 # 作者评论加成
# 6. 置顶加成
pin_boost = 1.0
if comment.get('is_pinned'):
pin_boost = 10.0
score = interaction_score * time_decay * quality_score * personal_boost * author_boost * pin_boost
return score
def assess_comment_quality(self, comment):
"""
评估评论质量
"""
text = comment.get('text', '')
score = 0.5 # 基础分
# 长度适中(10-100字)
length = len(text)
if 10 <= length <= 100:
score += 0.2
elif length > 100:
score += 0.1
# 包含表情(适量)
emoji_count = self.count_emojis(text)
if 1 <= emoji_count <= 3:
score += 0.1
# 不是纯表情/符号
if self.has_meaningful_content(text):
score += 0.2
# 历史数据:该用户评论的平均质量
user_avg_quality = self.get_user_comment_quality(comment['user_id'])
score = score * 0.7 + user_avg_quality * 0.3
return min(score, 1.0)
def get_personal_boost(self, comment, user_id):
"""
个性化加成
- 关注的人的评论
- 互动过的人的评论
- 相似用户的评论
"""
boost = 1.0
# 关注的人
if self.db.is_following(user_id, comment['user_id']):
boost *= 1.5
# 互动过的人
interaction_count = self.db.get_user_interaction_count(user_id, comment['user_id'])
if interaction_count > 0:
boost *= (1 + min(interaction_count, 10) * 0.05)
return boost
class CommentModerationSystem:
"""
评论审核系统
"""
def __init__(self, db, nlp_service, redis_client):
self.db = db
self.nlp = nlp_service
self.redis = redis_client
def moderate_comment(self, comment):
"""
评论审核
Returns:
dict: {
'approved': bool,
'reason': str,
'risk_level': int, # 0-100
}
"""
text = comment.get('text', '')
user_id = comment.get('user_id')
result = {
'approved': True,
'reason': None,
'risk_level': 0,
'checks': []
}
# 1. 关键词过滤
keyword_check = self.check_keywords(text)
result['checks'].append(keyword_check)
if keyword_check['blocked']:
result['approved'] = False
result['reason'] = keyword_check['reason']
result['risk_level'] = 100
return result
# 2. AI内容审核
ai_check = self.nlp.moderate_text(text)
result['checks'].append(ai_check)
if ai_check['risk_level'] > 80:
result['approved'] = False
result['reason'] = ai_check['reason']
result['risk_level'] = ai_check['risk_level']
return result
# 3. 用户信誉检查
user_check = self.check_user_reputation(user_id)
result['checks'].append(user_check)
if user_check['risk_level'] > 70:
result['risk_level'] = max(result['risk_level'], user_check['risk_level'])
# 高风险用户进入人工审核队列
result['need_manual_review'] = True
# 4. 频率限制检查
rate_check = self.check_rate_limit(user_id)
result['checks'].append(rate_check)
if rate_check['exceeded']:
result['approved'] = False
result['reason'] = '评论过于频繁,请稍后再试'
return result
# 5. 重复内容检查
duplicate_check = self.check_duplicate(user_id, text)
result['checks'].append(duplicate_check)
if duplicate_check['is_duplicate']:
result['approved'] = False
result['reason'] = '请勿重复发送相同内容'
return result
# 综合风险评分
result['risk_level'] = max(
check.get('risk_level', 0) for check in result['checks']
)
return result
def check_keywords(self, text):
"""
关键词过滤
"""
# 加载敏感词库(分级)
blocked_words = self.redis.smembers('moderation:blocked_words')
warning_words = self.redis.smembers('moderation:warning_words')
text_lower = text.lower()
# 检查屏蔽词
for word in blocked_words:
if word in text_lower:
return {
'blocked': True,
'reason': '包含违规内容',
'risk_level': 100
}
# 检查警告词
warning_count = sum(1 for word in warning_words if word in text_lower)
risk_level = min(warning_count * 20, 60)
return {
'blocked': False,
'warning_count': warning_count,
'risk_level': risk_level
}
def check_user_reputation(self, user_id):
"""
检查用户信誉
"""
user = self.db.get_user(user_id)
risk_level = 0
# 新用户风险
if user.get('days_since_join', 0) < 7:
risk_level += 20
# 历史违规
violation_count = user.get('violation_count', 0)
risk_level += min(violation_count * 15, 50)
# 被举报次数
report_count = user.get('report_count', 0)
risk_level += min(report_count * 5, 30)
return {
'risk_level': min(risk_level, 100)
}
def check_rate_limit(self, user_id, limit=10, window=60):
"""
频率限制
默认:60秒内最多10条评论
"""
key = f'user:{user_id}:comment_rate'
count = self.redis.incr(key)
if count == 1:
self.redis.expire(key, window)
return {
'exceeded': count > limit,
'current': count,
'limit': limit
}
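A worked example of the hot-score formula (the numbers are made up): a comment with 150 likes and 20 replies posted 12 hours ago has an interaction score of ln(151) + 0.5 × ln(21) ≈ 5.02 + 1.52 = 6.54; the time decay is 1 / (1 + 12/24) ≈ 0.67; with a quality score of 0.8 and a 1.5× boost because the viewer follows the commenter (the author and pin boosts staying at 1.0), the final score is roughly 6.54 × 0.67 × 0.8 × 1.5 ≈ 5.2.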
Comment Interaction Mechanisms
python
"""
评论互动系统
"""
import time
class CommentInteractionSystem:
"""
评论互动
包括:点赞、回复、举报、@提醒
"""
def like_comment(self, user_id, comment_id):
"""
点赞评论
"""
# 检查是否已点赞
if self.db.has_liked_comment(user_id, comment_id):
return {'success': False, 'error': 'Already liked'}
# 记录点赞
self.db.add_comment_like(user_id, comment_id)
# 更新评论点赞数
self.db.incr_comment_like_count(comment_id)
# 通知评论作者
comment = self.db.get_comment(comment_id)
if comment['user_id'] != user_id:
self.notify_service.send_notification(
user_id=comment['user_id'],
type='comment_like',
content=f'你的评论收到了一个赞',
related_id=comment_id
)
return {'success': True}
def reply_comment(self, user_id, parent_comment_id, text, reply_to_user_id=None):
"""
回复评论
"""
# 审核
moderation_result = self.moderation.moderate_comment({
'user_id': user_id,
'text': text
})
if not moderation_result['approved']:
return {
'success': False,
'error': moderation_result['reason']
}
# 获取父评论
parent_comment = self.db.get_comment(parent_comment_id)
# 创建回复
reply = {
'user_id': user_id,
'content_id': parent_comment['content_id'],
'parent_id': parent_comment_id,
'root_id': parent_comment.get('root_id') or parent_comment_id,
'reply_to_user_id': reply_to_user_id,
'text': text,
'create_time': time.time()
}
reply_id = self.db.create_comment(reply)
# 更新父评论回复数
self.db.incr_comment_reply_count(parent_comment_id)
# 通知
# 通知父评论作者
if parent_comment['user_id'] != user_id:
self.notify_service.send_notification(
user_id=parent_comment['user_id'],
type='comment_reply',
content=f'你的评论收到了一条回复',
related_id=reply_id
)
# 通知被@的用户
if reply_to_user_id and reply_to_user_id != user_id:
self.notify_service.send_notification(
user_id=reply_to_user_id,
type='mention',
content=f'有人在评论中@了你',
related_id=reply_id
)
# 处理文本中的@
mentioned_users = self.extract_mentions(text)
for mentioned_user_id in mentioned_users:
if mentioned_user_id not in [user_id, reply_to_user_id]:
self.notify_service.send_notification(
user_id=mentioned_user_id,
type='mention',
content=f'有人在评论中@了你',
related_id=reply_id
)
return {
'success': True,
'reply_id': reply_id
}
def report_comment(self, user_id, comment_id, reason, description=''):
"""
举报评论
"""
# 检查是否已举报
if self.db.has_reported_comment(user_id, comment_id):
return {'success': False, 'error': 'Already reported'}
# 创建举报记录
report = {
'reporter_id': user_id,
'comment_id': comment_id,
'reason': reason,
'description': description,
'create_time': time.time(),
'status': 'pending'
}
report_id = self.db.create_comment_report(report)
# 更新评论被举报次数
report_count = self.db.incr_comment_report_count(comment_id)
# 达到阈值自动隐藏
if report_count >= 5:
self.db.hide_comment(comment_id, reason='auto_hide_by_reports')
# 加入审核队列
self.redis.lpush('moderation:comment_reports', report_id)
return {'success': True, 'report_id': report_id}
Anti-Fraud System
python
"""
反作弊系统
检测和防止刷量、刷赞、刷评论等行为
"""
import numpy as np
class AntiFraudSystem:
"""
反作弊系统
"""
def __init__(self, db, redis_client, ml_model):
self.db = db
self.redis = redis_client
self.ml_model = ml_model
def check_action(self, user_id, action_type, target_id, context=None):
"""
检查用户行为是否可疑
Args:
user_id: 用户ID
action_type: 行为类型 (play/like/comment/follow/share)
target_id: 目标ID (内容ID/用户ID)
context: 上下文信息
Returns:
dict: {
'is_fraud': bool,
'risk_score': float,
'reason': str
}
"""
result = {
'is_fraud': False,
'risk_score': 0,
'reasons': []
}
# 1. 频率检查
rate_check = self.check_action_rate(user_id, action_type)
if rate_check['is_abnormal']:
result['risk_score'] += rate_check['score']
result['reasons'].append(rate_check['reason'])
# 2. 设备/IP检查
device_check = self.check_device_ip(user_id, context)
if device_check['is_abnormal']:
result['risk_score'] += device_check['score']
result['reasons'].append(device_check['reason'])
# 3. 行为模式检查
pattern_check = self.check_behavior_pattern(user_id, action_type)
if pattern_check['is_abnormal']:
result['risk_score'] += pattern_check['score']
result['reasons'].append(pattern_check['reason'])
# 4. 关联账户检查
relation_check = self.check_account_relation(user_id, target_id)
if relation_check['is_abnormal']:
result['risk_score'] += relation_check['score']
result['reasons'].append(relation_check['reason'])
# 5. ML模型预测
ml_check = self.ml_fraud_detection(user_id, action_type, target_id, context)
result['risk_score'] += ml_check['score']
# 判定
if result['risk_score'] >= 80:
result['is_fraud'] = True
# 记录可疑行为
if result['risk_score'] >= 50:
self.log_suspicious_action(user_id, action_type, target_id, result)
return result
def check_action_rate(self, user_id, action_type):
"""
频率异常检测
"""
# 获取各时间窗口的行为次数
windows = {
'1min': 60,
'10min': 600,
'1hour': 3600,
'1day': 86400
}
# 正常阈值
thresholds = {
'play': {'1min': 10, '10min': 50, '1hour': 200, '1day': 500},
'like': {'1min': 5, '10min': 30, '1hour': 100, '1day': 300},
'comment': {'1min': 3, '10min': 15, '1hour': 50, '1day': 100},
'follow': {'1min': 3, '10min': 20, '1hour': 50, '1day': 100},
'share': {'1min': 3, '10min': 10, '1hour': 30, '1day': 50},
}
is_abnormal = False
score = 0
reasons = []
for window_name, window_seconds in windows.items():
key = f'user:{user_id}:{action_type}:count:{window_name}'
count = int(self.redis.get(key) or 0)
threshold = thresholds.get(action_type, {}).get(window_name, 100)
if count > threshold:
is_abnormal = True
exceed_ratio = count / threshold
score += min(exceed_ratio * 20, 40)
reasons.append(f'{window_name}内{action_type}次数异常: {count}')
return {
'is_abnormal': is_abnormal,
'score': score,
'reason': '; '.join(reasons) if reasons else None
}
def check_device_ip(self, user_id, context):
"""
设备/IP异常检测
"""
if not context:
return {'is_abnormal': False, 'score': 0}
is_abnormal = False
score = 0
reasons = []
device_id = context.get('device_id')
ip = context.get('ip')
# 检查同一设备关联的账户数
if device_id:
device_users = self.redis.scard(f'device:{device_id}:users')
if device_users > 3:
is_abnormal = True
score += 30
reasons.append(f'设备关联{device_users}个账户')
# 检查同一IP关联的账户数
if ip:
ip_users = self.redis.scard(f'ip:{ip}:users')
if ip_users > 10:
is_abnormal = True
score += 20
reasons.append(f'IP关联{ip_users}个账户')
# 检查是否代理IP
if self.is_proxy_ip(ip):
score += 15
reasons.append('使用代理IP')
# 检查设备是否模拟器
if context.get('is_emulator'):
score += 25
reasons.append('使用模拟器')
return {
'is_abnormal': is_abnormal,
'score': score,
'reason': '; '.join(reasons) if reasons else None
}
def check_behavior_pattern(self, user_id, action_type):
"""
行为模式异常检测
"""
# 获取用户最近行为序列
recent_actions = self.db.get_user_recent_actions(user_id, limit=100)
is_abnormal = False
score = 0
reasons = []
# 检查行为时间间隔是否过于规律(机器行为)
if len(recent_actions) >= 10:
intervals = []
for i in range(1, len(recent_actions)):
interval = recent_actions[i-1]['time'] - recent_actions[i]['time']
intervals.append(interval)
# 计算间隔的标准差
if intervals:
std = np.std(intervals)
mean = np.mean(intervals)
# 间隔过于规律(标准差小)
if std < mean * 0.1 and mean < 5: # 平均间隔<5秒且非常规律
is_abnormal = True
score += 40
reasons.append('行为时间间隔过于规律')
# 检查是否批量操作同一批内容/用户
if action_type in ['like', 'follow']:
targets = [a['target_id'] for a in recent_actions if a['type'] == action_type]
if len(targets) >= 10:
# 检查目标是否有关联(同一作者、同一时间发布等)
target_authors = [self.db.get_content_author(t) for t in targets[:20]]
unique_authors = len(set(target_authors))
if unique_authors <= 2:
is_abnormal = True
score += 35
reasons.append('集中操作同一作者的内容')
return {
'is_abnormal': is_abnormal,
'score': score,
'reason': '; '.join(reasons) if reasons else None
}
def handle_fraud_detection(self, user_id, action_type, fraud_result):
"""
处理检测到的作弊行为
"""
risk_score = fraud_result['risk_score']
if risk_score >= 90:
# 高风险:立即封禁账户
self.db.ban_user(user_id, reason='fraud_detected', duration=86400*30)
self.notify_user_banned(user_id)
elif risk_score >= 70:
# 中高风险:限制功能 + 人工审核
self.db.restrict_user(user_id, restrictions=['comment', 'follow'])
self.add_to_manual_review(user_id, fraud_result)
elif risk_score >= 50:
# 中风险:行为不计数 + 警告
self.db.mark_action_invalid(user_id, action_type)
self.warn_user(user_id)
# 记录
self.log_fraud_action(user_id, action_type, fraud_result)
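`check_action_rate` only reads the per-window counters; the code that maintains them is not shown above. Below is a minimal write-path sketch, assuming the same `user:{user_id}:{action_type}:count:{window}` key convention and the same incr-then-expire pattern used in `check_rate_limit` (fixed windows keyed by TTL, which is approximate but cheap).

```python
def record_action(redis_client, user_id, action_type):
    """Bump the fixed-window counters that check_action_rate() reads.
    Call once per user action, before running the fraud checks."""
    windows = {'1min': 60, '10min': 600, '1hour': 3600, '1day': 86400}
    for window_name, ttl in windows.items():
        key = f'user:{user_id}:{action_type}:count:{window_name}'
        count = redis_client.incr(key)
        if count == 1:              # first event in this window starts the TTL
            redis_client.expire(key, ttl)
```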
Summary
The core technical building blocks of a content platform:
| Module | Key techniques |
|---|---|
| Recall | Collaborative filtering, vector recall (ANN), hot recall, follow recall |
| Ranking | Multi-objective deep learning (MMoE), CTR/CVR estimation |
| Re-ranking & diversity | MMR, rule-based scattering, new-content insertion |
| Traffic pools | Tiered distribution, metric-based promotion, time decay |
| Creator revenue | CPM share, interaction rewards, quality multipliers |
| New-user cold start | Profile inference, exploratory recommendations, onboarding tasks |
| Comment ranking | Hotness score, time decay, personalization boost |
| Anti-fraud | Rate checks, device fingerprinting, behavior patterns, ML models |
The design principles behind this kind of system:
- Fairness: give high-quality content a real chance to be seen
- Efficiency: help users find content they care about quickly
- Ecosystem health: keep creators motivated to produce good content
- Safety: prevent cheating and keep the platform in order
I hope this post helps you make sense of the technical logic behind content platforms. Questions and comments are welcome below~
References:
- 《推荐系统实践》 (Recommender System Practice)
- YouTube/TikTok recommendation system papers
- Bilibili engineering blog
- ByteDance recommendation algorithm talks