项目1:微服务电商平台
项目简介
构建一个完整的微服务架构电商平台,包含用户服务、商品服务、订单服务、支付服务、推荐服务等多个独立服务,使用消息队列进行服务间通信,实现高可用、可扩展的分布式系统。
技术栈
后端服务
- 框架:FastAPI / Flask
- 通信:gRPC / RESTful API
- 消息队列:RabbitMQ / Kafka
- 缓存:Redis Cluster
- 数据库:PostgreSQL / MySQL + MongoDB
- 搜索引擎:Elasticsearch
基础设施
- 容器化:Docker + Docker Compose
- 编排:Kubernetes (K8s)
- 服务网格:Istio
- 配置中心:Consul / Nacos
- 服务注册与发现:Consul / Eureka
监控与日志
- 监控:Prometheus + Grafana
- 日志:ELK Stack (Elasticsearch + Logstash + Kibana)
- 链路追踪:Jaeger / Zipkin
- 告警:Alertmanager
系统架构
┌─────────────────────────────────────────────────────────┐
│ API Gateway (Kong/Nginx) │
│ 服务网关 + 负载均衡 │
└─────────────────────────────────────────────────────────┘
│
┌───────────────────┼───────────────────┐
│ │ │
┌───────▼──────┐ ┌──────▼──────┐ ┌──────▼──────┐
│ 用户服务 │ │ 商品服务 │ │ 订单服务 │
│ (User) │ │ (Product) │ │ (Order) │
│ FastAPI │ │ FastAPI │ │ FastAPI │
│ PostgreSQL │ │ PostgreSQL │ │ PostgreSQL │
└──────┬───────┘ └──────┬──────┘ └──────┬──────┘
│ │ │
└───────────────────┼───────────────────┘
│
┌───────────▼──────────┐
│ 消息队列 (RabbitMQ) │
└───────────┬──────────┘
│
┌──────────────────┼──────────────────┐
│ │ │
┌───────▼──────┐ ┌──────▼──────┐ ┌──────▼──────┐
│ 支付服务 │ │ 库存服务 │ │ 推荐服务 │
│ (Payment) │ │ (Inventory) │ │(Recommend) │
│ FastAPI │ │ FastAPI │ │ FastAPI │
└──────────────┘ └─────────────┘ └─────────────┘
│
┌───────────▼──────────┐
│ Redis Cluster │
│ (缓存层) │
└──────────────────────┘
核心服务实现
1. API网关(gateway/main.py)
python
from fastapi import FastAPI, Request, HTTPException
from fastapi.responses import JSONResponse
import httpx
import asyncio
from typing import Dict
import jwt
from datetime import datetime, timedelta
import redis
from circuit_breaker import CircuitBreaker
import os
app = FastAPI(title="API Gateway")
# 服务注册表
SERVICE_REGISTRY = {
"user": "http://user-service:8001",
"product": "http://product-service:8002",
"order": "http://order-service:8003",
"payment": "http://payment-service:8004"
}
# 从环境变量获取配置
REDIS_HOST = os.getenv('REDIS_HOST', 'redis')
REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD') # 生产环境应设置密码
# Redis连接(用于限流)
redis_client = redis.Redis(
host=REDIS_HOST,
port=REDIS_PORT,
password=REDIS_PASSWORD,
decode_responses=True
)
# 熔断器字典
circuit_breakers: Dict[str, CircuitBreaker] = {
service: CircuitBreaker(failure_threshold=5, timeout=60)
for service in SERVICE_REGISTRY
}
class RateLimiter:
"""限流器"""
def __init__(self, max_requests: int = 100, window: int = 60):
self.max_requests = max_requests
self.window = window
    async def is_allowed(self, key: str) -> bool:
        """检查是否允许请求(用INCR原子自增,避免先get后incr的竞态)"""
        current = redis_client.incr(key)
        if current == 1:
            # 窗口内第一个请求,设置过期时间
            redis_client.expire(key, self.window)
        return current <= self.max_requests
rate_limiter = RateLimiter(max_requests=100, window=60)
# 从环境变量获取JWT密钥(生产环境必须设置)
JWT_SECRET_KEY = os.getenv('JWT_SECRET_KEY')
if not JWT_SECRET_KEY:
raise ValueError("JWT_SECRET_KEY环境变量未设置,请在生产环境中配置")
JWT_ALGORITHM = os.getenv('JWT_ALGORITHM', 'HS256')
async def verify_token(token: str) -> dict:
"""验证JWT Token"""
try:
payload = jwt.decode(token, JWT_SECRET_KEY, algorithms=[JWT_ALGORITHM])
return payload
except jwt.ExpiredSignatureError:
raise HTTPException(status_code=401, detail="Token已过期")
    except jwt.InvalidTokenError:  # PyJWT的通用无效Token异常
raise HTTPException(status_code=401, detail="Token无效")
@app.middleware("http")
async def auth_middleware(request: Request, call_next):
"""认证中间件"""
# 排除不需要认证的路径
if request.url.path in ["/health", "/login", "/register"]:
return await call_next(request)
# 获取Token
token = request.headers.get("Authorization", "").replace("Bearer ", "")
if not token:
return JSONResponse(
status_code=401,
content={"detail": "未提供认证Token"}
)
# 验证Token
try:
user_data = await verify_token(token)
request.state.user = user_data
except HTTPException as e:
return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
return await call_next(request)
@app.middleware("http")
async def rate_limit_middleware(request: Request, call_next):
"""限流中间件"""
# 获取客户端IP
client_ip = request.client.host
key = f"rate_limit:{client_ip}"
# 检查限流
if not await rate_limiter.is_allowed(key):
return JSONResponse(
status_code=429,
content={"detail": "请求过于频繁,请稍后再试"}
)
return await call_next(request)
async def forward_request(service: str, path: str, method: str, **kwargs):
"""转发请求到微服务"""
service_url = SERVICE_REGISTRY.get(service)
if not service_url:
raise HTTPException(status_code=404, detail=f"服务 {service} 不存在")
url = f"{service_url}{path}"
circuit_breaker = circuit_breakers[service]
# 检查熔断器状态
if circuit_breaker.is_open():
raise HTTPException(
status_code=503,
detail=f"服务 {service} 暂时不可用(熔断器打开)"
)
try:
async with httpx.AsyncClient(timeout=10.0) as client:
response = await getattr(client, method.lower())(url, **kwargs)
# 成功调用,记录成功
circuit_breaker.record_success()
return response.json()
except Exception as e:
# 调用失败,记录失败
circuit_breaker.record_failure()
raise HTTPException(status_code=503, detail=f"服务调用失败: {str(e)}")
# 路由转发示例
@app.api_route("/api/users/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def user_service_proxy(path: str, request: Request):
"""用户服务代理"""
return await forward_request(
service="user",
path=f"/{path}",
method=request.method,
headers=dict(request.headers),
json=await request.json() if request.method in ["POST", "PUT"] else None
)
@app.api_route("/api/products/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def product_service_proxy(path: str, request: Request):
"""商品服务代理"""
return await forward_request(
service="product",
path=f"/{path}",
method=request.method,
headers=dict(request.headers),
json=await request.json() if request.method in ["POST", "PUT"] else None
)
@app.get("/health")
async def health_check():
"""健康检查"""
return {"status": "healthy", "timestamp": datetime.now().isoformat()}
@app.get("/services/status")
async def services_status():
"""服务状态"""
status = {}
async with httpx.AsyncClient(timeout=5.0) as client:
for service, url in SERVICE_REGISTRY.items():
try:
response = await client.get(f"{url}/health")
status[service] = {
"status": "healthy" if response.status_code == 200 else "unhealthy",
"circuit_breaker": "closed" if not circuit_breakers[service].is_open() else "open"
}
except Exception:
status[service] = {
"status": "unreachable",
"circuit_breaker": "open" if circuit_breakers[service].is_open() else "closed"
}
return status
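网关只负责校验Token,并未展示Token如何签发。下面是一个用PyJWT签发访问Token的最小示意(假设登录校验已在用户服务中完成,create_access_token 为示例函数名,并非本项目已有代码):
python
import os
from datetime import datetime, timedelta, timezone
import jwt  # PyJWT

JWT_SECRET_KEY = os.getenv('JWT_SECRET_KEY')  # 与网关共用同一密钥
JWT_ALGORITHM = os.getenv('JWT_ALGORITHM', 'HS256')

def create_access_token(user_id: int, expires_minutes: int = 30) -> str:
    """签发JWT:sub存放用户ID,exp为过期时间,供网关的verify_token校验"""
    now = datetime.now(timezone.utc)
    payload = {
        'sub': str(user_id),
        'iat': now,
        'exp': now + timedelta(minutes=expires_minutes),
    }
    return jwt.encode(payload, JWT_SECRET_KEY, algorithm=JWT_ALGORITHM)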
2. 熔断器实现(circuit_breaker.py)
python
from datetime import datetime, timedelta
from enum import Enum
import threading
class CircuitState(Enum):
"""熔断器状态"""
CLOSED = "closed" # 关闭(正常)
OPEN = "open" # 打开(熔断)
HALF_OPEN = "half_open" # 半开(尝试恢复)
class CircuitBreaker:
"""熔断器模式实现"""
def __init__(self, failure_threshold: int = 5, timeout: int = 60):
"""
初始化熔断器
:param failure_threshold: 失败次数阈值
:param timeout: 熔断超时时间(秒)
"""
self.failure_threshold = failure_threshold
self.timeout = timeout
self.failure_count = 0
self.last_failure_time = None
self.state = CircuitState.CLOSED
self.lock = threading.Lock()
def is_open(self) -> bool:
"""检查熔断器是否打开"""
with self.lock:
if self.state == CircuitState.OPEN:
# 检查是否可以进入半开状态
if datetime.now() - self.last_failure_time > timedelta(seconds=self.timeout):
self.state = CircuitState.HALF_OPEN
return False
return True
return False
def record_success(self):
"""记录成功调用"""
with self.lock:
if self.state == CircuitState.HALF_OPEN:
# 半开状态下成功,恢复到关闭状态
self.state = CircuitState.CLOSED
self.failure_count = 0
def record_failure(self):
"""记录失败调用"""
with self.lock:
self.failure_count += 1
self.last_failure_time = datetime.now()
if self.failure_count >= self.failure_threshold:
self.state = CircuitState.OPEN
3. 订单服务(order-service/main.py)
python
from fastapi import FastAPI, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session
from typing import List
import asyncio
import aio_pika
import json
from datetime import datetime
from . import models, schemas, database
# SagaOrchestrator 在本文件下方定义;若拆分到独立的saga_orchestrator.py,可改为在此处导入
app = FastAPI(title="Order Service")
# 消息队列连接
async def get_rabbitmq_connection():
"""获取RabbitMQ连接"""
connection = await aio_pika.connect_robust("amqp://guest:guest@rabbitmq/")
return connection
@app.post("/orders", response_model=schemas.Order)
async def create_order(
order: schemas.OrderCreate,
background_tasks: BackgroundTasks,
db: Session = Depends(database.get_db)
):
"""
创建订单(使用Saga模式确保分布式事务)
步骤:
1. 创建订单(Order Service)
2. 扣减库存(Inventory Service)
3. 创建支付(Payment Service)
4. 发送通知(Notification Service)
"""
# 创建订单记录
db_order = models.Order(
user_id=order.user_id,
total_amount=order.total_amount,
status="pending",
created_at=datetime.now()
)
db.add(db_order)
db.commit()
db.refresh(db_order)
# 启动Saga编排
saga = SagaOrchestrator(order_id=db_order.id, db=db)
background_tasks.add_task(saga.execute, order)
return db_order
class SagaOrchestrator:
"""Saga模式编排器"""
def __init__(self, order_id: int, db: Session):
self.order_id = order_id
self.db = db
self.steps_completed = []
async def execute(self, order_data: schemas.OrderCreate):
"""执行Saga流程"""
try:
# Step 1: 扣减库存
await self.reduce_inventory(order_data.items)
self.steps_completed.append("inventory")
# Step 2: 创建支付
await self.create_payment(order_data.payment_method)
self.steps_completed.append("payment")
# Step 3: 发送通知
await self.send_notification()
self.steps_completed.append("notification")
# 所有步骤成功,更新订单状态
await self.update_order_status("completed")
except Exception as e:
# 发生错误,执行补偿事务(回滚)
await self.compensate()
await self.update_order_status("failed")
raise
async def reduce_inventory(self, items: List[dict]):
"""扣减库存(使用RPC模式等待确认)"""
connection = await get_rabbitmq_connection()
async with connection:
channel = await connection.channel()
# 声明响应队列
callback_queue = await channel.declare_queue('', exclusive=True)
message = {
"order_id": self.order_id,
"action": "reduce",
"items": items
}
# 创建Future对象用于等待响应
future = asyncio.Future()
async def on_response(message):
"""处理响应消息"""
response = json.loads(message.body.decode())
if response.get('order_id') == self.order_id:
if response.get('success'):
future.set_result(True)
else:
future.set_exception(Exception(response.get('error', '库存扣减失败')))
# 监听响应队列
await callback_queue.consume(on_response)
# 发送RPC请求
await channel.default_exchange.publish(
aio_pika.Message(
body=json.dumps(message).encode(),
reply_to=callback_queue.name,
correlation_id=str(self.order_id)
),
routing_key="inventory.reduce"
)
# 等待响应(设置超时)
try:
await asyncio.wait_for(future, timeout=10.0)
except asyncio.TimeoutError:
raise Exception("库存服务响应超时")
async def create_payment(self, payment_method: str):
"""创建支付"""
connection = await get_rabbitmq_connection()
async with connection:
channel = await connection.channel()
message = {
"order_id": self.order_id,
"action": "create",
"payment_method": payment_method
}
await channel.default_exchange.publish(
aio_pika.Message(body=json.dumps(message).encode()),
routing_key="payment.create"
)
await asyncio.sleep(1)
async def send_notification(self):
"""发送通知"""
connection = await get_rabbitmq_connection()
async with connection:
channel = await connection.channel()
message = {
"order_id": self.order_id,
"type": "order_created"
}
await channel.default_exchange.publish(
aio_pika.Message(body=json.dumps(message).encode()),
routing_key="notification.send"
)
async def compensate(self):
"""补偿事务(回滚)"""
# 按相反顺序回滚已完成的步骤
for step in reversed(self.steps_completed):
if step == "inventory":
await self.restore_inventory()
elif step == "payment":
await self.cancel_payment()
async def restore_inventory(self):
"""恢复库存"""
connection = await get_rabbitmq_connection()
async with connection:
channel = await connection.channel()
message = {
"order_id": self.order_id,
"action": "restore"
}
await channel.default_exchange.publish(
aio_pika.Message(body=json.dumps(message).encode()),
routing_key="inventory.restore"
)
async def cancel_payment(self):
"""取消支付"""
connection = await get_rabbitmq_connection()
async with connection:
channel = await connection.channel()
message = {
"order_id": self.order_id,
"action": "cancel"
}
await channel.default_exchange.publish(
aio_pika.Message(body=json.dumps(message).encode()),
routing_key="payment.cancel"
)
async def update_order_status(self, status: str):
"""更新订单状态"""
order = self.db.query(models.Order).filter(models.Order.id == self.order_id).first()
if order:
order.status = status
self.db.commit()
@app.get("/orders/{order_id}", response_model=schemas.Order)
async def get_order(order_id: int, db: Session = Depends(database.get_db)):
"""获取订单详情"""
order = db.query(models.Order).filter(models.Order.id == order_id).first()
if not order:
raise HTTPException(status_code=404, detail="订单不存在")
return order
@app.get("/health")
async def health_check():
"""健康检查"""
return {"status": "healthy"}
4. Docker Compose配置(docker-compose.yml)
yaml
version: '3.8'
services:
# API网关
gateway:
build: ./gateway
ports:
- "8000:8000"
environment:
- SERVICE_REGISTRY=consul:8500
depends_on:
- user-service
- product-service
- order-service
- redis
networks:
- microservices
# 用户服务
user-service:
build: ./user-service
ports:
- "8001:8001"
environment:
- DATABASE_URL=postgresql://user:password@postgres-user/userdb
- REDIS_URL=redis://redis:6379/0
depends_on:
- postgres-user
- redis
networks:
- microservices
# 商品服务
product-service:
build: ./product-service
ports:
- "8002:8002"
environment:
- DATABASE_URL=postgresql://user:password@postgres-product/productdb
- ELASTICSEARCH_URL=http://elasticsearch:9200
depends_on:
- postgres-product
- elasticsearch
networks:
- microservices
# 订单服务
order-service:
build: ./order-service
ports:
- "8003:8003"
environment:
- DATABASE_URL=postgresql://user:password@postgres-order/orderdb
- RABBITMQ_URL=amqp://guest:guest@rabbitmq:5672/
depends_on:
- postgres-order
- rabbitmq
networks:
- microservices
# PostgreSQL数据库
postgres-user:
image: postgres:14
environment:
POSTGRES_USER: user
POSTGRES_PASSWORD: password
POSTGRES_DB: userdb
volumes:
- postgres-user-data:/var/lib/postgresql/data
networks:
- microservices
postgres-product:
image: postgres:14
environment:
POSTGRES_USER: user
POSTGRES_PASSWORD: password
POSTGRES_DB: productdb
volumes:
- postgres-product-data:/var/lib/postgresql/data
networks:
- microservices
postgres-order:
image: postgres:14
environment:
POSTGRES_USER: user
POSTGRES_PASSWORD: password
POSTGRES_DB: orderdb
volumes:
- postgres-order-data:/var/lib/postgresql/data
networks:
- microservices
# Redis缓存
redis:
image: redis:7-alpine
ports:
- "6379:6379"
networks:
- microservices
# RabbitMQ消息队列
rabbitmq:
image: rabbitmq:3-management
ports:
- "5672:5672"
- "15672:15672"
environment:
RABBITMQ_DEFAULT_USER: guest
RABBITMQ_DEFAULT_PASS: guest
networks:
- microservices
# Elasticsearch搜索引擎
elasticsearch:
image: elasticsearch:8.5.0
environment:
- discovery.type=single-node
- xpack.security.enabled=false
ports:
- "9200:9200"
networks:
- microservices
# Prometheus监控
prometheus:
image: prom/prometheus
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
networks:
- microservices
# Grafana可视化
grafana:
image: grafana/grafana
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
networks:
- microservices
networks:
microservices:
driver: bridge
volumes:
postgres-user-data:
postgres-product-data:
postgres-order-data:
部署和运维
Kubernetes部署配置(k8s/deployment.yaml)
yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: order-service
labels:
app: order-service
spec:
replicas: 3
selector:
matchLabels:
app: order-service
template:
metadata:
labels:
app: order-service
spec:
containers:
- name: order-service
image: your-registry/order-service:latest
ports:
- containerPort: 8003
env:
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: db-secrets
key: order-db-url
- name: RABBITMQ_URL
valueFrom:
secretKeyRef:
name: mq-secrets
key: rabbitmq-url
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
livenessProbe:
httpGet:
path: /health
port: 8003
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /health
port: 8003
initialDelaySeconds: 5
periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
name: order-service
spec:
selector:
app: order-service
ports:
- protocol: TCP
port: 8003
targetPort: 8003
type: ClusterIP
---
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: order-service-hpa
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: order-service
minReplicas: 3
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
性能优化要点
- 数据库优化(读写分离的简单示意见本节末尾)
  - 使用数据库连接池
  - 添加适当索引
  - 读写分离
  - 分库分表
- 缓存策略
  - Redis多级缓存
  - CDN加速静态资源
  - 本地缓存热点数据
- 异步处理
  - 消息队列异步处理
  - 使用asyncio提高并发
- 负载均衡
  - Nginx反向代理
  - K8s Service负载均衡
  - 数据库读负载均衡
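上面"读写分离"一项,可以在应用层用两个SQLAlchemy引擎做最简单的路由。下面是一个示意(postgres-primary/postgres-replica 为假设的主从库主机名,连接串仅作示例):
python
import os
from sqlalchemy import create_engine, text

# 主库负责写、从库负责读;连接串为假设值,实际应来自环境变量或配置中心
write_engine = create_engine(
    os.getenv('PRIMARY_DATABASE_URL', 'postgresql://user:password@postgres-primary/orderdb'),
    pool_size=10, pool_pre_ping=True
)
read_engine = create_engine(
    os.getenv('REPLICA_DATABASE_URL', 'postgresql://user:password@postgres-replica/orderdb'),
    pool_size=20, pool_pre_ping=True
)

def get_engine(readonly: bool = False):
    """只读查询走从库,写操作走主库"""
    return read_engine if readonly else write_engine

# 使用示例:统计类查询走从库
with get_engine(readonly=True).connect() as conn:
    rows = conn.execute(text("SELECT id, status FROM orders LIMIT 10")).fetchall()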
环境变量配置说明
在生产环境部署时,需要配置以下环境变量:
bash
# API网关配置
export JWT_SECRET_KEY="your-super-secret-jwt-key-min-32-chars"
export JWT_ALGORITHM="HS256"
export REDIS_HOST="redis"
export REDIS_PORT="6379"
export REDIS_PASSWORD="your-redis-password"
# 数据库配置
export DATABASE_URL="postgresql://user:password@host:5432/dbname"
# RabbitMQ配置
export RABBITMQ_URL="amqp://user:password@host:5672/"
安全最佳实践
- 敏感信息管理
  - 不要在代码中硬编码密钥、密码
  - 使用环境变量或密钥管理服务(如AWS Secrets Manager、HashiCorp Vault)
  - 在生产环境中启用Redis密码认证
  - 使用强密码策略(至少32位随机字符)
- 数据库安全
  - 使用连接池避免资源耗尽
  - 启用SSL/TLS加密数据库连接
  - 定期备份数据库
  - 使用预编译语句防止SQL注入
- 服务间通信
  - 使用RPC确认机制而非简单的sleep
  - 实现幂等性保证重试安全(消费端去重的示意见本节末尾)
  - 添加消息过期时间防止队列积压
  - 使用TLS加密RabbitMQ连接
- 监控与告警
  - 监控API限流指标
  - 监控熔断器状态
  - 设置资源使用告警
  - 记录所有安全相关事件
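其中"实现幂等性保证重试安全",常见做法是消费端按业务ID去重。下面是一个基于Redis SET NX 的最小示意(键名、过期时间与 reduce_stock 均为假设,非本项目已有代码):
python
import json
import redis

redis_client = redis.Redis(host='redis', port=6379, decode_responses=True)

def handle_inventory_message(raw_body: bytes):
    """库存消息消费端:同一order_id只处理一次,重复投递直接跳过"""
    message = json.loads(raw_body.decode())
    dedup_key = f"processed:inventory:{message['order_id']}"
    # SET NX + 过期时间:首次写入成功才继续处理
    first_time = redis_client.set(dedup_key, 1, nx=True, ex=24 * 3600)
    if not first_time:
        return  # 重复消息,直接确认并丢弃
    reduce_stock(message['items'])

def reduce_stock(items):
    """假设的业务函数:实际扣减逻辑应在库存服务中实现"""
    ...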
项目2:机器学习平台(MLOps)
项目简介
构建一个端到端的机器学习平台,支持数据处理、模型训练、模型部署、A/B测试和模型监控。
技术栈
- ML框架:scikit-learn, TensorFlow, PyTorch
- 数据处理:pandas, numpy, Spark
- 模型服务:FastAPI, TensorFlow Serving
- 实验管理:MLflow
- 特征存储:Feast
- 工作流:Airflow / Prefect
- 模型监控:Evidently AI
系统架构
python
# ml-platform/pipeline/training_pipeline.py
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sqlalchemy import create_engine
import os
# 数据库连接配置
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://user:password@localhost:5432/ml_data')
db_engine = create_engine(DATABASE_URL)
default_args = {
'owner': 'ml-team',
'depends_on_past': False,
'start_date': datetime(2024, 1, 1),
'email_on_failure': True,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=5),
}
dag = DAG(
'model_training_pipeline',
default_args=default_args,
description='端到端模型训练流程',
schedule_interval=timedelta(days=1),
)
def extract_data(**context):
"""数据提取"""
# 从数据仓库提取数据
df = pd.read_sql(
"SELECT * FROM features WHERE date >= CURRENT_DATE - 30",
con=db_engine
)
df.to_parquet('/tmp/raw_data.parquet')
return '/tmp/raw_data.parquet'
def transform_data(**context):
"""数据转换"""
df = pd.read_parquet(context['task_instance'].xcom_pull(task_ids='extract'))
# 数据清洗
df = df.dropna()
df = df[df['value'] > 0]
# 特征工程
df['feature_1'] = df['col_a'] / df['col_b']
df['feature_2'] = df['col_c'].apply(lambda x: x**2)
processed_path = '/tmp/processed_data.parquet'
df.to_parquet(processed_path)
return processed_path
def train_model(**context):
"""模型训练"""
df = pd.read_parquet(context['task_instance'].xcom_pull(task_ids='transform'))
# 分割数据
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# 使用MLflow跟踪实验
with mlflow.start_run():
# 记录参数
params = {
'n_estimators': 100,
'max_depth': 10,
'min_samples_split': 5
}
mlflow.log_params(params)
# 训练模型
model = RandomForestClassifier(**params)
model.fit(X_train, y_train)
# 评估模型
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
# 记录指标
mlflow.log_metrics({
'accuracy': accuracy,
'f1_score': f1
})
# 保存模型
mlflow.sklearn.log_model(model, "model")
# 注册模型
model_uri = f"runs:/{mlflow.active_run().info.run_id}/model"
mlflow.register_model(model_uri, "production_model")
def validate_model(**context):
"""模型验证"""
# 加载模型
model = mlflow.sklearn.load_model("models:/production_model/latest")
# 验证集测试
# ...
# 决定是否部署
return True
def deploy_model(**context):
"""模型部署"""
# 部署到生产环境
# 可以使用TensorFlow Serving, BentoML等
pass
# 定义任务
extract_task = PythonOperator(
task_id='extract',
python_callable=extract_data,
dag=dag,
)
transform_task = PythonOperator(
task_id='transform',
python_callable=transform_data,
dag=dag,
)
train_task = PythonOperator(
task_id='train',
python_callable=train_model,
dag=dag,
)
validate_task = PythonOperator(
task_id='validate',
python_callable=validate_model,
dag=dag,
)
deploy_task = PythonOperator(
task_id='deploy',
python_callable=deploy_model,
dag=dag,
)
# 设置依赖
extract_task >> transform_task >> train_task >> validate_task >> deploy_task
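上面的 validate_model 只是占位。下面给出一个基于保留验证集做门控的示意(验证集路径 /tmp/holdout.parquet 与阈值 0.85 均为假设值):
python
import mlflow
import pandas as pd
from sklearn.metrics import accuracy_score

def validate_model(**context):
    """加载最新注册的模型,在保留验证集上评估;低于阈值则抛异常,阻断后续deploy任务"""
    model = mlflow.sklearn.load_model("models:/production_model/latest")
    holdout = pd.read_parquet('/tmp/holdout.parquet')
    X, y = holdout.drop('target', axis=1), holdout['target']
    accuracy = accuracy_score(y, model.predict(X))
    if accuracy < 0.85:
        raise ValueError(f"模型验证未通过: accuracy={accuracy:.3f}")
    return True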
模型服务API
python
# ml-platform/serving/main.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import mlflow
import numpy as np
from typing import List
import redis
import json
import os
import hashlib
app = FastAPI(title="ML Model Serving")
# 从环境变量获取配置
REDIS_HOST = os.getenv('REDIS_HOST', 'redis')
REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')
# 加载模型
MODEL_URI = os.getenv('MODEL_URI', 'models:/production_model/latest')
model = mlflow.sklearn.load_model(MODEL_URI)
# Redis缓存
redis_client = redis.Redis(
host=REDIS_HOST,
port=REDIS_PORT,
password=REDIS_PASSWORD,
decode_responses=True
)
class PredictionRequest(BaseModel):
features: List[float]
class PredictionResponse(BaseModel):
prediction: int
probability: float
model_version: str
@app.post("/predict", response_model=PredictionResponse)
async def predict(request: PredictionRequest):
"""预测接口"""
# 检查缓存
cache_key = f"prediction:{hash(str(request.features))}"
cached = redis_client.get(cache_key)
if cached:
return json.loads(cached)
# 预测
features = np.array(request.features).reshape(1, -1)
prediction = model.predict(features)[0]
probability = model.predict_proba(features)[0].max()
result = {
"prediction": int(prediction),
"probability": float(probability),
"model_version": "1.0.0"
}
# 缓存结果
redis_client.setex(cache_key, 3600, json.dumps(result))
return result
@app.post("/batch_predict")
async def batch_predict(requests: List[PredictionRequest]):
"""批量预测"""
features = np.array([req.features for req in requests])
predictions = model.predict(features)
probabilities = model.predict_proba(features).max(axis=1)
return [
{
"prediction": int(pred),
"probability": float(prob)
}
for pred, prob in zip(predictions, probabilities)
]
项目3:分布式爬虫系统
项目简介
构建大规模分布式爬虫系统,支持分布式调度、增量爬取、智能去重、反爬虫策略等。
技术栈
- 爬虫框架:Scrapy Distributed
- 任务队列:Celery + Redis
- 数据存储:MongoDB + Elasticsearch
- 代理池:自建代理池
- 去重:BloomFilter
- 调度:Scrapy-Redis
核心实现
python
# distributed-spider/spiders/news_spider.py
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy.http import Request
import hashlib
from pybloom_live import BloomFilter # pip install pybloom-live
import redis
import httpx  # 供下方代理校验使用
class DistributedNewsSpider(RedisSpider):
"""分布式新闻爬虫"""
name = 'distributed_news'
redis_key = 'news:start_urls'
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# 初始化布隆过滤器(去重)
# 注意:BloomFilter参数为 capacity(容量)和 error_rate(错误率)
self.bloom_filter = BloomFilter(capacity=10000000, error_rate=0.001)
def make_request_from_data(self, data):
"""从Redis队列获取URL"""
url = data.decode('utf-8')
return Request(url, callback=self.parse, dont_filter=True)
def parse(self, response):
"""解析页面"""
# 提取文章链接
for article in response.css('article.item'):
article_url = response.urljoin(article.css('a::attr(href)').get())
# URL去重
url_hash = hashlib.md5(article_url.encode()).hexdigest()
if url_hash in self.bloom_filter:
continue
self.bloom_filter.add(url_hash)
yield Request(article_url, callback=self.parse_article,
meta={'proxy': self.get_proxy()})
# 翻页
next_page = response.css('a.next::attr(href)').get()
if next_page:
yield Request(response.urljoin(next_page), callback=self.parse)
def parse_article(self, response):
"""解析文章详情"""
yield {
'url': response.url,
'title': response.css('h1::text').get(),
'content': ''.join(response.css('div.content p::text').getall()),
'publish_date': response.css('time::attr(datetime)').get(),
'author': response.css('span.author::text').get(),
}
def get_proxy(self):
"""从代理池获取代理"""
        # 从Redis获取可用代理(scrapy-redis在spider上暴露的Redis连接是 self.server)
        proxy = self.server.spop('proxy:available')
        return proxy.decode() if proxy else None
# 代理池管理
class ProxyPool:
"""代理池"""
def __init__(self, redis_client):
self.redis = redis_client
async def fetch_proxies(self):
"""获取代理"""
        # 从代理提供商获取(fetch_from_providers 需根据所用代理服务自行实现)
        proxies = await self.fetch_from_providers()
for proxy in proxies:
if await self.validate_proxy(proxy):
self.redis.sadd('proxy:available', proxy)
async def validate_proxy(self, proxy):
"""验证代理"""
try:
async with httpx.AsyncClient(proxies={"http://": proxy}) as client:
response = await client.get('http://httpbin.org/ip', timeout=5)
return response.status_code == 200
        except Exception:
return False
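RedisSpider 从 redis_key 指定的队列取起始URL,爬虫启动后需要向该队列投递种子URL,示意如下(URL为示例值):
python
import redis

r = redis.Redis(host='localhost', port=6379)
seed_urls = [
    'https://example.com/news/page/1',
    'https://example.com/news/page/2',
]
for url in seed_urls:
    # 与爬虫中的 redis_key = 'news:start_urls' 保持一致
    r.lpush('news:start_urls', url)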
项目4:实时数据处理平台
项目简介
构建一个实时数据处理平台,使用Kafka进行消息传递,Spark Streaming进行流式计算,实现秒级数据处理和实时分析。
技术栈
- 消息队列:Apache Kafka
- 流处理:PySpark Streaming / Flink PyFlink
- 存储:ClickHouse / TimescaleDB
- 缓存:Redis
- 监控:Prometheus + Grafana
- 协调:Apache ZooKeeper
系统架构
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
│ 数据源 │ │ 数据源 │ │ 数据源 │
│ (App Logs) │ │ (IoT) │ │ (Events) │
└──────┬───────┘ └──────┬───────┘ └──────┬───────┘
│ │ │
└───────────────────┼───────────────────┘
│
┌────────▼────────┐
│ Kafka Cluster │
│ (消息队列) │
└────────┬────────┘
│
┌──────────────────┼──────────────────┐
│ │ │
┌───────▼──────┐ ┌────────▼────────┐ ┌─────▼─────┐
│ Spark Stream │ │ Flink Job │ │ Consumer │
│ (实时计算) │ │ (复杂事件处理) │ │ (存储) │
└───────┬──────┘ └────────┬────────┘ └─────┬─────┘
│ │ │
└──────────────────┼──────────────────┘
│
┌───────────▼──────────┐
│ ClickHouse │
│ (时序数据库) │
└───────────┬──────────┘
│
┌───────────▼──────────┐
│ Grafana Dashboard │
│ (实时监控) │
└──────────────────────┘
核心实现
1. Kafka生产者(producer/event_producer.py)
python
from kafka import KafkaProducer
from kafka.errors import KafkaError
import json
import time
from datetime import datetime
from typing import Dict, Any
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class EventProducer:
"""事件生产者"""
def __init__(self, bootstrap_servers: list, topic: str):
self.topic = topic
self.producer = KafkaProducer(
bootstrap_servers=bootstrap_servers,
value_serializer=lambda v: json.dumps(v).encode('utf-8'),
key_serializer=lambda k: k.encode('utf-8') if k else None,
# 性能优化配置
compression_type='snappy', # 压缩
linger_ms=10, # 批量发送延迟
batch_size=32768, # 批量大小
buffer_memory=67108864, # 缓冲区大小
# 可靠性配置
acks='all', # 所有副本确认
retries=3, # 重试次数
max_in_flight_requests_per_connection=1 # 保证顺序
)
def send_event(self, event_type: str, data: Dict[str, Any], key: str = None):
"""
发送事件
:param event_type: 事件类型
:param data: 事件数据
:param key: 分区键(相同key的消息会发送到同一分区,保证顺序)
"""
event = {
'event_type': event_type,
'timestamp': datetime.now().isoformat(),
'data': data
}
try:
# 异步发送
future = self.producer.send(
topic=self.topic,
value=event,
key=key
)
# 添加回调
future.add_callback(self.on_send_success)
future.add_errback(self.on_send_error)
except Exception as e:
logger.error(f"发送事件失败: {e}")
raise
def on_send_success(self, record_metadata):
"""发送成功回调"""
logger.debug(
f"消息发送成功: topic={record_metadata.topic}, "
f"partition={record_metadata.partition}, "
f"offset={record_metadata.offset}"
)
def on_send_error(self, exception):
"""发送失败回调"""
logger.error(f"消息发送失败: {exception}")
def flush(self):
"""强制发送所有缓冲消息"""
self.producer.flush()
def close(self):
"""关闭生产者"""
self.producer.close()
# 使用示例
if __name__ == "__main__":
producer = EventProducer(
bootstrap_servers=['localhost:9092'],
topic='user_events'
)
# 模拟发送用户行为事件
for i in range(100):
producer.send_event(
event_type='page_view',
data={
'user_id': f'user_{i % 10}',
'page': '/product/123',
'duration': i % 60
},
key=f'user_{i % 10}' # 同一用户的事件会发送到同一分区
)
time.sleep(0.1)
producer.flush()
producer.close()
2. Spark Streaming消费者(streaming/spark_consumer.py)
python
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
# 创建Spark Session
spark = SparkSession.builder \
.appName("RealtimeEventProcessing") \
.config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0") \
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
# 定义事件Schema
event_schema = StructType([
StructField("event_type", StringType(), True),
StructField("timestamp", StringType(), True),
StructField("data", StructType([
StructField("user_id", StringType(), True),
StructField("page", StringType(), True),
StructField("duration", IntegerType(), True)
]), True)
])
# 从Kafka读取流数据
kafka_df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "user_events") \
.option("startingOffsets", "latest") \
.load()
# 解析JSON数据
events_df = kafka_df.select(
from_json(col("value").cast("string"), event_schema).alias("event")
).select("event.*")
# 数据清洗和转换
cleaned_df = events_df \
.filter(col("data.user_id").isNotNull()) \
.withColumn("hour", hour(to_timestamp(col("timestamp")))) \
.withColumn("date", to_date(to_timestamp(col("timestamp"))))
# 实时聚合:计算每个用户每小时的浏览量
user_hourly_stats = cleaned_df \
.groupBy(
window(to_timestamp(col("timestamp")), "1 hour"),
col("data.user_id")
) \
.agg(
count("*").alias("page_views"),
sum("data.duration").alias("total_duration"),
avg("data.duration").alias("avg_duration")
)
# 输出到控制台(调试用)
console_query = user_hourly_stats.writeStream \
.outputMode("complete") \
.format("console") \
.option("truncate", False) \
.start()
# 输出到Kafka(供下游消费)
kafka_output_query = user_hourly_stats \
    .selectExpr(
        "CAST(user_id AS STRING) as key",
        "to_json(struct(*)) as value"
    ) \
    .writeStream \
    .outputMode("update") \
    .format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("topic", "user_stats") \
.option("checkpointLocation", "/tmp/checkpoint") \
.start()
# 输出到ClickHouse(持久化存储)
def write_to_clickhouse(batch_df, batch_id):
"""将批次数据写入ClickHouse"""
if not batch_df.isEmpty():
batch_df.write \
.format("jdbc") \
.option("url", "jdbc:clickhouse://localhost:8123/analytics") \
.option("dbtable", "user_hourly_stats") \
.option("user", "default") \
.option("password", "") \
.mode("append") \
.save()
clickhouse_query = user_hourly_stats.writeStream \
.foreachBatch(write_to_clickhouse) \
.outputMode("update") \
.start()
# 等待终止
spark.streams.awaitAnyTermination()
3. 实时异常检测(streaming/anomaly_detector.py)
python
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *  # StructType等,供下方Schema定义使用
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.clustering import KMeans
import numpy as np
class RealtimeAnomalyDetector:
"""实时异常检测"""
def __init__(self, spark: SparkSession):
self.spark = spark
self.model = None
self.scaler = None
def train_model(self, historical_data_df):
"""使用历史数据训练异常检测模型"""
# 特征工程
assembler = VectorAssembler(
inputCols=['page_views', 'total_duration', 'avg_duration'],
outputCol='features'
)
feature_df = assembler.transform(historical_data_df)
# 标准化
self.scaler = StandardScaler(
inputCol='features',
outputCol='scaled_features',
withStd=True,
withMean=True
)
        self.scaler_model = self.scaler.fit(feature_df)  # 保存拟合好的标准化器,供流式推理复用
        scaled_df = self.scaler_model.transform(feature_df)
# 训练KMeans模型
kmeans = KMeans(
k=3,
featuresCol='scaled_features',
predictionCol='cluster'
)
self.model = kmeans.fit(scaled_df)
def detect_anomalies(self, stream_df):
"""检测流数据中的异常"""
# 特征转换
assembler = VectorAssembler(
inputCols=['page_views', 'total_duration', 'avg_duration'],
outputCol='features'
)
        feature_df = assembler.transform(stream_df)
        # 复用训练时拟合的标准化器,再用KMeans预测簇编号(预测列名为'cluster')
        scaled_df = self.scaler_model.transform(feature_df)
        predictions = self.model.transform(scaled_df)
        # 标记异常(此处简化为:落入指定簇即视为异常,实际可改为按到簇中心的距离判断)
        anomaly_df = predictions \
            .withColumn(
                'is_anomaly',
                when(col('cluster') == 2, True).otherwise(False)
            )
return anomaly_df
# 使用示例
def process_stream_with_anomaly_detection():
spark = SparkSession.builder \
.appName("AnomalyDetection") \
.getOrCreate()
# 定义用户统计Schema
user_stats_schema = StructType([
StructField("user_id", StringType(), True),
StructField("window", StructType([
StructField("start", StringType(), True),
StructField("end", StringType(), True)
]), True),
StructField("page_views", LongType(), True),
StructField("total_duration", LongType(), True),
StructField("avg_duration", DoubleType(), True)
])
# 加载历史数据训练模型
historical_df = spark.read \
.format("jdbc") \
.option("url", "jdbc:clickhouse://localhost:8123/analytics") \
.option("dbtable", "user_hourly_stats") \
.load()
detector = RealtimeAnomalyDetector(spark)
detector.train_model(historical_df)
# 处理实时流
stream_df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "user_stats") \
.load()
# 解析并检测异常
parsed_df = stream_df.select(
from_json(col("value").cast("string"), user_stats_schema).alias("stats")
).select("stats.*")
anomaly_df = detector.detect_anomalies(parsed_df)
# 输出异常事件
anomaly_query = anomaly_df \
.filter(col("is_anomaly") == True) \
.writeStream \
.format("console") \
.outputMode("append") \
.start()
anomaly_query.awaitTermination()
4. 窗口聚合和会话分析(streaming/session_analysis.py)
python
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
spark = SparkSession.builder \
.appName("SessionAnalysis") \
.getOrCreate()
# 定义事件Schema
event_schema = StructType([
StructField("event_type", StringType(), True),
StructField("timestamp", StringType(), True),
StructField("data", StructType([
StructField("user_id", StringType(), True),
StructField("page", StringType(), True),
StructField("duration", IntegerType(), True)
]), True)
])
# 读取事件流
events_df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "user_events") \
.load()
# 解析事件
parsed_events = events_df.select(
    from_json(col("value").cast("string"), event_schema).alias("event")
).select("event.*")
# 将字符串时间戳转换为Timestamp类型:withWatermark与窗口函数都要求该类型
parsed_events = parsed_events.withColumn("timestamp", to_timestamp(col("timestamp")))
# 会话窗口分析(用户30分钟无活动则会话结束)
session_df = parsed_events \
.withWatermark("timestamp", "10 minutes") \
.groupBy(
col("data.user_id"),
session_window(col("timestamp"), "30 minutes")
) \
.agg(
count("*").alias("events_count"),
countDistinct("data.page").alias("unique_pages"),
sum("data.duration").alias("session_duration"),
collect_list("data.page").alias("page_sequence"),
min("timestamp").alias("session_start"),
max("timestamp").alias("session_end")
)
# 滑动窗口分析(每5分钟计算过去10分钟的统计)
sliding_window_stats = parsed_events \
.withWatermark("timestamp", "10 minutes") \
.groupBy(
window(col("timestamp"), "10 minutes", "5 minutes"),
col("event_type")
) \
.agg(
count("*").alias("event_count"),
countDistinct("data.user_id").alias("unique_users")
)
# 输出会话分析结果
session_query = session_df.writeStream \
.outputMode("append") \
.format("json") \
.option("path", "/output/sessions") \
.option("checkpointLocation", "/checkpoint/sessions") \
.start()
# 输出滑动窗口统计
sliding_query = sliding_window_stats.writeStream \
.outputMode("append") \
.format("console") \
.start()
spark.streams.awaitAnyTermination()
5. Docker Compose配置(docker-compose.yml)
yaml
version: '3.8'
services:
# ZooKeeper
zookeeper:
image: confluentinc/cp-zookeeper:7.3.0
environment:
ZOOKEEPER_CLIENT_PORT: 2181
ZOOKEEPER_TICK_TIME: 2000
ports:
- "2181:2181"
networks:
- streaming
# Kafka
kafka:
image: confluentinc/cp-kafka:7.3.0
depends_on:
- zookeeper
ports:
- "9092:9092"
environment:
KAFKA_BROKER_ID: 1
KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://localhost:9092
KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
KAFKA_AUTO_CREATE_TOPICS_ENABLE: "true"
networks:
- streaming
# ClickHouse数据库
clickhouse:
image: clickhouse/clickhouse-server:latest
ports:
- "8123:8123"
- "9000:9000"
environment:
CLICKHOUSE_DB: analytics
volumes:
- clickhouse-data:/var/lib/clickhouse
networks:
- streaming
# Spark Master
spark-master:
image: bitnami/spark:3.3.0
environment:
- SPARK_MODE=master
- SPARK_RPC_AUTHENTICATION_ENABLED=no
- SPARK_RPC_ENCRYPTION_ENABLED=no
ports:
- "8080:8080"
- "7077:7077"
networks:
- streaming
# Spark Worker
spark-worker:
image: bitnami/spark:3.3.0
depends_on:
- spark-master
environment:
- SPARK_MODE=worker
- SPARK_MASTER_URL=spark://spark-master:7077
- SPARK_WORKER_MEMORY=2G
- SPARK_WORKER_CORES=2
networks:
- streaming
# Redis缓存
redis:
image: redis:7-alpine
ports:
- "6379:6379"
networks:
- streaming
# Grafana
grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
networks:
- streaming
networks:
streaming:
driver: bridge
volumes:
clickhouse-data:
项目5:推荐系统平台
项目简介
构建一个完整的推荐系统,支持协同过滤、内容推荐、深度学习推荐等多种推荐算法,实现个性化推荐和实时推荐。
技术栈
- 推荐算法:协同过滤、矩阵分解、深度学习(Wide & Deep)
- 特征工程:pandas、scikit-learn
- 模型框架:TensorFlow、PyTorch
- 实时推荐:Redis、Kafka
- 离线计算:Spark
- A/B测试:自研框架
核心实现
1. 协同过滤推荐(recommender/collaborative_filtering.py)
python
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
class CollaborativeFilteringRecommender:
"""协同过滤推荐系统"""
def __init__(self, min_common_users=3):
self.min_common_users = min_common_users
self.user_item_matrix = None
self.item_similarity = None
def fit(self, ratings_df: pd.DataFrame):
"""
训练推荐模型
:param ratings_df: 评分数据,包含user_id, item_id, rating列
"""
# 构建用户-物品评分矩阵
self.user_item_matrix = ratings_df.pivot_table(
index='user_id',
columns='item_id',
values='rating',
fill_value=0
)
# 计算物品相似度矩阵(基于物品的协同过滤)
item_matrix = self.user_item_matrix.T
self.item_similarity = cosine_similarity(item_matrix)
# 转换为DataFrame方便查询
self.item_similarity_df = pd.DataFrame(
self.item_similarity,
index=item_matrix.index,
columns=item_matrix.index
)
def recommend_for_user(self, user_id: int, top_n: int = 10):
"""
为用户推荐物品
:param user_id: 用户ID
:param top_n: 推荐数量
:return: 推荐物品列表
"""
if user_id not in self.user_item_matrix.index:
return []
# 获取用户已评分的物品
user_ratings = self.user_item_matrix.loc[user_id]
rated_items = user_ratings[user_ratings > 0].index.tolist()
# 计算推荐分数
scores = {}
for item_id in self.item_similarity_df.columns:
if item_id in rated_items:
continue
# 基于用户已评分物品计算推荐分数
similar_items = self.item_similarity_df[item_id]
rated_similar = similar_items[rated_items]
# 加权平均
if len(rated_similar) > 0:
numerator = np.sum(
rated_similar.values * user_ratings[rated_items].values
)
denominator = np.sum(np.abs(rated_similar.values))
if denominator > 0:
scores[item_id] = numerator / denominator
# 排序并返回Top-N
recommendations = sorted(
scores.items(),
key=lambda x: x[1],
reverse=True
)[:top_n]
return [item_id for item_id, score in recommendations]
def similar_items(self, item_id: int, top_n: int = 10):
"""获取相似物品"""
if item_id not in self.item_similarity_df.columns:
return []
similar = self.item_similarity_df[item_id].sort_values(ascending=False)[1:top_n+1]
return similar.index.tolist()
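该协同过滤推荐器的使用示意如下(评分数据为构造的小样例):
python
import pandas as pd

ratings = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2, 3, 3],
    'item_id': [10, 20, 30, 10, 40, 20, 40],
    'rating':  [5, 3, 4, 4, 5, 2, 4],
})

recommender = CollaborativeFilteringRecommender()
recommender.fit(ratings)
print(recommender.recommend_for_user(user_id=1, top_n=3))  # 为用户1推荐未评分过的物品
print(recommender.similar_items(item_id=10, top_n=2))      # 与物品10最相似的物品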
2. 矩阵分解推荐(recommender/matrix_factorization.py)
python
import numpy as np
import pandas as pd
from typing import Tuple
class MatrixFactorization:
"""矩阵分解推荐(SVD)"""
def __init__(self, n_factors=50, learning_rate=0.01, regularization=0.02, n_epochs=100):
self.n_factors = n_factors
self.learning_rate = learning_rate
self.regularization = regularization
self.n_epochs = n_epochs
self.user_factors = None
self.item_factors = None
self.user_biases = None
self.item_biases = None
self.global_mean = None
def fit(self, ratings_df: pd.DataFrame):
"""训练模型"""
# 初始化
n_users = ratings_df['user_id'].nunique()
n_items = ratings_df['item_id'].nunique()
self.user_factors = np.random.normal(0, 0.1, (n_users, self.n_factors))
self.item_factors = np.random.normal(0, 0.1, (n_items, self.n_factors))
self.user_biases = np.zeros(n_users)
self.item_biases = np.zeros(n_items)
self.global_mean = ratings_df['rating'].mean()
# 创建用户和物品ID映射
self.user_id_map = {uid: idx for idx, uid in enumerate(ratings_df['user_id'].unique())}
self.item_id_map = {iid: idx for idx, iid in enumerate(ratings_df['item_id'].unique())}
# SGD训练
for epoch in range(self.n_epochs):
# 打乱数据
shuffled = ratings_df.sample(frac=1)
for _, row in shuffled.iterrows():
user_idx = self.user_id_map[row['user_id']]
item_idx = self.item_id_map[row['item_id']]
rating = row['rating']
# 预测
pred = self.predict_rating(user_idx, item_idx)
error = rating - pred
# 更新参数
self.user_factors[user_idx] += self.learning_rate * (
error * self.item_factors[item_idx] -
self.regularization * self.user_factors[user_idx]
)
self.item_factors[item_idx] += self.learning_rate * (
error * self.user_factors[user_idx] -
self.regularization * self.item_factors[item_idx]
)
self.user_biases[user_idx] += self.learning_rate * (
error - self.regularization * self.user_biases[user_idx]
)
self.item_biases[item_idx] += self.learning_rate * (
error - self.regularization * self.item_biases[item_idx]
)
def predict_rating(self, user_idx: int, item_idx: int) -> float:
"""预测评分"""
prediction = (
self.global_mean +
self.user_biases[user_idx] +
self.item_biases[item_idx] +
np.dot(self.user_factors[user_idx], self.item_factors[item_idx])
)
return prediction
def recommend(self, user_id: int, top_n: int = 10):
"""为用户推荐"""
if user_id not in self.user_id_map:
return []
user_idx = self.user_id_map[user_id]
# 计算所有物品的预测评分
scores = []
for item_id, item_idx in self.item_id_map.items():
score = self.predict_rating(user_idx, item_idx)
scores.append((item_id, score))
# 排序并返回Top-N
scores.sort(key=lambda x: x[1], reverse=True)
return [item_id for item_id, score in scores[:top_n]]
3. 深度学习推荐(recommender/deep_recommender.py)
python
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
class WideAndDeepRecommender:
"""Wide & Deep推荐模型"""
def __init__(self, n_users, n_items, embedding_dim=32):
self.n_users = n_users
self.n_items = n_items
self.embedding_dim = embedding_dim
self.model = self._build_model()
def _build_model(self):
"""构建模型"""
# Wide部分输入
user_id_input = keras.Input(shape=(1,), name='user_id')
item_id_input = keras.Input(shape=(1,), name='item_id')
user_features = keras.Input(shape=(10,), name='user_features')
item_features = keras.Input(shape=(10,), name='item_features')
# Embedding层
user_embedding = layers.Embedding(
self.n_users,
self.embedding_dim,
name='user_embedding'
)(user_id_input)
user_embedding = layers.Flatten()(user_embedding)
item_embedding = layers.Embedding(
self.n_items,
self.embedding_dim,
name='item_embedding'
)(item_id_input)
item_embedding = layers.Flatten()(item_embedding)
# Deep部分
deep_concat = layers.Concatenate()([
user_embedding,
item_embedding,
user_features,
item_features
])
deep = layers.Dense(128, activation='relu')(deep_concat)
deep = layers.Dropout(0.3)(deep)
deep = layers.Dense(64, activation='relu')(deep)
deep = layers.Dropout(0.3)(deep)
deep = layers.Dense(32, activation='relu')(deep)
# Wide部分
wide = layers.Concatenate()([user_features, item_features])
# 合并Wide和Deep
combined = layers.Concatenate()([wide, deep])
output = layers.Dense(1, activation='sigmoid', name='output')(combined)
# 构建模型
model = keras.Model(
inputs=[user_id_input, item_id_input, user_features, item_features],
outputs=output
)
model.compile(
optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy', 'AUC']
)
return model
def train(self, train_data, val_data, epochs=10, batch_size=256):
"""训练模型"""
history = self.model.fit(
train_data,
validation_data=val_data,
epochs=epochs,
batch_size=batch_size,
callbacks=[
keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True),
keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=2)
]
)
return history
def predict(self, user_ids, item_ids, user_features, item_features):
"""预测"""
return self.model.predict([user_ids, item_ids, user_features, item_features])
4. 实时推荐服务(recommender/realtime_recommender.py)
python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import List
import redis
import json
import pickle
import os
app = FastAPI(title="Recommendation API")
# 从环境变量获取Redis配置
REDIS_HOST = os.getenv('REDIS_HOST', 'localhost')
REDIS_PORT = int(os.getenv('REDIS_PORT', 6379))
REDIS_PASSWORD = os.getenv('REDIS_PASSWORD')
# Redis连接(生产环境应配置密码和连接池)
redis_client = redis.Redis(
host=REDIS_HOST,
port=REDIS_PORT,
password=REDIS_PASSWORD,
decode_responses=False,
max_connections=50 # 连接池最大连接数
)
class RecommendRequest(BaseModel):
user_id: int
top_n: int = 10
context: dict = {}
class RecommendResponse(BaseModel):
user_id: int
recommendations: List[dict]
algorithm: str
# 加载离线训练的模型
cf_model = pickle.loads(redis_client.get('cf_model'))
mf_model = pickle.loads(redis_client.get('mf_model'))
@app.post("/recommend", response_model=RecommendResponse)
async def get_recommendations(request: RecommendRequest):
"""获取推荐"""
user_id = request.user_id
top_n = request.top_n
# 检查缓存
cache_key = f"recommendations:{user_id}:{top_n}"
cached = redis_client.get(cache_key)
if cached:
return json.loads(cached)
# 获取用户画像
user_profile = get_user_profile(user_id)
# 混合推荐策略
cf_recs = cf_model.recommend_for_user(user_id, top_n * 2)
mf_recs = mf_model.recommend(user_id, top_n * 2)
# 融合推荐结果
recommendations = merge_recommendations(
cf_recs,
mf_recs,
user_profile,
top_n
)
# 构建响应
response = {
"user_id": user_id,
"recommendations": recommendations,
"algorithm": "hybrid"
}
# 缓存结果(5分钟)
redis_client.setex(cache_key, 300, json.dumps(response))
return response
def get_user_profile(user_id: int):
"""获取用户画像"""
profile_key = f"user_profile:{user_id}"
profile = redis_client.hgetall(profile_key)
return profile
def merge_recommendations(cf_recs, mf_recs, user_profile, top_n):
"""融合多个推荐结果"""
# 计算权重
cf_weight = 0.4
mf_weight = 0.6
# 合并分数
scores = {}
for item_id in set(cf_recs + mf_recs):
score = 0
if item_id in cf_recs:
score += cf_weight * (1 - cf_recs.index(item_id) / len(cf_recs))
if item_id in mf_recs:
score += mf_weight * (1 - mf_recs.index(item_id) / len(mf_recs))
scores[item_id] = score
# 排序并返回Top-N
sorted_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
# 获取物品信息
recommendations = []
for item_id, score in sorted_items:
item_info = get_item_info(item_id)
recommendations.append({
"item_id": item_id,
"score": score,
**item_info
})
return recommendations
def get_item_info(item_id: int):
"""获取物品信息"""
item_key = f"item:{item_id}"
item_data = redis_client.hgetall(item_key)
return {k.decode(): v.decode() for k, v in item_data.items()}
@app.get("/similar_items/{item_id}")
async def get_similar_items(item_id: int, top_n: int = 10):
"""获取相似物品"""
similar = cf_model.similar_items(item_id, top_n)
items = [get_item_info(iid) for iid in similar]
return {"item_id": item_id, "similar_items": items}
学习要点
- 协同过滤算法(用户CF、物品CF)
- 矩阵分解(SVD、ALS)
- 深度学习推荐(Wide & Deep、DIN)
- 特征工程
- 实时推荐架构
- A/B测试
- 推荐系统评估指标(Precision@K/Recall@K 的最小示意见下方)
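其中离线评估常用的 Precision@K 与 Recall@K,一个最小实现示意如下:
python
from typing import List

def precision_at_k(recommended: List[int], relevant: List[int], k: int) -> float:
    """Precision@K:推荐列表前K个中,相关物品所占比例"""
    hits = len(set(recommended[:k]) & set(relevant))
    return hits / k if k else 0.0

def recall_at_k(recommended: List[int], relevant: List[int], k: int) -> float:
    """Recall@K:相关物品中,被前K个推荐覆盖的比例"""
    hits = len(set(recommended[:k]) & set(relevant))
    return hits / len(relevant) if relevant else 0.0

# 使用示例
recs = [101, 205, 33, 47, 88]
truth = [205, 47, 999]
print(precision_at_k(recs, truth, k=5))  # 2/5 = 0.4
print(recall_at_k(recs, truth, k=5))     # 2/3 ≈ 0.667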
项目6:DevOps自动化平台
项目简介
构建DevOps自动化平台,实现CI/CD流水线、自动化部署、环境管理和监控告警等功能。
核心实现
GitLab CI/CD配置(.gitlab-ci.yml)
yaml
stages:
- test
- build
- deploy
variables:
DOCKER_IMAGE: $CI_REGISTRY_IMAGE:$CI_COMMIT_REF_SLUG
KUBECONFIG: /etc/deploy/config
# 测试阶段
test:
stage: test
image: python:3.11
script:
- pip install -r requirements.txt
- pytest tests/ -v --cov=app --cov-report=xml
- pylint app/
coverage: '/TOTAL.*\s+(\d+%)$/'
artifacts:
reports:
coverage_report:
coverage_format: cobertura
path: coverage.xml
# 构建Docker镜像
build:
stage: build
image: docker:latest
services:
- docker:dind
script:
- docker login -u $CI_REGISTRY_USER -p $CI_REGISTRY_PASSWORD $CI_REGISTRY
- docker build -t $DOCKER_IMAGE .
- docker push $DOCKER_IMAGE
only:
- main
- develop
# 部署到K8s
deploy_production:
stage: deploy
image: bitnami/kubectl:latest
script:
- kubectl set image deployment/app app=$DOCKER_IMAGE
- kubectl rollout status deployment/app
environment:
name: production
url: https://app.example.com
only:
- main
when: manual
自动化部署脚本(deploy/deploy.py)
python
import subprocess
import yaml
from pathlib import Path
class KubernetesDeployer:
"""Kubernetes自动化部署"""
def __init__(self, kubeconfig_path: str):
self.kubeconfig = kubeconfig_path
def deploy(self, manifest_path: str, namespace: str = "default"):
"""部署应用"""
cmd = [
"kubectl",
f"--kubeconfig={self.kubeconfig}",
"apply",
"-f", manifest_path,
"-n", namespace
]
result = subprocess.run(cmd, capture_output=True, text=True)
if result.returncode != 0:
raise Exception(f"部署失败: {result.stderr}")
return result.stdout
def rollback(self, deployment_name: str, namespace: str = "default"):
"""回滚部署"""
cmd = [
"kubectl",
f"--kubeconfig={self.kubeconfig}",
"rollout",
"undo",
f"deployment/{deployment_name}",
"-n", namespace
]
subprocess.run(cmd, check=True)
def scale(self, deployment_name: str, replicas: int, namespace: str = "default"):
"""扩缩容"""
cmd = [
"kubectl",
f"--kubeconfig={self.kubeconfig}",
"scale",
f"deployment/{deployment_name}",
f"--replicas={replicas}",
"-n", namespace
]
subprocess.run(cmd, check=True)
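使用示意如下(kubeconfig路径、清单文件与Deployment名称均为假设值):
python
deployer = KubernetesDeployer(kubeconfig_path="/etc/deploy/config")

# 应用清单并扩容;出现问题时可回滚到上一个版本
deployer.deploy("k8s/deployment.yaml", namespace="production")
deployer.scale("order-service", replicas=5, namespace="production")
# deployer.rollback("order-service", namespace="production")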
项目9:容器化Python应用(Docker + K8s)
项目简介
将Python应用容器化并部署到Kubernetes集群,实现高可用和自动扩缩容。
技术栈
- 容器化:Docker, Docker Compose
- 编排:Kubernetes, Helm
- CI/CD:GitLab CI, GitHub Actions
- 监控:Prometheus, Grafana
- 日志:ELK Stack
核心内容
dockerfile
# Dockerfile
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
CMD ["gunicorn", "--bind", "0.0.0.0:8000", "app:app"]
yaml
# kubernetes/deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: python-app
spec:
replicas: 3
selector:
matchLabels:
app: python-app
template:
metadata:
labels:
app: python-app
spec:
containers:
- name: python-app
image: your-registry/python-app:latest
ports:
- containerPort: 8000
resources:
requests:
memory: "256Mi"
cpu: "250m"
limits:
memory: "512Mi"
cpu: "500m"
项目10:GraphQL API服务
项目简介
使用Graphene构建GraphQL API服务,提供灵活的数据查询能力。
技术栈
- 框架:Graphene-Python
- Web框架:Flask / FastAPI
- ORM:SQLAlchemy
- 认证:JWT
- 订阅:WebSocket
核心代码
python
import graphene
from graphene_sqlalchemy import SQLAlchemyObjectType
from models import db, User as UserModel, Post as PostModel  # 假设models模块同时导出Flask-SQLAlchemy的db实例
class User(SQLAlchemyObjectType):
class Meta:
model = UserModel
class Post(SQLAlchemyObjectType):
class Meta:
model = PostModel
class Query(graphene.ObjectType):
users = graphene.List(User)
user = graphene.Field(User, id=graphene.Int())
posts = graphene.List(Post)
def resolve_users(self, info):
return UserModel.query.all()
def resolve_user(self, info, id):
return UserModel.query.get(id)
def resolve_posts(self, info):
return PostModel.query.all()
class CreateUser(graphene.Mutation):
class Arguments:
username = graphene.String(required=True)
email = graphene.String(required=True)
user = graphene.Field(User)
def mutate(self, info, username, email):
user = UserModel(username=username, email=email)
db.session.add(user)
db.session.commit()
return CreateUser(user=user)
class Mutation(graphene.ObjectType):
create_user = CreateUser.Field()
schema = graphene.Schema(query=Query, mutation=Mutation)
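查询与变更的调用示意如下(直接使用 schema.execute 演示,需要已配置好的SQLAlchemy会话;若要挂载为HTTP接口,通常配合 flask-graphql 的 GraphQLView,此处不展开):
python
# 查询全部用户
result = schema.execute('{ users { username email } }')
print(result.errors or result.data)

# 变更:创建用户(graphene默认将字段名转为camelCase)
mutation = '''
mutation {
  createUser(username: "alice", email: "alice@example.com") {
    user { username }
  }
}
'''
print(schema.execute(mutation).data)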
参考资料
微服务架构
- 微服务设计模式:https://microservices.io/patterns/index.html
- Martin Fowler微服务:https://martinfowler.com/microservices/
- 微服务实践指南:https://www.nginx.com/blog/introduction-to-microservices/
- Service Mesh(Istio):https://istio.io/latest/docs/
- 服务注册与发现(Netflix Eureka):https://github.com/Netflix/eureka
容器与编排
- Docker官方文档:https://docs.docker.com/
- Kubernetes官方文档:https://kubernetes.io/docs/
- Helm包管理:https://helm.sh/docs/
- Docker Compose:https://docs.docker.com/compose/
- Podman容器工具:https://podman.io/
- K3s轻量级K8s:https://k3s.io/
分布式系统
- 分布式系统模式:https://martinfowler.com/articles/patterns-of-distributed-systems/
- CAP定理:https://en.wikipedia.org/wiki/CAP_theorem
- 一致性协议:https://raft.github.io/
- 分布式事务:https://www.atomikos.com/Documentation/DistributedTransactions
- Saga模式:https://microservices.io/patterns/data/saga.html
消息队列
- RabbitMQ文档:https://www.rabbitmq.com/documentation.html
- Apache Kafka:https://kafka.apache.org/documentation/
- Redis Streams:https://redis.io/topics/streams-intro
- NATS消息系统:https://docs.nats.io/
- Pulsar消息平台:https://pulsar.apache.org/docs/
数据库进阶
- PostgreSQL文档:https://www.postgresql.org/docs/
- MongoDB文档:https://docs.mongodb.com/
- ElasticSearch:https://www.elastic.co/guide/
- Redis文档:https://redis.io/documentation
- 数据库分库分表:https://shardingsphere.apache.org/
- 数据库设计模式:https://www.amazon.com/Database-Design-Mere-Mortals-Hands/dp/0321884493
机器学习与AI
- scikit-learn文档:https://scikit-learn.org/stable/
- TensorFlow文档:https://www.tensorflow.org/api_docs/python
- PyTorch文档:https://pytorch.org/docs/stable/index.html
- Keras文档:https://keras.io/api/
- Hugging Face Transformers:https://huggingface.co/docs/transformers/
- MLflow实验管理:https://mlflow.org/docs/latest/index.html
- DVC数据版本控制:https://dvc.org/doc
实时数据处理
- Apache Spark:https://spark.apache.org/docs/latest/
- Apache Flink:https://flink.apache.org/
- Kafka Streams:https://kafka.apache.org/documentation/streams/
- Dask并行计算:https://docs.dask.org/
- Ray分布式计算:https://docs.ray.io/
性能优化
- Python性能分析:https://docs.python.org/3/library/profile.html
- cProfile使用:https://docs.python.org/3/library/profile.html#module-cProfile
- line_profiler:https://github.com/pyutils/line_profiler
- memory_profiler:https://pypi.org/project/memory-profiler/
- py-spy采样分析器:https://github.com/benfred/py-spy
- Python并发最佳实践:https://realpython.com/python-concurrency/
安全与认证
- OAuth 2.0:https://oauth.net/2/
- JWT最佳实践:https://tools.ietf.org/html/rfc8725
- OWASP Python安全:https://owasp.org/www-project-python-security/
- Python加密:https://cryptography.io/en/latest/
- 安全编码指南:https://wiki.python.org/moin/SecureCoding
测试进阶
- pytest-django:https://pytest-django.readthedocs.io/
- pytest-asyncio:https://pytest-asyncio.readthedocs.io/
- Hypothesis属性测试:https://hypothesis.readthedocs.io/
- Robot Framework:https://robotframework.org/
- Locust性能测试:https://docs.locust.io/
- Selenium Grid:https://www.selenium.dev/documentation/grid/
CI/CD
- GitHub Actions:https://docs.github.com/en/actions
- GitLab CI/CD:https://docs.gitlab.com/ee/ci/
- Jenkins:https://www.jenkins.io/doc/
- Travis CI:https://docs.travis-ci.com/
- CircleCI:https://circleci.com/docs/
- Azure DevOps:https://docs.microsoft.com/en-us/azure/devops/
监控与日志
- Prometheus监控:https://prometheus.io/docs/
- Grafana可视化:https://grafana.com/docs/
- ELK Stack:https://www.elastic.co/what-is/elk-stack
- Jaeger链路追踪:https://www.jaegertracing.io/docs/
- Sentry错误追踪:https://docs.sentry.io/
- DataDog APM:https://docs.datadoghq.com/
云平台
- AWS Python SDK(Boto3):https://boto3.amazonaws.com/v1/documentation/api/latest/index.html
- Google Cloud Python:https://cloud.google.com/python/docs
- Azure Python SDK:https://docs.microsoft.com/en-us/azure/developer/python/
- Terraform:https://www.terraform.io/docs/
- Serverless Framework:https://www.serverless.com/framework/docs/
架构设计
- 《系统设计面试》
- 《设计数据密集型应用》
- 《微服务架构设计模式》
- 《分布式系统原理与范型》
- 《高性能MySQL》(第3版)
- 《Redis设计与实现》
推荐书籍
- 《Python高性能编程》(第2版)
- 《流畅的Python》(第2版)
- 《Effective Python》(第2版)
- 《Python Cookbook》(第3版)
- 《Architecture Patterns with Python》
- 《Test-Driven Development with Python》
- 《两个披萨团队的DevOps实践》
开源项目学习
- Django源码:https://github.com/django/django
- Flask源码:https://github.com/pallets/flask
- Requests源码:https://github.com/psf/requests
- Celery源码:https://github.com/celery/celery
- awesome-python:https://github.com/vinta/awesome-python
- python-patterns设计模式:https://github.com/faif/python-patterns
技术博客
- Real Python:https://realpython.com/
- Full Stack Python:https://www.fullstackpython.com/
- Python Insider:https://blog.python.org/
- Planet Python:https://planetpython.org/
- PyBites:https://pybit.es/
会议与社区
- PyCon:https://pycon.org/
- EuroPython:https://europython.eu/
- Python中国开发者大会
- Python Weekly:https://www.pythonweekly.com/
- Pycoder's Weekly:https://pycoders.com/
常见问题与最佳实践
1. 环境配置管理
❌ 错误做法
python
# 硬编码敏感信息
SECRET_KEY = "my-secret-key"
redis_client = redis.Redis(host='redis', port=6379)
db_connection = "postgresql://user:password@localhost/db"
✅ 正确做法
python
import os
from dotenv import load_dotenv
# 加载.env文件
load_dotenv()
# 从环境变量读取
SECRET_KEY = os.getenv('SECRET_KEY')
if not SECRET_KEY:
raise ValueError("SECRET_KEY环境变量未设置")
# 配置Redis连接
redis_client = redis.Redis(
host=os.getenv('REDIS_HOST', 'localhost'),
port=int(os.getenv('REDIS_PORT', 6379)),
password=os.getenv('REDIS_PASSWORD'),
decode_responses=True
)
2. 数据库连接池
❌ 错误做法
python
# 每次查询都创建新连接
def get_user(user_id):
conn = psycopg2.connect(DATABASE_URL)
cursor = conn.cursor()
cursor.execute("SELECT * FROM users WHERE id = %s", (user_id,))
result = cursor.fetchone()
conn.close()
return result
✅ 正确做法
python
from sqlalchemy import create_engine, text
from sqlalchemy.pool import QueuePool
# 创建连接池
engine = create_engine(
DATABASE_URL,
poolclass=QueuePool,
pool_size=10,
max_overflow=20,
pool_pre_ping=True # 自动检测断开的连接
)
def get_user(user_id):
with engine.connect() as conn:
result = conn.execute(
"SELECT * FROM users WHERE id = :id",
{"id": user_id}
)
return result.fetchone()
3. 异步错误处理
❌ 错误做法
python
async def send_message(message):
# 没有错误处理和超时控制
await channel.publish(message)
✅ 正确做法
python
import asyncio
from tenacity import retry, stop_after_attempt, wait_exponential
@retry(
stop=stop_after_attempt(3),
wait=wait_exponential(multiplier=1, min=1, max=10)
)
async def send_message(message):
"""发送消息,带重试和超时控制"""
try:
        async with asyncio.timeout(10.0):  # 10秒超时(asyncio.timeout 需要Python 3.11+)
await channel.publish(message)
except asyncio.TimeoutError:
logger.error("消息发送超时")
raise
except Exception as e:
logger.error(f"消息发送失败: {e}")
raise
4. 分布式锁实现
❌ 错误做法
python
# 简单的Redis锁,存在死锁风险
def acquire_lock(key):
return redis_client.setnx(key, 1)
def release_lock(key):
redis_client.delete(key)
✅ 正确做法
python
import uuid
from contextlib import contextmanager
@contextmanager
def distributed_lock(key, timeout=10):
"""分布式锁上下文管理器"""
lock_id = str(uuid.uuid4())
try:
# 获取锁,设置过期时间
acquired = redis_client.set(
key,
lock_id,
nx=True,
ex=timeout
)
if not acquired:
raise Exception("获取锁失败")
yield
finally:
# 使用Lua脚本确保只删除自己的锁
lua_script = """
if redis.call("get", KEYS[1]) == ARGV[1] then
return redis.call("del", KEYS[1])
else
return 0
end
"""
redis_client.eval(lua_script, 1, key, lock_id)
# 使用示例
with distributed_lock("order:123"):
# 执行需要加锁的操作
process_order(order_id=123)
5. API限流实现
令牌桶算法
python
from datetime import datetime
import time
class TokenBucket:
"""令牌桶限流器"""
def __init__(self, rate: float, capacity: int):
"""
:param rate: 令牌生成速率(个/秒)
:param capacity: 桶容量
"""
self.rate = rate
self.capacity = capacity
self.tokens = capacity
self.last_update = time.time()
def consume(self, tokens: int = 1) -> bool:
"""消费令牌"""
now = time.time()
# 补充令牌
elapsed = now - self.last_update
self.tokens = min(
self.capacity,
self.tokens + elapsed * self.rate
)
self.last_update = now
# 检查是否有足够令牌
if self.tokens >= tokens:
self.tokens -= tokens
return True
return False
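使用示意:速率5个/秒、容量10,突发流量最多放行10个请求,之后按速率放行:
python
import time

bucket = TokenBucket(rate=5, capacity=10)

for i in range(15):
    allowed = bucket.consume()
    print(f"request {i}: {'allowed' if allowed else 'rejected'}")
    time.sleep(0.05)  # 模拟请求间隔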
6. 缓存策略
多级缓存
python
from functools import lru_cache
import pickle
class CacheManager:
"""多级缓存管理器"""
def __init__(self, redis_client):
self.redis = redis_client
@lru_cache(maxsize=100) # L1: 本地内存缓存
def get_from_local_cache(self, key):
"""本地缓存(进程内)"""
return self._get_data(key)
def get(self, key):
"""获取数据:本地缓存 -> Redis -> 数据库"""
# L1: 本地缓存
try:
return self.get_from_local_cache(key)
        except Exception:
pass
# L2: Redis缓存
cached = self.redis.get(key)
if cached:
return pickle.loads(cached)
# L3: 数据库
data = self._fetch_from_db(key)
if data:
# 写入Redis缓存
self.redis.setex(key, 3600, pickle.dumps(data))
return data
def _fetch_from_db(self, key):
"""从数据库获取数据"""
# 实际的数据库查询逻辑
pass
7. 日志配置
结构化日志
python
import logging
import json
from datetime import datetime
class StructuredLogger:
"""结构化日志"""
def __init__(self, name):
self.logger = logging.getLogger(name)
handler = logging.StreamHandler()
handler.setFormatter(self.JSONFormatter())
self.logger.addHandler(handler)
self.logger.setLevel(logging.INFO)
class JSONFormatter(logging.Formatter):
def format(self, record):
log_data = {
'timestamp': datetime.utcnow().isoformat(),
'level': record.levelname,
'message': record.getMessage(),
'module': record.module,
'function': record.funcName,
'line': record.lineno
}
if hasattr(record, 'user_id'):
log_data['user_id'] = record.user_id
if hasattr(record, 'trace_id'):
log_data['trace_id'] = record.trace_id
return json.dumps(log_data)
def info(self, message, **kwargs):
extra = {'user_id': kwargs.get('user_id'), 'trace_id': kwargs.get('trace_id')}
self.logger.info(message, extra=extra)
8. 测试最佳实践
集成测试
python
import pytest
from fastapi.testclient import TestClient
@pytest.fixture
def test_client():
"""测试客户端"""
from app.main import app
return TestClient(app)
@pytest.fixture
def mock_redis(monkeypatch):
"""Mock Redis客户端"""
class MockRedis:
def __init__(self):
self.data = {}
def get(self, key):
return self.data.get(key)
def set(self, key, value):
self.data[key] = value
mock = MockRedis()
monkeypatch.setattr('app.redis_client', mock)
return mock
def test_api_with_cache(test_client, mock_redis):
"""测试API缓存"""
# 第一次请求,缓存未命中
response = test_client.get("/api/user/123")
assert response.status_code == 200
# 第二次请求,缓存命中
response = test_client.get("/api/user/123")
assert response.status_code == 200
assert mock_redis.get("user:123") is not None