
Table of Contents
- Introduction
- Project Overview
- Phase 1: Requirements Analysis and Planning
  - 1.1 Business Requirements Analysis
  - 1.2 Technical Feasibility Analysis
  - 1.3 Project Planning and Timeline
- Phase 2: Data Collection and Preprocessing
  - 2.1 Data Collection
  - 2.2 Data Cleaning and Preprocessing
  - 2.3 Data Exploration and Analysis
- Phase 3: Model Development and Training
  - 3.1 Feature Engineering
  - 3.2 Model Building and Training
  - 3.3 Model Optimization and Hyperparameter Tuning
- Phase 4: Model Evaluation and Validation
  - 4.1 Comprehensive Model Evaluation
- Phase 5: Deployment and Launch
  - 5.1 Building the Model Serving API
  - 5.2 Docker Containerized Deployment
  - 5.3 Monitoring and Logging Configuration
- Phase 6: Maintenance and Iteration
  - 6.1 Model Performance Monitoring
  - 6.2 Automated Retraining Pipeline
- Complete Project Flowchart
- Summary
Author: 北辰alk
Introduction
In today's era of digital transformation, AI projects have become key to strengthening enterprise competitiveness. A successful AI project, however, requires rigorous process design and systematic development and deployment. Using a real-world e-commerce review sentiment analysis system as a case study, this article walks through the complete AI project lifecycle from requirements analysis to production deployment, with detailed code implementations and flowcharts.
Project Overview
Project name: E-commerce Review Sentiment Analysis System
Project goal: Build an AI system that automatically analyzes the sentiment of user reviews, helping the e-commerce platform quickly understand user feedback and improve service quality.
Phase 1: Requirements Analysis and Planning
1.1 Business Requirements Analysis
First, we need to clarify the project's business goals and value:
python
# Example business requirements document
business_requirements = {
    "project_name": "E-commerce Review Sentiment Analysis System",
    "business_goals": [
        "Automatically classify review sentiment (positive/negative/neutral)",
        "Monitor product reputation changes in real time",
        "Identify potential product quality issues",
        "Improve customer service response efficiency"
    ],
    "key_metrics": [
        "Sentiment classification accuracy > 90%",
        "System response time < 2 seconds",
        "Daily review throughput > 100,000",
        "User satisfaction improved by 15%"
    ],
    "target_users": [
        "Product managers",
        "Customer service team",
        "Marketing team",
        "Platform operations managers"
    ]
}
1.2 Technical Feasibility Analysis
python
# Technical assessment matrix
technical_assessment = {
    "data_processing": {
        "approach": "Pandas + NumPy for data cleaning and preprocessing",
        "feasibility": "High",
        "risk": "Data quality may limit model performance"
    },
    "model_selection": {
        "approach": "Pretrained BERT model + fine-tuning",
        "feasibility": "Medium-high",
        "risk": "High compute requirements; a GPU may be needed"
    },
    "deployment": {
        "approach": "Flask REST API + Docker containerization",
        "feasibility": "High",
        "risk": "High-concurrency scenarios require load balancing"
    },
    "monitoring_maintenance": {
        "approach": "Prometheus + Grafana monitoring",
        "feasibility": "Medium",
        "risk": "Requires dedicated ops expertise"
    }
}
1.3 Project Planning and Timeline
The project timeline (originally a Gantt chart spanning 2024-01-07 to 2024-03-24, in weekly increments) covers four phases: requirements analysis (business requirements research, technical solution design), data preparation (data collection, data cleaning and labeling), model development (feature engineering, model training and tuning), and deployment (API development, testing and validation, production deployment).
Phase 2: Data Collection and Preprocessing
2.1 Data Collection
python
import pandas as pd
import requests
from typing import List, Dict

class DataCollector:
    def __init__(self, api_key: str):
        self.api_key = api_key

    def collect_from_api(self, product_ids: List[str]) -> pd.DataFrame:
        """
        Collect review data from the e-commerce platform API.
        """
        base_url = "https://api.ecommerce.com/v1/products"
        all_comments = []
        for product_id in product_ids:
            url = f"{base_url}/{product_id}/reviews"
            headers = {"Authorization": f"Bearer {self.api_key}"}
            try:
                response = requests.get(url, headers=headers, timeout=10)
                if response.status_code == 200:
                    reviews = response.json()['reviews']
                    for review in reviews:
                        comment_data = {
                            'product_id': product_id,
                            'comment_id': review['id'],
                            'comment_text': review['content'],
                            'rating': review['rating'],
                            'create_time': review['create_time'],
                            'user_id': review['user_id']
                        }
                        all_comments.append(comment_data)
            except Exception as e:
                print(f"Error collecting data for product {product_id}: {e}")
        return pd.DataFrame(all_comments)

    def collect_from_database(self, query: str) -> pd.DataFrame:
        """
        Collect historical review data from a database.
        """
        import sqlalchemy
        # Note: the MySQL dialect needs an explicit driver, e.g. PyMySQL
        engine = sqlalchemy.create_engine('mysql+pymysql://user:password@localhost/db')
        df = pd.read_sql(query, engine)
        return df

# Usage example
collector = DataCollector("your_api_key")
product_ids = ["p001", "p002", "p003"]
comments_df = collector.collect_from_api(product_ids)
print(f"Collected {len(comments_df)} reviews")
2.2 Data Cleaning and Preprocessing
python
import re
import jieba
import pandas as pd
from sklearn.model_selection import train_test_split

class DataPreprocessor:
    def __init__(self):
        self.stop_words = self.load_stopwords()

    def load_stopwords(self) -> set:
        """Load the stopword list."""
        stopwords = set()
        with open('chinese_stopwords.txt', 'r', encoding='utf-8') as f:
            for line in f:
                stopwords.add(line.strip())
        return stopwords

    def clean_text(self, text: str) -> str:
        """Clean raw text."""
        if pd.isna(text):
            return ""
        # Remove special characters and punctuation
        text = re.sub(r'[^\w\s]', '', text)
        # Remove digits
        text = re.sub(r'\d+', '', text)
        # Collapse extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def segment_text(self, text: str) -> str:
        """Segment Chinese text into words."""
        words = jieba.cut(text)
        # Remove stopwords and single-character tokens
        words = [word for word in words if word not in self.stop_words and len(word) > 1]
        return ' '.join(words)

    def create_labels(self, rating: int) -> str:
        """Derive a sentiment label from the star rating."""
        if rating >= 4:
            return "positive"
        elif rating == 3:
            return "neutral"
        else:
            return "negative"

    def preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Complete preprocessing pipeline."""
        print("Starting data preprocessing...")
        # Basic cleaning
        df = df.dropna(subset=['comment_text', 'rating'])
        df = df[df['comment_text'].str.len() > 5]  # Drop overly short reviews
        # Text cleaning and segmentation
        df['cleaned_text'] = df['comment_text'].apply(self.clean_text)
        df['segmented_text'] = df['cleaned_text'].apply(self.segment_text)
        # Create labels
        df['sentiment'] = df['rating'].apply(self.create_labels)
        print(f"Preprocessing done; {len(df)} valid rows remain")
        print("Sentiment distribution:")
        print(df['sentiment'].value_counts())
        return df

# Usage example
preprocessor = DataPreprocessor()
processed_df = preprocessor.preprocess_data(comments_df)

# Split into training and test sets
train_df, test_df = train_test_split(
    processed_df,
    test_size=0.2,
    random_state=42,
    stratify=processed_df['sentiment']
)
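Rating-derived labels are usually skewed toward positive reviews. A minimal sketch (my addition, not part of the original pipeline) that computes balanced class weights from the training split; these can later be passed to a weighted loss such as `nn.CrossEntropyLoss(weight=...)` in the training phase:
python
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Compute balanced class weights from the (string) training labels
classes = np.array(['negative', 'neutral', 'positive'])
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=classes,
    y=train_df['sentiment'].values
)
print(dict(zip(classes, np.round(class_weights, 3))))
# e.g. pass torch.FloatTensor(class_weights) to nn.CrossEntropyLoss(weight=...)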
2.3 Data Exploration and Analysis
python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

class DataAnalyzer:
    def __init__(self, df: pd.DataFrame):
        self.df = df

    def plot_sentiment_distribution(self):
        """Plot the sentiment distribution."""
        plt.figure(figsize=(10, 6))
        sentiment_counts = self.df['sentiment'].value_counts()
        plt.subplot(1, 2, 1)
        plt.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%')
        plt.title('Sentiment Distribution')
        plt.subplot(1, 2, 2)
        sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)
        plt.title('Sentiment Count')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.show()

    def generate_wordcloud(self, sentiment: str):
        """Generate a word cloud for one sentiment class."""
        text = ' '.join(self.df[self.df['sentiment'] == sentiment]['segmented_text'])
        wordcloud = WordCloud(
            font_path='simhei.ttf',  # a Chinese font is required for CJK text
            width=800,
            height=600,
            background_color='white'
        ).generate(text)
        plt.figure(figsize=(10, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Word Cloud for {sentiment} Reviews')
        plt.show()

    def analyze_text_length(self):
        """Analyze the text length distribution."""
        self.df['text_length'] = self.df['segmented_text'].apply(lambda x: len(x.split()))
        plt.figure(figsize=(12, 6))
        plt.subplot(1, 2, 1)
        self.df['text_length'].hist(bins=50)
        plt.title('Text Length Distribution')
        plt.xlabel('Number of Words')
        plt.ylabel('Frequency')
        plt.subplot(1, 2, 2)
        sns.boxplot(x='sentiment', y='text_length', data=self.df)
        plt.title('Text Length by Sentiment')
        plt.tight_layout()
        plt.show()

# Usage example
analyzer = DataAnalyzer(processed_df)
analyzer.plot_sentiment_distribution()
analyzer.generate_wordcloud('positive')
analyzer.analyze_text_length()
Phase 3: Model Development and Training
3.1 Feature Engineering
python
import torch
import numpy as np
from typing import List
from transformers import BertTokenizer, BertModel
from sklearn.feature_extraction.text import TfidfVectorizer

class FeatureEngineer:
    def __init__(self, model_name: str = 'bert-base-chinese'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        # Load the BERT encoder once instead of on every call
        self.bert_model = BertModel.from_pretrained(model_name)
        self.bert_model.eval()
        self.tfidf_vectorizer = None

    def extract_bert_features(self, texts: List[str], max_length: int = 128) -> np.ndarray:
        """Extract text features with BERT."""
        print("Extracting BERT features...")
        all_features = []
        with torch.no_grad():
            for text in texts:
                # Encode the text
                inputs = self.tokenizer(
                    text,
                    return_tensors='pt',
                    max_length=max_length,
                    padding='max_length',
                    truncation=True
                )
                # Run the BERT encoder
                outputs = self.bert_model(**inputs)
                # Use the [CLS] token embedding as the sentence representation
                cls_embedding = outputs.last_hidden_state[:, 0, :].numpy()
                all_features.append(cls_embedding[0])
        return np.array(all_features)

    def extract_tfidf_features(self, texts: List[str], max_features: int = 5000) -> np.ndarray:
        """Extract TF-IDF features."""
        print("Extracting TF-IDF features...")
        if self.tfidf_vectorizer is None:
            # First call fits the vectorizer (training data)
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=max_features,
                ngram_range=(1, 2),
                min_df=2,
                max_df=0.8
            )
            tfidf_features = self.tfidf_vectorizer.fit_transform(texts)
        else:
            # Later calls reuse the fitted vocabulary (test/inference data)
            tfidf_features = self.tfidf_vectorizer.transform(texts)
        return tfidf_features.toarray()

    def create_combined_features(self, texts: List[str]) -> np.ndarray:
        """Create combined BERT + TF-IDF features."""
        bert_features = self.extract_bert_features(texts)
        tfidf_features = self.extract_tfidf_features(texts)
        # Concatenate the two feature sets
        combined_features = np.concatenate([bert_features, tfidf_features], axis=1)
        return combined_features

# Usage example
feature_engineer = FeatureEngineer()
# Extract training-set features
X_train = feature_engineer.create_combined_features(train_df['segmented_text'].tolist())
y_train = train_df['sentiment'].values
# Extract test-set features
X_test = feature_engineer.create_combined_features(test_df['segmented_text'].tolist())
y_test = test_df['sentiment'].values
print(f"Training feature matrix shape: {X_train.shape}")
print(f"Test feature matrix shape: {X_test.shape}")
3.2 Model Building and Training
python
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

class SentimentDataset(Dataset):
    """Custom dataset wrapping feature vectors and string labels."""
    def __init__(self, features, labels, label_map):
        self.features = features
        self.labels = labels
        self.label_map = label_map

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = torch.FloatTensor(self.features[idx])
        label = torch.LongTensor([self.label_map[self.labels[idx]]])
        return feature, label

class SentimentClassifier(nn.Module):
    """Simple feed-forward sentiment classifier."""
    def __init__(self, input_dim, num_classes, hidden_dim=512):
        super(SentimentClassifier, self).__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim // 2, num_classes)
        )

    def forward(self, x):
        return self.classifier(x)

class ModelTrainer:
    def __init__(self, model_type: str = 'neural_network'):
        self.model_type = model_type
        self.model = None
        self.label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
        self.reverse_label_map = {v: k for k, v in self.label_map.items()}

    def train_neural_network(self, X_train, y_train, X_val, y_val, input_dim, num_epochs=50):
        """Train the neural-network model."""
        print("Training neural network...")
        # Datasets and data loaders
        train_dataset = SentimentDataset(X_train, y_train, self.label_map)
        val_dataset = SentimentDataset(X_val, y_val, self.label_map)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
        # Model, loss, optimizer
        self.model = SentimentClassifier(input_dim=input_dim, num_classes=3)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
        # Training loop
        train_losses = []
        val_accuracies = []
        for epoch in range(num_epochs):
            self.model.train()
            epoch_loss = 0
            for features, labels in train_loader:
                optimizer.zero_grad()
                outputs = self.model(features)
                loss = criterion(outputs, labels.squeeze())
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
            # Validation
            val_accuracy = self.evaluate_neural_network(val_loader)
            train_losses.append(epoch_loss / len(train_loader))
            val_accuracies.append(val_accuracy)
            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader):.4f}, Val Accuracy: {val_accuracy:.4f}')
        return train_losses, val_accuracies

    def evaluate_neural_network(self, data_loader):
        """Compute the neural network's accuracy on a data loader."""
        self.model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for features, labels in data_loader:
                outputs = self.model(features)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels.squeeze()).sum().item()
        return correct / total

    def train_random_forest(self, X_train, y_train):
        """Train a random-forest model."""
        print("Training random forest...")
        self.model = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=42
        )
        # Convert string labels to integers
        y_train_num = [self.label_map[label] for label in y_train]
        self.model.fit(X_train, y_train_num)

    def predict(self, X):
        """Predict string sentiment labels."""
        if self.model_type == 'neural_network':
            self.model.eval()
            with torch.no_grad():
                features = torch.FloatTensor(X)
                outputs = self.model(features)
                _, predicted = torch.max(outputs.data, 1)
                return [self.reverse_label_map[label.item()] for label in predicted]
        else:
            predictions = self.model.predict(X)
            return [self.reverse_label_map[label] for label in predictions]

    def evaluate_model(self, X_test, y_test):
        """Evaluate model performance on held-out data."""
        predictions = self.predict(X_test)
        print("Model evaluation results:")
        print(f"Accuracy: {accuracy_score(y_test, predictions):.4f}")
        print("\nDetailed classification report:")
        print(classification_report(y_test, predictions))
        return predictions

# Usage example
# Carve a validation set out of the training data
from sklearn.model_selection import train_test_split
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)
# Train the neural-network model
nn_trainer = ModelTrainer('neural_network')
train_losses, val_accuracies = nn_trainer.train_neural_network(
    X_train_split, y_train_split, X_val_split, y_val_split,
    input_dim=X_train.shape[1], num_epochs=50
)
# Evaluate the model
nn_predictions = nn_trainer.evaluate_model(X_test, y_test)
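The loop above always runs the full 50 epochs. A small standalone early-stopping variant (a sketch, not part of the original trainer; the patience value is an assumption) keeps the best model by validation accuracy and stops once it plateaus:
python
import copy

def train_with_early_stopping(X_tr, y_tr, X_val, y_val, input_dim,
                              max_epochs=100, patience=5):
    """Training loop with early stopping on validation accuracy."""
    label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
    train_loader = DataLoader(SentimentDataset(X_tr, y_tr, label_map),
                              batch_size=32, shuffle=True)
    val_loader = DataLoader(SentimentDataset(X_val, y_val, label_map),
                            batch_size=32, shuffle=False)
    model = SentimentClassifier(input_dim=input_dim, num_classes=3)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    best_acc, best_state, stale = 0.0, None, 0
    for epoch in range(max_epochs):
        model.train()
        for features, labels in train_loader:
            optimizer.zero_grad()
            loss = criterion(model(features), labels.squeeze())
            loss.backward()
            optimizer.step()
        # Validation accuracy for this epoch
        model.eval()
        correct = total = 0
        with torch.no_grad():
            for features, labels in val_loader:
                predicted = torch.argmax(model(features), dim=1)
                correct += (predicted == labels.squeeze()).sum().item()
                total += labels.size(0)
        acc = correct / total
        if acc > best_acc:
            best_acc, best_state, stale = acc, copy.deepcopy(model.state_dict()), 0
        else:
            stale += 1
            if stale >= patience:
                print(f"Early stopping at epoch {epoch + 1} (best val acc {best_acc:.4f})")
                break
    model.load_state_dict(best_state)
    return model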
3.3 Model Optimization and Hyperparameter Tuning
python
import optuna
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

class HyperparameterOptimizer:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
        self.y_num = [self.label_map[label] for label in self.y]

    def objective_rf(self, trial):
        """Optuna objective for random-forest hyperparameters."""
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 5, 50)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )
        score = cross_val_score(model, self.X, self.y_num, cv=5, scoring='accuracy')
        return score.mean()

    def optimize_random_forest(self, n_trials=100):
        """Search for the best random-forest hyperparameters."""
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective_rf, n_trials=n_trials)
        print("Best hyperparameters:")
        for key, value in study.best_params.items():
            print(f"{key}: {value}")
        print(f"Best accuracy: {study.best_value:.4f}")
        return study.best_params

# Usage example
optimizer = HyperparameterOptimizer(X_train, y_train)
best_params = optimizer.optimize_random_forest(n_trials=50)

# Train the final model with the best parameters
best_rf_model = RandomForestClassifier(**best_params, random_state=42)
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
y_train_num = [label_map[label] for label in y_train]
best_rf_model.fit(X_train, y_train_num)
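The deployment code in Phase 5 loads `preprocessor.pkl`, `feature_engineer.pkl`, and `sentiment_model.pth`, which the article never shows being written. A minimal persistence sketch (file names match what Phase 5 expects; saving the whole model object with `torch.save` mirrors the `torch.load` call there, and pickling the feature engineer includes the BERT weights, so the file will be large):
python
import pickle
import torch

# Persist the preprocessing and feature-engineering components
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)
with open('feature_engineer.pkl', 'wb') as f:
    pickle.dump(feature_engineer, f)

# Persist the trained neural-network model (whole module object)
torch.save(nn_trainer.model, 'sentiment_model.pth')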
Phase 4: Model Evaluation and Validation
4.1 Comprehensive Model Evaluation
python
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.preprocessing import label_binarize

class ModelEvaluator:
    def __init__(self, models, model_names, X_test, y_test):
        self.models = models
        self.model_names = model_names
        self.X_test = X_test
        self.y_test = y_test
        self.label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
        self.reverse_label_map = {v: k for k, v in self.label_map.items()}

    def plot_confusion_matrix(self, predictions, model_name):
        """Plot a confusion matrix."""
        cm = confusion_matrix(self.y_test, predictions)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['negative', 'neutral', 'positive'],
                    yticklabels=['negative', 'neutral', 'positive'])
        plt.title(f'Confusion Matrix - {model_name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.show()

    def plot_learning_curves(self, train_losses, val_accuracies, model_name):
        """Plot training-loss and validation-accuracy curves."""
        plt.figure(figsize=(12, 4))
        plt.subplot(1, 2, 1)
        plt.plot(train_losses)
        plt.title(f'Training Loss - {model_name}')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.subplot(1, 2, 2)
        plt.plot(val_accuracies)
        plt.title(f'Validation Accuracy - {model_name}')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.tight_layout()
        plt.show()

    def compare_models(self):
        """Compare the performance of several models."""
        results = {}
        for model, name in zip(self.models, self.model_names):
            if hasattr(model, 'predict'):
                predictions = model.predict(self.X_test)
            else:
                # Allow a bare callable (e.g. a bound predict method)
                predictions = model(self.X_test)
            # Map integer predictions (e.g. from the random forest) back to strings
            predictions = [p if isinstance(p, str) else self.reverse_label_map[int(p)]
                           for p in predictions]
            accuracy = accuracy_score(self.y_test, predictions)
            report = classification_report(self.y_test, predictions, output_dict=True)
            results[name] = {
                'accuracy': accuracy,
                'precision': report['weighted avg']['precision'],
                'recall': report['weighted avg']['recall'],
                'f1_score': report['weighted avg']['f1-score'],
                'predictions': predictions
            }
            # Plot the confusion matrix
            self.plot_confusion_matrix(predictions, name)
        return results

    def plot_model_comparison(self, results):
        """Plot a side-by-side model comparison."""
        metrics = ['accuracy', 'precision', 'recall', 'f1_score']
        model_names = list(results.keys())
        plt.figure(figsize=(12, 8))
        for i, metric in enumerate(metrics):
            plt.subplot(2, 2, i+1)
            values = [results[model][metric] for model in model_names]
            plt.bar(model_names, values, color=['skyblue', 'lightcoral', 'lightgreen'])
            plt.title(f'Model Comparison - {metric.title()}')
            plt.ylim(0, 1)
            plt.xticks(rotation=45)
            # Annotate the bars with their values
            for j, v in enumerate(values):
                plt.text(j, v + 0.01, f'{v:.4f}', ha='center', va='bottom')
        plt.tight_layout()
        plt.show()

# Usage example
# Compare the random forest and the neural network
models_to_compare = [best_rf_model, nn_trainer.predict]  # the second entry is a bare callable
model_names = ['Random Forest', 'Neural Network']
evaluator = ModelEvaluator(models_to_compare, model_names, X_test, y_test)
results = evaluator.compare_models()
evaluator.plot_model_comparison(results)

# Print detailed results
for model_name, metrics in results.items():
    print(f"\n{model_name} metrics:")
    print(f"Accuracy:  {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall:    {metrics['recall']:.4f}")
    print(f"F1 score:  {metrics['f1_score']:.4f}")
Phase 5: Deployment and Launch
5.1 Building the Model Serving API
python
from flask import Flask, request, jsonify
import pickle
import torch

app = Flask(__name__)

# Load the trained model and preprocessing components
class SentimentService:
    def __init__(self):
        self.model = None
        self.feature_engineer = None
        self.preprocessor = None
        self.load_components()

    def load_components(self):
        """Load every component the service needs."""
        try:
            # Preprocessing component
            with open('preprocessor.pkl', 'rb') as f:
                self.preprocessor = pickle.load(f)
            # Feature-engineering component
            with open('feature_engineer.pkl', 'rb') as f:
                self.feature_engineer = pickle.load(f)
            # Model (fall back to CPU when no GPU is available)
            if torch.cuda.is_available():
                self.model = torch.load('sentiment_model.pth')
            else:
                self.model = torch.load('sentiment_model.pth', map_location=torch.device('cpu'))
            self.model.eval()
            print("All components loaded!")
        except Exception as e:
            print(f"Error loading components: {e}")

    def predict_sentiment(self, text):
        """Predict the sentiment of a piece of text."""
        try:
            # Preprocess the text
            cleaned_text = self.preprocessor.clean_text(text)
            segmented_text = self.preprocessor.segment_text(cleaned_text)
            # Extract features
            features = self.feature_engineer.create_combined_features([segmented_text])
            # Predict
            with torch.no_grad():
                features_tensor = torch.FloatTensor(features)
                outputs = self.model(features_tensor)
                probabilities = torch.softmax(outputs, dim=1)
                predicted_class = torch.argmax(probabilities, dim=1).item()
            # Map back to a label
            label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
            sentiment = label_map[predicted_class]
            confidence = probabilities[0][predicted_class].item()
            return {
                'sentiment': sentiment,
                'confidence': confidence,
                'probabilities': {
                    'negative': probabilities[0][0].item(),
                    'neutral': probabilities[0][1].item(),
                    'positive': probabilities[0][2].item()
                }
            }
        except Exception as e:
            print(f"Prediction error: {e}")
            return {'error': str(e)}

# Initialize the service
service = SentimentService()

@app.route('/health', methods=['GET'])
def health_check():
    """Health-check endpoint."""
    return jsonify({'status': 'healthy', 'message': 'Sentiment Analysis Service is running'})

@app.route('/predict', methods=['POST'])
def predict():
    """Single-prediction endpoint."""
    try:
        data = request.get_json()
        if not data or 'text' not in data:
            return jsonify({'error': 'No text provided'}), 400
        text = data['text']
        result = service.predict_sentiment(text)
        if 'error' in result:
            return jsonify(result), 500
        return jsonify(result)
    except Exception as e:
        return jsonify({'error': str(e)}), 500

@app.route('/batch_predict', methods=['POST'])
def batch_predict():
    """Batch-prediction endpoint."""
    try:
        data = request.get_json()
        if not data or 'texts' not in data:
            return jsonify({'error': 'No texts provided'}), 400
        texts = data['texts']
        results = []
        for text in texts:
            result = service.predict_sentiment(text)
            results.append(result)
        return jsonify({'results': results})
    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=False)
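A quick smoke test of the running service (a sketch; host and port match the `app.run` call above):
python
import requests

# Single prediction
resp = requests.post(
    'http://localhost:5000/predict',
    json={'text': '这个产品质量很好,物流也很快!'},
    timeout=5
)
print(resp.json())  # e.g. {'sentiment': 'positive', 'confidence': 0.97, ...}

# Batch prediction
resp = requests.post(
    'http://localhost:5000/batch_predict',
    json={'texts': ['非常满意', '太差了,再也不买了']},
    timeout=5
)
print(resp.json())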
5.2 Docker Containerized Deployment
dockerfile
# Dockerfile
FROM python:3.9-slim

# Set the working directory
WORKDIR /app

# Copy the dependency list
COPY requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code
COPY . .

# Create the models directory
RUN mkdir -p models

# Copy the trained model artifacts
COPY preprocessor.pkl models/
COPY feature_engineer.pkl models/
COPY sentiment_model.pth models/

# Create a non-root user
RUN useradd -m -u 1000 user
USER user

# Expose the service port
EXPOSE 5000

# Start the application
CMD ["python", "app.py"]
yaml
# docker-compose.yml
version: '3.8'
services:
sentiment-api:
build: .
ports:
- "5000:5000"
environment:
- PYTHONUNBUFFERED=1
- MODEL_PATH=/app/models
volumes:
- ./models:/app/models
restart: unless-stopped
nginx:
image: nginx:alpine
ports:
- "80:80"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf
depends_on:
- sentiment-api
restart: unless-stopped
monitor:
image: prom/prometheus:latest
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
restart: unless-stopped
grafana:
image: grafana/grafana:latest
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
depends_on:
- monitor
restart: unless-stopped
5.3 Monitoring and Logging Configuration
python
import logging
import time
from functools import wraps
from logging.handlers import RotatingFileHandler
from flask import request, Response
from prometheus_client import Counter, Histogram, generate_latest

# Prometheus metrics
REQUEST_COUNT = Counter('request_count', 'App Request Count',
                        ['app_name', 'endpoint', 'http_status'])
REQUEST_LATENCY = Histogram('request_latency_seconds', 'Request Latency',
                            ['app_name', 'endpoint'])

def setup_logging():
    """Configure application logging."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(name)s %(message)s',
        handlers=[
            RotatingFileHandler('app.log', maxBytes=1000000, backupCount=5),
            logging.StreamHandler()
        ]
    )

def monitor_requests():
    """Request-monitoring decorator."""
    def decorator(f):
        @wraps(f)  # preserve the view function's name for Flask
        def wrapped(*args, **kwargs):
            start_time = time.time()
            try:
                response = f(*args, **kwargs)
                # A view may return a Response or a (body, status) tuple
                status = response.status_code if hasattr(response, 'status_code') else response[1]
                REQUEST_COUNT.labels('sentiment_api', request.path, status).inc()
                return response
            except Exception as e:
                REQUEST_COUNT.labels('sentiment_api', request.path, 500).inc()
                raise e
            finally:
                request_latency = time.time() - start_time
                REQUEST_LATENCY.labels('sentiment_api', request.path).observe(request_latency)
        return wrapped
    return decorator

@app.route('/metrics')
def metrics():
    """Prometheus metrics endpoint."""
    return Response(generate_latest(), mimetype='text/plain')

# Add monitoring to the existing prediction endpoint. Re-registering
# '/predict' with @app.route would raise a duplicate-endpoint error,
# so wrap the already-registered view function instead:
app.view_functions['predict'] = monitor_requests()(app.view_functions['predict'])
Phase 6: Maintenance and Iteration
6.1 Model Performance Monitoring
python
import pandas as pd
from datetime import datetime
import smtplib
from email.mime.text import MIMEText  # note: the class is MIMEText, not MimeText

class ModelMonitor:
    def __init__(self, service: SentimentService):
        self.service = service
        self.performance_log = []

    def log_prediction(self, text, true_label, predicted_label, confidence):
        """Record one prediction result."""
        log_entry = {
            'timestamp': datetime.now(),
            'text': text,
            'true_label': true_label,
            'predicted_label': predicted_label,
            'confidence': confidence,
            'correct': true_label == predicted_label
        }
        self.performance_log.append(log_entry)

    def calculate_daily_metrics(self):
        """Compute daily performance metrics."""
        if not self.performance_log:
            return None
        df = pd.DataFrame(self.performance_log)
        today = datetime.now().date()
        today_data = df[df['timestamp'].dt.date == today]
        if len(today_data) == 0:
            return None
        accuracy = today_data['correct'].mean()
        avg_confidence = today_data['confidence'].mean()
        metrics = {
            'date': today,
            'total_predictions': len(today_data),
            'accuracy': accuracy,
            'avg_confidence': avg_confidence,
            'error_rate': 1 - accuracy
        }
        return metrics

    def check_performance_degradation(self, threshold=0.05):
        """Check for performance degradation."""
        metrics = self.calculate_daily_metrics()
        # Assumes a 90% baseline accuracy
        if metrics and metrics['accuracy'] < (0.9 - threshold):
            self.alert_performance_issue(metrics)
            return True
        return False

    def alert_performance_issue(self, metrics):
        """Send a performance alert."""
        subject = "Model performance alert"
        body = f"""
        Model performance degradation detected!
        Date: {metrics['date']}
        Total predictions: {metrics['total_predictions']}
        Accuracy: {metrics['accuracy']:.4f}
        Average confidence: {metrics['avg_confidence']:.4f}
        Error rate: {metrics['error_rate']:.4f}
        Check for data-distribution shifts or consider retraining the model.
        """
        self.send_alert(subject, body)

    def send_alert(self, subject, body):
        """Send an alert email (stub; smtplib/MIMEText would be used here)."""
        print(f"ALERT: {subject}")
        print(body)

# Usage example
monitor = ModelMonitor(service)

# Log predictions as they happen
def predict_with_monitoring(text, true_label=None):
    result = service.predict_sentiment(text)
    if true_label and 'error' not in result:
        monitor.log_prediction(
            text=text,
            true_label=true_label,
            predicted_label=result['sentiment'],
            confidence=result['confidence']
        )
        # Check performance
        monitor.check_performance_degradation()
    return result
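In production, the true labels that `predict_with_monitoring` expects usually come from user feedback. A hedged sketch of a feedback endpoint (my addition; the route name and payload shape are assumptions) that records a ground-truth label against a text:
python
@app.route('/feedback', methods=['POST'])
def feedback():
    """Accept a user-supplied true label for a previously analyzed text."""
    data = request.get_json()
    if not data or 'text' not in data or 'true_label' not in data:
        return jsonify({'error': 'text and true_label are required'}), 400
    if data['true_label'] not in ('negative', 'neutral', 'positive'):
        return jsonify({'error': 'invalid label'}), 400
    # Re-run the prediction and log it together with the ground truth
    result = predict_with_monitoring(data['text'], true_label=data['true_label'])
    return jsonify({'logged': 'error' not in result})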
6.2 Automated Retraining Pipeline
python
import schedule
import time
import threading
from datetime import datetime

class RetrainingPipeline:
    def __init__(self, data_collector, preprocessor, feature_engineer):
        self.data_collector = data_collector
        self.preprocessor = preprocessor
        self.feature_engineer = feature_engineer
        self.retraining_history = []

    def collect_new_data(self):
        """Collect new data."""
        print("Collecting new data...")
        # Collect freshly labeled data from production here,
        # e.g. labels gathered from user feedback
        pass

    def evaluate_model_drift(self):
        """Evaluate model drift."""
        print("Evaluating model drift...")
        # Compare the current data distribution with the training distribution;
        # trigger retraining if the difference exceeds a threshold
        return True  # placeholder return value

    def retrain_model(self):
        """Retrain the model."""
        print("Starting model retraining...")
        start_time = datetime.now()
        try:
            # Collect new data
            new_data = self.collect_new_data()
            # Merge new and old data
            # Retrain the model
            # Evaluate the new model's performance
            training_result = {
                'timestamp': datetime.now(),
                'status': 'success',
                'duration': (datetime.now() - start_time).total_seconds(),
                'new_data_points': len(new_data) if new_data is not None else 0,
                'performance_metrics': {}  # fill in real metrics here
            }
        except Exception as e:
            training_result = {
                'timestamp': datetime.now(),
                'status': 'failed',
                'error': str(e)
            }
        self.retraining_history.append(training_result)
        return training_result

    def schedule_retraining(self):
        """Schedule periodic retraining."""
        # Retrain every Sunday at 02:00
        schedule.every().sunday.at("02:00").do(self.retrain_model)
        # Check for model drift daily
        schedule.every().day.at("06:00").do(self.check_and_retrain)
        print("Retraining schedule configured")

    def check_and_retrain(self):
        """Check for drift and retrain if needed."""
        if self.evaluate_model_drift():
            print("Model drift detected; starting retraining...")
            self.retrain_model()

    def run_scheduler(self):
        """Run the scheduler loop."""
        while True:
            schedule.run_pending()
            time.sleep(60)

# Usage example
pipeline = RetrainingPipeline(collector, preprocessor, feature_engineer)
pipeline.schedule_retraining()

# Run the scheduler in a background thread
scheduler_thread = threading.Thread(target=pipeline.run_scheduler)
scheduler_thread.daemon = True
scheduler_thread.start()
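`evaluate_model_drift` above is a stub. One lightweight way to make it concrete (a sketch; the thresholds and the use of prediction-confidence distributions as a drift signal are assumptions) is a two-sample Kolmogorov-Smirnov test comparing recent prediction confidences against a reference window:
python
from scipy.stats import ks_2samp

def detect_confidence_drift(reference_confidences, recent_confidences,
                            p_value_threshold: float = 0.01) -> bool:
    """Flag drift when recent confidence scores differ significantly
    from the reference distribution (two-sample KS test)."""
    if len(reference_confidences) < 30 or len(recent_confidences) < 30:
        return False  # not enough data for a meaningful test
    statistic, p_value = ks_2samp(reference_confidences, recent_confidences)
    print(f"KS statistic={statistic:.4f}, p-value={p_value:.4f}")
    return p_value < p_value_threshold

# Usage: pull confidences from the ModelMonitor log, e.g.
# ref = [e['confidence'] for e in monitor.performance_log[:1000]]
# recent = [e['confidence'] for e in monitor.performance_log[-1000:]]
# drift = detect_confidence_drift(ref, recent)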
Complete Project Flowchart
The original flowchart shows three views. (1) The main pipeline: requirements analysis → data collection → data preprocessing → feature engineering → model training → model evaluation → "performance acceptable?" (no: loop back to training; yes: continue) → model deployment → API service → monitoring and alerting → performance monitoring → "retraining needed?" (yes: loop back to data collection). (2) The deployment architecture: clients reach a load balancer that fans out to API instances (instance 1, instance 2), which call the model service backed by a database and a cache. (3) The monitoring stack: ELK Stack for log collection, Prometheus for performance metrics, Grafana for dashboards, and an alert notification system.
Summary
Through a complete e-commerce review sentiment analysis project, this article has walked through the full AI project lifecycle from requirements analysis to production deployment. The key takeaways:
- Rigorous requirements analysis: clarify business goals and technical feasibility
- Systematic data handling: collection, cleaning, labeling, and exploratory analysis
- Sound model development: feature engineering, model selection, training, and optimization
- Thorough evaluation and validation: assess model performance along multiple dimensions
- Reliable deployment: containerization, API serving, monitoring and alerting
- Continuous maintenance and iteration: performance monitoring and automated retraining
A successful AI project requires not only solid technical implementation but also mature engineering processes and ongoing maintenance and optimization. I hope this article offers practical guidance for your own AI projects.
