包含邮件特征提取、XGBoost 训练、实时判定、拦截规则和简单 API。
python
import re
import os
import json
import joblib
import numpy as np
import pandas as pd
from typing import Dict, List, Any
from dataclasses import dataclass, asdict
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from xgboost import XGBClassifier
from flask import Flask, request, jsonify
# =========================
# 1. Configuration
# =========================
# Directory and file paths of the persisted model artifacts.
MODEL_DIR = "artifacts"
MODEL_PATH = os.path.join(MODEL_DIR, "xgb_phishing_model.joblib")
TFIDF_PATH = os.path.join(MODEL_DIR, "tfidf_vectorizer.joblib")
FEATURE_META_PATH = os.path.join(MODEL_DIR, "feature_meta.json")
# Import-time side effect: make sure the artifact directory exists.
os.makedirs(MODEL_DIR, exist_ok=True)
# Lowercase keywords; the feature extractor counts how many of them occur
# as substrings of the lowercased subject + body.
SUSPICIOUS_WORDS = [
    "urgent", "verify", "suspend", "password", "bank", "invoice", "click",
    "limited", "winner", "login", "confirm", "security", "alert", "reset",
    "account", "payment", "free", "prize", "immediately", "risk"
]
# Domains treated as trusted, both for sender addresses and for URL hosts.
TRUSTED_DOMAINS = {
    "google.com", "microsoft.com", "apple.com", "amazon.com", "github.com",
    "openai.com", "outlook.com", "qq.com", "163.com"
}
# =========================
# 2. Data structures
# =========================
@dataclass
class EmailSample:
    """One e-mail message together with its training label.

    Attributes:
        sender: Sender address; the domain is taken from the part after "@".
        subject: Subject line.
        body: Message body.
        urls: URLs supplied with the message. When empty, the feature
            extractor falls back to URLs it finds in the message text.
        has_attachment: 1 if the message carries an attachment, else 0.
        attachment_count: Number of attachments.
        label: 1 for phishing, 0 for legitimate mail.
    """

    sender: str
    subject: str
    body: str
    urls: List[str]
    has_attachment: int = 0
    attachment_count: int = 0
    # 1 = phishing, 0 = legit
    label: int = 0
# =========================
# 3. Feature engineering
# =========================
class EmailFeatureExtractor:
    """Convert e-mails into a sparse matrix of TF-IDF plus numeric features.

    The TF-IDF vocabulary is learned by :meth:`fit_transform`;
    :meth:`transform` reuses it and raises ``RuntimeError`` if the extractor
    has not been fitted yet. The 13 numeric features produced by
    :meth:`numeric_features` are appended as the last columns.
    """

    # The pattern contains a double quote, so the literal must be
    # single-quoted; double quotes would terminate the string early.
    _URL_PATTERN = re.compile(r'https?://[^\s<>"]+|www\.[^\s<>"]+')
    _SENDER_DOMAIN_PATTERN = re.compile(r"@([A-Za-z0-9.-]+)")
    # Sentence punctuation that commonly trails a URL in running text.
    _TRAILING_PUNCTUATION = ".,;:!?"

    def __init__(self):
        self.tfidf = TfidfVectorizer(
            max_features=3000,
            ngram_range=(1, 2),
            stop_words="english",
        )
        self.fitted = False

    @staticmethod
    def extract_urls(text: str) -> List[str]:
        """Return the lowercased URLs found in *text*.

        Matches ``http(s)://...`` and bare ``www....`` tokens. Trailing
        sentence punctuation is stripped so "see http://a.com." yields
        "http://a.com". Empty or ``None`` input yields an empty list.
        """
        if not text:
            return []
        matches = EmailFeatureExtractor._URL_PATTERN.findall(text.lower())
        cleaned = (
            m.rstrip(EmailFeatureExtractor._TRAILING_PUNCTUATION)
            for m in matches
        )
        return [url for url in cleaned if url]

    @staticmethod
    def sender_domain(sender: str) -> str:
        """Return the lowercased domain after "@" in *sender*, or "unknown"."""
        match = EmailFeatureExtractor._SENDER_DOMAIN_PATTERN.search(sender or "")
        return match.group(1).lower() if match else "unknown"

    @staticmethod
    def url_domain(url: str) -> str:
        """Return the host of *url* without port, credentials or "www." prefix.

        Returns "invalid" when *url* is not a string or cannot be parsed, and
        an empty string when the URL has no host part.
        """
        try:
            # Only prepend a scheme when none is present; a plain
            # startswith("http") test misreads hosts such as "httpbin.org".
            if "://" not in url:
                url = "http://" + url
            # ``hostname`` drops "user@" and ":port", which ``netloc`` keeps;
            # "http://google.com@evil.com" must resolve to "evil.com".
            host = urlparse(url).hostname or ""
        except (TypeError, ValueError, AttributeError):
            return "invalid"
        # Strip "www." only as a prefix, never from the middle of the host.
        return host[4:] if host.startswith("www.") else host

    def _email_urls(self, email: "EmailSample") -> List[str]:
        """Return the supplied URLs, or those found in subject + body."""
        if email.urls:
            return list(email.urls)
        return self.extract_urls(f"{email.subject or ''} {email.body or ''}")

    def numeric_features(self, email: "EmailSample") -> np.ndarray:
        """Return the 13 hand-crafted features of *email* as a float vector.

        Column order: suspicious word count, exclamation marks, digits,
        uppercase ratio, URL count, URL domains differing from the sender
        domain, URL domains outside the trusted set, sender-untrusted flag,
        has_attachment, attachment_count, body length, subject length,
        HTML hint flag.
        """
        subject = email.subject or ""
        body = email.body or ""
        raw = subject + body
        text = f"{subject} {body}".lower()

        sender_domain = self.sender_domain(email.sender)
        urls = self._email_urls(email)
        url_domains = [self.url_domain(u) for u in urls]

        suspicious_word_count = sum(1 for w in SUSPICIOUS_WORDS if w in text)
        exclamation_count = text.count("!")
        digit_count = sum(c.isdigit() for c in text)
        # max(1, ...) avoids division by zero for an empty message.
        uppercase_ratio = sum(1 for c in raw if c.isupper()) / max(1, len(raw))
        mismatched_domains = sum(
            1 for d in url_domains if d and d != sender_domain
        )
        untrusted_url_domains = sum(
            1 for d in url_domains if d not in TRUSTED_DOMAINS
        )
        sender_untrusted = 0 if sender_domain in TRUSTED_DOMAINS else 1
        has_html_hint = int("<html" in text or "href=" in text)

        return np.array([
            suspicious_word_count,
            exclamation_count,
            digit_count,
            uppercase_ratio,
            len(urls),
            mismatched_domains,
            untrusted_url_domains,
            sender_untrusted,
            email.has_attachment,
            email.attachment_count,
            len(body),
            len(subject),
            has_html_hint,
        ], dtype=float)

    def _combine_text(self, email: "EmailSample") -> str:
        """Build the tagged text document that is fed to the TF-IDF model."""
        sender_domain = self.sender_domain(email.sender)
        urls = self._email_urls(email)
        subject = email.subject or ""
        body = email.body or ""
        return (
            f"SUBJECT {subject} BODY {body} "
            f"SENDER_DOMAIN {sender_domain} URLS {' '.join(urls)}"
        )

    def _numeric_matrix(self, emails: List["EmailSample"]) -> np.ndarray:
        """Stack the numeric feature vectors of *emails* row by row."""
        if not emails:
            raise ValueError("emails must not be empty")
        return np.vstack([self.numeric_features(e) for e in emails])

    def fit_transform(self, emails: List["EmailSample"]):
        """Fit the TF-IDF vocabulary on *emails* and return their features.

        Raises:
            ValueError: If *emails* is empty.
        """
        numeric = self._numeric_matrix(emails)
        texts = [self._combine_text(e) for e in emails]
        tfidf_matrix = self.tfidf.fit_transform(texts)
        self.fitted = True
        return hstack([tfidf_matrix, csr_matrix(numeric)], format="csr")

    def transform(self, emails: List["EmailSample"]):
        """Return the feature matrix of *emails* using the fitted vocabulary.

        Raises:
            RuntimeError: If the extractor has not been fitted or loaded.
            ValueError: If *emails* is empty.
        """
        if not self.fitted:
            raise RuntimeError("Feature extractor is not fitted.")
        numeric = self._numeric_matrix(emails)
        texts = [self._combine_text(e) for e in emails]
        tfidf_matrix = self.tfidf.transform(texts)
        return hstack([tfidf_matrix, csr_matrix(numeric)], format="csr")
# =========================
# 4. Model service
# =========================
class PhishingDetector:
    """XGBoost phishing classifier with action policy and persistence.

    Wraps an :class:`EmailFeatureExtractor` and an ``XGBClassifier``.
    Prediction methods raise ``RuntimeError`` until the model has been
    trained via :meth:`train` or restored via :meth:`load`.
    """

    # Probability cut-offs used by ``decide_action``; subclasses or instances
    # may override them.
    BLOCK_THRESHOLD = 0.85
    QUARANTINE_THRESHOLD = 0.60
    FLAG_THRESHOLD = 0.40
    # Cut-off for the boolean ``is_phishing`` verdict.
    PHISHING_THRESHOLD = 0.5
    # Words whose presence triggers the "high-risk wording" explanation.
    HIGH_RISK_WORDS = ("verify", "password", "reset", "urgent", "bank", "login")

    def __init__(self):
        self.extractor = EmailFeatureExtractor()
        self.model = XGBClassifier(
            n_estimators=250,
            max_depth=6,
            learning_rate=0.08,
            subsample=0.9,
            colsample_bytree=0.9,
            eval_metric="logloss",
            random_state=42,
            n_jobs=4,
        )
        self.is_trained = False

    def train(self, emails: List["EmailSample"]) -> Dict[str, Any]:
        """Fit extractor and model on *emails* and evaluate on a 20% hold-out.

        Prints the evaluation and returns it as a dict with the keys
        "classification_report" (str) and "roc_auc" (float).
        """
        X = self.extractor.fit_transform(emails)
        y = np.array([e.label for e in emails])
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        self.model.fit(X_train, y_train)
        self.is_trained = True

        pred = self.model.predict(X_test)
        prob = self.model.predict_proba(X_test)[:, 1]
        # Compute each metric once and reuse it for printing and returning.
        report = classification_report(y_test, pred, digits=4)
        roc_auc = float(roc_auc_score(y_test, prob))

        print("=== 模型评估 ===")
        print(report)
        print("ROC-AUC:", round(roc_auc, 4))
        return {"classification_report": report, "roc_auc": roc_auc}

    def predict_one(self, email: "EmailSample") -> Dict[str, Any]:
        """Score a single e-mail.

        Returns:
            Dict with "is_phishing" (bool), "phishing_probability" (rounded
            to 4 decimals), "action" (see :meth:`decide_action`) and
            "reasons" (see :meth:`explain`).

        Raises:
            RuntimeError: If the model is neither trained nor loaded.
        """
        if not self.is_trained:
            raise RuntimeError("Model is not trained or loaded.")
        X = self.extractor.transform([email])
        proba = float(self.model.predict_proba(X)[0, 1])
        return {
            "is_phishing": bool(proba >= self.PHISHING_THRESHOLD),
            "phishing_probability": round(proba, 4),
            "action": self.decide_action(proba),
            "reasons": self.explain(email),
        }

    def decide_action(self, proba: float) -> str:
        """Map a phishing probability to "block", "quarantine", "flag" or "allow"."""
        if proba >= self.BLOCK_THRESHOLD:
            return "block"
        if proba >= self.QUARANTINE_THRESHOLD:
            return "quarantine"
        if proba >= self.FLAG_THRESHOLD:
            return "flag"
        return "allow"

    def explain(self, email: "EmailSample") -> List[str]:
        """Return rule-based, human-readable reasons for *email*.

        The reasons come from fixed heuristics on the message, not from the
        trained model, so they are independent of the predicted probability.
        """
        reasons = []
        # ``or ""`` keeps a missing subject/body from becoming the text "None".
        text = f"{email.subject or ''} {email.body or ''}".lower()
        sender_domain = self.extractor.sender_domain(email.sender)
        urls = email.urls if email.urls else self.extractor.extract_urls(text)

        if any(word in text for word in self.HIGH_RISK_WORDS):
            reasons.append("邮件正文或标题包含高风险诱导词")
        if sender_domain not in TRUSTED_DOMAINS:
            reasons.append("发件人域名不在可信名单中")
        if urls:
            reasons.append("邮件中包含链接,存在跳转风险")
        if any(self.extractor.url_domain(u) != sender_domain for u in urls):
            reasons.append("链接域名与发件人域名不一致")
        if email.has_attachment:
            reasons.append("邮件带有附件,需要进一步审查")
        if not reasons:
            reasons.append("未发现明显高风险特征")
        return reasons

    def save(self):
        """Persist model, TF-IDF vectorizer and a small metadata file.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        if not self.is_trained:
            raise RuntimeError("Nothing to save. Train the model first.")
        # Do not rely on the directory created at import time still existing.
        os.makedirs(MODEL_DIR, exist_ok=True)
        joblib.dump(self.model, MODEL_PATH)
        joblib.dump(self.extractor.tfidf, TFIDF_PATH)
        with open(FEATURE_META_PATH, "w", encoding="utf-8") as f:
            json.dump({"fitted": True}, f, ensure_ascii=False, indent=2)

    def load(self):
        """Restore model and TF-IDF vectorizer from the artifact files.

        SECURITY: joblib files are pickles; loading one executes arbitrary
        code embedded in it. Only load artifacts written by a trusted process.
        """
        self.model = joblib.load(MODEL_PATH)
        self.extractor.tfidf = joblib.load(TFIDF_PATH)
        self.extractor.fitted = True
        self.is_trained = True
# =========================
# 5. Demo data
# =========================
def build_demo_dataset() -> List[EmailSample]:
    """Return a small synthetic dataset: 8 template e-mails repeated 60 times.

    Every returned sample is an independent copy (including its ``urls``
    list). The data only exists to make the training demo runnable; because
    the samples are exact duplicates, metrics measured on it say nothing
    about real-world accuracy.
    """
    # Columns: sender, subject, body, urls, has_attachment, attachment_count, label
    rows = [
        (
            "security@paypa1-support.com",
            "Urgent: Verify your account immediately",
            "Your account will be suspended. Click here to verify your password: http://paypa1-login-check.com",
            ["http://paypa1-login-check.com"],
            0, 0, 1,
        ),
        (
            "it-support@company-alerts.net",
            "Password reset required",
            "We detected unusual login activity. Reset your password now at http://secure-reset-login.net",
            ["http://secure-reset-login.net"],
            0, 0, 1,
        ),
        (
            "ceo.office@external-consult.net",
            "Confidential invoice attached",
            "Please review the attached invoice and process payment today.",
            [],
            1, 1, 1,
        ),
        (
            "notifications@github.com",
            "Your pull request has been reviewed",
            "A reviewer left comments on your pull request. Visit https://github.com to review.",
            ["https://github.com"],
            0, 0, 0,
        ),
        (
            "no-reply@microsoft.com",
            "Security info updated",
            "Your security information was updated successfully. If this wasn't you, visit https://microsoft.com/security",
            ["https://microsoft.com/security"],
            0, 0, 0,
        ),
        (
            "hr@company.com",
            "Interview schedule confirmation",
            "Please confirm your interview slot for next Monday.",
            [],
            0, 0, 0,
        ),
        (
            "service@amaz0n-billing.com",
            "Payment failed - urgent action needed",
            "Your payment failed. Update your account now: http://amaz0n-billing-check.com",
            ["http://amaz0n-billing-check.com"],
            0, 0, 1,
        ),
        (
            "newsletter@openai.com",
            "Product update newsletter",
            "Read about new product updates on https://openai.com/blog",
            ["https://openai.com/blog"],
            0, 0, 0,
        ),
    ]
    templates = [EmailSample(*row) for row in rows]
    # Replicate the templates so there is enough data for the training demo;
    # asdict() deep-copies, so no two samples share a urls list.
    return [
        EmailSample(**asdict(template))
        for _ in range(60)
        for template in templates
    ]
# =========================
# 6. Training entry point
# =========================
def train_and_save_demo_model():
    """Train a fresh detector on the demo dataset, save it, return its metrics."""
    samples = build_demo_dataset()
    demo_detector = PhishingDetector()
    evaluation = demo_detector.train(samples)
    demo_detector.save()
    print("模型已保存到 artifacts/ 目录")
    return evaluation
# =========================
# 7. Interception service API
# =========================
app = Flask(__name__)
service_detector = PhishingDetector()
# Restore the persisted artifacts at import time, but only when both the
# model and the TF-IDF vectorizer files are present.
if all(os.path.exists(path) for path in (MODEL_PATH, TFIDF_PATH)):
    service_detector.load()
@app.route("/health", methods=["GET"])
def health():
    """Report service liveness and whether a model is currently loaded."""
    status = {
        "status": "ok",
        "model_loaded": service_detector.is_trained,
    }
    return jsonify(status)
@app.route("/predict", methods=["POST"])
def predict():
    """Score the e-mail described by the JSON request body.

    Expected JSON object fields (all optional): "sender", "subject", "body"
    (strings), "urls" (list of strings), "has_attachment" and
    "attachment_count" (integers).

    Returns:
        The prediction dict as JSON; a JSON error with status 400 for a
        malformed request, or status 500 when no model is loaded.
    """
    if not service_detector.is_trained:
        return jsonify({"error": "model not loaded"}), 500

    # silent=True yields None on unparsable input so the client gets a JSON
    # error body instead of the framework's default error page.
    payload = request.get_json(force=True, silent=True)
    if not isinstance(payload, dict):
        return jsonify({"error": "request body must be a JSON object"}), 400

    urls = payload.get("urls") or []
    # A bare string would otherwise be iterated character by character.
    if not isinstance(urls, list) or not all(isinstance(u, str) for u in urls):
        return jsonify({"error": "urls must be a list of strings"}), 400

    try:
        has_attachment = int(payload.get("has_attachment") or 0)
        attachment_count = int(payload.get("attachment_count") or 0)
    except (TypeError, ValueError):
        return jsonify({
            "error": "has_attachment and attachment_count must be integers"
        }), 400

    email = EmailSample(
        # ``or ""`` maps an explicit JSON null to an empty string.
        sender=str(payload.get("sender") or ""),
        subject=str(payload.get("subject") or ""),
        body=str(payload.get("body") or ""),
        urls=urls,
        has_attachment=has_attachment,
        attachment_count=attachment_count,
        label=0,
    )
    return jsonify(service_detector.predict_one(email))
# =========================
# 8. Simulated mail-gateway interception flow
# =========================
def intercept_email(email_data: Dict[str, Any]) -> Dict[str, Any]:
    """Score *email_data* with the service detector and pick a gateway operation.

    Args:
        email_data: Mapping with the optional keys "sender", "subject",
            "body", "urls", "has_attachment" and "attachment_count".

    Returns:
        Dict with "email" (sender and subject), "decision" (the detector's
        prediction) and "operation" (description of the chosen action).

    Raises:
        RuntimeError: If the service model is not loaded.
    """
    if not service_detector.is_trained:
        raise RuntimeError("Service model is not loaded.")

    field = email_data.get
    email = EmailSample(
        sender=field("sender", ""),
        subject=field("subject", ""),
        body=field("body", ""),
        urls=field("urls", []),
        has_attachment=int(field("has_attachment", 0)),
        attachment_count=int(field("attachment_count", 0)),
        label=0,
    )
    decision = service_detector.predict_one(email)

    # Description of the gateway operation for each detector action.
    operations = {
        "allow": "允许投递",
        "flag": "添加风险标签后投递",
        "quarantine": "隔离到安全区等待人工复核",
        "block": "直接拦截并告警",
    }
    summary = {"sender": email.sender, "subject": email.subject}
    return {
        "email": summary,
        "decision": decision,
        "operation": operations[decision["action"]],
    }
# =========================
# 9. Local test
# =========================
if __name__ == "__main__":
    if not os.path.exists(MODEL_PATH) or not os.path.exists(TFIDF_PATH):
        # No artifacts existed when the module was imported, so train a demo
        # model and load it into the shared service detector.
        train_and_save_demo_model()
        service_detector.load()

    test_email = {
        "sender": "alert@micr0soft-reset.net",
        "subject": "Urgent account verification",
        "body": "Your account is at risk. Verify now using http://micr0soft-login-check.net",
        "urls": ["http://micr0soft-login-check.net"],
        "has_attachment": 0,
        "attachment_count": 0,
    }
    print("=== 本地拦截测试 ===")
    outcome = intercept_email(test_email)
    print(json.dumps(outcome, ensure_ascii=False, indent=2))

    # Start the API; 0.0.0.0 listens on every network interface.
    app.run(host="0.0.0.0", port=5000, debug=False)
下面给你的是一个可运行的 Python 版"基于 XGBoost 的钓鱼邮件智能识别与拦截系统"方案,已经放到旁边的代码文档里了。它包含这些核心能力:
- 邮件特征提取
  - 发件人域名
  - 标题/正文文本
  - URL 数量与域名是否伪装
  - 可疑关键词
  - 大写比例、数字数量、附件数量等
- XGBoost 分类模型
  - 使用 TF-IDF + 数值特征组合建模
  - 输出钓鱼概率
- 智能拦截策略
  - `allow`:放行
  - `flag`:风险标记
  - `quarantine`:隔离
  - `block`:直接拦截
- API 服务
  - `POST /predict`:实时判定邮件风险
  - `GET /health`:检查服务状态
运行前安装依赖
```bash
pip install xgboost scikit-learn pandas numpy scipy flask joblib
```
启动方式
```bash
python your_file.py
```
首次启动会:
- 自动生成演示训练集
- 训练 XGBoost 模型
- 保存模型到 `artifacts/`
- 启动 Flask API,端口 `5000`
调用示例
```bash
curl -X POST http://127.0.0.1:5000/predict \
-H "Content-Type: application/json" \
-d '{
"sender": "alert@micr0soft-reset.net",
"subject": "Urgent account verification",
"body": "Your account is at risk. Verify now using http://micr0soft-login-check.net",
"urls": ["http://micr0soft-login-check.net"],
"has_attachment": 0,
"attachment_count": 0
}'
```
返回示例:
```json
{
  "is_phishing": true,
  "phishing_probability": 0.93,
  "action": "block",
  "reasons": [
    "邮件正文或标题包含高风险诱导词",
    "发件人域名不在可信名单中",
    "邮件中包含链接,存在跳转风险",
    "链接域名与发件人域名不一致"
  ]
}
```