import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
# ==========================
# 1. Focal Loss (addresses class imbalance)
# ==========================
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        # Per-sample BCE, then down-weight easy examples by (1 - p_t)^gamma
        bce_loss = nn.BCEWithLogitsLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-bce_loss)  # p_t: predicted probability of the true class
        focal_loss = self.alpha * (1 - pt) ** self.gamma * bce_loss
        if self.reduction == 'mean':
            return torch.mean(focal_loss)
        elif self.reduction == 'sum':
            return torch.sum(focal_loss)
        return focal_loss
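
# Quick sanity check (illustrative addition, not from the original): with
# gamma=0 and alpha=1, focal loss reduces exactly to plain mean BCE-with-logits.
_logits, _targets = torch.randn(8, 1), torch.randint(0, 2, (8, 1)).float()
assert torch.allclose(FocalLoss(alpha=1, gamma=0)(_logits, _targets),
                      nn.BCEWithLogitsLoss()(_logits, _targets))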
# ==========================
# 2. Neural network model
# ==========================
class TabularNN(nn.Module):
    def __init__(self, cat_dims, num_dims, emb_dims, hidden_units, dropout=0.2):
        """
        cat_dims: cardinality of each categorical feature (list)
        num_dims: number of numeric features (int)
        emb_dims: embedding dimension for each categorical feature (list)
        hidden_units: MLP hidden-layer sizes (list, e.g., [512, 256, 128])
        """
        super(TabularNN, self).__init__()
        # 1. One embedding table per categorical feature
        self.embeddings = nn.ModuleList([
            nn.Embedding(num_embeddings=count, embedding_dim=dim)
            for count, dim in zip(cat_dims, emb_dims)
        ])
        # 2. Total input dimension of the MLP
        emb_total_dim = sum(emb_dims)
        input_dim = emb_total_dim + num_dims
        # 3. MLP tower
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_units:
            layers.append(nn.Linear(prev_dim, h_dim))
            layers.append(nn.BatchNorm1d(h_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            prev_dim = h_dim
        self.mlp = nn.Sequential(*layers)
        # 4. Output layer
        self.out_layer = nn.Linear(prev_dim, 1)
        # Weight initialization
        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, cat_features, num_features):
        # cat_features: list of tensors [B], num_features: tensor [B, N]
        # 1. Look up and concatenate the embeddings
        emb_outputs = [emb(cat) for emb, cat in zip(self.embeddings, cat_features)]
        emb_concat = torch.cat(emb_outputs, dim=1)
        # 2. Concatenate with the numeric features
        x = torch.cat([emb_concat, num_features], dim=1)
        # 3. MLP forward pass
        x = self.mlp(x)
        # 4. Output logits (no sigmoid here: the loss uses BCE-with-logits)
        return self.out_layer(x)
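
# Minimal shape check (illustrative assumptions: two categorical features with
# cardinalities [10, 20] and embedding dims [4, 6], plus 3 numeric features).
# Embeddings concatenate to (B, 10); appending the numerics gives (B, 13),
# which the MLP maps to (B, 1) logits.
_m = TabularNN(cat_dims=[10, 20], num_dims=3, emb_dims=[4, 6], hidden_units=[16])
_m.eval()  # eval mode so BatchNorm1d accepts any batch size
_out = _m([torch.randint(0, 10, (5,)), torch.randint(0, 20, (5,))], torch.randn(5, 3))
assert _out.shape == (5, 1)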
# ==========================
# 3. Training and validation loop (with tuning strategy)
# ==========================
def train_model(model, train_loader, val_loader, device, epochs=50, lr=1e-3, weight_decay=1e-5):
    # Alternative for imbalance: nn.BCEWithLogitsLoss(pos_weight=torch.tensor([20.0]).to(device))
    criterion = FocalLoss(alpha=1.0, gamma=2.0)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # LR schedule: halve the learning rate when validation loss plateaus
    # (verbose=True dropped: deprecated in recent PyTorch; the loop prints the LR itself)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)
    best_val_loss = float('inf')
    patience_counter = 0
    early_stop_patience = 7
    best_model_state = None
print(f"{'Epoch':<6} | {'Train Loss':<10} | {'Val Loss':<10} | {'Val AUC':<8} | {'LR'}")
print("-" * 60)
for epoch in range(epochs):
# --- 训练阶段 ---
model.train()
train_loss_sum = 0
for cat_batch, num_batch, y_batch in train_loader:
cat_batch = [c.to(device) for c in cat_batch]
num_batch = num_batch.to(device)
y_batch = y_batch.to(device)
optimizer.zero_grad()
outputs = model(cat_batch, num_batch)
loss = criterion(outputs, y_batch.unsqueeze(1).float())
loss.backward()
# 梯度裁剪 (防止梯度爆炸,重要!)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
train_loss_sum += loss.item()
avg_train_loss = train_loss_sum / len(train_loader)
        # --- Validation phase ---
        model.eval()
        val_loss_sum = 0
        all_preds = []
        all_targets = []
        with torch.no_grad():
            for batch in val_loader:
                *cat_batch, num_batch, y_batch = [t.to(device) for t in batch]
                outputs = model(cat_batch, num_batch)
                loss = criterion(outputs, y_batch.unsqueeze(1).float())
                val_loss_sum += loss.item()
                # squeeze to 1-D so roc_auc_score gets a flat score vector
                probs = torch.sigmoid(outputs).squeeze(1).cpu().numpy()
                all_preds.extend(probs)
                all_targets.extend(y_batch.cpu().numpy())
        avg_val_loss = val_loss_sum / len(val_loader)
        val_auc = roc_auc_score(all_targets, all_preds)
        # Progress report
        current_lr = optimizer.param_groups[0]['lr']
        print(f"{epoch+1:<6} | {avg_train_loss:.6f} | {avg_val_loss:.6f} | {val_auc:.4f} | {current_lr:.2e}")
        # Step the scheduler on validation loss
        scheduler.step(avg_val_loss)
        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            # Deep-copy the best weights: state_dict() returns live tensor
            # references, so a shallow .copy() would track later updates
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            patience_counter += 1
            if patience_counter >= early_stop_patience:
                print(f"\n⚠️ Early stopping triggered at epoch {epoch+1}")
                break
    # Restore the best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f"✅ Loaded best model (Val Loss: {best_val_loss:.6f})")
    return model
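
# Inference sketch (an addition, not from the original): returns sigmoid
# probabilities for an unlabeled loader whose batches are laid out as
# (*cat_tensors, num_tensor), mirroring the training DataLoader minus labels.
def predict_proba(model, loader, device):
    model.eval()
    preds = []
    with torch.no_grad():
        for batch in loader:
            *cat_batch, num_batch = [t.to(device) for t in batch]
            preds.append(torch.sigmoid(model(cat_batch, num_batch)).cpu())
    return torch.cat(preds).squeeze(1).numpy()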
# ==========================
# 4. Data preparation example (simulating the IJCAI data)
# ==========================
def prepare_data(df, cat_cols, num_cols, target_col='is_trade'):
    # 1. Label-encode categorical features
    le_dict = {}
    cat_data = []
    cat_dims = []
    emb_dims = []
    for col in cat_cols:
        le = LabelEncoder()
        # Note: the test set may contain unseen categories. This is simplified
        # here; in practice, fit the encoder on the combined train+test set.
        df[col] = df[col].fillna(-1).astype(str)
        le.fit(df[col])
        df[col] = le.transform(df[col])
        le_dict[col] = le
        n_unique = df[col].nunique()
        cat_data.append(torch.tensor(df[col].values, dtype=torch.long))
        cat_dims.append(n_unique)
        # Rule-of-thumb embedding size
        emb_dims.append(min(600, int(np.ceil(n_unique ** 0.25) * 2)))
    # 2. Standardize numeric features
    num_data = df[num_cols].fillna(0).values
    # Simple normalization (a StandardScaler fit on train only is preferable)
    num_data = (num_data - num_data.mean(axis=0)) / (num_data.std(axis=0) + 1e-8)
    num_tensor = torch.tensor(num_data, dtype=torch.float32)
    # 3. Labels
    if target_col in df.columns:
        y = df[target_col].values
        # Drop test-set rows marked with -1
        mask = y != -1
        y = y[mask]
        cat_data = [c[mask] for c in cat_data]
        num_tensor = num_tensor[mask]
        y_tensor = torch.tensor(y, dtype=torch.long)
    else:
        y_tensor = None  # test set has no labels
    return cat_data, num_tensor, y_tensor, cat_dims, emb_dims
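
# Worked example of the embedding-size rule of thumb above: for a feature with
# n_unique = 5000, ceil(5000 ** 0.25) = 9, giving min(600, 9 * 2) = 18 dims.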
# ==========================
# 5. Main entry point
# ==========================
if __name__ == "__main__":
    # Assuming you already have a preprocessed DataFrame 'df':
    # df = pd.read_csv(...)
    # Simulated data generation (replace with your real data loading)
    print("Generating simulated data...")
    n_samples = 10000
    df_sim = pd.DataFrame({
        'user_id': np.random.randint(0, 5000, n_samples),
        'item_id': np.random.randint(0, 8000, n_samples),
        'shop_id': np.random.randint(0, 2000, n_samples),
        'hour': np.random.randint(0, 24, n_samples),
        'price': np.random.rand(n_samples) * 100,
        'is_trade': np.random.choice([0, 1], n_samples, p=[0.98, 0.02])  # simulated class imbalance
    })
    # Feature configuration
    CAT_COLS = ['user_id', 'item_id', 'shop_id', 'hour']
    NUM_COLS = ['price']
    TARGET = 'is_trade'
    # Prepare the data
    cat_tensors, num_tensor, y_tensor, cat_dims, emb_dims = prepare_data(df_sim, CAT_COLS, NUM_COLS, TARGET)
    # Train/validation split (simple slicing)
    split_idx = int(len(y_tensor) * 0.8)
    train_cat = [c[:split_idx] for c in cat_tensors]
    train_num = num_tensor[:split_idx]
    train_y = y_tensor[:split_idx]
    val_cat = [c[split_idx:] for c in cat_tensors]
    val_num = num_tensor[split_idx:]
    val_y = y_tensor[split_idx:]
    # Build the DataLoaders
    train_dataset = TensorDataset(*train_cat, train_num, train_y)
    val_dataset = TensorDataset(*val_cat, val_num, val_y)
    train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False)
    # Initialize the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    hidden_layers = [512, 256, 128, 64]  # tuning focus: try fewer or narrower layers
    model = TabularNN(
        cat_dims=cat_dims,
        num_dims=len(NUM_COLS),
        emb_dims=emb_dims,
        hidden_units=hidden_layers,
        dropout=0.3  # tuning focus: 0.2 to 0.5
    ).to(device)
    print(f"Parameter count: {sum(p.numel() for p in model.parameters()):,}")
    # Start training
    best_model = train_model(
        model,
        train_loader,
        val_loader,
        device,
        epochs=30,
        lr=1e-3,  # tuning focus: 1e-2 to 1e-4
        weight_decay=1e-4  # tuning focus: 1e-6 to 1e-3
    )
    print("\n🎉 Training complete!")