pythonstudy Day41

早停策略和模型权重的保存

@疏锦行

clike 复制代码
import os
import random
from dataclasses import dataclass
from typing import Dict, Tuple

import numpy as np
import pandas as pd

import joblib
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix


# =========================
# 0) Config
# =========================
@dataclass
class Config:
    data_path: str = "data.xlsx"
    target_col: str = "Credit Default"
    drop_cols: Tuple[str, ...] = ("Id",)

    seed: int = 42
    batch_size: int = 256

    # Stage 1 training
    stage1_max_epochs: int = 30

    # Stage 2 training (resume)
    stage2_max_epochs: int = 50

    # Early stopping
    patience: int = 8
    min_delta: float = 1e-4

    # Optimization
    lr: float = 1e-3
    weight_decay: float = 1e-4

    # Artifacts
    artifacts_dir: str = "artifacts"
    preprocess_file: str = "preprocess.joblib"
    checkpoint_file: str = "checkpoint_best.pt"
    final_weights_file: str = "model_final.pt"


CFG = Config()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# =========================
# 1) Utils
# =========================
def set_seed(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def ensure_dir(path: str) -> None:
    os.makedirs(path, exist_ok=True)


def to_dense(x):
    return x.toarray() if hasattr(x, "toarray") else np.asarray(x)


@torch.no_grad()
def evaluate(model: nn.Module, loader: DataLoader, y_true: np.ndarray, threshold: float = 0.5) -> Dict:
    model.eval()
    probs = []
    for xb, _ in loader:
        xb = xb.to(DEVICE)
        logits = model(xb)
        p = torch.sigmoid(logits).detach().cpu().numpy()
        probs.append(p)
    probs = np.concatenate(probs, axis=0)
    pred = (probs >= threshold).astype(int)

    return {
        "acc": float(accuracy_score(y_true, pred)),
        "f1": float(f1_score(y_true, pred)),
        "auc": float(roc_auc_score(y_true, probs)),
        "cm": confusion_matrix(y_true, pred),
    }


def save_checkpoint(path: str, model: nn.Module, optimizer: torch.optim.Optimizer,
                    epoch: int, best_auc: float, patience_counter: int) -> None:
    payload = {
        "epoch": epoch,
        "best_auc": best_auc,
        "patience_counter": patience_counter,
        "model_state": model.state_dict(),
        "optimizer_state": optimizer.state_dict(),
    }
    torch.save(payload, path)


def load_checkpoint(path: str, model: nn.Module, optimizer: torch.optim.Optimizer = None):
    ckpt = torch.load(path, map_location="cpu")
    model.load_state_dict(ckpt["model_state"])
    if optimizer is not None and "optimizer_state" in ckpt:
        optimizer.load_state_dict(ckpt["optimizer_state"])
    return ckpt


# =========================
# 2) Data
# =========================
class NPDataset(Dataset):
    def __init__(self, X_np: np.ndarray, y_np: np.ndarray):
        self.X = torch.from_numpy(X_np.astype(np.float32))
        self.y = torch.from_numpy(y_np.astype(np.float32))

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        return self.X[i], self.y[i]


def build_preprocess(X: pd.DataFrame) -> ColumnTransformer:
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
    num_cols = [c for c in X.columns if c not in cat_cols]

    numeric_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    categorical_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocess = ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, num_cols),
            ("cat", categorical_pipe, cat_cols),
        ],
        remainder="drop"
    )
    return preprocess


def load_and_prepare(cfg: Config):
    df = pd.read_excel(cfg.data_path)

    # 常见占位异常值 -> 缺失
    if "Current Loan Amount" in df.columns:
        df.loc[df["Current Loan Amount"] == 99999999, "Current Loan Amount"] = np.nan

    for c in cfg.drop_cols:
        if c in df.columns:
            df = df.drop(columns=[c])

    if cfg.target_col not in df.columns:
        raise ValueError(f"Target column '{cfg.target_col}' not found. Columns: {list(df.columns)}")

    y = df[cfg.target_col].astype(int).values
    X = df.drop(columns=[cfg.target_col])

    # split (70/15/15) stratify
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.30, random_state=cfg.seed, stratify=y
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.50, random_state=cfg.seed, stratify=y_temp
    )

    preprocess = build_preprocess(X_train)
    X_train_t = to_dense(preprocess.fit_transform(X_train)).astype(np.float32)
    X_val_t = to_dense(preprocess.transform(X_val)).astype(np.float32)
    X_test_t = to_dense(preprocess.transform(X_test)).astype(np.float32)

    train_loader = DataLoader(NPDataset(X_train_t, y_train), batch_size=cfg.batch_size, shuffle=True)
    val_loader = DataLoader(NPDataset(X_val_t, y_val), batch_size=cfg.batch_size, shuffle=False)
    test_loader = DataLoader(NPDataset(X_test_t, y_test), batch_size=cfg.batch_size, shuffle=False)

    return preprocess, (X_train_t, y_train, train_loader), (X_val_t, y_val, val_loader), (X_test_t, y_test, test_loader)


# =========================
# 3) Model
# =========================
class MLP(nn.Module):
    def __init__(self, input_dim: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.25),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.20),

            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.10),

            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze(1)


def make_loss(y_train: np.ndarray) -> nn.Module:
    pos = (y_train == 1).sum()
    neg = (y_train == 0).sum()
    pos_weight = torch.tensor([neg / max(pos, 1)], dtype=torch.float32, device=DEVICE)
    return nn.BCEWithLogitsLoss(pos_weight=pos_weight)


# =========================
# 4) Train loop with early stopping + checkpoint
# =========================
def train_with_early_stopping(
    model: nn.Module,
    optimizer: torch.optim.Optimizer,
    criterion: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    y_val: np.ndarray,
    max_epochs: int,
    patience: int,
    min_delta: float,
    ckpt_path: str,
    start_epoch: int = 0,
    best_auc_init: float = -1.0,
    patience_counter_init: int = 0,
) -> Dict:
    best_auc = best_auc_init
    patience_counter = patience_counter_init

    for epoch in range(start_epoch + 1, start_epoch + max_epochs + 1):
        model.train()
        losses = []

        for xb, yb in train_loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)

            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
            optimizer.step()
            losses.append(loss.item())

        val_metrics = evaluate(model, val_loader, y_val, threshold=0.5)
        tr_loss = float(np.mean(losses))

        print(
            f"Epoch {epoch:03d} | loss={tr_loss:.4f} | "
            f"val_auc={val_metrics['auc']:.4f} val_f1={val_metrics['f1']:.4f} val_acc={val_metrics['acc']:.4f}"
        )

        improved = val_metrics["auc"] > (best_auc + min_delta)
        if improved:
            best_auc = val_metrics["auc"]
            patience_counter = 0
            save_checkpoint(ckpt_path, model, optimizer, epoch, best_auc, patience_counter)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch}. Best val AUC = {best_auc:.4f}")
                break

    return {
        "best_auc": best_auc,
        "patience_counter": patience_counter,
    }


def main():
    print(f"Using device: {DEVICE}")
    set_seed(CFG.seed)
    ensure_dir(CFG.artifacts_dir)

    preprocess, train_pack, val_pack, test_pack = load_and_prepare(CFG)
    _, y_train, train_loader = train_pack
    _, y_val, val_loader = val_pack
    _, y_test, test_loader = test_pack

    # Save preprocess (must!)
    preprocess_path = os.path.join(CFG.artifacts_dir, CFG.preprocess_file)
    joblib.dump(preprocess, preprocess_path)
    print(f"Saved preprocess: {preprocess_path}")

    input_dim = train_pack[0].shape[1]
    model = MLP(input_dim).to(DEVICE)

    criterion = make_loss(y_train)
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)

    ckpt_path = os.path.join(CFG.artifacts_dir, CFG.checkpoint_file)

    # -------------------------
    # Stage 1: Train then save best checkpoint
    # -------------------------
    print("\n===== Stage 1: Train & Save Weights =====")
    stage1_state = train_with_early_stopping(
        model=model,
        optimizer=optimizer,
        criterion=criterion,
        train_loader=train_loader,
        val_loader=val_loader,
        y_val=y_val,
        max_epochs=CFG.stage1_max_epochs,
        patience=CFG.patience,
        min_delta=CFG.min_delta,
        ckpt_path=ckpt_path,
        start_epoch=0,
        best_auc_init=-1.0,
        patience_counter_init=0,
    )

    # -------------------------
    # Stage 2: Load weights and continue training up to 50 epochs with early stopping
    # -------------------------
    print("\n===== Stage 2: Resume from Checkpoint & Continue 50 Epochs (Early Stop) =====")
    # 重新构建 optimizer(也可以沿用),然后从 checkpoint 恢复
    optimizer2 = torch.optim.AdamW(model.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)

    if os.path.exists(ckpt_path):
        ckpt = load_checkpoint(ckpt_path, model, optimizer2)
        start_epoch = int(ckpt.get("epoch", 0))
        best_auc = float(ckpt.get("best_auc", -1.0))
        # 第二阶段通常"重新计数 early stop",更符合"继续训练50轮并早停"
        patience_counter = 0
        print(f"Loaded checkpoint from epoch={start_epoch}, best_val_auc={best_auc:.4f}")
    else:
        # 没有 checkpoint 就从当前模型接着训练
        start_epoch = 0
        best_auc = stage1_state["best_auc"]
        patience_counter = 0
        print("Checkpoint not found, continuing from current weights.")

    _ = train_with_early_stopping(
        model=model,
        optimizer=optimizer2,
        criterion=criterion,
        train_loader=train_loader,
        val_loader=val_loader,
        y_val=y_val,
        max_epochs=CFG.stage2_max_epochs,   # <= 关键:最多继续50轮
        patience=CFG.patience,
        min_delta=CFG.min_delta,
        ckpt_path=ckpt_path,               # 持续覆盖保存"best checkpoint"
        start_epoch=start_epoch,
        best_auc_init=best_auc,
        patience_counter_init=patience_counter,
    )

    # 最终评估:加载 best checkpoint 再测 test(避免最后几轮变差)
    if os.path.exists(ckpt_path):
        _ = load_checkpoint(ckpt_path, model, optimizer=None)

    test_metrics = evaluate(model, test_loader, y_test, threshold=0.5)
    print("\n=== Test Metrics (Best Checkpoint) ===")
    print(f"Accuracy: {test_metrics['acc']:.4f}")
    print(f"F1      : {test_metrics['f1']:.4f}")
    print(f"ROC-AUC  : {test_metrics['auc']:.4f}")
    print("Confusion Matrix:\n", test_metrics["cm"])

    # 保存最终权重(可选:保存 best checkpoint 已经足够)
    final_path = os.path.join(CFG.artifacts_dir, CFG.final_weights_file)
    torch.save(model.state_dict(), final_path)
    print(f"\nSaved final weights: {final_path}")
    print(f"Best checkpoint: {ckpt_path}")


if __name__ == "__main__":
    main()
相关推荐
荣码5 小时前
GraphRAG:普通RAG只能回答"点"的问题,我踩了4个坑才搞懂
java·python
金銀銅鐵16 小时前
[Python] 基于欧几里得算法,实现分数约分计算器
python·数学
Lyn_Li18 小时前
Kaggle Top 5 | 198只股票、200条数据的金融预测——BattleFin高分方案从零复现
python·kaggle·比赛复盘·金融预测
拾年2751 天前
大模型的"聪明"从哪来?聊聊 AI 数据集的那些事儿
人工智能·深度学习·机器学习
小九九的爸爸1 天前
前端想要入门Agent开发,要具备哪些Python基础?
python·agent·ai编程
阿耶同学1 天前
手把手教你用 LangGraph 搭建三层嵌套 Agent 架构
python·程序员
花酒锄作田2 天前
Pydantic校验配置文件
python
hboot2 天前
AI工程师第四课 - 深度学习入门
pytorch·python·神经网络
罗西的思考2 天前
机器人 / 强化学习】HIL-SERL:人类在环驱动的具身智能进化框架
人工智能·算法·机器学习