Python study — Day 40

复习日


@疏锦行

Code:
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# -----------------------------
# 1) Reproducibility
# -----------------------------
def set_seed(seed: int = 42) -> None:
    """Seed every RNG in play (Python, NumPy, Torch CPU and all CUDA devices)."""
    for seeder in (random.seed, np.random.seed,
                   torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

# Seed all RNGs before any weight initialization so runs are repeatable.
set_seed(42)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # prefer GPU when present
print(f"Using device: {DEVICE}")

# -----------------------------
# 2) Load data
# -----------------------------
DATA_PATH = "data.xlsx"  # <- change to the actual path if the file is elsewhere
df = pd.read_excel(DATA_PATH)

TARGET_COL = "Credit Default"
DROP_COLS = ["Id"]  # pure identifier columns should not feed the model

# Sentinel outlier: 99999999 is commonly a missing/placeholder value
if "Current Loan Amount" in df.columns:
    df.loc[df["Current Loan Amount"] == 99999999, "Current Loan Amount"] = np.nan

# Drop the unwanted columns (tolerating their absence)
for c in DROP_COLS:
    if c in df.columns:
        df = df.drop(columns=[c])

# Split features/target; fail fast with the available columns listed
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found. Columns: {list(df.columns)}")

y = df[TARGET_COL].astype(int).values
X = df.drop(columns=[TARGET_COL])

# -----------------------------
# 3) Preprocess (numeric + categorical)
# -----------------------------
# Object-dtype columns are treated as categorical; everything else as numeric.
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

# Numeric: median imputation (robust to outliers) then standardization.
numeric_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical: mode imputation then one-hot; categories unseen during fit
# are ignored at transform time instead of raising.
categorical_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
)

# Stratified split: train/val/test = 70/15/15
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# Fit the preprocessor on the training set only, to avoid data leakage
X_train_t = preprocess.fit_transform(X_train)
X_val_t   = preprocess.transform(X_val)
X_test_t  = preprocess.transform(X_test)

def to_dense(a):
    """Return *a* as a dense ndarray, densifying scipy sparse matrices.

    ColumnTransformer + OneHotEncoder may emit sparse output; per the
    original notes the dataset (~7500 rows) is small enough to hold dense.
    """
    if hasattr(a, "toarray"):
        return a.toarray()
    return np.asarray(a)

# Densify and cast to float32 once here so torch.from_numpy yields float32 tensors.
X_train_t = to_dense(X_train_t).astype(np.float32)
X_val_t   = to_dense(X_val_t).astype(np.float32)
X_test_t  = to_dense(X_test_t).astype(np.float32)

print("Train shape:", X_train_t.shape, "Val shape:", X_val_t.shape, "Test shape:", X_test_t.shape)

# -----------------------------
# 4) Torch Dataset
# -----------------------------
class NPDataset(Dataset):
    """Expose a (features, binary-labels) NumPy pair as a torch Dataset."""

    def __init__(self, X_np: np.ndarray, y_np: np.ndarray):
        # Labels are stored as float32 so batches feed BCEWithLogitsLoss directly.
        self.X = torch.from_numpy(X_np)
        self.y = torch.from_numpy(y_np.astype(np.float32))

    def __len__(self) -> int:
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Shuffle only the training loader; eval loaders keep row order so predictions
# line up with y_val / y_test when computing metrics.
BATCH_SIZE = 256
train_loader = DataLoader(NPDataset(X_train_t, y_train), batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(NPDataset(X_val_t, y_val), batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(NPDataset(X_test_t, y_test), batch_size=BATCH_SIZE, shuffle=False)

# -----------------------------
# 5) Model (MLP)
# -----------------------------
class MLP(nn.Module):
    """Feed-forward binary classifier: input -> 256 -> 128 -> 64 -> 1 logit."""

    def __init__(self, input_dim: int):
        super().__init__()
        blocks = [
            # Wider layers get batch-norm and heavier dropout.
            nn.Linear(input_dim, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.25),
            nn.Linear(256, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.20),
            nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.10),
            nn.Linear(64, 1),  # raw logits; pair with BCEWithLogitsLoss
        ]
        self.net = nn.Sequential(*blocks)

    def forward(self, x):
        # (batch, input_dim) -> (batch,) of logits
        return self.net(x).squeeze(1)

input_dim = X_train_t.shape[1]
model = MLP(input_dim).to(DEVICE)

# Class-imbalance handling: weight the positive class by neg/pos in the loss.
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
pos_weight = torch.tensor([neg / max(pos, 1)], dtype=torch.float32, device=DEVICE)  # max(pos,1) guards against divide-by-zero
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

# -----------------------------
# 6) Train / Eval helpers
# -----------------------------
@torch.no_grad()
def predict_proba(loader: DataLoader) -> np.ndarray:
    """Run the module-level model over *loader*; return sigmoid probabilities.

    Puts the model in eval mode (disables dropout / freezes batch-norm stats)
    and concatenates per-batch probabilities in loader order.
    """
    model.eval()
    batches = []
    for features, _ in loader:
        logits = model(features.to(DEVICE))
        batches.append(torch.sigmoid(logits).cpu().numpy())
    return np.concatenate(batches, axis=0)

def train_one_epoch(loader: DataLoader) -> float:
    """Run one optimization pass over *loader*; return the mean batch loss."""
    model.train()
    batch_losses = []
    for features, targets in loader:
        features = features.to(DEVICE)
        targets = targets.to(DEVICE)

        optimizer.zero_grad()
        loss = criterion(model(features), targets)
        loss.backward()
        # Clip the gradient norm to keep early, high-loss updates stable.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
        batch_losses.append(loss.item())
    return float(np.mean(batch_losses))

def eval_metrics(X_loader: DataLoader, y_true: np.ndarray, threshold: float = 0.5) -> dict:
    """Score the current model: accuracy, F1, ROC-AUC and confusion matrix.

    Hard predictions use *threshold* on the sigmoid probability; ROC-AUC is
    computed from the raw probabilities.
    """
    prob = predict_proba(X_loader)
    pred = (prob >= threshold).astype(int)
    return {
        "acc": accuracy_score(y_true, pred),
        "f1": f1_score(y_true, pred),
        "auc": roc_auc_score(y_true, prob),
        "cm": confusion_matrix(y_true, pred),
    }

# -----------------------------
# 7) Training loop (early stopping)
# -----------------------------
EPOCHS = 50
PATIENCE = 8  # epochs without val-AUC improvement before stopping
best_auc = -1.0
best_state = None  # CPU snapshot of the best-so-far weights
pat = 0

for epoch in range(1, EPOCHS + 1):
    tr_loss = train_one_epoch(train_loader)
    val_out = eval_metrics(val_loader, y_val, threshold=0.5)

    print(
        f"Epoch {epoch:02d} | loss={tr_loss:.4f} | "
        f"val_auc={val_out['auc']:.4f} val_f1={val_out['f1']:.4f} val_acc={val_out['acc']:.4f}"
    )

    # Require a minimal AUC gain (1e-4) to count as an improvement.
    if val_out["auc"] > best_auc + 1e-4:
        best_auc = val_out["auc"]
        # Clone state to CPU so the checkpoint is decoupled from live GPU tensors.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        pat = 0
    else:
        pat += 1
        if pat >= PATIENCE:
            print("Early stopping triggered.")
            break

# restore best checkpoint before final evaluation
if best_state is not None:
    model.load_state_dict(best_state)
print(f"Best val AUC: {best_auc:.4f}")

# -----------------------------
# 8) Test evaluation
# -----------------------------
# Final evaluation on the untouched hold-out set with the restored best model.
test_out = eval_metrics(test_loader, y_test, threshold=0.5)
print("\n=== Test Metrics ===")
print(f"Accuracy: {test_out['acc']:.4f}")
print(f"F1      : {test_out['f1']:.4f}")
print(f"ROC-AUC  : {test_out['auc']:.4f}")
print("Confusion Matrix:\n", test_out["cm"])

# -----------------------------
# 9) Save artifacts
# -----------------------------
# Persist both halves of the inference pipeline: the fitted sklearn
# preprocessor (for identical feature encoding at serve time) and the
# trained model weights.
import joblib  # local import: only needed here for persistence
os.makedirs("artifacts", exist_ok=True)

joblib.dump(preprocess, "artifacts/preprocess.joblib")
torch.save(model.state_dict(), "artifacts/mlp_credit_default.pt")

print("\nSaved:")
print(" - artifacts/preprocess.joblib")
print(" - artifacts/mlp_credit_default.pt")
相关推荐
Coding茶水间2 小时前
基于深度学习的水下海洋生物检测系统演示与介绍(YOLOv12/v11/v8/v5模型+Pyqt5界面+训练代码+数据集)
图像处理·人工智能·深度学习·yolo·目标检测·机器学习·计算机视觉
深蓝海拓2 小时前
PySide6从0开始学习的笔记(十三) IDE的选择
笔记·python·qt·学习·pyqt
智算菩萨2 小时前
实战:用 Python + 传统NLP 自动总结长文章
开发语言·人工智能·python
子夜江寒2 小时前
基于 Python 库使用贝叶斯算法与逻辑森林
开发语言·python·算法
BBB努力学习程序设计2 小时前
掌握Python中不可变对象与可变对象的深度解析
python·pycharm
whitelbwwww2 小时前
Pytorch--张量表示实际数据
人工智能·pytorch·python
写文章的大米2 小时前
10分钟用Python搭个接口,还能自动生成文档?
python·fastapi
roman_日积跬步-终至千里2 小时前
【人工智能导论】05-学习-机器学习基础:从数据到智能决策
人工智能·学习·机器学习
Blossom.1183 小时前
大模型推理优化实战:连续批处理与PagedAttention性能提升300%
大数据·人工智能·python·神经网络·算法·机器学习·php