# Credit-default classification: pandas/scikit-learn preprocessing + a PyTorch MLP.
# (Removed stray chat-UI paste artifacts "clike" / "复制代码" — they are not Python.)
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
# -----------------------------
# 1) Reproducibility
# -----------------------------
def set_seed(seed: int = 42) -> None:
    """Seed every RNG this script relies on (python, numpy, torch CPU + CUDA)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)


set_seed(42)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")
# -----------------------------
# 2) Load data
# -----------------------------
DATA_PATH = "data.xlsx"  # adjust if the file lives outside the current directory
df = pd.read_excel(DATA_PATH)

TARGET_COL = "Credit Default"
DROP_COLS = ["Id"]  # pure identifier columns carry no predictive signal

# 99999999 is a common placeholder for a missing loan amount — treat it as NaN.
if "Current Loan Amount" in df.columns:
    df["Current Loan Amount"] = df["Current Loan Amount"].replace(99999999, np.nan)

# Drop the columns we never want to model on (only those actually present).
df = df.drop(columns=[c for c in DROP_COLS if c in df.columns])

# Split into features / target, failing loudly if the target is absent.
if TARGET_COL not in df.columns:
    raise ValueError(f"Target column '{TARGET_COL}' not found. Columns: {list(df.columns)}")
y = df[TARGET_COL].astype(int).values
X = df.drop(columns=[TARGET_COL])
# -----------------------------
# 3) Preprocess (numeric + categorical)
# -----------------------------
# Object-dtype columns are treated as categorical; everything else as numeric.
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = [c for c in X.columns if c not in cat_cols]

# Numeric: median imputation, then standardization.
num_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
# Categorical: mode imputation, then one-hot (unknown categories at transform time are ignored).
cat_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
numeric_pipe = Pipeline(steps=num_steps)
categorical_pipe = Pipeline(steps=cat_steps)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ],
    remainder="drop",
)
# Stratified split: train/val/test = 70/15/15.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

# Fit the preprocessor on the training split only, to avoid leakage into val/test.
X_train_t = preprocess.fit_transform(X_train)
X_val_t, X_test_t = (preprocess.transform(split) for split in (X_val, X_test))
# ColumnTransformer + OneHot may emit a scipy sparse matrix; densify it
# (the dataset is small — ~7500 rows — so a dense array is fine).
def to_dense(arr):
    """Return `arr` as a dense ndarray, densifying scipy sparse matrices if needed."""
    toarray = getattr(arr, "toarray", None)
    return toarray() if toarray is not None else np.asarray(arr)
# Densify all three splits and cast to float32 for torch.
X_train_t, X_val_t, X_test_t = (
    to_dense(mat).astype(np.float32) for mat in (X_train_t, X_val_t, X_test_t)
)
print("Train shape:", X_train_t.shape, "Val shape:", X_val_t.shape, "Test shape:", X_test_t.shape)
# -----------------------------
# 4) Torch Dataset
# -----------------------------
class NPDataset(Dataset):
    """Thin Dataset wrapper around in-memory numpy feature/label arrays."""

    def __init__(self, X_np: np.ndarray, y_np: np.ndarray):
        self.X = torch.from_numpy(X_np)
        # Labels become float32 because BCEWithLogitsLoss expects float targets.
        self.y = torch.from_numpy(y_np.astype(np.float32))

    def __len__(self) -> int:
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
BATCH_SIZE = 256
# Only the training loader shuffles; evaluation order must stay aligned with y_val/y_test.
train_loader, val_loader, test_loader = (
    DataLoader(NPDataset(feats, labels), batch_size=BATCH_SIZE, shuffle=shuf)
    for feats, labels, shuf in (
        (X_train_t, y_train, True),
        (X_val_t, y_val, False),
        (X_test_t, y_test, False),
    )
)
# -----------------------------
# 5) Model (MLP)
# -----------------------------
class MLP(nn.Module):
    """Feed-forward binary classifier; forward() returns raw logits of shape (batch,)."""

    def __init__(self, input_dim: int):
        super().__init__()
        # Three shrinking hidden blocks; dropout tapers as width decreases.
        layers = [
            nn.Linear(input_dim, 256), nn.BatchNorm1d(256), nn.ReLU(), nn.Dropout(0.25),
            nn.Linear(256, 128), nn.BatchNorm1d(128), nn.ReLU(), nn.Dropout(0.20),
            nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.10),
            nn.Linear(64, 1),  # single logit per sample
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # (batch, 1) -> (batch,) so the logits match the 1-D float targets.
        return self.net(x).squeeze(1)
input_dim = X_train_t.shape[1]
model = MLP(input_dim).to(DEVICE)

# Class imbalance handling: weight the positive class by neg/pos from the training split.
n_pos = (y_train == 1).sum()
n_neg = (y_train == 0).sum()
pos_weight = torch.tensor([n_neg / max(n_pos, 1)], dtype=torch.float32, device=DEVICE)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
# -----------------------------
# 6) Train / Eval helpers
# -----------------------------
@torch.no_grad()
def predict_proba(loader: DataLoader) -> np.ndarray:
    """Run the global `model` over `loader`, returning sigmoid probabilities as one flat array."""
    model.eval()
    batch_probs = [
        torch.sigmoid(model(xb.to(DEVICE))).cpu().numpy() for xb, _ in loader
    ]
    return np.concatenate(batch_probs, axis=0)
def train_one_epoch(loader: DataLoader) -> float:
    """One optimization pass over `loader`; returns the mean per-batch loss."""
    model.train()
    epoch_losses = []
    for features, targets in loader:
        features, targets = features.to(DEVICE), targets.to(DEVICE)
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        # Clip so the occasional large gradient cannot destabilize training.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
        epoch_losses.append(batch_loss.item())
    return float(np.mean(epoch_losses))
def eval_metrics(X_loader: DataLoader, y_true: np.ndarray, threshold: float = 0.5) -> dict:
    """Threshold the model's probabilities and report accuracy, F1, ROC-AUC, and confusion matrix."""
    prob = predict_proba(X_loader)
    pred = (prob >= threshold).astype(int)
    # AUC is computed from the raw probabilities; the other metrics from hard predictions.
    return {
        "acc": accuracy_score(y_true, pred),
        "f1": f1_score(y_true, pred),
        "auc": roc_auc_score(y_true, prob),
        "cm": confusion_matrix(y_true, pred),
    }
# -----------------------------
# 7) Training loop (early stopping)
# -----------------------------
EPOCHS = 50
PATIENCE = 8

best_auc = -1.0
best_state = None
pat = 0
for epoch in range(1, EPOCHS + 1):
    tr_loss = train_one_epoch(train_loader)
    val_out = eval_metrics(val_loader, y_val, threshold=0.5)
    print(
        f"Epoch {epoch:02d} | loss={tr_loss:.4f} | "
        f"val_auc={val_out['auc']:.4f} val_f1={val_out['f1']:.4f} val_acc={val_out['acc']:.4f}"
    )
    # Snapshot the weights whenever val AUC improves by more than a small epsilon;
    # otherwise count a stale epoch and stop after PATIENCE of them in a row.
    if val_out["auc"] > best_auc + 1e-4:
        best_auc = val_out["auc"]
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        pat = 0
        continue
    pat += 1
    if pat >= PATIENCE:
        print("Early stopping triggered.")
        break

# Restore the best checkpoint before final evaluation.
if best_state is not None:
    model.load_state_dict(best_state)
print(f"Best val AUC: {best_auc:.4f}")
# -----------------------------
# 8) Test evaluation
# -----------------------------
test_out = eval_metrics(test_loader, y_test, threshold=0.5)
for line in (
    "\n=== Test Metrics ===",
    f"Accuracy: {test_out['acc']:.4f}",
    f"F1 : {test_out['f1']:.4f}",
    f"ROC-AUC : {test_out['auc']:.4f}",
):
    print(line)
print("Confusion Matrix:\n", test_out["cm"])
# -----------------------------
# 9) Save artifacts
# -----------------------------
import joblib  # only needed here, for serializing the sklearn preprocessor

os.makedirs("artifacts", exist_ok=True)
# Persist both the fitted preprocessor and the model weights so inference code
# can reproduce the exact train-time feature pipeline.
joblib.dump(preprocess, "artifacts/preprocess.joblib")
torch.save(model.state_dict(), "artifacts/mlp_credit_default.pt")

print("\nSaved:")
print(" - artifacts/preprocess.joblib")
print(" - artifacts/mlp_credit_default.pt")