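# Heart-disease demo pipeline (Python). The original first two lines, "python" and "复制代码" ("copy code"), were code-snippet UI residue, not code.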
import os
import pandas as pd
import numpy as np
from typing import Optional, Tuple
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# ==================== 第一步:自动创建规范的项目目录结构 ====================
def create_project_structure(project_dir: str = "heart_disease_project") -> str:
    """Create the standard directory layout for the heart-disease project.

    The layout is ``<project_dir>/{data,src,models,notebooks,outputs}`` plus a
    stub ``src/__init__.py`` so that ``src`` is importable as a package.
    The function is safe to call repeatedly: existing directories are reused
    and an existing ``src/__init__.py`` is left untouched.

    Args:
        project_dir: Root directory of the project. Defaults to
            ``"heart_disease_project"`` (relative to the current directory).

    Returns:
        ``project_dir`` exactly as it was passed in.
    """
    subdirs = (
        "data",       # datasets
        "src",        # core source code
        "models",     # saved models
        "notebooks",  # notebooks
        "outputs",    # generated results
    )

    os.makedirs(project_dir, exist_ok=True)
    for name in subdirs:
        os.makedirs(os.path.join(project_dir, name), exist_ok=True)

    # Mode "x" creates the stub only when it is missing, so a rerun never
    # overwrites code that was added to src/__init__.py in the meantime.
    init_path = os.path.join(project_dir, "src", "__init__.py")
    try:
        with open(init_path, "x", encoding="utf-8") as f:
            f.write("# 心脏病项目核心代码包\n")
    except FileExistsError:
        pass

    return project_dir
# ==================== 第二步:核心功能函数 ====================
def load_heart_data(project_dir: str, n_samples: int = 300) -> pd.DataFrame:
    """Generate a simulated heart-disease dataset and save it as CSV.

    No download is needed: every column is drawn at random, so the data has
    the column names of a heart-disease table but carries no real signal.
    The frame is written to ``<project_dir>/data/heart.csv`` (the ``data``
    directory is created if it does not exist yet).

    Args:
        project_dir: Project root directory.
        n_samples: Number of rows to generate. Defaults to 300.

    Returns:
        The generated DataFrame (13 feature columns plus ``target``).
    """
    # Seeding the *global* NumPy RNG makes the data reproducible. It also
    # resets the state used by any later unseeded np.random / pandas
    # ``sample`` call in the same process.
    np.random.seed(42)
    # The draw order below determines the generated values; keep it stable.
    data = {
        "age": np.random.randint(25, 80, n_samples),
        "sex": np.random.randint(0, 2, n_samples),
        "cp": np.random.randint(0, 4, n_samples),
        "trestbps": np.random.randint(90, 200, n_samples),
        "chol": np.random.randint(120, 400, n_samples),
        "fbs": np.random.randint(0, 2, n_samples),
        "restecg": np.random.randint(0, 3, n_samples),
        "thalach": np.random.randint(70, 200, n_samples),
        "exang": np.random.randint(0, 2, n_samples),
        "oldpeak": np.round(np.random.uniform(0, 6, n_samples), 1),
        "slope": np.random.randint(0, 3, n_samples),
        "ca": np.random.randint(0, 5, n_samples),
        "thal": np.random.randint(0, 4, n_samples),
        "target": np.random.randint(0, 2, n_samples),  # 0 = no heart disease, 1 = heart disease
    }
    df = pd.DataFrame(data)

    # Make sure the target directory exists so the function also works when
    # the project skeleton has not been created beforehand.
    data_dir = os.path.join(project_dir, "data")
    os.makedirs(data_dir, exist_ok=True)
    data_path = os.path.join(data_dir, "heart.csv")
    df.to_csv(data_path, index=False)
    print(f"✅ 模拟数据集已保存到: {data_path}")
    return df
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the dataset: drop duplicate rows and mean-fill missing values.

    For demonstration purposes a few ``chol`` values (up to 5) are blanked
    out first, so that the imputation step has something to fill. The input
    frame is never modified.

    Args:
        df: Raw dataset.

    Returns:
        A new DataFrame without duplicate rows in which missing values of
        numeric columns are replaced by the column mean.
    """
    # Explicit copy: the assignments below must only ever touch our own
    # frame, not a view/derived object tied to the caller's ``df``.
    df_clean = df.drop_duplicates().copy()

    # Demo only: inject a few missing "chol" values. Always keep at least one
    # real value so that the column mean used for filling is defined.
    n_missing = min(5, len(df_clean) - 1)
    if "chol" in df_clean.columns and n_missing > 0:
        # An integer column cannot hold NaN; cast explicitly instead of
        # relying on implicit upcasting during assignment.
        if pd.api.types.is_integer_dtype(df_clean["chol"]):
            df_clean["chol"] = df_clean["chol"].astype("float64")
        df_clean.loc[df_clean.sample(n_missing).index, "chol"] = np.nan

    # numeric_only avoids failing on text columns, which have no mean.
    df_clean = df_clean.fillna(df_clean.mean(numeric_only=True))
    print("✅ 数据清洗完成(去重+缺失值填充)")
    return df_clean
def build_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split the dataset into features and label, and standardize the features.

    Args:
        df: Cleaned dataset containing a ``target`` column.

    Returns:
        ``(X_scaled_df, y)``: the standardized feature frame (same columns
        and same index as the input features) and the ``target`` series.

    Raises:
        KeyError: If ``df`` has no ``target`` column.
    """
    X = df.drop("target", axis=1)
    y = df["target"]

    # NOTE(review): the scaler is fitted on all rows, i.e. before any
    # train/test split happens downstream, so test-set statistics influence
    # the scaling. Acceptable for this demo; confirm before reusing elsewhere.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # fit_transform returns a bare array. Re-attach X's own index so the rows
    # stay label-aligned with ``y`` even when the index has gaps (e.g. after
    # duplicate rows were dropped).
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
    print("✅ 特征工程完成(特征标准化+拆分标签)")
    return X_scaled_df, y
def train_model(X: pd.DataFrame, y: pd.Series) -> Tuple[LogisticRegression, float]:
    """Fit a logistic-regression classifier and measure its held-out accuracy.

    Args:
        X: Feature matrix.
        y: Label vector, one entry per row of ``X``.

    Returns:
        ``(model, acc)``: the fitted model and its accuracy on the test split.
    """
    # Hold out 20% of the rows for evaluation; the fixed random_state keeps
    # the split identical from run to run.
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    features_fit, features_eval, labels_fit, labels_eval = split

    # Fit the classifier on the training portion only.
    classifier = LogisticRegression(max_iter=200)
    classifier.fit(features_fit, labels_fit)

    # Score the predictions on the rows the model has not seen.
    acc = accuracy_score(labels_eval, classifier.predict(features_eval))
    print(f"✅ 模型训练完成 | 测试集准确率: {acc:.2f}")
    return classifier, acc
# ==================== 第三步:主运行流程 ====================
def main() -> None:
    """Run the demo pipeline end to end.

    Steps: create the project skeleton, generate the simulated dataset,
    clean it, build standardized features, train the model, and print a
    summary. Keeping the steps inside a function prevents the working
    variables from becoming module-level globals.
    """
    print("===== 心脏病项目(规范拆分版)开始运行 =====")

    # 1. Create the project directories.
    project_dir = create_project_structure()
    print(f"✅ 项目目录已创建: {os.path.abspath(project_dir)}")

    # 2. Load the data (simulated data is generated automatically).
    df = load_heart_data(project_dir)

    # 3. Clean the data.
    df_clean = clean_data(df)

    # 4. Build the features.
    X, y = build_features(df_clean)

    # 5. Train the model; only the accuracy is needed for the summary.
    _, acc = train_model(X, y)

    print("\n===== 项目运行完成 =====")
    print(f"📊 最终结果:模型准确率 {acc:.2f}")
    print(f"📂 项目文件位置:{os.path.abspath(project_dir)}")


if __name__ == "__main__":
    main()