xgb practice

The following code has not been verified and is intended for practice only.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import shap
import joblib
import warnings
import time
import logging
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Configure warning filters and logging
warnings.filterwarnings("ignore", category=ConvergenceWarning)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class CreditRiskModel:
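    """End-to-end credit risk modeling workflow: data exploration, preprocessing,
    SMOTE oversampling, feature selection, training and evaluation of several
    classifiers, hyperparameter tuning, SHAP interpretation, anomaly detection,
    and report generation."""
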
    def __init__(self, data_path):
        self.data_path = data_path
        self.df = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.preprocessor = None
        self.models = {}
        self.best_model = None

    def load_and_explore_data(self):
        logging.info("Loading and exploring data...")
        self.df = pd.read_csv(self.data_path)
        logging.info(f"Dataset shape: {self.df.shape}")
        logging.info("\nDataset info:")
        self.df.info()
        logging.info("\nDataset description:")
        logging.info(self.df.describe())
        logging.info("\nTarget variable distribution:")
        logging.info(self.df['target'].value_counts(normalize=True))

        # Visualize the data
        self.visualize_data()

    def visualize_data(self):
        logging.info("Generating data visualizations...")
        # Correlation heatmap (numeric columns only)
        plt.figure(figsize=(12, 10))
        sns.heatmap(self.df.corr(numeric_only=True), annot=False, cmap='coolwarm')
        plt.title('Feature Correlation Heatmap')
        plt.tight_layout()
        plt.savefig('correlation_heatmap.png')
        plt.close()

        # Target variable distribution
        plt.figure(figsize=(8, 6))
        sns.countplot(x='target', data=self.df)
        plt.title('Target Variable Distribution')
        plt.savefig('target_distribution.png')
        plt.close()

        # Distribution of numeric features
        num_features = self.df.select_dtypes(include=['int64', 'float64']).columns
        fig = make_subplots(rows=len(num_features)//3 + 1, cols=3, subplot_titles=list(num_features))
        for i, col in enumerate(num_features):
            row = i // 3 + 1
            col_num = i % 3 + 1
            fig.add_trace(go.Histogram(x=self.df[col], name=col), row=row, col=col_num)
        fig.update_layout(height=300*len(num_features)//3, width=1000, title_text="Distribution of Numeric Features")
        fig.write_html("numeric_features_distribution.html")

    def preprocess_data(self):
        logging.info("Preprocessing data...")
        self.X = self.df.drop('target', axis=1)
        self.y = self.df['target']

        numeric_features = self.X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = self.X.select_dtypes(include=['object']).columns

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])

        # Split the data first so the preprocessor and SMOTE only ever see the training set
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y)

        # Fit the preprocessing on the training data, then apply it to the test data
        self.X_train = self.preprocessor.fit_transform(self.X_train)
        self.X_test = self.preprocessor.transform(self.X_test)

        # Handle class imbalance by oversampling the minority class with SMOTE;
        # SMOTE is a resampler, so it is applied with fit_resample on the training data only
        smote = SMOTE(random_state=42)
        self.X_train, self.y_train = smote.fit_resample(self.X_train, self.y_train)

    def select_features(self, k=20):
        logging.info(f"Selecting top {k} features...")
        selector = SelectKBest(f_classif, k=k)
        self.X_train = selector.fit_transform(self.X_train, self.y_train)
        self.X_test = selector.transform(self.X_test)
        selected_feature_indices = selector.get_support(indices=True)
        self.selected_features = self.preprocessor.get_feature_names_out()[selected_feature_indices]
        logging.info(f"Selected features: {self.selected_features}")

    def train_models(self):
        logging.info("Training multiple models...")
        models = {
            'RandomForest': RandomForestClassifier(random_state=42),
            'GradientBoosting': GradientBoostingClassifier(random_state=42),
            'LogisticRegression': LogisticRegression(random_state=42),
            'SVM': SVC(probability=True, random_state=42)
        }

        for name, model in models.items():
            logging.info(f"Training {name}...")
            model.fit(self.X_train, self.y_train)
            self.models[name] = model

    def evaluate_models(self):
        logging.info("Evaluating models...")
        results = {}
        for name, model in self.models.items():
            logging.info(f"Evaluating {name}...")
            y_pred = model.predict(self.X_test)
            y_pred_proba = model.predict_proba(self.X_test)[:, 1]
            
            results[name] = {
                'accuracy': model.score(self.X_test, self.y_test),
                'roc_auc': roc_auc_score(self.y_test, y_pred_proba),
                'classification_report': classification_report(self.y_test, y_pred),
                'confusion_matrix': confusion_matrix(self.y_test, y_pred)
            }
            
            # ROC curve
            fpr, tpr, _ = roc_curve(self.y_test, y_pred_proba)
            plt.figure()
            plt.plot(fpr, tpr, label=f'ROC curve (AUC = {results[name]["roc_auc"]:.2f})')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'ROC Curve - {name}')
            plt.legend(loc="lower right")
            plt.savefig(f'roc_curve_{name}.png')
            plt.close()
            
            # Precision-recall curve
            precision, recall, _ = precision_recall_curve(self.y_test, y_pred_proba)
            plt.figure()
            plt.plot(recall, precision, label='Precision-Recall curve')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title(f'Precision-Recall Curve - {name}')
            plt.legend(loc="lower left")
            plt.savefig(f'precision_recall_curve_{name}.png')
            plt.close()

        self.results = results
        self.best_model = max(results, key=lambda x: results[x]['roc_auc'])
        logging.info(f"Best model: {self.best_model}")

    def hyperparameter_tuning(self):
        logging.info("Performing hyperparameter tuning for the best model...")
        if self.best_model == 'RandomForest':
            param_grid = {
                'n_estimators': [100, 200, 300],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            model = RandomForestClassifier(random_state=42)
        elif self.best_model == 'GradientBoosting':
            param_grid = {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 4, 5],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            model = GradientBoostingClassifier(random_state=42)
        elif self.best_model == 'LogisticRegression':
            param_grid = {
                'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            }
            model = LogisticRegression(random_state=42)
        else:  # SVM
            param_grid = {
                'C': [0.1, 1, 10],
                'kernel': ['rbf', 'poly'],
                'gamma': ['scale', 'auto', 0.1, 1]
            }
            model = SVC(probability=True, random_state=42)

        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(self.X_train, self.y_train)
        
        logging.info(f"Best parameters: {grid_search.best_params_}")
        logging.info(f"Best cross-validation score: {grid_search.best_score_:.4f}")
        
        self.models[self.best_model] = grid_search.best_estimator_

    def feature_importance(self):
        logging.info("Calculating feature importance...")
        if hasattr(self.models[self.best_model], 'feature_importances_'):
            importances = self.models[self.best_model].feature_importances_
            indices = np.argsort(importances)[::-1]

            plt.figure(figsize=(12, 8))
            plt.title("Feature Importances")
            plt.bar(range(len(importances)), importances[indices])
            plt.xticks(range(len(importances)), [self.selected_features[i] for i in indices], rotation=90)
            plt.tight_layout()
            plt.savefig('feature_importances.png')
            plt.close()

    def model_interpretation(self):
        logging.info("Interpreting model with SHAP...")
        model = self.models[self.best_model]
        # TreeExplainer only supports tree-based models (RandomForest / GradientBoosting here)
        if not hasattr(model, 'feature_importances_'):
            logging.info("Best model is not tree-based; skipping SHAP interpretation.")
            return

        # Densify in case the preprocessed features are stored as a sparse matrix
        X_test_dense = self.X_test.toarray() if hasattr(self.X_test, 'toarray') else self.X_test
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test_dense)
        # Some models return one SHAP array per class; keep the positive class
        if isinstance(shap_values, list):
            shap_values = shap_values[1]

        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_test_dense, feature_names=self.selected_features, plot_type="bar", show=False)
        plt.title("Feature Importance (SHAP values)")
        plt.tight_layout()
        plt.savefig('shap_feature_importance.png')
        plt.close()

        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, X_test_dense, feature_names=self.selected_features, show=False)
        plt.title("Feature Impact (SHAP values)")
        plt.tight_layout()
        plt.savefig('shap_feature_impact.png')
        plt.close()

    def save_model(self):
        logging.info("Saving the best model...")
        joblib.dump(self.models[self.best_model], f'best_model_{self.best_model}.joblib')
        joblib.dump(self.preprocessor, 'preprocessor.joblib')
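        # Note: the SelectKBest selector used in select_features() is not saved here; to score
        # new data end-to-end it would also need to be persisted (or folded into one pipeline).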

    def generate_report(self):
        logging.info("Generating final report...")
        report = f"""
        Credit Risk Model Report
        ========================

        Data Summary:
        -------------
        Total samples: {len(self.df)}
        Features: {len(self.X.columns)}
        Target distribution:
        {self.df['target'].value_counts(normalize=True)}

        Model Performance:
        ------------------
        Best Model: {self.best_model}
        ROC AUC Score: {self.results[self.best_model]['roc_auc']:.4f}
        
        Classification Report:
        {self.results[self.best_model]['classification_report']}

        Confusion Matrix:
        {self.results[self.best_model]['confusion_matrix']}

        Top Features:
        -------------
        {', '.join(self.selected_features[:10])}

        Model Interpretation:
        ---------------------
        Please refer to the SHAP plots for detailed feature importance and impact analysis.

        Notes:
        ------
        - The model has been trained on balanced data using SMOTE.
        - Hyperparameter tuning was performed using GridSearchCV.
        - The model and preprocessor have been saved for future use.

        Next Steps:
        -----------
        1. Monitor model performance in production.
        2. Regularly retrain the model with new data.
        3. Consider adding more relevant features if available.
        4. Explore more advanced techniques like stacking or neural networks.
        """

        with open('credit_risk_model_report.txt', 'w') as f:
            f.write(report)

        logging.info("Report generated and saved as 'credit_risk_model_report.txt'")

    def detect_anomalies(self):
        logging.info("Detecting anomalies in the dataset...")
        
        # Anomaly detection with IsolationForest; it only accepts numeric input without NaNs,
        # so restrict to the numeric columns and impute missing values with the median
        from sklearn.ensemble import IsolationForest

        numeric_X = self.X.select_dtypes(include=['int64', 'float64'])
        numeric_X = numeric_X.fillna(numeric_X.median())

        iso_forest = IsolationForest(contamination=0.1, random_state=42)
        anomalies = iso_forest.fit_predict(numeric_X)

        # Attach the anomaly labels to the original dataset
        self.df['anomaly'] = anomalies

        # Visualize the anomalies against the first numeric feature
        plt.figure(figsize=(12, 8))
        plt.scatter(self.df.index, numeric_X.iloc[:, 0], c=self.df['anomaly'], cmap='viridis')
        plt.title('Anomaly Detection Results')
        plt.xlabel('Index')
        plt.ylabel('Feature 1')
        plt.colorbar(label='Anomaly (-1) vs Normal (1)')
        plt.savefig('anomaly_detection.png')
        plt.close()
        
        logging.info(f"Detected {sum(anomalies == -1)} anomalies in the dataset.")
        
    def perform_cross_validation(self):
        logging.info("Performing cross-validation...")
        
        cv_scores = cross_val_score(self.models[self.best_model], self.X_train, self.y_train, cv=5, scoring='roc_auc')
        
        logging.info(f"Cross-validation ROC AUC scores: {cv_scores}")
        logging.info(f"Mean ROC AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")
        
    def analyze_misclassifications(self):
        logging.info("Analyzing misclassifications...")
        
        y_pred = self.models[self.best_model].predict(self.X_test)
        # X_test is a (possibly sparse) array after preprocessing and feature selection,
        # so rebuild a DataFrame with the selected feature names for plotting
        X_test_dense = self.X_test.toarray() if hasattr(self.X_test, 'toarray') else self.X_test
        X_test_df = pd.DataFrame(X_test_dense, columns=self.selected_features)
        correct = (y_pred == self.y_test.values)

        # Compare feature distributions for correctly vs. incorrectly classified samples
        for feature in self.selected_features:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x=correct, y=X_test_df[feature])
            plt.title(f'Distribution of {feature} for Correct and Incorrect Predictions')
            plt.xlabel('Correctly classified')
            plt.savefig(f'misclassification_analysis_{feature}.png')
            plt.close()
        
        logging.info("Misclassification analysis plots saved.")

    def run(self):
        self.load_and_explore_data()
        self.preprocess_data()
        self.select_features()
        self.train_models()
        self.evaluate_models()
        self.hyperparameter_tuning()
        self.feature_importance()
        self.model_interpretation()
        self.save_model()
        self.detect_anomalies()
        self.perform_cross_validation()
        self.analyze_misclassifications()
        self.generate_report()

def main():
    start_time = time.time()
    logging.info("Starting Credit Risk Modeling process...")
    
    model = CreditRiskModel('credit_risk_data.csv')
    model.run()
    
    end_time = time.time()
    logging.info(f"Credit Risk Modeling process completed in {end_time - start_time:.2f} seconds.")

if __name__ == "__main__":
    main()
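
The post is titled xgb practice, but the script above only trains scikit-learn models. As a follow-up exercise, here is a minimal, unverified sketch (same caveat as the rest of the post) of how an XGBClassifier could be registered alongside the other models and tuned with the same GridSearchCV pattern. It assumes the xgboost package is installed; the parameter values shown are placeholders, not tuned settings.

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Candidate model to add in train_models(), e.g. models['XGBoost'] = xgb_model
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    eval_metric='logloss',
    random_state=42,
)

# A matching branch for hyperparameter_tuning(), mirroring the GradientBoosting grid
xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 1.0],
}
xgb_search = GridSearchCV(
    XGBClassifier(eval_metric='logloss', random_state=42),
    xgb_param_grid, cv=5, scoring='roc_auc', n_jobs=-1,
)
# xgb_search.fit(X_train, y_train) would then run on the preprocessed, resampled training data

Because XGBClassifier exposes feature_importances_, it would also flow through the feature_importance() and SHAP interpretation steps unchanged.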