xgb practice

The following code has not been verified and is intended for practice only.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import shap
import joblib
import warnings
import time
import logging
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Configure warning filters and logging
warnings.filterwarnings("ignore", category=ConvergenceWarning)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class CreditRiskModel:
    def __init__(self, data_path):
        self.data_path = data_path
        self.df = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.preprocessor = None
        self.models = {}
        self.best_model = None

    def load_and_explore_data(self):
        logging.info("Loading and exploring data...")
        self.df = pd.read_csv(self.data_path)
        logging.info(f"Dataset shape: {self.df.shape}")
        logging.info("\nDataset info:")
        self.df.info()
        logging.info("\nDataset description:")
        logging.info(self.df.describe())
        logging.info("\nTarget variable distribution:")
        logging.info(self.df['target'].value_counts(normalize=True))

        # Data visualization
        self.visualize_data()

    def visualize_data(self):
        logging.info("Generating data visualizations...")
        # Correlation heatmap (restricted to numeric columns so corr() does not fail on object dtypes)
        plt.figure(figsize=(12, 10))
        sns.heatmap(self.df.corr(numeric_only=True), annot=False, cmap='coolwarm')
        plt.title('Feature Correlation Heatmap')
        plt.tight_layout()
        plt.savefig('correlation_heatmap.png')
        plt.close()

        # Target variable distribution
        plt.figure(figsize=(8, 6))
        sns.countplot(x='target', data=self.df)
        plt.title('Target Variable Distribution')
        plt.savefig('target_distribution.png')
        plt.close()

        # Distribution of numeric features (excluding the target column)
        num_features = self.df.select_dtypes(include=['int64', 'float64']).columns.drop('target', errors='ignore')
        fig = make_subplots(rows=len(num_features) // 3 + 1, cols=3, subplot_titles=list(num_features))
        for i, col in enumerate(num_features):
            row = i // 3 + 1
            col_num = i % 3 + 1
            fig.add_trace(go.Histogram(x=self.df[col], name=col), row=row, col=col_num)
        fig.update_layout(height=300 * (len(num_features) // 3 + 1), width=1000, title_text="Distribution of Numeric Features")
        fig.write_html("numeric_features_distribution.html")

    def preprocess_data(self):
        logging.info("Preprocessing data...")
        self.X = self.df.drop('target', axis=1)
        self.y = self.df['target']

        numeric_features = self.X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = self.X.select_dtypes(include=['object']).columns

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])

        # Train/test split (stratified to preserve the class ratio)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y)

        # Apply preprocessing: fit on the training data only, then transform the test data
        self.X_train = self.preprocessor.fit_transform(self.X_train, self.y_train)
        self.X_test = self.preprocessor.transform(self.X_test)

        # Handle class imbalance by oversampling the training set only with SMOTE.
        # (Wrapping the preprocessor and SMOTE in an imblearn Pipeline and calling
        # fit_transform would fail: a sampler has no transform method, and the
        # resampled labels would be lost. fit_resample returns both X and y.)
        smote = SMOTE(random_state=42)
        self.X_train, self.y_train = smote.fit_resample(self.X_train, self.y_train)

    def select_features(self, k=20):
        logging.info(f"Selecting top {k} features...")
        selector = SelectKBest(f_classif, k=k)
        self.X_train = selector.fit_transform(self.X_train, self.y_train)
        self.X_test = selector.transform(self.X_test)
        selected_feature_indices = selector.get_support(indices=True)
        self.selected_features = self.preprocessor.get_feature_names_out()[selected_feature_indices]
        logging.info(f"Selected features: {self.selected_features}")

    def train_models(self):
        logging.info("Training multiple models...")
        models = {
            'RandomForest': RandomForestClassifier(random_state=42),
            'GradientBoosting': GradientBoostingClassifier(random_state=42),
            'LogisticRegression': LogisticRegression(random_state=42),
            'SVM': SVC(probability=True, random_state=42)
        }

        for name, model in models.items():
            logging.info(f"Training {name}...")
            model.fit(self.X_train, self.y_train)
            self.models[name] = model

    def evaluate_models(self):
        logging.info("Evaluating models...")
        results = {}
        for name, model in self.models.items():
            logging.info(f"Evaluating {name}...")
            y_pred = model.predict(self.X_test)
            y_pred_proba = model.predict_proba(self.X_test)[:, 1]
            
            results[name] = {
                'accuracy': model.score(self.X_test, self.y_test),
                'roc_auc': roc_auc_score(self.y_test, y_pred_proba),
                'classification_report': classification_report(self.y_test, y_pred),
                'confusion_matrix': confusion_matrix(self.y_test, y_pred)
            }
            
            # ROC curve
            fpr, tpr, _ = roc_curve(self.y_test, y_pred_proba)
            plt.figure()
            plt.plot(fpr, tpr, label=f'ROC curve (AUC = {results[name]["roc_auc"]:.2f})')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'ROC Curve - {name}')
            plt.legend(loc="lower right")
            plt.savefig(f'roc_curve_{name}.png')
            plt.close()
            
            # Precision-recall curve
            precision, recall, _ = precision_recall_curve(self.y_test, y_pred_proba)
            plt.figure()
            plt.plot(recall, precision, label='Precision-Recall curve')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title(f'Precision-Recall Curve - {name}')
            plt.legend(loc="lower left")
            plt.savefig(f'precision_recall_curve_{name}.png')
            plt.close()

        self.results = results
        self.best_model = max(results, key=lambda x: results[x]['roc_auc'])
        logging.info(f"Best model: {self.best_model}")

    def hyperparameter_tuning(self):
        logging.info("Performing hyperparameter tuning for the best model...")
        if self.best_model == 'RandomForest':
            param_grid = {
                'n_estimators': [100, 200, 300],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            model = RandomForestClassifier(random_state=42)
        elif self.best_model == 'GradientBoosting':
            param_grid = {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 4, 5],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            model = GradientBoostingClassifier(random_state=42)
        elif self.best_model == 'LogisticRegression':
            param_grid = {
                'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            }
            model = LogisticRegression(random_state=42)
        else:  # SVM
            param_grid = {
                'C': [0.1, 1, 10],
                'kernel': ['rbf', 'poly'],
                'gamma': ['scale', 'auto', 0.1, 1]
            }
            model = SVC(probability=True, random_state=42)

        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(self.X_train, self.y_train)
        
        logging.info(f"Best parameters: {grid_search.best_params_}")
        logging.info(f"Best cross-validation score: {grid_search.best_score_:.4f}")
        
        self.models[self.best_model] = grid_search.best_estimator_

    def feature_importance(self):
        logging.info("Calculating feature importance...")
        if hasattr(self.models[self.best_model], 'feature_importances_'):
            importances = self.models[self.best_model].feature_importances_
            indices = np.argsort(importances)[::-1]

            plt.figure(figsize=(12, 8))
            plt.title("Feature Importances")
            plt.bar(range(len(importances)), importances[indices])
            plt.xticks(range(len(importances)), [self.selected_features[i] for i in indices], rotation=90)
            plt.tight_layout()
            plt.savefig('feature_importances.png')
            plt.close()

    def model_interpretation(self):
        logging.info("Interpreting model with SHAP...")
        model = self.models[self.best_model]
        # TreeExplainer only supports tree-based models; skip interpretation otherwise
        if not isinstance(model, (RandomForestClassifier, GradientBoostingClassifier)):
            logging.info("Best model is not tree-based; skipping SHAP interpretation.")
            return

        explainer = shap.TreeExplainer(model)
        X_test_dense = self.X_test.toarray() if hasattr(self.X_test, 'toarray') else self.X_test
        shap_values = explainer.shap_values(X_test_dense)
        # Some tree models return one array of SHAP values per class; keep the positive class
        if isinstance(shap_values, list):
            shap_values = shap_values[1]

        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_test_dense, feature_names=self.selected_features, plot_type="bar", show=False)
        plt.title("Feature Importance (SHAP values)")
        plt.tight_layout()
        plt.savefig('shap_feature_importance.png')
        plt.close()

        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, X_test_dense, feature_names=self.selected_features, show=False)
        plt.title("Feature Impact (SHAP values)")
        plt.tight_layout()
        plt.savefig('shap_feature_impact.png')
        plt.close()

    def save_model(self):
        logging.info("Saving the best model...")
        joblib.dump(self.models[self.best_model], f'best_model_{self.best_model}.joblib')
        joblib.dump(self.preprocessor, 'preprocessor.joblib')

    def generate_report(self):
        logging.info("Generating final report...")
        report = f"""
        Credit Risk Model Report
        ========================

        Data Summary:
        -------------
        Total samples: {len(self.df)}
        Features: {len(self.X.columns)}
        Target distribution:
        {self.df['target'].value_counts(normalize=True)}

        Model Performance:
        ------------------
        Best Model: {self.best_model}
        ROC AUC Score: {self.results[self.best_model]['roc_auc']:.4f}
        
        Classification Report:
        {self.results[self.best_model]['classification_report']}

        Confusion Matrix:
        {self.results[self.best_model]['confusion_matrix']}

        Top Features:
        -------------
        {', '.join(self.selected_features[:10])}

        Model Interpretation:
        ---------------------
        Please refer to the SHAP plots for detailed feature importance and impact analysis.

        Notes:
        ------
        - The model has been trained on balanced data using SMOTE.
        - Hyperparameter tuning was performed using GridSearchCV.
        - The model and preprocessor have been saved for future use.

        Next Steps:
        -----------
        1. Monitor model performance in production.
        2. Regularly retrain the model with new data.
        3. Consider adding more relevant features if available.
        4. Explore more advanced techniques like stacking or neural networks.
        """

        with open('credit_risk_model_report.txt', 'w') as f:
            f.write(report)

        logging.info("Report generated and saved as 'credit_risk_model_report.txt'")

    def detect_anomalies(self):
        logging.info("Detecting anomalies in the dataset...")

        # Anomaly detection with IsolationForest on the numeric features only
        # (raw categorical columns and missing values would break the estimator)
        from sklearn.ensemble import IsolationForest

        numeric_X = self.X.select_dtypes(include=['int64', 'float64'])
        numeric_X = numeric_X.fillna(numeric_X.median())

        iso_forest = IsolationForest(contamination=0.1, random_state=42)
        anomalies = iso_forest.fit_predict(numeric_X)

        # Add the anomaly flags to the original dataset
        self.df['anomaly'] = anomalies

        # Visualize anomalies against the first numeric feature
        plt.figure(figsize=(12, 8))
        plt.scatter(numeric_X.index, numeric_X.iloc[:, 0], c=anomalies, cmap='viridis')
        plt.title('Anomaly Detection Results')
        plt.xlabel('Index')
        plt.ylabel(numeric_X.columns[0])
        plt.colorbar(label='Anomaly (-1) vs Normal (1)')
        plt.savefig('anomaly_detection.png')
        plt.close()

        logging.info(f"Detected {sum(anomalies == -1)} anomalies in the dataset.")
        
    def perform_cross_validation(self):
        logging.info("Performing cross-validation...")
        
        # Note: X_train was already resampled with SMOTE, so these fold scores may be
        # optimistic; putting SMOTE inside a CV pipeline would give a stricter estimate.
        cv_scores = cross_val_score(self.models[self.best_model], self.X_train, self.y_train, cv=5, scoring='roc_auc')
        
        logging.info(f"Cross-validation ROC AUC scores: {cv_scores}")
        logging.info(f"Mean ROC AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")
        
    def analyze_misclassifications(self):
        logging.info("Analyzing misclassifications...")

        y_pred = self.models[self.best_model].predict(self.X_test)
        correct = (y_pred == np.asarray(self.y_test))

        # X_test is a (possibly sparse) array after preprocessing and feature selection;
        # wrap it in a DataFrame so columns can be addressed by feature name
        X_test_df = pd.DataFrame(
            self.X_test.toarray() if hasattr(self.X_test, 'toarray') else self.X_test,
            columns=self.selected_features
        )

        # Compare feature distributions for correctly vs. incorrectly predicted samples
        for feature in self.selected_features:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x=correct, y=X_test_df[feature])
            plt.title(f'Distribution of {feature} for Correct vs. Incorrect Predictions')
            plt.xlabel('Prediction correct')
            plt.savefig(f'misclassification_analysis_{feature}.png')
            plt.close()

        logging.info("Misclassification analysis plots saved.")

    def run(self):
        self.load_and_explore_data()
        self.preprocess_data()
        self.select_features()
        self.train_models()
        self.evaluate_models()
        self.hyperparameter_tuning()
        self.feature_importance()
        self.model_interpretation()
        self.save_model()
        self.detect_anomalies()
        self.perform_cross_validation()
        self.analyze_misclassifications()
        self.generate_report()

def main():
    start_time = time.time()
    logging.info("Starting Credit Risk Modeling process...")
    
    model = CreditRiskModel('credit_risk_data.csv')
    model.run()
    
    end_time = time.time()
    logging.info(f"Credit Risk Modeling process completed in {end_time - start_time:.2f} seconds.")

if __name__ == "__main__":
    main()
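
The title refers to XGBoost, but the script above only trains scikit-learn models. Below is a minimal, hedged sketch of how an XGBClassifier could be added as a fifth entry in the models dict inside train_models; it assumes the xgboost package is installed, and the parameter values are illustrative rather than tuned.

# Hypothetical extension to train_models(): register XGBoost as an additional candidate.
# Assumption: the xgboost package is available (pip install xgboost); values are illustrative only.
from xgboost import XGBClassifier

models['XGBoost'] = XGBClassifier(
    n_estimators=200,       # number of boosting rounds
    learning_rate=0.1,      # shrinkage applied to each tree's contribution
    max_depth=4,            # maximum depth of each tree
    eval_metric='logloss',  # setting the metric explicitly avoids warnings in recent xgboost versions
    random_state=42
)

Because XGBClassifier exposes the same fit / predict / predict_proba interface as the scikit-learn estimators used above, evaluate_models and the GridSearchCV tuning (with an XGBoost-specific param_grid) would pick it up unchanged; the tree-model check in model_interpretation would also need to allow XGBClassifier for the SHAP plots to run on it.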