xgb-练习

以下代码未验证,仅用作练习

python 复制代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import shap
import joblib
import warnings
import time
import logging
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# 设置警告过滤和日志
warnings.filterwarnings("ignore", category=ConvergenceWarning)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class CreditRiskModel:
    def __init__(self, data_path):
        self.data_path = data_path
        self.df = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.preprocessor = None
        self.models = {}
        self.best_model = None

    def load_and_explore_data(self):
        logging.info("Loading and exploring data...")
        self.df = pd.read_csv(self.data_path)
        logging.info(f"Dataset shape: {self.df.shape}")
        logging.info("\nDataset info:")
        self.df.info()
        logging.info("\nDataset description:")
        logging.info(self.df.describe())
        logging.info("\nTarget variable distribution:")
        logging.info(self.df['target'].value_counts(normalize=True))

        # 数据可视化
        self.visualize_data()

    def visualize_data(self):
        logging.info("Generating data visualizations...")
        # 相关性热力图
        plt.figure(figsize=(12, 10))
        sns.heatmap(self.df.corr(), annot=False, cmap='coolwarm')
        plt.title('特征相关性热力图')
        plt.tight_layout()
        plt.savefig('correlation_heatmap.png')
        plt.close()

        # 目标变量分布
        plt.figure(figsize=(8, 6))
        sns.countplot(x='target', data=self.df)
        plt.title('目标变量分布')
        plt.savefig('target_distribution.png')
        plt.close()

        # 数值型特征的分布
        num_features = self.df.select_dtypes(include=['int64', 'float64']).columns
        fig = make_subplots(rows=len(num_features)//3 + 1, cols=3, subplot_titles=num_features)
        for i, col in enumerate(num_features):
            row = i // 3 + 1
            col_num = i % 3 + 1
            fig.add_trace(go.Histogram(x=self.df[col], name=col), row=row, col=col_num)
        fig.update_layout(height=300*len(num_features)//3, width=1000, title_text="数值型特征分布")
        fig.write_html("numeric_features_distribution.html")

    def preprocess_data(self):
        logging.info("Preprocessing data...")
        self.X = self.df.drop('target', axis=1)
        self.y = self.df['target']

        numeric_features = self.X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = self.X.select_dtypes(include=['object']).columns

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ])

        # 使用SMOTE处理不平衡数据
        smote = SMOTE(random_state=42)
        self.preprocessor = ImbPipeline([
            ('preprocessor', self.preprocessor),
            ('smote', smote)
        ])

        # 数据分割
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42, stratify=self.y)

        # 应用预处理
        self.X_train = self.preprocessor.fit_transform(self.X_train, self.y_train)
        self.X_test = self.preprocessor.transform(self.X_test)

    def select_features(self, k=20):
        logging.info(f"Selecting top {k} features...")
        selector = SelectKBest(f_classif, k=k)
        self.X_train = selector.fit_transform(self.X_train, self.y_train)
        self.X_test = selector.transform(self.X_test)
        selected_feature_indices = selector.get_support(indices=True)
        self.selected_features = self.preprocessor.get_feature_names_out()[selected_feature_indices]
        logging.info(f"Selected features: {self.selected_features}")

    def train_models(self):
        logging.info("Training multiple models...")
        models = {
            'RandomForest': RandomForestClassifier(random_state=42),
            'GradientBoosting': GradientBoostingClassifier(random_state=42),
            'LogisticRegression': LogisticRegression(random_state=42),
            'SVM': SVC(probability=True, random_state=42)
        }

        for name, model in models.items():
            logging.info(f"Training {name}...")
            model.fit(self.X_train, self.y_train)
            self.models[name] = model

    def evaluate_models(self):
        logging.info("Evaluating models...")
        results = {}
        for name, model in self.models.items():
            logging.info(f"Evaluating {name}...")
            y_pred = model.predict(self.X_test)
            y_pred_proba = model.predict_proba(self.X_test)[:, 1]
            
            results[name] = {
                'accuracy': model.score(self.X_test, self.y_test),
                'roc_auc': roc_auc_score(self.y_test, y_pred_proba),
                'classification_report': classification_report(self.y_test, y_pred),
                'confusion_matrix': confusion_matrix(self.y_test, y_pred)
            }
            
            # ROC曲线
            fpr, tpr, _ = roc_curve(self.y_test, y_pred_proba)
            plt.figure()
            plt.plot(fpr, tpr, label=f'ROC curve (AUC = {results[name]["roc_auc"]:.2f})')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'ROC Curve - {name}')
            plt.legend(loc="lower right")
            plt.savefig(f'roc_curve_{name}.png')
            plt.close()
            
            # 精确率-召回率曲线
            precision, recall, _ = precision_recall_curve(self.y_test, y_pred_proba)
            plt.figure()
            plt.plot(recall, precision, label='Precision-Recall curve')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title(f'Precision-Recall Curve - {name}')
            plt.legend(loc="lower left")
            plt.savefig(f'precision_recall_curve_{name}.png')
            plt.close()

        self.results = results
        self.best_model = max(results, key=lambda x: results[x]['roc_auc'])
        logging.info(f"Best model: {self.best_model}")

    def hyperparameter_tuning(self):
        logging.info("Performing hyperparameter tuning for the best model...")
        if self.best_model == 'RandomForest':
            param_grid = {
                'n_estimators': [100, 200, 300],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            model = RandomForestClassifier(random_state=42)
        elif self.best_model == 'GradientBoosting':
            param_grid = {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 4, 5],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            model = GradientBoostingClassifier(random_state=42)
        elif self.best_model == 'LogisticRegression':
            param_grid = {
                'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            }
            model = LogisticRegression(random_state=42)
        else:  # SVM
            param_grid = {
                'C': [0.1, 1, 10],
                'kernel': ['rbf', 'poly'],
                'gamma': ['scale', 'auto', 0.1, 1]
            }
            model = SVC(probability=True, random_state=42)

        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(self.X_train, self.y_train)
        
        logging.info(f"Best parameters: {grid_search.best_params_}")
        logging.info(f"Best cross-validation score: {grid_search.best_score_:.4f}")
        
        self.models[self.best_model] = grid_search.best_estimator_

    def feature_importance(self):
        logging.info("Calculating feature importance...")
        if hasattr(self.models[self.best_model], 'feature_importances_'):
            importances = self.models[self.best_model].feature_importances_
            indices = np.argsort(importances)[::-1]

            plt.figure(figsize=(12, 8))
            plt.title("Feature Importances")
            plt.bar(range(len(importances)), importances[indices])
            plt.xticks(range(len(importances)), [self.selected_features[i] for i in indices], rotation=90)
            plt.tight_layout()
            plt.savefig('feature_importances.png')
            plt.close()

    def model_interpretation(self):
        logging.info("Interpreting model with SHAP...")
        explainer = shap.TreeExplainer(self.models[self.best_model])
        shap_values = explainer.shap_values(self.X_test)

        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values[1], self.X_test, feature_names=self.selected_features, plot_type="bar")
        plt.title("Feature Importance (SHAP values)")
        plt.tight_layout()
        plt.savefig('shap_feature_importance.png')
        plt.close()

        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values[1], self.X_test, feature_names=self.selected_features)
        plt.title("Feature Impact (SHAP values)")
        plt.tight_layout()
        plt.savefig('shap_feature_impact.png')
        plt.close()

    def save_model(self):
        logging.info("Saving the best model...")
        joblib.dump(self.models[self.best_model], f'best_model_{self.best_model}.joblib')
        joblib.dump(self.preprocessor, 'preprocessor.joblib')

    def generate_report(self):
        logging.info("Generating final report...")
        report = f"""
        Credit Risk Model Report
        ========================

        Data Summary:
        -------------
        Total samples: {len(self.df)}
        Features: {len(self.X.columns)}
        Target distribution:
        {self.df['target'].value_counts(normalize=True)}

        Model Performance:
        ------------------
        Best Model: {self.best_model}
        ROC AUC Score: {self.results[self.best_model]['roc_auc']:.4f}
        
        Classification Report:
        {self.results[self.best_model]['classification_report']}

        Confusion Matrix:
        {self.results[self.best_model]['confusion_matrix']}

        Top Features:
        -------------
        {', '.join(self.selected_features[:10])}

        Model Interpretation:
        ---------------------
        Please refer to the SHAP plots for detailed feature importance and impact analysis.

        Notes:
        ------
        - The model has been trained on balanced data using SMOTE.
        - Hyperparameter tuning was performed using GridSearchCV.
        - The model and preprocessor have been saved for future use.

        Next Steps:
        -----------
        1. Monitor model performance in production.
        2. Regularly retrain the model with new data.
        3. Consider adding more relevant features if available.
        4. Explore more advanced techniques like stacking or neural networks.
        """

        with open('credit_risk_model_report.txt', 'w') as f:
            f.write(report)

        logging.info("Report generated and saved as 'credit_risk_model_report.txt'")

    def detect_anomalies(self):
        logging.info("Detecting anomalies in the dataset...")
        
        # 使用IsolationForest进行异常检测
        from sklearn.ensemble import IsolationForest
        
        iso_forest = IsolationForest(contamination=0.1, random_state=42)
        anomalies = iso_forest.fit_predict(self.X)
        
        # 将异常结果添加到原始数据集
        self.df['anomaly'] = anomalies
        
        # 可视化异常
        plt.figure(figsize=(12, 8))
        plt.scatter(self.df.index, self.df.iloc[:, 0], c=self.df['anomaly'], cmap='viridis')
        plt.title('Anomaly Detection Results')
        plt.xlabel('Index')
        plt.ylabel('Feature 1')
        plt.colorbar(label='Anomaly (-1) vs Normal (1)')
        plt.savefig('anomaly_detection.png')
        plt.close()
        
        logging.info(f"Detected {sum(anomalies == -1)} anomalies in the dataset.")
        
    def perform_cross_validation(self):
        logging.info("Performing cross-validation...")
        
        cv_scores = cross_val_score(self.models[self.best_model], self.X_train, self.y_train, cv=5, scoring='roc_auc')
        
        logging.info(f"Cross-validation ROC AUC scores: {cv_scores}")
        logging.info(f"Mean ROC AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")
        
    def analyze_misclassifications(self):
        logging.info("Analyzing misclassifications...")
        
        y_pred = self.models[self.best_model].predict(self.X_test)
        misclassified = self.X_test[y_pred != self.y_test]
        
        # 分析误分类样本的特征分布
        for feature in self.selected_features:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x=self.y_test, y=self.X_test[feature])
            plt.title(f'Distribution of {feature} for Correct and Incorrect Predictions')
            plt.savefig(f'misclassification_analysis_{feature}.png')
            plt.close()
        
        logging.info("Misclassification analysis plots saved.")

    def run(self):
        self.load_and_explore_data()
        self.preprocess_data()
        self.select_features()
        self.train_models()
        self.evaluate_models()
        self.hyperparameter_tuning()
        self.feature_importance()
        self.model_interpretation()
        self.save_model()
        self.detect_anomalies()
        self.perform_cross_validation()
        self.analyze_misclassifications()
        self.generate_report()

def main():
    start_time = time.time()
    logging.info("Starting Credit Risk Modeling process...")
    
    model = CreditRiskModel('credit_risk_data.csv')
    model.run()
    
    end_time = time.time()
    logging.info(f"Credit Risk Modeling process completed in {end_time - start_time:.2f} seconds.")

if __name__ == "__main__":
    main()
相关推荐
曾阿伦5 小时前
Python3 文件 (夹) 操作备忘录
开发语言·python
dong__csdn5 小时前
jdk添加信任证书
java·开发语言
架构师老Y5 小时前
006、异步编程与并发模型:asyncio与高性能后端
python
清水白石0085 小时前
《解锁 Python 潜能:从核心语法到 AI 服务层架构的工业级进阶与实战》
人工智能·python·架构
kcuwu.5 小时前
Python数据分析三剑客导论:NumPy、Pandas、Matplotlib 从入门到入门
python·数据分析·numpy
weixin_513449966 小时前
walk_these_ways项目学习记录第七篇(通过行为多样性 (MoB) 实现地形泛化)--核心环境下
人工智能·python·学习
南 阳6 小时前
Python从入门到精通day64
开发语言·python
花千树-0106 小时前
Java 接入多家大模型 API 实战对比
java·开发语言·人工智能·ai·langchain·ai编程
蓝天守卫者联盟16 小时前
如何选择二氯甲烷回收设备厂家:技术路线与市场格局深度解析
大数据·人工智能·python·sqlite·tornado
蓝色的杯子6 小时前
Python面试30分钟突击掌握
python