xgb-练习

以下代码未验证，仅用作练习。

python 复制代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import shap
import joblib
import warnings
import time
import logging
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Configure warning filtering and logging.
# ConvergenceWarning is silenced process-wide, and the root logger emits
# records at INFO level and above as "<timestamp> - <level> - <message>".
warnings.filterwarnings("ignore", category=ConvergenceWarning)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class CreditRiskModel:
    """End-to-end binary classification workflow for a credit-risk CSV.

    The CSV at ``data_path`` must contain a ``target`` column. ``run()``
    executes every stage in order: exploration, preprocessing (imputation,
    scaling, one-hot encoding, SMOTE on the training split), univariate
    feature selection, training of four candidate classifiers, evaluation,
    grid-search tuning of the best candidate, interpretation, persistence,
    anomaly detection, cross-validation, misclassification analysis and a
    plain-text report. Plots and artefacts are written to the current
    working directory.
    """

    def __init__(self, data_path):
        """Store the CSV path and initialise all pipeline state to empty.

        Args:
            data_path: Path to the CSV file read by ``load_and_explore_data``.
        """
        self.data_path = data_path
        self.df = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.preprocessor = None  # fitted ColumnTransformer (no sampler)
        self.selector = None  # fitted SelectKBest
        self.selected_features = None  # names of the columns kept by selector
        self.models = {}  # model name -> fitted estimator
        self.results = {}  # model name -> metrics dict
        self.best_model = None  # name (key of self.models) of the best model

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _grid_rows(n_items, n_cols):
        """Return the number of rows needed to lay out n_items in n_cols columns (at least 1)."""
        return max(1, -(-n_items // n_cols))

    @staticmethod
    def _safe_filename(name):
        """Replace every character that is not alphanumeric, '.', '_' or '-' with '_'."""
        return ''.join(ch if ch.isalnum() or ch in '._-' else '_' for ch in str(name))

    @staticmethod
    def _positive_class_shap_values(shap_values):
        """Reduce the output of ``explainer.shap_values`` to one 2-D array.

        The return shape of ``shap_values`` depends on the SHAP version and
        the estimator: a list with one array per class, a 3-D array of shape
        (samples, features, classes), or a single 2-D array. The class at
        index 1 is selected whenever a class axis exists.
        """
        if isinstance(shap_values, list):
            return shap_values[1] if len(shap_values) > 1 else shap_values[0]
        values = np.asarray(shap_values)
        if values.ndim == 3:
            class_index = 1 if values.shape[2] > 1 else 0
            return values[:, :, class_index]
        return values

    def _score_model(self, model):
        """Score a fitted model on the held-out test split.

        Returns:
            A ``(metrics, y_pred_proba)`` tuple: the metrics dict stored in
            ``self.results`` and the predicted probability of class index 1.
        """
        y_pred = model.predict(self.X_test)
        y_pred_proba = model.predict_proba(self.X_test)[:, 1]
        metrics = {
            'accuracy': model.score(self.X_test, self.y_test),
            'roc_auc': roc_auc_score(self.y_test, y_pred_proba),
            'classification_report': classification_report(self.y_test, y_pred),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }
        return metrics, y_pred_proba

    # ------------------------------------------------------------------
    # Pipeline stages
    # ------------------------------------------------------------------
    def load_and_explore_data(self):
        """Read the CSV into ``self.df``, log summary statistics and plot it."""
        logging.info("Loading and exploring data...")
        self.df = pd.read_csv(self.data_path)
        logging.info(f"Dataset shape: {self.df.shape}")
        logging.info("\nDataset info:")
        self.df.info()  # DataFrame.info() prints directly instead of returning text
        logging.info("\nDataset description:")
        logging.info(self.df.describe())
        logging.info("\nTarget variable distribution:")
        logging.info(self.df['target'].value_counts(normalize=True))

        # Data visualisation
        self.visualize_data()

    def visualize_data(self):
        """Save a correlation heatmap, a target count plot and feature histograms."""
        logging.info("Generating data visualizations...")

        # Correlation heatmap. Only numeric columns are correlated: calling
        # DataFrame.corr() on a frame with string columns raises in pandas >= 2.
        numeric_df = self.df.select_dtypes(include='number')
        if numeric_df.shape[1] > 0:
            plt.figure(figsize=(12, 10))
            sns.heatmap(numeric_df.corr(), annot=False, cmap='coolwarm')
            plt.title('特征相关性热力图')
            plt.tight_layout()
            plt.savefig('correlation_heatmap.png')
            plt.close()
        else:
            logging.warning("No numeric columns found; skipping correlation heatmap.")

        # Target variable distribution
        plt.figure(figsize=(8, 6))
        sns.countplot(x='target', data=self.df)
        plt.title('目标变量分布')
        plt.savefig('target_distribution.png')
        plt.close()

        # Distribution of each numeric feature, three histograms per row
        num_features = list(self.df.select_dtypes(include=['int64', 'float64']).columns)
        if not num_features:
            logging.warning("No int64/float64 columns found; skipping histograms.")
            return
        n_cols = 3
        n_rows = self._grid_rows(len(num_features), n_cols)
        fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=num_features)
        for position, column in enumerate(num_features):
            fig.add_trace(
                go.Histogram(x=self.df[column], name=column),
                row=position // n_cols + 1,
                col=position % n_cols + 1,
            )
        # Height scales with the real number of rows (300 px each).
        fig.update_layout(height=300 * n_rows, width=1000, title_text="数值型特征分布")
        fig.write_html("numeric_features_distribution.html")

    def preprocess_data(self):
        """Split the data, fit the preprocessor and balance the training split.

        Numeric columns are median-imputed and standardised; object columns
        are filled with the constant ``'missing'`` and one-hot encoded. The
        preprocessor is fitted on the training split only. SMOTE is then
        applied to the transformed training split, so ``X_train`` and
        ``y_train`` are resampled together while the test split keeps its
        original class distribution.
        """
        logging.info("Preprocessing data...")
        self.X = self.df.drop('target', axis=1)
        self.y = self.df['target']

        numeric_features = self.X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = self.X.select_dtypes(include=['object']).columns

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # sparse_threshold=0 forces a dense output, which the later stages
        # (DataFrame construction, SHAP) rely on.
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ],
            sparse_threshold=0)

        # Train/test split
        X_train_raw, X_test_raw, y_train_raw, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y)

        # Apply preprocessing (fit on train only to avoid leaking test statistics)
        self.X_train = self.preprocessor.fit_transform(X_train_raw)
        self.X_test = self.preprocessor.transform(X_test_raw)

        # Balance the classes with SMOTE. A sampler cannot be the last step
        # of a pipeline used through fit_transform/transform, and it must
        # resample X and y together, so it is applied explicitly here.
        smote = SMOTE(random_state=42)
        self.X_train, self.y_train = smote.fit_resample(self.X_train, y_train_raw)

    def select_features(self, k=20):
        """Keep the ``k`` features with the highest ANOVA F-score.

        Args:
            k: Number of features to keep. An integer larger than the number
                of available columns is reduced to that number.
        """
        n_available = self.X_train.shape[1]
        if isinstance(k, int) and k > n_available:
            logging.warning(f"Requested k={k} exceeds the {n_available} available features; using {n_available}.")
            k = n_available
        logging.info(f"Selecting top {k} features...")

        # The fitted selector is kept so it can be saved and reused for inference.
        self.selector = SelectKBest(f_classif, k=k)
        self.X_train = self.selector.fit_transform(self.X_train, self.y_train)
        self.X_test = self.selector.transform(self.X_test)

        selected_feature_indices = self.selector.get_support(indices=True)
        self.selected_features = self.preprocessor.get_feature_names_out()[selected_feature_indices]
        logging.info(f"Selected features: {self.selected_features}")

    def train_models(self):
        """Fit the four candidate classifiers and store them in ``self.models``."""
        logging.info("Training multiple models...")
        candidates = {
            'RandomForest': RandomForestClassifier(random_state=42),
            'GradientBoosting': GradientBoostingClassifier(random_state=42),
            'LogisticRegression': LogisticRegression(random_state=42),
            'SVM': SVC(probability=True, random_state=42)
        }

        for name, model in candidates.items():
            logging.info(f"Training {name}...")
            model.fit(self.X_train, self.y_train)
            self.models[name] = model

    def evaluate_models(self):
        """Score every trained model on the test split and pick the best by ROC AUC.

        Saves a ROC curve and a precision-recall curve per model, stores the
        metrics in ``self.results`` and the winning name in ``self.best_model``.
        """
        logging.info("Evaluating models...")
        results = {}
        for name, model in self.models.items():
            logging.info(f"Evaluating {name}...")
            results[name], y_pred_proba = self._score_model(model)

            # ROC curve
            fpr, tpr, _ = roc_curve(self.y_test, y_pred_proba)
            plt.figure()
            plt.plot(fpr, tpr, label=f'ROC curve (AUC = {results[name]["roc_auc"]:.2f})')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.05])
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'ROC Curve - {name}')
            plt.legend(loc="lower right")
            plt.savefig(f'roc_curve_{name}.png')
            plt.close()

            # Precision-recall curve
            precision, recall, _ = precision_recall_curve(self.y_test, y_pred_proba)
            plt.figure()
            plt.plot(recall, precision, label='Precision-Recall curve')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title(f'Precision-Recall Curve - {name}')
            plt.legend(loc="lower left")
            plt.savefig(f'precision_recall_curve_{name}.png')
            plt.close()

        self.results = results
        self.best_model = max(results, key=lambda x: results[x]['roc_auc'])
        logging.info(f"Best model: {self.best_model}")

    def hyperparameter_tuning(self):
        """Grid-search the best model and replace it with the tuned estimator.

        The stored test metrics for that model are recomputed afterwards so
        the report describes the estimator that is actually saved.

        Raises:
            RuntimeError: If ``evaluate_models`` has not selected a model yet.
        """
        logging.info("Performing hyperparameter tuning for the best model...")
        if self.best_model is None:
            raise RuntimeError("No best model selected; call evaluate_models() first.")

        if self.best_model == 'RandomForest':
            param_grid = {
                'n_estimators': [100, 200, 300],
                'max_depth': [None, 10, 20, 30],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            model = RandomForestClassifier(random_state=42)
        elif self.best_model == 'GradientBoosting':
            param_grid = {
                'n_estimators': [100, 200, 300],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 4, 5],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
            model = GradientBoostingClassifier(random_state=42)
        elif self.best_model == 'LogisticRegression':
            param_grid = {
                'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'penalty': ['l1', 'l2'],
                'solver': ['liblinear', 'saga']
            }
            model = LogisticRegression(random_state=42)
        else:  # SVM
            param_grid = {
                'C': [0.1, 1, 10],
                'kernel': ['rbf', 'poly'],
                'gamma': ['scale', 'auto', 0.1, 1]
            }
            model = SVC(probability=True, random_state=42)

        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(self.X_train, self.y_train)

        logging.info(f"Best parameters: {grid_search.best_params_}")
        logging.info(f"Best cross-validation score: {grid_search.best_score_:.4f}")

        tuned_model = grid_search.best_estimator_
        self.models[self.best_model] = tuned_model
        # Refresh the metrics: the earlier entry belongs to the untuned model.
        self.results[self.best_model], _ = self._score_model(tuned_model)

    def feature_importance(self):
        """Plot impurity-based importances when the best model exposes them."""
        logging.info("Calculating feature importance...")
        model = self.models[self.best_model]
        if not hasattr(model, 'feature_importances_'):
            logging.info(f"{self.best_model} has no feature_importances_; skipping importance plot.")
            return

        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]  # most important first

        plt.figure(figsize=(12, 8))
        plt.title("Feature Importances")
        plt.bar(range(len(importances)), importances[indices])
        plt.xticks(range(len(importances)), [self.selected_features[i] for i in indices], rotation=90)
        plt.tight_layout()
        plt.savefig('feature_importances.png')
        plt.close()

    def model_interpretation(self):
        """Save SHAP summary plots for a tree-based best model.

        ``shap.TreeExplainer`` only supports tree ensembles, so the step is
        skipped (with a warning) when the best model is not one of them.
        """
        logging.info("Interpreting model with SHAP...")
        model = self.models[self.best_model]
        if not hasattr(model, 'feature_importances_'):
            logging.warning(f"{self.best_model} is not a tree-based model; skipping SHAP TreeExplainer.")
            return

        explainer = shap.TreeExplainer(model)
        shap_values = self._positive_class_shap_values(explainer.shap_values(self.X_test))
        feature_names = list(self.selected_features)

        # show=False keeps the figure open; otherwise summary_plot displays
        # it immediately and savefig would write an empty image.
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, self.X_test, feature_names=feature_names, plot_type="bar", show=False)
        plt.title("Feature Importance (SHAP values)")
        plt.tight_layout()
        plt.savefig('shap_feature_importance.png')
        plt.close()

        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, self.X_test, feature_names=feature_names, show=False)
        plt.title("Feature Impact (SHAP values)")
        plt.tight_layout()
        plt.savefig('shap_feature_impact.png')
        plt.close()

    def save_model(self):
        """Persist the best model, the preprocessor and the feature selector.

        All three are needed to score new raw data: preprocessor, then
        selector, then model.
        """
        logging.info("Saving the best model...")
        joblib.dump(self.models[self.best_model], f'best_model_{self.best_model}.joblib')
        joblib.dump(self.preprocessor, 'preprocessor.joblib')
        if self.selector is not None:
            joblib.dump(self.selector, 'feature_selector.joblib')

    def generate_report(self):
        """Write a plain-text summary to ``credit_risk_model_report.txt``."""
        logging.info("Generating final report...")
        report = f"""
        Credit Risk Model Report
        ========================

        Data Summary:
        -------------
        Total samples: {len(self.df)}
        Features: {len(self.X.columns)}
        Target distribution:
        {self.df['target'].value_counts(normalize=True)}

        Model Performance:
        ------------------
        Best Model: {self.best_model}
        ROC AUC Score: {self.results[self.best_model]['roc_auc']:.4f}
        
        Classification Report:
        {self.results[self.best_model]['classification_report']}

        Confusion Matrix:
        {self.results[self.best_model]['confusion_matrix']}

        Top Features:
        -------------
        {', '.join(self.selected_features[:10])}

        Model Interpretation:
        ---------------------
        Please refer to the SHAP plots for detailed feature importance and impact analysis.

        Notes:
        ------
        - The model has been trained on balanced data using SMOTE.
        - Hyperparameter tuning was performed using GridSearchCV.
        - The model and preprocessor have been saved for future use.

        Next Steps:
        -----------
        1. Monitor model performance in production.
        2. Regularly retrain the model with new data.
        3. Consider adding more relevant features if available.
        4. Explore more advanced techniques like stacking or neural networks.
        """

        # Explicit encoding: feature names may contain non-ASCII characters.
        with open('credit_risk_model_report.txt', 'w', encoding='utf-8') as f:
            f.write(report)

        logging.info("Report generated and saved as 'credit_risk_model_report.txt'")

    def detect_anomalies(self):
        """Flag outliers with IsolationForest and add them as ``df['anomaly']``.

        The forest is fitted on the preprocessed feature matrix, because the
        raw frame may contain missing values and string columns.

        Raises:
            RuntimeError: If ``preprocess_data`` has not been run yet.
        """
        logging.info("Detecting anomalies in the dataset...")
        if self.preprocessor is None or self.X is None:
            raise RuntimeError("Data is not preprocessed; call preprocess_data() first.")

        # Anomaly detection with IsolationForest
        from sklearn.ensemble import IsolationForest

        features = self.preprocessor.transform(self.X)
        iso_forest = IsolationForest(contamination=0.1, random_state=42)
        anomalies = iso_forest.fit_predict(features)

        # Attach the result (-1 = anomaly, 1 = normal) to the original dataset
        self.df['anomaly'] = anomalies

        # Visualise the anomalies against the first column of the dataset
        plt.figure(figsize=(12, 8))
        plt.scatter(self.df.index, self.df.iloc[:, 0], c=self.df['anomaly'], cmap='viridis')
        plt.title('Anomaly Detection Results')
        plt.xlabel('Index')
        plt.ylabel('Feature 1')
        plt.colorbar(label='Anomaly (-1) vs Normal (1)')
        plt.savefig('anomaly_detection.png')
        plt.close()

        logging.info(f"Detected {int(np.sum(anomalies == -1))} anomalies in the dataset.")

    def perform_cross_validation(self):
        """Log 5-fold ROC AUC of the best model on the training split."""
        logging.info("Performing cross-validation...")

        # NOTE(review): the training split was already resampled by SMOTE, so
        # validation folds contain synthetic samples and these scores are
        # likely optimistic. Resampling inside each fold would avoid that.
        cv_scores = cross_val_score(self.models[self.best_model], self.X_train, self.y_train, cv=5, scoring='roc_auc')

        logging.info(f"Cross-validation ROC AUC scores: {cv_scores}")
        logging.info(f"Mean ROC AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")

    def analyze_misclassifications(self):
        """Save one box plot per selected feature, split by prediction outcome."""
        logging.info("Analyzing misclassifications...")

        y_pred = self.models[self.best_model].predict(self.X_test)
        # Compare positionally: y_test keeps the original DataFrame index.
        outcome = np.where(y_pred == np.asarray(self.y_test), 'Correct', 'Incorrect')

        # X_test is a plain array after preprocessing, so give it column names.
        test_frame = pd.DataFrame(np.asarray(self.X_test), columns=list(self.selected_features))

        # Feature distribution for correct versus incorrect predictions
        for feature in self.selected_features:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x=outcome, y=test_frame[feature].to_numpy(), order=['Correct', 'Incorrect'])
            plt.title(f'Distribution of {feature} for Correct and Incorrect Predictions')
            # One-hot feature names embed raw category values, which may hold
            # characters that are invalid in file names.
            plt.savefig(f'misclassification_analysis_{self._safe_filename(feature)}.png')
            plt.close()

        logging.info("Misclassification analysis plots saved.")

    def run(self):
        """Execute every pipeline stage in order."""
        self.load_and_explore_data()
        self.preprocess_data()
        self.select_features()
        self.train_models()
        self.evaluate_models()
        self.hyperparameter_tuning()
        self.feature_importance()
        self.model_interpretation()
        self.save_model()
        self.detect_anomalies()
        self.perform_cross_validation()
        self.analyze_misclassifications()
        self.generate_report()

def main(data_path='credit_risk_data.csv'):
    """Run the full credit-risk workflow and log how long it took.

    Args:
        data_path: CSV file handed to ``CreditRiskModel``. Defaults to
            ``'credit_risk_data.csv'`` in the current working directory.
    """
    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    logging.info("Starting Credit Risk Modeling process...")

    model = CreditRiskModel(data_path)
    model.run()

    elapsed = time.perf_counter() - start_time
    logging.info(f"Credit Risk Modeling process completed in {elapsed:.2f} seconds.")

# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
相关推荐
小镇敲码人3 分钟前
深入剖析华为CANN框架下的Ops-CV仓库:从入门到实战指南
c++·python·华为·cann
island131416 分钟前
CANN GE(图引擎)深度解析:计算图优化管线、内存静态规划与异构任务的 Stream 调度机制
开发语言·人工智能·深度学习·神经网络
坚持就完事了20 分钟前
Java中的集合
java·开发语言
摘星编程23 分钟前
深入理解CANN ops-nn BatchNormalization算子:训练加速的关键技术
python
魔芋红茶24 分钟前
Python 项目版本控制
开发语言·python
lili-felicity31 分钟前
CANN批处理优化技巧:从动态批处理到流水线并行
人工智能·python
一个有梦有戏的人33 分钟前
Python3基础:进阶基础,筑牢编程底层能力
后端·python
云小逸39 分钟前
【nmap源码解析】Nmap OS识别核心模块深度解析:osscan2.cc源码剖析(1)
开发语言·网络·学习·nmap
冰暮流星40 分钟前
javascript之二重循环练习
开发语言·javascript·数据库
风指引着方向41 分钟前
自定义算子开发入门:基于 CANN op-plugin 的扩展实践
开发语言