xgb-练习

以下代码未验证，仅用作练习。

python 复制代码
#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import shap
import joblib
import warnings
import time
import logging
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Set up warning filtering and logging: ignore ConvergenceWarning and
# configure the root logger at INFO level with a timestamped format.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class CreditRiskModel:
    """End-to-end binary credit-risk classification workflow for one CSV file.

    The CSV must contain a ``target`` column. ``run()`` executes every stage in
    order: load/explore, preprocess (impute, scale, one-hot, SMOTE on the
    training split only), univariate feature selection, training of several
    classifiers, evaluation, grid-search tuning of the best one,
    interpretation, persistence, anomaly detection, cross-validation,
    misclassification analysis and report generation.

    Plots, model artefacts and the text report are written to the current
    working directory.
    """

    def __init__(self, data_path: str) -> None:
        """Store the CSV path and initialise all pipeline state to empty.

        Args:
            data_path: Path of the CSV file read by ``load_and_explore_data``.
        """
        self.data_path = data_path
        self.df = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        # Fitted ColumnTransformer (imputation / scaling / one-hot encoding).
        self.preprocessor = None
        # Fitted SelectKBest instance and the names of the columns it kept.
        self.selector = None
        self.selected_features = None
        # Model name -> fitted estimator.
        self.models = {}
        # Model name -> dict of test-set metrics.
        self.results = {}
        # NOTE: holds the *name* (key into self.models) of the best model.
        self.best_model = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _positive_class_shap(shap_values):
        """Return the SHAP values of the positive class as a 2-D array.

        ``TreeExplainer.shap_values`` output differs between SHAP versions and
        model types: a list with one array per class, a 3-D array shaped
        ``(samples, features, classes)``, or a single 2-D array. This helper
        normalises all three so callers never index a sample row by mistake.

        Args:
            shap_values: Raw value returned by ``explainer.shap_values``.

        Returns:
            ``numpy.ndarray`` of shape ``(samples, features)``.
        """
        if isinstance(shap_values, list):
            chosen = shap_values[1] if len(shap_values) > 1 else shap_values[0]
            return np.asarray(chosen)
        values = np.asarray(shap_values)
        if values.ndim == 3:
            class_idx = 1 if values.shape[2] > 1 else 0
            return values[:, :, class_idx]
        return values

    @staticmethod
    def _safe_filename(name) -> str:
        """Replace characters that are unsafe in file names with ``_``.

        One-hot feature names embed raw category values, which may contain
        path separators or spaces.
        """
        import re

        return re.sub(r'[^\w.-]+', '_', str(name))

    def _feature_names(self):
        """Return names for the columns of ``X_train`` / ``X_test`` as a list.

        Uses the selected feature names when feature selection has run,
        otherwise falls back to the preprocessor's output names.
        """
        if self.selected_features is not None:
            return list(self.selected_features)
        return list(self.preprocessor.get_feature_names_out())

    def _score_model(self, model):
        """Score ``model`` on the held-out test split.

        Returns:
            Tuple ``(metrics, y_pred_proba)`` where ``metrics`` is a dict with
            keys ``accuracy``, ``roc_auc``, ``classification_report`` and
            ``confusion_matrix``, and ``y_pred_proba`` is the predicted
            probability of the positive class.
        """
        y_pred = model.predict(self.X_test)
        y_pred_proba = model.predict_proba(self.X_test)[:, 1]
        metrics = {
            'accuracy': model.score(self.X_test, self.y_test),
            'roc_auc': roc_auc_score(self.y_test, y_pred_proba),
            'classification_report': classification_report(self.y_test, y_pred),
            'confusion_matrix': confusion_matrix(self.y_test, y_pred)
        }
        return metrics, y_pred_proba

    def _plot_curves(self, name, y_pred_proba, roc_auc) -> None:
        """Save the ROC and precision-recall curves of model ``name`` as PNGs."""
        # ROC curve
        fpr, tpr, _ = roc_curve(self.y_test, y_pred_proba)
        plt.figure()
        plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {name}')
        plt.legend(loc="lower right")
        plt.savefig(f'roc_curve_{name}.png')
        plt.close()

        # Precision-recall curve
        precision, recall, _ = precision_recall_curve(self.y_test, y_pred_proba)
        plt.figure()
        plt.plot(recall, precision, label='Precision-Recall curve')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'Precision-Recall Curve - {name}')
        plt.legend(loc="lower left")
        plt.savefig(f'precision_recall_curve_{name}.png')
        plt.close()

    # ------------------------------------------------------------------
    # Pipeline stages
    # ------------------------------------------------------------------
    def load_and_explore_data(self) -> None:
        """Read the CSV into ``self.df``, log summary statistics and plot it."""
        import io

        logging.info("Loading and exploring data...")
        self.df = pd.read_csv(self.data_path)
        logging.info(f"Dataset shape: {self.df.shape}")
        logging.info("\nDataset info:")
        # DataFrame.info() writes to stdout by default; capture it so the
        # summary ends up in the log like everything else.
        info_buffer = io.StringIO()
        self.df.info(buf=info_buffer)
        logging.info(info_buffer.getvalue())
        logging.info("\nDataset description:")
        logging.info(self.df.describe())
        logging.info("\nTarget variable distribution:")
        logging.info(self.df['target'].value_counts(normalize=True))

        # Data visualisation
        self.visualize_data()

    def visualize_data(self) -> None:
        """Save a correlation heatmap, the target distribution and histograms."""
        logging.info("Generating data visualizations...")
        # Correlation heatmap. Restrict to numeric columns: corr() raises on
        # object columns in recent pandas releases.
        numeric_frame = self.df.select_dtypes(include='number')
        plt.figure(figsize=(12, 10))
        sns.heatmap(numeric_frame.corr(), annot=False, cmap='coolwarm')
        plt.title('特征相关性热力图')
        plt.tight_layout()
        plt.savefig('correlation_heatmap.png')
        plt.close()

        # Target variable distribution
        plt.figure(figsize=(8, 6))
        sns.countplot(x='target', data=self.df)
        plt.title('目标变量分布')
        plt.savefig('target_distribution.png')
        plt.close()

        # Distribution of the numeric features
        num_features = list(self.df.select_dtypes(include=['int64', 'float64']).columns)
        if not num_features:
            logging.info("No numeric features found; skipping histogram grid.")
            return
        n_cols = 3
        n_rows = -(-len(num_features) // n_cols)  # ceiling division
        fig = make_subplots(rows=n_rows, cols=n_cols,
                            subplot_titles=[str(name) for name in num_features])
        for position, col in enumerate(num_features):
            row_idx, col_idx = divmod(position, n_cols)
            fig.add_trace(go.Histogram(x=self.df[col], name=str(col)),
                          row=row_idx + 1, col=col_idx + 1)
        # 300 px per grid row, so the figure grows with the number of rows.
        fig.update_layout(height=300 * n_rows, width=1000, title_text="数值型特征分布")
        fig.write_html("numeric_features_distribution.html")

    def preprocess_data(self) -> None:
        """Split, impute/scale/encode, then oversample the training split.

        The preprocessor is fitted on the training split only. SMOTE is applied
        after the transform and only to the training data, so the test split
        keeps its original class balance and contains no synthetic rows.
        """
        logging.info("Preprocessing data...")
        self.y = self.df['target']
        # 'anomaly' is added to self.df by detect_anomalies(); exclude it so a
        # second call to preprocess_data() does not treat it as a feature.
        self.X = self.df.drop(columns=['target', 'anomaly'], errors='ignore')

        numeric_features = self.X.select_dtypes(include=['int64', 'float64']).columns
        categorical_features = self.X.select_dtypes(include=['object']).columns

        numeric_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # sparse_threshold=0 forces a dense array, which the plotting and SHAP
        # steps further down expect. NOTE(review): for very high-cardinality
        # categoricals this can use a lot of memory.
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numeric_features),
                ('cat', categorical_transformer, categorical_features)
            ],
            sparse_threshold=0)

        # Train/test split
        X_train_raw, X_test_raw, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42, stratify=self.y)

        # Apply preprocessing (fit on train only)
        self.X_train = self.preprocessor.fit_transform(X_train_raw)
        self.X_test = self.preprocessor.transform(X_test_raw)

        # Handle class imbalance with SMOTE. fit_resample returns both the
        # resampled features and labels, so the two must be updated together.
        smote = SMOTE(random_state=42)
        self.X_train, self.y_train = smote.fit_resample(self.X_train, self.y_train)

    def select_features(self, k: int = 20) -> None:
        """Keep the ``k`` best features according to the ANOVA F-test.

        Args:
            k: Number of features to keep. Values larger than the number of
                available columns are clamped to that number.
        """
        logging.info(f"Selecting top {k} features...")
        n_available = self.X_train.shape[1]
        if isinstance(k, int) and k > n_available:
            logging.warning(f"Requested k={k} but only {n_available} features exist; using all of them.")
            k = n_available
        self.selector = SelectKBest(f_classif, k=k)
        self.X_train = self.selector.fit_transform(self.X_train, self.y_train)
        self.X_test = self.selector.transform(self.X_test)
        selected_feature_indices = self.selector.get_support(indices=True)
        self.selected_features = self.preprocessor.get_feature_names_out()[selected_feature_indices]
        logging.info(f"Selected features: {self.selected_features}")

    def train_models(self) -> None:
        """Fit every candidate classifier on the training split."""
        logging.info("Training multiple models...")
        models = {
            'RandomForest': RandomForestClassifier(random_state=42),
            'GradientBoosting': GradientBoostingClassifier(random_state=42),
            'LogisticRegression': LogisticRegression(random_state=42),
            'SVM': SVC(probability=True, random_state=42)
        }

        for name, model in models.items():
            logging.info(f"Training {name}...")
            model.fit(self.X_train, self.y_train)
            self.models[name] = model

    def evaluate_models(self) -> None:
        """Score every trained model on the test split and pick the best.

        Fills ``self.results`` and sets ``self.best_model`` to the name of the
        model with the highest ROC AUC.

        Raises:
            ValueError: If no model has been trained yet.
        """
        logging.info("Evaluating models...")
        if not self.models:
            raise ValueError("No trained models to evaluate; call train_models() first.")

        results = {}
        for name, model in self.models.items():
            logging.info(f"Evaluating {name}...")
            metrics, y_pred_proba = self._score_model(model)
            results[name] = metrics
            self._plot_curves(name, y_pred_proba, metrics['roc_auc'])

        self.results = results
        self.best_model = max(results, key=lambda x: results[x]['roc_auc'])
        logging.info(f"Best model: {self.best_model}")

    def hyperparameter_tuning(self) -> None:
        """Grid-search the best model and refresh its stored metrics.

        The tuned estimator replaces the untuned one in ``self.models``; its
        test-set metrics and curves replace the stale ones so the final report
        describes the model that is actually saved.

        Raises:
            ValueError: If ``self.best_model`` has no known search space.
        """
        logging.info("Performing hyperparameter tuning for the best model...")
        search_spaces = {
            'RandomForest': (
                RandomForestClassifier(random_state=42),
                {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                },
            ),
            'GradientBoosting': (
                GradientBoostingClassifier(random_state=42),
                {
                    'n_estimators': [100, 200, 300],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 4, 5],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                },
            ),
            'LogisticRegression': (
                LogisticRegression(random_state=42),
                {
                    'C': [0.001, 0.01, 0.1, 1, 10, 100],
                    'penalty': ['l1', 'l2'],
                    'solver': ['liblinear', 'saga']
                },
            ),
            'SVM': (
                SVC(probability=True, random_state=42),
                {
                    'C': [0.1, 1, 10],
                    'kernel': ['rbf', 'poly'],
                    'gamma': ['scale', 'auto', 0.1, 1]
                },
            ),
        }
        if self.best_model not in search_spaces:
            raise ValueError(
                f"No hyperparameter grid defined for model {self.best_model!r}; "
                "call evaluate_models() first.")
        model, param_grid = search_spaces[self.best_model]

        # NOTE(review): X_train already contains SMOTE-generated rows, so the
        # validation folds here include synthetic samples and the CV score is
        # likely optimistic.
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(self.X_train, self.y_train)

        logging.info(f"Best parameters: {grid_search.best_params_}")
        logging.info(f"Best cross-validation score: {grid_search.best_score_:.4f}")

        tuned_model = grid_search.best_estimator_
        self.models[self.best_model] = tuned_model

        # Re-score the tuned estimator so results/plots are not stale.
        metrics, y_pred_proba = self._score_model(tuned_model)
        self.results[self.best_model] = metrics
        self._plot_curves(self.best_model, y_pred_proba, metrics['roc_auc'])

    def feature_importance(self) -> None:
        """Plot impurity-based importances when the best model exposes them."""
        logging.info("Calculating feature importance...")
        best_estimator = self.models[self.best_model]
        if not hasattr(best_estimator, 'feature_importances_'):
            logging.info(f"{self.best_model} has no feature_importances_; skipping plot.")
            return

        importances = best_estimator.feature_importances_
        indices = np.argsort(importances)[::-1]
        names = self._feature_names()

        plt.figure(figsize=(12, 8))
        plt.title("Feature Importances")
        plt.bar(range(len(importances)), importances[indices])
        plt.xticks(range(len(importances)), [names[i] for i in indices], rotation=90)
        plt.tight_layout()
        plt.savefig('feature_importances.png')
        plt.close()

    def model_interpretation(self) -> None:
        """Save SHAP summary plots for tree-based best models.

        ``shap.TreeExplainer`` only supports tree ensembles, so the step is
        skipped (with a warning) when the best model is LogisticRegression or
        SVM instead of raising.
        """
        logging.info("Interpreting model with SHAP...")
        best_estimator = self.models[self.best_model]
        if not isinstance(best_estimator, (RandomForestClassifier, GradientBoostingClassifier)):
            logging.warning(
                f"SHAP TreeExplainer does not support {self.best_model}; skipping SHAP plots.")
            return

        explainer = shap.TreeExplainer(best_estimator)
        shap_values = self._positive_class_shap(explainer.shap_values(self.X_test))

        # show=False keeps the figure open so it can be titled and saved;
        # otherwise summary_plot displays it and savefig writes an empty canvas.
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, self.X_test, feature_names=self.selected_features,
                          plot_type="bar", show=False)
        plt.title("Feature Importance (SHAP values)")
        plt.tight_layout()
        plt.savefig('shap_feature_importance.png')
        plt.close()

        plt.figure(figsize=(12, 8))
        shap.summary_plot(shap_values, self.X_test, feature_names=self.selected_features,
                          show=False)
        plt.title("Feature Impact (SHAP values)")
        plt.tight_layout()
        plt.savefig('shap_feature_impact.png')
        plt.close()

    def save_model(self) -> None:
        """Persist the best model, the preprocessor and the feature selector."""
        logging.info("Saving the best model...")
        joblib.dump(self.models[self.best_model], f'best_model_{self.best_model}.joblib')
        joblib.dump(self.preprocessor, 'preprocessor.joblib')
        # The selector is required to reproduce the model's input columns.
        if self.selector is not None:
            joblib.dump(self.selector, 'feature_selector.joblib')

    def generate_report(self) -> None:
        """Write a plain-text summary to ``credit_risk_model_report.txt``."""
        logging.info("Generating final report...")
        if self.selected_features is not None:
            top_features = ', '.join(str(name) for name in self.selected_features[:10])
        else:
            top_features = 'N/A'
        report = f"""
        Credit Risk Model Report
        ========================

        Data Summary:
        -------------
        Total samples: {len(self.df)}
        Features: {len(self.X.columns)}
        Target distribution:
        {self.df['target'].value_counts(normalize=True)}

        Model Performance:
        ------------------
        Best Model: {self.best_model}
        ROC AUC Score: {self.results[self.best_model]['roc_auc']:.4f}
        
        Classification Report:
        {self.results[self.best_model]['classification_report']}

        Confusion Matrix:
        {self.results[self.best_model]['confusion_matrix']}

        Top Features:
        -------------
        {top_features}

        Model Interpretation:
        ---------------------
        Please refer to the SHAP plots for detailed feature importance and impact analysis.

        Notes:
        ------
        - The model has been trained on balanced data using SMOTE.
        - Hyperparameter tuning was performed using GridSearchCV.
        - The model and preprocessor have been saved for future use.

        Next Steps:
        -----------
        1. Monitor model performance in production.
        2. Regularly retrain the model with new data.
        3. Consider adding more relevant features if available.
        4. Explore more advanced techniques like stacking or neural networks.
        """

        # Explicit encoding: feature names may contain non-ASCII category values.
        with open('credit_risk_model_report.txt', 'w', encoding='utf-8') as f:
            f.write(report)

        logging.info("Report generated and saved as 'credit_risk_model_report.txt'")

    def detect_anomalies(self) -> None:
        """Flag outliers with IsolationForest and store them in ``df['anomaly']``.

        The forest is fitted on the preprocessed feature matrix because the raw
        frame may contain strings and missing values, which IsolationForest
        cannot consume.
        """
        logging.info("Detecting anomalies in the dataset...")

        # Anomaly detection with IsolationForest
        from sklearn.ensemble import IsolationForest

        encoded_features = self.preprocessor.transform(self.X)
        iso_forest = IsolationForest(contamination=0.1, random_state=42)
        anomalies = iso_forest.fit_predict(encoded_features)

        # Attach the anomaly labels to the original dataset
        self.df['anomaly'] = anomalies

        # Visualise the anomalies
        plt.figure(figsize=(12, 8))
        plt.scatter(self.df.index, self.df.iloc[:, 0], c=self.df['anomaly'], cmap='viridis')
        plt.title('Anomaly Detection Results')
        plt.xlabel('Index')
        plt.ylabel('Feature 1')
        plt.colorbar(label='Anomaly (-1) vs Normal (1)')
        plt.savefig('anomaly_detection.png')
        plt.close()

        logging.info(f"Detected {int(np.sum(anomalies == -1))} anomalies in the dataset.")

    def perform_cross_validation(self) -> None:
        """Log 5-fold ROC AUC scores of the best model on the training split."""
        logging.info("Performing cross-validation...")

        # NOTE(review): the training split was oversampled before this call,
        # so folds share synthetic neighbours and scores are likely optimistic.
        cv_scores = cross_val_score(self.models[self.best_model], self.X_train, self.y_train, cv=5, scoring='roc_auc')

        logging.info(f"Cross-validation ROC AUC scores: {cv_scores}")
        logging.info(f"Mean ROC AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")

    def analyze_misclassifications(self) -> None:
        """Save one box plot per feature comparing correct vs incorrect predictions."""
        logging.info("Analyzing misclassifications...")

        y_pred = self.models[self.best_model].predict(self.X_test)
        # Compare positionally; y_test keeps the original DataFrame index.
        y_true = np.asarray(self.y_test)
        outcome = np.where(y_pred == y_true, 'Correct', 'Incorrect')

        feature_names = self._feature_names()
        if hasattr(self.X_test, 'toarray'):
            test_matrix = self.X_test.toarray()
        else:
            test_matrix = np.asarray(self.X_test)
        # X_test is a plain array after preprocessing, so give it column names.
        analysis_frame = pd.DataFrame(test_matrix, columns=feature_names)
        analysis_frame['prediction_outcome'] = outcome

        # Distribution of each feature for correct vs incorrect predictions
        for feature in feature_names:
            plt.figure(figsize=(10, 6))
            sns.boxplot(x='prediction_outcome', y=feature, data=analysis_frame)
            plt.title(f'Distribution of {feature} for Correct and Incorrect Predictions')
            plt.savefig(f'misclassification_analysis_{self._safe_filename(feature)}.png')
            plt.close()

        logging.info("Misclassification analysis plots saved.")

    def run(self) -> None:
        """Execute every pipeline stage in order."""
        self.load_and_explore_data()
        self.preprocess_data()
        self.select_features()
        self.train_models()
        self.evaluate_models()
        self.hyperparameter_tuning()
        self.feature_importance()
        self.model_interpretation()
        self.save_model()
        self.detect_anomalies()
        self.perform_cross_validation()
        self.analyze_misclassifications()
        self.generate_report()

def main():
    """Run the whole credit-risk workflow on ``credit_risk_data.csv``.

    Logs a start message, executes ``CreditRiskModel.run()`` and logs the
    elapsed time in seconds.
    """
    # perf_counter() is monotonic, so the measured duration is not affected
    # by system clock adjustments the way time.time() differences are.
    start_time = time.perf_counter()
    logging.info("Starting Credit Risk Modeling process...")

    model = CreditRiskModel('credit_risk_data.csv')
    model.run()

    elapsed = time.perf_counter() - start_time
    logging.info(f"Credit Risk Modeling process completed in {elapsed:.2f} seconds.")

# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
相关推荐
2301_7644413326 分钟前
使用python构建的应急物资代储博弈模型
开发语言·python·算法
丿BAIKAL巛28 分钟前
Java前后端传参与接收全解析
java·开发语言
code bean34 分钟前
【C++】Scoop 包管理器与 MinGW 工具链详解
开发语言·c++
yanghuashuiyue1 小时前
Java过滤器-拦截器-AOP-Controller
java·开发语言
喏喏心1 小时前
深度强化学习:价值迭代与Bellman方程实践
人工智能·python·学习·机器学习
小冷coding1 小时前
【Java】高并发架构设计:1000 QPS服务器配置与压测实战
java·服务器·开发语言
小白勇闯网安圈1 小时前
supersqli、web2、fileclude、Web_python_template_injection
python·网络安全·web
用户8356290780511 小时前
从一维到二维:用Spire.XLS轻松将Python列表导出到Excel
后端·python
破刺不会编程1 小时前
socket编程TCP
linux·运维·服务器·开发语言·网络·网络协议·tcp/ip
北极糊的狐1 小时前
IDEA中安装 CamelCase 插件实现变量快速生成驼峰命名
开发语言·php