# Python check-in, Day 22

# Import the required libraries

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

import random

import numpy as np

import time

import shap

from sklearn.svm import SVC #支持向量机分类器

# from sklearn.neighbors import KNeighborsClassifier #K近邻分类器

# from sklearn.linear_model import LogisticRegression #逻辑回归分类器

import xgboost as xgb #XGBoost分类器

import lightgbm as lgb #LightGBM分类器

from sklearn.ensemble import RandomForestClassifier #随机森林分类器

# from catboost import CatBoostClassifier #CatBoost分类器

# from sklearn.tree import DecisionTreeClassifier #决策树分类器

# from sklearn.naive_bayes import GaussianNB #高斯朴素贝叶斯分类器

from skopt import BayesSearchCV

from skopt.space import Integer

from deap import base, creator, tools, algorithms

from sklearn.model_selection import StratifiedKFold, cross_validate # 引入分层 K 折和交叉验证工具

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标

from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵

from sklearn.metrics import make_scorer#定义函数

import warnings #用于忽略警告信息

warnings.filterwarnings("ignore") # 忽略所有警告信息

#聚类

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

#3D可视化

from mpl_toolkits.mplot3d import Axes3D

# Configure matplotlib so Chinese labels and minus signs render correctly.
plt.rcParams['font.sans-serif'] = ['STHeiti']
# With unicode_minus=True matplotlib draws U+2212, a glyph CJK fonts frequently
# lack (shown as an empty box). False falls back to the ASCII hyphen-minus.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100

# Load the raw data set.
data = pd.read_csv(r'data.csv')

# Impute missing values column by column: the mean for non-object columns,
# the most frequent value (mode) for object columns.
for col in data.columns:
    if data[col].isnull().sum() == 0:
        continue
    if data[col].dtype != 'object':
        fill_value = data[col].mean()
    else:
        fill_value = data[col].mode()[0]
    # Assign the result back instead of calling fillna(inplace=True) on
    # data[col]: that form mutates an intermediate object and is not
    # guaranteed to update `data` (it is a no-op under pandas copy-on-write).
    data[col] = data[col].fillna(fill_value)

# Encode 'Years in current job' as an integer code; longer tenure gets a
# smaller code ('10+ years' -> 0 ... '< 1 year' -> 10).
mapping = {
    '10+ years': 0,
    '9 years': 1,
    '8 years': 2,
    '7 years': 3,
    '6 years': 4,
    '5 years': 5,
    '4 years': 6,
    '3 years': 7,
    '2 years': 8,
    '1 year': 9,
    '< 1 year': 10,
}
# Series.map yields NaN for any value that is not a key of `mapping`.
data['Years in current job'] = data['Years in current job'].map(mapping)

# One-hot encode the remaining categorical columns and cast the generated
# indicator columns to int.
# Remember the pre-encoding column names so the new dummy columns can be
# identified without re-reading the CSV from disk.
original_columns = set(data.columns)
data = pd.get_dummies(data=data, drop_first=True)

# Columns that did not exist before encoding are the generated dummies.
dummies_list = [col for col in data.columns if col not in original_columns]
for col in dummies_list:
    data[col] = data[col].astype(int)

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print().
data.info()

# Split features/target and hold out 20% of the rows as a test set.
from sklearn.model_selection import train_test_split

# 'Credit Default' is the target; 'Id' is dropped from the feature matrix.
x = data.drop(columns=['Credit Default', 'Id'])
y = data['Credit Default']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Oversample with SMOTE on the training portion only; the test set is left
# untouched.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# Standardise the features. Clustering works on the features only and does
# not involve the target variable.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# Sweep candidate cluster counts and record four quality metrics for each.
k_range = range(2, 5)
inertia_value = []
silhouette_scores = []
ch_scores = []
db_scores = []

start_time = time.time()
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    # Cluster index assigned to every row of x_scaled.
    kmeans_label = kmeans.fit_predict(x_scaled)

    inertia_value.append(kmeans.inertia_)

    silhouette = silhouette_score(x_scaled, kmeans_label)
    silhouette_scores.append(silhouette)

    ch = calinski_harabasz_score(x_scaled, kmeans_label)
    ch_scores.append(ch)

    db = davies_bouldin_score(x_scaled, kmeans_label)
    db_scores.append(db)
end_time = time.time()

print(f'聚类分析耗时:{end_time-start_time:.4f}')

# Plot the four clustering metrics against k on a 2x2 grid.
plt.figure(figsize=(12, 6))

# One entry per panel: (metric values, title, y-axis label, extra plot kwargs).
# The first panel passes no colour and so uses matplotlib's default.
metric_panels = [
    (inertia_value, '肘部法则确定最优聚类数 k(惯性,越小越好)', '惯性', {}),
    (silhouette_scores, '轮廓系数确定最优聚类数 k(越大越好)', '轮廓系数',
     {'color': 'orange'}),
    (ch_scores, 'Calinski-Harabasz 指数确定最优聚类数 k(越大越好)', 'CH 指数',
     {'color': 'red'}),
    # The original DB panel had no title; added for consistency with the
    # other three (a lower Davies-Bouldin index means better separation).
    (db_scores, 'Davies-Bouldin 指数确定最优聚类数 k(越小越好)', 'DB 指数',
     {'color': 'yellow'}),
]

for position, (values, title, ylabel, plot_kwargs) in enumerate(metric_panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(k_range, values, marker='o', **plot_kwargs)
    plt.title(title)
    plt.xlabel('聚类数 (k)')
    plt.ylabel(ylabel)
    plt.grid(True)

plt.tight_layout()
plt.show()

# Fit the final KMeans model with the chosen k and store each row's cluster
# label as a new column of x.
selected_k = 3
kmeans = KMeans(n_clusters=selected_k, random_state=42)
kmeans_label = kmeans.fit_predict(x_scaled)
x['KMeans_Cluster'] = kmeans_label

# Project the standardised features onto three principal components for
# visualisation.
pca = PCA(n_components=3)
x_pca = pca.fit_transform(x_scaled)

# ##聚类可视化

# plt.figure(figsize=(6,5))

# sns.scatterplot(

# x=x_pca:,0,

# y=x_pca:,1,

# hue=kmeans_label,

# palette='viridis'

# )

# plt.title(f'KMean Clustering with k={selected_k} (PCA Visualization)')

# plt.xlabel('PCA Component 1')

# plt.ylabel('PCA Component 2')

# plt.show()

# Interactive 3D scatter of the PCA projection, coloured by cluster label.
# (The original re-created `pca = PCA(n_components=3)` here without fitting
# it; that only replaced the fitted estimator with an unfitted one, so it is
# omitted. x_pca from the earlier fit is used as before.)
import plotly.express as px
import plotly.graph_objects as go

# Assemble the plotting frame: three principal components plus the label.
df_pca = pd.DataFrame(x_pca, columns=['PC1', 'PC2', 'PC3'])
df_pca['Cluster'] = kmeans_label

# Build the 3D scatter plot.
fig = px.scatter_3d(
    df_pca,
    x='PC1',
    y='PC2',
    z='PC3',
    color='Cluster',
    color_continuous_scale=px.colors.sequential.Viridis,
    title=f'KMeans Clustering with k={selected_k} (PCA 3D Visualization)',
)

# Axis titles and figure size.
fig.update_layout(
    scene=dict(
        xaxis_title='PCA Component 1',
        yaxis_title='PCA Component 2',
        zaxis_title='PCA Component 3',
    ),
    width=1200,
    height=1000,
)

# Render the figure.
fig.show()

# Report how many rows fell into each KMeans cluster.
print(f'KMeans Cluster labels(k={selected_k}added to x):')
print(x['KMeans_Cluster'].value_counts())

# Train a random forest to predict the cluster label from the features, then
# use SHAP to see which features drive the cluster assignment.
start_time = time.time()
x1 = x.drop('KMeans_Cluster', axis=1)
y1 = x['KMeans_Cluster']
rf1_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rf1_model.fit(x1, y1)

explainer = shap.TreeExplainer(rf1_model)
shap_values = explainer.shap_values(x1)
# Older SHAP releases return a list with one (n_samples, n_features) array
# per class; newer ones return a single (n_samples, n_features, n_classes)
# array. Normalise to the 3-D layout so .shape and [:, :, k] work on both.
if isinstance(shap_values, list):
    shap_values = np.stack(shap_values, axis=-1)
print(shap_values.shape)
end_time = time.time()
print(f'SHAP分析耗时:{end_time-start_time:.4f}')

# --- 1. SHAP feature-importance bar chart (summary plot, bar) ---
print("--- 1. SHAP 特征重要性条形图 ---")
# Slice [:, :, 0] selects the SHAP values for the first class (cluster 0).
shap.summary_plot(shap_values[:, :, 0], x1, plot_type='bar', show=False)
plt.title('shap feature importance (bar plot)')
plt.tight_layout()
plt.show()

# Hand-picked features to inspect further.
selected_features = [
    'Purpose_debt consolidation',
    'Home Ownership_Home Mortgage',
    'Purpose_home improvements',
    'Purpose_other',
]

# Heuristic: fewer than 10 distinct values -> treat the feature as discrete,
# otherwise as continuous.
for feature in selected_features:
    unique_count = x[feature].nunique()
    print(f'{feature}的唯一值数量:{unique_count}')
    if unique_count < 10:
        print(f'{feature}可能是离散型变量')
    else:
        print(f'{feature}可能是连续性变量')

# Histogram of each selected feature over the full data set (2x2 grid).
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.flatten()
for i, feature in enumerate(selected_features):
    axes[i].hist(x[feature], bins=10)
    axes[i].set_title(f'histogram of {feature}')
    axes[i].set_xlabel(feature)
    axes[i].set_ylabel('frequency')
plt.tight_layout()
plt.show()

# Cluster sizes, then one 2x2 figure of count plots per cluster.
print(x['KMeans_Cluster'].value_counts())

# Iterate over the labels actually present instead of hard-coding one
# copy-pasted section per cluster. The original also built a subset for
# label 3, which is always empty when k=3 and was never used.
for cluster_id in sorted(x['KMeans_Cluster'].unique()):
    cluster_rows = x[x['KMeans_Cluster'] == cluster_id]

    fig, axes = plt.subplots(2, 2, figsize=(6, 4))
    axes = axes.flatten()
    for i, feature in enumerate(selected_features):
        sns.countplot(x=cluster_rows[feature], ax=axes[i])
        axes[i].set_title(f'countplot of {feature}')
        axes[i].set_xlabel(feature)
        axes[i].set_ylabel('count')
    plt.tight_layout()
    plt.show()

# Recursive feature elimination (RFE): keep the 3 features a random forest
# ranks as most useful for predicting the target.
print("--- 递归特征消除 (RFE) ---")
from sklearn.feature_selection import RFE

base_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rfe = RFE(base_model, n_features_to_select=3)
# Fit on the SMOTE-resampled training data, then reduce both splits to the
# selected columns.
rfe.fit(x_train_smote, y_train_smote)
x_train_rfe = rfe.transform(x_train_smote)
x_test_rfe = rfe.transform(x_test)

# rfe.support_ is a boolean mask over the training columns.
selected_features_rfe = x_train.columns[rfe.support_]
print(f"RFE筛选后保留的特征数量: {len(selected_features_rfe)}")
print(f"保留的特征: {selected_features_rfe}")

# Interactive 3D scatter of the three RFE-selected features, coloured by
# KMeans cluster label.
import plotly.express as px
import plotly.graph_objects as go

x_selected = x[selected_features_rfe]
# Work on an explicit copy so adding the label column cannot touch (or warn
# about touching) the frame it was sliced from.
df_viz = x_selected.copy()
df_viz['cluster'] = x['KMeans_Cluster']

fig = px.scatter_3d(
    df_viz,
    x=selected_features_rfe[0],
    y=selected_features_rfe[1],
    z=selected_features_rfe[2],
    color='cluster',
    color_continuous_scale=px.colors.sequential.Viridis,
    title='RFE特征选择的3D可视化',
)
fig.update_layout(
    scene=dict(
        xaxis_title=selected_features_rfe[0],
        yaxis_title=selected_features_rfe[1],
        zaxis_title=selected_features_rfe[2],
    ),
    width=1200,
    height=1000,
)
fig.show()

# Train a random forest on the RFE-selected features and evaluate it on the
# held-out test set.
rf_model_rfe = RandomForestClassifier(random_state=42, class_weight='balanced')
# x_train_rfe was derived from the SMOTE-resampled features, so it must be
# paired with the resampled labels. The original passed y_train, whose length
# differs from x_train_rfe whenever SMOTE adds samples.
rf_model_rfe.fit(x_train_rfe, y_train_smote)
rf_pred_rfe = rf_model_rfe.predict(x_test_rfe)

print("\nRFE筛选后随机森林在测试集上的分类报告:")
print(classification_report(y_test, rf_pred_rfe))
print("RFE筛选后随机森林在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, rf_pred_rfe))

相关推荐
一个梦醒了几秒前
安装git bash选项推荐
开发语言·git·bash
GIS数据转换器13 分钟前
城市排水生命线安全运行监测平台深度解析
java·运维·人工智能·python·安全·数据挖掘·无人机
ct97825 分钟前
React 状态管理方案深度对比
开发语言·前端·react
贤哥哥yyds36 分钟前
GBK转UTF\-8编码自动转换工具 使用文档
python
数量技术宅44 分钟前
2026量化前沿:从Reddit热帖到Python实战,如何用赫斯特指数(Hurst)狙击虚假突破?
开发语言·python
华如锦1 小时前
面了很多 Java转AI Agent方向,一些面试题总结
java·开发语言·人工智能·python·ai
huangdong_1 小时前
电商商品SKU图自动分类技术实现:从DOM解析到智能归档
开发语言
dog2501 小时前
网络长尾延时的重尾本质
开发语言·网络·php
戴西软件1 小时前
戴西 DLM 许可授权管理系统:破解无网络环境下工业软件授权难题,助力制造企业降本增效
网络·人工智能·python·深度学习·程序人生·算法·制造
Dxy12393102161 小时前
Python线程锁:为什么多线程会“打架“,以及怎么解决
开发语言·前端·python