##注入所需库
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import numpy as np
import time
import shap
from sklearn.svm import SVC #支持向量机分类器
# from sklearn.neighbors import KNeighborsClassifier #K近邻分类器
# from sklearn.linear_model import LogisticRegression #逻辑回归分类器
import xgboost as xgb #XGBoost分类器
import lightgbm as lgb #LightGBM分类器
from sklearn.ensemble import RandomForestClassifier #随机森林分类器
# from catboost import CatBoostClassifier #CatBoost分类器
# from sklearn.tree import DecisionTreeClassifier #决策树分类器
# from sklearn.naive_bayes import GaussianNB #高斯朴素贝叶斯分类器
from skopt import BayesSearchCV
from skopt.space import Integer
from deap import base, creator, tools, algorithms
from sklearn.model_selection import StratifiedKFold, cross_validate # 引入分层 K 折和交叉验证工具
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标
from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵
from sklearn.metrics import make_scorer#定义函数
import warnings #用于忽略警告信息
warnings.filterwarnings("ignore") # 忽略所有警告信息
#聚类
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
#3D可视化
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import plotly.graph_objects as go
# Import Pipeline and the related preprocessing tools
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline # 用于创建机器学习工作流
from sklearn.compose import ColumnTransformer # 用于将不同的预处理应用于不同的列
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler # 用于数据预处理(有序编码、独热编码、标准化)
from sklearn.impute import SimpleImputer # 用于处理缺失值
# Configure matplotlib for Chinese labels: pick a CJK font and draw the minus
# sign as an ASCII hyphen. Leaving axes.unicode_minus at True (the default)
# asks the font for U+2212, which CJK fonts frequently lack.
plt.rcParams['font.sans-serif'] = ['STHeiti']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100

# Read the data: every column except 'Id' and the target is a feature;
# 'Credit Default' is the classification target.
data = pd.read_csv(r'data.csv')
x = data.drop(['Id', 'Credit Default'], axis=1)
y = data['Credit Default']
# Define one preprocessing sub-pipeline per feature type.
object_cols = x.select_dtypes(include=['object']).columns.tolist()
numeric_cols = x.select_dtypes(exclude=['object']).columns.tolist()

# Ordinal feature: fill gaps with the most frequent value, then encode by the
# position in the ordered category list; unseen values are encoded as -1.
ordinal_features = ['Years in current job']
ordinal_catagories = [
    ['< 1 year', '1 year']
    + [f'{years} years' for years in range(2, 10)]
    + ['10+ years']
]
ordinal_transforms = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'encoder',
        OrdinalEncoder(
            categories=ordinal_catagories,
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ),
    ),
])
print("有序特征处理 Pipeline 定义完成。")

# Nominal features: most-frequent imputation followed by one-hot encoding;
# categories unseen during fit are ignored at transform time.
nominal_features = ['Home Ownership', 'Purpose', 'Term']
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
print("标称特征处理 Pipeline 定义完成。")

# Everything that is neither ordinal nor nominal is treated as continuous
# and only mean-imputed.
continuous_cols = x.columns.difference(
    ordinal_features + nominal_features
).tolist()
continuous_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])
print("连续特征处理 Pipeline 定义完成。")
# --- Build the ColumnTransformer ---
# Route each column group through its own sub-pipeline and concatenate the
# results into a single feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', ordinal_transforms, ordinal_features),
        ('nominal', nominal_transformer, nominal_features),
        ('continuous', continuous_transformer, continuous_cols),
    ],
    remainder='passthrough',
    # The one-hot step emits sparse output; 0 forces a dense result so the
    # matrix can be wrapped in a DataFrame afterwards.
    sparse_threshold=0,
    verbose_feature_names_out=False,
)
print("\nColumnTransformer (预处理器) 定义完成。")

# Wrap the preprocessor in a Pipeline so further steps can be appended later.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
])
print("\n完整的 Pipeline 定义完成。")
# Fit the preprocessing pipeline on the raw features and time it.
print("\n开始对原始数据进行预处理...")
start_time = time.time()
x_processed = pipeline.fit_transform(x)
end_time = time.time()
print(f"预处理完成,耗时: {end_time - start_time:.4f} 秒")

# A sparse result cannot be passed to the DataFrame constructor together
# with column names, so densify it first if necessary.
if hasattr(x_processed, 'toarray'):
    x_processed = x_processed.toarray()

# Rebuild a labelled DataFrame; reusing x's index keeps rows aligned with
# x and y for later boolean indexing.
feature_names = preprocessor.get_feature_names_out()
x_processed_df = pd.DataFrame(x_processed, columns=feature_names, index=x.index)
# info() prints by itself and returns None, so it is not wrapped in print().
x_processed_df.info()
# Split into train and test sets. The split is stratified on y so both parts
# keep the original class ratio (the target is imbalanced enough that SMOTE
# is applied below).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x_processed_df,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# SMOTE oversampling, applied to the training part only (used for model
# training; the test set stays untouched).
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

# Standardise the full processed feature matrix (used for clustering).
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x_processed_df)
# KMeans: scan candidate cluster counts and record four quality metrics.
# A contiguous range is scanned so that the k picked afterwards (3) is
# actually among the evaluated values; the former tuple (2, 5) only tried
# k=2 and k=5.
k_range = list(range(2, 6))
inertia_value = []
silhouette_scores = []
ch_scores = []
db_scores = []
start_time = time.time()
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans_label = kmeans.fit_predict(x_scaled)
    # Inertia: within-cluster sum of squares (elbow method).
    inertia_value.append(kmeans.inertia_)
    silhouette = silhouette_score(x_scaled, kmeans_label)
    silhouette_scores.append(silhouette)
    ch = calinski_harabasz_score(x_scaled, kmeans_label)
    ch_scores.append(ch)
    db = davies_bouldin_score(x_scaled, kmeans_label)
    db_scores.append(db)
# Take the end timestamp here; otherwise the elapsed time below would be
# computed from a stale end_time set by an earlier step.
end_time = time.time()
print(f'聚类分析耗时:{end_time-start_time:.4f}')
# Plot the four evaluation metrics against k in a 2x2 grid.
plt.figure(figsize=(12, 6))
# One entry per panel: (values, line colour or None for the default,
# title, y-axis label). Order: elbow, silhouette, CH index, DB index.
metric_panels = [
    (inertia_value, None, '肘部法则确定最优聚类数 k(惯性,越小越好)', '惯性'),
    (silhouette_scores, 'orange', '轮廓系数确定最优聚类数 k(越大越好)', '轮廓系数'),
    (ch_scores, 'yellow', 'Calinski-Harabasz 指数确定最优聚类数 k(越大越好)', 'CH 指数'),
    (db_scores, 'red', 'DB 指数确定最优聚类数 k(越小越好)', 'DB 指数'),
]
for panel_no, (values, colour, panel_title, y_label) in enumerate(metric_panels, start=1):
    plt.subplot(2, 2, panel_no)
    line_kwargs = {'marker': 'o'}
    if colour is not None:
        line_kwargs['color'] = colour
    plt.plot(k_range, values, **line_kwargs)
    plt.title(panel_title)
    plt.xlabel('聚类数 (k)')
    plt.ylabel(y_label)
    plt.grid(True)
plt.tight_layout()
plt.show()
# Cluster with the chosen k and store the label of every row on x.
selected_k = 3
kmeans = KMeans(n_clusters=selected_k, random_state=42)
kmeans_label = kmeans.fit_predict(x_scaled)
x['KMeans_cluster'] = kmeans_label

# Project the scaled features onto three principal components.
print("\n--- PCA 降维 ---")
pca = PCA(n_components=3)
x_pca = pca.fit_transform(x_scaled)
# 2D cluster visualisation on the first two principal components.
plt.figure(figsize=(6, 5))
df_pca_2d = pd.DataFrame(
    {'x': x_pca[:, 0], 'y': x_pca[:, 1], 'cluster': kmeans_label}
)
# Plot at most 1000 randomly drawn points.
sample_size_2d = min(1000, len(df_pca_2d))
df_sample_2d = df_pca_2d.sample(n=sample_size_2d, random_state=42)
sns.scatterplot(
    data=df_sample_2d,
    x='x',
    y='y',
    hue='cluster',
    palette='viridis',
)
plt.title(f'KMean Clustering with k={selected_k} (PCA Visualization)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()
# 3D visualisation of the clusters in PCA space.
df_pca = pd.DataFrame(x_pca)
# Store the labels as strings: plotly treats a numeric colour column as
# continuous and would ignore color_discrete_sequence. to_numpy() assigns
# by position instead of by index label.
df_pca['cluster'] = x['KMeans_cluster'].astype(str).to_numpy()
# Plot at most 1000 randomly drawn points.
sample_size_3d = min(1000, len(df_pca))
df_sample_3d = df_pca.sample(sample_size_3d, random_state=42)
fig = px.scatter_3d(
    df_sample_3d,
    x=0,
    y=1,
    z=2,
    color='cluster',
    color_discrete_sequence=px.colors.qualitative.Bold,
    title='3D可视化',
)
fig.update_layout(
    scene=dict(
        xaxis_title='pca_0',
        yaxis_title='pca_1',
        zaxis_title='pca_2',
    ),
    width=1200,
    height=1000,
)
fig.show()
# t-SNE embedding of the scaled features into three dimensions.
print("\n---t-SNE 降维 ---")
n_component_tsne = 3
tsne = TSNE(
    n_components=n_component_tsne,
    # Perplexity has to stay below the number of samples; cap it so small
    # datasets do not fail. With more than 1000 rows this is still 1000.
    perplexity=min(1000, len(x_scaled) - 1),
    # NOTE(review): newer scikit-learn releases rename n_iter to max_iter —
    # confirm against the installed version.
    n_iter=250,
    learning_rate='auto',
    random_state=42,
    n_jobs=-1,
)
print("正在对训练集进行 t-SNE fit_transform...")
start_time = time.time()
x_tsne = tsne.fit_transform(x_scaled)
end_time = time.time()
print(f"训练集 t-SNE耗时: {end_time - start_time:.2f} 秒")

# 3D visualisation: one point per row, coloured by KMeans cluster.
df_tsne = pd.DataFrame(x_tsne)
# String labels make plotly use the discrete colour sequence; numeric labels
# would be drawn on a continuous colour scale instead.
df_tsne['cluster'] = x['KMeans_cluster'].astype(str).to_numpy()
fig = px.scatter_3d(
    df_tsne,
    x=0,
    y=1,
    z=2,
    color='cluster',
    color_discrete_sequence=px.colors.qualitative.Bold,
    title='T-SNE特征选择的3D可视化',
)
fig.update_layout(
    scene=dict(
        xaxis_title='tsne_0',
        yaxis_title='tsne_1',
        zaxis_title='tsne_2',
    ),
    width=1200,
    height=1000,
)
fig.show()
# Report how many rows were assigned to each KMeans cluster.
print(f'KMeans Cluster labels(k={selected_k}added to x):')
cluster_sizes = x[['KMeans_cluster']].value_counts()
print(cluster_sizes)
# SHAP analysis: train a random forest on the SMOTE-balanced training data
# and explain its predictions on the full processed feature matrix.
start_time = time.time()
rf1_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rf1_model.fit(x_train_smote, y_train_smote)
explainer = shap.TreeExplainer(rf1_model)
shap_value = explainer.shap_values(x_processed_df)
# Some shap releases return a list with one 2-D array per class instead of a
# single 3-D array; stack it so the indexing below works in both cases.
if isinstance(shap_value, list):
    shap_value = np.stack(shap_value, axis=-1)
print(shap_value.shape)
end_time = time.time()
print(f'SHAP分析耗时:{end_time-start_time:.4f}')

# --- 1. SHAP summary plot (violin) ---
print("--- 1. SHAP 特征重要性蜂巢图 ---")
# Index 0 on the last axis selects the values of the first class.
# NOTE(review): assumes a (samples, features, classes) layout — confirm for
# the installed shap version.
shap.summary_plot(shap_value[:, :, 0], x_processed_df, plot_type='violin', show=False)
# The title names the plot type that is actually drawn.
plt.title('shap feature importance (violin plot)')
plt.tight_layout()
plt.show()
# Distribution of four hand-picked features over the whole dataset, drawn in
# a 2x2 grid: a count plot for features with fewer than 10 distinct values,
# a histogram otherwise.
selected_features = ['Credit Score', 'Current Loan Amount', 'Annual Income', 'Term_Long Term']
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
axes = axes.flatten()
for i, feature in enumerate(selected_features):
    unique_count = x_processed_df[feature].nunique()
    if unique_count < 10:
        sns.countplot(x=x_processed_df[feature], ax=axes[i])
        axes[i].set_title(f'countplot of {feature}')
        axes[i].set_xlabel(feature)
        axes[i].set_ylabel('count')
    else:
        sns.histplot(x=x_processed_df[feature], ax=axes[i])
        # Title added so histogram panels are labelled like count-plot panels.
        axes[i].set_title(f'histplot of {feature}')
        axes[i].set_xlabel(feature)
        axes[i].set_ylabel('frequency')
plt.tight_layout()
plt.show()
# Per-cluster distributions of the selected features.
print(x[['KMeans_cluster']].value_counts())
x_cluster0 = x_processed_df[x['KMeans_cluster'] == 0]
x_cluster1 = x_processed_df[x['KMeans_cluster'] == 1]
x_cluster2 = x_processed_df[x['KMeans_cluster'] == 2]


def _plot_cluster_distributions(cluster_df, features):
    """Draw the distribution of each feature of one cluster in a 2x2 grid.

    A feature with fewer than 10 distinct values *within this cluster* is
    shown as a count plot, any other feature as a histogram.

    Args:
        cluster_df: DataFrame holding the rows of a single cluster.
        features: Names of the columns to plot (at most four).
    """
    _, grid = plt.subplots(2, 2, figsize=(6, 4))
    for ax, feature in zip(grid.flatten(), features):
        column = cluster_df[feature]
        if column.nunique() < 10:
            sns.countplot(x=column, ax=ax)
            ax.set_title(f'countplot of {feature}')
            ax.set_xlabel(feature)
            ax.set_ylabel('count')
        else:
            sns.histplot(x=column, ax=ax)
            ax.set_title(f'histplot of {feature}')
            ax.set_xlabel(feature)
            ax.set_ylabel('frequence')
    plt.tight_layout()
    plt.show()


# One figure per cluster, in order 0, 1, 2. Each cluster decides count plot
# vs. histogram from its own data (the cluster-2 figure used to take that
# decision from cluster 0).
for cluster_df in (x_cluster0, x_cluster1, x_cluster2):
    _plot_cluster_distributions(cluster_df, selected_features)
# Recursive feature elimination: keep the three features an XGBoost model
# ranks highest.
print("--- 递归特征消除 (RFE) ---")
from sklearn.feature_selection import RFE
start_time = time.time()
# No class_weight here: XGBClassifier does not define that parameter
# (xgboost handles imbalance through scale_pos_weight), and the training data
# has already been balanced with SMOTE.
base_model = xgb.XGBClassifier(random_state=42)
rfe = RFE(base_model, n_features_to_select=3)
rfe.fit(x_train_smote, y_train_smote)
x_train_rfe = rfe.transform(x_train_smote)
x_test_rfe = rfe.transform(x_test)
# rfe.support_ is a boolean mask over the input columns.
selected_features_rfe = x_train.columns[rfe.support_]
print(f"RFE筛选后保留的特征数量: {len(selected_features_rfe)}")
print(f"保留的特征: {selected_features_rfe}")
end_time = time.time()
print(f'RFE分析耗时:{end_time-start_time:.4f}')
# 3D visualisation of the clusters in the space of the RFE-selected features.
x_selected = x_processed_df[selected_features_rfe]
# Work on an explicit copy so adding a column never touches (or warns about)
# the frame it was sliced from.
df_viz = x_selected.copy()
# String labels make plotly use the discrete colour sequence; numeric labels
# would be drawn on a continuous colour scale instead.
df_viz['cluster'] = x['KMeans_cluster'].astype(str).to_numpy()
fig = px.scatter_3d(
    df_viz,
    x=selected_features_rfe[0],
    y=selected_features_rfe[1],
    z=selected_features_rfe[2],
    color='cluster',
    color_discrete_sequence=px.colors.qualitative.Bold,
    title='RFE特征选择的3D可视化',
)
fig.update_layout(
    scene=dict(
        xaxis_title=selected_features_rfe[0],
        yaxis_title=selected_features_rfe[1],
        zaxis_title=selected_features_rfe[2],
    ),
    width=1200,
    height=1000,
)
fig.show()
# Train XGBoost on the RFE-selected features and evaluate it on the test set.
# class_weight is not an XGBClassifier parameter, so it is not passed; the
# training data is already SMOTE-balanced.
xgb_model_rfe = xgb.XGBClassifier(random_state=42)
xgb_model_rfe.fit(x_train_rfe, y_train_smote)
xgb_pred_rfe = xgb_model_rfe.predict(x_test_rfe)
print("\nRFE筛选后XGBOOST在测试集上的分类报告:")
print(classification_report(y_test, xgb_pred_rfe))
print("RFE筛选后XGBOOST在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, xgb_pred_rfe))
def outer():
    """Return a closure that prints ``'aaaa'`` each time it is called."""
    def inner():
        print('aaaa')
    return inner


# Closure demo: outer() hands back inner, which is then invoked. inner()
# returns nothing, so it is called directly rather than wrapped in print().
f = outer()
f()
def chocolate(func):
    """Decorator that prints a message before and after calling ``func``.

    A further message is printed once, at decoration time. Arguments are
    forwarded to ``func`` and its return value is passed back to the caller.

    Args:
        func: The callable to wrap.

    Returns:
        The wrapping function, carrying ``func``'s name and docstring.
    """
    from functools import wraps  # local import keeps the block self-contained

    print("🍫 [1] 巧克力包装机准备好了!")  # runs immediately when decorating

    @wraps(func)
    def wrapper(*args, **kwargs):
        print("🍫 [2] 开始包装蛋糕(外壳)")
        result = func(*args, **kwargs)  # the original function
        print("🎁 [3] 包装完成,可以出售了")
        return result

    return wrapper  # wrapper replaces the original function
@chocolate  # applying the decorator calls chocolate(make_cake) right here
def make_cake():
    """Print the baking message; returns nothing."""
    print("🎂 [ 中间 ] 蛋糕正在烘焙...")


print("\n🟢 [4] 现在开始执行 make_cake():\n")
# make_cake() has no return value, so it is called directly; wrapping it in
# print() would only add a stray "None" line to the output.
make_cake()
import time
def display_time(func):
    """Decorator that prints how long each call to ``func`` takes.

    Arguments are forwarded to ``func`` and its return value is passed back
    to the caller.

    Args:
        func: The callable to time.

    Returns:
        The wrapping function, carrying ``func``'s name and docstring.
    """
    from functools import wraps  # local import keeps the block self-contained

    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the difference is never negative.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"执行时间: {end_time - start_time} 秒")
        return result

    return wrapper
def is_prime(num):
    """Return True if the integer ``num`` is a prime number, else False.

    Numbers below 2 are not prime. Only odd divisors up to the square root of
    ``num`` are tried, since any composite number has a factor in that range.
    """
    if num < 2:
        return False
    if num < 4:  # 2 and 3
        return True
    if num % 2 == 0:
        return False
    divisor = 3
    while divisor * divisor <= num:
        if num % divisor == 0:
            return False
        divisor += 2
    return True
@display_time
def prime_nums():
    """Run is_prime over 2..99998 as a timing workload.

    The individual results are discarded; the function returns nothing.
    """
    for i in range(2, 99999):
        is_prime(i)


# prime_nums() has no return value, so it is called directly; wrapping it in
# print() would only add a stray "None" line after the timing output.
prime_nums()
def logger(func):
    """Decorator that logs the arguments and the return value of each call.

    Args:
        func: The callable to wrap.

    Returns:
        The wrapping function, carrying ``func``'s name and docstring.
    """
    from functools import wraps  # local import keeps the block self-contained

    @wraps(func)
    def wrapper(*args, **kwargs):
        # A function's name lives in __name__; plain `.name` does not exist
        # on function objects and raises AttributeError.
        print(f'开始执行函数{func.__name__},参数:{args},{kwargs}')
        result = func(*args, **kwargs)
        print(f'函数{func.__name__}执行完毕,返回值:{result}')
        return result

    return wrapper
@logger
def multiply(a, b):
    """Return the product of ``a`` and ``b``."""
    return a * b


print(multiply(2, 3))
def class_logger(cls):
    """Class decorator that logs instantiation and adds a ``log`` method.

    The class's ``__init__`` is replaced by a version that prints a message
    and then delegates to the original initialiser. A ``log(message)`` method
    that prints the message with a ``[LOG]`` prefix is attached to the class.

    Args:
        cls: The class to decorate; it is modified in place.

    Returns:
        The same class object.
    """
    # The initialiser and the class name are dunder attributes.
    original_init = cls.__init__

    def new_init(self, *args, **kwargs):
        print(f'[LOG]实例化对象:{cls.__name__}')
        original_init(self, *args, **kwargs)

    cls.__init__ = new_init

    def log_message(self, message):
        print(f'[LOG]{message}')

    cls.log = log_message
    return cls
@class_logger
class SimplePrinter:
    """Prints text prefixed with the printer's name."""

    # Must be spelled __init__ to act as the initialiser; a method named
    # plain `init` is never called on construction.
    def __init__(self, name):
        self.name = name

    def print_text(self, text):
        """Print ``text`` prefixed with this printer's name."""
        print(f'{self.name}:{text}')


# Demo: instantiation is logged by the decorator, and log() is the method
# the decorator attached to the class.
printer = SimplePrinter('Alice')
printer.print_text('hello wrold')
printer.log('这是装饰器添加的日志方法')
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# Load the iris data into a DataFrame with the class label in a 'target'
# column.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target

features = iris.feature_names
target = 'target'

# Hold out 20% of the rows, then fit a 100-tree random forest on the rest.
x_train, x_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    random_state=42,
)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
import pdpbox
from pdpbox.info_plots import TargetPlot
# The package version is exposed as the dunder attribute __version__.
print(pdpbox.__version__)

# Target plot for one feature, binned on a percentile grid.
feature = 'petal length (cm)'
feature_name = feature
target_plot = TargetPlot(
    df=df,
    feature=feature,
    feature_name=feature_name,
    target='target',
    grid_type='percentile',
    num_grid_points=10,
)

# Build the plot once and inspect that single result, instead of rebuilding
# it for every print.
plot_result = target_plot.plot()
print(plot_result)
print(type(plot_result))
print(plot_result[0])
# Render the target plot with the plotly engine; plot() returns the figure,
# the axes and a summary DataFrame.
fig, axes, summary_df = target_plot.plot(
    which_classes=None,
    show_percentile=True,
    engine='plotly',
    template='plotly_white',
)
# Resize the figure and centre the title.
fig.update_layout(
    width=800,
    height=500,
    title={'text': f'Target Plot:{feature_name}', 'x': 0.5},
)
fig.show()