数据挖掘目标(客户价值分析)

复制代码
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:

复制代码
# Load the raw airline customer records from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer=r'../教师文件/air_data.csv')

In [3]:

复制代码
# Preview the first rows of the loaded frame.
data.head()

Out[3]:

| | Start_time | End_time | Fare | City | Age | Flight_count | Avg_discount | Flight_mileage |
| 0 | 2011/08/18 | 2014/03/31 | 5860.0 | . | 35.0 | 10 | 0.973129 | 12560 |
| 1 | 2011/01/13 | 2014/03/31 | 5561.0 | 佛山 | 35.0 | 12 | 0.575906 | 21223 |
| 2 | 2012/08/15 | 2014/03/31 | 1089.0 | 北京 | 33.0 | 9 | 0.635025 | 19246 |
| 3 | 2012/10/17 | 2014/03/31 | 9626.0 | 绍兴县 | 53.0 | 7 | 0.868571 | 14070 |
| 4 | 2011/09/04 | 2014/03/31 | 4473.0 | 上海 | 34.0 | 13 | 0.703419 | 17373 |

In [4]:

复制代码
# Print column dtypes and non-null counts to spot missing values.
data.info()
复制代码
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Start_time      15000 non-null  object 
 1   End_time        15000 non-null  object 
 2   Fare            14989 non-null  float64
 3   City            14490 non-null  object 
 4   Age             14907 non-null  float64
 5   Flight_count    15000 non-null  int64  
 6   Avg_discount    15000 non-null  float64
 7   Flight_mileage  15000 non-null  int64  
dtypes: float64(3), int64(2), object(3)
memory usage: 937.6+ KB

In [5]:

复制代码
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Out[5]:

| | Fare | Age | Flight_count | Avg_discount | Flight_mileage |
| count | 14989.000000 | 14907.000000 | 15000.000000 | 15000.000000 | 15000.000000 |
| mean | 3761.743812 | 42.569531 | 9.057600 | 0.728391 | 12395.706800 |
| std | 2720.206579 | 9.807385 | 3.946338 | 0.163550 | 3588.357291 |
| min | 0.000000 | 16.000000 | 2.000000 | 0.136017 | 4040.000000 |
| 25% | 1709.000000 | 35.000000 | 6.000000 | 0.625525 | 9747.000000 |
| 50% | 3580.000000 | 41.000000 | 8.000000 | 0.713322 | 11986.500000 |
| 75% | 5452.000000 | 48.000000 | 11.000000 | 0.803840 | 14654.000000 |
| max | 36602.000000 | 110.000000 | 47.000000 | 1.500000 | 50758.000000 |

In [6]:

复制代码
# Keep only the rows whose Fare value is present.
data = data.loc[data['Fare'].notna()]

In [7]:

复制代码
# Discard rows whose Fare equals zero.
data = data.loc[data['Fare'].ne(0)]

In [8]:

复制代码
# Derive `Months`: the number of calendar months between Start_time and
# End_time, computed from the year and month fields of the 'YYYY/MM/DD'
# strings (the day field is ignored).
# Vectorised string operations are used instead of a Python-level row loop,
# and `.assign` builds a new frame rather than writing cell by cell.
start_parts = data['Start_time'].str.split('/', expand=True)
end_parts = data['End_time'].str.split('/', expand=True)
year_gap = end_parts[0].astype(int) - start_parts[0].astype(int)
month_gap = end_parts[1].astype(int) - start_parts[1].astype(int)

# `Months` is stored as float and appended as the last column; the raw date
# columns are no longer needed once the duration has been derived.
data = (
    data
    .assign(Months=(year_gap * 12 + month_gap).astype(float))
    .drop(['Start_time', 'End_time'], axis=1)
)

In [9]:

复制代码
# Re-check the column list, dtypes and non-null counts of the current frame.
data.info()
复制代码
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13279 entries, 0 to 14998
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Fare            13279 non-null  float64
 1   City            12809 non-null  object 
 2   Age             13199 non-null  float64
 3   Flight_count    13279 non-null  int64  
 4   Avg_discount    13279 non-null  float64
 5   Flight_mileage  13279 non-null  int64  
 6   Months          13279 non-null  float64
dtypes: float64(4), int64(2), object(1)
memory usage: 1.3+ MB

In [10]:

复制代码
# Remove the City column, then z-score each remaining column:
# subtract the column mean and divide by the column standard deviation.
features = data.drop(columns=['City'])
col_mean = features.mean(axis=0)
col_std = features.std(axis=0)
data = features.sub(col_mean, axis='columns').div(col_std, axis='columns')

In [11]:

复制代码
# Preview the first rows of the frame in its current state.
data.head()

Out[11]:

| | Fare | Age | Flight_count | Avg_discount | Flight_mileage | Months |
| 0 | 0.643204 | -0.781959 | 0.191752 | 1.539425 | 0.019051 | -0.616333 |
| 1 | 0.524036 | -0.781959 | 0.700041 | -0.935625 | 2.427818 | -0.357005 |
| 2 | -1.258303 | -0.985351 | -0.062393 | -0.567261 | 1.878109 | -1.060895 |
| 3 | 2.144162 | 1.048561 | -0.570681 | 0.887939 | 0.438910 | -1.134989 |
| 4 | 0.090408 | -0.883655 | 0.954185 | -0.141105 | 1.357317 | -0.653379 |

In [12]:

复制代码
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
corr_matrix = data.astype(float).corr()

plt.figure(figsize=(10, 10))
plt.title("Pearson Correlation of Features", y=1.05, size=15)
sns.heatmap(
    corr_matrix,
    annot=True,
    square=True,
    vmax=1,
    cmap=plt.cm.viridis,
    linewidths=0.1,
    linecolor='white',
)

Out[12]:

复制代码
<AxesSubplot:title={'center':'Pearson Correlation of Features'}>

In [13]:

复制代码
# Exclude Fare and Age from the feature set that is used from here on.
data = data.drop(columns=['Fare', 'Age'])

In [14]:

复制代码
from sklearn.cluster import KMeans

In [15]:

复制代码
# Fit a 3-cluster K-Means model on the remaining feature columns.
# K-Means starts from random centroids, so the seed (`random_state`) and the
# number of restarts (`n_init`) are pinned: cluster ids, centres and sizes
# then stay the same on every Restart & Run All.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(data)

In [16]:

复制代码
# Coordinates of the fitted cluster centres: one row per cluster, one column per feature.
kmeans.cluster_centers_

Out[16]:

复制代码
array([[-0.56475974,  0.54131875, -0.70701626, -0.56628176],
       [-0.06513412, -0.03376272, -0.10437466,  1.24214471],
       [ 0.75090493, -0.63663316,  0.95977635, -0.37662422]])

In [17]:

复制代码
# Cluster id assigned to each row of the fitted data.
kmeans.labels_

Out[17]:

复制代码
array([0, 2, 2, ..., 0, 0, 0])

In [18]:

复制代码
from collections import defaultdict

In [28]:

复制代码
# Counter keyed by cluster id; missing keys start at 0.
label_dict=defaultdict(int)

In [29]:

复制代码
# Tally how many rows were assigned to each cluster id.
for cluster_id in kmeans.labels_:
    label_dict[cluster_id] = label_dict[cluster_id] + 1

In [30]:

复制代码
# Show the number of rows per cluster id.
label_dict

Out[30]:

复制代码
defaultdict(int, {0: 5287, 2: 4287, 1: 3705})

In [31]:

复制代码
# Display the cluster centres again (presumably to read them next to the cluster sizes).
kmeans.cluster_centers_

Out[31]:

复制代码
array([[-0.56475974,  0.54131875, -0.70701626, -0.56628176],
       [-0.06513412, -0.03376272, -0.10437466,  1.24214471],
       [ 0.75090493, -0.63663316,  0.95977635, -0.37662422]])
相关推荐
阿杰学AI2 分钟前
AI核心知识57——大语言模型之MoE(简洁且通俗易懂版)
人工智能·ai·语言模型·aigc·ai-native·moe·混合专家模型
anghost1504 分钟前
基于 STM32 的湖泊水位报警系统设计
stm32·嵌入式硬件·数据挖掘
珠海西格电力6 分钟前
零碳园区边缘计算节点规划:数字底座的硬件部署与能耗控制方案
运维·人工智能·物联网·能源·边缘计算
臼犀9 分钟前
孩子,那不是说明书,那是祈祷文
人工智能·程序员·markdown
黑客思维者10 分钟前
《关于深入实施 “人工智能 +“ 行动的意见》深度解读
人工智能
Sui_Network11 分钟前
Mysten Labs 与不丹王国政府的创新与技术部携手探索离线区块链
大数据·人工智能·web3·去中心化·区块链
互联科技报14 分钟前
GEO优化工具、AI搜索引擎优化软件平台实测报告:四大平台深度体验与选型指南
大数据·人工智能·搜索引擎
BoBoZz1915 分钟前
ExtractSelectionUsingCells选择和提取三维模型中的特定单元(Cell)
python·vtk·图形渲染·图形处理
智者知已应修善业16 分钟前
【删除有序数组中的重复项 II之O(N)算法】2024-1-31
c语言·c++·经验分享·笔记·算法
山东小木17 分钟前
AI智能问数(ChatBI)开发框架&解决方案&相关产品
人工智能·chatbi·智能问数·jboltai·javaai·ai问数·ai生图表