pandas
Pandas is a Python extension library for data analysis.
python
import pandas as pd
pd.__version__
'2.2.2'
Series
Equivalent to one column of a table: a one-dimensional array with an index
The index defaults to range(0, n); labels do not have to be unique, though they usually are
Ordered (insertion order is preserved)
py
pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
python
# Specify the index and the values
s1 = pd.Series(data=[1, 2, 3], index=['a', 'b', 'c'])
# Create from a dict (keys become the index)
s2 = pd.Series({1:'a', 2:'b', 3:'c'})
s1, s2
(a 1
b 2
c 3
dtype: int64,
1 a
2 b
3 c
dtype: object)
Basic information
python
# Summary statistics
s1.min(), s1.max(), s1.mean(), s1.std(), s1.sum(), s2.value_counts()
(np.int64(1),
np.int64(3),
np.float64(2.0),
np.float64(1.0),
np.int64(6),
a 1
b 1
c 1
Name: count, dtype: int64)
python
# Preview the data: head/tail default to 5 rows, sample to 1
s2.head(2), s2.tail(2), s2.sample(2)
(1 a
2 b
dtype: object,
2 b
3 c
dtype: object,
1 a
2 b
dtype: object)
python
# Attributes
print(s1.dtype, s1.shape, s1.size)
print(s2.dtype, s2.shape, s2.size)
print(s1.index, s1.values)
int64 (3,) 3
object (3,) 3
Index(['a', 'b', 'c'], dtype='object') [1 2 3]
Common operations
python
# Modify or add elements by index label
s1['a'], s1['d'] = -1, 4
# del removes the element at the given index label
del s1['b']
# drop removes the given labels and returns a new Series
s_dropped = s1.drop(['c'])
print(s1)
print(s_dropped)
a -1
c 3
d 4
dtype: int64
a -1
d 4
dtype: int64
python
# Slicing: label slices include the end label; integer slices are positional and exclude it
s1['a':'c'], s1[-2:]
(a -1
c 3
dtype: int64,
c 3
d 4
dtype: int64)
python
# Rename index labels; returns a new Series
s1.rename({'a':'A','c':'C'})
A -1
C 3
d 4
dtype: int64
python
# Cast to another dtype; returns a new Series
s1.astype('float')
a -1.0
c 3.0
d 4.0
dtype: float64
python
# Iterate over (index, value) pairs
for index, value in s2.items():
print(f'{index=}, {value=}')
index=1, value='a'
index=2, value='b'
index=3, value='c'
DataFrame
Equivalent to a two-dimensional table
py
pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
Creation
list dict json numpy
python
# Create from 2-D data (list of lists/tuples, numpy array)
data1 = [('Google', 10), ['Runoob', 12], ['Wiki', 13]]
df1 = pd.DataFrame(data1, columns=['Site', 'Age'])
# Create from a list of dicts
data2 = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df2 = pd.DataFrame(data2)
df1, df2
( Site Age
0 Google 10
1 Runoob 12
2 Wiki 13,
a b c
0 1 2 NaN
1 5 10 20.0)
python
# Create from a dict of lists
data = {'name':['Tom', 'Bob', 'peter', 'Tom'],
'age':[18, 19, 14, 10],
'gender':['male', 'female', 'male', 'female']}
df = pd.DataFrame(data)
df
| | name | age | gender |
|---|---|---|---|
| 0 | Tom | 18 | male |
| 1 | Bob | 19 | female |
| 2 | peter | 14 | male |
| 3 | Tom | 10 | female |
Basic information
python
# Preview the data: head/tail default to 5 rows, sample to 1
df.head(2), df.tail(2), df.sample(2)
( name age gender
0 Tom 18 male
1 Bob 19 female,
name age gender
2 peter 14 male
3 Tom 10 female,
name age gender
1 Bob 19 female
3 Tom 10 female)
python
# info() prints the structure (and returns None); describe() summarizes numeric columns
df.info(), df.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 name 4 non-null object
1 age 4 non-null int64
2 gender 4 non-null object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes
(None,
age
count 4.000000
mean 15.250000
std 4.112988
min 10.000000
25% 13.000000
50% 16.000000
75% 18.250000
max 19.000000)
python
# Attributes
df.columns, df.index, df.dtypes, df.shape
(Index(['name', 'age', 'gender'], dtype='object'),
RangeIndex(start=0, stop=4, step=1),
name object
age int64
gender object
dtype: object,
(4, 3))
Slicing and selection
python
print("df[['name', 'age']]\n", df[['name', 'age']]) # 提取多列
print("\ndf[1:3]\n", df[1:3]) # 切片行
print("\ndf.loc[1:2, ['name', 'age']]\n"
,df.loc[1:2, ['name', 'age']]) # 标签索引提取指定行列
print("\ndf.iloc[:, 1:]\n",df.iloc[:, 1:]) # 位置索引提取指定列
df[['name', 'age']]
name age
0 Tom 18
1 Bob 19
2 peter 14
3 Tom 10
df[1:3]
name age gender
1 Bob 19 female
2 peter 14 male
df.loc[1:2, ['name', 'age']]
name age
1 Bob 19
2 peter 14
df.iloc[:, 1:]
age gender
0 18 male
1 19 female
2 14 male
3 10 female
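Note the difference in slice endpoints: loc slices by label and includes the end label, while iloc slices by position and excludes the stop position. A minimal comparison, reusing the df created above:

```python
# loc: label-based, end label included -> rows 1 and 2
# iloc: position-based, stop excluded -> row 1 only
df.loc[1:2], df.iloc[1:2]
```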
python
# Boolean indexing: keep the rows where the mask is True
r1 = df['name'] == 'Tom'
r2 = df['age'] < 18
df[r1], df[r2]
( name age gender
0 Tom 18 male
3 Tom 10 female,
name age gender
2 peter 14 male
3 Tom 10 female)
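Masks can be combined with the element-wise operators & (and), | (or) and ~ (not); each condition needs its own parentheses. A small sketch reusing r1 and r2 from above:

```python
# rows where name == 'Tom' AND age < 18 (only the Tom aged 10)
df[r1 & r2]
# the same written inline; note the parentheses around each condition
df[(df['name'] == 'Tom') & (df['age'] < 18)]
```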
python
# Update values
df.loc[df['name']=='Bob', 'age']=15
df
| | name | age | gender |
|---|---|---|---|
| 0 | Tom | 18 | male |
| 1 | Bob | 15 | female |
| 2 | peter | 14 | male |
| 3 | Tom | 10 | female |
python
# Rename rows (index, axis=0) and columns (columns, axis=1)
# Defaults: axis=0, inplace=False
df.rename(columns={'name':'Name'},index={1:-1})
# df.rename({1:-1})
# df.rename({'name':'Name'},axis=1).rename({1:-1},axis=0)
| | Name | age | gender |
|---|---|---|---|
| 0 | Tom | 18 | male |
| -1 | Bob | 15 | female |
| 2 | peter | 14 | male |
| 3 | Tom | 10 | female |
Index
index
python
# DataFrame.set_index(keys, drop=True, append=False, inplace=False,
#                     verify_integrity=False)
# Make the keys column(s) the index
# drop=True removes the column(s) from the data
# append=False replaces the existing index; True keeps it, forming a MultiIndex
# verify_integrity=False skips the uniqueness check
df.set_index('name')
| name | age | gender |
|---|---|---|
| Tom | 18 | male |
| Bob | 15 | female |
| peter | 14 | male |
| Tom | 10 | female |
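With 'name' as the index, rows can then be selected by label with .loc; a duplicated label such as 'Tom' returns all matching rows. A minimal sketch, reusing the df defined above:

```python
indexed = df.set_index('name')
indexed.loc['Tom']         # both rows labelled 'Tom'
indexed.loc['Bob', 'age']  # a single scalar value
```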
python
# Reset the index
# drop=False adds the old index back as a regular column
df.reset_index(drop=False,inplace=False)
| | index | name | age | gender |
|---|---|---|---|---|
| 0 | 0 | Tom | 18 | male |
| 1 | 1 | Bob | 15 | female |
| 2 | 2 | peter | 14 | male |
| 3 | 3 | Tom | 10 | female |
Sorting
sort_values sort_index
python
# Sort by values; ascending=True by default, returns a new DataFrame
# key= takes a function that receives a pd.Series and returns the values to sort by
df.sort_values(by='age',ascending=False,inplace=False)
df.sort_values(by='name',key=lambda x:x.str.len())
| | name | age | gender |
|---|---|---|---|
| 0 | Tom | 18 | male |
| 1 | Bob | 15 | female |
| 3 | Tom | 10 | female |
| 2 | peter | 14 | male |
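by= also accepts a list of columns, with ascending= given per column; a short sketch on the same df:

```python
# sort by gender first, then by age descending within each gender
df.sort_values(by=['gender', 'age'], ascending=[True, False])
```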
python
# Sort by index; axis=0 (rows) or axis=1 (columns)
# ascending=True by default
df.sort_index(ascending=False)
| | name | age | gender |
|---|---|---|---|
| 3 | Tom | 10 | female |
| 2 | peter | 14 | male |
| 1 | Bob | 15 | female |
| 0 | Tom | 18 | male |
Row and column operations
insert concat drop
python
# Insert columns
df['NAME'] = df['name'].str.upper()
df.insert(loc=4,column='note',value=['a','b','c','d'],allow_duplicates=True)
# Insert rows
df.loc[df.index[-1]+1] = {'name':'Alice','age':1}
df.loc[df.index[-1]+1] = ['Joe',2,'male','',None]
new_line = pd.Series({'name':'Mike','age':11,'gender':'male'})
pd.concat([df, new_line.to_frame().T], ignore_index=True) # recommended: convert the Series to a one-row frame, returns a new DataFrame
df
| | name | age | gender | NAME | note |
|---|---|---|---|---|---|
| 0 | Tom | 18 | male | TOM | a |
| 1 | Bob | 15 | female | BOB | b |
| 2 | peter | 14 | male | PETER | c |
| 3 | Tom | 10 | female | TOM | d |
| 4 | Alice | 1 | NaN | NaN | NaN |
| 5 | Joe | 2 | male | | None |
python
# Drop rows and columns; returns a new DataFrame
df.drop(index=[0,1]).drop(columns='NAME')
| | name | age | gender | note |
|---|---|---|---|---|
| 2 | peter | 14 | male | c |
| 3 | Tom | 10 | female | d |
| 4 | Alice | 1 | NaN | NaN |
| 5 | Joe | 2 | male | None |
Grouping
python
bins = [0, 10, 15, float('inf')]
labels = ['<10','10-15','>15']
df['age_period'] = pd.cut(df['age'],bins=bins,labels=labels)
df
| | name | age | gender | NAME | note | age_period |
|---|---|---|---|---|---|---|
| 0 | Tom | 18 | male | TOM | a | >15 |
| 1 | Bob | 15 | female | BOB | b | 10-15 |
| 2 | peter | 14 | male | PETER | c | 10-15 |
| 3 | Tom | 10 | female | TOM | d | <10 |
| 4 | Alice | 1 | NaN | NaN | NaN | <10 |
| 5 | Joe | 2 | male | | None | <10 |
python
df.groupby('gender')['age'].mean()
gender
female 12.500000
male 11.333333
Name: age, dtype: float64
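The binned column produced by pd.cut can serve as a grouping key as well; observed=False keeps empty bins in the result. A sketch reusing the age_period column created above:

```python
# number of rows and mean age per age bin
df.groupby('age_period', observed=False)['age'].agg(['count', 'mean'])
```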
Wide/long format conversion
pivot and melt
python
data = {'year': [2010, 2011, 2010, 2011],
'product': ['A', 'A', 'B', 'B'],
'sales': [5, 6, 7, 8]}
df = pd.DataFrame(data)
pivoted_df = df.pivot(index='year', columns='product', values='sales')
melted_df = (pivoted_df.reset_index()
.melt(id_vars=['year'], var_name='product', value_name='sales'))
pivoted_df, melted_df
(product A B
year
2010 5 7
2011 6 8,
year product sales
0 2010 A 5
1 2011 A 6
2 2010 B 7
3 2011 B 8)
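For this complete year × product grid the melt reverses the pivot exactly, so the round trip should reproduce the original frame (a sketch; it is sensitive to row order and dtypes and may not hold once missing combinations introduce NaN):

```python
# expected to be True for this example: same rows, columns and dtypes
df.equals(melted_df)
```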
Concatenation and merging
concat and merge
python
# pandas.concat(objs, axis=0, join='outer', ignore_index=False, keys=None,
# levels=None, names=None, verify_integrity=False, sort=False)
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']})
df2 = pd.DataFrame({'A': ['A3', 'A4'],
'B': ['B3', 'B4']})
r1 = pd.concat([df1, df2])
r2 = pd.concat([df1, df2], axis=1)
r1, r2
( A B
0 A0 B0
1 A1 B1
2 A2 B2
0 A3 B3
1 A4 B4,
A B A B
0 A0 B0 A3 B3
1 A1 B1 A4 B4
2 A2 B2 NaN NaN)
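Two parameters from the signature above are worth illustrating: ignore_index=True renumbers the result with a fresh RangeIndex, and join='inner' keeps only the columns shared by all inputs. A small sketch with the same df1 and df2 (the extra frame is made up for illustration):

```python
# fresh 0..n-1 index instead of the duplicated 0,1,2,0,1
pd.concat([df1, df2], ignore_index=True)
# keep only the columns present in every input frame (here just 'A')
pd.concat([df1, pd.DataFrame({'A': ['A5'], 'C': ['C5']})], join='inner')
```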
python
# pandas.merge(left, right, how='inner', on=None, left_on=None, right_on=None,
# left_index=False, right_index=False, sort=False,
# suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K4'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
result = pd.merge(df1, df2, on='key')
result
| | key | A | B | C | D |
|---|---|---|---|---|---|
| 0 | K0 | A0 | B0 | C0 | D0 |
| 1 | K1 | A1 | B1 | C1 | D1 |
| 2 | K2 | A2 | B2 | C2 | D2 |
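how= controls which keys survive: 'inner' (the default) keeps only keys present in both frames, 'left'/'right' keep every key from one side, and 'outer' keeps everything. A sketch with the same df1 and df2:

```python
# K3 (only in df1) and K4 (only in df2) are kept; missing cells become NaN
pd.merge(df1, df2, on='key', how='outer')
```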
File operations
csv json excel
python
# Read
df = pd.read_csv('inputs/data.csv')
df = pd.read_excel('inputs/data.xlsx')
df = pd.read_json('inputs/data.json')
# df = pd.read_html('inputs/data.html') # returns a list of DataFrames; requires lxml
df
| | year | product | sales |
|---|---|---|---|
| 0 | 2010 | A | 5 |
| 1 | 2011 | A | 6 |
| 2 | 2010 | B | 7 |
| 3 | 2011 | B | 8 |
python
# Write
df.to_csv('outputs/data.csv',index=False)
df.to_excel('outputs/data.xlsx',index=False)
df.to_json('outputs/data.json')
df.to_html('outputs/data.html')
Data cleaning
python
pd.read_csv('inputs/property-data.csv')
| | PID | ST_NUM | ST_NAME | OWN_OCCUPIED | NUM_BEDROOMS | NUM_BATH | SQ_FT |
|---|---|---|---|---|---|---|---|
| 0 | 100001000.0 | 104.0 | PUTNAM | Y | 3 | 1 | 1000 |
| 1 | 100002000.0 | 197.0 | LEXINGTON | N | 3 | 1.5 | -- |
| 2 | 100003000.0 | NaN | LEXINGTON | N | NaN | 1 | 850 |
| 3 | 100004000.0 | 201.0 | BERKELEY | 12 | 1 | NaN | 700 |
| 4 | NaN | 203.0 | BERKELEY | Y | 3 | 2 | 1600 |
| 5 | 100006000.0 | 207.0 | BERKELEY | Y | NaN | 1 | 800 |
| 6 | 100007000.0 | NaN | WASHINGTON | NaN | 2 | HURLEY | 950 |
| 7 | 100008000.0 | 213.0 | TREMONT | Y | 1 | 1 | NaN |
| 8 | 100009000.0 | 215.0 | TREMONT | Y | na | 2 | 1800 |
python
# Declare extra strings to be treated as missing values
missing_values = ['n/a', 'na', '--']
df = pd.read_csv('inputs/property-data.csv', na_values=missing_values)
df
| | PID | ST_NUM | ST_NAME | OWN_OCCUPIED | NUM_BEDROOMS | NUM_BATH | SQ_FT |
|---|---|---|---|---|---|---|---|
| 0 | 100001000.0 | 104.0 | PUTNAM | Y | 3.0 | 1 | 1000.0 |
| 1 | 100002000.0 | 197.0 | LEXINGTON | N | 3.0 | 1.5 | NaN |
| 2 | 100003000.0 | NaN | LEXINGTON | N | NaN | 1 | 850.0 |
| 3 | 100004000.0 | 201.0 | BERKELEY | 12 | 1.0 | NaN | 700.0 |
| 4 | NaN | 203.0 | BERKELEY | Y | 3.0 | 2 | 1600.0 |
| 5 | 100006000.0 | 207.0 | BERKELEY | Y | NaN | 1 | 800.0 |
| 6 | 100007000.0 | NaN | WASHINGTON | NaN | 2.0 | HURLEY | 950.0 |
| 7 | 100008000.0 | 213.0 | TREMONT | Y | 1.0 | 1 | NaN |
| 8 | 100009000.0 | 215.0 | TREMONT | Y | NaN | 2 | 1800.0 |
python
# Drop rows that have missing values in the given column(s)
df.dropna(subset=['NUM_BEDROOMS'])
| | PID | ST_NUM | ST_NAME | OWN_OCCUPIED | NUM_BEDROOMS | NUM_BATH | SQ_FT |
|---|---|---|---|---|---|---|---|
| 0 | 100001000.0 | 104.0 | PUTNAM | Y | 3.0 | 1 | 1000.0 |
| 1 | 100002000.0 | 197.0 | LEXINGTON | N | 3.0 | 1.5 | NaN |
| 3 | 100004000.0 | 201.0 | BERKELEY | 12 | 1.0 | NaN | 700.0 |
| 4 | NaN | 203.0 | BERKELEY | Y | 3.0 | 2 | 1600.0 |
| 6 | 100007000.0 | NaN | WASHINGTON | NaN | 2.0 | HURLEY | 950.0 |
| 7 | 100008000.0 | 213.0 | TREMONT | Y | 1.0 | 1 | NaN |
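Before dropping, it is often useful to count the missing values per column; dropna also supports how= and thresh= for row-level rules. A small sketch on the df loaded above:

```python
# number of NaN values in each column
df.isna().sum()
# keep rows with at least 6 non-missing values out of the 7 columns
df.dropna(thresh=6)
# drop a row only when every value in it is missing
df.dropna(how='all')
```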
python
# Fill missing fields
# Common choices are the column's mean(), median(), or mode()
(df.fillna({'PID':12345})
.fillna({'ST_NUM':df["ST_NUM"].mean()})
.fillna(-1))
| | PID | ST_NUM | ST_NAME | OWN_OCCUPIED | NUM_BEDROOMS | NUM_BATH | SQ_FT |
|---|---|---|---|---|---|---|---|
| 0 | 100001000.0 | 104.000000 | PUTNAM | Y | 3.0 | 1 | 1000.0 |
| 1 | 100002000.0 | 197.000000 | LEXINGTON | N | 3.0 | 1.5 | -1.0 |
| 2 | 100003000.0 | 191.428571 | LEXINGTON | N | -1.0 | 1 | 850.0 |
| 3 | 100004000.0 | 201.000000 | BERKELEY | 12 | 1.0 | -1 | 700.0 |
| 4 | 12345.0 | 203.000000 | BERKELEY | Y | 3.0 | 2 | 1600.0 |
| 5 | 100006000.0 | 207.000000 | BERKELEY | Y | -1.0 | 1 | 800.0 |
| 6 | 100007000.0 | 191.428571 | WASHINGTON | -1 | 2.0 | HURLEY | 950.0 |
| 7 | 100008000.0 | 213.000000 | TREMONT | Y | 1.0 | 1 | -1.0 |
| 8 | 100009000.0 | 215.000000 | TREMONT | Y | -1.0 | 2 | 1800.0 |
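As mentioned above, numeric columns are commonly filled with the median and categorical columns with the most frequent value; a sketch along those lines (not reassigned back to df):

```python
# median for a numeric column, most frequent value for a text column
df.fillna({'SQ_FT': df['SQ_FT'].median(),
           'OWN_OCCUPIED': df['OWN_OCCUPIED'].mode()[0]})
```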
python
# Drop duplicate rows (here judged by ST_NAME only)
df.drop_duplicates(['ST_NAME'])
| | PID | ST_NUM | ST_NAME | OWN_OCCUPIED | NUM_BEDROOMS | NUM_BATH | SQ_FT |
|---|---|---|---|---|---|---|---|
| 0 | 100001000.0 | 104.0 | PUTNAM | Y | 3.0 | 1 | 1000.0 |
| 1 | 100002000.0 | 197.0 | LEXINGTON | N | 3.0 | 1.5 | NaN |
| 3 | 100004000.0 | 201.0 | BERKELEY | 12 | 1.0 | NaN | 700.0 |
| 6 | 100007000.0 | NaN | WASHINGTON | NaN | 2.0 | HURLEY | 950.0 |
| 7 | 100008000.0 | 213.0 | TREMONT | Y | 1.0 | 1 | NaN |
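keep= controls which of the duplicates survives ('first' by default, 'last', or False to drop them all), and duplicated() returns the underlying boolean mask. A short sketch on the same df:

```python
# keep the last occurrence of each ST_NAME instead of the first
df.drop_duplicates(['ST_NAME'], keep='last')
# boolean mask marking rows whose ST_NAME already appeared earlier
df.duplicated(subset=['ST_NAME'])
```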