learn_pandas

pandas

Pandas 是 Python 语言的一个扩展程序库,用于数据分析。

python 复制代码
import pandas as pd

pd.__version__
'2.2.2'

Series

相当于表格中的一列,带有索引的一维数组

索引标签不强制唯一(允许重复,但建议保持唯一),缺省为 RangeIndex(0, n)

有序

py 复制代码
pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
python 复制代码
# 指定索引和值
s1 = pd.Series(data=[1, 2, 3], index=['a', 'b', 'c'])
# 通过字典创建
s2 = pd.Series({1:'a', 2:'b', 3:'c'})

s1, s2
(a    1
 b    2
 c    3
 dtype: int64,
 1    a
 2    b
 3    c
 dtype: object)

基本信息

python 复制代码
# 获取统计信息
s1.min(), s1.max(), s1.mean(), s1.std(), s1.sum(),s2.value_counts()
(np.int64(1),
 np.int64(3),
 np.float64(2.0),
 np.float64(1.0),
 np.int64(6),
 a    1
 b    1
 c    1
 Name: count, dtype: int64)
python 复制代码
# 获取示例数据:head/tail 默认 5 行,sample 默认随机抽取 1 行
s2.head(2), s2.tail(2), s2.sample(2)
(1    a
 2    b
 dtype: object,
 2    b
 3    c
 dtype: object,
 1    a
 2    b
 dtype: object)
python 复制代码
# 获取属性
print(s1.dtype, s1.shape, s1.size)
print(s2.dtype, s2.shape, s2.size)

print(s1.index, s1.values)
int64 (3,) 3
object (3,) 3
Index(['a', 'b', 'c'], dtype='object') [1 2 3]

常用操作

python 复制代码
# 通过索引更改、添加元素
s1['a'], s1['d'] = -1, 4

# 使用 del 删除指定索引标签的元素。
del s1['b'] 

# 使用 drop 方法删除索引,并返回一个新的 Series
s_dropped = s1.drop(['c']) 

print(s1)
print(s_dropped)
a   -1
c    3
d    4
dtype: int64
a   -1
d    4
dtype: int64
python 复制代码
# 使用切片语法
s1['a':'c'], s1[-2:]
(a   -1
 c    3
 dtype: int64,
 c    3
 d    4
 dtype: int64)
python 复制代码
# 更改索引,并返回一个新的 Series
s1.rename({'a':'A','c':'C'})
A   -1
C    3
d    4
dtype: int64
python 复制代码
# 设置数据类型,并返回一个新的 Series
s1.astype('float')
a   -1.0
c    3.0
d    4.0
dtype: float64
python 复制代码
# 索引和值的关系
for index, value in s2.items():
    print(f'{index=}, {value=}')
index=1, value='a'
index=2, value='b'
index=3, value='c'

DataFrame

相当于二维表

py 复制代码
pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)

创建

list dict json numpy

python 复制代码
# 通过二维表(list, tuple, numpy)创建
data1 = [('Google', 10), ['Runoob', 12], ['Wiki', 13]]
df1 = pd.DataFrame(data1, columns=['Site', 'Age'])

# 通过字典的列表创建
data2 = [{'a': 1, 'b': 2},{'a': 5, 'b': 10, 'c': 20}]
df2 = pd.DataFrame(data2)

df1, df2
(     Site  Age
 0  Google   10
 1  Runoob   12
 2    Wiki   13,
    a   b     c
 0  1   2   NaN
 1  5  10  20.0)
python 复制代码
# 通过列表的字典创建
data = {'name':['Tom', 'Bob', 'peter', 'Tom'],
         'age':[18, 19, 14, 10],
         'gender':['male', 'female', 'male', 'female']}
df = pd.DataFrame(data)
df

| | name | age | gender |
| 0 | Tom | 18 | male |
| 1 | Bob | 19 | female |
| 2 | peter | 14 | male |
| 3 | Tom | 10 | female |

基本信息

python 复制代码
# 获取示例数据:head/tail 默认 5 行,sample 默认随机抽取 1 行
df.head(2), df.tail(2), df.sample(2)
(  name  age  gender
 0  Tom   18    male
 1  Bob   19  female,
     name  age  gender
 2  peter   14    male
 3    Tom   10  female,
   name  age  gender
 1  Bob   19  female
 3  Tom   10  female)
python 复制代码
# 获取信息
df.info(), df.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
 1   age     4 non-null      int64 
 2   gender  4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes





(None,
              age
 count   4.000000
 mean   15.250000
 std     4.112988
 min    10.000000
 25%    13.000000
 50%    16.000000
 75%    18.250000
 max    19.000000)
python 复制代码
# 获取属性
df.columns, df.index, df.dtypes, df.shape
(Index(['name', 'age', 'gender'], dtype='object'),
 RangeIndex(start=0, stop=4, step=1),
 name      object
 age        int64
 gender    object
 dtype: object,
 (4, 3))

切片和定位

python 复制代码
print("df[['name', 'age']]\n", df[['name', 'age']])  # 提取多列
print("\ndf[1:3]\n", df[1:3])               # 切片行
print("\ndf.loc[1:2, ['name', 'age']]\n"
      ,df.loc[1:2, ['name', 'age']])  # 标签索引提取指定行列
print("\ndf.iloc[:, 1:]\n",df.iloc[:, 1:])        # 位置索引提取指定列
df[['name', 'age']]
     name  age
0    Tom   18
1    Bob   19
2  peter   14
3    Tom   10

df[1:3]
     name  age  gender
1    Bob   19  female
2  peter   14    male

df.loc[1:2, ['name', 'age']]
     name  age
1    Bob   19
2  peter   14

df.iloc[:, 1:]
    age  gender
0   18    male
1   19  female
2   14    male
3   10  female
python 复制代码
# 布尔索引,筛选True
r1 = df['name'] == 'Tom'
r2 = df['age'] < 18
df[r1], df[r2]
(  name  age  gender
 0  Tom   18    male
 3  Tom   10  female,
     name  age  gender
 2  peter   14    male
 3    Tom   10  female)
python 复制代码
# 更改值
df.loc[df['name']=='Bob', 'age']=15
df

| | name | age | gender |
| 0 | Tom | 18 | male |
| 1 | Bob | 15 | female |
| 2 | peter | 14 | male |
| 3 | Tom | 10 | female |
python 复制代码
# 重命名行(index,axis=0)和列(columns,axis=1)  
# 默认 axis=0,inplace=False
df.rename(columns={'name':'Name'},index={1:-1})
# df.rename({1:-1})
# df.rename({'name':'Name'},axis=1).rename({1:-1},axis=0)

| | Name | age | gender |
| 0 | Tom | 18 | male |
| -1 | Bob | 15 | female |
| 2 | peter | 14 | male |
| 3 | Tom | 10 | female |

索引

index

python 复制代码
# DataFrame.set_index(keys, drop=True, append=False, inplace=False, 
#                         verify_integrity=False)

# 设置keys列为索引
# drop=True表示删除该列
# append=False表示删除原索引。True保留形成多级索引
# verify_integrity=False不检查唯一性
df.set_index('name')

| | age | gender |
| name | | |
| Tom | 18 | male |
| Bob | 15 | female |
| peter | 14 | male |
| Tom | 10 | female |
python 复制代码
# 重置索引
# drop=False,原索引会被添加为新的一列
df.reset_index(drop=False,inplace=False)

| | index | name | age | gender |
| 0 | 0 | Tom | 18 | male |
| 1 | 1 | Bob | 15 | female |
| 2 | 2 | peter | 14 | male |
| 3 | 3 | Tom | 10 | female |

排序

sort_values sort_index

python 复制代码
# 值排序,默认升序(ascending=True),生成新的DataFrame  
# 可通过 key= 指定排序函数,接受一个 pd.Series 对象
df.sort_values(by='age',ascending=False,inplace=False)
df.sort_values(by='name',key=lambda x:x.str.len())

| | name | age | gender |
| 0 | Tom | 18 | male |
| 1 | Bob | 15 | female |
| 3 | Tom | 10 | female |
| 2 | peter | 14 | male |
python 复制代码
# 索引排序,通过 axis=0 or 1 指定行列
# 默认ascending=True升序
df.sort_index(ascending=False)

| | name | age | gender |
| 3 | Tom | 10 | female |
| 2 | peter | 14 | male |
| 1 | Bob | 15 | female |
| 0 | Tom | 18 | male |

行列操作

insert concat drop

python 复制代码
# 插入列
df['NAME'] = df['name'].str.upper()
df.insert(loc=4,column='note',value=['a','b','c','d'],allow_duplicates=True)

# 插入行
df.loc[df.index[-1]+1] = {'name':'Alice','age':1}
df.loc[df.index[-1]+1] = ['Joe',2,'male','',None]
new_line = pd.Series({'name':'Mike','age':11,'gender':'male'})
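pd.concat([df,new_line]) # 注意:concat 返回新对象,不会修改 df;且 Series 直接拼接会被当作一列,若要作为一行追加应写 pd.concat([df, new_line.to_frame().T], ignore_index=True)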

df

| | name | age | gender | NAME | note |
| 0 | Tom | 18 | male | TOM | a |
| 1 | Bob | 15 | female | BOB | b |
| 2 | peter | 14 | male | PETER | c |
| 3 | Tom | 10 | female | TOM | d |
| 4 | Alice | 1 | NaN | NaN | NaN |
| 5 | Joe | 2 | male | | None |
python 复制代码
# 删除行列,生成新的 DataFrame
df.drop(index=[0,1]).drop(columns='NAME')

| | name | age | gender | note |
| 2 | peter | 14 | male | c |
| 3 | Tom | 10 | female | d |
| 4 | Alice | 1 | NaN | NaN |
| 5 | Joe | 2 | male | None |

分组

python 复制代码
bins = [0, 10, 15, float('inf')]
labels = ['<10','10-15','>15']
df['age_priod'] = pd.cut(df['age'],bins=bins,labels=labels)
df

| | name | age | gender | NAME | note | age_priod |
| 0 | Tom | 18 | male | TOM | a | >15 |
| 1 | Bob | 15 | female | BOB | b | 10-15 |
| 2 | peter | 14 | male | PETER | c | 10-15 |
| 3 | Tom | 10 | female | TOM | d | <10 |
| 4 | Alice | 1 | NaN | NaN | NaN | <10 |
| 5 | Joe | 2 | male | | None | <10 |
python 复制代码
df.groupby('gender')['age'].mean()
gender
female    12.500000
male      11.333333
Name: age, dtype: float64

长宽格式转换

pivot 和 melt

python 复制代码
data = {'year': [2010, 2011, 2010, 2011],  
        'product': ['A', 'A', 'B', 'B'],  
        'sales': [5, 6, 7, 8]}  
df = pd.DataFrame(data)

pivoted_df = df.pivot(index='year', columns='product', values='sales')  
melted_df = (pivoted_df.reset_index()
            .melt(id_vars=['year'], var_name='product', value_name='sales'))

pivoted_df, melted_df
(product  A  B
 year         
 2010     5  7
 2011     6  8,
    year product  sales
 0  2010       A      5
 1  2011       A      6
 2  2010       B      7
 3  2011       B      8)

拼接、合并

concat 和 merge

python 复制代码
# pandas.concat(objs, axis=0, join='outer', ignore_index=False, keys=None, 
#                 levels=None, names=None, verify_integrity=False, sort=False)
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'],  
                    'B': ['B0', 'B1', 'B2']})  
df2 = pd.DataFrame({'A': ['A3', 'A4'],  
                    'B': ['B3', 'B4']})  
  
r1 = pd.concat([df1, df2])
r2 = pd.concat([df1, df2], axis=1)
r1, r2
(    A   B
 0  A0  B0
 1  A1  B1
 2  A2  B2
 0  A3  B3
 1  A4  B4,
     A   B    A    B
 0  A0  B0   A3   B3
 1  A1  B1   A4   B4
 2  A2  B2  NaN  NaN)
python 复制代码
# pandas.merge(left, right, how='inner', on=None, left_on=None, right_on=None, 
#                 left_index=False, right_index=False, sort=False,
#                 suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
df1 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],  
                    'A': ['A0', 'A1', 'A2', 'A3'],  
                    'B': ['B0', 'B1', 'B2', 'B3']})  
df2 = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K4'],  
                    'C': ['C0', 'C1', 'C2', 'C3'],  
                    'D': ['D0', 'D1', 'D2', 'D3']})  
  
result = pd.merge(df1, df2, on='key')  
result

| | key | A | B | C | D |
| 0 | K0 | A0 | B0 | C0 | D0 |
| 1 | K1 | A1 | B1 | C1 | D1 |
| 2 | K2 | A2 | B2 | C2 | D2 |

文件操作

csv json excel

python 复制代码
# 读取
df = pd.read_csv('inputs/data.csv')
df = pd.read_excel('inputs/data.xlsx')
df = pd.read_json('inputs/data.json')
# df = pd.read_html('inputs/data.html') # 需要lxml
df

| | year | product | sales |
| 0 | 2010 | A | 5 |
| 1 | 2011 | A | 6 |
| 2 | 2010 | B | 7 |
| 3 | 2011 | B | 8 |
python 复制代码
# 保存
df.to_csv('outputs/data.csv',index=False)
df.to_excel('outputs/data.xlsx',index=False)
df.to_json('outputs/data.json')
df.to_html('outputs/data.html')

数据清洗

python 复制代码
pd.read_csv('inputs/property-data.csv')

| | PID | ST_NUM | ST_NAME | OWN_OCCUPIED | NUM_BEDROOMS | NUM_BATH | SQ_FT |
| 0 | 100001000.0 | 104.0 | PUTNAM | Y | 3 | 1 | 1000 |
| 1 | 100002000.0 | 197.0 | LEXINGTON | N | 3 | 1.5 | -- |
| 2 | 100003000.0 | NaN | LEXINGTON | N | NaN | 1 | 850 |
| 3 | 100004000.0 | 201.0 | BERKELEY | 12 | 1 | NaN | 700 |
| 4 | NaN | 203.0 | BERKELEY | Y | 3 | 2 | 1600 |
| 5 | 100006000.0 | 207.0 | BERKELEY | Y | NaN | 1 | 800 |
| 6 | 100007000.0 | NaN | WASHINGTON | NaN | 2 | HURLEY | 950 |
| 7 | 100008000.0 | 213.0 | TREMONT | Y | 1 | 1 | NaN |
| 8 | 100009000.0 | 215.0 | TREMONT | Y | na | 2 | 1800 |
python 复制代码
# 指定空值
missing_values = ['n/a', 'na', '--']
df = pd.read_csv('inputs/property-data.csv', na_values=missing_values)
df

| | PID | ST_NUM | ST_NAME | OWN_OCCUPIED | NUM_BEDROOMS | NUM_BATH | SQ_FT |
| 0 | 100001000.0 | 104.0 | PUTNAM | Y | 3.0 | 1 | 1000.0 |
| 1 | 100002000.0 | 197.0 | LEXINGTON | N | 3.0 | 1.5 | NaN |
| 2 | 100003000.0 | NaN | LEXINGTON | N | NaN | 1 | 850.0 |
| 3 | 100004000.0 | 201.0 | BERKELEY | 12 | 1.0 | NaN | 700.0 |
| 4 | NaN | 203.0 | BERKELEY | Y | 3.0 | 2 | 1600.0 |
| 5 | 100006000.0 | 207.0 | BERKELEY | Y | NaN | 1 | 800.0 |
| 6 | 100007000.0 | NaN | WASHINGTON | NaN | 2.0 | HURLEY | 950.0 |
| 7 | 100008000.0 | 213.0 | TREMONT | Y | 1.0 | 1 | NaN |
| 8 | 100009000.0 | 215.0 | TREMONT | Y | NaN | 2 | 1800.0 |
python 复制代码
# 移除指定列有空值的数据
df.dropna(subset=['NUM_BEDROOMS'])

| | PID | ST_NUM | ST_NAME | OWN_OCCUPIED | NUM_BEDROOMS | NUM_BATH | SQ_FT |
| 0 | 100001000.0 | 104.0 | PUTNAM | Y | 3.0 | 1 | 1000.0 |
| 1 | 100002000.0 | 197.0 | LEXINGTON | N | 3.0 | 1.5 | NaN |
| 3 | 100004000.0 | 201.0 | BERKELEY | 12 | 1.0 | NaN | 700.0 |
| 4 | NaN | 203.0 | BERKELEY | Y | 3.0 | 2 | 1600.0 |
| 6 | 100007000.0 | NaN | WASHINGTON | NaN | 2.0 | HURLEY | 950.0 |
| 7 | 100008000.0 | 213.0 | TREMONT | Y | 1.0 | 1 | NaN |
python 复制代码
# 替换空字段
# 通常使用均值mean(), 中位数median(), 众数mode()替换
(df.fillna({'PID':12345})
    .fillna({'ST_NUM':df["ST_NUM"].mean()})
    .fillna(-1))

| | PID | ST_NUM | ST_NAME | OWN_OCCUPIED | NUM_BEDROOMS | NUM_BATH | SQ_FT |
| 0 | 100001000.0 | 104.000000 | PUTNAM | Y | 3.0 | 1 | 1000.0 |
| 1 | 100002000.0 | 197.000000 | LEXINGTON | N | 3.0 | 1.5 | -1.0 |
| 2 | 100003000.0 | 191.428571 | LEXINGTON | N | -1.0 | 1 | 850.0 |
| 3 | 100004000.0 | 201.000000 | BERKELEY | 12 | 1.0 | -1 | 700.0 |
| 4 | 12345.0 | 203.000000 | BERKELEY | Y | 3.0 | 2 | 1600.0 |
| 5 | 100006000.0 | 207.000000 | BERKELEY | Y | -1.0 | 1 | 800.0 |
| 6 | 100007000.0 | 191.428571 | WASHINGTON | -1 | 2.0 | HURLEY | 950.0 |
| 7 | 100008000.0 | 213.000000 | TREMONT | Y | 1.0 | 1 | -1.0 |
| 8 | 100009000.0 | 215.000000 | TREMONT | Y | -1.0 | 2 | 1800.0 |
python 复制代码
# 移除 ST_NAME 列取值重复的行(默认保留首次出现的那一行)
df.drop_duplicates(['ST_NAME']) 

| | PID | ST_NUM | ST_NAME | OWN_OCCUPIED | NUM_BEDROOMS | NUM_BATH | SQ_FT |
| 0 | 100001000.0 | 104.0 | PUTNAM | Y | 3.0 | 1 | 1000.0 |
| 1 | 100002000.0 | 197.0 | LEXINGTON | N | 3.0 | 1.5 | NaN |
| 3 | 100004000.0 | 201.0 | BERKELEY | 12 | 1.0 | NaN | 700.0 |
| 6 | 100007000.0 | NaN | WASHINGTON | NaN | 2.0 | HURLEY | 950.0 |
| 7 | 100008000.0 | 213.0 | TREMONT | Y | 1.0 | 1 | NaN |
相关推荐
雯0609~1 小时前
c#:winform调用bartender实现打印(学习整理笔记)
开发语言·c#
胜天半子_王二_王半仙2 小时前
c++源码阅读__smart_ptr__正文阅读
开发语言·c++·开源
沐泽Mu2 小时前
嵌入式学习-C嘎嘎-Day08
开发语言·c++·算法
Non importa2 小时前
汉诺塔(hanio)--C语言函数递归
c语言·开发语言·算法·学习方法
LinuxST2 小时前
27、基于Firefly-rk3399中断休眠唤醒实验(按键中断)
linux·开发语言·stm32·嵌入式硬件
Tony_long74832 小时前
Python学习——猜拳小游戏
开发语言·python·学习
跳动的梦想家h2 小时前
黑马点评 秒杀下单出现的问题:服务器异常---java.lang.NullPointerException: null(已解决)
java·开发语言·redis
ac-er88883 小时前
PHP 二分法查找算法
开发语言·算法·php
陈苏同学3 小时前
机器翻译 & 数据集 (NLP基础 - 预处理 → tokenize → 词表 → 截断/填充 → 迭代器) + 代码实现 —— 笔记3.9《动手学深度学习》
人工智能·pytorch·笔记·python·深度学习·自然语言处理·机器翻译
流着口水看上帝3 小时前
JavaScript完整原型链
开发语言·javascript·原型模式