pandas库 - 技术栈

1、概述

Pandas 是一个开源的第三方 Python 库，在 Numpy 和 Matplotlib 的基础上构建而来
Pandas 名字衍生自术语 "panel data"（面板数据）和 "Python data analysis"（Python 数据分析）
Pandas 已经成为 Python 数据分析的必备高级工具，它的目标是成为任何编程语言中最强大、最灵活的开源数据分析工具
Pandas 是 Python 语言的一个扩展程序库，用于数据分析
Pandas 是一个开放源码、BSD 许可的库，提供高性能、易于使用的数据结构和数据分析工具
Pandas 是一个强大的分析结构化数据的工具集，基础是 Numpy（提供高性能的矩阵运算）
Pandas 可以从各种文件格式比如 CSV、JSON、SQL、Microsoft Excel 导入数据
Pandas 可以对各种数据进行运算操作，比如归并、再成形、选择，还具备数据清洗和数据加工等功能
Pandas 广泛应用在学术、金融、统计学等各个数据分析领域
Pandas 的出现使得 Python 做数据分析的能力得到了大幅度提升，它主要实现了数据分析的五个重要环节：加载数据、整理数据、操作数据、构建数据模型、分析数据

2、安装

复制代码

pip install pandas==1.1.5 -i https://pypi.tuna.tsinghua.edu.cn/simple/

3、方法示例

python 复制代码

# pandas为必要包
# numpy和matplotlib作为支撑，在pandas中经常能用到
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

python 复制代码

def pd_series():
    """Demonstrate the common ways of constructing a ``pd.Series``.

    Builds a Series from nothing, a list, a list with an explicit index,
    a NumPy array, a dict and a scalar, printing each one. Returns ``None``;
    the only effect is the text written to standard output.
    """
    # Empty Series. The dtype is passed explicitly: pandas 1.x emits a
    # DeprecationWarning and falls back to float64 when it is omitted,
    # whereas pandas 2.x silently uses object. Pinning it keeps the printed
    # result ("dtype: object") identical on every version.
    series01 = pd.Series(dtype=object)
    print(series01)
    print()

    # Series initialised from a list; a default 0..n-1 index is assigned.
    series02 = pd.Series(['张三', '李四', '王五', '赵六'])
    print(series02)
    print()

    # Series with explicit index labels.
    series03 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
    print(series03)
    print()

    # Series built from a NumPy ndarray.
    arr = np.array([1, 2, 3, 4])
    series04 = pd.Series(arr)
    print(series04)
    print()

    # Series built from a dict: the keys become the index labels.
    dic = {'name': '张三', 'age': 18, 'sex': '男'}
    series05 = pd.Series(dic)
    print(series05)
    print()

    # Series built from a scalar: the value is repeated once per index label.
    # Without an index, pd.Series(5) would hold a single element.
    series06 = pd.Series(5, index=['a', 'b', 'c'])
    print(series06)


if __name__ == '__main__':
    pd_series()

复制代码

Series([], dtype: object)

0    张三
1    李四
2    王五
3    赵六
dtype: object

a    1
b    2
c    3
d    4
dtype: int64

0    1
1    2
2    3
3    4
dtype: int32

name    张三
age     18
sex      男
dtype: object

a    5
b    5
c    5
dtype: int64

python 复制代码

def pd_series_atr():
    """Print the basic attributes of a five-element Series, one per line.

    The attributes are shown in this order:
    axes (list holding the row index), dtype (element type), empty (whether
    the Series has no elements), ndim (number of dimensions), size (element
    count), values (the data as an ndarray) and index (the row index).
    """
    letters = pd.Series(list('abcde'))
    attribute_names = ('axes', 'dtype', 'empty', 'ndim', 'size', 'values', 'index')
    for attribute in attribute_names:
        print(getattr(letters, attribute))


if __name__ == '__main__':
    pd_series_atr()

复制代码

[RangeIndex(start=0, stop=5, step=1)]
object
False
1
5
['a' 'b' 'c' 'd' 'e']
RangeIndex(start=0, stop=5, step=1)

python 复制代码

def pd_series_method():
    """Run the commonly used Series methods on a sample with one missing value.

    Prints the Series itself, then the result of head, tail, isnull, notnull,
    describe, sort_values and value_counts, each followed by a blank line.
    """
    sample = pd.Series(['a', 'b', 'c', 'd', 'e', None])
    print(sample)
    print()

    # Methods whose output is introduced by a heading line.
    # head/tail return 5 elements by default; a count may be passed instead.
    # isnull/notnull flag, per element, whether the value is missing / present.
    headed_calls = (
        ('前5个元素为:', sample.head),
        ('后5个元素为:', sample.tail),
        ('该对象元素为:', sample.isnull),
        ('该对象元素为:', sample.notnull),
    )
    for heading, method in headed_calls:
        print(heading)
        print(method())
        print()

    # Methods printed without a heading: summary statistics, the values in
    # sorted order, and the number of occurrences of each distinct value.
    for method in (sample.describe, sample.sort_values, sample.value_counts):
        print(method())
        print()


if __name__ == '__main__':
    pd_series_method()

复制代码

0       a
1       b
2       c
3       d
4       e
5    None
dtype: object

前5个元素为:
0    a
1    b
2    c
3    d
4    e
dtype: object

后5个元素为:
1       b
2       c
3       d
4       e
5    None
dtype: object

该对象元素为:
0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool

该对象元素为:
0     True
1     True
2     True
3     True
4     True
5    False
dtype: bool

count     5
unique    5
top       a
freq      1
dtype: object

0       a
1       b
2       c
3       d
4       e
5    None
dtype: object

a    1
b    1
c    1
d    1
e    1
Name: count, dtype: int64

python 复制代码

def pd_dataframe():
    """Demonstrate the common ways of constructing a ``pd.DataFrame``.

    Each frame is printed and followed by a blank line.
    """
    def show(frame):
        # Print one frame plus the separating blank line.
        print(frame)
        print()

    names = ['张三', '李四', '王五', '赵六']

    # Empty DataFrame.
    show(pd.DataFrame())

    # DataFrame from a flat list: a single column labelled 0.
    show(pd.DataFrame(names))

    # Same data, but with the column label given explicitly.
    show(pd.DataFrame(names, columns=['name']))

    # DataFrame from a list of rows (two-dimensional data).
    rows = [
        ['张三', 21, '男'],
        ['李四', 20, '女'],
        ['王五', 19, '男'],
    ]
    show(pd.DataFrame(rows, columns=['name', 'age', 'sex']))

    # DataFrame from a dict that maps column label -> list of values.
    by_column = {
        'name': ['张三', '李四', '王五'],
        'age': [21, 20, 19],
        'sex': ['男', '女', '男'],
    }
    show(pd.DataFrame(by_column))

    # DataFrame from a dict that maps column label -> Series.
    series_by_column = {
        label: pd.Series(values) for label, values in by_column.items()
    }
    show(pd.DataFrame(series_by_column))


if __name__ == '__main__':
    pd_dataframe()

复制代码

Empty DataFrame
Columns: []
Index: []

    0
0  张三
1  李四
2  王五
3  赵六

  name
0   张三
1   李四
2   王五
3   赵六

  name  age sex
0   张三   21   男
1   李四   20   女
2   王五   19   男

  name  age sex
0   张三   21   男
1   李四   20   女
2   王五   19   男

  name  age sex
0   张三   21   男
1   李四   20   女
2   王五   19   男

python 复制代码

import pandas as pd
def pd_col():
    """Demonstrate reading, adding, replacing, inserting and dropping columns."""
    people = pd.DataFrame({
        'name': ['张三', '李四', '王五'],
        'age': [21, 20, 19],
        'sex': ['男', '女', '男'],
    })

    # Read one column: as a plain Python list, and as a Series.
    print(list(people['name']))
    print(people['age'])

    # Assigning to a column label adds the column (or replaces an existing
    # one). A list or ndarray must have as many items as the frame has rows;
    # a Series is aligned on the index, and missing positions are filled in.
    people['address'] = ['北京', '上海', '广州']
    people['sex'] = pd.Series(['male', 'female', 'male'])
    print(people)

    # insert() places a new column at a given position:
    #   loc    - integer position of the new column
    #   column - label of the new column
    #   value  - the column's data
    people.insert(loc=0, column='id', value=pd.Series([0, 1, 2]))
    print(people)
    print()

    # All columns can be renamed at once by assigning a list of labels to
    # the frame's ``columns`` attribute (not done here).

    # Remove the 'id' column again; drop() returns a new frame, which is
    # rebound to the same name.
    people = people.drop(columns='id')
    print(f"df:{people}")


if __name__ == '__main__':
    pd_col()

复制代码

['张三', '李四', '王五']
0    21
1    20
2    19
Name: age, dtype: int64
  name  age     sex address
0   张三   21    male      北京
1   李四   20  female      上海
2   王五   19    male      广州
   id name  age     sex address
0   0   张三   21    male      北京
1   1   李四   20  female      上海
2   2   王五   19    male      广州

df:  name  age     sex address
0   张三   21    male      北京
1   李四   20  female      上海
2   王五   19    male      广州

python 复制代码

def pd_loc():
    """Demonstrate label-based selection with ``DataFrame.loc``.

    Prints the raw data dict, then four selections from a frame indexed by
    the labels 'a'..'d', each followed by a blank line.
    """
    data = dict(
        name=['张三', '李四', '王五', '赵六'],
        age=[21, 20, 19, 23],
        sex=['男', '女', '男', '男'],
        address=['北京', '上海', '广州', '深圳'],
    )
    print(data)
    print()

    frame = pd.DataFrame(data, index=list('abcd'))

    # loc selects by label. A single row or column comes back as a Series,
    # several rows or columns come back as a DataFrame.
    selectors = (
        'a',                               # row 'a' -> Series
        slice('a', 'c'),                   # rows 'a'..'c', both ends included -> DataFrame
        ('a', 'name'),                     # row 'a', column 'name' -> single value
        (['a', 'c'], ['name', 'age']),     # rows a, c and columns name, age -> DataFrame
    )
    for selector in selectors:
        print(frame.loc[selector])
        print()


if __name__ == '__main__':
    pd_loc()

复制代码

{'name': ['张三', '李四', '王五', '赵六'], 'age': [21, 20, 19, 23], 'sex': ['男', '女', '男', '男'], 'address': ['北京', '上海', '广州', '深圳']}

name       张三
age        21
sex         男
address    北京
Name: a, dtype: object

  name  age sex address
a   张三   21   男      北京
b   李四   20   女      上海
c   王五   19   男      广州

张三

  name  age
a   张三   21
c   王五   19

python 复制代码

def pd_iloc():
    """Demonstrate position-based selection with ``DataFrame.iloc``.

    Prints four iloc selections (each followed by a blank line) and finally
    a plain row slice of the frame.
    """
    data = dict(
        name=['张三', '李四', '王五', '赵六'],
        age=[21, 20, 19, 23],
        sex=['男', '女', '男', '男'],
        address=['北京', '上海', '广州', '深圳'],
    )
    frame = pd.DataFrame(data, index=list('abcd'))

    # iloc selects by integer position only; index or column labels are not
    # accepted.
    positions = (
        0,                      # first row -> Series
        slice(0, 2),            # rows 0 and 1 (end excluded) -> DataFrame
        (0, 1),                 # row 0, column 1 -> single value
        ([0, 2], [0, 2]),       # rows 0, 2 and columns 0, 2 -> DataFrame
    )
    for position in positions:
        print(frame.iloc[position])
        print()

    # Slicing the frame directly also selects rows by position, with the end
    # of the range excluded.
    print(frame[0:2])


if __name__ == '__main__':
    pd_iloc()

复制代码

name       张三
age        21
sex         男
address    北京
Name: a, dtype: object

  name  age sex address
a   张三   21   男      北京
b   李四   20   女      上海

21

  name sex
a   张三   男
c   王五   男

  name  age sex address
a   张三   21   男      北京
b   李四   20   女      上海

python 复制代码

def pd_append():
    """Append a Series to a DataFrame as a new row, then drop that row again.

    Prints the frame after the row 'd' has been added with ``pd.concat`` and,
    after a blank line, the frame with row 'd' removed. Returns ``None``.
    """
    data = {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}
    df = pd.DataFrame(data, index=['a', 'b', 'c'])

    # The Series must be indexed by the frame's column labels. With the
    # default 0..2 index the transposed row would carry columns 0, 1, 2, and
    # concat would produce a NaN-padded six-column frame instead of adding
    # the values 10, 20, 30 under A, B, C.
    s = pd.Series([10, 20, 30], index=df.columns, name='d')

    # to_frame().T turns the Series into a one-row frame whose row label is
    # the Series name; concat along axis 0 stacks it under the existing rows.
    df1 = pd.concat([df, s.to_frame().T], axis=0)
    print(df1)
    print()

    # drop with axis=0 removes rows by index label and returns a new frame.
    df2 = df1.drop(['d'], axis=0)
    print(df2)


if __name__ == '__main__':
    pd_append()

复制代码

     A    B    C     0     1     2
a  1.0  4.0  7.0   NaN   NaN   NaN
b  2.0  5.0  8.0   NaN   NaN   NaN
c  3.0  6.0  9.0   NaN   NaN   NaN
d  NaN  NaN  NaN  10.0  20.0  30.0

     A    B    C   0   1   2
a  1.0  4.0  7.0 NaN NaN NaN
b  2.0  5.0  8.0 NaN NaN NaN
c  3.0  6.0  9.0 NaN NaN NaN