【算法工程师】—— Python 数据分析

Python 数据分析

Numpy

特点：

ndarray：N维数组对象，快速高效
向量化操作：避免循环，提高性能
广播机制：不同形状数组的运算
丰富的数学函数：线性代数、傅里叶变换等

数组创建与初始化

函数/方法	作用	参数示例
`np.array()`	从列表/元组创建数组	`np.array([1,2,3])`
`np.zeros()`	创建全0数组	`np.zeros((3,3))`
`np.ones()`	创建全1数组	`np.ones((2,4))`
`np.full()`	创建指定值数组	`np.full((3,3), 5)`
`np.arange()`	创建等差数组	`np.arange(0, 10, 2)`
`np.linspace()`	创建等间隔数组	`np.linspace(0, 1, 5)`
`np.random.rand()`	随机数组(0-1)	`np.random.rand(3,3)`
`np.random.randn()`	标准正态分布	`np.random.randn(100)`
`np.random.randint()`	随机整数数组	`np.random.randint(0,10,(3,3))`
`np.eye()`	单位矩阵	`np.eye(3)`
`np.diag()`	对角矩阵	`np.diag([1,2,3])`
`np.empty()`	未初始化数组	`np.empty((3,3))`

python 复制代码

import numpy as np

# 2.1 Basic array creation

# Arrays built from (nested) Python lists
arr1 = np.array([1, 2, 3, 4, 5])
arr2d = np.array([[1, 2, 3],
                  [4, 5, 6]])

# Constant-filled arrays
zeros = np.zeros(shape=(3, 3))                 # 3x3 array of zeros
ones = np.ones(shape=(2, 4))                   # 2x4 array of ones
full = np.full(shape=(3, 3), fill_value=255)   # 3x3 array filled with 255 (common for images)

# Evenly spaced sequences
range_arr = np.arange(0, 10, 2)                # [0, 2, 4, 6, 8]
linspace_arr = np.linspace(0, 1, num=5)        # [0., 0.25, 0.5, 0.75, 1.]

# Random arrays
random_arr = np.random.rand(3, 3)              # 3x3 samples in [0, 1)
normal_arr = np.random.randn(100)              # 100 standard-normal samples
int_random = np.random.randint(low=0, high=256, size=(3, 3))  # 3x3 integers in 0..255

# Matrices
identity = np.eye(3)                           # 3x3 identity matrix
diagonal = np.diag([1, 2, 3])                  # matrix with 1, 2, 3 on the diagonal

print("基础数组创建示例完成")
print(f"数组形状 zeros: {zeros.shape}, 类型: {zeros.dtype}")

数组属性与信息

属性/方法	作用	示例
`.shape`	数组形状	`arr.shape`
`.ndim`	数组维度	`arr.ndim`
`.size`	元素总数	`arr.size`
`.dtype`	数据类型	`arr.dtype`
`.itemsize`	元素字节数	`arr.itemsize`
`.nbytes`	总字节数	`arr.nbytes`
`np.shape()`	获取形状	`np.shape(arr)`
`np.ndim()`	获取维度	`np.ndim(arr)`
`np.size()`	获取元素数	`np.size(arr)`
`np.result_type()`	获取类型	`np.result_type(arr)`

python 复制代码

# 3.1 Array attributes
arr = np.random.rand(3, 4, 5)

print("数组属性:")
print(f"形状 shape: {arr.shape}")      # (3, 4, 5)
print(f"维度 ndim: {arr.ndim}")        # 3
print(f"元素总数 size: {arr.size}")    # 60
print(f"数据类型 dtype: {arr.dtype}")  # float64
print(f"元素字节数 itemsize: {arr.itemsize}")  # 8 bytes per float64 element
print(f"总字节数 nbytes: {arr.nbytes}")        # 60 * 8 = 480 bytes

# 3.2 Type conversion
# int_arr is created with an integer dtype so that astype() below really
# demonstrates an int -> float conversion (it was float32 before, which made
# the variable name and the printed "conversion" misleading).
int_arr = np.array([1, 2, 3], dtype=np.int32)
float_arr = int_arr.astype(np.float64)  # astype returns a converted copy
uint8_arr = np.array([0, 128, 255], dtype=np.uint8)  # uint8 is the usual image dtype

print(f"类型转换: {int_arr.dtype} -> {float_arr.dtype}")
print(f"uint8范围: [{uint8_arr.min()}, {uint8_arr.max()}]")

数组索引与切片

操作	语法	说明
基本索引	`arr[index]`	单个元素
切片	`arr[start:end:step]`	切片操作
多维索引	`arr[row, col]`	多维数组索引
布尔索引	`arr[mask]`	使用布尔数组
花式索引	`arr[[indices]]`	使用整数数组
`:`	`arr[:]`	所有元素
`...`	`arr[..., 0]`	省略号索引

python 复制代码

# 4.1 Build a 4x6 test array holding the integers 0..23
arr = np.arange(24).reshape((4, 6))
print("原始数组:")
print(arr)

# 4.2 Basic indexing: single element, whole row, whole column
print(f"\n单个元素 arr[2, 3]: {arr[2, 3]}")          # 15
print(f"整行 arr[1]: {arr[1]}")                      # row with index 1
print(f"整列 arr[:, 2]: {arr[:, 2]}")                # column with index 2

# 4.3 Slicing
print("\n切片 arr[1:3, 2:5]:")
print(arr[1:3, 2:5])  # rows 1-2, columns 2-4

print("\n带步长 arr[::2, ::2]:")
print(arr[::2, ::2])  # every second row and every second column

# 4.4 Boolean indexing
mask = arr > 10
print("\n布尔索引 (arr > 10):")
print(arr[mask])  # 1-D array of all elements greater than 10

# 4.5 Fancy (integer-list) indexing
indices = [0, 2, 3]
print("\n花式索引 arr[[0, 2, 3]]:")
print(arr[indices])  # rows 0, 2 and 3

# 4.6 Ellipsis indexing
arr_3d = np.arange(60).reshape((3, 4, 5))
print("\n省略号索引 arr_3d[..., 0]:")
print(arr_3d[..., 0])  # index 0 along the last axis, all leading axes kept

形状操作与重塑

函数/方法	作用	示例
`.reshape()`	重塑形状	`arr.reshape((3,4))`
`.resize()`	改变数组形状	`arr.resize((3,4))`
`.flatten()`	展平为一维	`arr.flatten()`
`.ravel()`	展平（视图）	`arr.ravel()`
`.transpose()`	转置	`arr.transpose()` 或 `arr.T`
`.swapaxes()`	交换轴	`arr.swapaxes(0,1)`
`.squeeze()`	移除单维度	`arr.squeeze()`
`np.expand_dims()`	增加维度	`np.expand_dims(arr, axis)`
`np.concatenate()`	连接数组	`np.concatenate((a,b), axis)`
`np.stack()`	堆叠数组	`np.stack((a,b), axis)`
`np.vstack()`	垂直堆叠	`np.vstack((a,b))`
`np.hstack()`	水平堆叠	`np.hstack((a,b))`
`np.split()`	分割数组	`np.split(arr, indices)`

python 复制代码

# 5.1 Reshape a flat 12-element array into 3 rows x 4 columns
arr = np.arange(12)
reshaped = arr.reshape((3, 4))
print(f"reshape(3,4):\n{reshaped}")

# 5.2 Flatten back to one dimension
flattened = reshaped.flatten()  # flatten() always returns a copy
raveled = reshaped.ravel()      # ravel() returns a view when possible
print(f"\nflatten: {flattened}")
print(f"ravel: {raveled}")

# 5.3 Transpose
transposed = reshaped.T
print(f"\n转置:\n{transposed}")

# 5.4 Stacking two 2x2 arrays
a = np.array([[1, 2],
              [3, 4]])
b = np.array([[5, 6],
              [7, 8]])

vstacked = np.vstack((a, b))        # stack vertically   -> shape (4, 2)
hstacked = np.hstack((a, b))        # stack horizontally -> shape (2, 4)
stacked = np.stack((a, b), axis=0)  # stack along a new leading axis -> shape (2, 2, 2)

print(f"\n垂直堆叠:\n{vstacked}")
print(f"\n水平堆叠:\n{hstacked}")

# 5.5 Splitting
arr = np.arange(12).reshape((3, 4))
split_arrs = np.split(arr, 3, axis=0)  # three equal parts along the row axis
print("\n分割为3份:")
for i, sub_arr in enumerate(split_arrs):
    print(f"部分{i}:\n{sub_arr}")

# 5.6 Adding / removing length-1 axes
arr_1d = np.array([1, 2, 3])
arr_2d = np.expand_dims(arr_1d, axis=0)  # (3,) -> (1, 3)
arr_squeezed = arr_2d.squeeze()          # (1, 3) -> (3,)

print(f"\n增加维度: {arr_1d.shape} -> {arr_2d.shape}")
print(f"移除单维度: {arr_2d.shape} -> {arr_squeezed.shape}")

数学运算

基本运算

运算符	作用	示例
`+`	加法	`arr1 + arr2`
`-`	减法	`arr1 - arr2`
`*`	乘法	`arr1 * arr2`
`/`	除法	`arr1 / arr2`
`//`	整除	`arr1 // arr2`
`%`	取模	`arr1 % arr2`
`**`	幂运算	`arr ** 2`
`@`	矩阵乘法	`arr1 @ arr2`

通用函数（ufunc）

函数	作用	示例
`np.add()`	加法	`np.add(a, b)`
`np.subtract()`	减法	`np.subtract(a, b)`
`np.multiply()`	乘法	`np.multiply(a, b)`
`np.divide()`	除法	`np.divide(a, b)`
`np.power()`	幂运算	`np.power(a, 2)`
`np.sqrt()`	平方根	`np.sqrt(arr)`
`np.exp()`	指数	`np.exp(arr)`
`np.log()`	自然对数	`np.log(arr)`
`np.log10()`	常用对数	`np.log10(arr)`
`np.sin()`	正弦	`np.sin(arr)`
`np.cos()`	余弦	`np.cos(arr)`
`np.tan()`	正切	`np.tan(arr)`
`np.abs()`	绝对值	`np.abs(arr)`
`np.sign()`	符号函数	`np.sign(arr)`
`np.ceil()`	向上取整	`np.ceil(arr)`
`np.floor()`	向下取整	`np.floor(arr)`
`np.round()`	四舍五入（恰为 .5 时取最近的偶数）	`np.round(arr)`

python 复制代码

# 6.1 Element-wise arithmetic on two equal-length vectors
a = np.array([1, 2, 3, 4])
b = np.array([5, 6, 7, 8])

print("基本运算:")
print(f"加法: {a + b}")
print(f"减法: {a - b}")
print(f"乘法: {a * b}")
print(f"除法: {b / a}")
print(f"幂运算: {a ** 2}")

# 6.2 Universal functions (ufuncs), applied element by element
print("\n通用函数:")
print(f"平方根 sqrt: {np.sqrt(a)}")
print(f"指数 exp: {np.exp(a)}")
print(f"自然对数 log: {np.log(a)}")
print(f"绝对值 abs: {np.abs(np.array([-1, -2, 3]))}")

# 6.3 Trigonometric functions (angles in radians)
angles = np.array([0, np.pi / 4, np.pi / 2])
print("\n三角函数:")
print(f"sin: {np.sin(angles)}")
print(f"cos: {np.cos(angles)}")
print(f"tan: {np.tan(angles)}")

# 6.4 Rounding functions
arr_float = np.array([1.2, 2.7, 3.5, 4.1])
print("\n取整函数:")
print(f"ceil向上: {np.ceil(arr_float)}")
print(f"floor向下: {np.floor(arr_float)}")
# NOTE: np.round rounds exact halves to the nearest even value.
print(f"round四舍五入: {np.round(arr_float)}")

统计函数

函数	作用	示例
`np.sum()`	求和	`np.sum(arr)`
`np.mean()`	平均值	`np.mean(arr)`
`np.median()`	中位数	`np.median(arr)`
`np.std()`	标准差	`np.std(arr)`
`np.var()`	方差	`np.var(arr)`
`np.min()`	最小值	`np.min(arr)`
`np.max()`	最大值	`np.max(arr)`
`np.argmin()`	最小值索引	`np.argmin(arr)`
`np.argmax()`	最大值索引	`np.argmax(arr)`
`np.percentile()`	百分位数	`np.percentile(arr, 50)`
`np.ptp()`	极差	`np.ptp(arr)`
`np.cumsum()`	累积和	`np.cumsum(arr)`
`np.cumprod()`	累积积	`np.cumprod(arr)`
`np.histogram()`	直方图	`np.histogram(arr)`

python 复制代码

# 7.1 Test data: 100 samples from the standard normal distribution
arr = np.random.randn(100)

print("统计函数:")
print(f"求和 sum: {np.sum(arr):.3f}")
print(f"平均值 mean: {np.mean(arr):.3f}")
print(f"中位数 median: {np.median(arr):.3f}")
print(f"标准差 std: {np.std(arr):.3f}")
print(f"方差 var: {np.var(arr):.3f}")
print(f"最小值 min: {np.min(arr):.3f}")
print(f"最大值 max: {np.max(arr):.3f}")
print(f"极差 ptp: {np.ptp(arr):.3f}")

# 7.2 Positions of the extreme values
print("\n索引统计:")
print(f"最小值索引 argmin: {np.argmin(arr)}")
print(f"最大值索引 argmax: {np.argmax(arr)}")

# 7.3 Cumulative operations
arr_small = np.array([1, 2, 3, 4])
print("\n累积操作:")
print(f"原始数组: {arr_small}")
print(f"累积和 cumsum: {np.cumsum(arr_small)}")
print(f"累积积 cumprod: {np.cumprod(arr_small)}")

# 7.4 Percentiles
print("\n百分位数:")
print(f"25%分位数: {np.percentile(arr, 25):.3f}")
print(f"50%分位数(中位数): {np.percentile(arr, 50):.3f}")
print(f"75%分位数: {np.percentile(arr, 75):.3f}")

# 7.5 Histogram: per-bin counts and the 11 edges delimiting the 10 bins
hist, bins = np.histogram(arr, bins=10)
print("\n直方图(10个bin):")
print(f"计数: {hist}")
print(f"边界: {bins}")

线性代数运算

函数	作用	示例
`np.dot()`	点积/矩阵乘法	`np.dot(a, b)`
`@`	矩阵乘法运算符	`a @ b`
`np.matmul()`	矩阵乘法	`np.matmul(a, b)`
`np.linalg.inv()`	矩阵求逆	`np.linalg.inv(a)`
`np.linalg.det()`	行列式	`np.linalg.det(a)`
`np.linalg.eig()`	特征值/特征向量	`np.linalg.eig(a)`
`np.linalg.svd()`	奇异值分解	`np.linalg.svd(a)`
`np.linalg.norm()`	范数	`np.linalg.norm(a)`
`np.linalg.solve()`	解线性方程	`np.linalg.solve(A, b)`
`np.linalg.lstsq()`	最小二乘解	`np.linalg.lstsq(A, b)`
`np.linalg.qr()`	QR分解	`np.linalg.qr(a)`
`np.linalg.cholesky()`	Cholesky分解	`np.linalg.cholesky(a)`
`np.trace()`	矩阵迹	`np.trace(a)`

python 复制代码

# 8.1 Matrices and vectors used throughout this section
A = np.array([[1, 2], [3, 4]])
B = np.array([[5, 6], [7, 8]])
x = np.array([2, 3])    # known solution of A @ x = b (1*2 + 2*3 = 8, 3*2 + 4*3 = 18)
b = np.array([8, 18])

print("线性代数运算:")

# 8.2 Matrix multiplication: three equivalent spellings for 2-D arrays
print("\n矩阵乘法:")
print(f"dot: {np.dot(A, B)}")
print(f"@运算符: {A @ B}")
print(f"matmul: {np.matmul(A, B)}")

# 8.3 Matrix inverse
A_inv = np.linalg.inv(A)
print("\n矩阵求逆:")
print(f"A: {A}")
print(f"A的逆: {A_inv}")
print(f"A @ A_inv ≈ I: {A @ A_inv}")  # close to the identity matrix

# 8.4 Determinant
det = np.linalg.det(A)
print(f"\n行列式 det(A): {det:.3f}")

# 8.5 Eigenvalues and eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(A)
print(f"\n特征值: {eigenvalues}")
print(f"特征向量:\n{eigenvectors}")

# 8.6 Norms
# For a 2-D input np.linalg.norm defaults to the Frobenius norm, so the L2
# (spectral) norm must be requested explicitly with ord=2; otherwise both
# printed values would be the same Frobenius norm.
l2_norm = np.linalg.norm(A, 2)
fro_norm = np.linalg.norm(A, 'fro')
print("\n范数:")
print(f"L2范数: {l2_norm:.3f}")
print(f"Frobenius范数: {fro_norm:.3f}")

# 8.7 Solve the linear system A @ x = b
solution = np.linalg.solve(A, b)
print("\n解线性方程 A@x=b:")
print(f"解x: {solution}")
print(f"验证 A@x: {A @ solution}")

# 8.8 Singular value decomposition (SVD)
U, S, Vt = np.linalg.svd(A)
print("\n奇异值分解:")
print(f"U:\n{U}")
print(f"奇异值: {S}")
print(f"V转置:\n{Vt}")

# 8.9 Trace (sum of the diagonal)
trace = np.trace(A)
print(f"\n矩阵迹: {trace}")

广播机制

广播规则：

如果数组维度不同，将小维度数组形状前面补1
如果两个数组在某个维度上大小相同或其中一个为1，则可以广播
如果两个数组在任意一个维度上大小不同，且该维度上两者都不为1，则无法广播并报错

python 复制代码

# 9.1 Broadcasting examples
print("广播机制示例:")

# Example 1: scalar with array
arr = np.array([[1, 2, 3], [4, 5, 6]])
result = arr + 10  # the scalar 10 is broadcast over the whole array
print(f"\n数组 + 标量:\n{result}")

# Example 2: row vector with column vector
row = np.array([1, 2, 3])        # shape (3,)
col = np.array([[1], [2], [3]])  # shape (3, 1)

result = row + col  # row: (3,) -> (1, 3) -> (3, 3); col: (3, 1) -> (3, 3)
print(f"\n行向量 + 列向量:\n{result}")

# Example 3: arrays of different rank
A = np.array([[1, 2, 3],
              [4, 5, 6]])      # shape (2, 3)
B = np.array([10, 20, 30])     # shape (3,)

result = A + B  # B: (3,) -> (1, 3) -> (2, 3)
print(f"\n(2,3) + (3,):\n{result}")

# Example 4: broadcasting in image processing
# Simulated RGB image laid out as (height, width, 3)
image = np.random.randint(0, 256, (100, 100, 3), dtype=np.uint8)
brightness_adjust = np.array([10, 20, 30])  # a different brightness offset per channel

# The per-channel offsets are broadcast over every pixel. The image is widened
# to a signed type first so that sums above 255 cannot wrap around, then the
# result is clipped to the valid range and converted back to uint8 so that
# the adjusted image keeps the same dtype as the input image.
adjusted_image = image.astype(np.int16) + brightness_adjust
adjusted_image = np.clip(adjusted_image, 0, 255).astype(np.uint8)

print("\n图像广播调整:")
print(f"原始图像形状: {image.shape}")
print(f"调整值形状: {brightness_adjust.shape}")
print(f"调整后图像形状: {adjusted_image.shape}")

随机数生成

函数	作用	示例
`np.random.rand()`	[0,1)均匀分布	`np.random.rand(3,3)`
`np.random.randn()`	标准正态分布	`np.random.randn(100)`
`np.random.randint()`	随机整数	`np.random.randint(0,10,(3,3))`
`np.random.random()`	[0,1)随机数	`np.random.random((3,3))`
`np.random.uniform()`	均匀分布	`np.random.uniform(0,1,10)`
`np.random.normal()`	正态分布	`np.random.normal(0,1,100)`
`np.random.choice()`	随机选择	`np.random.choice(arr, size=5)`
`np.random.shuffle()`	打乱顺序	`np.random.shuffle(arr)`
`np.random.permutation()`	随机排列	`np.random.permutation(arr)`
`np.random.seed()`	设置随机种子	`np.random.seed(42)`

python 复制代码

# 10.1 Fix the global seed so the numbers below are reproducible
np.random.seed(42)

print("随机数生成:")

# 10.2 Uniform distributions
uniform = np.random.rand(3, 3)  # samples in [0, 1)
print(f"\n[0,1)均匀分布 (3x3):\n{uniform}")

uniform_range = np.random.uniform(low=0, high=10, size=5)  # samples in [0, 10)
print(f"\n[0,10)均匀分布: {uniform_range}")

# 10.3 Normal distributions
normal = np.random.randn(5)  # standard normal
print(f"\n标准正态分布: {normal}")

normal_custom = np.random.normal(loc=100, scale=15, size=10)  # mean 100, std 15
print(f"\nN(100,15²)分布: {normal_custom}")

# 10.4 Random integers
integers = np.random.randint(low=0, high=256, size=(3, 3))  # integers in [0, 256)
print(f"\n随机整数 (0-255):\n{integers}")

# 10.5 Weighted random choice
choices = np.random.choice([0, 128, 255], size=10, p=[0.1, 0.3, 0.6])
print(f"\n加权随机选择: {choices}")

# 10.6 Shuffling
arr = np.arange(10)
np.random.shuffle(arr)  # shuffles arr in place
print(f"\n打乱顺序: {arr}")

permuted = np.random.permutation(10)  # returns a new shuffled array of 0..9
print(f"随机排列: {permuted}")

图像处理相关应用

python 复制代码

# 11.1 Simulated image operations
print("图像处理相关应用:")

# Simulated RGB image laid out as (height, width, channels)
height, width, channels = 100, 150, 3
image = np.random.randint(0, 256, (height, width, channels), dtype=np.uint8)

print("\n模拟图像:")
print(f"形状: {image.shape}")  # (100, 150, 3)
print(f"数据类型: {image.dtype}")  # uint8
print(f"像素范围: [{image.min()}, {image.max()}]")

# 11.2 Channel operations
# Split into single-channel 2-D arrays
red_channel = image[:, :, 0]
green_channel = image[:, :, 1]
blue_channel = image[:, :, 2]

print("\n通道分离:")
print(f"红色通道形状: {red_channel.shape}")  # (100, 150)

# Merge the channels back along a new last axis
merged = np.stack([red_channel, green_channel, blue_channel], axis=-1)
print(f"合并后形状: {merged.shape}")

# 11.3 Cropping: rows 20..79, columns 30..119, all channels
cropped = image[20:80, 30:120, :]
print(f"\n裁剪后形状: {cropped.shape}")

# 11.4 Rotation by 90 degrees (k=1 quarter turn)
rotated = np.rot90(image, k=1)
print(f"旋转后形状: {rotated.shape}")

# 11.5 Flipping
flipped_h = image[:, ::-1, :]  # horizontal flip (reverse the columns)
flipped_v = image[::-1, :, :]  # vertical flip (reverse the rows)
print(f"水平翻转形状: {flipped_h.shape}")

# 11.6 Scaling with nearest-neighbour sampling
scale_factor = 0.5
new_height = int(height * scale_factor)
new_width = int(width * scale_factor)

# Map every output row/column back to its source row/column. The sampling is
# derived from scale_factor (instead of a hard-coded stride of 2) so that the
# output always has shape (new_height, new_width, channels); for
# scale_factor = 0.5 this selects the same pixels as image[::2, ::2, :].
row_idx = (np.arange(new_height) / scale_factor).astype(int)
col_idx = (np.arange(new_width) / scale_factor).astype(int)
scaled = image[row_idx][:, col_idx]
print(f"缩放后形状: {scaled.shape}")

# 11.7 Normalisation to floats in [0, 1]
normalized = image.astype(np.float32) / 255.0
print(f"\n归一化后范围: [{normalized.min():.3f}, {normalized.max():.3f}]")

# 11.8 Binarisation: pixels above the threshold become 255, all others 0
threshold = 128
binary = (image > threshold).astype(np.uint8) * 255
print(f"二值化后唯一值: {np.unique(binary)}")

高级技巧与性能优化

python 复制代码

import time

# 12.1 Vectorized operations vs. an explicit Python loop
print("性能优化 - 向量化操作:")

# Test data
size = 10000
arr = np.random.rand(size)

# Method 1: element-by-element Python loop (deliberately slow baseline).
# time.perf_counter() is used instead of time.time() because it is a
# high-resolution monotonic clock meant for measuring short durations.
start = time.perf_counter()
result_loop = np.zeros(size)
for i in range(size):
    result_loop[i] = arr[i] * 2 + 1
loop_time = time.perf_counter() - start

# Method 2: a single vectorized NumPy expression (fast)
start = time.perf_counter()
result_vectorized = arr * 2 + 1
vectorized_time = time.perf_counter() - start

print(f"循环时间: {loop_time:.6f}秒")
print(f"向量化时间: {vectorized_time:.6f}秒")
# Guard the ratio against a zero denominator on clocks with coarse resolution.
print(f"加速比: {loop_time / max(vectorized_time, 1e-9):.1f}倍")

# 12.2 Views share memory with the array they were sliced from
print("\n内存视图操作:")
arr = np.arange(10)
print(f"原始数组: {arr}")  # printed BEFORE the view is modified
view = arr[3:7]  # basic slicing creates a view
view[0] = 100    # writing through the view changes arr[3] as well
print(f"视图修改后: {arr}")

# 12.3 In-place operations
print("\n原地操作:")
arr = np.array([1, 2, 3, 4, 5])
arr += 10  # augmented assignment updates arr itself
print(f"原地加法后: {arr}")

# 12.4 Use the out= parameter to write into a pre-allocated array
arr1 = np.random.rand(1000)
arr2 = np.random.rand(1000)
result = np.empty_like(arr1)

np.multiply(arr1, arr2, out=result)  # product is stored in `result`
print("\n使用out参数避免内存分配")

# 12.5 Copies vs. views
arr = np.arange(10)
copy = arr.copy()      # explicit copy with its own data
view = arr[:]          # view: a new ndarray object over the same data
print("\n拷贝 vs 视图:")
# `is` compares object identity, and both copy() and slicing return a NEW
# ndarray object, so both identity checks are False. Whether the data buffer
# is shared has to be checked with np.shares_memory instead.
print(f"arr is copy: {arr is copy}")  # False
print(f"arr is view: {arr is view}")  # False
print(f"copy 与 arr 共享内存: {np.shares_memory(arr, copy)}")  # False
print(f"view 与 arr 共享内存: {np.shares_memory(arr, view)}")  # True

实用小技巧

python 复制代码

# 13.1 Conditional selection
arr = np.array([1, 2, 3, 4, 5])
# np.where(condition, x, y): take x where the condition holds, otherwise y
result = np.where(arr > 3, arr, 0)
print(f"np.where(arr>3, arr, 0): {result}")

# 13.2 Repeating arrays
repeated = np.repeat(arr, 3)  # each element repeated 3 times in a row
print(f"np.repeat每个元素3次: {repeated}")

tiled = np.tile(arr, 3)  # the whole array repeated 3 times
print(f"np.tile整个数组3次: {tiled}")

# 13.3 Unique values and their counts
arr_with_dup = np.array([1, 2, 2, 3, 3, 3, 4])
# return_counts=True yields counts aligned one-to-one with unique_values.
# (np.bincount would instead return one bin per integer starting at 0, i.e.
# an extra leading 0 here, and only works for non-negative integers.)
unique_values, unique_counts = np.unique(arr_with_dup, return_counts=True)
print(f"唯一值: {unique_values}")
print(f"计数: {unique_counts}")

# 13.4 Sorting
sorted_arr = np.sort(arr)
sort_indices = np.argsort(arr)  # indices that would sort arr
print(f"排序: {sorted_arr}")
print(f"排序索引: {sort_indices}")

# 13.5 Coordinate grids
# Useful for pixel coordinates in image processing
x = np.linspace(-1, 1, 5)
y = np.linspace(-1, 1, 3)
X, Y = np.meshgrid(x, y)  # X and Y both have shape (len(y), len(x))
print(f"\n网格坐标X:\n{X}")
print(f"网格坐标Y:\n{Y}")

Pandas

核心数据结构

Series（一维数据）

创建方法	说明	示例
`pd.Series()`	从列表创建	`pd.Series([1,2,3])`
`pd.Series()`	从字典创建	`pd.Series({'a':1, 'b':2})`
`pd.Series()`	从ndarray创建	`pd.Series(np.array([1,2,3]))`
属性
`.values`	获取值数组	`s.values`
`.index`	获取索引	`s.index`
`.dtype`	数据类型	`s.dtype`
`.name`	Series名称	`s.name`

python 复制代码

import pandas as pd
import numpy as np

# 1.1 Creating Series
print("=== Series 创建示例 ===")

# From a list: gets the default integer index 0..n-1
s1 = pd.Series([1, 2, 3, 4, 5])
print(f"从列表创建:\n{s1}")

# From a dict: the keys become the index labels
s2 = pd.Series({'a': 10, 'b': 20, 'c': 30})
print(f"\n从字典创建:\n{s2}")

# From a NumPy array with an explicit index
s3 = pd.Series(np.random.randn(5), index=list('abcde'))
print(f"\n指定索引创建:\n{s3}")

# 1.2 Series attributes
print("\n=== Series 属性 ===")
print(f"值数组: {s3.values}")
print(f"索引: {s3.index}")
print(f"数据类型: {s3.dtype}")
print(f"形状: {s3.shape}")
print(f"大小: {s3.size}")

# 1.3 Series arithmetic (element-wise) and summary statistics
s4 = pd.Series([1, 2, 3, 4])
print("\n=== Series 运算 ===")
print(f"加法: {s4 + 10}")
print(f"乘法: {s4 * 2}")
print(f"统计: 均值={s4.mean()}, 总和={s4.sum()}, 标准差={s4.std()}")

DataFrame（二维数据）

创建方法	说明	示例
`pd.DataFrame()`	从字典创建	`pd.DataFrame({'A':[1,2], 'B':[3,4]})`
`pd.DataFrame()`	从列表创建	`pd.DataFrame([[1,2],[3,4]])`
`pd.DataFrame()`	从ndarray创建	`pd.DataFrame(np.array([[1,2],[3,4]]))`
`pd.read_csv()`	读取CSV	`pd.read_csv('data.csv')`
`pd.read_excel()`	读取Excel	`pd.read_excel('data.xlsx')`
属性
`.columns`	列名	`df.columns`
`.index`	索引	`df.index`
`.shape`	形状	`df.shape`
`.dtypes`	数据类型	`df.dtypes`
`.values`	值数组	`df.values`

python 复制代码

# 2.1 Creating DataFrames
print("\n=== DataFrame 创建示例 ===")

# From a dict of columns (the most common way)
df1 = pd.DataFrame({
    '图像ID': ['img001', 'img002', 'img003', 'img004'],
    '宽度': [1920, 1280, 2560, 800],
    '高度': [1080, 720, 1440, 600],
    '通道数': [3, 3, 1, 3],
    '标签': ['cat', 'dog', 'cat', 'bird']
})
print(f"从字典创建:\n{df1}")

# From a list of rows plus explicit column names
data = [
    ['img001', 1920, 1080, 3, 'cat'],
    ['img002', 1280, 720, 3, 'dog'],
    ['img003', 2560, 1440, 1, 'cat'],
    ['img004', 800, 600, 3, 'bird']
]
df2 = pd.DataFrame(data, columns=['图像ID', '宽度', '高度', '通道数', '标签'])
print(f"\n从列表创建:\n{df2}")

# Reading from a CSV file (left commented out; it needs an existing file)
# df_csv = pd.read_csv('image_data.csv')
# print(f"从CSV读取:\n{df_csv.head()}")

# 2.2 DataFrame attributes
print("\n=== DataFrame 属性 ===")
print(f"列名: {df1.columns}")
print(f"索引: {df1.index}")
print(f"形状: {df1.shape}")  # (number of rows, number of columns)
print(f"数据类型:\n{df1.dtypes}")
print(f"值数组形状: {df1.values.shape}")

# 2.3 Inspecting a DataFrame
print("\n=== DataFrame 信息查看 ===")
print("前3行:")
print(df1.head(3))
print("\n基本信息:")
# info() writes its report to stdout itself and returns None, so it must not
# be wrapped in print() (that would emit an extra "None" line).
df1.info()
print("\n统计描述:")
print(df1.describe())
print("\n唯一值:")
print(df1['标签'].unique())

数据查看与选择

数据查看方法

方法	作用	示例
`.head()`	查看前n行	`df.head(5)`
`.tail()`	查看后n行	`df.tail(5)`
`.sample()`	随机抽样	`df.sample(5)`
`.info()`	数据信息	`df.info()`
`.describe()`	统计描述	`df.describe()`
`.shape`	数据形状	`df.shape`
`.columns`	列名	`df.columns`
`.dtypes`	数据类型	`df.dtypes`
`.value_counts()`	值计数	`df['col'].value_counts()`
`.unique()`	唯一值	`df['col'].unique()`
`.nunique()`	唯一值数量	`df['col'].nunique()`

python 复制代码

# Sample data: 100 simulated detection records, one per hour
df = pd.DataFrame({
    '图像ID': [f'img{i:03d}' for i in range(100)],
    '宽度': np.random.randint(800, 2561, 100),
    '高度': np.random.randint(600, 1441, 100),
    '通道数': np.random.choice([1, 3], 100, p=[0.2, 0.8]),
    '类别': np.random.choice(['cat', 'dog', 'bird', 'car'], 100),
    '置信度': np.random.uniform(0.7, 1.0, 100),
    # Lowercase 'h' is the hourly frequency alias; the uppercase 'H' spelling
    # is deprecated in recent pandas releases.
    '检测时间': pd.date_range('2024-01-01', periods=100, freq='h')
})

print("=== 数据查看方法 ===")

# Basic inspection
print(f"1. 前5行:\n{df.head()}")
print(f"\n2. 后3行:\n{df.tail(3)}")
print(f"\n3. 随机5行:\n{df.sample(5)}")
print(f"\n4. 数据形状: {df.shape}")
print(f"\n5. 列名: {df.columns.tolist()}")

# Summary statistics
print(f"\n6. 数值列统计:\n{df.describe()}")
print(f"\n7. 类别分布:\n{df['类别'].value_counts()}")
print(f"\n8. 唯一类别数: {df['类别'].nunique()}")

# Data types and memory usage
print(f"\n9. 数据类型:\n{df.dtypes}")
# info() prints its report itself and returns None; embedding the call in an
# f-string printed the report first and then a header followed by "None".
print("\n10. 内存使用:")
df.info(memory_usage='deep')

# Quick per-column statistics
print("\n11. 总和统计:")
print(f"   宽度总和: {df['宽度'].sum()}")
print(f"   高度均值: {df['高度'].mean():.1f}")
print(f"   置信度中位数: {df['置信度'].median():.3f}")
print(f"   通道数众数: {df['通道数'].mode().values[0]}")

数据选择（索引/切片）

方法	语法	说明
列选择	`df['col']`	选择单列
列选择	`df[['col1', 'col2']]`	选择多列
行选择	`df.loc[index]`	按标签选择
行选择	`df.iloc[index]`	按位置选择
条件选择	`df[df['col'] > value]`	布尔索引
切片	`df[start:end]`	行切片
切片	`df.loc[start:end]`	标签切片
切片	`df.iloc[start:end]`	位置切片
`.isin()`	`df[df['col'].isin(list)]`	包含判断
`.query()`	`df.query('col > value')`	查询表达式

python 复制代码

print("\n=== 数据选择方法 ===")

# Column selection
print("1. 选择单列:")
print(df['宽度'].head())

print("\n2. 选择多列:")
selected_columns = ['图像ID', '宽度', '高度', '类别']
print(df[selected_columns].head())

# Row selection by integer position
print("\n3. 按位置选择行:")
print(df.iloc[0])          # first row
print(df.iloc[[0, 2, 4]])  # several rows
print(df.iloc[10:15])      # positional slice (end excluded)

# Row selection by index label
print("\n4. 按标签选择行:")
print(df.loc[0])    # row whose index label is 0
print(df.loc[0:5])  # label slice 0..5 (end included)

# Boolean (conditional) selection
print("\n5. 条件选择:")
# Images wider than 2000 pixels
wide_images = df[df['宽度'] > 2000]
print(f"宽度>2000的图像: {len(wide_images)}张")

# Grayscale images (a single channel)
gray_images = df[df['通道数'] == 1]
print(f"灰度图像: {len(gray_images)}张")

# Combining conditions with | (or)
is_cat = df['类别'] == 'cat'
is_dog = df['类别'] == 'dog'
cat_dog = df[is_cat | is_dog]
print(f"猫狗类别: {len(cat_dog)}张")

# Combining conditions with & (and): confident cat detections
high_conf_cat = df[(df['置信度'] > 0.9) & is_cat]
print(f"高置信度猫图像: {len(high_conf_cat)}张")

# Membership test with isin
print("\n6. isin 方法:")
selected_categories = df[df['类别'].isin(['cat', 'dog'])]
print(f"猫或狗: {len(selected_categories)}张")

# Query expression
print("\n7. query 方法:")
result = df.query('宽度 > 2000 and 高度 > 1000')
print(f"大尺寸图像: {len(result)}张")

# Selecting rows and columns at once
print("\n8. 同时选择行和列:")
# Labels 0..4 (inclusive) of the chosen columns
subset = df.loc[0:4, selected_columns]
print(subset)

# Same idea by position: first five rows, columns 0, 1, 2 and 4
subset2 = df.iloc[0:5, [0, 1, 2, 4]]
print(f"\n使用iloc选择:\n{subset2}")

数据处理与清洗

数据清洗方法

方法	作用	示例
`.dropna()`	删除缺失值	`df.dropna()`
`.fillna()`	填充缺失值	`df.fillna(value)`
`.drop()`	删除行列	`df.drop(columns=['col'])`
`.rename()`	重命名	`df.rename(columns={'old':'new'})`
`.astype()`	类型转换	`df['col'].astype('int')`
`.replace()`	替换值	`df.replace({'old':'new'})`
`.duplicated()`	查找重复	`df.duplicated()`
`.drop_duplicates()`	删除重复	`df.drop_duplicates()`
`.isnull()`	检查空值	`df.isnull()`
`.notnull()`	检查非空	`df.notnull()`
`.clip()`	限幅	`df['col'].clip(lower, upper)`

python 复制代码

print("=== 数据清洗方法 ===")

# A small data set with deliberate problems: missing values and an
# out-of-range confidence
df_dirty = pd.DataFrame({
    '图像ID': ['img001', 'img002', 'img003', 'img004', 'img005'],
    '宽度': [1920, None, 2560, 800, 1920],
    '高度': [1080, 720, None, 600, 1080],
    '通道数': [3, 3, 1, 3, 3],
    '类别': ['cat', 'dog', 'cat', 'bird', 'cat'],
    '置信度': [0.95, 0.87, 1.2, 0.65, 0.95],  # 1.2 is outside the valid range
    '文件大小': [2048, 1024, 4096, 512, 2048]
})

print("原始数据:")
print(df_dirty)
print(f"\n空值统计:\n{df_dirty.isnull().sum()}")

# 1. Missing values
print("\n1. 处理缺失值:")
# Drop every row that contains at least one missing value
df_clean1 = df_dirty.dropna()
print(f"删除缺失值后: {len(df_clean1)}行")

# Fill each column with its own replacement: mean width, median height
fill_values = {
    '宽度': df_dirty['宽度'].mean(),
    '高度': df_dirty['高度'].median()
}
df_filled = df_dirty.fillna(fill_values)
print(f"填充后:\n{df_filled}")

# 2. Outliers
print("\n2. 处理异常值:")
# Limit the confidence to the interval [0, 1]
df_dirty['置信度'] = df_dirty['置信度'].clip(lower=0, upper=1)
print(f"限幅后置信度: {df_dirty['置信度'].tolist()}")

# 3. Dropping a column
print("\n3. 删除列:")
df_no_size = df_dirty.drop(columns=['文件大小'])
print(f"删除文件大小列:\n{df_no_size}")

# 4. Renaming columns
print("\n4. 重命名列:")
new_names = {
    '宽度': 'image_width',
    '高度': 'image_height',
    '通道数': 'channels'
}
df_renamed = df_dirty.rename(columns=new_names)
print(f"重命名后列名: {df_renamed.columns.tolist()}")

# 5. Type conversion
print("\n5. 类型转换:")
df_dirty['宽度'] = df_dirty['宽度'].astype('float64')
print(f"宽度数据类型: {df_dirty['宽度'].dtype}")

# 6. Replacing values (only inside the '类别' column)
print("\n6. 替换值:")
df_replaced = df_dirty.replace({'类别': {'cat': '猫', 'dog': '狗'}})
print(f"类别替换后:\n{df_replaced['类别']}")

# 7. Removing duplicate rows
print("\n7. 删除重复行:")
df_dirty.loc[5] = ['img001', 1920, 1080, 3, 'cat', 0.95, 2048]  # append a copy of row 0
df_no_dup = df_dirty.drop_duplicates()
print(f"删除重复后: {len(df_no_dup)}行")

数据变换方法

方法	作用	示例
`.apply()`	应用函数	`df['col'].apply(func)`
`.map()`	映射替换	`df['col'].map(mapping)`
`.applymap()`	元素级应用（pandas 2.1 起弃用，推荐 `DataFrame.map`）	`df.applymap(func)`
`.groupby()`	分组	`df.groupby('col')`
`.pivot_table()`	数据透视	`pd.pivot_table(df, ...)`
`.melt()`	宽转长	`pd.melt(df, ...)`
`.pivot()`	长转宽	`df.pivot(...)`
`pd.cut()`	数据分箱	`pd.cut(df['col'], bins)`
`pd.qcut()`	等频分箱	`pd.qcut(df['col'], q)`

python 复制代码

print("\n=== 数据变换方法 ===")

# 1. The apply method
print("1. apply 方法:")

# Image area: apply a function to every row (axis=1) and store width * height
df['面积'] = df.apply(lambda record: record['宽度'] * record['高度'], axis=1)
area_columns = ['图像ID', '宽度', '高度', '面积']
print(f"添加面积列:\n{df[area_columns].head()}")

# Classify an image by its dimensions
def classify_size(width, height):
    """Return the size class of an image: 'large', 'medium' or 'small'.

    An image is 'large' when its width exceeds 2000 or its height exceeds
    1500; otherwise 'medium' when its width exceeds 1000 or its height
    exceeds 700; otherwise 'small'.
    """
    is_large = width > 2000 or height > 1500
    if is_large:
        return 'large'
    is_medium = width > 1000 or height > 700
    return 'medium' if is_medium else 'small'

# Label every row with its size class
df['尺寸分类'] = df.apply(lambda record: classify_size(record['宽度'], record['高度']), axis=1)
print(f"\n尺寸分类分布:\n{df['尺寸分类'].value_counts()}")

# 2. The map method
print("\n2. map 方法:")
# Encode each category name as an integer code
category_map = {'cat': 0, 'dog': 1, 'bird': 2, 'car': 3}
df['类别编码'] = df['类别'].map(category_map)
print(f"类别编码:\n{df[['类别', '类别编码']].head()}")

# 3. Binning
print("\n3. 数据分箱:")
# Cut the width range into 3 equal-width intervals
width_labels = ['小', '中', '大']
df['宽度分箱'] = pd.cut(df['宽度'], bins=3, labels=width_labels)
print(f"宽度分箱:\n{df[['宽度', '宽度分箱']].head()}")

# Equal-frequency binning (by quantiles)
confidence_labels = ['低', '中低', '中高', '高']
df['置信度分箱'] = pd.qcut(df['置信度'], q=4, labels=confidence_labels)
print(f"\n置信度分箱分布:\n{df['置信度分箱'].value_counts()}")

# 4. Grouping
print("\n4. 分组操作:")
grouped = df.groupby('类别')
print("按类别分组统计:")
print(grouped['宽度'].agg(['mean', 'std', 'count']))

# Different aggregations for different columns
per_column_aggs = {
    '宽度': ['mean', 'max', 'min'],
    '高度': ['mean', 'max', 'min'],
    '置信度': 'mean'
}
category_stats = df.groupby('类别').agg(per_column_aggs)
print(f"\n类别详细统计:\n{category_stats}")

数据合并与连接

数据合并方法

方法	作用	适用场景
`pd.concat()`	拼接	相同结构的多个DataFrame
`pd.merge()`	合并	基于键合并不同DataFrame
`.join()`	连接	基于索引合并
`.append()`	追加	添加行（pandas 2.0 已移除，用concat）
`pd.concat(axis=1)`	横向拼接	增加列

python 复制代码

print("=== 数据合并与连接 ===")

# Sample data: three tables that share the '图像ID' key
df_images = pd.DataFrame({
    '图像ID': [f'img{i:03d}' for i in range(10)],
    '宽度': np.random.randint(800, 2561, 10),
    '高度': np.random.randint(600, 1441, 10),
    '类别': np.random.choice(['cat', 'dog', 'bird'], 10)
})

df_metadata = pd.DataFrame({
    '图像ID': [f'img{i:03d}' for i in range(5, 15)],
    '拍摄时间': pd.date_range('2024-01-01', periods=10, freq='D'),
    '摄影师': np.random.choice(['Alice', 'Bob', 'Charlie'], 10)
})

df_labels = pd.DataFrame({
    '图像ID': [f'img{i:03d}' for i in range(0, 10, 2)],
    '标注质量': np.random.choice(['高', '中', '低'], 5),
    '标注员': np.random.choice(['张三', '李四'], 5)
})

print("1. 图像数据:")
print(df_images)
print("\n2. 元数据:")
print(df_metadata)
print("\n3. 标注数据:")
print(df_labels)

# 1. concat: stack rows of frames with the same columns
print("\n1. concat 纵向拼接:")
# A second batch of image records (ids img010..img014)
df_more_images = pd.DataFrame({
    '图像ID': [f'img{i:03d}' for i in range(10, 15)],
    '宽度': np.random.randint(800, 2561, 5),
    '高度': np.random.randint(600, 1441, 5),
    '类别': np.random.choice(['cat', 'dog', 'bird'], 5)
})

df_all_images = pd.concat([df_images, df_more_images], ignore_index=True)
print(f"合并后总图像数: {len(df_all_images)}")

# 2. merge: key-based joins. The images cover ids 0..9 and the metadata
# ids 5..14, so 5 ids are common to both tables.
print("\n2. merge 合并:")
# Inner join (the default)
df_inner = df_images.merge(df_metadata, on='图像ID', how='inner')
print(f"内连接结果（5行）:\n{df_inner.head()}")

# Left join
df_left = df_images.merge(df_metadata, on='图像ID', how='left')
print(f"\n左连接结果（10行）:\n{df_left.head()}")

# Right join
df_right = df_images.merge(df_metadata, on='图像ID', how='right')
print(f"\n右连接结果（10行）:\n{df_right.head()}")

# Outer join
df_outer = df_images.merge(df_metadata, on='图像ID', how='outer')
print(f"\n外连接结果（15行）:\n{df_outer.head()}")

# 3. Merging three tables: two chained left joins
print("\n3. 多表合并:")
images_with_meta = pd.merge(df_images, df_metadata, on='图像ID', how='left')
df_combined = pd.merge(images_with_meta, df_labels, on='图像ID', how='left')
print(f"三表合并结果:\n{df_combined}")

# 4. join: index-based joining
print("\n4. join 连接:")
# Move the key column into the index of both frames
df_images_idx = df_images.set_index('图像ID')
df_labels_idx = df_labels.set_index('图像ID')

df_joined = df_images_idx.join(df_labels_idx, how='left')
print(f"基于索引连接:\n{df_joined.head()}")

# 5. Horizontal concatenation (adding columns)
print("\n5. 横向拼接:")
# Extra feature columns, one row per image
df_features = pd.DataFrame({
    '特征1': np.random.randn(10),
    '特征2': np.random.randn(10),
    '特征3': np.random.randn(10)
})

df_with_features = pd.concat([df_images.reset_index(drop=True), df_features], axis=1)
print(f"增加特征列:\n{df_with_features.head()}")

分组与聚合操作

分组聚合方法

方法	作用	示例
`.groupby()`	分组	`df.groupby('col')`
`.agg()`	聚合	`df.groupby('col').agg(['mean', 'sum'])`
`.transform()`	分组转换	`df.groupby('col').transform('mean')`
`.filter()`	分组过滤	`df.groupby('col').filter(func)`
`.apply()`	分组应用	`df.groupby('col').apply(func)`
`.pivot_table()`	数据透视	`pd.pivot_table(df, ...)`
`pd.crosstab()`	交叉表	`pd.crosstab(df['col1'], df['col2'])`

python 复制代码

print("=== 分组与聚合操作 ===")

# Sample data: 20 simulated detections
df_detections = pd.DataFrame({
    '图像ID': [f'img{i:03d}' for i in range(20)],
    '检测类别': np.random.choice(['person', 'car', 'dog', 'cat', 'bicycle'], 20),
    '置信度': np.random.uniform(0.5, 1.0, 20),
    '检测框面积': np.random.randint(100, 10000, 20),
    '检测时间': np.random.uniform(0.1, 2.0, 20)  # seconds
})

print("检测数据:")
print(df_detections.head())

# 1. Basic group statistics
print("\n1. 基础分组统计:")
# Group by detected class
grouped = df_detections.groupby('检测类别')

print("每个类别的检测数量:")
print(grouped.size())

print("\n每个类别的平均置信度:")
print(grouped['置信度'].mean())

print("\n每个类别的统计汇总:")
summary_spec = {
    '置信度': ['mean', 'std', 'min', 'max'],
    '检测框面积': ['mean', 'sum'],
    '检测时间': 'mean'
}
print(grouped.agg(summary_spec))

# 2. Grouping by several columns
print("\n2. 多列分组:")
# Add a simulated image-source column
df_detections['来源'] = np.random.choice(['camera1', 'camera2', 'camera3'], 20)

# Group by source first, then by class
multi_grouped = df_detections.groupby(['来源', '检测类别'])
print("多级分组统计:")
print(multi_grouped['置信度'].mean())

# 3. Group transform (adds a per-group statistic as a column)
print("\n3. 分组转换:")
# Every row receives the mean confidence of its own class
class_confidence = df_detections.groupby('检测类别')['置信度']
df_detections['类别平均置信度'] = class_confidence.transform('mean')
print(df_detections[['图像ID', '检测类别', '置信度', '类别平均置信度']].head())

# 4. Group filter
print("\n4. 分组过滤:")
# Keep only the classes that have more than 3 detections
filtered = df_detections.groupby('检测类别').filter(lambda group: len(group) > 3)
print(f"过滤后类别: {filtered['检测类别'].unique()}")

# 5. Custom aggregation functions
print("\n5. 自定义聚合函数:")
def confidence_range(series):
    """Return the spread (maximum minus minimum) of the values in series."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest

def large_detections(series):
    """Return how many values in series are greater than 5000."""
    return series.gt(5000).sum()

aggregations = df_detections.groupby('检测类别').agg({
    '置信度': ['mean', confidence_range],
    '检测框面积': ['mean', large_detections],
    '检测时间': lambda x: x.quantile(0.9)  # 90th percentile
})

print("自定义聚合结果:")
print(aggregations)

# 6. Pivot table: classes as rows, sources as columns
print("\n6. 数据透视表:")
pivot = pd.pivot_table(
    df_detections,
    values=['置信度', '检测框面积'],
    index='检测类别',
    columns='来源',
    aggfunc=['mean', 'count'],
    fill_value=0
)
print("数据透视表:")
print(pivot)

# 7. Cross tabulation: mean confidence per (class, source) pair
print("\n7. 交叉表:")
cross = pd.crosstab(
    df_detections['检测类别'],
    df_detections['来源'],
    values=df_detections['置信度'],
    aggfunc='mean'
)
print("交叉表（平均置信度）:")
print(cross)

时间序列处理

时间序列方法

方法	作用	示例
`pd.to_datetime()`	转换为时间戳	`pd.to_datetime(df['col'])`
`.dt`访问器	时间属性	`df['date'].dt.year`
`.resample()`	重采样	`df.resample('D').mean()`
`.shift()`	偏移	`df['col'].shift(1)`
`.rolling()`	滚动窗口	`df['col'].rolling(window=7).mean()`
`.expanding()`	扩展窗口	`df['col'].expanding().mean()`
`.pct_change()`	百分比变化	`df['col'].pct_change()`
`.diff()`	差分	`df['col'].diff()`

python 复制代码

print("=== 时间序列处理 ===")

# Hourly time-series data: 100 points starting at 2024-01-01 00:00
np.random.seed(42)
# Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is deprecated.
date_range = pd.date_range('2024-01-01', periods=100, freq='h')
df_time = pd.DataFrame({
    '时间戳': date_range,
    '检测数量': np.random.poisson(5, 100),  # Poisson distributed
    '平均置信度': np.random.uniform(0.7, 0.95, 100),
    '处理时间': np.random.exponential(0.5, 100)  # exponentially distributed
})

# Use the timestamps as the index
df_time.set_index('时间戳', inplace=True)
print("时间序列数据:")
print(df_time.head())

# 1. Extracting time attributes from the DatetimeIndex
print("\n1. 时间属性提取:")
df_time['小时'] = df_time.index.hour
df_time['星期几'] = df_time.index.day_name()
df_time['是否工作日'] = df_time.index.dayofweek < 5

print("添加时间属性后:")
print(df_time[['检测数量', '小时', '星期几', '是否工作日']].head())

# 2. Resampling (downsampling) to one row per day
print("\n2. 重采样 - 按天聚合:")
daily = df_time.resample('D').agg({
    '检测数量': 'sum',
    '平均置信度': 'mean',
    '处理时间': 'mean'
})
print("按天重采样:")
print(daily.head())

# 3. Resampling (upsampling) with interpolation
print("\n3. 重采样 - 按10分钟插值:")
# Sparser data for the demo: one point every 6 hours. Only the numeric
# measurement columns are kept, because the text/boolean attribute columns
# added above cannot be interpolated meaningfully.
measurement_columns = ['检测数量', '平均置信度', '处理时间']
df_sparse = df_time.iloc[::6][measurement_columns]
# '10min' replaces the deprecated '10T' alias for a 10-minute frequency.
ten_min = df_sparse.resample('10min').asfreq().interpolate()
print("按10分钟插值:")
print(ten_min.head())

# 4. Rolling-window statistics
print("\n4. 滚动窗口计算:")
# Centered 6-hour rolling mean
df_time['6小时平均检测'] = df_time['检测数量'].rolling(window=6, center=True).mean()
# 24-hour rolling standard deviation
df_time['24小时检测波动'] = df_time['检测数量'].rolling(window=24).std()

print("滚动窗口统计:")
print(df_time[['检测数量', '6小时平均检测', '24小时检测波动']].head(10))

# 5. Expanding-window statistics
print("\n5. 扩展窗口计算:")
# Running mean over all points seen so far
df_time['累计平均置信度'] = df_time['平均置信度'].expanding().mean()
print("扩展窗口统计:")
print(df_time[['平均置信度', '累计平均置信度']].head())

# 6. Shifting
print("\n6. 时间偏移:")
# Day-over-day change: compare with the value 24 rows (= 24 hours) earlier
df_time['检测数量_昨日'] = df_time['检测数量'].shift(24)
df_time['检测数量变化'] = df_time['检测数量'] - df_time['检测数量_昨日']
print("时间偏移计算:")
print(df_time[['检测数量', '检测数量_昨日', '检测数量变化']].head(25))

# 7. Percentage change and first difference
print("\n7. 百分比变化和差分:")
df_time['检测数量_pct_change'] = df_time['检测数量'].pct_change()
df_time['检测数量_diff'] = df_time['检测数量'].diff()
print("变化率计算:")
print(df_time[['检测数量', '检测数量_pct_change', '检测数量_diff']].head())

# 8. Selecting time periods
print("\n8. 时间段选择:")
# Working hours (09:00-17:00, both ends included)
work_hours = df_time.between_time('09:00', '17:00')
print(f"工作时间数据量: {len(work_hours)}")

# Partial-string row selection must go through .loc: df_time['2024-01-01']
# is a COLUMN lookup and raises KeyError on pandas 2.x.
# One specific day
jan_first = df_time.loc['2024-01-01']
print(f"1月1日数据量: {len(jan_first)}")

# One whole month
jan_data = df_time.loc['2024-01']
print(f"1月份数据量: {len(jan_data)}")

性能优化技巧

性能优化方法

技巧	作用	说明
向量化操作	避免循环	使用NumPy/Pandas内置函数
`.loc[]` vs `[]`	正确索引	使用`.loc[]`进行标签索引
避免链式赋值	防止警告	直接赋值而不是链式
使用合适的数据类型	减少内存	如`int8`代替`int64`
分块处理	处理大数据	`chunksize`参数
使用`.query()`	快速查询	语法简洁性能好
`.at[]`/`.iat[]`	快速访问	访问单个元素
`.eval()`	表达式求值	加速复杂运算
内存映射文件	处理大文件	`mmap_mode`参数

python 复制代码

print("=== 性能优化技巧 ===")

# 1. Build a large data set for the performance experiments
print("1. 性能测试数据集:")
np.random.seed(42)
n_rows = 100000
category_names = ['cat', 'dog', 'bird', 'car', 'person']
df_large = pd.DataFrame({
    '图像ID': [f'img_{i:06d}' for i in range(n_rows)],
    '宽度': np.random.randint(800, 2561, n_rows),
    '高度': np.random.randint(600, 1441, n_rows),
    '类别': np.random.choice(category_names, n_rows),
    '置信度': np.random.uniform(0.5, 1.0, n_rows),
    '检测时间': np.random.uniform(0.01, 2.0, n_rows)
})

print(f"数据集大小: {n_rows} 行 × {df_large.shape[1]} 列")
print(f"内存使用: {df_large.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# 2. Choose smaller data types
print("\n2. 优化数据类型:")
print("优化前数据类型:")
print(df_large.dtypes)

# Downcast the numeric columns on a copy, leaving df_large untouched
df_optimized = df_large.copy()
for column in ('宽度', '高度'):
    # values are drawn from 800..2560 / 600..1440, which fit in int16
    df_optimized[column] = df_optimized[column].astype('int16')
for column in ('置信度', '检测时间'):
    df_optimized[column] = df_optimized[column].astype('float32')

print("\n优化后数据类型:")
print(df_optimized.dtypes)
bytes_saved = df_large.memory_usage().sum() - df_optimized.memory_usage().sum()
print(f"内存减少: {bytes_saved / 1024**2:.2f} MB")

# 3. Vectorised arithmetic vs. an explicit Python loop
print("\n3. 向量化 vs 循环:")
import time

# Timing uses time.perf_counter() rather than time.time(): it is monotonic and
# high-resolution, so the very short vectorised run cannot be measured as 0.0
# and turn the speed-up ratio below into a ZeroDivisionError.

# Approach 1: row-by-row loop (deliberately slow; this is the baseline)
start = time.perf_counter()
areas_loop = []
for i in range(len(df_large)):
    areas_loop.append(df_large.iloc[i]['宽度'] * df_large.iloc[i]['高度'])
loop_time = time.perf_counter() - start

# Approach 2: one vectorised column multiplication (fast)
start = time.perf_counter()
areas_vectorized = df_large['宽度'] * df_large['高度']
vector_time = time.perf_counter() - start

print(f"循环时间: {loop_time:.4f} 秒")
print(f"向量化时间: {vector_time:.4f} 秒")
print(f"加速比: {loop_time/vector_time:.1f}倍")

# 4. Filtering with .query()
print("\n4. .query() 优化:")
# perf_counter() instead of time.time(): both filters finish in milliseconds,
# and a coarse wall clock can report 0.0 for time2, which would make the
# ratio printed below raise ZeroDivisionError.

# Classic boolean-mask filtering
start = time.perf_counter()
result1 = df_large[(df_large['宽度'] > 2000) & (df_large['置信度'] > 0.8)]
time1 = time.perf_counter() - start

# Same filter expressed as a query string
start = time.perf_counter()
result2 = df_large.query('宽度 > 2000 and 置信度 > 0.8')
time2 = time.perf_counter() - start

print(f"传统方法: {time1:.4f} 秒")
print(f".query()方法: {time2:.4f} 秒")
print(f"性能提升: {time1/time2:.1f}倍")

# 5. Evaluating a compound expression with .eval()
print("\n5. .eval() 优化:")
# Compound computation: a standardised area
start = time.perf_counter()
df_large['标准化面积'] = (df_large['宽度'] * df_large['高度'] -
                       df_large['宽度'].mean() * df_large['高度'].mean()) / \
                      (df_large['宽度'].std() * df_large['高度'].std())
time1 = time.perf_counter() - start

# DataFrame.eval resolves bare names against the frame's columns. The
# top-level pd.eval only sees Python variables in scope, so the column names
# used here would raise an undefined-variable error there.
start = time.perf_counter()
df_large['标准化面积_eval'] = df_large.eval(
    '(宽度 * 高度 - 宽度.mean() * 高度.mean()) / (宽度.std() * 高度.std())'
)
time2 = time.perf_counter() - start

print(f"传统计算: {time1:.4f} 秒")
print(f".eval()计算: {time2:.4f} 秒")

# 6. Processing a large file in chunks
print("\n6. 分块处理:")
# Read the CSV in pieces of chunk_size rows instead of loading it at once.
# NOTE(review): 'large_data.csv' is not created anywhere in the visible code;
# presumably it must already exist and contain a '置信度' column — confirm.
chunk_size = 10000
results = []

csv_chunks = pd.read_csv('large_data.csv', chunksize=chunk_size)
for chunk in csv_chunks:
    # Keep only the rows of this chunk above the confidence threshold
    high_confidence = chunk['置信度'] > 0.7
    chunk_processed = chunk[high_confidence]
    results.append(chunk_processed)

# Stitch the filtered pieces back into one frame with a fresh 0..n-1 index
final_df = pd.concat(results, ignore_index=True)
print(f"分块处理完成，总行数: {len(final_df)}")

# 7. Memory-mapped arrays
print("\n7. 内存映射文件:")
# np.memmap gives array-style access to data stored in a file on disk.
# NOTE(review): mode='r' opens an existing file read-only, and
# 'large_array.dat' is not created in the visible code — confirm it exists.
mmap_shape = (1000000, 100)
mmap_array = np.memmap('large_array.dat', mode='r', dtype='float32', shape=mmap_shape)
print(f"内存映射数组形状: {mmap_array.shape}")

CV工程师实用案例

图像数据集管理

python 复制代码

print("=== CV工程师实用案例 ===")

# Case 1: managing an image dataset
print("案例1：图像数据集管理")

# Simulated metadata for 1000 images
image_data = {
    'image_id': [f'img_{i:04d}' for i in range(1000)],
    'width': np.random.randint(800, 2561, 1000),
    'height': np.random.randint(600, 1441, 1000),
    'channels': np.random.choice([1, 3], 1000, p=[0.1, 0.9]),
    'format': np.random.choice(['JPEG', 'PNG', 'BMP'], 1000),
    'file_size': np.random.randint(1024, 1024*1024, 1000),  # 1KB-1MB
    'category': np.random.choice(['person', 'car', 'cat', 'dog', 'bird', 'other'], 1000),
    'split': np.random.choice(['train', 'val', 'test'], 1000, p=[0.7, 0.15, 0.15])
}

df_images = pd.DataFrame(image_data)
n_images = len(df_images)
category_counts = df_images['category'].value_counts()
split_counts = df_images['split'].value_counts()
print(f"图像数据集: {n_images} 张图像")
print(f"类别分布:\n{category_counts}")
print(f"数据集划分:\n{split_counts}")

# Derived features: aspect ratio, pixel count, and a file-size bucket
# (up to 100 KB / up to 500 KB / larger).
size_bins = [0, 1024*100, 1024*500, float('inf')]
size_labels = ['small', 'medium', 'large']
df_images = df_images.assign(
    aspect_ratio=df_images['width'] / df_images['height'],
    pixel_count=df_images['width'] * df_images['height'],
    size_category=pd.cut(df_images['file_size'], bins=size_bins, labels=size_labels),
)

# Per-split summary. Named aggregation yields the final column names
# directly, so no separate renaming step is needed.
print("\n数据集统计:")
stats = df_images.groupby('split').agg(
    平均宽度=('width', 'mean'),
    平均高度=('height', 'mean'),
    平均像素数=('pixel_count', 'mean'),
    平均文件大小=('file_size', 'mean'),
    类别数=('category', 'nunique'),
).round(2)
print(stats)

# Case 2: analysing a training run
print("\n案例2：模型训练结果分析")

# Simulated training log: exponentially decaying losses and saturating
# accuracies, each with Gaussian noise added. Columns are filled in the order
# the random draws have to happen.
epochs = 50
train_log = pd.DataFrame({'epoch': range(1, epochs+1)})
train_log['train_loss'] = np.exp(-np.linspace(0, 5, epochs)) + np.random.normal(0, 0.02, epochs)
train_log['val_loss'] = np.exp(-np.linspace(0, 4.5, epochs)) + np.random.normal(0, 0.03, epochs)
train_log['train_acc'] = 1 - np.exp(-np.linspace(0, 4, epochs)) + np.random.normal(0, 0.01, epochs)
train_log['val_acc'] = 1 - np.exp(-np.linspace(0, 3.5, epochs)) + np.random.normal(0, 0.015, epochs)
train_log['learning_rate'] = np.logspace(-3, -5, epochs)

print("训练日志:")
print(train_log.head())

# Best epoch = the row with the highest validation accuracy
best_row_label = train_log['val_acc'].idxmax()
best_epoch = train_log.loc[best_row_label]
print(f"\n最佳epoch: {int(best_epoch['epoch'])}")
print(f"最佳验证准确率: {best_epoch['val_acc']:.3%}")
print(f"对应训练准确率: {best_epoch['train_acc']:.3%}")

# Convergence speed: first epoch whose validation loss is below 0.1
converged_rows = train_log[train_log['val_loss'] < 0.1]
convergence_epoch = converged_rows.iloc[0]['epoch']
print(f"收敛到val_loss<0.1的epoch: {int(convergence_epoch)}")

# Case 3: analysing detection results
print("\n案例3：检测结果分析")

# Simulated detections; image ids are sampled from the image table
n_detections = 500
detection_results = pd.DataFrame({
    'image_id': np.random.choice(df_images['image_id'], n_detections),
    'class': np.random.choice(['person', 'car', 'cat', 'dog'], n_detections),
    'confidence': np.random.beta(5, 2, n_detections),  # skewed towards high confidence
    'bbox_x': np.random.uniform(0, 1, n_detections),
    'bbox_y': np.random.uniform(0, 1, n_detections),
    'bbox_w': np.random.uniform(0.1, 0.5, n_detections),
    'bbox_h': np.random.uniform(0.1, 0.5, n_detections)
})

# Quality assessment: box area plus a confidence bucket
quality_bins = [0, 0.5, 0.7, 0.9, 1.0]
quality_labels = ['poor', 'fair', 'good', 'excellent']
detection_results = detection_results.assign(
    bbox_area=detection_results['bbox_w'] * detection_results['bbox_h'],
    detection_quality=pd.cut(
        detection_results['confidence'],
        bins=quality_bins,
        labels=quality_labels,
    ),
)

print("检测结果分析:")
print(f"总检测数: {len(detection_results)}")

# Per-class summary; named aggregation produces the final column names
class_stats = detection_results.groupby('class').agg(
    平均置信度=('confidence', 'mean'),
    置信度标准差=('confidence', 'std'),
    平均边界框面积=('bbox_area', 'mean'),
    涉及图像数=('image_id', 'nunique'),
)
print(f"\n按类别统计:\n{class_stats}")

# Simplified AP computation
def calculate_map(results, iou_threshold=0.5):
    """Return a simplified average-precision score for a set of detections.

    Detections are ranked by descending confidence. A detection is treated
    as a true positive when its confidence is above 0.5, which stands in for
    real IoU matching against ground truth. The score is the mean of the
    running precision over all ranked detections.

    Args:
        results: DataFrame with a numeric ``confidence`` column. It is not
            modified.
        iou_threshold: Accepted for interface compatibility only; this
            simplified implementation does not use it.

    Returns:
        The mean running precision; NaN when ``results`` has no rows.
    """
    # Only the confidence column matters, so rank that series directly
    # instead of sorting and extending a copy of the whole frame.
    ranked_confidence = results['confidence'].sort_values(ascending=False)

    # Simulated TP/FP labels
    is_tp = ranked_confidence > 0.5

    # Running precision at every rank: TP so far / detections so far
    cumulative_tp = is_tp.cumsum()
    cumulative_fp = (~is_tp).cumsum()
    precision = cumulative_tp / (cumulative_tp + cumulative_fp)

    return precision.mean()

# AP per class. sort=False keeps the classes in order of first appearance.
ap_scores = {}
for cls, cls_results in detection_results.groupby('class', sort=False):
    ap_scores[cls] = calculate_map(cls_results)

print("\n各类别AP分数:")
for cls, ap in ap_scores.items():
    print(f"  {cls}: {ap:.3f}")

# mAP is the unweighted mean of the per-class AP values
mean_ap = np.mean(list(ap_scores.values()))
print(f"mAP: {mean_ap:.3f}")

Matplotlib

架构

Figure（图像）：顶级容器，所有绘图元素的容器
Axes（坐标系）：带有坐标系的绘图区域，一个Figure可以包含多个Axes
Axis（坐标轴）：坐标系中的轴，包含刻度、标签等
Artist（艺术家）：所有可见元素的基类（文本、线条、图像等）

绘图风格

MATLAB风格：函数式接口，使用plt.plot()等函数
面向对象风格：显式创建Figure和Axes对象，更灵活可控

基础绘图

创建图形

函数/方法	作用	示例
`plt.figure()`	创建图形	`plt.figure(figsize=(8,6))`
`plt.subplots()`	创建图形和子图	`fig, ax = plt.subplots()`
`plt.subplot()`	创建子图	`plt.subplot(2,2,1)`
`plt.gcf()`	获取当前图形	`fig = plt.gcf()`
`plt.gca()`	获取当前坐标系	`ax = plt.gca()`
`plt.clf()`	清除当前图形	`plt.clf()`
`plt.cla()`	清除当前坐标系	`plt.cla()`
`plt.close()`	关闭图形	`plt.close('all')`

python 复制代码

import matplotlib.pyplot as plt
import numpy as np

# 2.1 Creating figures
print("=== 创建图形示例 ===")

# The same y = x**2 samples are drawn through both interfaces
sample_x = [1, 2, 3, 4]
sample_y = [1, 4, 9, 16]

# Approach 1: MATLAB style — pyplot functions act on the current figure
plt.figure(figsize=(10, 6), dpi=100, facecolor='white')
plt.plot(sample_x, sample_y)
plt.title('MATLAB Style Plot')
plt.xlabel('X轴')
plt.ylabel('Y轴')
plt.grid(True)
plt.show()

# Approach 2: object-oriented style (recommended) — explicit Figure and Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(sample_x, sample_y)
ax.set(title='Object-Oriented Style Plot', xlabel='X轴', ylabel='Y轴')
ax.grid(True)
plt.show()

# 2.2 Several subplots in one figure
print("=== 创建子图示例 ===")

# Approach 1: a regular grid from plt.subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
fig.suptitle('2x2子图示例')
(top_left, top_right), (bottom_left, bottom_right) = axes

# Panel 1: red circles joined by a solid line
top_left.plot([1, 2, 3, 4], [1, 4, 9, 16], 'ro-')
top_left.set_title('子图 1')

# Panel 2: blue squares joined by a dashed line
top_right.plot([1, 2, 3, 4], [1, 2, 3, 4], 'bs--')
top_right.set_title('子图 2')

# Panel 3: one period of a sine curve
x = np.linspace(0, 2*np.pi, 100)
bottom_left.plot(x, np.sin(x), 'g-')
bottom_left.set_title('正弦曲线')

# Panel 4: bar chart
bottom_right.bar(['A', 'B', 'C', 'D'], [3, 7, 2, 5], color='orange')
bottom_right.set_title('柱状图')

plt.tight_layout()
plt.show()

# Approach 2: an irregular layout built with plt.subplot
fig = plt.figure(figsize=(10, 6))

# Cell 1 of the 2x2 grid: first row, first column. This is a single cell,
# so the title no longer claims that it spans two columns.
ax1 = plt.subplot(2, 2, 1)
ax1.plot([1, 2, 3, 4], [1, 4, 9, 16])
ax1.set_title('子图1')

# Cell 2 of the 2x2 grid: first row, second column
ax2 = plt.subplot(2, 2, 2)
ax2.plot([1, 2, 3, 4], [1, 2, 3, 4])
ax2.set_title('子图2')

# Cells 3 and 4 merged: the whole second row, spanning both columns
ax3 = plt.subplot(2, 2, (3, 4))
ax3.plot([1, 2, 3, 4], [1, 8, 27, 64])
ax3.set_title('子图3 (跨两列)')

plt.tight_layout()
plt.show()

基本图表类型

线图

函数	作用	示例
`plt.plot()`	绘制线图	`plt.plot(x, y, 'r--', linewidth=2)`
`plt.scatter()`	绘制散点图	`plt.scatter(x, y, s=50, c='blue')`
`plt.errorbar()`	绘制误差线	`plt.errorbar(x, y, yerr=error)`
`plt.fill_between()`	填充区域	`plt.fill_between(x, y1, y2)`
`plt.stem()`	绘制火柴杆图	`plt.stem(x, y)`
`plt.step()`	绘制阶梯图	`plt.step(x, y)`
`plt.bar()`	绘制柱状图	`plt.bar(x, height, width=0.8)`
`plt.barh()`	绘制水平柱状图	`plt.barh(y, width, height=0.8)`
`plt.pie()`	绘制饼图	`plt.pie(sizes, labels=labels)`
`plt.hist()`	绘制直方图	`plt.hist(data, bins=20)`
`plt.boxplot()`	绘制箱线图	`plt.boxplot(data)`
`plt.violinplot()`	绘制小提琴图	`plt.violinplot(data)`
`plt.imshow()`	显示图像	`plt.imshow(img, cmap='gray')`

python

python 复制代码

# 3.1 Basic chart types
print("=== 线图示例 ===")

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('基本图表类型')

# 1. Line plot
x = np.linspace(0, 10, 100)
axes[0, 0].plot(x, np.sin(x), label='sin(x)', color='red', linewidth=2, linestyle='-')
axes[0, 0].plot(x, np.cos(x), label='cos(x)', color='blue', linewidth=2, linestyle='--')
axes[0, 0].set_title('线图')
axes[0, 0].legend()
axes[0, 0].grid(True)

# 2. Scatter plot (seeded so the example is reproducible)
np.random.seed(42)
x_scatter = np.random.randn(50)
y_scatter = np.random.randn(50)
colors = np.random.rand(50)
sizes = 1000 * np.random.rand(50)

axes[0, 1].scatter(x_scatter, y_scatter, c=colors, s=sizes, alpha=0.6, cmap='viridis')
axes[0, 1].set_title('散点图')
axes[0, 1].set_xlabel('X')
axes[0, 1].set_ylabel('Y')

# 3. Bar chart
categories = ['A', 'B', 'C', 'D', 'E']
values = [25, 40, 30, 35, 20]
colors_bar = ['red', 'blue', 'green', 'orange', 'purple']

axes[0, 2].bar(categories, values, color=colors_bar, edgecolor='black', linewidth=2)
axes[0, 2].set_title('柱状图')
axes[0, 2].set_xlabel('类别')
axes[0, 2].set_ylabel('值')

# 4. Histogram
data_hist = np.random.randn(1000)
axes[1, 0].hist(data_hist, bins=30, color='skyblue', edgecolor='black', alpha=0.7)
axes[1, 0].set_title('直方图')
axes[1, 0].set_xlabel('值')
axes[1, 0].set_ylabel('频数')

# 5. Pie chart
sizes_pie = [15, 30, 45, 10]
labels_pie = ['A类', 'B类', 'C类', 'D类']
explode = (0, 0.1, 0, 0)  # pull the second wedge out

axes[1, 1].pie(sizes_pie, explode=explode, labels=labels_pie, autopct='%1.1f%%',
               shadow=True, startangle=90)
axes[1, 1].set_title('饼图')

# 6. Box plot
data_box = [np.random.normal(0, std, 100) for std in range(1, 5)]
axes[1, 2].boxplot(data_box)
# The tick labels are set on the axes rather than through boxplot's `labels`
# keyword, which is deprecated (renamed `tick_labels`) in Matplotlib 3.9;
# set_xticklabels works on old and new versions alike.
axes[1, 2].set_xticklabels(['组1', '组2', '组3', '组4'])
axes[1, 2].set_title('箱线图')
axes[1, 2].set_ylabel('值')

plt.tight_layout()
plt.show()

# 3.2 More chart types
print("=== 更多图表类型 ===")

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('更多图表类型')

# 1. Error bars
x_error = np.arange(1, 6)
y_error = np.array([2, 3.5, 4, 4.5, 5])
yerr = np.array([0.2, 0.3, 0.4, 0.5, 0.6])

# The marker is given once, through fmt. Passing both fmt='o' and marker='s'
# defines it twice: Matplotlib warns and the keyword wins, so fmt='s' keeps
# the square markers that were actually drawn.
axes[0, 0].errorbar(x_error, y_error, yerr=yerr, fmt='s', capsize=5,
                    ecolor='red', mfc='blue', mec='blue')
axes[0, 0].set_title('误差线图')

# 2. Filled regions between two curves
x_fill = np.linspace(0, 10, 100)
y1_fill = np.sin(x_fill)
y2_fill = np.cos(x_fill)

axes[0, 1].plot(x_fill, y1_fill, 'b-', label='sin(x)')
axes[0, 1].plot(x_fill, y2_fill, 'r-', label='cos(x)')
axes[0, 1].fill_between(x_fill, y1_fill, y2_fill, where=(y1_fill > y2_fill),
                        color='blue', alpha=0.3, label='sin>cos')
axes[0, 1].fill_between(x_fill, y1_fill, y2_fill, where=(y1_fill <= y2_fill),
                        color='red', alpha=0.3, label='sin≤cos')
axes[0, 1].set_title('填充区域图')
axes[0, 1].legend()

# 3. Step plot
x_step = np.arange(1, 11)
y_step = np.random.randint(1, 10, 10)

axes[0, 2].step(x_step, y_step, where='mid', linewidth=2, marker='o')
axes[0, 2].set_title('阶梯图')
axes[0, 2].grid(True)

# 4. Stem plot of a damped cosine
x_stem = np.linspace(0.1, 2*np.pi, 20)
y_stem = np.exp(-x_stem) * np.cos(2*np.pi*x_stem)

axes[1, 0].stem(x_stem, y_stem, linefmt='C0-', markerfmt='C0o', basefmt='C3--')
axes[1, 0].set_title('火柴杆图')

# 5. Horizontal bar chart
categories_hbar = ['模型A', '模型B', '模型C', '模型D', '模型E']
accuracy = [0.85, 0.92, 0.78, 0.95, 0.88]

axes[1, 1].barh(categories_hbar, accuracy, color='lightgreen', edgecolor='darkgreen')
axes[1, 1].set_title('模型准确率')
axes[1, 1].set_xlabel('准确率')
axes[1, 1].set_xlim(0, 1)

# 6. Violin plot
data_violin = [np.random.normal(0, std, 100) for std in range(1, 4)]

axes[1, 2].violinplot(data_violin, showmeans=True, showmedians=True)
axes[1, 2].set_title('小提琴图')
axes[1, 2].set_xticks([1, 2, 3])
axes[1, 2].set_xticklabels(['组1', '组2', '组3'])

plt.tight_layout()
plt.show()

图像显示与处理

图像显示

函数	作用	示例
`plt.imshow()`	显示图像	`plt.imshow(img, cmap='gray')`
`plt.colorbar()`	显示颜色条	`plt.colorbar()`
`plt.axis()`	坐标轴设置	`plt.axis('off')` 或 `plt.axis('equal')`
`plt.matshow()`	显示矩阵	`plt.matshow(matrix)`
`plt.contour()`	绘制等高线	`plt.contour(X, Y, Z)`
`plt.contourf()`	填充等高线	`plt.contourf(X, Y, Z)`
`plt.pcolor()`	伪彩色图	`plt.pcolor(X, Y, Z)`
`plt.pcolormesh()`	网格伪彩色图	`plt.pcolormesh(X, Y, Z)`
`plt.streamplot()`	流线图	`plt.streamplot(X, Y, U, V)`

python 复制代码

# 4.1 Displaying images
print("=== 图像显示示例 ===")

# Build the sample images
# 1. Random image
random_image = np.random.rand(100, 100)

# 2. Smooth image: a 2D Gaussian sampled on a grid
x = np.linspace(-2, 2, 100)
y = np.linspace(-2, 2, 100)
X, Y = np.meshgrid(x, y)
Z = np.exp(-(X**2 + Y**2))

# 3. Image with a feature: a filled circle of radius 20 centred at (50, 50).
# The mask is computed in one vectorised expression on broadcast row/column
# index grids instead of a 100x100 Python loop.
circle_rows, circle_cols = np.ogrid[:100, :100]
circle_image = ((circle_rows - 50)**2 + (circle_cols - 50)**2 < 400).astype(float)

# 4. RGB image
rgb_image = np.zeros((100, 100, 3))
ramp = np.linspace(0, 1, 100)
rgb_image[:, :, 0] = ramp.reshape(1, -1)  # red channel: horizontal gradient
rgb_image[:, :, 1] = ramp.reshape(-1, 1)  # green channel: vertical gradient
rgb_image[:, :, 2] = 0.5                  # blue channel: constant

# Show the sample images in a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('图像显示示例')

# 1. Grayscale
axes[0, 0].imshow(random_image, cmap='gray')
axes[0, 0].set_title('随机灰度图像')

# 2. Pseudo-colour with a colorbar
im = axes[0, 1].imshow(Z, cmap='hot')
axes[0, 1].set_title('2D高斯函数 (hot colormap)')
plt.colorbar(im, ax=axes[0, 1])

# 3. Binary image
axes[0, 2].imshow(circle_image, cmap='binary')
axes[0, 2].set_title('圆形二值图像')

# 4. RGB colour image (no colormap involved)
axes[1, 0].imshow(rgb_image)
axes[1, 0].set_title('RGB彩色图像')

# 5. Same data as panel 2 under a different colormap
im2 = axes[1, 1].imshow(Z, cmap='viridis')
axes[1, 1].set_title('viridis colormap')
plt.colorbar(im2, ax=axes[1, 1])

# 6. Interpolated rendering
axes[1, 2].imshow(Z, cmap='coolwarm', interpolation='bilinear')
axes[1, 2].set_title('双线性插值')

# Images need no ticks or frame: hide the axis decorations of every panel.
# The colorbar axes are not part of `axes`, so they keep their ticks.
for panel in axes.flat:
    panel.axis('off')

plt.tight_layout()
plt.show()

# 4.2 Contours and pseudo-colour plots
print("=== 等高线和伪彩色图 ===")

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('等高线和伪彩色图')

# Sample scalar field on a regular grid
x_contour = np.linspace(-3, 3, 100)
y_contour = np.linspace(-3, 3, 100)
X_contour, Y_contour = np.meshgrid(x_contour, y_contour)
Z_contour = np.sin(X_contour) * np.cos(Y_contour) + 0.1 * X_contour

# 1. Contour lines with inline labels
contour = axes[0, 0].contour(X_contour, Y_contour, Z_contour, 20, cmap='RdGy')
axes[0, 0].set_title('等高线图')
axes[0, 0].clabel(contour, inline=True, fontsize=8)
axes[0, 0].set_xlabel('X')
axes[0, 0].set_ylabel('Y')

# 2. Filled contours
contourf = axes[0, 1].contourf(X_contour, Y_contour, Z_contour, 20, cmap='viridis')
axes[0, 1].set_title('填充等高线图')
axes[0, 1].set_xlabel('X')
axes[0, 1].set_ylabel('Y')
plt.colorbar(contourf, ax=axes[0, 1])

# 3. Pseudo-colour plot
pcolor = axes[0, 2].pcolor(X_contour, Y_contour, Z_contour, cmap='coolwarm', shading='auto')
axes[0, 2].set_title('伪彩色图')
axes[0, 2].set_xlabel('X')
axes[0, 2].set_ylabel('Y')
plt.colorbar(pcolor, ax=axes[0, 2])

# 4. Pseudo-colour mesh. shading='auto' is passed explicitly, matching the
# pcolor call above, because X, Y and Z all have the same shape.
pcolormesh = axes[1, 0].pcolormesh(X_contour, Y_contour, Z_contour, cmap='Spectral', shading='auto')
axes[1, 0].set_title('网格伪彩色图')
axes[1, 0].set_xlabel('X')
axes[1, 0].set_ylabel('Y')
plt.colorbar(pcolormesh, ax=axes[1, 0])

# 5. Filled contours with contour lines on top
contourf2 = axes[1, 1].contourf(X_contour, Y_contour, Z_contour, 20, cmap='bone', alpha=0.7)
contour2 = axes[1, 1].contour(X_contour, Y_contour, Z_contour, 20, colors='black', linewidths=0.5)
axes[1, 1].set_title('等高线与伪彩色结合')
axes[1, 1].set_xlabel('X')
axes[1, 1].set_ylabel('Y')
plt.colorbar(contourf2, ax=axes[1, 1])

# 6. Gradient magnitude of the field shown as an image
gradient = np.gradient(Z_contour)
magnitude = np.sqrt(gradient[0]**2 + gradient[1]**2)

# Row 0 of the grid is y = -3, so the image is drawn with origin='lower'.
# With imshow's default origin='upper' it would appear vertically flipped
# relative to the extent and to the other five panels.
im_contour = axes[1, 2].imshow(magnitude, cmap='jet', extent=[-3, 3, -3, 3], origin='lower')
axes[1, 2].set_title('梯度幅度图像')
axes[1, 2].set_xlabel('X')
axes[1, 2].set_ylabel('Y')
plt.colorbar(im_contour, ax=axes[1, 2])

plt.tight_layout()
plt.show()

3D绘图

3D图形

函数	作用	示例
`Axes3D.plot_surface()`	绘制3D曲面	`ax.plot_surface(X, Y, Z, cmap='viridis')`
`Axes3D.plot_wireframe()`	绘制3D线框	`ax.plot_wireframe(X, Y, Z, color='black')`
`Axes3D.scatter()`	绘制3D散点	`ax.scatter(x, y, z, c=z, cmap='viridis')`
`Axes3D.plot()`	绘制3D曲线	`ax.plot(x, y, z, 'r-', linewidth=2)`
`Axes3D.contour3D()`	绘制3D等高线	`ax.contour3D(X, Y, Z, 50, cmap='binary')`
`Axes3D.quiver()`	绘制3D箭头	`ax.quiver(x, y, z, u, v, w)`

python 复制代码

# 5.1 3D plotting
print("=== 3D绘图示例 ===")

# 3D toolkit import
from mpl_toolkits.mplot3d import Axes3D

# Surface data: a radial sine wave on a 50x50 grid
x_3d = np.linspace(-5, 5, 50)
y_3d = np.linspace(-5, 5, 50)
X_3d, Y_3d = np.meshgrid(x_3d, y_3d)
Z_3d = np.sin(np.sqrt(X_3d**2 + Y_3d**2))

# Scatter data (seeded for reproducibility)
np.random.seed(42)
n_points = 200
x_scatter_3d = np.random.randn(n_points)
y_scatter_3d = np.random.randn(n_points)
z_scatter_3d = np.random.randn(n_points)
colors_3d = np.random.rand(n_points)
sizes_3d = 100 * np.random.rand(n_points)

# One figure holding six 3D axes in a 2x3 layout
fig = plt.figure(figsize=(16, 10))

# 1. Surface
ax1 = fig.add_subplot(231, projection='3d')
surf = ax1.plot_surface(X_3d, Y_3d, Z_3d, cmap='viridis', alpha=0.8, edgecolor='none')
ax1.set_title('3D曲面图')
fig.colorbar(surf, ax=ax1, shrink=0.5, aspect=10)

# 2. Wireframe
ax2 = fig.add_subplot(232, projection='3d')
ax2.plot_wireframe(X_3d, Y_3d, Z_3d, color='blue', linewidth=0.5, alpha=0.7)
ax2.set_title('3D线框图')

# 3. Scatter
ax3 = fig.add_subplot(233, projection='3d')
scatter_3d = ax3.scatter(x_scatter_3d, y_scatter_3d, z_scatter_3d,
                         c=colors_3d, s=sizes_3d, alpha=0.6, cmap='plasma')
ax3.set_title('3D散点图')
fig.colorbar(scatter_3d, ax=ax3, shrink=0.5, aspect=10)

# 4. Bars: a 5x5 grid of bars anchored at z = 0 with random heights
ax4 = fig.add_subplot(234, projection='3d')
x_pos = np.arange(5)
y_pos = np.arange(5)
x_pos, y_pos = np.meshgrid(x_pos, y_pos)
x_pos = x_pos.flatten()
y_pos = y_pos.flatten()
z_pos = np.zeros_like(x_pos)
dx = dy = 0.5 * np.ones_like(z_pos)
dz = np.random.rand(25)

# Colour each bar by its height relative to the tallest bar
colors_bar3d = plt.cm.viridis(dz / dz.max())
ax4.bar3d(x_pos, y_pos, z_pos, dx, dy, dz, color=colors_bar3d, shade=True)
ax4.set_title('3D柱状图')

# 5. Curve: a spiral whose radius grows with |z|
ax5 = fig.add_subplot(235, projection='3d')
theta = np.linspace(-4 * np.pi, 4 * np.pi, 100)
z_curve = np.linspace(-2, 2, 100)
r_curve = z_curve**2 + 1
x_curve = r_curve * np.sin(theta)
y_curve = r_curve * np.cos(theta)

ax5.plot(x_curve, y_curve, z_curve, 'r-', linewidth=2)
ax5.set_title('3D曲线图 (螺旋线)')

# 6. Contours
ax6 = fig.add_subplot(236, projection='3d')
contour3d = ax6.contour3D(X_3d, Y_3d, Z_3d, 50, cmap='binary')
ax6.set_title('3D等高线图')

# All six panels share the same axis labels
for panel_3d in (ax1, ax2, ax3, ax4, ax5, ax6):
    panel_3d.set_xlabel('X')
    panel_3d.set_ylabel('Y')
    panel_3d.set_zlabel('Z')

plt.tight_layout()
plt.show()

高级定制

图形元素定制

元素	定制方法	示例
线条	`linestyle`, `linewidth`, `color`, `marker`	`'r--o', linewidth=2, markersize=8`
标记	`marker`, `markersize`, `markerfacecolor`	`marker='s', markersize=10, mfc='red'`
文本	`plt.text()`, `plt.annotate()`, `plt.title()`	`plt.text(x, y, 'text', fontsize=12)`
图例	`plt.legend()`, `loc`, `frameon`, `ncol`	`plt.legend(loc='upper right', ncol=2)`
网格	`plt.grid()`, `which`, `linestyle`, `alpha`	`plt.grid(True, linestyle='--', alpha=0.5)`
坐标轴	`plt.xlim()`, `plt.ylim()`, `plt.xticks()`	`plt.xlim(0, 10); plt.xticks(range(0, 11, 2))`
刻度	`plt.tick_params()`, `direction`, `labelsize`	`plt.tick_params(labelsize=10, direction='in')`
颜色条	`plt.colorbar()`, `orientation`, `shrink`	`plt.colorbar(orientation='horizontal')`
子图间距	`plt.subplots_adjust()`, `plt.tight_layout()`	`plt.subplots_adjust(wspace=0.3, hspace=0.3)`

python 复制代码

# 6.1 Customising plot elements
print("=== 图形元素定制 ===")

# Figure with four panels, one per customisation topic
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('图形元素定制示例', fontsize=16, fontweight='bold')

# 1. Line and marker styling
x = np.linspace(0, 10, 100)

# Properties shared by both curves: line width, marker size, marker outline
shared_line_style = dict(
    linewidth=2,
    markersize=6,
    markeredgecolor='black',
    markeredgewidth=1,
)

# Line styles: '-', '--', '-.', ':'   Markers: 'o', 's', '^', 'D', '*'
axes[0, 0].plot(x, np.sin(x), color='red', linestyle='-', marker='o',
                markerfacecolor='blue', label='sin(x)', **shared_line_style)
axes[0, 0].plot(x, np.cos(x), color='green', linestyle='--', marker='s',
                markerfacecolor='yellow', label='cos(x)', **shared_line_style)

axes[0, 0].set_title('线条样式定制')
axes[0, 0].legend(loc='upper right', frameon=True, fancybox=True, shadow=True, ncol=2)
axes[0, 0].grid(True, linestyle='--', alpha=0.5)

# 2. Axis customisation
x_axis = np.linspace(0, 2*np.pi, 20)
y_axis = np.sin(x_axis)

axis_panel = axes[0, 1]
axis_panel.plot(x_axis, y_axis, 'b-o', linewidth=2)

# View limits
axis_panel.set(xlim=(-0.5, 7), ylim=(-1.2, 1.2))

# Tick positions and their labels; x ticks sit at multiples of pi/2
x_tick_positions = [0, np.pi/2, np.pi, 3*np.pi/2, 2*np.pi]
axis_panel.set_xticks(x_tick_positions)
axis_panel.set_xticklabels(['0', 'π/2', 'π', '3π/2', '2π'], fontsize=10)

y_tick_positions = [-1, -0.5, 0, 0.5, 1]
axis_panel.set_yticks(y_tick_positions)
axis_panel.set_yticklabels(['-1', '-0.5', '0', '0.5', '1'], fontsize=10)

# Ticks point into the plot area
axis_panel.tick_params(axis='both', which='both', direction='in', length=6, width=2)

# Axis labels
axis_panel.set_xlabel('角度 (弧度)', fontsize=12, fontweight='bold')
axis_panel.set_ylabel('正弦值', fontsize=12, fontweight='bold')

axis_panel.set_title('坐标轴定制')
axis_panel.grid(True, linestyle=':', alpha=0.7)

# 3. Text and annotations
np.random.seed(42)
x_text = np.arange(1, 11)
y_text = np.random.rand(10) * 100

text_panel = axes[1, 0]
text_panel.plot(x_text, y_text, 'g-D', linewidth=2, markersize=8)

# Free-standing text inside a rounded, semi-transparent box
text_panel.text(5, 80, '峰值区域', fontsize=12, fontweight='bold',
                bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.5))

# Arrow annotation pointing at the highest data point; the label is offset
# two units to the left of and twenty units below that point.
peak_index = np.argmax(y_text)
peak_x = x_text[peak_index]
peak_y = y_text[peak_index]
text_panel.annotate('最大值', xy=(peak_x, peak_y),
                    xytext=(peak_x - 2, peak_y - 20),
                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3', color='red', lw=2),
                    fontsize=12, fontweight='bold')

# A plain arrow drawn in data coordinates
text_panel.arrow(2, 30, 2, 20, head_width=0.3, head_length=5, fc='blue', ec='blue')

text_panel.set_title('文本和标注')
text_panel.set_xlabel('X轴')
text_panel.set_ylabel('Y轴')
text_panel.grid(True, alpha=0.3)

# 4. Legend and colorbar customisation
x_multi = np.linspace(0, 10, 100)
for i in range(5):
    # Five phase-shifted sine curves of increasing amplitude
    y_multi = np.sin(x_multi + i * 0.5) * (i + 1) * 0.2
    axes[1, 1].plot(x_multi, y_multi, label=f'曲线 {i+1}', linewidth=2)

# Customised legend
legend = axes[1, 1].legend(loc='upper left',
                           fontsize=10,
                           title='图例标题',
                           title_fontsize=12,
                           frameon=True,
                           fancybox=True,
                           shadow=True,
                           borderpad=1,
                           labelspacing=0.5,
                           handlelength=2,
                           ncol=2)

# Legend background colour and transparency
legend.get_frame().set_facecolor('lightgray')
legend.get_frame().set_alpha(0.7)

axes[1, 1].set_title('图例和颜色条定制')
axes[1, 1].set_xlabel('X轴')
axes[1, 1].set_ylabel('Y轴')
axes[1, 1].grid(True, alpha=0.3)

# Colorbar example driven by a small pseudo-image in the corner of the panel
im = axes[1, 1].imshow(np.random.rand(10, 10), extent=[8, 10, 0, 2], aspect='auto', alpha=0.5)
# imshow() snaps the view limits to the image extent, which would crop the
# curves drawn above to x in [8, 10]. Re-running autoscaling fits the view to
# the curves and the image together.
axes[1, 1].autoscale()
cbar = plt.colorbar(im, ax=axes[1, 1], orientation='vertical', pad=0.01, shrink=0.8)
cbar.set_label('颜色标度', fontsize=10)

plt.tight_layout()
plt.show()

样式与主题

样式设置

方法	作用	示例
`plt.style.use()`	使用样式	`plt.style.use('ggplot')`
`plt.rcParams.update()`	更新配置	`plt.rcParams.update({'font.size': 12})`
`plt.rc()`	设置配置	`plt.rc('lines', linewidth=2)`
`plt.xkcd()`	XKCD漫画风格	`with plt.xkcd(): plt.plot(...)`
可用样式		`'default'`, `'ggplot'`, `'seaborn-v0_8'`（Matplotlib 3.6 之前名为 `'seaborn'`）, `'fivethirtyeight'`, `'grayscale'`

python 复制代码

# 7.1 Styles and themes
print("=== 样式与主题 ===")

# List every style available in this Matplotlib installation
print("可用样式:", plt.style.available)

# The bundled seaborn style is called 'seaborn-v0_8' since Matplotlib 3.6 and
# the plain 'seaborn' name was later removed, so use whichever one exists.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'

# Styles to compare
styles = ['default', 'ggplot', seaborn_style, 'fivethirtyeight', 'grayscale', 'dark_background']

fig = plt.figure(figsize=(15, 10))
fig.suptitle('不同样式比较', fontsize=16, fontweight='bold')

# 2x3 grid of axes, filled in below so that `axes[row, col]` keeps working
axes = np.empty((2, 3), dtype=object)

for idx, style in enumerate(styles):
    row = idx // 3
    col = idx % 3

    with plt.style.context(style):
        # Create the axes inside the style context: axes-level settings such
        # as background colour and colour cycle are read when the axes is
        # created, so axes made beforehand would all look the same.
        ax = fig.add_subplot(2, 3, idx + 1)
        axes[row, col] = ax

        # Sample data
        x = np.linspace(0, 10, 100)
        y1 = np.sin(x)
        y2 = np.cos(x)
        y3 = np.exp(-x/5) * np.sin(x)

        # Draw
        ax.plot(x, y1, label='sin(x)', linewidth=2)
        ax.plot(x, y2, label='cos(x)', linewidth=2)
        ax.plot(x, y3, label='衰减正弦', linewidth=2)

        # Decorate
        ax.set_title(f'{style} 样式', fontsize=12)
        ax.set_xlabel('X轴')
        ax.set_ylabel('Y轴')
        ax.legend(loc='best', fontsize=8)
        ax.grid(True)

plt.tight_layout()
plt.show()

# 7.2 Custom style
print("=== 自定义样式 ===")

# Approach 1: global overrides through rcParams
custom_rc = {
    # Fonts
    'font.size': 12,
    'font.family': 'sans-serif',
    'font.sans-serif': ['Arial', 'DejaVu Sans'],
    # Figure
    'figure.figsize': (10, 6),
    'figure.autolayout': True,
    # Axes and grid
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'axes.linewidth': 1.5,
    'axes.grid': True,
    'grid.linestyle': '--',
    'grid.alpha': 0.6,
    # Tick labels
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    # Legend
    'legend.fontsize': 10,
    'legend.frameon': True,
    'legend.shadow': True,
    # Lines and markers
    'lines.linewidth': 2,
    'lines.markersize': 8,
    # Saving
    'savefig.dpi': 300,
    'savefig.bbox': 'tight',
}
plt.rcParams.update(custom_rc)

# Draw a figure under the custom settings
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
(ax_curve, ax_scatter), (ax_bar, ax_pie) = axes

# Panel 1: curves
x = np.linspace(0, 2*np.pi, 100)
ax_curve.plot(x, np.sin(x), 'r-', label='sin(x)')
ax_curve.plot(x, np.cos(x), 'b--', label='cos(x)')
ax_curve.set_title('三角函数')
ax_curve.legend()

# Panel 2: scatter (seeded for reproducibility)
np.random.seed(42)
x_scatter = np.random.randn(50)
y_scatter = np.random.randn(50)
colors = np.random.rand(50)
sizes = 100 * np.random.rand(50)
ax_scatter.scatter(x_scatter, y_scatter, c=colors, s=sizes, alpha=0.6, cmap='viridis')
ax_scatter.set_title('散点图')

# Panel 3: bars
categories = ['A', 'B', 'C', 'D', 'E']
values = [25, 40, 30, 35, 20]
ax_bar.bar(categories, values, color=['red', 'blue', 'green', 'orange', 'purple'])
ax_bar.set_title('柱状图')

# Panel 4: pie with the second wedge pulled out
sizes_pie = [15, 30, 45, 10]
labels_pie = ['A类', 'B类', 'C类', 'D类']
explode = (0, 0.1, 0, 0)
ax_pie.pie(sizes_pie, explode=explode, labels=labels_pie, autopct='%1.1f%%',
           shadow=True, startangle=90)
ax_pie.set_title('饼图')

plt.suptitle('自定义样式示例', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# Restore Matplotlib's default settings so later figures are unaffected
plt.rcdefaults()

保存与导出

保存图形

方法	作用	示例
`plt.savefig()`	保存图形	`plt.savefig('figure.png', dpi=300, bbox_inches='tight')`
`fig.savefig()`	保存图形对象	`fig.savefig('figure.pdf', format='pdf')`
参数
`dpi`	分辨率	`dpi=300` (每英寸点数)
`bbox_inches`	边界框	`bbox_inches='tight'` (紧凑边界)
`pad_inches`	内边距	`pad_inches=0.1`
`transparent`	透明背景	`transparent=True`
`format`	格式	`format='png'`, `'pdf'`, `'svg'`, `'jpg'`

python 复制代码

# 8.1 Saving figures
print("=== 保存图形 ===")

# Build a sample figure with two panels
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Panel 1: curves
x = np.linspace(0, 10, 100)
ax1.plot(x, np.sin(x), 'r-', label='sin(x)', linewidth=2)
ax1.plot(x, np.cos(x), 'b--', label='cos(x)', linewidth=2)
ax1.set_title('三角函数')
ax1.set_xlabel('X')
ax1.set_ylabel('Y')
ax1.legend()
ax1.grid(True)

# Panel 2: scatter (seeded for reproducibility)
np.random.seed(42)
x_scatter = np.random.randn(100)
y_scatter = np.random.randn(100)
colors = np.random.rand(100)
sizes = 100 * np.random.rand(100)
scatter = ax2.scatter(x_scatter, y_scatter, c=colors, s=sizes, alpha=0.6, cmap='viridis')
ax2.set_title('散点图')
ax2.set_xlabel('X')
ax2.set_ylabel('Y')
plt.colorbar(scatter, ax=ax2)

plt.suptitle('保存图形示例', fontsize=16)
plt.tight_layout()

# Output directory; created if it does not exist yet
save_dir = './saved_figures/'
import os
os.makedirs(save_dir, exist_ok=True)

# 1. PNG (default format, supports transparency)
plt.savefig(os.path.join(save_dir, 'figure.png'),
            dpi=300,
            bbox_inches='tight',
            facecolor='white',
            edgecolor='black')
print("已保存: figure.png")

# 2. PDF (vector format, scales without loss)
plt.savefig(os.path.join(save_dir, 'figure.pdf'),
            format='pdf',
            bbox_inches='tight')
print("已保存: figure.pdf")

# 3. SVG (vector format, editable)
plt.savefig(os.path.join(save_dir, 'figure.svg'),
            format='svg',
            bbox_inches='tight')
print("已保存: figure.svg")

# 4. JPG (lossy compression, suited to photos)
# JPEG quality (1-100) is forwarded to Pillow through pil_kwargs. The former
# savefig `quality=` keyword was removed in Matplotlib 3.5 and passing it
# raises an error there.
plt.savefig(os.path.join(save_dir, 'figure.jpg'),
            format='jpg',
            dpi=300,
            bbox_inches='tight',
            pil_kwargs={'quality': 95})
print("已保存: figure.jpg")

# 5. TIFF (high quality, suited to print publication)
plt.savefig(os.path.join(save_dir, 'figure.tiff'),
            format='tiff',
            dpi=300,
            bbox_inches='tight')
print("已保存: figure.tiff")

# 6. EPS (vector format, suited to LaTeX)
plt.savefig(os.path.join(save_dir, 'figure.eps'),
            format='eps',
            bbox_inches='tight')
print("已保存: figure.eps")

# 7. PNG with a transparent background
plt.savefig(os.path.join(save_dir, 'figure_transparent.png'),
            dpi=300,
            bbox_inches='tight',
            transparent=True)
print("已保存: figure_transparent.png (透明背景)")

plt.show()

# 8.2 Saving every subplot separately
print("\n=== 批量保存子图 ===")

# Figure with a 2x2 grid of placeholder panels
fig, axes = plt.subplots(2, 2, figsize=(10, 8))
fig.suptitle('批量保存示例')

plot_types = ['曲线图', '散点图', '柱状图', '饼图']

# Styling shared by every centred placeholder label
label_style = dict(
    horizontalalignment='center',
    verticalalignment='center',
    fontsize=14,
    fontweight='bold',
)

# Fill the panels: the label sits at the centre in axes coordinates
for idx, ax in enumerate(axes.flat):
    ax.text(0.5, 0.5, plot_types[idx], transform=ax.transAxes, **label_style)
    ax.set_title(f'子图 {idx+1}')

plt.tight_layout()

# Save the complete figure
multiplot_path = os.path.join(save_dir, 'multiplot.png')
fig.savefig(multiplot_path, dpi=300, bbox_inches='tight')

# Save each panel on its own: the panel content is re-created in a fresh
# single-axes figure, which is then written to disk.
for idx, ax in enumerate(axes.flat):
    fig_single = plt.figure(figsize=(5, 4))

    ax_single = fig_single.add_subplot(111)
    ax_single.text(0.5, 0.5, plot_types[idx], transform=ax_single.transAxes, **label_style)
    ax_single.set_title(f'子图 {idx+1}')
    ax_single.set(xlim=(0, 1), ylim=(0, 1), xticks=[], yticks=[])

    single_path = os.path.join(save_dir, f'subplot_{idx+1}.png')
    fig_single.savefig(single_path, dpi=300, bbox_inches='tight')
    plt.close(fig_single)  # close the figure to free its memory

print("已保存: multiplot.png 和 4个子图")

CV工程师实用案例

图像处理可视化

python 复制代码

# 9.1 Visualising an image-processing pipeline
print("=== CV工程师实用案例: 图像处理可视化 ===")

# Simulated pipeline; seeded so every run produces the same images
np.random.seed(42)

# Test image: uniform random background
original_image = np.random.rand(100, 100)

# Add some features
# 1. A filled circle of radius 20 centred at row 30, column 70. The mask is
# built in one vectorised comparison on broadcast index grids instead of a
# 100x100 Python loop.
pixel_rows, pixel_cols = np.ogrid[:100, :100]
circle_mask = (pixel_rows - 30)**2 + (pixel_cols - 70)**2 < 400
original_image[circle_mask] = 0.8

# 2. A rectangle
original_image[60:80, 20:40] = 0.3

# 3. Additive Gaussian noise, clipped back into the [0, 1] range
noise = np.random.normal(0, 0.1, (100, 100))
noisy_image = original_image + noise
noisy_image = np.clip(noisy_image, 0, 1)

# 4. Gaussian smoothing
from scipy.ndimage import gaussian_filter
filtered_image = gaussian_filter(noisy_image, sigma=1.0)

# 5. Edge detection: magnitude of the Sobel responses along both axes
from scipy.ndimage import sobel
edge_x = sobel(filtered_image, axis=0)
edge_y = sobel(filtered_image, axis=1)
edge_image = np.sqrt(edge_x**2 + edge_y**2)
edge_image = edge_image / edge_image.max()  # normalise so the maximum is 1

# 6. Binarisation: edge pixels above the threshold become 1.0, the rest 0.0
threshold = 0.3
binary_image = (edge_image > threshold).astype(float)

# 可视化处理流程
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('图像处理流程可视化', fontsize=16, fontweight='bold')

# 1. 原始图像
axes[0, 0].imshow(original_image, cmap='gray', vmin=0, vmax=1)
axes[0, 0].set_title('原始图像')
axes[0, 0].axis('off')

# 2. 添加噪声后的图像
axes[0, 1].imshow(noisy_image, cmap='gray', vmin=0, vmax=1)
axes[0, 1].set_title('添加噪声后')
axes[0, 1].axis('off')

# 3. 高斯滤波后
axes[0, 2].imshow(filtered_image, cmap='gray', vmin=0, vmax=1)
axes[0, 2].set_title('高斯滤波后')
axes[0, 2].axis('off')

# 4. 边缘检测
axes[1, 0].imshow(edge_image, cmap='hot', vmin=0, vmax=1)
axes[1, 0].set_title('边缘检测结果')
axes[1, 0].axis('off')

# 5. 二值化结果
axes[1, 1].imshow(binary_image, cmap='binary', vmin=0, vmax=1)
axes[1, 1].set_title('二值化结果')
axes[1, 1].axis('off')

# 6. 处理流程示意图
axes[1, 2].axis('off')
axes[1, 2].text(0.1, 0.9, '图像处理流程:', fontsize=12, fontweight='bold')
axes[1, 2].text(0.1, 0.8, '1. 原始图像', fontsize=10)
axes[1, 2].text(0.1, 0.7, '2. 添加噪声', fontsize=10)
axes[1, 2].text(0.1, 0.6, '3. 高斯滤波去噪', fontsize=10)
axes[1, 2].text(0.1, 0.5, '4. 边缘检测', fontsize=10)
axes[1, 2].text(0.1, 0.4, '5. 二值化', fontsize=10)
axes[1, 2].text(0.1, 0.3, '6. 特征提取', fontsize=10)

plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'image_processing_pipeline.png'), dpi=300, bbox_inches='tight')
plt.show()

# 9.2 Visualizing a model-training run
print("\n=== 模型训练过程可视化 ===")

# Simulated training log: exponentially decaying losses and saturating
# accuracies, each perturbed with Gaussian noise.
epochs = 100
train_loss = np.exp(-np.linspace(0, 5, epochs)) + np.random.normal(0, 0.01, epochs)
val_loss = np.exp(-np.linspace(0, 4.5, epochs)) + np.random.normal(0, 0.015, epochs)
train_acc = 1 - np.exp(-np.linspace(0, 4, epochs)) + np.random.normal(0, 0.005, epochs)
val_acc = 1 - np.exp(-np.linspace(0, 3.5, epochs)) + np.random.normal(0, 0.01, epochs)

# 1-based epoch numbers shared by every curve's x axis.
epoch_axis = range(1, epochs + 1)

# 2x2 dashboard: loss curves, accuracy curves, learning rate, loss vs. accuracy.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('模型训练过程可视化', fontsize=16, fontweight='bold')

# Panel 1: training and validation loss.
ax1.plot(epoch_axis, train_loss, 'b-', linewidth=2, label='训练损失')
ax1.plot(epoch_axis, val_loss, 'r--', linewidth=2, label='验证损失')
ax1.set(xlabel='Epoch', ylabel='损失', title='训练和验证损失曲线')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Highlight the epoch with the lowest validation loss.
best_val_epoch = np.argmin(val_loss) + 1  # +1: argmin is 0-based, epochs are 1-based
best_val_loss = val_loss[best_val_epoch - 1]
best_loss_point = (best_val_epoch, best_val_loss)
ax1.plot(*best_loss_point, 'go', markersize=10)
ax1.annotate(
    f'最佳: {best_val_loss:.4f}',
    xy=best_loss_point,
    xytext=(best_val_epoch + 10, best_val_loss + 0.05),
    arrowprops={'arrowstyle': '->', 'color': 'green'},
    fontsize=10,
)

# Panel 2: training and validation accuracy.
ax2.plot(epoch_axis, train_acc, 'b-', linewidth=2, label='训练准确率')
ax2.plot(epoch_axis, val_acc, 'r--', linewidth=2, label='验证准确率')
ax2.set(xlabel='Epoch', ylabel='准确率', title='训练和验证准确率曲线')
ax2.legend()
ax2.grid(True, alpha=0.3)

# Highlight the epoch with the highest validation accuracy.
best_val_acc_epoch = np.argmax(val_acc) + 1  # same 0-based -> 1-based shift
best_val_acc = val_acc[best_val_acc_epoch - 1]
best_acc_point = (best_val_acc_epoch, best_val_acc)
ax2.plot(*best_acc_point, 'go', markersize=10)
ax2.annotate(
    f'最佳: {best_val_acc:.4f}',
    xy=best_acc_point,
    xytext=(best_val_acc_epoch + 10, best_val_acc - 0.05),
    arrowprops={'arrowstyle': '->', 'color': 'green'},
    fontsize=10,
)

# Panel 3: simulated learning-rate decay, log-spaced from 1e-2 down to 1e-5.
learning_rates = np.logspace(-2, -5, epochs)
ax3.semilogy(epoch_axis, learning_rates, 'g-', linewidth=2)
ax3.set(xlabel='Epoch', ylabel='学习率 (log scale)', title='学习率衰减')
ax3.grid(True, alpha=0.3, which='both')

# Panel 4: loss against accuracy, each point coloured by its epoch index.
epoch_index = range(epochs)
train_points = ax4.scatter(train_loss, train_acc, c=epoch_index, cmap='viridis',
                           s=50, alpha=0.6, label='训练')
ax4.scatter(val_loss, val_acc, c=epoch_index, cmap='plasma',
            s=50, alpha=0.6, marker='s', label='验证')
ax4.set(xlabel='损失', ylabel='准确率', title='损失-准确率关系')
ax4.legend()
ax4.grid(True, alpha=0.3)

# Colour bar for the training scatter only; it maps colour to epoch index.
cbar_train = plt.colorbar(train_points, ax=ax4, pad=0.01)
cbar_train.set_label('训练进度 (Epoch)')

plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'training_visualization.png'), dpi=300, bbox_inches='tight')
plt.show()

# 9.3 Visualizing object-detection results
# Left panel: a synthetic image with randomly generated, colour-coded boxes.
# Right panel: a text summary (per-category counts, mean confidences, legend).
print("\n=== 检测结果可视化 ===")

# Fixed seed so the simulated detections are reproducible.
np.random.seed(42)

# Synthetic RGB test image: 200 rows x 300 columns.
test_image = np.random.rand(200, 300, 3)

# Simulate detections: a box, a category and a confidence score each.
n_detections = 15
detections = []

for _ in range(n_detections):
    # Random top-left corner and box size.
    x1 = np.random.randint(0, 250)
    y1 = np.random.randint(0, 150)
    width = np.random.randint(30, 80)
    height = np.random.randint(30, 80)

    # Clamp the bottom-right corner to the last valid pixel of the image.
    x2 = min(x1 + width, 299)
    y2 = min(y1 + height, 199)

    # Random category and confidence.
    category = np.random.choice(['person', 'car', 'dog', 'cat'])
    confidence = np.random.uniform(0.5, 0.95)

    detections.append({
        'bbox': [x1, y1, x2, y2],
        'category': category,
        'confidence': confidence
    })

# Box / label colour for each category.
category_colors = {
    'person': 'red',
    'car': 'blue',
    'dog': 'green',
    'cat': 'orange'
}

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
fig.suptitle('目标检测结果可视化', fontsize=16, fontweight='bold')

# 1. Image with the detection boxes drawn on top.
axes[0].imshow(test_image)
axes[0].set_title('检测结果')
axes[0].axis('off')

for det in detections:
    bbox = det['bbox']
    category = det['category']
    confidence = det['confidence']
    color = category_colors[category]

    # Rectangle takes (x, y) of one corner plus width and height.
    rect = plt.Rectangle((bbox[0], bbox[1]),
                         bbox[2]-bbox[0],
                         bbox[3]-bbox[1],
                         fill=False,
                         edgecolor=color,
                         linewidth=2)
    axes[0].add_patch(rect)

    # Label 5 px above the box (imshow's y axis points downwards).
    label = f'{category}: {confidence:.2f}'
    axes[0].text(bbox[0], bbox[1]-5, label,
                color=color, fontsize=8, fontweight='bold',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.7))

# 2. Statistics panel.
axes[1].axis('off')
# All text below is positioned in data coordinates assuming a 0..1 canvas.
# Pin the limits explicitly: the legend swatches drawn with plot() would
# otherwise autoscale the view to their own tiny extent and push the text
# far outside the axes.
axes[1].set_xlim(0, 1)
axes[1].set_ylim(0, 1)

categories = [det['category'] for det in detections]
confidences = [det['confidence'] for det in detections]

# Group the confidences by category in a single pass; the counts and the
# per-category means are then derived from the groups.
confidences_by_category = {}
for cat_det, conf in zip(categories, confidences):
    confidences_by_category.setdefault(cat_det, []).append(conf)

category_counts = {cat: len(vals) for cat, vals in confidences_by_category.items()}
avg_confidences = {cat: np.mean(vals) for cat, vals in confidences_by_category.items()}

# Write the summary top-down; y_pos tracks the current text line.
y_pos = 0.9
axes[1].text(0.1, y_pos, '检测结果统计:', fontsize=12, fontweight='bold')
y_pos -= 0.05

for cat in sorted(category_counts):
    count = category_counts[cat]
    avg_conf = avg_confidences[cat]
    color = category_colors[cat]

    axes[1].text(0.1, y_pos, f'{cat}:', fontsize=10, fontweight='bold', color=color)
    axes[1].text(0.4, y_pos, f'数量: {count}', fontsize=10)
    axes[1].text(0.7, y_pos, f'平均置信度: {avg_conf:.3f}', fontsize=10)
    y_pos -= 0.05

# Overall totals.
y_pos -= 0.05
axes[1].text(0.1, y_pos, f'总检测数: {len(detections)}', fontsize=10, fontweight='bold')
y_pos -= 0.03
axes[1].text(0.1, y_pos, f'平均置信度: {np.mean(confidences):.3f}', fontsize=10, fontweight='bold')

# Colour legend: a short line swatch followed by the category name.
y_pos -= 0.05
axes[1].text(0.1, y_pos, '图例:', fontsize=10, fontweight='bold')
y_pos -= 0.03

for cat, color in category_colors.items():
    axes[1].plot([0.1, 0.15], [y_pos, y_pos], color=color, linewidth=3)
    axes[1].text(0.18, y_pos, cat, fontsize=9)
    y_pos -= 0.03

plt.tight_layout()
plt.savefig(os.path.join(save_dir, 'detection_results.png'), dpi=300, bbox_inches='tight')
plt.show()