NumPy 完整学习笔记
| 人群 | 原因 |
| --- | --- |
| 有Python基础 | 快速查阅NumPy函数用法 |
| 需要速查手册 | 表格形式,一目了然 |
| 复习巩固 | 已学过,需要快速回顾 |
目录
数组创建与基础
索引与切片
形状操作
运算与广播
数学统计函数
线性代数
随机数生成
文件IO
高级索引
性能优化
一、数组创建与基础
1.1 从列表创建数组
python
复制代码
import numpy as np
# 1-D array: fruit inventory counts built from a plain Python list
fruit_inventory = [150, 230, 89, 176, 210]
fruit_array = np.array(fruit_inventory)
# 2-D array: one row per branch, one column per product
branches_inventory = [
[150, 230, 89], # branch 1: apples, bananas, oranges
[120, 180, 95], # branch 2
[200, 150, 110] # branch 3
]
branches_array = np.array(branches_inventory)
print(branches_array.shape) # (3, 3)
1.2 内置创建函数
| 函数 | 说明 | 示例 | 实际场景 |
| --- | --- | --- | --- |
| np.zeros() | 全0数组 | np.zeros((5, 7)) | 员工考勤表初始化 |
| np.ones() | 全1数组 | np.ones(7) * 8 | 每日8小时工作制 |
| np.full() | 填充指定值 | np.full((5, 7), 2.0) | 加班时长矩阵 |
| np.eye() | 单位矩阵 | np.eye(5) | 员工权限矩阵 |
| np.arange() | 等差序列(不含终点) | np.arange(570, 901, 30) | 股票交易时间点(自0点起的分钟数,9:30~15:00,每30分钟) |
| np.linspace() | 等分数列(含终点) | np.linspace(100, 110, 13) | 价格区间划分 |
1.3 随机数组创建
python
复制代码
np.random.seed(42) # seed the legacy global generator so the results are reproducible
# Simulated e-commerce user behaviour
browsing_time = np.random.random(1000) # browsing time, uniform in [0, 1) (hours)
purchase_quantity = np.random.randint(1, 11, 1000) # purchase quantity 1-10 (upper bound 11 is exclusive)
consumption = np.random.normal(500, 100, 1000) # spend amount: mean 500, standard deviation 100
# Generator API: the recommended interface for new code (available since NumPy 1.17, not only 2.x)
rng = np.random.default_rng(seed=42)
rng.random(10) # uniform on [0, 1)
rng.integers(1, 100, 10) # random integers in [1, 100)
rng.normal(0, 1, 100) # normal distribution
1.4 数组属性
python
复制代码
# Parcel-tracking example: a 3-D array
packages = np.random.randint(1000, 9999, (3, 5, 10))
print("数组形状 (shape):", packages.shape) # (3, 5, 10) - 3 warehouses x 5 shelves x 10 parcels
print("数组维度数 (ndim):", packages.ndim) # 3
print("数组元素总数 (size):", packages.size) # 150 = 3 * 5 * 10
print("数组元素类型 (dtype):", packages.dtype) # int64 - NOTE(review): the default integer dtype depends on platform / NumPy version; confirm
print("每个元素字节数 (itemsize):", packages.itemsize) # 8 (when the dtype is int64)
print("数组总字节数 (nbytes):", packages.nbytes) # 1200 = size * itemsize
1.5 数据类型
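| 类型 | 说明 | 范围 | 应用场景 |
| --- | --- | --- | --- |
| int8 | 8位整数 | -128 ~ 127 | 小范围计数 |
| int16 | 16位整数 | -32768 ~ 32767 | 传感器数据 |
| int32 | 32位整数 | 约 -2.1e9 ~ 2.1e9 | 通用整数 |
| int64 | 64位整数 | 约 -9.2e18 ~ 9.2e18 | 大整数ID |
| float32 | 单精度浮点 | 约7位有效数字 | 图像像素值 |
| float64 | 双精度浮点 | 约15~16位有效数字 | 财务计算 |
| bool | 布尔型 | True/False | 掩码数组 |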
python
复制代码
# Financial data: store currency amounts as integer cents.
transactions = np.array([1234.5678, 9876.5432])
# Round before casting: astype() truncates toward zero, so 123456.78 would
# otherwise become 123456 instead of 123457.  int64 leaves headroom for large
# amounts (int32 overflows above roughly 21 million currency units).
transactions_cents = np.round(transactions * 100).astype(np.int64)
# Image data: uint8 covers the 0-255 range of each colour channel.
image = np.zeros((1080, 1920, 3), dtype=np.uint8)
二、索引与切片
2.1 基础索引
python
复制代码
# Supermarket price lookup
prices = np.array([5.5, 3.2, 8.9, 12.5, 6.8, 4.5, 9.9])
products = ['苹果', '香蕉', '橙子', '葡萄', '西瓜', '梨', '桃子']
prices[0] # 5.5 - price of the first product
prices[-1] # 9.9 - price of the last product (negative indices count from the end)
prices[3] # 12.5 - price of the middle product (index 3 of 7)
2.2 二维数组索引
python
复制代码
# Cinema seat-booking example
np.random.seed(42)
seats = np.random.choice([0, 1], size=(10, 15), p=[0.7, 0.3])
seats[2:5, 4:10] = 2 # VIP area: rows 3-5, columns 5-10; overwrites the random status there
# Legend: 0 = free, 1 = booked, 2 = VIP seat
row, col = 2, 5
status = seats[row, col]
seats[2, :] # every seat in the 3rd row
seats[:, 5] # every seat in the 6th column
2.3 切片语法
python
复制代码
# Stock price analysis on a simulated 30-day random walk
np.random.seed(123)
stock_prices = 100 + np.cumsum(np.random.normal(0, 2, 30))
stock_prices = np.round(stock_prices, 2)
stock_prices[:5] # first 5 days
stock_prices[-5:] # last 5 days
stock_prices[9:15] # days 10-15 (indices 9..14; the stop index is exclusive)
stock_prices[::5] # every 5th day
stock_prices[::-1] # reversed order
# Week-over-week change: np.diff returns absolute price differences, not percentages
weekly_prices = stock_prices[::7]
weekly_changes = np.diff(weekly_prices)
2.4 多维切片 - 图像处理
python
复制代码
# Extracting regions from an image
image = np.random.randint(0, 256, (1080, 1920, 3), dtype=np.uint8)
# Centre 400x400 crop (e.g. a face-recognition region)
h_start, w_start = (1080 - 400) // 2, (1920 - 400) // 2
center_crop = image[h_start:h_start+400, w_start:w_start+400]
# Upper half of the image (rows 0-539)
upper_half = image[:540, :]
# Red channel (index 0 on the last axis, assuming RGB channel order)
red_channel = image[:, :, 0]
# Halve the resolution by keeping every other pixel along both spatial axes
thumbnail = image[::2, ::2]
2.5 布尔索引
python
复制代码
# Filtering student scores with boolean masks
np.random.seed(42)
scores = np.random.randint(40, 101, 50)
# Failing students (score < 60)
failing = scores < 60
failing_students = np.where(failing)[0] # integer indices of the failing students
# Excellent students (score >= 90)
excellent = scores >= 90
# Combined conditions: use & with parentheses around each comparison
medium = (scores >= 60) & (scores < 80)
passing = scores[(scores >= 60) & (scores < 90)] # the scores in [60, 90) themselves, not a mask
2.6 花式索引
python
复制代码
# Batch order lookup with integer-array ("fancy") indexing.
order_amounts = np.random.randint(100, 10001, 100)
order_status = np.random.choice([0, 1, 2, 3], 100)
# Look up a handful of specific orders by position.
query_indices = list(range(0, 21, 5))
selected_amounts = order_amounts[query_indices]
# Sample the sales figure of every 7th day.
sales_data = np.random.randint(1000, 5000, 30)
days_to_query = list(range(0, 29, 7))
weekly_sales = sales_data[days_to_query]
# 2-D fancy indexing: np.ix_ builds the row/column cross product in one step.
matrix = np.arange(25).reshape(5, -1)
rows = [0, 2, 4]
cols = [2, 3, 4]
selected = matrix[np.ix_(rows, cols)]
2.7 赋值与修改
python
复制代码
# Inventory management example (5 warehouses x 10 products)
np.random.seed(1)
inventory = np.random.randint(0, 100, (5, 10))
# Update a single element
inventory[0, 0] += 50 # warehouse 1 receives 50 phones
# Overwrite a whole row
inventory[2, :] = 0 # warehouse 3 is reset to zero for stocktaking
# Update a whole column
inventory[:, 3] += 20 # every warehouse gains 20 headphones
# Update a sub-region
inventory[:3, :5] = (inventory[:3, :5] * 0.8).astype(int) # promotion: scale by 0.8; astype(int) drops the fraction
三、形状操作
3.1 reshape
python
复制代码
# Pre-processing a batch of images
raw_data = np.random.randint(0, 256, 100 * 28 * 28)
# Reshape to (samples, height, width)
images = raw_data.reshape(100, 28, 28)
# Add a trailing channel dimension (as used for CNN input)
images_with_channel = images.reshape(100, 28, 28, 1)
# -1 lets NumPy infer the remaining dimension
flattened = images.reshape(100, -1) # (100, 784)
3.2 ravel 和 flatten
python
复制代码
# Preparing data for feature engineering
sensor_data = np.random.rand(5, 4, 3)
# ravel: returns a view whenever possible (shares memory with the original)
raveled = sensor_data.ravel()
print(np.shares_memory(sensor_data, raveled)) # True
# flatten: always returns a copy (independent memory)
flattened = sensor_data.flatten()
print(np.shares_memory(sensor_data, flattened)) # False
3.3 转置与轴交换
python
复制代码
# Image layout conversion (HWC -> CHW)
image_hwc = np.random.randint(0, 256, (1080, 1920, 3), dtype=np.uint8)
image_chw = image_hwc.transpose(2, 0, 1) # channels-first layout, as used by PyTorch
# Batch of images: (B, H, W, C) -> (B, C, H, W)
batch_images = np.random.rand(32, 224, 224, 3)
batch_nchw = batch_images.transpose(0, 3, 1, 2)
# swapaxes exchanges two axes: (2, 3, 4) -> (4, 3, 2)
matrix = np.arange(24).reshape(2, 3, 4)
swapped = matrix.swapaxes(0, 2)
3.4 维度增减
python
复制代码
# Aligning array dimensions for machine-learning inputs
gray_image = np.random.rand(28, 28)
# Add a leading batch dimension -> (1, H, W)
image_with_batch = np.expand_dims(gray_image, axis=0)
# Add a trailing channel dimension -> (H, W, 1)
image_with_channel = np.expand_dims(gray_image, axis=-1)
# Add several dimensions at once with np.newaxis
image_full = gray_image[np.newaxis, ..., np.newaxis] # (1, 28, 28, 1)
# squeeze removes every length-1 axis
squeezed = image_full.squeeze() # (28, 28)
3.5 数组拼接
python
复制代码
# Merging datasets
dataset_q1 = np.random.rand(1000, 10)
dataset_q2 = np.random.rand(1200, 10)
dataset_q3 = np.random.rand(1100, 10)
# Vertical concatenation (adds samples) -> (3300, 10)
full_dataset = np.concatenate([dataset_q1, dataset_q2, dataset_q3], axis=0)
# Horizontal concatenation (adds features) -> (1000, 8)
user_features = np.random.rand(1000, 5)
order_features = np.random.rand(1000, 3)
enriched = np.concatenate([user_features, order_features], axis=1)
# Stacking seven 1-D arrays into one 2-D array
daily_sales = [np.random.randint(100, 1000, 5) for _ in range(7)]
weekly_vstack = np.vstack(daily_sales) # (7, 5)
weekly_stack = np.stack(daily_sales, axis=0) # (7, 5)
3.6 数组分割
python
复制代码
# Train / validation / test split
dataset = np.random.rand(10000, 20)
train, val, test = np.split(dataset, [int(0.6 * len(dataset)), int(0.8 * len(dataset))]) # cut the rows at 60% and 80%
# K-fold cross-validation: five roughly equal row chunks
k_folds = np.array_split(dataset, 5)
# Horizontal split: hsplit cuts along columns
features = np.random.rand(1000, 10)
X_train, X_test = np.hsplit(features, [8]) # NOTE(review): yields the first 8 and the last 2 columns of every row, not a row-wise train/test split - confirm the naming
四、运算与广播
4.1 算术运算
python
复制代码
# Shopping-cart total: element-wise arithmetic on equally shaped arrays.
prices = np.array([59.9, 129.5, 35.0, 299.0, 89.9])
quantities = np.array([2, 1, 5, 1, 3])
discounts = np.array([0.9, 0.85, 0.95, 0.8, 0.9])
# Line totals before and after each item's discount factor.
subtotals = np.multiply(prices, quantities)
discounted = np.multiply(subtotals, discounts)
# Amount payable for the whole cart.
total = np.sum(discounted)
4.2 广播机制
python
复制代码
# Standardising exam scores (rows: students, columns: subjects)
np.random.seed(42)
scores = np.random.randint(50, 101, (5, 4))
# Mean of each subject (column) -> shape (4,)
subject_means = scores.mean(axis=0)
centered_scores = scores - subject_means # broadcasting: (5, 4) - (4,)
# Z-score standardisation
subject_stds = scores.std(axis=0)
z_scores = (scores - subject_means) / subject_stds # broadcasting; NOTE(review): a column with zero std would divide by zero
4.3 比较运算
python
复制代码
# Stock-level alerting: element-wise comparisons yield boolean masks.
current_stock = np.array([150, 23, 89, 5, 200, 12, 78])
safety_stock = np.array([50, 30, 40, 20, 60, 25, 35])
# Anything above 150% of the safety level counts as overstock.
_excess_threshold = safety_stock * 1.5
low_stock = np.less(current_stock, safety_stock)
excess_stock = np.greater(current_stock, _excess_threshold)
# Normal means neither too low nor too high (De Morgan form of the range test).
normal_stock = ~(low_stock | excess_stock)
4.4 通用函数 (ufunc)
python
复制代码
# Trigonometric functions (signal processing): convert degrees to radians first
angles = np.array([0, 30, 45, 60, 90])
radians = np.radians(angles)
sin_values = np.sin(radians)
# Compound interest: balance after each of years 1..10
principal = 10000
rate = 0.05
years = np.arange(1, 11)
amounts = principal * np.power(1 + rate, years)
# Logarithmic transforms (np.log is the natural logarithm)
data = np.array([1, 10, 100, 1000])
log_data = np.log(data)
log10_data = np.log10(data)
# Rounding
values = np.array([2.3, 3.7, -1.2])
np.round(values) # [2., 4., -1.]
np.ceil(values) # [3., 4., -1.]
np.floor(values) # [2., 3., -2.]
4.5 聚合函数
python
复制代码
# Sales data analysis (12 months x 5 products)
np.random.seed(42)
monthly_sales = np.random.randint(100, 1000, (12, 5))
# Aggregate over the whole array
monthly_sales.sum()
monthly_sales.mean()
monthly_sales.std()
# Per product: axis=0 collapses the 12 rows -> 5 values
monthly_sales.sum(axis=0)
monthly_sales.mean(axis=0)
# Per month: axis=1 collapses the 5 columns -> 12 values
monthly_sales.sum(axis=1)
五、数学统计函数
5.1 基础统计
python
复制代码
# Employee performance analysis
np.random.seed(42)
sales = np.random.normal(50000, 15000, (12, 10))
sales.sum()
sales.mean()
sales.std() # population standard deviation (ddof=0 by default)
sales.var() # population variance (ddof=0 by default)
np.median(sales)
5.2 百分位数
python
复制代码
# Salary distribution analysis: three groups of different size and pay level.
np.random.seed(123)
salaries = np.concatenate([
    np.random.normal(8000, 2000, 100),
    np.random.normal(20000, 5000, 20),
    np.random.normal(50000, 10000, 5),
])
percentiles = [10, 25, 50, 75, 90, 95, 99]
# A single vectorised call computes every requested percentile at once.
percentile_values = np.percentile(salaries, percentiles)
for p, value in zip(percentiles, percentile_values):
    print(f"P{p}: {value:.2f}")
# Interquartile range: spread of the middle 50% of the salaries.
q1, q3 = np.percentile(salaries, [25, 75])
iqr = q3 - q1
5.3 相关性与协方差
python
复制代码
# Which factors move together with the sales amount?
np.random.seed(42)
n_samples = 100
advertising = np.random.uniform(10, 100, n_samples)
promotion_days = np.random.randint(5, 30, n_samples)
sales_staff = np.random.randint(10, 50, n_samples)
# Sales depend on advertising and promotion days plus Gaussian noise
# (sales_staff deliberately has no influence).
_noise = np.random.normal(0, 10, n_samples)
sales_amount = advertising * 0.5 + promotion_days * 2 + _noise
data_matrix = np.column_stack([advertising, promotion_days, sales_staff, sales_amount])
# rowvar=False treats each column as one variable, so no transpose is needed.
correlation = np.corrcoef(data_matrix, rowvar=False)
5.4 直方图
python
复制代码
# Age distribution of users: a mixture of four normal distributions
np.random.seed(42)
user_ages = np.concatenate([
np.random.normal(22, 3, 200),
np.random.normal(35, 8, 500),
np.random.normal(50, 10, 200),
np.random.normal(65, 8, 100)
]).astype(int) # astype(int) drops the fractional part
age_bins = [18, 25, 35, 45, 55, 65, 80]
counts, bin_edges = np.histogram(user_ages, bins=age_bins) # ages outside [18, 80] are not counted
5.5 差分与累积
python
复制代码
# Stock return analysis over 252 simulated trading days
np.random.seed(42)
returns = np.random.normal(0.001, 0.02, 252)
prices = 100 * np.cumprod(1 + returns)
# Daily simple return: (p[t] - p[t-1]) / p[t-1]; one element shorter than prices
daily_returns = np.diff(prices) / prices[:-1]
# Cumulative return relative to the first price
cumulative_returns = np.cumprod(1 + daily_returns) - 1
# Cash flow: random sign times random magnitude, then the running total
net_flow = np.random.choice([-1, 1], 252) * np.random.uniform(100, 1000, 252)
cumulative_flow = np.cumsum(net_flow)
5.6 排序和排名
python
复制代码
# Leaderboard example
np.random.seed(42)
player_scores = np.random.randint(1000, 10000, 20)
# Indices that order the scores from highest to lowest.
sorted_indices = np.argsort(player_scores)[::-1]
# Rank of every player (1 = best): scatter 1..N back to the original positions.
# The length is derived from the data instead of being hard-coded.
ranks = np.empty_like(sorted_indices)
ranks[sorted_indices] = np.arange(1, player_scores.size + 1)
# Multi-key sort. np.lexsort treats the LAST key as the primary one, so this
# orders by primary_key first and breaks ties with secondary_key.
primary_key = np.random.randint(1, 6, player_scores.size)  # e.g. player level 1-5
secondary_key = player_scores
indices = np.lexsort((secondary_key, primary_key))
5.7 唯一值和计数
python
复制代码
# Counting products per category
categories = np.random.choice(['电子', '服装', '食品', '家居', '图书'], 1000)
unique_cats, counts = np.unique(categories, return_counts=True) # sorted unique values and how often each occurs
# Set intersection and union of two category lists
category_a = np.array(['电子', '服装', '食品', '家居'])
category_b = np.array(['电子', '美妆', '食品', '运动'])
np.intersect1d(category_a, category_b)
np.union1d(category_a, category_b)
六、线性代数
6.1 矩阵基础
python
复制代码
# Special matrices
I = np.eye(3) # identity matrix
D = np.diag([2, 3, 4]) # diagonal matrix built from a 1-D sequence
# Extracting diagonals from a 2-D array
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
np.diag(matrix) # main diagonal
np.diag(matrix, k=1) # first diagonal above the main one
np.diag(matrix, k=-1) # first diagonal below the main one
# Triangular matrices
np.triu(matrix) # upper triangle (entries below the diagonal zeroed)
np.tril(matrix) # lower triangle (entries above the diagonal zeroed)
6.2 矩阵运算
python
复制代码
# Forward pass of one neural-network layer
X = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
W = np.random.randn(3, 5)
b = np.random.randn(5)
Z = X @ W + b # (2, 3) @ (3, 5) -> (2, 5); b is broadcast across the rows
A = np.maximum(0, Z) # ReLU activation
6.3 矩阵分解
python
复制代码
# PCA (principal component analysis)
np.random.seed(42)
X_high_dim = np.random.randn(100, 10)
X_centered = X_high_dim - X_high_dim.mean(axis=0)
cov_matrix = (X_centered.T @ X_centered) / 99 # sample covariance: divide by n - 1 = 99
# Eigendecomposition (eigh is meant for symmetric matrices)
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
# Singular value decomposition (reduced form)
U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)
# QR decomposition
Q, R = np.linalg.qr(X_high_dim)
6.4 求解线性方程组
python
复制代码
# Solve the linear system Ax = b
A = np.array([[2, 1, -1], [-3, -1, 2], [-2, 1, 2]])
b = np.array([8, -11, -3])
# Method 1: solve (recommended)
x = np.linalg.solve(A, b)
# Method 2: via the explicit inverse (overwrites x from method 1)
A_inv = np.linalg.inv(A)
x = A_inv @ b
# Determinant
det = np.linalg.det(A)
6.5 范数计算
python
复制代码
# Error analysis
predictions = np.array([3.2, 4.5, 2.8, 5.1, 3.9])
true_values = np.array([3.0, 4.0, 3.0, 5.0, 4.0])
errors = predictions - true_values
l1_norm = np.linalg.norm(errors, ord=1) # sum of absolute errors = MAE * n
l2_norm = np.linalg.norm(errors, ord=2) # Euclidean norm = RMSE * sqrt(n)
inf_norm = np.linalg.norm(errors, ord=np.inf) # largest absolute error
七、随机数生成
7.1 创建随机数生成器
python
复制代码
# Recommended way to obtain random numbers: a Generator from default_rng
rng = np.random.default_rng(seed=42)
# Reproducibility check: two generators created with the same seed
rng1 = np.random.default_rng(seed=123)
rng2 = np.random.default_rng(seed=123)
7.2 基本随机数
python
复制代码
rng = np.random.default_rng(seed=42)
# Lucky-draw example
lucky_number = rng.integers(1, 101)  # one integer in [1, 100]
# Lottery numbers must be distinct, so sample WITHOUT replacement;
# rng.integers(..., size=6) could return the same number twice.
lottery_numbers = rng.choice(np.arange(1, 34), size=6, replace=False)
# Five distinct winners out of participants 1..100.
winners = rng.choice(np.arange(1, 101), size=5, replace=False)
# Weighted choice: p holds one probability per product and must sum to 1.
products = ['手机', '电脑', '平板', '耳机', '手表']
weights = [0.4, 0.3, 0.15, 0.1, 0.05]
selected = rng.choice(products, size=10, p=weights)
7.3 概率分布
python
复制代码
rng = np.random.default_rng(seed=42)
# Normal distribution: simulated height and weight
male_heights = rng.normal(170, 8, 1000)
male_weights = (male_heights - 105) + rng.normal(0, 5, 1000)
# Binomial distribution: ad clicks out of 1000 trials at a 5% click rate
clicks = rng.binomial(n=1000, p=0.05, size=100)
# Poisson distribution: support calls per hour (rate 10)
calls_per_hour = rng.poisson(lam=10, size=24)
# Exponential distribution: time between equipment failures (scale 100)
failure_intervals = rng.exponential(scale=100, size=50)
# Log-normal distribution: incomes; mean and sigma describe the underlying normal
incomes = rng.lognormal(mean=8.5, sigma=0.8, size=1000)
7.4 蒙特卡洛模拟
python
复制代码
# Investment risk analysis by Monte Carlo simulation.
rng = np.random.default_rng(seed=42)
initial_investment = 100000
expected_return = 0.08
volatility = 0.15
years = 20
n_simulations = 10000
# Draw every annual return in one call: one row per simulated path, one
# column per year.  This replaces 200,000 scalar draws in nested Python loops.
annual_returns = rng.normal(expected_return, volatility,
                            size=(n_simulations, years))
# Compound the yearly growth factors along each path.
final_values = initial_investment * np.prod(1 + annual_returns, axis=1)
print(f"平均终值: {final_values.mean():,.0f}")
print(f"亏损概率: {np.mean(final_values < initial_investment) * 100:.2f}%")
八、文件IO
8.1 npy/npz格式
python
复制代码
# Saving and restoring model weights
layer1_weights = np.random.randn(784, 256).astype(np.float32)
layer1_bias = np.zeros(256, dtype=np.float32)
# Save a single array (.npy)
np.save('layer1_weights.npy', layer1_weights)
# Load it back
loaded_weights = np.load('layer1_weights.npy')
# Save several named arrays into one archive (.npz)
np.savez('model.npz',
         w1=layer1_weights,
         b1=layer1_bias,
         model_name=np.array('NeuralNetwork_v1'))
# Same, with compression
np.savez_compressed('model_compressed.npz', w1=layer1_weights, b1=layer1_bias)
# np.load on an .npz returns a lazy NpzFile that keeps the archive open.
# The with-block closes the file handle; arrays read inside it stay valid.
with np.load('model.npz') as checkpoint:
    restored_weights = checkpoint['w1']
8.2 文本文件
python
复制代码
# Exporting sales records: one column per field named in the header below
np.random.seed(42)
n_records = 100
sales_data = np.column_stack([
np.arange(1, n_records + 1),
np.random.randint(1000, 9999, n_records),
np.random.choice([1, 2, 3, 4, 5], n_records),
np.random.randint(1, 10, n_records),
np.round(np.random.uniform(10, 1000, n_records), 2),
])
# Save as CSV; comments='' stops savetxt from prefixing the header line with '# '
header = "order_id,customer_id,category,quantity,amount"
np.savetxt('sales_data.csv', sales_data, delimiter=',',
header=header, comments='', fmt=['%d', '%d', '%d', '%d', '%.2f'])
# Load the CSV back, skipping the header row
loaded_data = np.loadtxt('sales_data.csv', delimiter=',', skiprows=1)
8.3 内存映射
python
复制代码
# Working with arrays that are too large to load into RAM.
# Create a disk-backed array (10000 x 10000 float32 is about 400 MB on disk).
fp = np.memmap('large_array.dat', dtype='float32',
               mode='w+', shape=(10000, 10000))
# Push pending writes to disk before the file is opened again.
fp.flush()
# Re-open the SAME file read-only.  A raw memmap file has no header, so dtype
# and shape must be repeated.  (For .npy files use
# np.load(path, mmap_mode='r') instead, which reads them from the header.)
mmap = np.memmap('large_array.dat', dtype='float32',
                 mode='r', shape=(10000, 10000))
# Only the slice that is actually accessed gets read from disk.
subset = mmap[100:200, :50]
九、高级索引
9.1 复杂条件筛选
python
复制代码
# Customer segmentation with combined boolean conditions
np.random.seed(42)
n_customers = 1000
ages = np.random.randint(18, 70, n_customers)
spending = np.random.lognormal(6, 1, n_customers)
visits = np.random.poisson(10, n_customers)
membership = np.random.choice([1, 2, 3, 4], n_customers)
# High-value customers: high spending, frequent visits, membership level 3 or above
high_value = (spending > 1000) & (visits > 15) & (membership >= 3)
# Potential customers: under 30, decent spending, few visits so far
potential = (ages < 30) & (spending > 500) & (visits < 10)
# Churn risk: over 50 with very few visits
churn_risk = (ages > 50) & (visits < 5)
9.2 np.where 和 np.select
python
复制代码
# Data cleaning
sensor_data = np.array([23.5, 24.1, 100.0, 23.8, -50.0])
# np.where(condition, a, b): keep readings inside [10, 40], replace the rest with NaN.
cleaned = np.where(
    (sensor_data >= 10) & (sensor_data <= 40),
    sensor_data,
    np.nan
)
# Letter grades via nested np.where (evaluated from the highest threshold down).
scores = np.array([55, 72, 88, 91, 67])
grades = np.where(scores >= 90, 'A',
         np.where(scores >= 80, 'B',
         np.where(scores >= 70, 'C',
         np.where(scores >= 60, 'D', 'F'))))
# np.select - customer value tiers.
# Sample inputs so the example runs on its own: an RFM score per customer and
# the number of days since the last purchase.
rfm_score = np.array([85, 65, 40, 90, 55])
recency = np.array([10, 45, 200, 90, 20])
# Conditions are checked in order; the first one that matches wins.
conditions = [
    (rfm_score >= 80) & (recency <= 30),
    (rfm_score >= 60) & (recency <= 60),
    recency > 180
]
choices = ['重要价值客户', '重要保持客户', '流失客户']
segments = np.select(conditions, choices, default='新客户')
9.3 查找位置
python
复制代码
# Anomaly detection
data_matrix = np.random.randn(10, 10)
data_matrix[2, 5] = 10 # inject two obvious outliers
data_matrix[7, 3] = -8
anomaly_mask = np.abs(data_matrix) > 3 # NOTE: unseeded normal samples may occasionally exceed 3 as well
anomaly_positions = np.argwhere(anomaly_mask) # one (row, col) pair per anomaly
# np.nonzero returns the same positions as separate row and column index arrays
rows, cols = np.nonzero(anomaly_mask)
9.4 掩码数组
python
复制代码
# Handling missing values
experiment_data = np.random.randn(5, 5)
experiment_data[1, 2] = np.nan
experiment_data[3, 4] = np.nan
# Build a masked array: NaN and inf entries are masked out
masked_data = np.ma.masked_invalid(experiment_data)
# Statistics skip the masked entries
masked_data.mean()
masked_data.std()
# Fill the missing values with the mean of the valid entries
filled = masked_data.filled(masked_data.mean())
十、性能优化
10.1 向量化操作
python
复制代码
import time

# Sample input so both variants below can run on their own.
arr = np.arange(10, dtype=np.float64)
# Avoid: element-by-element Python loop
result = []
for x in arr:
    result.append(x ** 2 + 2 * x + 1)
# Preferred: one vectorised expression over the whole array
result = arr ** 2 + 2 * arr + 1
# Benchmark.  perf_counter() is a monotonic high-resolution clock; time.time()
# can report 0 elapsed for the fast variant on coarse clocks, which would make
# the speed-up ratio below raise ZeroDivisionError.
n = 1000000
data = np.random.randn(n)
# Loop version
start = time.perf_counter()
result_loop = [x ** 2 + 2 * x + 1 for x in data]
time_loop = time.perf_counter() - start
# Vectorised version
start = time.perf_counter()
result_vectorized = data ** 2 + 2 * data + 1
time_vectorized = time.perf_counter() - start
print(f"加速比: {time_loop / time_vectorized:.1f}x")
10.2 内存优化
python
复制代码
# Sample inputs so the snippet runs on its own.
n = 1000
arr = np.arange(n, dtype=np.float64)
ids = [101, 102, 103]


def compute(i):
    """Return the value for position *i* (placeholder for real per-item work)."""
    return i * 0.5


# Pre-allocate the output once instead of growing it inside the loop.
result = np.empty(n)
for i in range(n):
    result[i] = compute(i)
# Pick the smallest dtype that fits the data.
image = np.zeros((1000, 1000), dtype=np.uint8)  # images: uint8
ids = np.array(ids, dtype=np.int32)  # IDs: int32
# Prefer views over copies when the data does not need to be independent.
view = arr[::2]  # view - shares memory with arr
copy = arr[::2].copy()  # copy - independent memory
# Check whether two arrays share memory.
np.shares_memory(arr, view)  # True
np.shares_memory(arr, copy)  # False
10.3 内存布局优化
python
复制代码
# C-order vs F-order memory layout
n = 5000
a_c = np.random.randn(n, n) # C-order (row-major); 5000 x 5000 float64 is about 200 MB
a_f = np.asfortranarray(a_c) # F-order (column-major) version of the same data
# Row-wise reduction: each row is contiguous in C-order
row_sum_c = a_c.sum(axis=1)
# Column-wise reduction: each column is contiguous in F-order
col_sum_f = a_f.sum(axis=0)
10.4 避免常见陷阱
python
复制代码
# Small sample input: np.append in a loop is quadratic, so it must never be
# run on a large array.
data = np.arange(1000, dtype=np.float64)
# Avoid: concatenating inside a loop
result = np.array([])
for x in data:
    result = np.append(result, x * 2)  # allocates a brand-new array every time - slow!
# Better: pre-allocate the output once
result = np.empty(len(data))
for i, x in enumerate(data):
    result[i] = x * 2
# Best: vectorise.  All three variants compute the same values (data * 2).
result = np.array(data) * 2
10.5 性能测试
python
复制代码
import time
import timeit


def func():
    """Example workload to be timed: sum the integers 0..999."""
    return np.arange(1000).sum()


# Simple timing.  perf_counter() is monotonic and high-resolution, which makes
# it the right clock for measuring intervals (time.time() can jump or be coarse).
start = time.perf_counter()
result = func()
elapsed = time.perf_counter() - start
# timeit runs the statement `number` times and returns the TOTAL seconds taken.
total_seconds = timeit.timeit('np.sum(arr)',
                              setup='import numpy as np; arr = np.arange(1000)',
                              number=1000)
常用代码片段
数据标准化
python
复制代码
def normalize(data, axis=0):
    """Standardise *data* to zero mean and unit standard deviation along *axis*.

    Args:
        data: Array-like of numbers.
        axis: Axis (or axes) along which mean and standard deviation are
            computed. Defaults to 0 (per column for 2-D input).

    Returns:
        A float array with the same shape as *data*. Slices whose standard
        deviation is zero (constant values) are mapped to zeros instead of
        NaN/inf.
    """
    data = np.asarray(data)
    mean = data.mean(axis=axis, keepdims=True)
    std = data.std(axis=axis, keepdims=True)
    # Constant slices have std == 0; divide by 1 there to avoid 0/0.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std
训练集/测试集划分
python
复制代码
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split features and labels into train and test subsets.

    Args:
        X: Array-like of samples, indexed along the first axis.
        y: Array-like of labels with the same length as *X*.
        test_size: Fraction of samples assigned to the test set, in [0, 1].
        random_state: Seed (or anything accepted by ``np.random.default_rng``)
            that makes the shuffle reproducible.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If *X* and *y* differ in length or *test_size* is outside
            [0, 1].
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n = len(X)
    if len(y) != n:
        raise ValueError(
            f"X and y must have the same length, got {n} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}")
    rng = np.random.default_rng(random_state)
    indices = rng.permutation(n)
    # The first part of the shuffled indices is the training set, the rest is test.
    split_idx = int(n * (1 - test_size))
    train_idx, test_idx = indices[:split_idx], indices[split_idx:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]
批量处理
python
复制代码
def batch_generator(data, batch_size):
    """Yield consecutive slices of *data* holding at most *batch_size* items.

    Args:
        data: Any sliceable sequence that supports ``len()``.
        batch_size: Positive number of items per batch. The last batch may be
            shorter.

    Yields:
        ``data[i:i + batch_size]`` for ``i = 0, batch_size, 2 * batch_size, ...``

    Raises:
        ValueError: If *batch_size* is not positive. As this is a generator,
            the error is raised when iteration starts.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently yield nothing.
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]
滑动窗口
python
复制代码
def sliding_window(arr, window_size):
    """Return every length-*window_size* window of a 1-D array as a 2-D view.

    Args:
        arr: One-dimensional array-like.
        window_size: Window length, ``1 <= window_size <= arr.size``.

    Returns:
        Array of shape ``(arr.size - window_size + 1, window_size)`` whose row
        ``i`` is ``arr[i:i + window_size]``. The rows overlap in memory, so
        treat the result as read-only: writing to one element changes several
        windows and the source array.

    Raises:
        ValueError: If *arr* is not one-dimensional or *window_size* is out of
            range.
    """
    arr = np.asarray(arr)
    # as_strided performs no bounds checking; with any other rank the
    # shape/stride arithmetic below would read memory outside the buffer.
    if arr.ndim != 1:
        raise ValueError(f"arr must be 1-dimensional, got {arr.ndim} dimensions")
    if not 1 <= window_size <= arr.size:
        raise ValueError(
            f"window_size must be between 1 and {arr.size}, got {window_size}"
        )
    n_windows = arr.size - window_size + 1
    step = arr.strides[0]
    # Moving one row down and one column right both advance by one element.
    return np.lib.stride_tricks.as_strided(
        arr, shape=(n_windows, window_size), strides=(step, step)
    )
Min-Max归一化
python
复制代码
def min_max_scale(data, axis=0):
    """Rescale *data* linearly to the range [0, 1] along *axis*.

    Args:
        data: Array-like of numbers.
        axis: Axis (or axes) along which minimum and maximum are taken.
            Defaults to 0 (per column for 2-D input).

    Returns:
        A float array with the same shape as *data*. Slices whose minimum
        equals their maximum (constant values) are mapped to zeros instead of
        NaN.
    """
    data = np.asarray(data)
    min_val = data.min(axis=axis, keepdims=True)
    max_val = data.max(axis=axis, keepdims=True)
    span = max_val - min_val
    # Constant slices have a zero range; divide by 1 there to avoid 0/0.
    safe_span = np.where(span == 0, 1, span)
    return (data - min_val) / safe_span
Z-Score标准化
python
复制代码
def z_score(data, axis=0):
    """Return the z-scores of *data* along *axis*.

    Each value is expressed as the number of (population) standard deviations
    it lies away from the mean of its slice.

    Args:
        data: Array-like of numbers.
        axis: Axis (or axes) along which mean and standard deviation are
            computed. Defaults to 0 (per column for 2-D input).

    Returns:
        A float array with the same shape as *data*. Slices with zero
        standard deviation (constant values) yield zeros instead of NaN/inf.
    """
    data = np.asarray(data)
    mean = data.mean(axis=axis, keepdims=True)
    std = data.std(axis=axis, keepdims=True)
    # Constant slices have std == 0; divide by 1 there to avoid 0/0.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std
学习资源
本笔记整合了 numpy_tutorial 目录下 10 章教程代码的实际场景案例