NumPy 完整学习笔记

人群	原因
有Python基础	快速查阅NumPy函数用法
需要速查手册	表格形式，一目了然
复习巩固	已学过，需要快速回顾

一、数组创建与基础

1.1 从列表创建数组

python 复制代码

import numpy as np

# 一维数组 - 水果库存
fruit_inventory = [150, 230, 89, 176, 210]
fruit_array = np.array(fruit_inventory)

# 二维数组 - 多分店库存
branches_inventory = [
    [150, 230, 89],   # 分店1：苹果、香蕉、橙子
    [120, 180, 95],   # 分店2
    [200, 150, 110]   # 分店3
]
branches_array = np.array(branches_inventory)
print(branches_array.shape)  # (3, 3)

1.2 内置创建函数

函数	说明	示例	实际场景
`np.zeros()`	全0数组	`np.zeros((5, 7))`	员工考勤表初始化
`np.ones()`	全1数组	`np.ones(7) * 8`	每日8小时工作制
`np.full()`	填充指定值	`np.full((5, 7), 2.0)`	加班时长矩阵
`np.eye()`	单位矩阵	`np.eye(5)`	员工权限矩阵
`np.arange()`	等差序列	`np.arange(570, 901, 30)`	股票交易时间点（自零点起的分钟数，9:30–15:00，每30分钟）
`np.linspace()`	等分数列	`np.linspace(100, 110, 13)`	价格区间划分

1.3 随机数组创建

python 复制代码

np.random.seed(42)  # 固定随机种子，确保结果可复现

# 电商用户行为模拟
browsing_time = np.random.random(1000)  # 用户浏览时长（0-1小时）
purchase_quantity = np.random.randint(1, 11, 1000)  # 购买数量1-10件
consumption = np.random.normal(500, 100, 1000)  # 消费金额，均值500，标准差100

# 使用 Generator API（NumPy 1.17 起引入，新代码推荐使用）
rng = np.random.default_rng(seed=42)
rng.random(10)           # [0,1)均匀分布
rng.integers(1, 100, 10) # 随机整数
rng.normal(0, 1, 100)    # 正态分布

1.4 数组属性

python 复制代码

# Parcel-tracking system - 3D array of tracking numbers.
# dtype is pinned to int64 so the dtype/itemsize/nbytes figures below hold on
# every platform (the default integer of the legacy API is platform-dependent).
# The upper bound of randint is exclusive, so 10000 yields IDs 1000..9999.
packages = np.random.randint(1000, 10000, (3, 5, 10), dtype=np.int64)

print("数组形状 (shape):", packages.shape)      # (3, 5, 10) - 3 warehouses x 5 shelves x 10 parcels
print("数组维度数 (ndim):", packages.ndim)       # 3
print("数组元素总数 (size):", packages.size)     # 150 = 3 * 5 * 10
print("数组元素类型 (dtype):", packages.dtype)   # int64
print("每个元素字节数 (itemsize):", packages.itemsize)  # 8 bytes per int64
print("数组总字节数 (nbytes):", packages.nbytes)  # 1200 = size * itemsize

1.5 数据类型

类型	说明	范围	应用场景
`int8`	8位整数	-128 ~ 127	小范围计数
`int16`	16位整数	-32768 ~ 32767	传感器数据
`int32`	32位整数	-2e9 ~ 2e9	通用整数
`int64`	64位整数	-9e18 ~ 9e18	大整数ID
`float32`	单精度浮点	~7位小数	图像像素值
`float64`	双精度浮点	~15位小数	财务计算
`bool`	布尔型	True/False	掩码数组

python 复制代码

# Financial precision control: store money as an integer number of cents.
transactions = np.array([1234.5678, 9876.5432])
# Round BEFORE casting: astype() truncates toward zero, so 123456.78 would
# become 123456 and float noise such as 28.999999 would lose a whole cent.
# int64 is used because int32 overflows above roughly 21 million currency units.
transactions_cents = np.round(transactions * 100).astype(np.int64)

# Image data: 8-bit unsigned integers cover the 0..255 range of an RGB channel.
image = np.zeros((1080, 1920, 3), dtype=np.uint8)

二、索引与切片

2.1 基础索引

python 复制代码

# 超市商品价格查询
prices = np.array([5.5, 3.2, 8.9, 12.5, 6.8, 4.5, 9.9])
products = ['苹果', '香蕉', '橙子', '葡萄', '西瓜', '梨', '桃子']

prices[0]    # 5.5 - 第一个商品价格
prices[-1]   # 9.9 - 最后一个商品价格
prices[3]    # 12.5 - 中间商品价格

2.2 二维数组索引

python 复制代码

# 电影院座位预订系统
np.random.seed(42)
seats = np.random.choice([0, 1], size=(10, 15), p=[0.7, 0.3])
seats[2:5, 4:10] = 2  # VIP区域

# 0=空位, 1=已预订, 2=VIP座位
row, col = 2, 5
status = seats[row, col]

seats[2, :]    # 第3排所有座位
seats[:, 5]    # 第6列所有座位

2.3 切片语法

python 复制代码

# Stock data analysis: 30 days of prices built from a seeded random walk.
np.random.seed(123)
stock_prices = 100 + np.cumsum(np.random.normal(0, 2, 30))
stock_prices = np.round(stock_prices, 2)

stock_prices[:5]      # first 5 days
stock_prices[-5:]     # last 5 days
stock_prices[9:15]    # days 10 through 15 (indices 9..14; the stop index is exclusive)
stock_prices[::5]     # every 5th day, starting with day 1
stock_prices[::-1]    # reversed order

# Weekly percentage change: sample one price per 7 days, then divide each
# week-over-week difference by the price at the start of that week.
# (np.diff alone would give the absolute change, not a rate.)
weekly_prices = stock_prices[::7]
weekly_changes = np.diff(weekly_prices) / weekly_prices[:-1]

2.4 多维切片 - 图像处理

python 复制代码

# 图像区域提取
image = np.random.randint(0, 256, (1080, 1920, 3), dtype=np.uint8)

# 提取中心区域（人脸识别区域）
h_start, w_start = (1080 - 400) // 2, (1920 - 400) // 2
center_crop = image[h_start:h_start+400, w_start:w_start+400]

# 提取上半部分
upper_half = image[:540, :]

# 提取红色通道
red_channel = image[:, :, 0]

# 分辨率减半（每隔一个像素取样）
thumbnail = image[::2, ::2]

2.5 布尔索引

python 复制代码

# 学生成绩筛选
np.random.seed(42)
scores = np.random.randint(40, 101, 50)

# 筛选不及格学生
failing = scores < 60
failing_students = np.where(failing)[0]

# 筛选优秀学生
excellent = scores >= 90

# 多条件筛选
medium = (scores >= 60) & (scores < 80)
passing = scores[(scores >= 60) & (scores < 90)]

2.6 花式索引

python 复制代码

# Batch order lookup
order_amounts = np.random.randint(100, 10001, 100)
order_status = np.random.choice([0, 1, 2, 3], 100)

# Look up specific orders: indexing with a list of positions returns a copy
# containing exactly those elements, in the order requested.
query_indices = [0, 5, 10, 15, 20]
selected_amounts = order_amounts[query_indices]

# Sample the sales figure of every 7th day
sales_data = np.random.randint(1000, 5000, 30)
days_to_query = [0, 7, 14, 21, 28]
weekly_sales = sales_data[days_to_query]

# 2-D fancy indexing: np.ix_ builds an open mesh so that every selected row is
# combined with every selected column in a single indexing step. This avoids
# the intermediate copy made by the chained form matrix[rows][:, cols].
# Note: matrix[rows, cols] would instead pair the lists element-wise.
matrix = np.arange(25).reshape(5, 5)
rows = [0, 2, 4]
cols = [2, 3, 4]
selected = matrix[np.ix_(rows, cols)]

2.7 赋值与修改

python 复制代码

# 库存管理系统
np.random.seed(1)
inventory = np.random.randint(0, 100, (5, 10))

# 修改单个值
inventory[0, 0] += 50  # 仓库1手机进货50台

# 修改整行
inventory[2, :] = 0  # 仓库3盘点清零

# 修改整列
inventory[:, 3] += 20  # 所有仓库耳机增加20个

# 修改子区域
inventory[:3, :5] = (inventory[:3, :5] * 0.8).astype(int)  # 促销折扣

三、形状操作

3.1 reshape

python 复制代码

# 批量图像数据预处理
raw_data = np.random.randint(0, 256, 100 * 28 * 28)

# 重塑为 (样本数, 高度, 宽度)
images = raw_data.reshape(100, 28, 28)

# 添加通道维度（用于CNN）
images_with_channel = images.reshape(100, 28, 28, 1)

# 自动计算维度
flattened = images.reshape(100, -1)  # (100, 784)

3.2 ravel 和 flatten

python 复制代码

# 特征工程数据准备
sensor_data = np.random.rand(5, 4, 3)

# ravel - 返回视图（共享内存）
raveled = sensor_data.ravel()
print(np.shares_memory(sensor_data, raveled))  # True

# flatten - 返回副本（独立内存）
flattened = sensor_data.flatten()
print(np.shares_memory(sensor_data, flattened))  # False

3.3 转置与轴交换

python 复制代码

# 图像格式转换 (HWC -> CHW)
image_hwc = np.random.randint(0, 256, (1080, 1920, 3), dtype=np.uint8)
image_chw = image_hwc.transpose(2, 0, 1)  # PyTorch格式

# 批量图像 (B, H, W, C) -> (B, C, H, W)
batch_images = np.random.rand(32, 224, 224, 3)
batch_nchw = batch_images.transpose(0, 3, 1, 2)

# swapaxes
matrix = np.arange(24).reshape(2, 3, 4)
swapped = matrix.swapaxes(0, 2)

3.4 维度增减

python 复制代码

# 机器学习数据维度对齐
gray_image = np.random.rand(28, 28)

# 增加批次维度 -> (1, H, W)
image_with_batch = np.expand_dims(gray_image, axis=0)

# 增加通道维度 -> (H, W, 1)
image_with_channel = np.expand_dims(gray_image, axis=-1)

# 同时增加多个维度
image_full = gray_image[np.newaxis, ..., np.newaxis]  # (1, 28, 28, 1)

# squeeze - 移除单维度
squeezed = image_full.squeeze()  # (28, 28)

3.5 数组拼接

python 复制代码

# 数据集合并
dataset_q1 = np.random.rand(1000, 10)
dataset_q2 = np.random.rand(1200, 10)
dataset_q3 = np.random.rand(1100, 10)

# 垂直拼接（增加样本）
full_dataset = np.concatenate([dataset_q1, dataset_q2, dataset_q3], axis=0)

# 水平拼接（增加特征）
user_features = np.random.rand(1000, 5)
order_features = np.random.rand(1000, 3)
enriched = np.concatenate([user_features, order_features], axis=1)

# 堆叠
daily_sales = [np.random.randint(100, 1000, 5) for _ in range(7)]
weekly_vstack = np.vstack(daily_sales)  # (7, 5)
weekly_stack = np.stack(daily_sales, axis=0)  # (7, 5)

3.6 数组分割

python 复制代码

# Train / validation / test split (60% / 20% / 20%)
dataset = np.random.rand(10000, 20)

n_rows = len(dataset)
# np.split cuts along axis 0 (rows) at the given indices.
train, val, test = np.split(dataset, [int(0.6 * n_rows), int(0.8 * n_rows)])

# K-fold cross-validation: five consecutive row chunks.
k_folds = np.array_split(dataset, 5)

# Horizontal split: hsplit cuts COLUMNS, so this separates two feature groups
# (the first 8 features and the remaining 2). Every sample appears in both
# results, so this is not a train/test split and is named accordingly.
features = np.random.rand(1000, 10)
features_first8, features_last2 = np.hsplit(features, [8])

四、运算与广播

4.1 算术运算

python 复制代码

# 购物车价格计算
prices = np.array([59.9, 129.5, 35.0, 299.0, 89.9])
quantities = np.array([2, 1, 5, 1, 3])
discounts = np.array([0.9, 0.85, 0.95, 0.8, 0.9])

subtotals = prices * quantities
discounted = subtotals * discounts
total = discounted.sum()

4.2 广播机制

python 复制代码

# 成绩标准化
np.random.seed(42)
scores = np.random.randint(50, 101, (5, 4))

# 计算每科平均分
subject_means = scores.mean(axis=0)
centered_scores = scores - subject_means  # 广播

# 标准化
subject_stds = scores.std(axis=0)
z_scores = (scores - subject_means) / subject_stds  # 广播

4.3 比较运算

python 复制代码

# 库存预警系统
current_stock = np.array([150, 23, 89, 5, 200, 12, 78])
safety_stock = np.array([50, 30, 40, 20, 60, 25, 35])

low_stock = current_stock < safety_stock
excess_stock = current_stock > safety_stock * 1.5
normal_stock = (current_stock >= safety_stock) & (current_stock <= safety_stock * 1.5)

4.4 通用函数 (ufunc)

python 复制代码

# 三角函数 - 信号处理
angles = np.array([0, 30, 45, 60, 90])
radians = np.radians(angles)
sin_values = np.sin(radians)

# 复利计算
principal = 10000
rate = 0.05
years = np.arange(1, 11)
amounts = principal * np.power(1 + rate, years)

# 对数变换
data = np.array([1, 10, 100, 1000])
log_data = np.log(data)
log10_data = np.log10(data)

# 取整
values = np.array([2.3, 3.7, -1.2])
np.round(values)   # [2., 4., -1.]
np.ceil(values)    # [3., 4., -1.]
np.floor(values)   # [2., 3., -2.]

4.5 聚合函数

python 复制代码

# 销售数据分析
np.random.seed(42)
monthly_sales = np.random.randint(100, 1000, (12, 5))

# 全局聚合
monthly_sales.sum()
monthly_sales.mean()
monthly_sales.std()

# 按产品聚合（按列）
monthly_sales.sum(axis=0)
monthly_sales.mean(axis=0)

# 按月聚合（按行）
monthly_sales.sum(axis=1)

五、数学统计函数

5.1 基础统计

python 复制代码

# 员工绩效分析
np.random.seed(42)
sales = np.random.normal(50000, 15000, (12, 10))

sales.sum()
sales.mean()
sales.std()
sales.var()
np.median(sales)

5.2 百分位数

python 复制代码

# 薪资水平分析
np.random.seed(123)
salaries = np.concatenate([
    np.random.normal(8000, 2000, 100),
    np.random.normal(20000, 5000, 20),
    np.random.normal(50000, 10000, 5)
])

percentiles = [10, 25, 50, 75, 90, 95, 99]
for p in percentiles:
    print(f"P{p}: {np.percentile(salaries, p):.2f}")

# 四分位距
q1 = np.percentile(salaries, 25)
q3 = np.percentile(salaries, 75)
iqr = q3 - q1

5.3 相关性与协方差

python 复制代码

# 销售因素分析
np.random.seed(42)
n_samples = 100

advertising = np.random.uniform(10, 100, n_samples)
promotion_days = np.random.randint(5, 30, n_samples)
sales_staff = np.random.randint(10, 50, n_samples)
sales_amount = advertising * 0.5 + promotion_days * 2 + np.random.normal(0, 10, n_samples)

data_matrix = np.column_stack([advertising, promotion_days, sales_staff, sales_amount])
correlation = np.corrcoef(data_matrix.T)

5.4 直方图

python 复制代码

# 用户年龄分布
np.random.seed(42)
user_ages = np.concatenate([
    np.random.normal(22, 3, 200),
    np.random.normal(35, 8, 500),
    np.random.normal(50, 10, 200),
    np.random.normal(65, 8, 100)
]).astype(int)

age_bins = [18, 25, 35, 45, 55, 65, 80]
counts, bin_edges = np.histogram(user_ages, bins=age_bins)

5.5 差分与累积

python 复制代码

# 股票收益分析
np.random.seed(42)
returns = np.random.normal(0.001, 0.02, 252)
prices = 100 * np.cumprod(1 + returns)

# 日收益率
daily_returns = np.diff(prices) / prices[:-1]

# 累积收益
cumulative_returns = np.cumprod(1 + daily_returns) - 1

# 资金流向
net_flow = np.random.choice([-1, 1], 252) * np.random.uniform(100, 1000, 252)
cumulative_flow = np.cumsum(net_flow)

5.6 排序和排名

python 复制代码

# Leaderboard system
np.random.seed(42)
player_scores = np.random.randint(1000, 10000, 20)

# Indices that order the scores from highest to lowest.
sorted_indices = np.argsort(player_scores)[::-1]

# Rank of each player (1 = best): scatter 1..N back to the original positions.
# The length is taken from the data instead of a hard-coded player count.
ranks = np.empty_like(sorted_indices)
ranks[sorted_indices] = np.arange(1, len(player_scores) + 1)

# Multi-key sort. np.lexsort sorts by the LAST key first, so the primary key
# goes last in the tuple; ties in the primary key are broken by the secondary.
primary_key = np.random.randint(1, 6, len(player_scores))  # e.g. player level
secondary_key = player_scores                              # tie-breaker
indices = np.lexsort((secondary_key, primary_key))

5.7 唯一值和计数

python 复制代码

# 商品分类统计
categories = np.random.choice(['电子', '服装', '食品', '家居', '图书'], 1000)

unique_cats, counts = np.unique(categories, return_counts=True)

# 交集和并集
category_a = np.array(['电子', '服装', '食品', '家居'])
category_b = np.array(['电子', '美妆', '食品', '运动'])
np.intersect1d(category_a, category_b)
np.union1d(category_a, category_b)

六、线性代数

6.1 矩阵基础

python 复制代码

# 特殊矩阵
I = np.eye(3)                    # 单位矩阵
D = np.diag([2, 3, 4])          # 对角矩阵

# 提取对角线
matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
np.diag(matrix)      # 主对角线
np.diag(matrix, k=1) # 上对角线
np.diag(matrix, k=-1) # 下对角线

# 三角矩阵
np.triu(matrix)  # 上三角
np.tril(matrix)  # 下三角

6.2 矩阵运算

python 复制代码

# 神经网络前向传播
X = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
W = np.random.randn(3, 5)
b = np.random.randn(5)

Z = X @ W + b
A = np.maximum(0, Z)  # ReLU激活

6.3 矩阵分解

python 复制代码

# PCA主成分分析
np.random.seed(42)
X_high_dim = np.random.randn(100, 10)
X_centered = X_high_dim - X_high_dim.mean(axis=0)
cov_matrix = (X_centered.T @ X_centered) / 99

# 特征值分解
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

# SVD分解
U, S, Vt = np.linalg.svd(X_centered, full_matrices=False)

# QR分解
Q, R = np.linalg.qr(X_high_dim)

6.4 求解线性方程组

python 复制代码

# Ax = b
A = np.array([[2, 1, -1], [-3, -1, 2], [-2, 1, 2]])
b = np.array([8, -11, -3])

# 方法1: solve (推荐)
x = np.linalg.solve(A, b)

# 方法2: 逆矩阵
A_inv = np.linalg.inv(A)
x = A_inv @ b

# 行列式
det = np.linalg.det(A)

6.5 范数计算

python 复制代码

# 误差分析
predictions = np.array([3.2, 4.5, 2.8, 5.1, 3.9])
true_values = np.array([3.0, 4.0, 3.0, 5.0, 4.0])
errors = predictions - true_values

l1_norm = np.linalg.norm(errors, ord=1)      # MAE * n
l2_norm = np.linalg.norm(errors, ord=2)      # RMSE * sqrt(n)
inf_norm = np.linalg.norm(errors, ord=np.inf) # 最大误差

七、随机数生成

7.1 创建随机数生成器

python 复制代码

# 推荐方式 (NumPy 1.17 起可用，2.x 同样适用)
rng = np.random.default_rng(seed=42)

# 可重复性验证
rng1 = np.random.default_rng(seed=123)
rng2 = np.random.default_rng(seed=123)

7.2 基本随机数

python 复制代码

rng = np.random.default_rng(seed=42)

# Lottery system
lucky_number = rng.integers(1, 101)  # single lucky number in 1..100 (upper bound exclusive)
# Lottery numbers must be distinct, so draw WITHOUT replacement;
# rng.integers(1, 34, size=6) could return the same number twice.
lottery_numbers = rng.choice(np.arange(1, 34), size=6, replace=False)
winners = rng.choice(range(1, 101), size=5, replace=False)  # five distinct winners

# Weighted selection: p gives the probability of each product and must sum to 1.
products = ['手机', '电脑', '平板', '耳机', '手表']
weights = [0.4, 0.3, 0.15, 0.1, 0.05]
selected = rng.choice(products, size=10, p=weights)

7.3 概率分布

python 复制代码

rng = np.random.default_rng(seed=42)

# 正态分布 - 身高体重模拟
male_heights = rng.normal(170, 8, 1000)
male_weights = (male_heights - 105) + rng.normal(0, 5, 1000)

# 二项分布 - 广告点击率
clicks = rng.binomial(n=1000, p=0.05, size=100)

# 泊松分布 - 客服来电
calls_per_hour = rng.poisson(lam=10, size=24)

# 指数分布 - 设备故障间隔
failure_intervals = rng.exponential(scale=100, size=50)

# 对数正态分布 - 收入分布
incomes = rng.lognormal(mean=8.5, sigma=0.8, size=1000)

7.4 蒙特卡洛模拟

python 复制代码

# Investment risk analysis via Monte Carlo simulation
rng = np.random.default_rng(seed=42)

initial_investment = 100000
expected_return = 0.08
volatility = 0.15
years = 20
n_simulations = 10000

# Draw every annual return in one call: one row per simulated path, one column
# per year. This replaces a nested Python loop of n_simulations * years scalar
# draws, in line with the vectorization advice in the performance section.
annual_returns = rng.normal(expected_return, volatility, size=(n_simulations, years))

# Compound each path: final value = initial * product over years of (1 + r).
final_values = initial_investment * np.prod(1 + annual_returns, axis=1)

print(f"平均终值: {final_values.mean():,.0f}")
print(f"亏损概率: {np.mean(final_values < initial_investment) * 100:.2f}%")

八、文件IO

8.1 npy/npz格式

python 复制代码

# Saving model weights
layer1_weights = np.random.randn(784, 256).astype(np.float32)
layer1_bias = np.zeros(256, dtype=np.float32)

# Save a single array (.npy)
np.save('layer1_weights.npy', layer1_weights)

# Load it back
loaded_weights = np.load('layer1_weights.npy')

# Save several arrays into one archive (.npz); keyword names become the keys.
np.savez('model.npz',
         w1=layer1_weights,
         b1=layer1_bias,
         model_name=np.array('NeuralNetwork_v1'))

# Same, but compressed
np.savez_compressed('model_compressed.npz', w1=layer1_weights, b1=layer1_bias)

# Load from the archive. np.load on an .npz returns a lazy NpzFile that keeps
# the underlying file open, so use it as a context manager to guarantee the
# handle is closed; arrays are read when a key is accessed.
with np.load('model.npz') as checkpoint:
    restored_weights = checkpoint['w1']

8.2 文本文件

python 复制代码

# 销售数据导出
np.random.seed(42)
n_records = 100
sales_data = np.column_stack([
    np.arange(1, n_records + 1),
    np.random.randint(1000, 9999, n_records),
    np.random.choice([1, 2, 3, 4, 5], n_records),
    np.random.randint(1, 10, n_records),
    np.round(np.random.uniform(10, 1000, n_records), 2),
])

# 保存CSV
header = "order_id,customer_id,category,quantity,amount"
np.savetxt('sales_data.csv', sales_data, delimiter=',',
           header=header, comments='', fmt=['%d', '%d', '%d', '%d', '%.2f'])

# 加载CSV
loaded_data = np.loadtxt('sales_data.csv', delimiter=',', skiprows=1)

8.3 内存映射

python 复制代码

# Working with large files
# Create a raw memory-mapped file (mode 'w+' creates or overwrites it).
fp = np.memmap('large_array.dat', dtype='float32',
               mode='w+', shape=(10000, 10000))
fp[0, :5] = [1, 2, 3, 4, 5]  # writes go to the mapping ...
fp.flush()                   # ... and are pushed to disk by flush()
del fp                       # drop the reference so the mapping can be released

# Memory-map an existing .npy file. The file is written first so that this
# snippet is runnable on its own (loading a file that was never saved raises
# FileNotFoundError).
np.save('big_data.npy', np.random.rand(1000, 100))
mmap = np.load('big_data.npy', mmap_mode='r')

# Only the requested part is read from disk
subset = mmap[100:200, :50]

九、高级索引

9.1 复杂条件筛选

python 复制代码

# 客户分群
np.random.seed(42)
n_customers = 1000

ages = np.random.randint(18, 70, n_customers)
spending = np.random.lognormal(6, 1, n_customers)
visits = np.random.poisson(10, n_customers)
membership = np.random.choice([1, 2, 3, 4], n_customers)

# 高价值客户
high_value = (spending > 1000) & (visits > 15) & (membership >= 3)

# 潜力客户
potential = (ages < 30) & (spending > 500) & (visits < 10)

# 流失风险
churn_risk = (ages > 50) & (visits < 5)

9.2 np.where 和 np.select

python 复制代码

# Data cleaning
sensor_data = np.array([23.5, 24.1, 100.0, 23.8, -50.0])

# np.where(condition, x, y): keep readings inside [10, 40], replace the rest with NaN.
cleaned = np.where(
    (sensor_data >= 10) & (sensor_data <= 40),
    sensor_data,
    np.nan
)

# Letter grades via nested np.where (evaluated from the highest threshold down).
scores = np.array([55, 72, 88, 91, 67])
grades = np.where(scores >= 90, 'A',
         np.where(scores >= 80, 'B',
         np.where(scores >= 70, 'C',
         np.where(scores >= 60, 'D', 'F'))))

# np.select - customer value segmentation.
# Sample inputs so the example runs on its own (one entry per customer).
rfm_score = np.array([85, 65, 40, 90, 30])   # combined RFM score
recency = np.array([10, 45, 200, 90, 100])   # days since last purchase

# Conditions are checked in order; the first one that matches wins, and
# customers matching none of them receive the default label.
conditions = [
    (rfm_score >= 80) & (recency <= 30),
    (rfm_score >= 60) & (recency <= 60),
    recency > 180
]
choices = ['重要价值客户', '重要保持客户', '流失客户']
segments = np.select(conditions, choices, default='新客户')

9.3 查找位置

python 复制代码

# 异常检测
data_matrix = np.random.randn(10, 10)
data_matrix[2, 5] = 10
data_matrix[7, 3] = -8

anomaly_mask = np.abs(data_matrix) > 3
anomaly_positions = np.argwhere(anomaly_mask)

# np.nonzero
rows, cols = np.nonzero(anomaly_mask)

9.4 掩码数组

python 复制代码

# 缺失值处理
experiment_data = np.random.randn(5, 5)
experiment_data[1, 2] = np.nan
experiment_data[3, 4] = np.nan

# 创建掩码数组
masked_data = np.ma.masked_invalid(experiment_data)

# 统计计算
masked_data.mean()
masked_data.std()

# 填充缺失值
filled = masked_data.filled(masked_data.mean())

十、性能优化

10.1 向量化操作

python 复制代码

# Sample input so the comparison below is runnable on its own.
arr = np.arange(10)

# ❌ Avoid: element-by-element Python loop
result = []
for x in arr:
    result.append(x ** 2 + 2 * x + 1)

# ✅ Recommended: one vectorized expression over the whole array
result = arr ** 2 + 2 * arr + 1

# Performance comparison
import time

n = 1000000
data = np.random.randn(n)

# Loop version.
# time.perf_counter() is the clock intended for measuring short intervals;
# time.time() can have coarse resolution, which may report a zero duration
# for the fast vectorized run and make the ratio below divide by zero.
start = time.perf_counter()
result_loop = [x ** 2 + 2 * x + 1 for x in data]
time_loop = time.perf_counter() - start

# Vectorized version
start = time.perf_counter()
result_vectorized = data ** 2 + 2 * data + 1
time_vectorized = time.perf_counter() - start

print(f"加速比: {time_loop / time_vectorized:.1f}x")

10.2 内存优化

python 复制代码

# Sample inputs so the snippet is runnable on its own.
n = 1000
arr = np.arange(n, dtype=np.float64)
raw_ids = [1001, 1002, 1003]


def compute(i):
    """Stand-in for an arbitrary per-element computation."""
    return i * 0.5


# Preallocate: np.empty reserves the memory once (contents are uninitialized),
# then every slot is filled in place.
result = np.empty(n)
for i in range(n):
    result[i] = compute(i)

# Pick the smallest dtype that fits the data
image = np.zeros((1000, 1000), dtype=np.uint8)  # 8-bit image: 1 byte per pixel
ids = np.array(raw_ids, dtype=np.int32)          # IDs as 32-bit integers

# Prefer views over copies
view = arr[::2]           # basic slicing returns a view - shares memory with arr
copy = arr[::2].copy()    # explicit copy - independent memory

# Check whether two arrays share memory
np.shares_memory(arr, view)  # True
np.shares_memory(arr, copy)  # False

10.3 内存布局优化

python 复制代码

# C-order vs F-order
n = 5000
a_c = np.random.randn(n, n)       # C-order (行优先)
a_f = np.asfortranarray(a_c)      # F-order (列优先)

# 行操作 - C-order更快
row_sum_c = a_c.sum(axis=1)

# 列操作 - F-order更快
col_sum_f = a_f.sum(axis=0)

10.4 避免常见陷阱

python 复制代码

# Small sample input; the np.append variant is quadratic, so it is only
# reasonable to demonstrate on a short sequence.
data = [1.5, 2.0, 3.5, 4.0]

# All three variants below compute the same thing: every element doubled.

# ❌ Avoid: concatenating inside a loop
result = np.array([])
for x in data:
    result = np.append(result, x * 2)  # allocates a brand-new array on every call - slow!

# ✅ Recommended: preallocate, then fill in place
result = np.empty(len(data))
for i, x in enumerate(data):
    result[i] = x * 2

# ✅ Better still: vectorize
result = np.array(data) * 2

10.5 性能测试

python 复制代码

import time


def func():
    """Sample workload to time; replace with the code under test."""
    return np.sum(np.arange(1000))


# Simple timing. perf_counter() is the clock intended for measuring elapsed
# intervals; time.time() is wall-clock time and may jump or be too coarse.
start = time.perf_counter()
result = func()
elapsed = time.perf_counter() - start

# Using timeit: runs the statement `number` times and returns the total
# seconds; the setup string is executed once and is not included in the timing.
import timeit
timeit_seconds = timeit.timeit('np.sum(arr)',
                               setup='import numpy as np; arr = np.arange(1000)',
                               number=1000)

常用代码片段

数据标准化

python 复制代码

def normalize(data, axis=0):
    """Standardize *data* to zero mean and unit standard deviation along *axis*.

    Args:
        data: Array-like of numbers.
        axis: Axis (or axes) along which the mean and standard deviation are
            computed. Defaults to 0, i.e. per column of a 2-D array.

    Returns:
        Array of the same shape as *data* holding ``(data - mean) / std``.
        Slices whose standard deviation is 0 (constant features) are returned
        as all zeros instead of NaN/inf.
    """
    data = np.asanyarray(data)
    mean = data.mean(axis=axis, keepdims=True)
    std = data.std(axis=axis, keepdims=True)
    # A constant slice has std == 0; dividing by 1 instead leaves its centered
    # values (all zeros) untouched and avoids a division-by-zero warning.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std

训练集/测试集划分

python 复制代码

def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split paired samples into a training part and a test part.

    Args:
        X: Feature array supporting integer-array indexing (e.g. an ndarray);
            the first axis indexes samples.
        y: Target array with the same number of samples as *X*.
        test_size: Fraction of the samples assigned to the test part.
        random_state: Seed passed to ``np.random.default_rng``; the same seed
            reproduces the same split. ``None`` gives a fresh random split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If *X* and *y* contain a different number of samples.
    """
    n = len(X)
    if len(y) != n:
        raise ValueError(
            f"X and y must have the same number of samples, got {n} and {len(y)}"
        )

    rng = np.random.default_rng(random_state)
    indices = rng.permutation(n)

    # Round away floating-point noise before truncating: 1 - 0.9 evaluates to
    # 0.09999999999999998, so int(10 * (1 - 0.9)) would be 0 and leave the
    # training set empty instead of holding 1 sample.
    split_idx = int(round(n * (1 - test_size), 9))

    train_idx, test_idx = indices[:split_idx], indices[split_idx:]
    return X[train_idx], X[test_idx], y[train_idx], y[test_idx]

批量处理

python 复制代码

def batch_generator(data, batch_size):
    """Yield consecutive slices of *data* holding at most *batch_size* items.

    Args:
        data: Any sliceable sequence supporting ``len()`` (list, ndarray, ...).
        batch_size: Maximum number of items per batch; must be at least 1.

    Yields:
        ``data[i:i + batch_size]`` for i = 0, batch_size, 2 * batch_size, ...
        The last batch is shorter when ``len(data)`` is not a multiple of
        *batch_size*.

    Raises:
        ValueError: If *batch_size* is smaller than 1. As with any generator,
            the error surfaces when iteration starts.
    """
    # A negative step would make range() empty and silently yield nothing,
    # so invalid sizes are rejected explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    n = len(data)
    for start in range(0, n, batch_size):
        yield data[start:start + batch_size]

滑动窗口

python 复制代码

def sliding_window(arr, window_size):
    """Return every contiguous window of length *window_size* over a 1-D array.

    Args:
        arr: One-dimensional array-like.
        window_size: Number of elements per window.

    Returns:
        Read-only view of shape ``(arr.size - window_size + 1, window_size)``;
        row i holds ``arr[i:i + window_size]``. No data is copied, so call
        ``.copy()`` on the result before modifying it.

    Raises:
        ValueError: If *arr* is not one-dimensional, or if *window_size* is
            negative or larger than the array.
    """
    arr = np.asarray(arr)
    if arr.ndim != 1:
        # The hand-computed shape/strides of a raw as_strided call are only
        # valid for 1-D input; for other ranks they would address memory
        # outside the array.
        raise ValueError(f"sliding_window expects a 1-D array, got {arr.ndim}-D")
    # sliding_window_view is the bounds-checked replacement for as_strided.
    return np.lib.stride_tricks.sliding_window_view(arr, window_size)

Min-Max归一化

python 复制代码

def min_max_scale(data, axis=0):
    """Linearly rescale *data* to the range [0, 1] along *axis*.

    Args:
        data: Array-like of numbers.
        axis: Axis (or axes) along which the minimum and maximum are taken.
            Defaults to 0, i.e. per column of a 2-D array.

    Returns:
        Array of the same shape as *data* holding
        ``(data - min) / (max - min)``. Slices where max equals min (constant
        features) are returned as all zeros instead of NaN.
    """
    data = np.asanyarray(data)
    min_val = data.min(axis=axis, keepdims=True)
    max_val = data.max(axis=axis, keepdims=True)
    span = max_val - min_val
    # A constant slice has span == 0; dividing by 1 instead keeps its shifted
    # values (all zeros) and avoids a division-by-zero warning.
    safe_span = np.where(span == 0, 1, span)
    return (data - min_val) / safe_span

Z-Score标准化

python 复制代码

def z_score(data, axis=0):
    """Compute z-scores of *data* along *axis*.

    Each value is expressed as its distance from the mean in units of the
    population standard deviation (``ddof=0``, the NumPy default).

    Args:
        data: Array-like of numbers.
        axis: Axis (or axes) along which the statistics are computed.
            Defaults to 0, i.e. per column of a 2-D array.

    Returns:
        Array of the same shape as *data*. Slices with zero standard deviation
        (constant features) yield zeros instead of NaN/inf.
    """
    values = np.asanyarray(data)
    center = values.mean(axis=axis, keepdims=True)
    spread = values.std(axis=axis, keepdims=True)
    # Substitute 1 for a zero spread: the numerator is already 0 there, so the
    # result is 0 rather than the NaN produced by 0 / 0.
    spread = np.where(spread == 0, 1.0, spread)
    return (values - center) / spread

学习资源

本笔记整合了 numpy_tutorial 目录下 10 章教程代码的实际场景案例