【AI大模型--NumPy-02】-数组创建与高级索引完全指南

02_arrayIndexing.py - 数组创建与高级索引完全指南

学习路径第 2 步 (共 10 步) | 难度：基础
概述

NumPy 的核心能力之一是强大的索引系统。本文件先梳理常用的数组创建方式，再系统讲解基础切片、花式索引、布尔掩码操作，以及结构化数组的实际应用。
学习目标

掌握基础切片语法（本文以一维/二维数组演示，同样的规则可推广到三维及更高维数组）
理解花式索引 (Fancy Indexing) 与普通切片的区别
学会使用布尔掩码进行数据筛选和清洗
了解结构化数组与自定义数据类型
核心内容 (6 个模块)

模块	核心知识点
1. 基础切片	`start:stop:step`、负索引、多维数组切片
2. 花式索引	用整数数组索引、跨行/跨列选取元素
3. 布尔掩码	比较运算符、`np.where()`、数据过滤筛选
4. 结构化数组	自定义 dtype、字段访问、命名列操作
5. 实战案例	销售数据筛选与聚合统计
6. 速查表	常用索引语法快速参考
完整示例代码 (Python)：
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
=====================================
NumPy 数组创建与高级索引完全指南 (Array Creation & Advanced Indexing)
=====================================

本案例系统介绍 NumPy 数组的：
1. 多种数组创建方式及其适用场景
2. 基础切片 (Basic Slicing)
3. 花式索引 (Fancy Indexing)
4. 布尔索引 / 掩码操作 (Boolean Masking)
5. 结构化数组与自定义数据类型
6. 实战应用：数据筛选与清洗

【核心概念】
  NumPy 的索引系统是其最强大的特性之一，
  熟练掌握可以写出简洁高效的向量化代码，
  避免显式循环，性能提升数十到数百倍。

作者：学习笔记
日期：2026-05-18
"""

import numpy as np


def separator(title: str, width: int = 60) -> None:
    """Print a section banner: a blank line, a rule, the title, and a rule.

    Args:
        title: Text shown between the two rules (printed with a two-space
            indent).
        width: Number of ``=`` characters in each rule. Defaults to 60.
    """
    rule = "=" * width
    # The leading newline separates the banner from any preceding output.
    print(f"\n{rule}")
    print(f"  {title}")
    print(rule)


# ============================================================
# Part 1: overview of array-creation routines
# ============================================================
separator("一、数组创建方式全景图")

# Quick-reference table of creation routines. The last row names
# np.fromfunction(), which is the routine demonstrated further down.
print("""
┌───────────────────────────────────────────────────────────────┐
│                    NumPy 数组创建方法速查                      │
├──────────────┬──────────────────┬─────────────────────────────┤
│ 方法          │ 典型用法           │ 适用场景                   │
├──────────────┼──────────────────┼─────────────────────────────┤
│ np.array()   │ np.array([1,2,3]) │ 从 Python 列表/元组创建     │
│ np.zeros()   │ np.zeros((3,4))  │ 预分配内存(全0)             │
│ np.ones()    │ np.ones((2,3))   │ 预分配内存(全1)             │
│ np.full()    │ np.full((2,3),7) │ 创建指定值的数组            │
│ np.arange()  │ np.arange(0,10,2)│ 类似 range(), 等差数列      │
│ np.linspace()│ np.linspace(0,1) │ 等间隔采样(指定点数)        │
│ np.logspace()│ np.logspace(0,2)│ 对数等间隔                  │
│ np.eye()     │ np.eye(3)       │ 单位矩阵                    │
│ np.diag()    │ np.diag([1,2,3])│ 对角矩阵                    │
│ np.random.*  │ np.random.randn()│ 随机数生成                  │
│ np.fromfunction│ np.fromfunction()│ 根据坐标函数生成          │
└──────────────┴──────────────────┴─────────────────────────────┘
""")

# --- Live demonstration of the common constructors ---
print("【实际演示】\n")

# 1. np.array: build from nested Python sequences.
arr_list = np.array([1, 2, 3, 4, 5])
print(f"1. from list:     {arr_list}, dtype={arr_list.dtype}")

arr_2d = np.array([[1, 2, 3], [4, 5, 6]])
print(f"   2D array shape: {arr_2d.shape}")

# 2. zeros / ones / full: pre-filled arrays of a given shape.
print(f"2. zeros(3,3):\n{np.zeros((3,3), dtype=int)}")
print(f"3. ones(2,4): \n{np.ones((2,4))}")
print(f"4. full(2,3, fill=PI):\n{np.full((2,3), np.pi)}")

# 3. arange takes a step; linspace takes a number of points.
print(f"\n5. arange(0, 10, 2) → {np.arange(0, 10, 2)}")
print(f"   linspace(0, 10, 5) → {np.linspace(0, 10, 5)}")
print("   [!] arange: 指定步长，终点不一定包含")
print("   [!] linspace: 指定点数，始终包含起点和终点")

# 4. Special matrices.
print(f"\n6. eye(3) 单位矩阵:\n{np.eye(3)}")
print(f"7. diag([1,2,3]) 对角矩阵:\n{np.diag([1, 2, 3])}")

# 5. dtype determines per-element size and therefore total memory.
print("\n8. dtype 对内存的影响:")
for dt in [np.int8, np.int32, np.int64, np.float32, np.float64]:
    arr = np.zeros(1000, dtype=dt)
    # dt.__name__ is the short type name (e.g. "int8"), which fits the
    # 10-character column; str(dt) would print "<class 'numpy.int8'>".
    print(f"   {dt.__name__:10s} → 每元素 {arr.itemsize} 字节, 总计 {arr.nbytes/1024:.1f} KB")

# 6. fromfunction: each element is computed from its own (i, j) coordinates.
print("\n9. fromfunction (根据坐标函数生成):")
coord_arr = np.fromfunction(lambda i, j: i * 10 + j, (3, 5))
print(coord_arr)
print("   解释: arr[i,j] = i*10 + j")


# ============================================================
# Part 2: basic slicing
# ============================================================
separator("二、基础切片 (Basic Slicing)")

print("""
【语法规则】 arr[start:stop:step]
  - start: 起始索引 (含), 默认 0
  - stop:  结束索引 (不含), 默认末尾
  - step:  步长, 默认 1 (负数表示反向)

[★] 切片返回的是视图(View)，不是拷贝！修改切片会影响原数组！
""")

# Test data: the integers 0..23 laid out as 4 rows x 6 columns.
data = np.arange(24).reshape(4, 6)
print("原始数组 (4x6):")
print(data)

# One-dimensional slicing.
a1d = np.arange(10)
# .tolist() yields plain Python ints, so the list prints as [0, 1, ...]
# on every NumPy version (list(a1d) would show np.int64(...) on NumPy >= 2).
print(f"\n一维切片演示 (arr = {a1d.tolist()}):")
print(f"  a1d[1:5]     = {a1d[1:5]}         # 取索引 1~4")
print(f"  a1d[:5]      = {a1d[:5]}          # 取前5个")
print(f"  a1d[5:]      = {a1d[5:]}          # 取第5个之后")
print(f"  a1d[::2]     = {a1d[::2]}         # 每隔一个取")
print(f"  a1d[::-1]    = {a1d[::-1]}        # 反转!")
print(f"  a1d[-3:]     = {a1d[-3:]}         # 最后3个 (负索引)")

# Two-dimensional slicing on the 4x6 array above.
print("\n二维切片 (基于上面的 4x6 数组):")
print(f"  data[0, :]    = 第0行:    {data[0, :]}")
print(f"  data[:, 0]    = 第0列:    {data[:, 0]}")
print(f"  data[1:3, 2:5] = 子块:\n{data[1:3, 2:5]}")
print(f"  data[::2, ::2] = 隔行隔列:\n{data[::2, ::2]}")
print(f"  data[:, ::-1]  = 左右翻转列:\n{data[:, ::-1]}")

# View vs copy: writing through the slice changes `data`; writing to the
# .copy() does not.
print("\n[!!] 视图 vs 拷贝:")
view_slice = data[0:2, 0:2]
copy_slice = data[0:2, 0:2].copy()
view_slice[0, 0] = 999
copy_slice[0, 0] = 888
print(f"  修改 view_slice 后, 原数组变化了吗? data[0,0]={data[0,0]} (变了!)")
print("  修改 copy_slice 后, 原数组不变。用 .copy() 可避免副作用")


# ============================================================
# Part 3: fancy indexing
# ============================================================
separator("三、花式索引 (Fancy Indexing)")

print("""
【定义】使用整数数组作为索引，可同时选取非连续的多个元素

[★] 与切片的关键区别:
  - 切片: 返回 视图(View)
  - 花式索引: 返回 拷贝(Copy)，不影响原数组
""")

arr = np.arange(1, 13).reshape(3, 4)
print(f"原始数组:\n{arr}\n")

# Integer list on one axis: pick columns 0, 2 and 3 of every row.
idx = [0, 2, 3]
print(f"1. 一维花式索引: arr[:, {idx}]")
print(f"   结果:\n{arr[:, idx]}")

# Integer lists on both axes are paired element-wise: (0,1), (2,3), (2,0).
rows = [0, 2, 2]
cols = [1, 3, 0]
# The label shows exactly the expression evaluated on the next line.
print(f"\n2. 二维花式索引: arr[{rows}, {cols}]")
print(f"   结果: {arr[rows, cols]}")

# np.take with no axis indexes into the flattened array.
flat_idx = [0, 5, 11]
print(f"\n3. np.take (扁平化后按索引取): np.take(arr, {flat_idx})")
print(f"   结果: {np.take(arr, flat_idx)}")

# np.ix_ builds an open mesh, selecting every (row, col) combination.
print("\n4. np.ix_ (用于多维组合索引):")
idx_rows = np.array([0, 2])
idx_cols = np.array([1, 2, 3])
result = arr[np.ix_(idx_rows, idx_cols)]
# .tolist() prints plain ints instead of np.int64(...) on NumPy >= 2.
print(f"   选 行{idx_rows.tolist()} × 列{idx_cols.tolist()}:\n{result}")


# ============================================================
# Part 4: boolean indexing / masking
# ============================================================
separator("四、布尔索引 / 掩码操作 ★★★")

print("""
【原理】用布尔数组作为掩码(mask)，True位置保留，False位置丢弃

这是数据分析中最常用的技巧之一！
等价于 SQL 的 WHERE 子句或 pandas 的 query。
""")

# Scenario: one name and two subject scores per student (parallel arrays).
students = np.array(['Alice', 'Bob', 'Charlie', 'Diana', 'Eve',
                     'Frank', 'Grace', 'Henry'])
scores_math = np.array([85, 62, 91, 45, 78, 55, 88, 72])
scores_eng = np.array([90, 58, 85, 50, 82, 60, 92, 68])

print("学生成绩数据:")
print(f"  姓名:  {students}")
print(f"  数学:  {scores_math}")
print(f"  英语:  {scores_eng}")

# A comparison yields a boolean array usable as an index on any parallel array.
mask_pass = scores_math >= 60
print(f"\n数学及格掩码 (>=60): {mask_pass}")
print(f"及格的学生: {students[mask_pass]}")
print(f"他们的成绩:  {scores_math[mask_pass]}")

# Compound condition: & binds tighter than >=, so each comparison needs
# its own parentheses.
mask_good = (scores_math >= 80) & (scores_eng >= 80)
print(f"\n双科 >=80 的学霸: {students[mask_good]}")

# np.where(cond, a, b): element-wise choice between two values.
math_better = np.where(scores_math > scores_eng, "MATH", "ENG")
# .tolist() converts to plain str so the pairs print without np.str_(...)
# wrappers on NumPy >= 2.
print(f"\n各人更强科目: {list(zip(students.tolist(), math_better.tolist()))}")

# Masking in practice: replace out-of-range sensor readings with NaN.
print("\n--- 异常值处理场景 ---")
sensor_data = np.array([23.5, 24.1, 999.9, 23.8, 24.0, -99.0,
                        23.9, 1000.5, 23.7, 24.2])
print(f"原始传感器数据: {sensor_data}")

valid_mask = (sensor_data > 0) & (sensor_data < 100)
clean_data = sensor_data.copy()  # keep the raw readings untouched
clean_data[~valid_mask] = np.nan
print(f"清洗后数据:     {clean_data}")
print(f"有效均值:       {np.nanmean(clean_data):.2f} (排除异常值)")

# np.select: the first matching condition wins, so thresholds are listed
# from highest to lowest.
conditions = [
    scores_math >= 90,
    scores_math >= 70,
    scores_math >= 60,
]
choices = ['A', 'B', 'C']
grades = np.select(conditions, choices, default='D')
# Plain-str keys and values keep the printed dict readable on NumPy >= 2.
print(f"\n数学等级评定: {dict(zip(students.tolist(), grades.tolist()))}")


# ============================================================
# Part 5: structured arrays and custom dtypes
# ============================================================
separator("五、结构化数组 (Structured Arrays)")

print("""
【用途】当需要在一个数组中存储混合类型数据时（类似数据库表）
  例如: 每条记录包含 姓名(str)、年龄(int)、分数(float)
""")

# Compound dtype built from parallel field-name / format-code lists:
# U10 = unicode string up to 10 chars, i4 = 32-bit int,
# f4 = 32-bit float, ? = boolean.
field_names = ['name', 'age', 'score', 'passed']
field_formats = ['U10', 'i4', 'f4', '?']
student_dtype = np.dtype(list(zip(field_names, field_formats)))

# One tuple per record, in field order.
records = [
    ('Alice', 20, 88.5, True),
    ('Bob', 19, 56.0, False),
    ('Charlie', 21, 92.3, True),
    ('Diana', 20, 73.8, True),
]
structured_data = np.array(records, dtype=student_dtype)

print("结构化数组:")
print(structured_data)

# Indexing by field name returns that column as an ordinary array.
name_column = structured_data['name']
score_column = structured_data['score']
passed_column = structured_data['passed']
print("\n字段访问:")
print(f"  所有姓名: {name_column}")
print(f"  所有分数: {score_column}")
print(f"  平均分:   {score_column.mean():.1f}")
print(f"  及格率:   {passed_column.mean()*100:.0f}%")

# Sort by the 'score' field; reversing the ascending order gives descending.
sorted_indices = structured_data.argsort(order='score')[::-1]
print("\n按分数降序排列:")
for rec in structured_data[sorted_indices]:
    status = 'PASS' if rec['passed'] else 'FAIL'
    print(f"  {rec['name']:8s} 年龄={rec['age']} 分数={rec['score']:.1f} {status}")


# ============================================================
# Part 6: worked example - sales data cleaning and analysis
# ============================================================
separator("六、综合实战: 销售数据清洗与分析")

print("--- 模拟销售数据集 ---")
# Simulated data set: product id, unit price, quantity and region per order.
# The seed is fixed and the random draws happen in a fixed order
# (products, quantities, regions) so every run prints the same data.
np.random.seed(42)
n_records = 20

product_ids = np.random.choice(['P001', 'P002', 'P003', 'P004'], n_records)
# Unit price per product; anything that is not P001-P003 falls back to 89.
price_conditions = [
    product_ids == 'P001',
    product_ids == 'P002',
    product_ids == 'P003',
]
prices = np.select(price_conditions, [299, 499, 129], default=89)
quantities = np.random.randint(1, 50, n_records).astype(np.int32)
regions = np.random.choice(['North', 'South', 'East', 'West'], n_records)
revenue = prices * quantities

print(f"记录数: {n_records}")
print(f"产品ID: {product_ids}")
print(f"单价:   {prices}")
print(f"销量:   {quantities}")
print(f"地区:   {regions}")
print(f"营收:   {revenue}")

print("\n--- 数据分析结果 ---")

# 1. Total revenue and units sold per product.
unique_products = np.unique(product_ids)
print("各产品总营收:")
for product in unique_products:
    is_product = product_ids == product
    product_revenue = revenue[is_product].sum()
    product_units = quantities[is_product].sum()
    print(f"  {product}: revenue {product_revenue:,}  sales {product_units} items")

# 2. Order count and mean revenue per order, by region.
print("\n各地区统计:")
for region in np.unique(regions):
    in_region = regions == region
    order_count = in_region.sum()
    mean_revenue = revenue[in_region].mean()
    print(f"  {region:5s} → 订单{order_count}笔, 平均营收 ¥{mean_revenue:.0f}")

# 3. Three highest-revenue orders: reverse the ascending argsort, take three.
top3_idx = np.argsort(revenue)[::-1][:3]
print("\nTop 3 最高订单:")
for rank, order in enumerate(top3_idx, 1):
    print(f"  #{rank} {product_ids[order]:4s} x{quantities[order]:2d} = ¥{revenue[order]:,} ({regions[order]})")

# 4. Large-order detection: revenue above 15000 counts as large.
large_orders_mask = revenue > 15000
large_count = large_orders_mask.sum()
print(f"\n大额订单(>¥15000): {large_count} 笔, 占比 {large_count/n_records*100:.0f}%")
if large_count:
    print(f"  详情: {product_ids[large_orders_mask]}")


# ============================================================
# Summary
# ============================================================
separator("总结: NumPy 索引体系速查")

# Cheat sheet of the four indexing styles covered by this script.
summary = """
+--------------------------------------------------------------+
|                    NumPy 索引方式速查                          |
+--------------------------------------------------------------+
|                                                              |
|  1. 基础切片 arr[start:stop:step]                            |
|     - 返回 VIEW (视图), 修改影响原数组                        |
|     - 支持负索引、省略参数、步长                              |
|                                                              |
|  2. 花式索引 arr[[i,j,k]]                                    |
|     - 返回 COPY (拷贝), 安全但多占内存                       |
|     - 支持整数数组索引, np.ix_ 组合                           |
|                                                              |
|  3. 布尔索引 arr[bool_array]                                 |
|     - 最强大的数据筛选工具                                   |
|     - 支持 & | ~ 复合条件运算                                |
|     - 配合 np.where / np.select 使用                         |
|                                                              |
|  4. 结构化数组 dtype=[(field,type)]                          |
|     - 存储异构数据 (类似表格)                                |
|     - 按字段名访问, 支持排序和聚合                           |
|                                                              |
|  [记忆口诀]                                                   |
|  切片是窗口看原数据, 花式是抄录新副本,                        |
|  布尔像筛子过滤数据, 结构化像迷你表格。                        |
+--------------------------------------------------------------+
"""
# A single print with a newline separator emits the cheat sheet followed by
# the closing message, each terminated by a newline.
print(summary, "\n程序运行完毕!", sep="\n")