本教程详细讲解NumPy的常用内置函数、统计函数、去重排序函数以及数组的各种运算操作,包含大量实例、易错点分析和性能对比。
1. NumPy基本数学函数
1.1 创建测试数据
import numpy as np
# 设置随机种子,保证结果可重复
np.random.seed(42)
# 创建一个形状为(2,3)的数组,元素范围在[-3,5)之间
arr1 = np.random.uniform(-3, 5, size=(2, 3))
print("原始数组 arr1:")
print(arr1)
print(f"数组形状: {arr1.shape}")
print(f"数据类型: {arr1.dtype}")
print()
# 为了演示效果,我们同时创建一个包含NaN的数组
arr_with_nan = np.array([[1.0, 2.0, np.nan],
[4.0, np.nan, 6.0]])
print("包含NaN的数组:")
print(arr_with_nan)
print()
1.2 取整与绝对值函数
print("=== 取整与绝对值函数 ===")
print()
# 1. np.ceil() - 向上取整
print("1. np.ceil() - 向上取整")
ceil_result = np.ceil(arr1)
print(f"原始数组:\n{arr1}")
print(f"向上取整结果:\n{ceil_result}")
print(f"数据类型: {ceil_result.dtype}")
print()
# 2. np.floor() - 向下取整
print("2. np.floor() - 向下取整")
floor_result = np.floor(arr1)
print(f"原始数组:\n{arr1}")
print(f"向下取整结果:\n{floor_result}")
print()
# 3. np.trunc() - 截断小数部分
print("3. np.trunc() - 截断小数部分")
trunc_result = np.trunc(arr1)
print(f"截断结果:\n{trunc_result}")
print()
# 4. np.round() - 四舍五入
print("4. np.round() - 四舍五入(指定小数位数)")
round_result = np.round(arr1, decimals=2) # 保留2位小数
rint_result = np.rint(arr1) # 四舍五入到最近的整数
print(f"保留2位小数:\n{round_result}")
print(f"四舍五入到整数 (np.rint):\n{rint_result}")
print()
# Compare np.rint and np.round on exact .5 ties.
# Both functions round half to even ("banker's rounding"), so they agree on
# every tie; neither one implements schoolbook round-half-up.
test_numbers = np.array([1.5, 2.5, 3.5, 4.5])
print(f"测试数组: {test_numbers}")
print(f"np.rint结果: {np.rint(test_numbers)} # 银行家舍入法")
print(f"np.round结果: {np.round(test_numbers)} # 同样是银行家舍入法(四舍六入五成双)")
# Schoolbook half-up rounding has to be written explicitly; this form is
# only valid for non-negative values.
half_up_result = np.floor(test_numbers + 0.5)
print(f"真正的四舍五入 (np.floor(x + 0.5)): {half_up_result}")
print()
# 5. np.abs() - 绝对值
print("5. np.abs() - 绝对值")
abs_result = np.abs(arr1)
print(f"原始数组:\n{arr1}")
print(f"绝对值结果:\n{abs_result}")
print()
# 6. 其他绝对值相关函数
print("6. 其他绝对值相关函数")
complex_arr = np.array([1+2j, 3-4j, -5+6j])
print(f"复数数组: {complex_arr}")
print(f"np.absolute (绝对值): {np.absolute(complex_arr)}")
print(f"np.abs (绝对值): {np.abs(complex_arr)}")
print(f"np.real (实部): {np.real(complex_arr)}")
print(f"np.imag (虚部): {np.imag(complex_arr)}")
print()
1.3 特殊值判断函数
print("=== 特殊值判断函数 ===")
print()
# 1. np.isnan() - 判断是否为NaN
print("1. np.isnan() - 判断是否为NaN")
print(f"测试数组:\n{arr_with_nan}")
print(f"是否为NaN:\n{np.isnan(arr_with_nan)}")
print()
# 2. np.isinf() - 判断是否为无穷大
print("2. np.isinf() - 判断是否为无穷大")
arr_inf = np.array([1.0, 2.0, np.inf, -np.inf, 5.0])
print(f"测试数组: {arr_inf}")
print(f"是否为无穷大: {np.isinf(arr_inf)}")
print()
# 3. np.isfinite() - 判断是否为有限数
print("3. np.isfinite() - 判断是否为有限数")
print(f"测试数组: {arr_inf}")
print(f"是否为有限数: {np.isfinite(arr_inf)}")
print()
# 4. 组合判断示例
print("4. 组合判断示例 - 清理无效数据")
data = np.array([1.0, np.nan, 3.0, np.inf, -np.inf, 6.0, np.nan])
print(f"原始数据: {data}")
# 找出有效数据(非NaN且有限)
valid_mask = np.isfinite(data)
clean_data = data[valid_mask]
print(f"有效数据索引: {valid_mask}")
print(f"清理后数据: {clean_data}")
print()
1.4 条件选择函数 np.where()
print("=== 条件选择函数 np.where() ===")
print()
# 1. 基本用法:三目运算符形式
print("1. np.where() 基本用法(三目运算符)")
condition = arr1 > 0
where_result = np.where(condition, 1, -1) # 大于0的置1,否则置-1
print(f"条件数组 (arr1 > 0):\n{condition}")
print(f"np.where结果:\n{where_result}")
print()
# 2. Nested np.where: each level tests one threshold and the innermost
#    "else" supplies the fallback grade.
print("2. 多条件选择")
score = np.array([85, 92, 78, 60, 45, 90, 100, 55])
grade = np.where(score >= 90, 'A',
                 np.where(score >= 80, 'B',
                          np.where(score >= 60, 'C', 'D')))
print(f"分数: {score}")
print(f"等级: {grade}")
print()
# 3. np.select: conditions are checked in order and the first match wins.
#    The fallback must be given explicitly as a string: the implicit
#    default is the integer 0, which has no common dtype with string
#    choices and is rejected by newer NumPy releases.
print("3. np.select() 多条件选择(推荐)")
conditions = [
    score >= 90,
    score >= 80,
    score >= 60,
]
choices = ['A', 'B', 'C']
grade_select = np.select(conditions, choices, default='D')
print(f"np.select结果: {grade_select}")
print()
# 4. 条件索引形式
print("4. np.where() 条件索引形式")
arr = np.array([1, 2, 3, 4, 5, 6])
indices = np.where(arr > 3)
print(f"数组: {arr}")
print(f"大于3的元素索引: {indices}")
print(f"大于3的元素: {arr[indices]}")
print()
# 5. Practical example: replacing outliers during data cleaning
print("5. 实际应用:异常值处理")
data = np.array([1.0, 2.0, 100.0, 4.0, 5.0, 200.0, 7.0, 8.0])
print(f"原始数据: {data}")
# Flag outliers by z-score (distance from the mean in units of the standard
# deviation), not by a fixed cut-off value.
# NOTE: the mean and std are themselves inflated by the extreme values, so
# with this sample only 200.0 exceeds the threshold (100.0 has |z| below 1).
mean = np.mean(data)
std = np.std(data)
z_scores = np.abs((data - mean) / std)
is_outlier = z_scores > 2 # an absolute z-score above 2 counts as an outlier
# Replace every flagged value with the median of the original data
median = np.median(data)
cleaned_data = np.where(is_outlier, median, data)
print(f"均值: {mean:.2f}, 标准差: {std:.2f}")
print(f"z-scores: {z_scores}")
print(f"异常值掩码: {is_outlier}")
print(f"清理后数据: {cleaned_data}")
print()
1.5 算术运算函数
print("=== 算术运算函数 ===")
print()
# 1. 基本算术运算
print("1. 基本算术运算")
a = np.array([1, 2, 3, 4, 5])
b = np.array([6, 7, 8, 9, 10])
print(f"数组 a: {a}")
print(f"数组 b: {b}")
print(f"加法 (a+b): {a + b}")
print(f"减法 (a-b): {a - b}")
print(f"乘法 (a*b): {a * b}")
print(f"除法 (a/b): {a / b}")
print(f"整除 (a//b): {a // b}")
print(f"取余 (a%b): {a % b}")
print(f"幂运算 (a**2): {a ** 2}")
print()
# 2. 使用函数形式的运算
print("2. 函数形式的算术运算")
print(f"np.add(a, b): {np.add(a, b)}")
print(f"np.subtract(a, b): {np.subtract(a, b)}")
print(f"np.multiply(a, b): {np.multiply(a, b)}")
print(f"np.divide(a, b): {np.divide(a, b)}")
print(f"np.floor_divide(a, b): {np.floor_divide(a, b)}")
print(f"np.mod(a, b): {np.mod(a, b)}")
print(f"np.power(a, 2): {np.power(a, 2)}")
print()
# 3. 就地修改运算
print("3. 就地修改运算")
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.0, 2.0, 2.0, 2.0, 2.0])
# 创建副本用于演示
x_copy = x.copy()
np.multiply(x_copy, y, out=x_copy) # 就地修改
print(f"x * y (就地修改): {x_copy}")
# 对比普通乘法
x_normal = x * y
print(f"x * y (普通): {x_normal}")
print()
# 4. Handling division by zero
print("4. 处理除零错误")
numerator = np.array([1, 2, 3, 4, 5])
denominator = np.array([1, 2, 0, 4, 0])
# Method 1: silence the divide/invalid warnings for the division only, then
# overwrite the inf/nan slots produced by the zero denominators.
with np.errstate(divide='ignore', invalid='ignore'):
    result = numerator / denominator
result = np.where(denominator == 0, 0, result)  # zero-denominator slots become 0
print(f"安全除法结果: {result}")
# Method 2: the ufunc `where` mask skips the zero denominators entirely.
# `out` is required here: positions where the mask is False are left
# untouched, so without a pre-filled output they hold uninitialised memory.
result2 = np.divide(
    numerator,
    denominator,
    out=np.zeros(numerator.shape, dtype=float),
    where=denominator != 0,
)
print(f"使用where参数: {result2}")
print()
# 5. 性能对比:函数形式 vs 运算符
print("5. 性能对比:函数形式 vs 运算符")
import time
size = 1000000
x_large = np.random.randn(size)
y_large = np.random.randn(size)
# 运算符
start = time.time()
result_op = x_large + y_large
time_op = time.time() - start
# 函数形式
start = time.time()
result_func = np.add(x_large, y_large)
time_func = time.time() - start
print(f"数组大小: {size:,}")
print(f"运算符 (+) 耗时: {time_op:.6f}秒")
print(f"函数 (np.add) 耗时: {time_func:.6f}秒")
print(f"结果相等: {np.allclose(result_op, result_func)}")
print()
# 6. Practical example: vectorised arithmetic versus an explicit Python loop
print("6. 实际应用:向量化运算 vs 循环")


def calculate_with_loop(a, b):
    """Compute ``a + b * 2 - 1`` element by element with a Python loop.

    Args:
        a: 1-D NumPy array.
        b: 1-D NumPy array with at least as many elements as ``a``.

    Returns:
        A new array holding the element-wise result, with the same dtype the
        vectorised expression would produce.
    """
    # Derive the output dtype from the expression itself; using
    # np.zeros_like(a) would truncate the result when `a` is an integer
    # array and `b` is floating point.
    result = np.zeros(a.shape, dtype=(a[:1] + b[:1] * 2 - 1).dtype)
    for i in range(len(a)):
        result[i] = a[i] + b[i] * 2 - 1
    return result


def calculate_vectorized(a, b):
    """Compute ``a + b * 2 - 1`` for whole arrays in one vectorised expression."""
    return a + b * 2 - 1


# Benchmark both implementations on the same random input.
a_test = np.random.randn(10000)
b_test = np.random.randn(10000)
# perf_counter is a monotonic high-resolution clock; time.time() can report
# an elapsed time of exactly 0 on coarse system clocks.
start = time.perf_counter()
result_loop = calculate_with_loop(a_test, b_test)
time_loop = time.perf_counter() - start
start = time.perf_counter()
result_vec = calculate_vectorized(a_test, b_test)
time_vec = time.perf_counter() - start
# Guard the ratio so an unmeasurably fast run cannot raise ZeroDivisionError.
speedup = time_loop / time_vec if time_vec > 0 else float("inf")
print(f"循环计算耗时: {time_loop:.6f}秒")
print(f"向量化计算耗时: {time_vec:.6f}秒")
print(f"加速比: {speedup:.1f}倍")
print(f"结果相等: {np.allclose(result_loop, result_vec)}")
2. NumPy统计函数
2.1 基础统计函数
print("=== NumPy统计函数 ===")
print()
# 创建测试数据
arr1 = np.arange(12).reshape(3, 4)
print("测试数组 arr1:")
print(arr1)
print(f"形状: {arr1.shape}")
print()
# 1. 基本统计量
print("1. 基本统计量")
print(f"数组总和 np.sum(): {np.sum(arr1)}")
print(f"数组均值 np.mean(): {np.mean(arr1):.4f}")
print(f"数组中位数 np.median(): {np.median(arr1)}")
print(f"数组方差 np.var(): {np.var(arr1):.4f}")
print(f"数组标准差 np.std(): {np.std(arr1):.4f}")
print(f"数组最小值 np.min(): {np.min(arr1)}")
print(f"数组最大值 np.max(): {np.max(arr1)}")
print()
# 2. 按轴计算统计量
print("2. 按轴计算统计量")
print("沿axis=0计算(按列计算):")
print(f" 每列总和: {np.sum(arr1, axis=0)}")
print(f" 每列均值: {np.mean(arr1, axis=0)}")
print(f" 每列方差: {np.var(arr1, axis=0)}")
print(f" 每列标准差: {np.std(arr1, axis=0)}")
print(f" 每列最小值: {np.min(arr1, axis=0)}")
print(f" 每列最大值: {np.max(arr1, axis=0)}")
print()
print("沿axis=1计算(按行计算):")
print(f" 每行总和: {np.sum(arr1, axis=1)}")
print(f" 每行均值: {np.mean(arr1, axis=1)}")
print(f" 每行方差: {np.var(arr1, axis=1)}")
print(f" 每行标准差: {np.std(arr1, axis=1)}")
print(f" 每行最小值: {np.min(arr1, axis=1)}")
print(f" 每行最大值: {np.max(arr1, axis=1)}")
print()
# 3. 位置统计函数
print("3. 位置统计函数")
print(f"最小值索引 np.argmin(): {np.argmin(arr1)}")
print(f"最大值索引 np.argmax(): {np.argmax(arr1)}")
print()
# 获取多维数组中的位置
min_index_flat = np.argmin(arr1)
min_index_multi = np.unravel_index(min_index_flat, arr1.shape)
max_index_flat = np.argmax(arr1)
max_index_multi = np.unravel_index(max_index_flat, arr1.shape)
print(f"最小值在扁平化数组中的索引: {min_index_flat}")
print(f"最小值在多维数组中的位置: {min_index_multi}")
print(f"最大值在扁平化数组中的索引: {max_index_flat}")
print(f"最大值在多维数组中的位置: {max_index_multi}")
print(f"验证最小值: arr1[{min_index_multi}] = {arr1[min_index_multi]}")
print(f"验证最大值: arr1[{max_index_multi}] = {arr1[max_index_multi]}")
print()
# 4. 累积统计函数
print("4. 累积统计函数")
print("原始数组:")
print(arr1)
print()
# np.cumsum() - 累积和
print("np.cumsum() - 累积和")
print(f"默认(扁平化): {np.cumsum(arr1)}")
print(f"沿axis=0(按列累积):")
print(np.cumsum(arr1, axis=0))
print(f"沿axis=1(按行累积):")
print(np.cumsum(arr1, axis=1))
print()
# np.cumprod() - 累积积
print("np.cumprod() - 累积积")
print(f"默认(扁平化): {np.cumprod(arr1)}")
print(f"沿axis=0(按列累积):")
print(np.cumprod(arr1, axis=0))
print()
# 5. 百分位数统计
print("5. 百分位数统计")
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
print(f"数据: {data}")
print(f"25%分位数: {np.percentile(data, 25)}")
print(f"50%分位数(中位数): {np.percentile(data, 50)}")
print(f"75%分位数: {np.percentile(data, 75)}")
print(f"90%分位数: {np.percentile(data, 90)}")
# 多个百分位数
print(f"多个分位数: {np.percentile(data, [25, 50, 75])}")
# 使用np.quantile(推荐,功能相同)
print(f"使用np.quantile: {np.quantile(data, [0.25, 0.5, 0.75])}")
print()
# 6. 极差统计
print("6. 极差统计")
print(f"极差 np.ptp(): {np.ptp(arr1)}") # peak to peak
print(f"按列极差: {np.ptp(arr1, axis=0)}")
print(f"按行极差: {np.ptp(arr1, axis=1)}")
print()
2.2 加权统计函数
print("=== 加权统计函数 ===")
print()
# 1. 加权平均值
print("1. 加权平均值 np.average()")
data = np.array([1, 2, 3, 4, 5])
weights = np.array([0.1, 0.2, 0.3, 0.2, 0.2]) # 权重总和应为1
simple_mean = np.mean(data)
weighted_mean = np.average(data, weights=weights)
print(f"数据: {data}")
print(f"权重: {weights}")
print(f"简单平均值: {simple_mean:.4f}")
print(f"加权平均值: {weighted_mean:.4f}")
print()
# 2. 多维数组加权平均
print("2. 多维数组加权平均")
arr_2d = np.array([[1, 2, 3], [4, 5, 6]])
weights_2d = np.array([[0.1, 0.2, 0.3], [0.1, 0.2, 0.1]])
print(f"2D数组:\n{arr_2d}")
print(f"权重数组:\n{weights_2d}")
# 整个数组的加权平均
total_weighted = np.average(arr_2d, weights=weights_2d)
print(f"整个数组加权平均: {total_weighted:.4f}")
# 按轴加权平均
weighted_axis0 = np.average(arr_2d, axis=0, weights=[0.6, 0.4])
print(f"按列加权平均 (axis=0): {weighted_axis0}")
print()
# 3. 协方差和相关系数
print("3. 协方差和相关系数")
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 6, 8, 10])
z = np.array([5, 4, 3, 2, 1])
print(f"x: {x}")
print(f"y: {y} (x的两倍)")
print(f"z: {z} (递减)")
# 协方差矩阵
cov_matrix = np.cov([x, y, z])
print(f"协方差矩阵:\n{cov_matrix}")
# 相关系数矩阵
corr_matrix = np.corrcoef([x, y, z])
print(f"相关系数矩阵:\n{corr_matrix}")
# 解释
print("\n相关系数解读:")
print(f"x和y的相关系数: {corr_matrix[0, 1]:.6f} (完全正相关)")
print(f"x和z的相关系数: {corr_matrix[0, 2]:.6f} (完全负相关)")
print(f"y和z的相关系数: {corr_matrix[1, 2]:.6f} (完全负相关)")
print()
# 4. 实际应用:成绩统计
print("4. 实际应用:学生成绩统计系统")
def calculate_grades(scores, weights=None):
    """Print a grade report and return the weighted score of every student.

    Args:
        scores: 2-D array; axis 0 indexes students and axis 1 indexes
            subjects.
        weights: Optional per-subject weights passed to ``np.average``.
            When omitted, every subject gets the same weight.

    Returns:
        A ``(weighted_scores, ranks)`` tuple. ``weighted_scores`` holds one
        weighted average per student. ``ranks`` holds the 1-based student
        numbers ordered from the highest to the lowest weighted score; it is
        the same ordering that the printed ranking shows.
    """
    scores = np.asarray(scores)
    _, n_subjects = scores.shape
    if weights is None:
        weights = np.ones(n_subjects) / n_subjects
    print("=" * 60)
    print("学生成绩统计报告")
    print("=" * 60)
    # Per-subject mean
    subject_means = np.mean(scores, axis=0)
    print("\n各科平均分:")
    for i, subject_mean in enumerate(subject_means):
        print(f" 科目{i+1}: {subject_mean:.2f}")
    # Per-subject standard deviation
    subject_stds = np.std(scores, axis=0)
    print("\n各科标准差:")
    for i, subject_std in enumerate(subject_stds):
        print(f" 科目{i+1}: {subject_std:.2f}")
    # Per-student weighted total
    weighted_scores = np.average(scores, axis=1, weights=weights)
    print("\n学生加权总成绩:")
    for i, score in enumerate(weighted_scores):
        print(f" 学生{i+1}: {score:.2f}")
    # Ranking: compute the descending order once, with a stable sort, so the
    # returned ranking and the printed ranking cannot disagree on tied
    # scores (ties keep their original student order).
    order = np.argsort(-weighted_scores, kind='stable')
    ranks = order + 1
    print("\n学生排名:")
    for rank, student_idx in enumerate(order):
        print(f" 第{rank+1}名: 学生{student_idx+1} (分数: {weighted_scores[student_idx]:.2f})")
    # Summary over all students
    print("\n总体统计:")
    print(f" 平均分: {np.mean(weighted_scores):.2f}")
    print(f" 最高分: {np.max(weighted_scores):.2f}")
    print(f" 最低分: {np.min(weighted_scores):.2f}")
    print(f" 标准差: {np.std(weighted_scores):.2f}")
    return weighted_scores, ranks
# 测试
np.random.seed(42)
n_students = 10
n_subjects = 5
scores = np.random.randint(60, 101, (n_students, n_subjects))
weights = np.array([0.2, 0.2, 0.2, 0.2, 0.2]) # 各科权重相等
calculate_grades(scores, weights)
2.3 易错点与注意事项
print("=== 易错点与注意事项 ===")
print()
# 1. ddof参数的区别
print("1. ddof参数的区别(自由度调整)")
data = np.array([1, 2, 3, 4, 5])
print(f"数据: {data}")
# 总体方差 vs 样本方差
population_var = np.var(data, ddof=0) # 总体方差,分母为n
sample_var = np.var(data, ddof=1) # 样本方差,分母为n-1
print(f"总体方差 (ddof=0): {population_var}")
print(f"样本方差 (ddof=1): {sample_var}")
print(f"两者关系: 样本方差 = 总体方差 × n/(n-1)")
print()
# 2. NaN-aware statistics
print("2. 处理包含NaN的数据")
data_with_nan = np.array([1.0, 2.0, np.nan, 4.0, 5.0])
print(f"包含NaN的数据: {data_with_nan}")
# Pitfall: np.mean does not raise on NaN input, it silently propagates NaN
# into the result. There is no exception to catch, so the value is printed
# directly instead of being wrapped in try/except.
wrong_mean = np.mean(data_with_nan)
print(f"直接计算均值: {wrong_mean}")
# Correct approach: the nan* variants skip NaN entries.
correct_mean = np.nanmean(data_with_nan)
print(f"使用np.nanmean: {correct_mean}")
# Other NaN-skipping reductions
print(f"np.nanstd: {np.nanstd(data_with_nan)}")
print(f"np.nanvar: {np.nanvar(data_with_nan)}")
print(f"np.nanmin: {np.nanmin(data_with_nan)}")
print(f"np.nanmax: {np.nanmax(data_with_nan)}")
print()
# 3. 大型数组的内存问题
print("3. 大型数组的内存问题")
print("对于非常大的数组,一次性计算统计量可能占用大量内存。")
print("解决方案:使用分块计算或在线算法。")
def online_mean_variance(data, chunk_size=1000):
    """Compute the mean and sample variance of ``data`` one chunk at a time.

    Each chunk is summarised by its size, mean and sum of squared deviations,
    and that summary is merged into the running totals, so only one chunk is
    reduced at any moment.

    Args:
        data: Sliceable 1-D sequence of numbers.
        chunk_size: Number of elements reduced per step.

    Returns:
        ``(mean, variance)``, where variance uses the ``n - 1`` denominator
        and is 0 when fewer than two elements were seen.
    """
    seen = 0
    mean = 0.0
    sum_sq_dev = 0.0
    for begin in range(0, len(data), chunk_size):
        block = data[begin:begin + chunk_size]
        block_len = len(block)
        block_mean = np.mean(block)
        if block_len > 1:
            block_var = np.var(block, ddof=1)
        else:
            block_var = 0
        # Merge the block summary into the running summary; `shift` must be
        # taken before the running mean is updated.
        shift = block_mean - mean
        mean += shift * block_len / (seen + block_len)
        sum_sq_dev += block_var * (block_len - 1) + shift**2 * seen * block_len / (seen + block_len)
        seen += block_len
    if seen > 1:
        variance = sum_sq_dev / (seen - 1)
    else:
        variance = 0
    return mean, variance
# 测试
large_data = np.random.randn(10000)
online_mean, online_var = online_mean_variance(large_data)
true_mean, true_var = np.mean(large_data), np.var(large_data, ddof=1)
print(f"在线算法均值: {online_mean:.6f}, 真实均值: {true_mean:.6f}")
print(f"在线算法方差: {online_var:.6f}, 真实方差: {true_var:.6f}")
print(f"均值误差: {abs(online_mean - true_mean):.6e}")
print(f"方差误差: {abs(online_var - true_var):.6e}")
print()
# 4. Performance tip: compute a statistic once and reuse the result
print("4. 统计函数性能优化")
size = 1000000
data = np.random.randn(size)
import time
# Anti-pattern: recompute the same statistics on every iteration.
# perf_counter is used because time.time() can report an elapsed time of
# exactly 0 on coarse system clocks.
start = time.perf_counter()
for _ in range(10):
    mean1 = np.mean(data)
    std1 = np.std(data)
time_bad = time.perf_counter() - start
# Better: compute once, then reuse the stored values.
start = time.perf_counter()
mean2 = np.mean(data)
std2 = np.std(data)
for _ in range(9):
    temp_mean = mean2
    temp_std = std2
time_good = time.perf_counter() - start
# Guard the ratio so an unmeasurably fast run cannot raise ZeroDivisionError.
speedup = time_bad / time_good if time_good > 0 else float("inf")
print(f"数组大小: {size:,}")
print(f"重复计算10次耗时: {time_bad:.4f}秒")
print(f"计算1次+复用耗时: {time_good:.4f}秒")
print(f"加速比: {speedup:.1f}倍")
3. NumPy去重和排序函数
3.1 去重函数 np.unique()
print("=== NumPy去重函数 ===")
print()
# 1. 基本用法
print("1. np.unique() 基本用法")
arr1 = np.array([[6, 3, 5], [2, 1, 3]])
print(f"原始数组:\n{arr1}")
unique_values = np.unique(arr1)
print(f"去重结果: {unique_values}")
print(f"结果类型: {type(unique_values)}")
print(f"结果形状: {unique_values.shape}")
print()
# 2. 参数详解
print("2. np.unique() 参数详解")
# return_index: 返回唯一值在原始数组中的第一次出现的索引
arr = np.array([3, 1, 2, 1, 3, 4, 2])
unique_vals, indices = np.unique(arr, return_index=True)
print(f"原始数组: {arr}")
print(f"唯一值: {unique_vals}")
print(f"第一次出现索引: {indices}")
print(f"验证: arr[{indices}] = {arr[indices]}")
print()
# return_inverse: 返回原始数组重建唯一值数组的索引
unique_vals, inverse = np.unique(arr, return_inverse=True)
print(f"原始数组: {arr}")
print(f"唯一值: {unique_vals}")
print(f"重建索引: {inverse}")
print(f"重建数组: {unique_vals[inverse]}")
print(f"重建是否相等: {np.array_equal(arr, unique_vals[inverse])}")
print()
# return_counts: 返回每个唯一值出现的次数
unique_vals, counts = np.unique(arr, return_counts=True)
print(f"唯一值: {unique_vals}")
print(f"出现次数: {counts}")
for val, count in zip(unique_vals, counts):
print(f" 值 {val} 出现了 {count} 次")
print()
# 3. 轴参数 axis
print("3. 按轴去重 (axis参数)")
arr_2d = np.array([[1, 2, 3, 1],
[4, 5, 6, 4],
[7, 8, 9, 7]])
print(f"2D数组:\n{arr_2d}")
# 默认会扁平化
flat_unique = np.unique(arr_2d)
print(f"默认去重(扁平化): {flat_unique}")
# 按行去重(axis=0)
row_unique = np.unique(arr_2d, axis=0)
print(f"按行去重 (axis=0):\n{row_unique}")
# 按列去重(axis=1)
col_unique = np.unique(arr_2d, axis=1)
print(f"按列去重 (axis=1):\n{col_unique}")
print()
# 4. 实际应用:数据清洗
print("4. 实际应用:数据清洗")
def clean_duplicate_data(data, threshold=0.01):
    """Remove exact duplicates and near-duplicates from 1-D numeric data.

    Near-duplicate removal works on the sorted values: a value is kept only
    when it lies more than ``threshold`` above its predecessor. A run of
    values that are each within ``threshold`` of the previous one therefore
    collapses to the smallest value of the run.

    Args:
        data: 1-D array-like of numbers; may be empty.
        threshold: Gap to the previous sorted value that must be exceeded
            for a value to count as distinct.

    Returns:
        ``(unique_data, approx_unique)``: the sorted exact-unique values and
        the sorted values that survive near-duplicate removal.
    """
    data = np.asarray(data)
    print(f"原始数据 ({len(data)} 个点):\n{data}")
    # 1. Exact de-duplication
    unique_data = np.unique(data)
    print(f"\n精确去重后: {len(unique_data)} 个唯一值")
    # 2. Approximate de-duplication on the sorted values
    sorted_data = np.sort(data)
    # The first element is always kept. Building the mask from the array's
    # own shape keeps it the right length for empty input, where prepending
    # a literal [True] would produce a one-element mask for a zero-element
    # array and fail on indexing.
    keep = np.ones(sorted_data.shape, dtype=bool)
    keep[1:] = np.diff(sorted_data) > threshold
    approx_unique = sorted_data[keep]
    print(f"近似去重 (阈值={threshold}) 后: {len(approx_unique)} 个唯一值")
    return unique_data, approx_unique
# 测试
test_data = np.array([1.0, 1.001, 1.002, 2.0, 2.001, 3.0, 3.0, 3.001])
exact_unique, approx_unique = clean_duplicate_data(test_data, threshold=0.01)
print(f"\n精确唯一值: {exact_unique}")
print(f"近似唯一值: {approx_unique}")
3.2 排序函数
print("\n=== NumPy排序函数 ===")
print()
# 1. np.sort() vs ndarray.sort()
print("1. np.sort() 与 ndarray.sort() 的区别")
arr2 = np.array([33, 11, 22, 55, 66])
print(f"原始数组 arr2: {arr2}")
print(f"id: {id(arr2)}")
# np.sort() 返回排序后的新数组,不修改原数组
sorted_np = np.sort(arr2)
print(f"\nnp.sort(arr2) 结果: {sorted_np}")
print(f"排序后原数组: {arr2} (未改变)")
print(f"np.sort结果id: {id(sorted_np)} (新数组)")
# ndarray.sort() 原地排序,修改原数组
arr2_copy = arr2.copy() # 创建副本用于演示
arr2_copy.sort()
print(f"\narr2.sort() 后数组: {arr2_copy} (已修改)")
print(f"原地排序后id: {id(arr2_copy)} (相同数组)")
print()
# 2. 排序方向
print("2. 排序方向控制")
arr = np.array([3, 1, 4, 1, 5, 9, 2, 6])
print(f"原始数组: {arr}")
# 默认升序
ascending = np.sort(arr)
print(f"升序排序: {ascending}")
# 降序排序
descending = np.sort(arr)[::-1] # 方法1:反转
descending2 = -np.sort(-arr) # 方法2:取负排序
print(f"降序排序 (反转): {descending}")
print(f"降序排序 (取负): {descending2}")
print()
# 3. 排序算法
print("3. 排序算法选择")
large_arr = np.random.randn(10000)
# 快速排序(默认)
start = time.time()
np.sort(large_arr, kind='quicksort')
time_quick = time.time() - start
# 归并排序
start = time.time()
np.sort(large_arr, kind='mergesort')
time_merge = time.time() - start
# 堆排序
start = time.time()
np.sort(large_arr, kind='heapsort')
time_heap = time.time() - start
print(f"数组大小: {len(large_arr):,}")
print(f"快速排序耗时: {time_quick:.6f}秒")
print(f"归并排序耗时: {time_merge:.6f}秒")
print(f"堆排序耗时: {time_heap:.6f}秒")
print("注意:归并排序稳定,但通常比快速排序慢")
print()
# 4. argsort() 排序索引
print("4. np.argsort() - 返回排序索引")
arr = np.array([33, 11, 22, 55, 66])
print(f"原始数组: {arr}")
indices = np.argsort(arr)
print(f"排序索引: {indices}")
print(f"通过索引获取排序结果: {arr[indices]}")
print()
# 5. 多维数组排序
print("5. 多维数组排序")
arr_2d = np.array([[3, 2, 1],
[6, 5, 4],
[9, 8, 7]])
print(f"2D数组:\n{arr_2d}")
# 按行排序
sorted_rows = np.sort(arr_2d, axis=1)
print(f"按行排序 (axis=1):\n{sorted_rows}")
# 按列排序
sorted_cols = np.sort(arr_2d, axis=0)
print(f"按列排序 (axis=0):\n{sorted_cols}")
# 按指定列排序
data = np.array([[3, 2, 9],
[1, 5, 7],
[4, 8, 6]])
print(f"\n待排序数据:\n{data}")
# 按第一列排序
sort_by_col0 = data[data[:, 0].argsort()]
print(f"按第0列排序:\n{sort_by_col0}")
# 按第二列排序
sort_by_col1 = data[data[:, 1].argsort()]
print(f"按第1列排序:\n{sort_by_col1}")
print()
# 6. 实际应用:Top-N 查询
print("6. 实际应用:Top-N 查询")
def get_top_n(data, n=3, axis=-1, largest=True):
    """Return the ``n`` largest or smallest values along ``axis``.

    Args:
        data: Array-like of any dimensionality.
        n: Number of values to keep. Values below 0 are treated as 0 and
            values above the axis length are clamped to the axis length.
        axis: Axis along which to select; the other axes are preserved.
        largest: Keep the largest values when True, the smallest otherwise.

    Returns:
        An array whose ``axis`` has length ``n`` (after clamping). The
        selected values are sorted in ascending order along ``axis`` in
        both modes.
    """
    values = np.atleast_1d(np.asarray(data))
    size = values.shape[axis]
    count = min(max(int(n), 0), size)

    window = [slice(None)] * values.ndim
    if count == 0:
        # Slicing with -0 would select everything, so build the empty
        # window explicitly.
        window[axis] = slice(0, 0)
        selected = values[tuple(window)]
    elif count < size:
        # np.partition places the kth element in its sorted position with
        # everything smaller before it and everything larger after it, so
        # the wanted values end up at one end of the axis without a full
        # sort of the input.
        if largest:
            kth = size - count
            window[axis] = slice(size - count, None)
        else:
            kth = count - 1
            window[axis] = slice(0, count)
        selected = np.partition(values, kth, axis=axis)[tuple(window)]
    else:
        selected = values
    # Only the selected values are fully sorted.
    return np.sort(selected, axis=axis)
# 测试
scores = np.array([85, 92, 78, 60, 95, 88, 76, 90])
print(f"所有分数: {scores}")
print(f"前3名: {get_top_n(scores, n=3)}")
print(f"后3名: {get_top_n(scores, n=3, largest=False)}")
4. NumPy数组运算
4.1 形状相同的数组运算
print("=== NumPy数组运算:形状相同的数组 ===")
print()
# 1. 基本运算
print("1. 形状相同数组的基本运算")
arr1 = np.array([10, 20, 30, 40])
arr2 = np.arange(1, 5) # [1, 2, 3, 4]
print(f"arr1: {arr1}")
print(f"arr2: {arr2}")
print(f"形状: arr1.shape={arr1.shape}, arr2.shape={arr2.shape}")
print()
# 算术运算
print("算术运算:")
print(f"arr1 + arr2 = {arr1 + arr2}")
print(f"arr1 - arr2 = {arr1 - arr2}")
print(f"arr1 * arr2 = {arr1 * arr2}")
print(f"arr1 / arr2 = {arr1 / arr2}")
print(f"arr1 // arr2 = {arr1 // arr2}") # 整除
print(f"arr1 ** arr2 = {arr1 ** arr2}") # 幂运算
print()
# 比较运算
print("比较运算:")
print(f"arr1 > arr2 = {arr1 > arr2}")
print(f"arr1 == arr2 = {arr1 == arr2}")
print(f"arr1 != arr2 = {arr1 != arr2}")
print()
# 逻辑运算
print("逻辑运算:")
bool_arr1 = np.array([True, False, True, False])
bool_arr2 = np.array([True, True, False, False])
print(f"bool_arr1: {bool_arr1}")
print(f"bool_arr2: {bool_arr2}")
print(f"逻辑与: {np.logical_and(bool_arr1, bool_arr2)}")
print(f"逻辑或: {np.logical_or(bool_arr1, bool_arr2)}")
print(f"逻辑非: {np.logical_not(bool_arr1)}")
print(f"逻辑异或: {np.logical_xor(bool_arr1, bool_arr2)}")
print()
# 2. 广播的概念
print("2. 广播的概念")
# 即使形状不完全相同,但兼容的数组也能进行运算
arr1_2d = np.array([[1, 2, 3], [4, 5, 6]])
arr2_1d = np.array([10, 20, 30])
print(f"2D数组:\n{arr1_2d}")
print(f"形状: {arr1_2d.shape}")
print(f"1D数组: {arr2_1d}")
print(f"形状: {arr2_1d.shape}")
print(f"广播加法:\n{arr1_2d + arr2_1d}")
print("解释: 1D数组被广播到2D数组的每一行")
print()
# 3. 运算函数形式
print("3. 函数形式的运算")
print(f"np.add(arr1, arr2): {np.add(arr1, arr2)}")
print(f"np.subtract(arr1, arr2): {np.subtract(arr1, arr2)}")
print(f"np.multiply(arr1, arr2): {np.multiply(arr1, arr2)}")
print(f"np.divide(arr1, arr2): {np.divide(arr1, arr2)}")
print(f"np.power(arr1, arr2): {np.power(arr1, arr2)}")
print()
# 4. 就地运算
print("4. 就地运算(修改原数组)")
x = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([0.1, 0.2, 0.3, 0.4])
print(f"原始 x: {x}")
print(f"原始 y: {y}")
# 使用out参数进行就地运算
np.add(x, y, out=x)
print(f"x + y 后 x: {x}")
np.multiply(x, 2, out=x)
print(f"x * 2 后 x: {x}")
print()
# 5. 性能对比
print("5. 性能对比:循环 vs 向量化")
size = 1000000
a = np.random.randn(size)
b = np.random.randn(size)
# Python循环
def loop_operation(a, b):
    """Compute ``a + b * 2 - 1`` element by element with a Python loop.

    This is the deliberately slow baseline for the vectorisation benchmark.

    Args:
        a: 1-D NumPy array.
        b: 1-D NumPy array with at least as many elements as ``a``.

    Returns:
        A new array holding the element-wise result, with the same dtype the
        vectorised expression would produce.
    """
    # Derive the output dtype from the expression itself; np.zeros_like(a)
    # would truncate the result when `a` is an integer array and `b` is
    # floating point.
    result = np.zeros(a.shape, dtype=(a[:1] + b[:1] * 2 - 1).dtype)
    for i in range(len(a)):
        result[i] = a[i] + b[i] * 2 - 1
    return result
# NumPy向量化
def vectorized_operation(a, b):
    """Compute ``a + b * 2 - 1`` for whole arrays without a Python loop."""
    doubled = b * 2
    total = a + doubled
    return total - 1
# Time both implementations on the same input.
# perf_counter is a monotonic high-resolution clock; time.time() can report
# an elapsed time of exactly 0 for the vectorised call on coarse system
# clocks, which would make the speed-up ratio divide by zero.
start = time.perf_counter()
result_loop = loop_operation(a, b)
time_loop = time.perf_counter() - start
start = time.perf_counter()
result_vec = vectorized_operation(a, b)
time_vec = time.perf_counter() - start
# Guard the ratio so an unmeasurably fast run cannot raise ZeroDivisionError.
speedup = time_loop / time_vec if time_vec > 0 else float("inf")
print(f"数组大小: {size:,}")
print(f"循环耗时: {time_loop:.4f}秒")
print(f"向量化耗时: {time_vec:.4f}秒")
print(f"加速比: {speedup:.1f}倍")
print(f"结果一致: {np.allclose(result_loop, result_vec)}")
4.2 数组与标量的运算
print("\n=== NumPy数组运算:数组与标量 ===")
print()
# 1. 基本运算
print("1. 数组与标量的基本运算")
arr1 = np.arange(1, 5) # [1, 2, 3, 4]
scalar = 10
print(f"数组: {arr1}")
print(f"标量: {scalar}")
print()
print("算术运算:")
print(f"arr1 + 10 = {arr1 + scalar}")
print(f"arr1 - 10 = {arr1 - scalar}")
print(f"arr1 * 10 = {arr1 * scalar}")
print(f"arr1 / 10 = {arr1 / scalar}")
print(f"10 / arr1 = {scalar / arr1}")
print(f"arr1 ** 2 = {arr1 ** 2}")
print(f"2 ** arr1 = {2 ** arr1}")
print()
# 2. 比较运算
print("2. 数组与标量的比较运算")
print(f"arr1 > 2 = {arr1 > 2}")
print(f"arr1 == 3 = {arr1 == 3}")
print(f"arr1 != 2 = {arr1 != 2}")
print()
# 3. 实际应用:数据标准化
print("3. 实际应用:数据标准化")
def normalize_data(data, method='zscore'):
    """Rescale ``data`` with z-score or min-max normalisation.

    Args:
        data: Array-like of numbers.
        method: ``'zscore'`` for ``(x - mean) / std`` or ``'minmax'`` for
            ``(x - min) / (max - min)``.

    Returns:
        The rescaled array. Constant input (zero spread) yields all zeros,
        with the same floating-point dtype the regular path produces.
        Empty input yields an empty float array.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if method not in ('zscore', 'minmax'):
        raise ValueError(f"未知的标准化方法: {method}")
    values = np.asarray(data)
    if values.size == 0:
        # The reductions below are undefined for empty input (np.min raises).
        return values.astype(float)
    if method == 'zscore':
        # Z-score: (x - mean) / std
        offset = np.mean(values)
        scale = np.std(values)
    else:
        # Min-max: (x - min) / (max - min)
        offset = np.min(values)
        scale = np.max(values) - offset
    if scale == 0:
        # Constant data: dividing the all-zero numerator by 1 avoids the
        # division by zero and keeps the result dtype identical to the
        # regular path, where np.zeros_like(data) would hand back integer
        # zeros for integer input.
        scale = 1
    return (values - offset) / scale
# 测试
data = np.array([10, 20, 30, 40, 50])
print(f"原始数据: {data}")
zscore_normalized = normalize_data(data, 'zscore')
print(f"Z-score标准化: {zscore_normalized}")
print(f"均值: {np.mean(zscore_normalized):.6f}")
print(f"标准差: {np.std(zscore_normalized):.6f}")
minmax_normalized = normalize_data(data, 'minmax')
print(f"最小-最大标准化: {minmax_normalized}")
print(f"范围: [{np.min(minmax_normalized):.2f}, {np.max(minmax_normalized):.2f}]")
print()
# 4. 广播的高级应用
print("4. 广播的高级应用")
# 创建网格
x = np.linspace(-2, 2, 5)
y = np.linspace(-2, 2, 5)
X, Y = np.meshgrid(x, y)
print(f"X坐标网格:\n{X}")
print(f"Y坐标网格:\n{Y}")
# 计算距离
Z = np.sqrt(X**2 + Y**2)
print(f"距离原点的距离:\n{Z}")
# 创建三维数组
arr_3d = np.random.rand(2, 3, 4)
scalar_3d = 10
print(f"\n3D数组形状: {arr_3d.shape}")
print(f"3D数组 + 标量 形状: {(arr_3d + scalar_3d).shape}")
4.3 矩阵乘法(重点)
print("\n=== NumPy矩阵乘法(重点) ===")
print()
# 1. 矩阵乘法基础
print("1. 矩阵乘法基础")
arr1 = np.array([[1, 2, 3],
[4, 5, 6]]) # 2×3矩阵
arr2 = np.array([[6, 23],
[-1, 7],
[8, 9]]) # 3×2矩阵
print(f"矩阵A (2×3):\n{arr1}")
print(f"矩阵B (3×2):\n{arr2}")
# 计算乘积:C = A × B,形状为2×2
result = arr1 @ arr2
print(f"矩阵乘积 A @ B (2×2):\n{result}")
# 验证计算
# 第一个元素:1 * 6 + 2*(-1) + 3 * 8 = 6 - 2 + 24 = 28
# 第二个元素:1 * 23 + 2 * 7 + 3 * 9 = 23 + 14 + 27 = 64
# 以此类推...
print()
# 2. 不同乘法方法的比较
print("2. 不同矩阵乘法方法")
# 方法1:@运算符(推荐)
result1 = arr1 @ arr2
print(f"使用 @ 运算符:\n{result1}")
# 方法2:np.dot()
result2 = np.dot(arr1, arr2)
print(f"使用 np.dot():\n{result2}")
# 方法3:数组的dot方法
result3 = arr1.dot(arr2)
print(f"使用 arr1.dot():\n{result3}")
# 方法4:np.matmul()
result4 = np.matmul(arr1, arr2)
print(f"使用 np.matmul():\n{result4}")
print(f"所有方法结果是否相同: {np.allclose(result1, result2) and np.allclose(result2, result3) and np.allclose(result3, result4)}")
print()
# 3. 矩阵乘法 vs 逐元素乘法
print("3. 矩阵乘法 vs 逐元素乘法")
A = np.array([[1, 2],
[3, 4]])
B = np.array([[5, 6],
[7, 8]])
print(f"矩阵A:\n{A}")
print(f"矩阵B:\n{B}")
# 逐元素乘法(Hadamard积)
elementwise = A * B
print(f"逐元素乘法 (A * B):\n{elementwise}")
# 矩阵乘法
matmul = A @ B
print(f"矩阵乘法 (A @ B):\n{matmul}")
print("注意:两种乘法完全不同!")
print()
# 4. 矩阵乘法的维度要求
print("4. 矩阵乘法的维度要求")
def check_matmul_shape(A, B):
    """Check whether ``A @ B`` is defined and report the resulting shape.

    Follows the ``@`` / ``np.matmul`` rules: a 1-D left operand is treated
    as a row vector and a 1-D right operand as a column vector, the added
    axis is dropped again from the result, and any leading (batch) axes are
    broadcast against each other.

    Args:
        A: Left operand (array-like).
        B: Right operand (array-like).

    Returns:
        ``(ok, message)``: ``ok`` tells whether the product is defined and
        ``message`` carries either the result shape or the reason for the
        mismatch.
    """
    shape_A = tuple(np.shape(A))
    shape_B = tuple(np.shape(B))
    if not shape_A or not shape_B:
        # matmul rejects scalars; indexing an empty shape would also fail.
        return False, "维度不匹配: 矩阵乘法不支持标量(0维数组)"
    a_is_vector = len(shape_A) == 1
    b_is_vector = len(shape_B) == 1
    dims_A = (1,) + shape_A if a_is_vector else shape_A
    dims_B = shape_B + (1,) if b_is_vector else shape_B
    if dims_A[-1] != dims_B[-2]:
        return False, f"维度不匹配: A的列数({dims_A[-1]}) != B的行数({dims_B[-2]})"
    try:
        batch = tuple(np.broadcast_shapes(dims_A[:-2], dims_B[:-2]))
    except ValueError:
        return False, f"维度不匹配: 批量维度 {dims_A[:-2]} 与 {dims_B[:-2]} 无法广播"
    # The axis that was added for a 1-D operand does not appear in the result.
    rows = () if a_is_vector else (dims_A[-2],)
    cols = () if b_is_vector else (dims_B[-1],)
    return True, f"可以相乘,结果形状: {batch + rows + cols}"
# 测试
test_cases = [
(np.array([1, 2, 3]), np.array([[4], [5], [6]])), # 向量点积
(np.array([[1, 2, 3]]), np.array([[4], [5], [6]])), # 1×3 @ 3×1
(np.array([[1, 2], [3, 4]]), np.array([[5, 6], [7, 8]])), # 2×2 @ 2×2
(np.array([[1, 2, 3], [4, 5, 6]]), np.array([[7, 8], [9, 10], [11, 12]])), # 2×3 @ 3×2
]
for i, (A, B) in enumerate(test_cases):
valid, message = check_matmul_shape(A, B)
print(f"测试用例 {i+1}: A.shape={A.shape}, B.shape={B.shape}")
print(f" {message}")
if valid:
result = A @ B
print(f" 结果形状: {result.shape}")
print()
4.4 线性代数运算
print("\n=== 线性代数运算 ===")
print()
# 1. 矩阵的转置
print("1. 矩阵的转置")
A = np.array([[1, 2, 3],
[4, 5, 6]])
print(f"原始矩阵A:\n{A}")
print(f"形状: {A.shape}")
# 转置
A_T = A.T
print(f"转置A.T:\n{A_T}")
print(f"形状: {A_T.shape}")
print()
# 2. 矩阵的迹
print("2. 矩阵的迹(对角线元素之和)")
A_square = np.array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
trace = np.trace(A_square)
print(f"矩阵:\n{A_square}")
print(f"迹: {trace} (1+5+9)")
print()
# 3. 矩阵的行列式
print("3. 矩阵的行列式")
A_det = np.array([[1, 2],
[3, 4]])
det = np.linalg.det(A_det)
print(f"矩阵:\n{A_det}")
print(f"行列式: {det:.2f}")
print()
# 4. 矩阵的逆
print("4. 矩阵的逆")
A_inv = np.array([[1, 2],
[3, 4]])
try:
inv = np.linalg.inv(A_inv)
print(f"矩阵:\n{A_inv}")
print(f"逆矩阵:\n{inv}")
# 验证:A × A⁻¹ = I
identity = A_inv @ inv
print(f"验证 A × A⁻¹:\n{identity}")
except np.linalg.LinAlgError as e:
print(f"不可逆: {e}")
print()
# 5. 特征值和特征向量
print("5. 特征值和特征向量")
A_eig = np.array([[4, -2],
[1, 1]])
eigenvalues, eigenvectors = np.linalg.eig(A_eig)
print(f"矩阵:\n{A_eig}")
print(f"特征值: {eigenvalues}")
print(f"特征向量:\n{eigenvectors}")
# 验证:A × v = λ × v
for i in range(len(eigenvalues)):
λ = eigenvalues[i]
v = eigenvectors[:, i]
result = A_eig @ v
expected = λ * v
print(f"验证特征值{λ}: A×v={result}, λ×v={expected}, 是否接近: {np.allclose(result, expected)}")
print()
# 6. 线性方程组求解
print("6. 线性方程组求解")
# 解方程组:2x + y = 5, x - 3y = -2
A = np.array([[2, 1],
[1, -3]])
b = np.array([5, -2])
try:
x = np.linalg.solve(A, b)
print(f"系数矩阵A:\n{A}")
print(f"常数向量b: {b}")
print(f"解x: {x}")
# 验证:Ax = b
verification = A @ x
print(f"验证 Ax: {verification}, 与b相等: {np.allclose(verification, b)}")
except np.linalg.LinAlgError as e:
print(f"无法求解: {e}")
print()
# 7. 实际应用:最小二乘法
print("7. 实际应用:最小二乘法拟合")
def linear_regression(X, y):
    """Fit ``y = intercept + X @ coefficients`` by ordinary least squares.

    Args:
        X: Feature values, either 1-D (one feature) or 2-D with one row per
            sample.
        y: Target values, one per sample.

    Returns:
        The coefficient vector ``beta`` with the intercept first, followed
        by one coefficient per feature.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    # Prepend a column of ones so the first coefficient is the intercept.
    design = np.column_stack([np.ones(len(X)), X])
    # Solve the least-squares problem directly instead of forming and
    # inverting XᵀX: the explicit inverse loses precision on ill-conditioned
    # designs and may not raise for a matrix that is singular only up to
    # rounding. lstsq also returns the minimum-norm solution when the
    # design is rank deficient.
    beta, *_ = np.linalg.lstsq(design, y, rcond=None)
    return beta
# 生成测试数据
np.random.seed(42)
n_samples = 100
X = np.random.randn(n_samples)
true_slope = 2.5
true_intercept = 1.0
y = true_intercept + true_slope * X + np.random.randn(n_samples) * 0.5
# 拟合
beta = linear_regression(X, y)
print(f"真实参数: 截距={true_intercept}, 斜率={true_slope}")
print(f"估计参数: 截距={beta[0]:.4f}, 斜率={beta[1]:.4f}")
print(f"误差: 截距={abs(beta[0]-true_intercept):.4f}, 斜率={abs(beta[1]-true_slope):.4f}")
# 预测
X_test = np.array([-2, -1, 0, 1, 2])
y_pred = beta[0] + beta[1] * X_test
print(f"测试点X: {X_test}")
print(f"预测值y: {y_pred}")
5. 综合实战应用
print("\n=== 综合实战应用 ===")
print()
# 实战1:图像处理滤波器
print("实战1:图像处理滤波器")
def apply_filter(image, kernel):
    """Slide ``kernel`` over a 2-D ``image`` and return the filtered image.

    The image is zero-padded by half the kernel size on every side so the
    output has the same height and width as the input. The kernel is applied
    as written (it is not flipped), and each output pixel is the sum of the
    element-wise product of the kernel and the window under it.

    Args:
        image: 2-D array.
        kernel: 2-D array of filter weights.

    Returns:
        A float32 array with the same shape as ``image``.
    """
    rows, cols = image.shape
    k_rows, k_cols = kernel.shape
    margins = ((k_rows // 2, k_rows // 2), (k_cols // 2, k_cols // 2))
    # Zero padding around the border
    padded = np.pad(image, margins, mode='constant')
    output = np.zeros_like(image, dtype=np.float32)
    # Visit every output pixel in row-major order.
    for r, c in np.ndindex(rows, cols):
        window = padded[r:r + k_rows, c:c + k_cols]
        output[r, c] = np.sum(window * kernel)
    return output
# 创建测试图像
image = np.array([[1, 2, 3, 4, 5],
[6, 7, 8, 9, 10],
[11, 12, 13, 14, 15],
[16, 17, 18, 19, 20],
[21, 22, 23, 24, 25]], dtype=np.float32)
# 均值滤波器
mean_kernel = np.ones((3, 3)) / 9
filtered = apply_filter(image, mean_kernel)
print(f"原始图像:\n{image}")
print(f"3×3均值滤波器:\n{mean_kernel}")
print(f"