Introduction: Why Iterators and Generators?
Imagine you need to process a 100 GB text file, or read a million records from a database. If you load all of that data into memory at once, the program may run out of memory and crash. Iterators and generators exist precisely for this kind of problem: they produce data "lazily", so you only hold what you are actually using at any moment.
1. Iterator Basics
1.1 What Is an Iterator?
An iterator is any object that implements the iterator protocol: an `__iter__()` method that returns the iterator itself, and a `__next__()` method that returns the next value and raises `StopIteration` when nothing is left. A `for` loop is just syntactic sugar over this protocol, as the example below shows.
```python
# Basic iterator example
print("=== Iterator basics ===")

# Most built-in containers in Python are iterable
my_list = [1, 2, 3, 4, 5]

# The familiar for loop
print("Regular for loop:")
for item in my_list:
    print(f"  {item}")

print("\nWhat actually happens under the hood:")
# What the for loop does behind the scenes
iter_obj = iter(my_list)  # get an iterator
print(f"Iterator object: {iter_obj}")
while True:
    try:
        item = next(iter_obj)  # fetch the next element
        print(f"  got: {item}")
    except StopIteration:
        print("  iteration finished")
        break

print("\n" + "="*50)
```
1.2 A Custom Iterator Class
```python
class Countdown:
    """Countdown iterator."""

    def __init__(self, start):
        self.current = start

    def __iter__(self):
        """Return the iterator itself."""
        return self

    def __next__(self):
        """Return the next value."""
        if self.current <= 0:
            raise StopIteration
        value = self.current
        self.current -= 1
        return value


print("=== Custom iterator ===")

# Use the custom iterator in a for loop
countdown = Countdown(5)
print("Countdown started:")
for number in countdown:
    print(f"  {number}...")
print("Liftoff!")

# Drive the iterator by hand
print("\nUsing the iterator manually:")
countdown2 = Countdown(3)
iterator = iter(countdown2)
print(next(iterator))  # 3
print(next(iterator))  # 2
print(next(iterator))  # 1
try:
    print(next(iterator))  # raises StopIteration
except StopIteration:
    print("Iteration finished")

print("\n" + "="*50)
```
1.3 A Practical Iterator: A File Reader
```python
class LargeFileReader:
    """Large-file reader: read line by line to avoid exhausting memory."""

    def __init__(self, filename, chunk_size=1024):
        self.filename = filename
        self.chunk_size = chunk_size
        self.file = None

    def __iter__(self):
        self.file = open(self.filename, 'r', encoding='utf-8')
        return self

    def __next__(self):
        line = self.file.readline()
        if not line:
            self.file.close()
            raise StopIteration
        return line.strip()

    def read_in_chunks(self):
        """Read the file in fixed-size chunks."""
        with open(self.filename, 'r', encoding='utf-8') as f:
            while True:
                chunk = f.read(self.chunk_size)
                if not chunk:
                    break
                yield chunk


print("=== File-reading iterator ===")

# Simulate a large file
import tempfile

# Create a temporary file with 100 lines of data
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt', encoding='utf-8') as tmp:
    for i in range(100):
        tmp.write(f"This is line {i + 1}, with some test content to demonstrate how the iterator works\n")
    tmp_filename = tmp.name

print(f"Created test file: {tmp_filename}")

# Read the file line by line through the iterator
print("\nReading line by line:")
file_reader = LargeFileReader(tmp_filename)
lines_read = 0
for line in file_reader:
    lines_read += 1
    if lines_read <= 3:  # show only the first 3 lines
        print(f"  line {lines_read}: {line[:30]}...")
    elif lines_read == 100:
        print(f"  line {lines_read}: {line[:30]}...")
print(f"Read {lines_read} lines in total")

# Read the file in chunks
print("\nReading in chunks:")
chunks_read = 0
for chunk in file_reader.read_in_chunks():
    chunks_read += 1
    if chunks_read <= 2:
        print(f"  chunk {chunks_read}: {len(chunk)} characters")
print(f"Read {chunks_read} chunks in total")

# Clean up the temporary file
import os
os.unlink(tmp_filename)

print("\n" + "="*50)
```
2. Generators in Depth
2.1 Generator Functions vs. Generator Expressions
```python
print("=== Generator basics ===")

# 1. Generator function (uses yield)
def fibonacci_generator(n):
    """Fibonacci sequence generator."""
    a, b = 0, 1
    count = 0
    while count < n:
        yield a
        a, b = b, a + b
        count += 1


print("Fibonacci generator:")
fib_gen = fibonacci_generator(10)
for num in fib_gen:
    print(f" {num}", end=" ")
print()

# 2. Generator expression (like a list comprehension, but with parentheses)
print("\nGenerator expression:")

# List comprehension: builds every element immediately
squares_list = [x**2 for x in range(10)]
print(f"List comprehension (all elements built): {squares_list}")

# Generator expression: produces elements lazily
squares_gen = (x**2 for x in range(10))
print(f"Generator expression (nothing computed yet): {squares_gen}")

print("Pulling values from the generator one by one:")
for i, square in enumerate(squares_gen):
    print(f"  {i}^2 = {square}")
    if i >= 4:  # show only the first 5
        break
print("... the remaining values are produced only when requested")

print("\n" + "="*50)
```
2.2 Practical Use Cases for Generators
```python
print("=== Practical generator use cases ===")

# Use case 1: processing a large dataset
def read_large_dataset(file_path):
    """Read a large dataset line by line."""
    with open(file_path, 'r') as file:
        for line in file:
            # Process each line as it is read
            processed_line = line.strip().lower()
            yield processed_line


# Use case 2: infinite sequences
def infinite_counter(start=0, step=1):
    """Infinite counter."""
    current = start
    while True:
        yield current
        current += step


# Use case 3: data-processing pipelines
def data_pipeline(data):
    """Pipeline stage 1: filter."""
    for item in data:
        if item % 2 == 0:  # keep even numbers only
            yield item


def multiply_by_3(data):
    """Pipeline stage 2: multiply by 3."""
    for item in data:
        yield item * 3


def add_10(data):
    """Pipeline stage 3: add 10."""
    for item in data:
        yield item + 10


print("Pipeline demo:")
original_data = range(20)

# Assemble the pipeline: each stage lazily consumes the previous one
pipeline = add_10(multiply_by_3(data_pipeline(original_data)))
print("Original data:", list(original_data))
print("Processed data:", list(pipeline))


# Use case 4: coroutines (two-way communication)
def coroutine_example():
    """Coroutine example: a generator can also receive data."""
    print("Coroutine started")
    total = 0
    count = 0
    while True:
        value = yield
        if value is None:
            break
        total += value
        count += 1
        print(f"Received: {value}, running average: {total / count:.2f}")
    return total, count


print("\nCoroutine demo:")
coro = coroutine_example()
next(coro)  # prime the coroutine
coro.send(10)
coro.send(20)
coro.send(30)
try:
    coro.send(None)  # signal the coroutine to finish
except StopIteration as e:
    total, count = e.value
    print(f"Coroutine finished, sum: {total}, count: {count}")

print("\n" + "="*50)
```
2.3 The Generator send() and throw() Methods
```python
print("=== Advanced generator methods ===")

def advanced_generator():
    """Generator demonstrating send(), throw() and close()."""
    print("Generator started")
    try:
        received = yield "step 1 done"
        print(f"Received: {received}")
        received = yield "step 2 done"
        print(f"Received: {received}")
        yield "step 3 done"
    except ValueError as e:
        print(f"Caught exception: {e}")
        yield "recovered from the exception"
    finally:
        print("Generator cleanup")


print("1. Normal use of send():")
gen = advanced_generator()

# Prime the generator
result = next(gen)
print(f"Generator yielded: {result}")

# Send data into the generator
result = gen.send("first message")
print(f"Generator yielded: {result}")
result = gen.send("second message")
print(f"Generator yielded: {result}")

print("\n2. Using throw() to raise an exception inside the generator:")
gen2 = advanced_generator()
next(gen2)  # prime it

# Throw an exception into the generator at its current yield point
result = gen2.throw(ValueError("test exception"))
print(f"Generator yielded: {result}")
try:
    next(gen2)
except StopIteration:
    print("Generator finished normally")

print("\n3. Using close() to shut the generator down:")
gen3 = advanced_generator()
next(gen3)  # prime it
gen3.close()  # raises GeneratorExit at the generator's current yield
print("Generator closed")

print("\n" + "="*50)
```
3. yield from: Generator Magic
3.1 yield from Basics
```python
print("=== The yield from syntax ===")

def sub_generator():
    """Sub-generator."""
    yield "sub-generator: A"
    yield "sub-generator: B"
    return "sub-generator finished"


def main_generator_old():
    """Old style: delegate by hand with a for loop."""
    for item in sub_generator():
        yield item
    yield "main generator continues"


def main_generator_new():
    """New style: delegate with yield from (it also captures the return value)."""
    result = yield from sub_generator()
    yield f"main generator received: {result}"
    yield "main generator finished"


print("Manual delegation:")
for item in main_generator_old():
    print(f"  {item}")

print("\nWith yield from:")
for item in main_generator_new():
    print(f"  {item}")

print("\n" + "="*50)
```
3.2 Practical Examples of yield from
```python
print("=== Practical yield from examples ===")

# Example 1: flattening a nested structure
def flatten(nested):
    """Flatten arbitrarily nested lists."""
    for sublist in nested:
        if isinstance(sublist, list):
            yield from flatten(sublist)  # recursive delegation
        else:
            yield sublist


# Example 2: chaining several generators
def chain_generators(*generators):
    """Chain multiple generators end to end."""
    for gen in generators:
        yield from gen


# Example 3: traversing a tree structure
class TreeNode:
    """Tree node."""

    def __init__(self, value, children=None):
        self.value = value
        self.children = children or []

    def __iter__(self):
        """Traverse the subtree using yield from."""
        yield self.value
        for child in self.children:
            yield from child


print("1. Flattening a nested list:")
nested_list = [1, [2, [3, 4], 5], 6, [7, 8]]
print(f"Original list: {nested_list}")
print(f"Flattened: {list(flatten(nested_list))}")

print("\n2. Chaining multiple generators:")
gen1 = (x for x in range(3))
gen2 = (x for x in "abc")
gen3 = (x for x in [True, False])
chained = chain_generators(gen1, gen2, gen3)
print(f"Chained result: {list(chained)}")

print("\n3. Tree traversal:")
# Build a small tree
tree = TreeNode("A", [
    TreeNode("B", [
        TreeNode("D"),
        TreeNode("E")
    ]),
    TreeNode("C", [
        TreeNode("F")
    ])
])
print("Tree nodes in traversal order:")
for node in tree:
    print(f"  node: {node}")

print("\n" + "="*50)
```
4. Performance Comparison: Iterators vs. Lists
4.1 Memory Usage Comparison
```python
print("=== Performance: iterators vs. lists ===")
import sys
import time

def test_memory_usage():
    """Compare memory footprints."""
    print("1. Memory usage:")

    # List comprehension: the whole list lives in memory.
    # Note: sys.getsizeof measures only the list object itself
    # (its array of pointers), not the int objects it references.
    print("List comprehension:")
    start_mem = sys.getsizeof([])
    big_list = [x for x in range(1000000)]
    end_mem = sys.getsizeof(big_list)
    print(f"  memory used: {(end_mem - start_mem) / 1024 / 1024:.2f} MB")

    # Generator expression: essentially constant memory
    print("Generator expression:")
    start_mem = sys.getsizeof([])
    big_gen = (x for x in range(1000000))
    end_mem = sys.getsizeof(big_gen)
    print(f"  memory used: {(end_mem - start_mem) / 1024:.2f} KB")

    del big_list, big_gen


def test_execution_time():
    """Compare execution times."""
    print("\n2. Execution time:")

    data_size = 1000000

    # Iterating over a pre-built list
    data_list = list(range(data_size))
    start_time = time.time()
    total = 0
    for i in data_list:
        total += i
    list_time = time.time() - start_time
    print(f"List loop time: {list_time:.4f} s")

    # Iterating over a generator
    start_time = time.time()
    total = 0
    gen = (x for x in range(data_size))
    for i in gen:
        total += i
    gen_time = time.time() - start_time
    print(f"Generator loop time: {gen_time:.4f} s")
    print(f"Difference: {abs(list_time - gen_time):.4f} s")


def test_practical_scenario():
    """A realistic scenario."""
    print("\n3. Realistic scenario: find the first element matching a condition")

    # Simulate a large dataset
    data = range(10000000)
    target = 5000000

    # List approach: builds every matching element first
    print("List approach (builds all matching elements first):")
    start_time = time.time()
    try:
        result = [x for x in data if x > target][0]
    except IndexError:
        result = None
    list_time = time.time() - start_time
    print(f"  result: {result}, time: {list_time:.4f} s")

    # Generator approach: stops as soon as a match is found
    print("Generator approach (stops at the first match):")
    start_time = time.time()
    gen = (x for x in data if x > target)
    result = next(gen, None)
    gen_time = time.time() - start_time
    print(f"  result: {result}, time: {gen_time:.4f} s")
    print(f"  time saved: {(list_time - gen_time):.4f} s")


# Run the benchmarks
test_memory_usage()
test_execution_time()
test_practical_scenario()

print("\n" + "="*50)
```
5. Hands-On Project: Building a Data-Stream Processing Framework
5.1 The Complete Framework
```python
print("=== Hands-on project: a data-stream processing framework ===")

class DataStream:
    """Base class for data streams."""

    def __iter__(self):
        raise NotImplementedError("subclasses must implement __iter__")

    def map(self, func):
        """Apply a transformation to every element."""
        return MapStream(self, func)

    def filter(self, predicate):
        """Keep only the elements matching the predicate."""
        return FilterStream(self, predicate)

    def take(self, n):
        """Take the first n elements."""
        return TakeStream(self, n)

    def skip(self, n):
        """Skip the first n elements."""
        return SkipStream(self, n)

    def collect(self):
        """Collect all elements into a list."""
        return list(self)


class MapStream(DataStream):
    """Mapping stream."""

    def __init__(self, source, func):
        self.source = source
        self.func = func

    def __iter__(self):
        for item in self.source:
            yield self.func(item)


class FilterStream(DataStream):
    """Filtering stream."""

    def __init__(self, source, predicate):
        self.source = source
        self.predicate = predicate

    def __iter__(self):
        for item in self.source:
            if self.predicate(item):
                yield item


class TakeStream(DataStream):
    """Stream that yields only the first n elements."""

    def __init__(self, source, n):
        self.source = source
        self.n = n

    def __iter__(self):
        count = 0
        for item in self.source:
            if count >= self.n:
                break
            yield item
            count += 1


class SkipStream(DataStream):
    """Stream that skips the first n elements."""

    def __init__(self, source, n):
        self.source = source
        self.n = n

    def __iter__(self):
        count = 0
        for item in self.source:
            if count >= self.n:
                yield item
            count += 1


class RangeStream(DataStream):
    """Stream over a range of numbers."""

    def __init__(self, start, end=None, step=1):
        if end is None:
            self.start = 0
            self.end = start
        else:
            self.start = start
            self.end = end
        self.step = step

    def __iter__(self):
        current = self.start
        while current < self.end:
            yield current
            current += self.step


print("Building a stream-processing pipeline:")
# Create the source stream
stream = RangeStream(1, 101)  # 1-100

# Assemble the pipeline
result = (stream
          .filter(lambda x: x % 2 == 0)  # keep even numbers
          .map(lambda x: x * 3)          # multiply by 3
          .filter(lambda x: x > 100)     # keep values greater than 100
          .take(10)                      # take the first 10
          .collect())                    # collect into a list

print(f"Pipeline result: {result}")

print("\nStream processing overview:")
print("Original data: 1-100")
print("Step 1: keep even numbers")
print("Step 2: multiply by 3")
print("Step 3: keep values > 100")
print("Step 4: take the first 10")

# Walk through the steps on a smaller range
print("\nStep-by-step processing:")
stream = RangeStream(1, 21)  # 1-20
print("Original data:", list(stream))

stream = RangeStream(1, 21)
even_stream = stream.filter(lambda x: x % 2 == 0)
print("Even numbers only:", list(even_stream))

stream = RangeStream(1, 21)
processed = stream.filter(lambda x: x % 2 == 0).map(lambda x: x * 3)
print("Multiplied by 3:", list(processed))

print("\n" + "="*50)
```
Summary: When Should You Use Iterators and Generators?
Good fits:
- ✅ Large datasets (avoid running out of memory)
- ✅ Infinite sequences (counters, the Fibonacci sequence)
- ✅ Pipelines (chaining multiple processing stages)
- ✅ Lazy evaluation (compute values only when they are needed)
Poor fits:
- ❌ Random access (an iterator can only move forward)
- ❌ Data that must be reused (an iterator is exhausted after one pass; see the short demo below)
- ❌ Small datasets (a list comprehension is simpler)
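To make the exhaustion point concrete, here is a two-line demonstration: once a generator has been consumed, iterating it again yields nothing.

```python
gen = (x * x for x in range(3))
print(list(gen))  # [0, 1, 4]
print(list(gen))  # [] -- the generator is already exhausted
```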