Python迭代器与生成器深度解析:懒加载的艺术

引言:为什么需要迭代器和生成器?

想象一下,你要处理一个100GB的文本文件,或者从数据库读取100万条记录。如果你一次性把所有数据加载到内存,电脑可能会崩溃。迭代器和生成器就是为解决这类问题而生的,它们可以"懒加载"数据,用多少取多少。

一、迭代器(Iterator)基础

1.1 什么是迭代器?

python 复制代码
# Iterator basics
print("=== 迭代器基础 ===")

# A list is one of Python's built-in iterable types
my_list = [1, 2, 3, 4, 5]

# The everyday way to walk it: a for loop
print("传统for循环:")
for item in my_list:
    print(f"  {item}")

print("\n实际发生了什么:")
# The same traversal spelled out by hand: ask the iterable for an
# iterator, then call next() until StopIteration signals the end
iter_obj = iter(my_list)
print(f"迭代器对象: {iter_obj}")

exhausted = False
while not exhausted:
    try:
        item = next(iter_obj)
    except StopIteration:
        exhausted = True
        print("  迭代结束")
    else:
        print(f"  获取到: {item}")

print("\n" + "="*50)

1.2 自定义迭代器类

python 复制代码
class Countdown:
    """Iterator that counts down from ``start`` to 1.

    The object is its own iterator, so it can be consumed only once.
    """

    def __init__(self, start):
        # The next value to hand out; values <= 0 mean the countdown is over.
        self.current = start

    def __iter__(self):
        """Return the object itself, as the iterator protocol requires."""
        return self

    def __next__(self):
        """Return the current value and step down by one.

        Raises:
            StopIteration: once the current value is zero or negative.
        """
        if self.current <= 0:
            raise StopIteration
        value, self.current = self.current, self.current - 1
        return value

print("=== 自定义迭代器 ===")

# Drive the custom iterator with a for loop
countdown = Countdown(5)
print("倒计时开始:")
for number in countdown:
    print(f"  {number}...")
print("发射!")

# Drive a second instance by hand with next()
print("\n手动使用迭代器:")
countdown2 = Countdown(3)
iterator = iter(countdown2)

for _ in range(3):
    print(next(iterator))  # prints 3, then 2, then 1

# A fourth call finds the countdown exhausted
try:
    value = next(iterator)
except StopIteration:
    print("迭代结束")
else:
    print(value)

print("\n" + "="*50)

1.3 实用的迭代器:文件读取器

python 复制代码
class LargeFileReader:
    """Read a UTF-8 text file lazily, line by line or in fixed-size chunks.

    The object is its own iterator: ``iter(reader)`` (re)opens the file and
    each ``next()`` returns one line with surrounding whitespace stripped.
    Starting a new iteration restarts from the top of the file.

    Args:
        filename: Path of the file to read.
        chunk_size: Number of characters per chunk for ``read_in_chunks``.
    """

    def __init__(self, filename, chunk_size=1024):
        self.filename = filename
        self.chunk_size = chunk_size
        # Handle used by line iteration; None whenever no iteration is active.
        self.file = None

    def __iter__(self):
        """Open the file for line iteration and return ``self``."""
        # A previous loop may have been abandoned mid-file; release its
        # handle before opening a new one so it is not leaked.
        self.close()
        self.file = open(self.filename, 'r', encoding='utf-8')
        return self

    def __next__(self):
        """Return the next line, stripped of leading/trailing whitespace.

        Raises:
            StopIteration: at end of file, and on every later call until
                iteration is restarted with ``iter()``.
        """
        # No open handle means iteration has finished (or never started);
        # keep signalling exhaustion instead of touching a closed file.
        if self.file is None:
            raise StopIteration
        line = self.file.readline()
        if not line:
            self.close()
            raise StopIteration
        return line.strip()

    def close(self):
        """Close the line-iteration handle, if any. Safe to call repeatedly."""
        if self.file is not None:
            self.file.close()
            self.file = None

    def read_in_chunks(self):
        """Yield the file content in pieces of at most ``chunk_size`` characters.

        Uses its own file handle, independent of line iteration.
        """
        with open(self.filename, 'r', encoding='utf-8') as f:
            while True:
                chunk = f.read(self.chunk_size)
                if not chunk:
                    break
                yield chunk

import os
import tempfile

print("=== 文件读取迭代器 ===")

# Stand-in for a large file: a temporary file holding 100 lines.
# The encoding is pinned to UTF-8 because LargeFileReader decodes with UTF-8;
# relying on the platform default breaks the round trip on non-UTF-8 locales.
with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt',
                                 encoding='utf-8') as tmp:
    for i in range(100):
        tmp.write(f"这是第{i+1}行数据,包含一些测试内容用于演示迭代器的工作方式\n")
    tmp_filename = tmp.name

print(f"创建了测试文件: {tmp_filename}")

# try/finally so the temporary file is removed even if the demo fails
try:
    # Read line by line through the iterator protocol
    print("\n逐行读取文件:")
    file_reader = LargeFileReader(tmp_filename)
    lines_read = 0

    for line in file_reader:
        lines_read += 1
        # Show only the first three lines and the 100th
        if lines_read <= 3 or lines_read == 100:
            print(f"  行{lines_read}: {line[:30]}...")

    print(f"总共读取了 {lines_read} 行")

    # Read in fixed-size chunks through the generator method
    print("\n按块读取文件:")
    chunks_read = 0
    for chunk in file_reader.read_in_chunks():
        chunks_read += 1
        if chunks_read <= 2:
            print(f"  块{chunks_read}: {len(chunk)} 字符")

    print(f"总共读取了 {chunks_read} 个块")
finally:
    os.unlink(tmp_filename)

print("\n" + "="*50)

二、生成器(Generator)进阶

2.1 生成器函数 vs 生成器表达式

python 复制代码
print("=== 生成器基础 ===")

# 1. Generator function (built with yield)
def fibonacci_generator(n):
    """Yield the first ``n`` Fibonacci numbers, beginning with 0."""
    current, following = 0, 1
    produced = 0
    while produced < n:
        yield current
        # Slide the pair one position along the sequence
        current, following = following, current + following
        produced += 1

print("斐波那契数列生成器:")
fib_gen = fibonacci_generator(10)

# Print all values on one line, then end the line
for num in fib_gen:
    print(f"  {num}", end=" ")
print()

# 2. Generator expression (like a list comprehension, but in parentheses)
print("\n生成器表达式:")

# List comprehension: every element is computed immediately
squares_list = [x**2 for x in range(10)]
print(f"列表推导式 (已生成所有元素): {squares_list}")

# Generator expression: elements are computed on demand
squares_gen = (x**2 for x in range(10))
print(f"生成器表达式 (尚未生成): {squares_gen}")

print("逐个获取生成器的值:")
# Pull only the first five squares; the generator keeps the rest pending
for i in range(5):
    square = next(squares_gen)
    print(f"  {i}^2 = {square}")

print("... 剩余的值会在需要时生成")

print("\n" + "="*50)

2.2 生成器的实用场景

python 复制代码
print("=== 生成器实用场景 ===")

# Scenario 1: processing a large dataset
def read_large_dataset(file_path, encoding='utf-8'):
    """Lazily yield the cleaned lines of a text file.

    Each line is stripped of surrounding whitespace and lower-cased. The file
    is read one line at a time and is closed when the generator finishes or
    is closed.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding of the file. Given explicitly so the result
            does not depend on the platform's default encoding.

    Yields:
        One processed line at a time.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            yield line.strip().lower()

# Scenario 2: an endless sequence
def infinite_counter(start=0, step=1):
    """Yield ``start``, ``start + step``, ``start + 2*step``, ... without end."""
    value = start
    while True:
        yield value
        value += step

# Scenario 3: a data-processing pipeline
def data_pipeline(data):
    """First pipeline stage: pass through only the even items of ``data``."""
    for element in data:
        if element % 2 != 0:
            continue  # drop odd values
        yield element
    
def multiply_by_3(data):
    """Pipeline stage: yield every item of ``data`` multiplied by three."""
    for element in data:
        tripled = element * 3
        yield tripled

def add_10(data):
    """Pipeline stage: yield every item of ``data`` increased by ten."""
    for element in data:
        shifted = element + 10
        yield shifted

print("数据处理管道演示:")
original_data = range(20)

# Assemble the pipeline stage by stage; nothing runs until it is consumed
even_values = data_pipeline(original_data)
tripled_values = multiply_by_3(even_values)
pipeline = add_10(tripled_values)

print("原始数据:", list(original_data))
print("处理后数据:", list(pipeline))

# Scenario 4: coroutine (two-way communication)
def coroutine_example():
    """Generator-based coroutine that keeps a running average.

    Prime it with ``next()``, then ``send()`` numbers; after each one the
    current average is printed. Sending ``None`` ends the coroutine, which
    returns ``(total, count)`` through ``StopIteration.value``.
    """
    print("协程启动")
    running_total = 0
    received = 0

    value = yield
    while value is not None:
        running_total += value
        received += 1
        print(f"收到值: {value}, 当前平均值: {running_total/received:.2f}")
        value = yield

    return running_total, received

print("\n协程演示:")
coro = coroutine_example()
next(coro)  # prime the coroutine so it is waiting at its first yield

for sample in (10, 20, 30):
    coro.send(sample)

# Sending None asks the coroutine to finish; its return value arrives
# as the value attribute of the StopIteration it raises
try:
    coro.send(None)
except StopIteration as e:
    total, count = e.value
    print(f"协程结束,总和: {total}, 次数: {count}")

print("\n" + "="*50)

2.3 生成器的send()和throw()方法

python 复制代码
print("=== 生成器的高级方法 ===")

def advanced_generator():
    """Three-step generator used to demonstrate send(), throw() and close().

    Steps one and two each print the value sent back in. A ValueError thrown
    into the generator is caught, reported, and answered with one recovery
    value. The finally clause prints a cleanup message however the generator
    ends.
    """
    print("生成器启动")
    try:
        for step_result in ("第一步完成", "第二步完成"):
            message = yield step_result
            print(f"收到: {message}")

        yield "第三步完成"

    except ValueError as error:
        print(f"捕获到异常: {error}")
        yield "从异常中恢复"
    finally:
        print("生成器清理")

print("1. 正常使用send()方法:")
gen = advanced_generator()

# Prime the generator: run it up to its first yield
result = next(gen)
print(f"生成器返回: {result}")

# Each send() delivers a value and returns the next yielded one
for outgoing in ("第一条消息", "第二条消息"):
    result = gen.send(outgoing)
    print(f"生成器返回: {result}")

print("\n2. 使用throw()方法向生成器抛出异常:")
gen2 = advanced_generator()
next(gen2)  # prime

# Raise an exception inside the generator at its current yield
result = gen2.throw(ValueError("测试异常"))
print(f"生成器返回: {result}")

try:
    next(gen2)
except StopIteration:
    print("生成器正常结束")

print("\n3. 使用close()方法关闭生成器:")
gen3 = advanced_generator()
next(gen3)  # prime

gen3.close()
print("生成器已关闭")

print("\n" + "="*50)

三、yield from:生成器的魔法

3.1 yield from 基础

python 复制代码
print("=== yield from 语法 ===")

def sub_generator():
    """Yield two labelled items, then return a completion message."""
    for label in ("A", "B"):
        yield f"子生成器: {label}"
    return "子生成器完成"

def main_generator_old():
    """Delegate to sub_generator() by hand, the pre-``yield from`` way.

    A plain for loop forwards the yielded items but discards the
    sub-generator's return value.
    """
    delegate = sub_generator()
    for produced in delegate:
        yield produced
    yield "主生成器继续"

def main_generator_new():
    """Delegate to sub_generator() with ``yield from``.

    ``yield from`` forwards every item and evaluates to the sub-generator's
    return value, which is then reported as an item of its own.
    """
    sub_result = yield from sub_generator()
    for message in (f"主生成器收到: {sub_result}", "主生成器完成"):
        yield message

# Run both delegation styles and list what each one yields
for title, make_generator in (
    ("老式委托方法:", main_generator_old),
    ("\n使用yield from:", main_generator_new),
):
    print(title)
    for item in make_generator():
        print(f"  {item}")

print("\n" + "="*50)

3.2 yield from 的实用示例

python 复制代码
print("=== yield from 实用示例 ===")

# Example 1: flattening a nested structure
def flatten(nested):
    """Yield the leaf values of arbitrarily nested lists, left to right.

    Only ``list`` instances are descended into; any other value, including
    other container types, is yielded as-is.
    """
    for element in nested:
        if not isinstance(element, list):
            yield element
            continue
        # Recursive delegation: let the nested call yield its own leaves
        yield from flatten(element)

# Example 2: joining several generators
def chain_generators(*generators):
    """Yield every item of each given iterable, one iterable after another."""
    for source in generators:
        yield from source

# Example 3: walking a tree
class TreeNode:
    """Tree node holding a value and an ordered list of child nodes."""

    def __init__(self, value, children=None):
        self.value = value
        # A missing or empty argument becomes a fresh list for this node
        self.children = children if children else []

    def __iter__(self):
        """Yield this node's value, then each child's subtree in order."""
        yield self.value
        for subtree in self.children:
            yield from subtree

print("1. 扁平化嵌套列表:")
nested_list = [1, [2, [3, 4], 5], 6, [7, 8]]
print(f"原始列表: {nested_list}")
print(f"扁平化后: {list(flatten(nested_list))}")

print("\n2. 合并多个生成器:")
gen1 = (x for x in range(3))
gen2 = (x for x in "abc")
gen3 = (x for x in [True, False])

chained = chain_generators(gen1, gen2, gen3)
print(f"合并结果: {list(chained)}")

print("\n3. 树形结构遍历:")
# Build the tree from the branches up:
#   A -> B -> D, E
#   A -> C -> F
left_branch = TreeNode("B", [TreeNode("D"), TreeNode("E")])
right_branch = TreeNode("C", [TreeNode("F")])
tree = TreeNode("A", [left_branch, right_branch])

print("树节点遍历:")
for node in tree:
    print(f"  节点: {node}")

print("\n" + "="*50)

四、性能对比:迭代器 vs 列表

4.1 内存使用对比

python 复制代码
print("=== 性能对比:迭代器 vs 列表 ===")

import sys
import time

def test_memory_usage(size=1000000):
    """Print the memory held by a list versus an equivalent generator.

    Args:
        size: Number of integers to generate for the comparison.
    """
    print("1. 内存使用对比:")

    # List comprehension: all elements exist at once
    print("列表推导式:")
    big_list = [x for x in range(size)]
    # sys.getsizeof is shallow: for a list it covers only the container,
    # so the sizes of the element objects are added to get the real footprint.
    list_bytes = sys.getsizeof(big_list) + sum(sys.getsizeof(x) for x in big_list)
    print(f"  内存占用: {list_bytes / 1024 / 1024:.2f} MB")

    # Generator expression: only the generator object itself exists
    print("生成器表达式:")
    big_gen = (x for x in range(size))
    # Report the generator's own size; no baseline is subtracted.
    gen_bytes = sys.getsizeof(big_gen)
    print(f"  内存占用: {gen_bytes / 1024:.2f} KB")

def test_execution_time(data_size=1000000):
    """Print how long it takes to sum numbers from a list versus a generator.

    Both measurements include producing the data, so the list timing covers
    building the list and the generator timing covers generating each item
    on demand.

    Args:
        data_size: Number of integers to sum in each measurement.
    """
    print("\n2. 执行时间对比:")

    # List: build every element first, then loop over them.
    # perf_counter is used because it is meant for measuring short intervals.
    start_time = time.perf_counter()
    total = 0
    data = [x for x in range(data_size)]
    for i in data:
        total += i
    list_time = time.perf_counter() - start_time
    print(f"列表循环时间: {list_time:.4f} 秒")

    # Generator: elements are produced one at a time while looping
    start_time = time.perf_counter()
    total = 0
    gen = (x for x in range(data_size))
    for i in gen:
        total += i
    gen_time = time.perf_counter() - start_time
    print(f"生成器时间: {gen_time:.4f} 秒")

    print(f"时间差异: {abs(list_time - gen_time):.4f} 秒")

def test_practical_scenario(data_size=10000000, target=5000000):
    """Print timings for finding the first element greater than ``target``.

    The list approach evaluates the whole filtered list before taking its
    first element; the generator approach stops at the first match.

    Args:
        data_size: Size of the integer range that is searched.
        target: Threshold; the first value greater than it is the result.
            When nothing qualifies the result is None.
    """
    print("\n3. 实际场景:查找第一个符合条件的元素")

    # Stand-in for a large dataset
    data = range(data_size)

    # List approach: every match is materialized before one is picked.
    # perf_counter is used because it is meant for measuring short intervals.
    print("列表方法(需要先生成所有元素):")
    start_time = time.perf_counter()
    matches = [x for x in data if x > target]
    result = matches[0] if matches else None
    list_time = time.perf_counter() - start_time
    print(f"  结果: {result}, 时间: {list_time:.4f} 秒")

    # Generator approach: evaluation stops at the first match
    print("生成器方法(找到就停止):")
    start_time = time.perf_counter()
    gen = (x for x in data if x > target)
    result = next(gen, None)
    gen_time = time.perf_counter() - start_time
    print(f"  结果: {result}, 时间: {gen_time:.4f} 秒")

    print(f"  时间节省: {(list_time - gen_time):.4f} 秒")

# Run the three comparisons in order
for run_comparison in (test_memory_usage, test_execution_time, test_practical_scenario):
    run_comparison()

print("\n" + "="*50)

五、实战项目:构建数据流处理框架

5.1 完整的数据处理框架

python 复制代码
print("=== 实战项目:数据流处理框架 ===")

class DataStream:
    """Base class for lazy, chainable data streams.

    Subclasses implement ``__iter__``. The combinator methods wrap ``self``
    in another stream and do no work until the result is iterated.
    """

    def __iter__(self):
        raise NotImplementedError("子类必须实现")

    def map(self, func):
        """Return a stream yielding ``func(item)`` for every item."""
        return MapStream(self, func)

    def filter(self, predicate):
        """Return a stream yielding only items for which ``predicate`` is true."""
        return FilterStream(self, predicate)

    def take(self, n):
        """Return a stream yielding at most the first ``n`` items."""
        return TakeStream(self, n)

    def skip(self, n):
        """Return a stream that drops the first ``n`` items."""
        return SkipStream(self, n)

    def collect(self):
        """Consume the stream and return its items as a list."""
        return list(self)

class MapStream(DataStream):
    """Stream that applies ``func`` to each item of ``source``."""

    def __init__(self, source, func):
        self.source = source
        self.func = func

    def __iter__(self):
        for item in self.source:
            yield self.func(item)

class FilterStream(DataStream):
    """Stream that keeps the items of ``source`` accepted by ``predicate``."""

    def __init__(self, source, predicate):
        self.source = source
        self.predicate = predicate

    def __iter__(self):
        for item in self.source:
            if self.predicate(item):
                yield item

class TakeStream(DataStream):
    """Stream that yields at most the first ``n`` items of ``source``."""

    def __init__(self, source, n):
        self.source = source
        self.n = n

    def __iter__(self):
        # Nothing requested: do not touch the source at all.
        if self.n <= 0:
            return
        count = 0
        for item in self.source:
            yield item
            count += 1
            # Stop right after the n-th item so no extra element is pulled
            # from the source (which may be expensive, one-shot or endless).
            if count >= self.n:
                break

class SkipStream(DataStream):
    """Stream that drops the first ``n`` items of ``source``."""

    def __init__(self, source, n):
        self.source = source
        self.n = n

    def __iter__(self):
        for position, item in enumerate(self.source):
            if position >= self.n:
                yield item

class RangeStream(DataStream):
    """Stream of numbers from ``start`` towards ``end`` (exclusive).

    Called with one argument, that argument is the end and counting starts
    at 0. A negative ``step`` counts downwards.

    Raises:
        ValueError: if ``step`` is zero, since the stream could never end.
    """

    def __init__(self, start, end=None, step=1):
        if step == 0:
            raise ValueError("step 不能为 0")
        if end is None:
            self.start = 0
            self.end = start
        else:
            self.start = start
            self.end = end
        self.step = step

    def __iter__(self):
        current = self.start
        if self.step > 0:
            while current < self.end:
                yield current
                current += self.step
        else:
            # Counting down: stop once the end has been reached or passed
            while current > self.end:
                yield current
                current += self.step

print("构建数据流处理管道:")

# Source stream: the integers 1-100
stream = RangeStream(1, 101)

# Assemble the pipeline one stage at a time; nothing runs until collect()
even_numbers = stream.filter(lambda x: x % 2 == 0)   # keep even numbers
tripled = even_numbers.map(lambda x: x * 3)          # multiply by 3
above_100 = tripled.filter(lambda x: x > 100)        # keep values above 100
result = above_100.take(10).collect()                # first 10, as a list

print(f"处理结果: {result}")

print("\n流式处理演示:")
# Describe each stage of the pipeline
for description in (
    "原始数据: 1-100",
    "步骤1: 过滤偶数",
    "步骤2: 乘以3",
    "步骤3: 过滤 >100 的数",
    "步骤4: 取前10个",
):
    print(description)

# Step-by-step walk-through on a smaller range (1-20)
print("\n分步处理过程:")
# RangeStream builds a fresh generator on every iteration, so one
# instance can feed all three printouts
stream = RangeStream(1, 21)

print("原始数据:", list(stream))

even_stream = stream.filter(lambda x: x % 2 == 0)
print("过滤偶数:", list(even_stream))

processed = stream.filter(lambda x: x % 2 == 0).map(lambda x: x * 3)
print("乘以3:", list(processed))

print("\n" + "="*50)

总结:何时使用迭代器和生成器?

适合使用的情况:

  1. 处理大型数据集(避免内存溢出)
  2. 无限序列(如计数器、斐波那契数列)
  3. 管道处理(多个处理步骤串联)
  4. 懒加载(需要时才计算)

不适合使用的情况:

  1. 需要随机访问(迭代器只能单向前进)
  2. 需要重复遍历同一份数据(迭代器和生成器只能消耗一次,用完后必须重新创建)
  3. 小数据集(列表推导式更简单)
相关推荐
dazzle2 小时前
OpenCV基础教学(二):图像的灰度化处理
python·opencv·计算机视觉
代码洲学长2 小时前
RNN模型01
人工智能·python·rnn·自然语言处理·gru·lstm
饕餮争锋2 小时前
REPL简介
python
执笔论英雄2 小时前
【RL]大模型训练1F1B执行过程
python
Amelia1111112 小时前
day35
python
superman超哥3 小时前
仓颉Actor模型的实现机制深度解析
开发语言·后端·python·c#·仓颉
superman超哥3 小时前
仓颉内存管理深度探索:引用计数的实现原理与实战
c语言·开发语言·c++·python·仓颉
zhuzihuaile3 小时前
Langchain-Chatchat + Ollama + QWen3 + 搭建知识库 + AI-Win
人工智能·python·ai·langchain
Warson_L3 小时前
python的__init__.py
python