[Python代码设计] 使用生成器替代回调函数

假设有这么一个场景：需要计算一个非常大的文件的md5值，如果一次性把整个文件读取到内存中，可能会导致内存溢出。同时，我们需要在屏幕上输出计算md5的进度，让用户有耐心等待这个md5计算完成。

最常规的做法就是在计算md5的同时传一个回调函数，让回调函数在屏幕上输出计算进度：

python 复制代码

import hashlib
from typing import Callable, Any

def compute_md5(
    file: str, callback: Callable[[bytes], Any], *, block_size: int = 256
) -> str:
    """Return the hex MD5 digest of *file*, reporting each chunk to *callback*.

    The file is read in pieces of at most ``block_size`` bytes, so it never
    has to fit in memory at once.

    Args:
        file: Path of the file to hash.
        callback: Called once per chunk, right after the chunk has been fed
            to the hash; its return value is ignored.
        block_size: Maximum number of bytes read per iteration.

    Returns:
        The hexadecimal MD5 digest of the file contents.
    """
    digest = hashlib.md5()
    with open(file, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(block_size), b""):
            digest.update(block)
            callback(block)

    return digest.hexdigest()

这个函数在每一次循环中，会读取一部分文件内容，用它更新md5值，并把这部分内容传入到回调函数中。

对于这个函数的调用者来说，要在屏幕上显示计算md5的进度，还要写以下代码来作为回调函数传入到compute_md5中：

python 复制代码

class UpdateMd5Progress:
    """Callable progress reporter that prints how much of a file was hashed.

    Each call adds the length of the received chunk to a running total and
    prints that total as a percentage of ``total_size``.
    """

    def __init__(self, total_size: int):
        # Bytes seen so far; grows with every call.
        self.progress = 0
        # Size of the whole file in bytes, used as the 100% mark.
        self.total_size = total_size

    def __call__(self, chunk):
        """Record *chunk* as processed and print the updated percentage."""
        self.progress = self.progress + len(chunk)
        ratio = self.progress / self.total_size
        print(f"{ratio:.2%}")

完整代码如下：

python 复制代码

import hashlib
import os
from typing import Callable, Any

class UpdateMd5Progress:
    """Progress callback: accumulates chunk sizes and prints a percentage.

    An instance is meant to be passed as the ``callback`` argument of a
    chunked reader; every call reports one more chunk as done.
    """

    def __init__(self, total_size: int):
        # Running count of bytes reported so far.
        self.progress = 0
        # Total number of bytes expected; the denominator of the percentage.
        self.total_size = total_size

    def __call__(self, chunk):
        """Add ``len(chunk)`` to the running total and print the new ratio."""
        processed = self.progress + len(chunk)
        self.progress = processed
        print(f"{processed / self.total_size:.2%}")


def compute_md5(
    file: str,
    callback: Callable[[bytes], Any],
    *,
    block_size: int = 256
) -> str:
    """Hash *file* with MD5 in fixed-size chunks, notifying *callback*.

    Args:
        file: Path of the file to hash.
        callback: Invoked with every chunk after it has been added to the
            hash; whatever it returns is discarded.
        block_size: Upper bound on the number of bytes read at a time.

    Returns:
        The hexadecimal MD5 digest of the whole file.
    """
    hasher = hashlib.md5()
    with open(file, "rb") as source:
        while True:
            piece = source.read(block_size)
            if not piece:
                # An empty read means end of file.
                break
            hasher.update(piece)
            callback(piece)
    return hasher.hexdigest()

  
  


def main():
    """Hash this script itself, printing the progress and then the digest."""
    target = __file__
    reporter = UpdateMd5Progress(os.path.getsize(target))
    digest = compute_md5(target, reporter)

    print(digest)


if __name__ == "__main__":
    main()

对于函数调用者来说，这个过程还是略微有些复杂了，需要写一个类记录进度和文件大小。

可以考虑使用生成器来简化函数调用者的代码, 计算md5的代码如下：

python 复制代码

import hashlib
import os


class Md5Calculator:
    """Compute a file's MD5 digest while yielding the chunks that were read.

    Iterate over :meth:`run` to drive the calculation. Once the iteration
    has run to completion the result is available as ``hexdigest``; until
    then ``hexdigest`` is ``None``.
    """

    def __init__(self, file: str, *, block_size: int = 256):
        self.file = file
        self.block_size = block_size
        # None until run() has been consumed to the end, so reading the
        # attribute early gives a clear "not ready" value instead of
        # raising AttributeError.
        self.hexdigest = None

    def run(self):
        """Yield the file's contents chunk by chunk, then store the digest.

        Yields:
            Each chunk of at most ``block_size`` bytes, after it has been
            fed to the hash.
        """
        # Forget the digest of any earlier run so a partially consumed
        # iteration can never be mistaken for a finished one.
        self.hexdigest = None
        md5 = hashlib.md5()
        with open(self.file, 'rb') as f:
            while chunk := f.read(self.block_size):
                md5.update(chunk)
                yield chunk
        # Only reached when the caller iterated all the way to the end.
        self.hexdigest = md5.hexdigest()

  


def main():
    """Hash this script with Md5Calculator, printing progress per chunk."""
    target = __file__
    calculator = Md5Calculator(target)
    size = os.path.getsize(target)
    done = 0

    # The loop body plays the role the callback had in the callback version.
    for block in calculator.run():
        done += len(block)
        print(f"{done / size:.2%}")
    print(calculator.hexdigest)


if __name__ == '__main__':
    main()

在以上代码中，新建了一个Md5Calculator类，来计算md5。以前callback里的内容会通过这个for循环来运行，最后再通过Md5Calculator的hexdigest属性来获取最后的值。

这里为了方便获取最终的md5值，没有直接使用生成器函数，而是写了一个类，用户可以通过 Md5Calculator 的 hexdigest 属性来获取结果。

如果写为生成器函数，想要获取函数的返回值，需要从StopIteration 这个异常中取出。

代码如下：

python 复制代码

import hashlib
import os
from typing import Generator


def compute_md5(
    file: str, *, block_size: int = 256
) -> Generator[bytes, None, str]:
    """Yield *file* chunk by chunk and return its hex MD5 digest at the end.

    Because this is a generator function, the digest is delivered as the
    ``value`` of the ``StopIteration`` raised when the iteration finishes.

    Args:
        file: Path of the file to hash.
        block_size: Maximum number of bytes read per iteration.

    Yields:
        Each chunk, after it has been fed to the hash.

    Returns:
        The hexadecimal MD5 digest of the file contents.
    """
    digest = hashlib.md5()
    with open(file, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(block_size), b""):
            digest.update(block)
            yield block

    return digest.hexdigest()

  


def main():
    """Drive the compute_md5 generator by hand to capture its return value."""
    chunks = compute_md5(__file__)
    size = os.path.getsize(__file__)
    done = 0
    while True:
        try:
            block = next(chunks)
        except StopIteration as stop:
            # The generator's return value travels on the exception.
            digest = stop.value
            break
        done += len(block)
        print(f'{done / size:.2%}')
    print(digest)


if __name__ == '__main__':
    main()

这三种写法的总代码量是差不多的，但是对于用户来说，使用for循环是更为简单直接的选择。

不过如果使用for循环的方式，来让用户处理回调的数据，计算md5时就无法处理用户代码中引起的异常。

接下来可以对比一下，使用回调函数处理异常的方式和使用for循环的方式处理异常的方式：

回调函数：

python 复制代码

import hashlib
from typing import Callable, Any
from traceback import print_exception


def compute_md5(
    file: str,
    callback: Callable[[bytes], Any],
    *,
    block_size: int = 256
) -> str:
    """Return the hex MD5 digest of *file*, shielding it from callback errors.

    The file is read in pieces of at most ``block_size`` bytes. Each piece
    is added to the hash and then handed to *callback*. An ``Exception``
    raised by the callback is printed as a traceback and otherwise ignored,
    so the digest is always computed over the whole file.

    Args:
        file: Path of the file to hash.
        callback: Called once per chunk with the bytes just hashed; its
            return value is ignored.
        block_size: Maximum number of bytes read per iteration.

    Returns:
        The hexadecimal MD5 digest of the file contents.
    """
    md5 = hashlib.md5()
    with open(file, "rb") as f:
        while chunk := f.read(block_size):
            md5.update(chunk)
            # The callback must run inside the loop, once per chunk. The
            # try body holds only the user code so that errors from
            # reading or hashing are never swallowed.
            try:
                callback(chunk)
            except Exception as e:
                # The three-argument form is accepted by every Python
                # version; the single-argument form needs Python 3.10+.
                print_exception(type(e), e, e.__traceback__)

    return md5.hexdigest()

从上面的代码可以看出，compute_md5可以处理用户回调函数中的异常，以保证md5计算完成。

但是使用生成器来代替回调函数，是无法处理用户代码的异常：

python 复制代码

class Md5Calculator:
    """MD5 calculator whose generator tries to guard its ``yield``.

    Demonstrates that wrapping ``yield`` in ``try``/``except Exception``
    does not intercept errors raised in the body of the caller's ``for``
    loop: those are raised in the caller, not inside the generator.
    """

    def __init__(self, file: str, *, block_size: int = 256):
        self.block_size = block_size
        self.file = file

    def run(self):
        """Yield the file chunk by chunk; set ``hexdigest`` when exhausted."""
        digest = hashlib.md5()
        with open(self.file, 'rb') as stream:
            # iter() with a sentinel stops at the first empty read (EOF).
            for block in iter(lambda: stream.read(self.block_size), b""):
                digest.update(block)
                try:
                    yield block
                except Exception:
                    # Only reached if an exception is thrown *into* the
                    # generator at the yield (e.g. via generator.throw()).
                    print("无法捕获ValueError")
        self.hexdigest = digest.hexdigest()

  
def main():
    """Iterate the calculator and fail on purpose inside the loop body."""
    calculator = Md5Calculator(__file__)
    size = os.path.getsize(__file__)
    done = 0

    for block in calculator.run():
        done += len(block)
        print(f"{done / size:.2%}")
        # Deliberate error in the caller's code, raised after the first
        # chunk has been reported.
        raise ValueError

    print(calculator.hexdigest)

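在这个例子中，Md5Calculator.run这个生成器函数完全捕获不到用户代码中的异常：for循环体中的代码运行在调用者一侧，此时生成器只是暂停在yield处，异常不会被传入生成器；生成器最多只会在被关闭时于yield处收到GeneratorExit，而它继承自BaseException，不会被except Exception捕获。在大多数情况下，这是合理的，因为异常是由用户代码引起的，计算md5的代码不应该去处理用户代码的异常，但是在稳定性要求非常高的代码中，可能开发者必须处理用户的异常，这时，使用生成器来替代回调函数，则并不合理。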

而且，使用生成器来替代回调函数并不是常见的做法，可能会增加理解的成本，并且需要写上额外的注释；不过它也可以让用户的代码更加流畅，如何取舍需要开发者自行权衡。