Python 并行与并发：案例与实现

一、核心概念辨析

维度	并发 (Concurrency)	并行 (Parallelism)
本质	多任务交替执行	多任务同时执行
硬件	单核即可	必须多核
Python 实现	`threading` / `asyncio`	`multiprocessing` / `concurrent.futures.ProcessPoolExecutor`
适用场景	I/O 密集型（网络、磁盘）	CPU 密集型（计算、加解密）
关键限制	GIL 限制线程的 CPU 并行	进程间通信开销大

GIL（全局解释器锁）：CPython 解释器同一时刻只允许一个线程执行 Python 字节码，因此多线程对纯 Python 的 CPU 密集型任务基本没有加速效果，但对 I/O 密集型任务有效（线程在阻塞 I/O 期间会释放 GIL）。

二、三种并发模型对比

css 复制代码

┌─────────────────┬──────────────┬──────────────┬─────────────────┐
│      模型        │    调度方     │   切换成本    │      场景        │
├─────────────────┼──────────────┼──────────────┼─────────────────┤
│ 多进程           │   操作系统    │     高       │  CPU 密集        │
│ 多线程           │   操作系统    │     中       │  I/O 密集（阻塞）│
│ 协程 (asyncio)  │   事件循环    │     低       │  I/O 密集（非阻塞）│
└─────────────────┴──────────────┴──────────────┴─────────────────┘

三、案例一：I/O 密集型 —— 批量抓取 URL

3.1 串行版本（基线）

python 复制代码

import time
import requests

URLS = [
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1",
    "https://httpbin.org/delay/1",
]

def fetch(url: str) -> int:
    """Send a blocking GET request to ``url`` and return the HTTP status code.

    The request is issued with ``timeout=10``.
    """
    response = requests.get(url, timeout=10)
    return response.status_code

def main():
    """Fetch every entry of ``URLS`` one after another and print the timing."""
    t0 = time.perf_counter()
    results = []
    # Strictly sequential: each request finishes before the next one starts.
    for url in URLS:
        results.append(fetch(url))
    print(f"串行耗时 {time.perf_counter() - t0:.2f}s, 结果 {results}")

if __name__ == "__main__":
    main()
# 输出：串行耗时 ~5.20s

3.2 多线程版本（ThreadPoolExecutor）

python 复制代码

import time
import requests
from concurrent.futures import ThreadPoolExecutor

URLS = ["https://httpbin.org/delay/1"] * 5

def fetch(url: str) -> int:
    """Perform a blocking GET on ``url`` (``timeout=10``) and return its status code."""
    reply = requests.get(url, timeout=10)
    status = reply.status_code
    return status

def main():
    """Fetch all ``URLS`` through a 5-worker thread pool and print the timing."""
    began = time.perf_counter()
    with ThreadPoolExecutor(max_workers=5) as executor:
        # map() yields the results in the same order as URLS.
        statuses = executor.map(fetch, URLS)
        results = list(statuses)
    print(f"多线程耗时 {time.perf_counter() - began:.2f}s, 结果 {results}")

if __name__ == "__main__":
    main()
# 输出：多线程耗时 ~1.10s

原理：requests.get 在等待网络响应时会释放 GIL，其他线程可以执行，从而实现 I/O 并发。

3.3 协程版本（asyncio + aiohttp）

python 复制代码

import asyncio
import time
import aiohttp

URLS = ["https://httpbin.org/delay/1"] * 5

async def fetch(session: aiohttp.ClientSession, url: str) -> int:
    """GET ``url`` through the shared ``session`` and return the response status."""
    async with session.get(url) as response:
        status = response.status
    return status

async def main():
    """Fetch all ``URLS`` concurrently on one event loop and print the timing."""
    t0 = time.perf_counter()
    async with aiohttp.ClientSession() as session:
        # gather() schedules every coroutine and returns results in input order.
        results = await asyncio.gather(*(fetch(session, url) for url in URLS))
    print(f"协程耗时 {time.perf_counter() - t0:.2f}s, 结果 {results}")

if __name__ == "__main__":
    asyncio.run(main())
# 输出：协程耗时 ~1.05s，单线程内即可处理上万并发

关键点：

asyncio.gather 并发调度多个协程
单线程事件循环，无线程切换成本
每个连接占用内存远低于线程（~KB vs ~MB）

四、案例二：CPU 密集型 —— 大数质因数分解

4.1 串行版本

python 复制代码

import time
import math

def factorize(n: int) -> list[int]:
    """Return the prime factors of ``n`` in ascending order, with multiplicity.

    Uses trial division by successive integers starting at 2. Inputs below 2
    yield an empty list.
    """
    found: list[int] = []
    remaining = n
    divisor = 2
    # Only divisors up to sqrt(remaining) need to be tried.
    while divisor * divisor <= remaining:
        if remaining % divisor:
            divisor += 1
            continue
        # divisor divides remaining: record it and retry the same divisor.
        found.append(divisor)
        remaining //= divisor
    # Anything left above 1 is itself a prime factor.
    if remaining > 1:
        found.append(remaining)
    return found

NUMBERS = [112272535095293] * 8  # 大质数

def main():
    """Factorize every entry of ``NUMBERS`` sequentially and print the timing."""
    t0 = time.perf_counter()
    results = []
    for number in NUMBERS:
        results.append(factorize(number))
    print(f"串行耗时 {time.perf_counter() - t0:.2f}s")

if __name__ == "__main__":
    main()
# 输出：串行耗时 ~12.0s（8 核机器）

4.2 多线程版本（无效，仅作对比）

python 复制代码

from concurrent.futures import ThreadPoolExecutor

def main():
    """Factorize ``NUMBERS`` on an 8-thread pool and print the timing."""
    began = time.perf_counter()
    with ThreadPoolExecutor(max_workers=8) as executor:
        # Consume the map() iterator so every task has finished before timing stops.
        factor_lists = executor.map(factorize, NUMBERS)
        results = list(factor_lists)
    print(f"多线程耗时 {time.perf_counter() - began:.2f}s")
# 输出：多线程耗时 ~12.0s（GIL 导致无加速）

4.3 多进程版本（真正的并行）

python 复制代码

import time
from concurrent.futures import ProcessPoolExecutor

def factorize(n: int) -> list[int]:
    """Compute the prime factorization of ``n`` by trial division.

    Returns the prime factors in ascending order, repeated according to their
    multiplicity. Values of ``n`` below 2 produce an empty list.
    """
    factors: list[int] = []
    candidate = 2
    while True:
        # Stop once candidate exceeds sqrt(n).
        if candidate * candidate > n:
            break
        # Strip out every power of the current candidate.
        while not n % candidate:
            n //= candidate
            factors.append(candidate)
        candidate += 1
    # A remainder above 1 is the last prime factor.
    return (factors + [n]) if n > 1 else factors

NUMBERS = [112272535095293] * 8

def main():
    """Factorize ``NUMBERS`` on an 8-process pool and print the timing."""
    began = time.perf_counter()
    with ProcessPoolExecutor(max_workers=8) as executor:
        # Consume the map() iterator so every task has finished before timing stops.
        factor_lists = executor.map(factorize, NUMBERS)
        results = list(factor_lists)
    print(f"多进程耗时 {time.perf_counter() - began:.2f}s")

if __name__ == "__main__":
    main()
# 输出：多进程耗时 ~1.6s（8 核接近线性加速）

注意事项：

Windows（以及默认使用 spawn 启动方式的 macOS）下必须有 if __name__ == "__main__": 守卫：这些平台不是 fork，而是由子进程重新导入主模块；缺少守卫时，子进程会再次执行创建进程池的代码，导致递归创建子进程或直接抛出 RuntimeError
函数与参数必须可 pickle 序列化
进程启动有固定开销（~50-200ms），任务太小反而变慢

五、案例三：混合负载 —— 生产者消费者

5.1 线程 + Queue

python 复制代码

import threading
import queue
import time
import random

q: queue.Queue[int] = queue.Queue(maxsize=10)
SENTINEL = None

def producer(n: int):
    """Put ``n`` random integers in [1, 100] on the shared queue, then the sentinel.

    Sleeps 0.1 s after each item.
    """
    for _ in range(n):
        value = random.randint(1, 100)
        q.put(value)
        print(f"生产 {value}")
        time.sleep(0.1)
    # End-of-stream marker for the consumers.
    q.put(SENTINEL)

def consumer(name: str):
    """Drain items from the shared queue until the sentinel is seen.

    Each item is printed with the consumer's ``name``, followed by a 0.2 s
    sleep and a ``task_done()`` call. On receiving the sentinel, the consumer
    puts it back on the queue and returns.
    """
    # NOTE: the get() that returns the sentinel is not matched by task_done().
    while (item := q.get()) is not SENTINEL:
        print(f"  [{name}] 消费 {item}")
        time.sleep(0.2)
        q.task_done()
    # Re-queue the sentinel so the other consumers also see it.
    q.put(SENTINEL)

if __name__ == "__main__":
    # One producer emitting 10 items and two consumers sharing the queue.
    workers = [
        threading.Thread(target=producer, args=(10,)),
        threading.Thread(target=consumer, args=("C1",)),
        threading.Thread(target=consumer, args=("C2",)),
    ]
    for worker in workers:
        worker.start()
    # Block until the producer and both consumers have finished.
    for worker in workers:
        worker.join()

5.2 asyncio 版本

python 复制代码

import asyncio
import random

async def producer(q: asyncio.Queue, n: int):
    """Put ``n`` random integers in [1, 100] on ``q``, then a ``None`` sentinel.

    Awaits a 0.1 s sleep after each item.
    """
    for _ in range(n):
        value = random.randint(1, 100)
        await q.put(value)
        print(f"生产 {value}")
        await asyncio.sleep(0.1)
    # None marks the end of the stream.
    await q.put(None)

async def consumer(q: asyncio.Queue, name: str):
    """Consume items from ``q`` until a ``None`` sentinel arrives.

    Each item is printed with the consumer's ``name``, followed by a 0.2 s
    sleep. On receiving ``None``, the consumer puts it back on the queue and
    returns.
    """
    while (item := await q.get()) is not None:
        print(f"  [{name}] 消费 {item}")
        await asyncio.sleep(0.2)
    # Re-queue the sentinel so the other consumers also see it.
    await q.put(None)

async def main():
    """Run one producer (10 items) and two consumers over a bounded queue."""
    q: asyncio.Queue = asyncio.Queue(maxsize=10)
    consumers = [consumer(q, label) for label in ("C1", "C2")]
    # All three coroutines run concurrently on the same event loop.
    await asyncio.gather(producer(q, 10), *consumers)

if __name__ == "__main__":
    asyncio.run(main())

六、案例四：异步 Web 服务（FastAPI）

python 复制代码

# pip install fastapi uvicorn httpx
import asyncio
import httpx
from fastapi import FastAPI

app = FastAPI()

@app.get("/aggregate")
async def aggregate():
    """Concurrently aggregate three downstream services.

    Issues three GET requests through a single ``httpx.AsyncClient``
    (``timeout=5``) and returns the status code of each response.
    """
    downstream_url = "https://httpbin.org/delay/1"
    async with httpx.AsyncClient(timeout=5) as client:
        # The three requests are awaited together, not one after another.
        responses = await asyncio.gather(
            client.get(downstream_url),
            client.get(downstream_url),
            client.get(downstream_url),
        )
    user, order, stock = responses
    return {
        "user": user.status_code,
        "order": order.status_code,
        "stock": stock.status_code,
    }

# uvicorn main:app --workers 4
# 多进程 worker (并行) + 单 worker 内 asyncio (并发) = 双层加速

七、同步原语对比

场景	threading	asyncio
互斥	`Lock`	`asyncio.Lock`
信号量	`Semaphore`	`asyncio.Semaphore`
事件	`Event`	`asyncio.Event`
队列	`queue.Queue`	`asyncio.Queue`
等待	`Thread.join()`	`await task` / `gather`

asyncio.Semaphore 限流示例

python 复制代码

import asyncio
import aiohttp

sem = asyncio.Semaphore(10)  # 最多 10 个并发请求

async def fetch_limited(session, url):
    """GET ``url`` via ``session`` while holding the module-level semaphore ``sem``.

    Returns the response body as text.
    """
    # The semaphore slot is held for the whole request/response cycle.
    async with sem, session.get(url) as resp:
        body = await resp.text()
    return body

async def main():
    """Issue 100 GET requests through ``fetch_limited`` and report the count."""
    urls = []
    for i in range(100):
        urls.append(f"https://httpbin.org/get?id={i}")
    async with aiohttp.ClientSession() as session:
        # All 100 coroutines are handed to gather() at once.
        pending = [fetch_limited(session, u) for u in urls]
        results = await asyncio.gather(*pending)
    print(f"完成 {len(results)} 个请求")

八、决策树：怎么选？

bash 复制代码

任务是 CPU 密集？
├── 是 → multiprocessing / ProcessPoolExecutor
│        （或考虑 numpy/numba/Cython 释放 GIL）
└── 否（I/O 密集）
    ├── 已有同步库（如 requests、psycopg2）
    │   └── ThreadPoolExecutor（改造成本低）
    └── 有异步库（aiohttp、asyncpg、httpx）
        └── asyncio（单机最高吞吐）

九、常见陷阱

多进程下全局变量不共享 ：每个进程有独立内存，需用 Manager 或 Queue
协程中调用阻塞函数会阻塞整个事件循环 ：用 loop.run_in_executor 或 asyncio.to_thread
```python
result = await asyncio.to_thread(blocking_io_func, arg)
```
线程池中异常不会自动抛出 ：submit 提交的任务出错时，异常被保存在 Future 中，必须调用 future.result() 才会抛出（as_completed 只负责按完成顺序产出 future，仍需逐个调用 result()）；使用 pool.map 时则在遍历结果时抛出
ProcessPoolExecutor 任务必须可 pickle：lambda、嵌套函数会报错
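asyncio.gather 中一个失败不会取消其他 ：默认（return_exceptions=False）会把第一个异常立即抛给调用方，其余任务仍在后台继续运行，但结果无法再通过该次 gather 取得；用 return_exceptions=True 收集所有结果，若需要“一个失败即取消其余”，可改用 Python 3.11+ 的 asyncio.TaskGroup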

十、Python 3.13+ 自由线程（No-GIL）展望

Python 3.13 引入实验性的自由线程（free-threaded）构建（PEP 703，编译 CPython 时通过 --disable-gil 配置选项启用），并自 3.14 起成为官方支持的构建；未来多线程也能真正并行执行 CPU 密集任务。但短期内生态兼容性仍是问题，多进程仍是 CPU 并行的稳妥选择。