Python多线程与多进程编程实战指南

引言：理解并发与并行

在处理大数据或高并发场景时，我们需要让程序同时做多件事情。Python提供了两种主要方式：多线程（threading）和多进程（multiprocessing）。

示例代码（Python）：

# 一个直观的例子：同步 vs 多线程 vs 多进程
import time

def task(n):
    """Simulate a CPU-heavy job: return the sum of i*i for i in 0..n-1."""
    return sum(i * i for i in range(n))

# Synchronous baseline: the four jobs run back to back on a single thread.
start = time.time()
for _run in range(4):
    task(10_000_000)
elapsed = time.time() - start
print(f"同步执行耗时: {elapsed:.2f}秒")

# 多线程执行：各线程同时启动，但受GIL限制，CPU密集型代码实际上仍是交替执行（对比代码见3.1节）
# 多进程执行：每个进程拥有独立的解释器和GIL，可以在多核上真正并行执行（对比代码见3.1节）

一、多线程编程

1.1 创建和启动线程

示例代码（Python）：

import threading
import time

print("=== 多线程基础 ===")

def worker(name, delay):
    """Announce the start, sleep for ``delay`` seconds, then announce completion.

    Returns the string ``"<name>的结果"``.
    """
    start_msg = f"[{name}] 开始工作，需要 {delay} 秒"
    print(start_msg)
    time.sleep(delay)
    done_msg = f"[{name}] 工作完成"
    print(done_msg)
    return f"{name}的结果"

# 方法1：直接创建线程
def basic_threads():
    """Run three ``worker`` calls on separate threads and wait for all of them."""
    print("方法1：直接创建线程")

    # (name, delay in seconds) for each thread, in start order.
    jobs = [("工人1", 2), ("工人2", 1), ("工人3", 3)]
    pool = [threading.Thread(target=worker, args=job) for job in jobs]

    for t in pool:
        t.start()

    # join() blocks until the corresponding thread has finished.
    for t in pool:
        t.join()

    print("所有线程工作完成")

# 方法2：继承Thread类
class MyThread(threading.Thread):
    """Thread subclass that sleeps for ``delay`` seconds and stores a result string."""

    def __init__(self, name, delay):
        # Thread.__init__ must run before the ``name`` property can be assigned.
        super().__init__()
        self.name = name
        self.delay = delay
        self.result = None  # filled in by run()

    def run(self):
        """Body executed in the new thread once ``start()`` is called."""
        label = self.name
        print(f"[{label}] 开始工作")
        time.sleep(self.delay)
        self.result = f"{label}完成任务"
        print(f"[{label}] 工作完成")

    def get_result(self):
        """Return the stored result, or ``None`` if ``run()`` has not finished."""
        return self.result

def class_based_threads():
    """Start three ``MyThread`` instances, then join each one and print its result."""
    print("\n方法2：继承Thread类")

    started = []
    for idx in range(1, 4):
        # Thread number doubles as its delay in seconds.
        t = MyThread(f"线程{idx}", idx)
        t.start()
        started.append(t)

    # Join in start order, printing each result as soon as that thread is done.
    for t in started:
        t.join()
        print(f"结果: {t.get_result()}")

# Run both thread demos in order, then print a separator line.
basic_threads()
class_based_threads()

print("\n" + "="*50)

1.2 线程同步与通信

示例代码（Python）：

import threading
import queue
import time

print("=== 线程同步与通信 ===")

# 1. 线程锁 - 防止数据竞争
class BankAccount:
    """Bank account whose balance updates are serialized with a lock."""

    def __init__(self, balance=0):
        self.balance = balance
        self.lock = threading.Lock()
        self.transactions = []  # human-readable log, one entry per successful operation

    def deposit(self, amount, thread_name):
        """Add ``amount`` to the balance and log it under ``thread_name``."""
        with self.lock:  # released automatically when the block exits
            before = self.balance
            after = before + amount
            self.balance = after
            self.transactions.append(f"{thread_name}: 存款 {amount}")
            print(f"{thread_name}: 存款 {amount}, 余额: {before} -> {self.balance}")

    def withdraw(self, amount, thread_name):
        """Take ``amount`` out if the balance covers it.

        Returns ``True`` on success and ``False`` when funds are insufficient.
        """
        with self.lock:
            # Guard clause: refuse the withdrawal without touching the balance.
            if not (self.balance >= amount):
                print(f"{thread_name}: 取款失败，余额不足")
                return False

            before = self.balance
            self.balance = before - amount
            self.transactions.append(f"{thread_name}: 取款 {amount}")
            print(f"{thread_name}: 取款 {amount}, 余额: {before} -> {self.balance}")
            return True

def test_bank_account():
    """Drive one shared ``BankAccount`` from three concurrent customer threads."""
    account = BankAccount(1000)

    def customer_operations(name, operations):
        # Replay one customer's scripted operations, pausing briefly after each.
        for op_type, amount in operations:
            action = account.deposit if op_type == "deposit" else account.withdraw
            action(amount, name)
            time.sleep(0.1)

    # Each entry is (customer name, [(operation, amount), ...]).
    customers = [
        ("张三", [("deposit", 200), ("withdraw", 300)]),
        ("李四", [("deposit", 500), ("withdraw", 800)]),
        ("王五", [("withdraw", 400), ("deposit", 600)])
    ]

    workers = []
    for entry in customers:
        t = threading.Thread(target=customer_operations, args=entry)
        t.start()
        workers.append(t)

    for t in workers:
        t.join()

    print(f"\n最终余额: {account.balance}")
    print("交易记录:", account.transactions)

# 2. 队列 - 线程间安全通信
def producer_consumer_demo(produce_delay=0.5, consume_delay=1):
    """Run a two-producer / two-consumer demo over a bounded ``queue.Queue``.

    Args:
        produce_delay: Seconds a producer sleeps after putting each item.
        consume_delay: Seconds a consumer sleeps while handling each item.

    Shutdown protocol: producers never send the stop signal themselves, because
    a fast producer would otherwise stop the consumers while a slower producer
    is still working. The main thread waits for every producer, then queues
    exactly one ``None`` per consumer. Every ``get()`` is matched by a
    ``task_done()`` (including the stop signal), so ``q.join()`` can return.
    """
    print("\n生产者-消费者模式演示")

    q = queue.Queue(maxsize=5)  # bounded: put() blocks while 5 items are waiting

    def producer(name, items):
        """Put ``items`` products on the queue, pausing after each one."""
        for i in range(items):
            item = f"{name}的产品-{i}"
            q.put(item)  # blocks while the queue is full
            print(f"[生产者{name}] 生产: {item}")
            time.sleep(produce_delay)

    def consumer(name):
        """Handle items until the ``None`` stop signal arrives."""
        while True:
            item = q.get()  # blocks while the queue is empty
            try:
                if item is None:
                    print(f"[消费者{name}] 结束")
                    return
                print(f"[消费者{name}] 消费: {item}")
                time.sleep(consume_delay)
            finally:
                # Must run for every get(), otherwise q.join() never returns.
                q.task_done()

    producers = [
        threading.Thread(target=producer, args=("A", 5)),
        threading.Thread(target=producer, args=("B", 3))
    ]

    consumers = [
        threading.Thread(target=consumer, args=("1",)),
        threading.Thread(target=consumer, args=("2",))
    ]

    for p in producers:
        p.start()

    for c in consumers:
        c.start()

    # All real items are queued once every producer has finished.
    for p in producers:
        p.join()

    # One stop signal per consumer; FIFO order puts them after all real items.
    for _ in consumers:
        q.put(None)

    # Wait until every queued entry has been handled, then for the threads.
    q.join()
    for c in consumers:
        c.join()

    print("所有任务完成")

# Run the lock demo and the queue demo in order, then print a separator line.
test_bank_account()
producer_consumer_demo()

print("\n" + "="*50)

1.3 线程池

示例代码（Python）：

from concurrent.futures import ThreadPoolExecutor
import time

print("=== 线程池 ===")

def process_item(item, delay=1):
    """Handle one item and return ``"<item>_processed"``.

    Args:
        item: The value to process; it is only formatted into strings.
        delay: Seconds to sleep to simulate slow work. Defaults to 1, which
            matches the previously hard-coded value.
    """
    print(f"开始处理: {item}")
    time.sleep(delay)  # stand-in for a slow operation
    result = f"{item}_processed"
    print(f"处理完成: {item} -> {result}")
    return result

def thread_pool_demo():
    """Process ten items on a three-worker thread pool and print a timing summary."""
    items = [f"item_{i}" for i in range(10)]

    print(f"开始处理 {len(items)} 个项目")
    start_time = time.time()

    with ThreadPoolExecutor(max_workers=3) as executor:
        pending = [executor.submit(process_item, entry) for entry in items]
        # Futures are read in submission order, so results line up with items.
        results = [fut.result() for fut in pending]

    elapsed = time.time() - start_time
    print(f"\n所有项目处理完成!")
    print(f"总耗时: {elapsed:.2f}秒")
    print(f"处理了 {len(results)} 个项目")
    print(f"结果样例: {results[:3]}...")

# 高级特性：回调函数
def callback_demo():
    """Submit five squaring jobs to a two-worker pool and print each result from a done-callback."""
    print("\n线程池回调函数演示")

    def square(n):
        # The job itself: n squared.
        return n * n

    def report(future):
        # Invoked with the finished future once its job is done.
        result = future.result()
        print(f"回调: 任务完成，结果={result}")

    with ThreadPoolExecutor(max_workers=2) as executor:
        for i in range(5):
            executor.submit(square, i).add_done_callback(report)

    # Leaving the with-block waits for all submitted jobs to finish.
    print("所有回调执行完成")

# Run both thread-pool demos in order, then print a separator line.
thread_pool_demo()
callback_demo()

print("\n" + "="*50)

二、多进程编程

2.1 创建进程

示例代码（Python）：

import multiprocessing
import time
import os

print("=== 多进程基础 ===")

def worker_process(name, delay):
    """Print start/finish messages tagged with this process's PID around a sleep.

    Returns the string ``"<name>的结果"``.
    """
    tag = f"[进程{os.getpid()}]"
    print(f"{tag} {name} 开始工作，需要 {delay} 秒")
    time.sleep(delay)
    print(f"{tag} {name} 工作完成")
    return f"{name}的结果"

def basic_processes():
    """Start three ``worker_process`` child processes and wait for all of them."""
    print("创建多个进程")

    children = []
    for idx in range(1, 4):
        # Worker number doubles as its delay in seconds.
        child = multiprocessing.Process(
            target=worker_process,
            args=(f"工人{idx}", idx),
        )
        children.append(child)
        child.start()

    for child in children:
        child.join()

    print("所有进程工作完成")

def _cpu_intensive_task(n):
    """CPU-bound job: return the sum of i*i for i in 0..n-1 and print the worker PID.

    Defined at module level because ``Pool.map`` pickles the callable to send
    it to the workers, and functions nested inside another function cannot be
    pickled.
    """
    result = 0
    for i in range(n):
        result += i * i
    pid = os.getpid()
    print(f"[进程{pid}] 完成计算 n={n}")
    return result


def process_pool_demo(numbers=None):
    """Map a CPU-bound job over ``numbers`` on a two-process pool and print timings.

    Args:
        numbers: Iterable of loop sizes to compute. Defaults to
            ``[1000000, 2000000, 3000000, 4000000]``.
    """
    print("\n使用进程池")

    if numbers is None:
        numbers = [1000000, 2000000, 3000000, 4000000]

    start_time = time.time()

    with multiprocessing.Pool(processes=2) as pool:
        results = pool.map(_cpu_intensive_task, numbers)

    end_time = time.time()

    print(f"计算完成!")
    print(f"结果: {results}")
    print(f"总耗时: {end_time - start_time:.2f}秒")

# Run the demos only when executed as a script. With the "spawn" and
# "forkserver" start methods every child process re-imports this module, so
# unguarded top-level calls would try to start new processes during import.
if __name__ == "__main__":
    basic_processes()
    process_pool_demo()

    print("\n" + "="*50)

2.2 进程间通信

示例代码（Python）：

import multiprocessing
import time

print("=== 进程间通信 ===")

# 1. 队列通信
def _queue_producer(q, items, consumer_count, delay):
    """Producer process body: send every item, then one stop signal per consumer."""
    for item in items:
        print(f"[生产者] 发送: {item}")
        q.put(item)
        time.sleep(delay)
    # One None per consumer, so nobody has to re-queue the signal and no stray
    # item is left in the queue at exit.
    for _ in range(consumer_count):
        q.put(None)


def _queue_consumer(q, name, delay):
    """Consumer process body: handle items until the ``None`` stop signal arrives."""
    while True:
        item = q.get()
        if item is None:
            print(f"[消费者{name}] 结束")
            break
        print(f"[消费者{name}] 收到: {item}")
        time.sleep(delay)


def queue_communication(produce_delay=0.5, consume_delay=1):
    """Pass five items from one producer process to two consumer processes.

    The process targets are module-level functions because the "spawn" and
    "forkserver" start methods pickle the target, and nested functions cannot
    be pickled.

    Args:
        produce_delay: Seconds the producer sleeps after sending each item.
        consume_delay: Seconds a consumer sleeps after receiving each item.
    """
    print("1. 队列通信演示")

    q = multiprocessing.Queue(maxsize=3)
    consumer_names = ["1", "2"]

    producer_p = multiprocessing.Process(
        target=_queue_producer,
        args=(q, ["A", "B", "C", "D", "E"], len(consumer_names), produce_delay)
    )
    consumer_ps = [
        multiprocessing.Process(target=_queue_consumer, args=(q, name, consume_delay))
        for name in consumer_names
    ]

    producer_p.start()
    for c in consumer_ps:
        c.start()

    producer_p.join()
    for c in consumer_ps:
        c.join()

    print("队列通信完成")

# 2. 管道通信
def _pipe_process_a(conn):
    """Process A: send a greeting, wait for the reply, then close its pipe end."""
    conn.send("Hello from Process A")
    data = conn.recv()
    print(f"[Process A] 收到: {data}")
    conn.close()


def _pipe_process_b(conn):
    """Process B: wait for a greeting, reply, then close its pipe end."""
    data = conn.recv()
    print(f"[Process B] 收到: {data}")
    conn.send("Hello from Process B")
    conn.close()


def pipe_communication():
    """Exchange one message in each direction between two processes over a pipe.

    The process targets are module-level functions because the "spawn" and
    "forkserver" start methods pickle the target, and nested functions cannot
    be pickled.
    """
    print("\n2. 管道通信演示")

    # Pipe() is duplex by default: each end can both send and receive.
    parent_conn, child_conn = multiprocessing.Pipe()

    p1 = multiprocessing.Process(target=_pipe_process_a, args=(parent_conn,))
    p2 = multiprocessing.Process(target=_pipe_process_b, args=(child_conn,))

    p1.start()
    p2.start()

    p1.join()
    p2.join()

    # The children closed their own copies; release this process's handles too.
    parent_conn.close()
    child_conn.close()

    print("管道通信完成")

# 3. 共享内存
def _shared_memory_worker(shared_value, shared_array, lock, worker_id):
    """Worker process body: bump the shared counter and overwrite the shared array.

    Both updates happen under ``lock`` so the counter increment is not lost and
    the array always holds one worker's complete row rather than a mix of
    values written by different workers.
    """
    with lock:
        shared_value.value += 1
        print(f"[Worker {worker_id}] 共享值: {shared_value.value}")

        for i in range(len(shared_array)):
            shared_array[i] = worker_id * 10 + i


def shared_memory_demo():
    """Share an integer counter and a 5-int array between three worker processes.

    The process target is a module-level function because the "spawn" and
    "forkserver" start methods pickle the target, and nested functions cannot
    be pickled.
    """
    print("\n3. 共享内存演示")

    shared_value = multiprocessing.Value('i', 0)  # 'i' = C int, initial value 0
    shared_array = multiprocessing.Array('i', 5)  # five C ints
    lock = multiprocessing.Lock()

    processes = []
    for worker_id in range(1, 4):
        p = multiprocessing.Process(
            target=_shared_memory_worker,
            args=(shared_value, shared_array, lock, worker_id)
        )
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

    print(f"最终共享值: {shared_value.value}")
    print(f"共享数组: {list(shared_array)}")

# Run the demos only when executed as a script. With the "spawn" and
# "forkserver" start methods every child process re-imports this module, so
# unguarded top-level calls would try to start new processes during import.
if __name__ == "__main__":
    queue_communication()
    pipe_communication()
    shared_memory_demo()

    print("\n" + "="*50)

三、选择指南：线程 vs 进程

3.1 性能对比

示例代码（Python）：

import threading
import multiprocessing
import time
import os

def cpu_bound_task(n):
    """CPU-bound job: return the sum of i*i for i in 0..n-1."""
    return sum(i * i for i in range(n))

def io_bound_task(duration):
    """I/O-bound stand-in: block for ``duration`` seconds, then hand it back.

    The sleep plays the role of waiting on a network or disk operation.
    """
    time.sleep(duration)

    return duration

def _time_sequential(target, arg, count):
    """Return the seconds taken to call ``target(arg)`` ``count`` times in a row."""
    start = time.perf_counter()
    for _ in range(count):
        target(arg)
    return time.perf_counter() - start


def _time_concurrent(worker_cls, target, arg, count):
    """Return the seconds taken to run ``target(arg)`` on ``count`` concurrent workers.

    ``worker_cls`` is ``threading.Thread`` or ``multiprocessing.Process``; both
    accept ``target=``/``args=`` and provide ``start()`` and ``join()``.
    """
    start = time.perf_counter()
    workers = []
    for _ in range(count):
        w = worker_cls(target=target, args=(arg,))
        w.start()
        workers.append(w)

    for w in workers:
        w.join()
    return time.perf_counter() - start


def compare_performance():
    """Time CPU-bound and I/O-bound jobs run sequentially, on threads and on processes.

    Prints the measured times, the resulting speed-up ratios, and a static
    list of guidelines for choosing between threads and processes. Timings use
    ``time.perf_counter()``, which is monotonic and intended for measuring
    short durations.
    """
    print("=== 线程 vs 进程 性能对比 ===")

    # Benchmark configuration
    task_count = 4
    cpu_task_size = 10000000
    io_task_duration = 1

    # 1. CPU-bound comparison
    print("\n1. CPU密集型任务对比:")

    single_time = _time_sequential(cpu_bound_task, cpu_task_size, task_count)
    print(f"  单线程: {single_time:.2f}秒")

    # Because of the GIL, threads may not be any faster here.
    thread_time = _time_concurrent(
        threading.Thread, cpu_bound_task, cpu_task_size, task_count)
    print(f"  多线程({task_count}线程): {thread_time:.2f}秒")

    process_time = _time_concurrent(
        multiprocessing.Process, cpu_bound_task, cpu_task_size, task_count)
    print(f"  多进程({task_count}进程): {process_time:.2f}秒")

    print(f"  CPU任务结论: 多进程比多线程快 {thread_time/process_time:.1f}倍")

    # 2. I/O-bound comparison
    print("\n2. I/O密集型任务对比:")

    single_io_time = _time_sequential(io_bound_task, io_task_duration, task_count)
    print(f"  单线程: {single_io_time:.2f}秒")

    thread_io_time = _time_concurrent(
        threading.Thread, io_bound_task, io_task_duration, task_count)
    print(f"  多线程({task_count}线程): {thread_io_time:.2f}秒")

    process_io_time = _time_concurrent(
        multiprocessing.Process, io_bound_task, io_task_duration, task_count)
    print(f"  多进程({task_count}进程): {process_io_time:.2f}秒")

    print(f"  I/O任务结论: 多线程比单线程快 {single_io_time/thread_io_time:.1f}倍")

    # 3. Memory notes
    print("\n3. 内存使用注意事项:")
    print("  - 线程: 共享内存，内存使用少，但需要处理线程安全问题")
    print("  - 进程: 独立内存空间，内存使用多，但不需要处理线程安全问题")
    print("  - 线程创建开销: 小")
    print("  - 进程创建开销: 大")

    # 4. Selection guide
    print("\n4. 选择指南:")
    print("  使用多线程的场景:")
    print("    ✓ I/O密集型任务（网络请求、文件读写）")
    print("    ✓ 需要共享数据")
    print("    ✓ 任务执行时间短")

    print("\n  使用多进程的场景:")
    print("    ✓ CPU密集型任务（科学计算、图像处理）")
    print("    ✓ 需要进程隔离（一个进程崩溃不影响其他进程）")
    print("    ✓ 需要利用多核CPU")

    print("\n  通用建议:")
    print("    - 优先使用concurrent.futures模块")
    print("    - I/O密集型：使用ThreadPoolExecutor")
    print("    - CPU密集型：使用ProcessPoolExecutor")
    print("    - 任务数量多但简单：使用线程池/进程池")

# Run the comparison only when executed as a script. With the "spawn" and
# "forkserver" start methods every child process re-imports this module, so an
# unguarded top-level call would try to start new processes during import.
if __name__ == "__main__":
    compare_performance()

    print("\n" + "="*50)

3.2 实战案例：并行下载管理器

示例代码（Python）：

import concurrent.futures
import requests
import os
import time

class ParallelDownloader:
    """Download a list of URLs concurrently on a thread pool.

    Files are written into ``download_dir``. Every download is reported as a
    result dict (see ``download_file``).
    """

    def __init__(self, max_workers=5, download_dir="downloads"):
        """Store the settings and make sure the download directory exists.

        Args:
            max_workers: Maximum number of concurrent download threads.
            download_dir: Directory the files are saved into.
        """
        self.max_workers = max_workers
        self.download_dir = download_dir

        # exist_ok=True replaces an exists()-then-makedirs() pair, which could
        # raise FileExistsError if the directory appeared between the two calls.
        os.makedirs(download_dir, exist_ok=True)

    @staticmethod
    def _filename_from_url(url):
        """Derive a local file name from the last path segment of ``url``.

        The query string and fragment are ignored. When the path has no final
        segment, the host name is used, and ``"download"`` as a last resort.
        """
        from urllib.parse import urlsplit  # local import keeps the block self-contained

        parts = urlsplit(url)
        name = parts.path.rsplit("/", 1)[-1]
        return name or parts.netloc or "download"

    def download_file(self, url, filename=None):
        """Download one URL into the download directory.

        Args:
            url: Address to fetch.
            filename: Name of the local file; derived from ``url`` when omitted.

        Returns:
            A dict with ``url``, ``filename`` and ``success``. On success it
            also has ``size`` (bytes), ``time`` (seconds) and ``speed``
            (KB/s); on failure it has ``error`` (the exception text).
        """
        if filename is None:
            filename = self._filename_from_url(url)

        filepath = os.path.join(self.download_dir, filename)

        try:
            print(f"开始下载: {url}")
            start_time = time.perf_counter()

            # The with-block returns the streamed connection to the pool even
            # when an error interrupts the transfer.
            with requests.get(url, stream=True, timeout=10) as response:
                response.raise_for_status()

                total_size = int(response.headers.get('content-length', 0))
                downloaded = 0

                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)

                        # Progress is only meaningful when the size is known.
                        if total_size > 0:
                            percent = downloaded * 100 / total_size
                            print(f"  {filename}: {percent:.1f}%", end='\r')

            download_time = time.perf_counter() - start_time

            file_size = os.path.getsize(filepath)
            # Guard the division: a zero elapsed time must not turn a
            # successful download into a reported failure.
            speed = file_size / download_time / 1024 if download_time > 0 else 0.0  # KB/s

            print(f"下载完成: {filename} ({file_size/1024:.1f}KB, {speed:.1f}KB/s)")
            return {
                'url': url,
                'filename': filename,
                'success': True,
                'size': file_size,
                'time': download_time,
                'speed': speed
            }

        except Exception as e:
            # Deliberately broad: this is the per-task boundary, and one failed
            # download must be reported rather than abort the whole batch.
            print(f"下载失败 {url}: {e}")
            return {
                'url': url,
                'filename': filename,
                'success': False,
                'error': str(e)
            }

    def parallel_download(self, url_list):
        """Download every URL in ``url_list`` concurrently and print a summary.

        Returns:
            The list of per-file result dicts, in completion order.
        """
        print(f"开始并行下载 {len(url_list)} 个文件")
        print(f"工作线程数: {self.max_workers}")
        print("-" * 50)

        start_time = time.perf_counter()
        results = []

        # Downloads are I/O-bound, so a thread pool is the right tool.
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_url = {
                executor.submit(self.download_file, url): url
                for url in url_list
            }

            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    results.append(future.result())
                except Exception as e:
                    print(f"任务异常 {url}: {e}")
                    results.append({
                        'url': url,
                        'success': False,
                        'error': str(e)
                    })

        total_time = time.perf_counter() - start_time

        successful = [r for r in results if r['success']]
        failed = [r for r in results if not r['success']]

        print("\n" + "="*50)
        print("下载完成!")
        print(f"总耗时: {total_time:.2f}秒")
        print(f"成功: {len(successful)} 个")
        print(f"失败: {len(failed)} 个")

        if successful:
            total_size = sum(r['size'] for r in successful)
            avg_speed = sum(r['speed'] for r in successful) / len(successful)
            print(f"总大小: {total_size/1024/1024:.2f}MB")
            print(f"平均速度: {avg_speed:.1f}KB/s")

        if failed:
            print("\n失败的文件:")
            for result in failed:
                print(f"  {result['url']}: {result.get('error', '未知错误')}")

        return results

# 模拟使用
def demo_downloader():
    """Fetch a fixed set of httpbin.org sample resources with three workers.

    Returns the list of per-file result dicts from ``parallel_download``.
    """
    print("=== 并行下载管理器演示 ===")

    # Public test endpoints; replace with real URLs for actual use.
    test_urls = [
        "https://httpbin.org/image/jpeg",  # sample image
        "https://httpbin.org/image/png",
        "https://httpbin.org/image/svg",
        "https://httpbin.org/html",        # sample HTML
        "https://httpbin.org/robots.txt",  # sample text
        "https://httpbin.org/xml"          # sample XML
    ]

    manager = ParallelDownloader(max_workers=3, download_dir="test_downloads")
    return manager.parallel_download(test_urls)

# Intro banner for the downloader case study. Only text is printed here;
# demo_downloader() is not called anywhere in the visible code.
print("实战案例：并行下载管理器")
print("这个管理器可以同时下载多个文件，提高下载效率")
print("注意：演示代码中使用的是测试URL，实际使用时需要替换为真实URL")

print("\n" + "="*50)

总结：最佳实践指南

关键决策点：

CPU密集型任务 → 使用多进程
I/O密集型任务 → 使用多线程
需要数据共享 → 多线程或共享内存
需要进程隔离 → 多进程
不确定任务类型 → 先测试，后选择

最佳实践：

使用高层API：优先使用concurrent.futures模块
控制并发数：合理设置线程/进程数量
处理异常：确保异常不会导致程序崩溃
资源清理：使用with语句确保资源正确释放
性能监控：记录执行时间和资源使用情况

常见陷阱：

死锁：多个锁相互等待
竞态条件：多个线程同时修改共享数据
内存泄漏：忘记释放资源
过度并发：创建过多线程/进程导致系统卡顿