引言:理解并发与并行
在处理大数据或高并发场景时,我们需要让程序同时做多件事情。Python提供了两种主要方式:多线程(threading)和多进程(multiprocessing)。
python
# 一个直观的例子:同步 vs 多线程 vs 多进程
import time
def task(n):
    """Simulate a CPU-heavy job: return the sum of i*i for i in 0..n-1."""
    return sum(i * i for i in range(n))
# Synchronous baseline: the four tasks run one after another on the main thread.
start = time.time()
for workload in [10000000] * 4:
    task(workload)
elapsed = time.time() - start
print(f"同步执行耗时: {elapsed:.2f}秒")
# 多线程执行(同时开始,但由于GIL限制,实际还是交替执行)
# 多进程执行(真正的同时执行)
一、多线程编程
1.1 创建和启动线程
python
import threading
import time
print("=== 多线程基础 ===")
def worker(name, delay):
    """Log the start, block for ``delay`` seconds, log the end.

    Returns the string ``"<name>的结果"``.
    """
    start_msg = f"[{name}] 开始工作,需要 {delay} 秒"
    print(start_msg)
    time.sleep(delay)
    done_msg = f"[{name}] 工作完成"
    print(done_msg)
    return "{}的结果".format(name)
# 方法1:直接创建线程
def basic_threads():
    """Run three ``worker`` jobs on separate threads and wait for all of them."""
    print("方法1:直接创建线程")
    # (name, delay-in-seconds) for each job.
    jobs = [("工人1", 2), ("工人2", 1), ("工人3", 3)]
    pool = [threading.Thread(target=worker, args=job) for job in jobs]
    # Start everything before joining anything so the sleeps overlap.
    for t in pool:
        t.start()
    for t in pool:
        t.join()
    print("所有线程工作完成")
# 方法2:继承Thread类
class MyThread(threading.Thread):
    """Thread subclass that sleeps for ``delay`` seconds and stores a result string."""

    def __init__(self, name, delay):
        super().__init__()
        # Thread.name is a settable property; this replaces the auto-generated name.
        self.name = name
        self.result = None  # populated by run()
        self.delay = delay

    def run(self):
        """Thread body: log, sleep for ``self.delay``, record the result, log again."""
        label = self.name
        print(f"[{label}] 开始工作")
        time.sleep(self.delay)
        self.result = f"{label}完成任务"
        print(f"[{label}] 工作完成")

    def get_result(self):
        """Return the stored result (``None`` until ``run`` has finished)."""
        return self.result
def class_based_threads():
    """Start three ``MyThread`` workers, then join each one and print its result."""
    print("\n方法2:继承Thread类")
    started = []
    for number in range(1, 4):
        # Thread N sleeps for N seconds.
        t = MyThread(f"线程{number}", number)
        t.start()
        started.append(t)
    # Join in start order; results are printed as each join returns.
    for t in started:
        t.join()
        print(f"结果: {t.get_result()}")
# 运行演示
# Run both threading demos in order, then print a separator line.
for run_demo in (basic_threads, class_based_threads):
    run_demo()
print("\n" + "=" * 50)
1.2 线程同步与通信
python
import threading
import queue
import time
print("=== 线程同步与通信 ===")
# 1. 线程锁 - 防止数据竞争
class BankAccount:
    """Bank account whose balance and transaction log are guarded by a single lock."""

    def __init__(self, balance=0):
        self.lock = threading.Lock()
        self.transactions = []  # human-readable log, one entry per successful operation
        self.balance = balance

    def deposit(self, amount, thread_name):
        """Add ``amount`` to the balance and log it under ``thread_name``."""
        with self.lock:  # acquired and released automatically
            before = self.balance
            self.balance += amount
            self.transactions.append(f"{thread_name}: 存款 {amount}")
            print(f"{thread_name}: 存款 {amount}, 余额: {before} -> {self.balance}")

    def withdraw(self, amount, thread_name):
        """Remove ``amount`` if the balance covers it.

        Returns ``True`` on success, ``False`` (with nothing logged to
        ``transactions``) when funds are insufficient.
        """
        with self.lock:
            # Guard clause: the check and the update share one critical section.
            if not self.balance >= amount:
                print(f"{thread_name}: 取款失败,余额不足")
                return False
            before = self.balance
            self.balance -= amount
            self.transactions.append(f"{thread_name}: 取款 {amount}")
            print(f"{thread_name}: 取款 {amount}, 余额: {before} -> {self.balance}")
            return True
def test_bank_account():
    """Drive one shared ``BankAccount`` from three concurrent customer threads."""
    account = BankAccount(1000)

    def customer_operations(name, operations):
        # Replay this customer's (operation, amount) pairs, pausing 0.1 s after each.
        for kind, amount in operations:
            perform = account.deposit if kind == "deposit" else account.withdraw
            perform(amount, name)
            time.sleep(0.1)

    # Each customer has an independent script of operations on the shared account.
    customers = [
        ("张三", [("deposit", 200), ("withdraw", 300)]),
        ("李四", [("deposit", 500), ("withdraw", 800)]),
        ("王五", [("withdraw", 400), ("deposit", 600)]),
    ]

    running = []
    for name, operations in customers:
        t = threading.Thread(target=customer_operations, args=(name, operations))
        t.start()
        running.append(t)
    for t in running:
        t.join()

    print(f"\n最终余额: {account.balance}")
    print("交易记录:", account.transactions)
# 2. 队列 - 线程间安全通信
def producer_consumer_demo(produce_delay=0.5, consume_delay=1):
    """Run a bounded-queue producer/consumer demo and return once all work is done.

    Two producer threads feed a ``queue.Queue(maxsize=5)`` and two consumer
    threads drain it. Shutdown is coordinated by the calling thread: only
    after every producer has finished does it enqueue one ``None`` sentinel
    per consumer. Every ``get()`` is matched by a ``task_done()``, sentinels
    included, so ``q.join()`` can return.

    Args:
        produce_delay: Seconds a producer sleeps after each ``put``.
        consume_delay: Seconds a consumer sleeps while "processing" an item.
    """
    print("\n生产者-消费者模式演示")
    q = queue.Queue(maxsize=5)  # put() blocks while five items are pending

    def producer(name, items):
        """Put ``items`` products on the queue, one every ``produce_delay`` seconds."""
        for i in range(items):
            item = f"{name}的产品-{i}"
            q.put(item)  # blocks while the queue is full
            print(f"[生产者{name}] 生产: {item}")
            time.sleep(produce_delay)

    def consumer(name):
        """Consume items until a ``None`` sentinel arrives."""
        while True:
            item = q.get()  # blocks while the queue is empty
            try:
                if item is None:
                    print(f"[消费者{name}] 结束")
                    break
                print(f"[消费者{name}] 消费: {item}")
                time.sleep(consume_delay)
            finally:
                # Also runs on the sentinel path; without it q.join() never returns.
                q.task_done()

    producers = [
        threading.Thread(target=producer, args=("A", 5)),
        threading.Thread(target=producer, args=("B", 3)),
    ]
    consumers = [
        threading.Thread(target=consumer, args=("1",)),
        threading.Thread(target=consumer, args=("2",)),
    ]

    for t in producers + consumers:
        t.start()

    # Wait until nothing more will be produced ...
    for p in producers:
        p.join()
    # ... and only then tell each consumer to stop. Sending the sentinel from
    # a producer would stop consumers while another producer is still active.
    for _ in consumers:
        q.put(None)

    q.join()  # every item and every sentinel has been acknowledged
    for c in consumers:
        c.join()
    print("所有任务完成")
# 运行演示
# Run the lock demo, then the queue demo, then print a separator line.
for run_demo in (test_bank_account, producer_consumer_demo):
    run_demo()
print("\n" + "=" * 50)
1.3 线程池
python
from concurrent.futures import ThreadPoolExecutor
import time
print("=== 线程池 ===")
def process_item(item, delay=1):
    """Pretend to process ``item`` and return ``"<item>_processed"``.

    Args:
        item: Value to process; it is only interpolated into strings.
        delay: Seconds to sleep to simulate work. Defaults to the original
            hard-coded 1 second, so existing callers are unaffected.
    """
    print(f"开始处理: {item}")
    time.sleep(delay)  # stand-in for a slow, blocking operation
    result = f"{item}_processed"
    print(f"处理完成: {item} -> {result}")
    return result
def thread_pool_demo():
    """Process ten items on a three-worker thread pool and print timing statistics."""
    items = [f"item_{i}" for i in range(10)]
    print(f"开始处理 {len(items)} 个项目")
    start_time = time.time()

    # At most three items are processed at any moment.
    with ThreadPoolExecutor(max_workers=3) as executor:
        pending = [executor.submit(process_item, entry) for entry in items]
        # Results are gathered in submission order, not completion order.
        results = [job.result() for job in pending]

    elapsed = time.time() - start_time
    print("\n所有项目处理完成!")
    print(f"总耗时: {elapsed:.2f}秒")
    print(f"处理了 {len(results)} 个项目")
    print(f"结果样例: {results[:3]}...")
# 高级特性:回调函数
def callback_demo():
    """Submit five squaring tasks to a two-worker pool, reporting each via a done-callback."""
    print("\n线程池回调函数演示")

    def square(n):
        """The submitted task: return n squared."""
        return n * n

    def report(finished):
        """Done-callback: print the result of the finished future."""
        print(f"回调: 任务完成,结果={finished.result()}")

    with ThreadPoolExecutor(max_workers=2) as pool:
        for i in range(5):
            pool.submit(square, i).add_done_callback(report)
    # Leaving the with-block waits for all submitted tasks to finish.
    print("所有回调执行完成")
# 运行演示
# Run both thread-pool demos in order, then print a separator line.
for run_demo in (thread_pool_demo, callback_demo):
    run_demo()
print("\n" + "=" * 50)
二、多进程编程
2.1 创建进程
python
import multiprocessing
import time
import os
print("=== 多进程基础 ===")
def worker_process(name, delay):
    """Log start and end with the current process id, sleeping ``delay`` seconds between.

    Returns the string ``"<name>的结果"``.
    """
    tag = f"[进程{os.getpid()}] {name}"  # pid read once, reused for both messages
    print(f"{tag} 开始工作,需要 {delay} 秒")
    time.sleep(delay)
    print(f"{tag} 工作完成")
    return "{}的结果".format(name)
def basic_processes():
    """Start three ``worker_process`` children and wait for all of them to exit."""
    print("创建多个进程")
    children = []
    for number in range(1, 4):
        # Child N is named "工人N" and sleeps for N seconds.
        child = multiprocessing.Process(
            target=worker_process,
            args=(f"工人{number}", number),
        )
        child.start()
        children.append(child)
    for child in children:
        child.join()
    print("所有进程工作完成")
def cpu_intensive_task(n):
    """Return the sum of i*i for i in 0..n-1 and print which process computed it.

    Defined at module level on purpose: ``Pool.map`` pickles the callable to
    send it to the workers, and a function nested inside another function
    cannot be pickled.
    """
    result = sum(i * i for i in range(n))
    print(f"[进程{os.getpid()}] 完成计算 n={n}")
    return result


def process_pool_demo():
    """Map ``cpu_intensive_task`` over four workloads on a two-process pool and print timings."""
    print("\n使用进程池")
    numbers = [1000000, 2000000, 3000000, 4000000]
    start_time = time.time()

    # The with-block tears the pool down once map() has returned all results.
    with multiprocessing.Pool(processes=2) as pool:
        results = pool.map(cpu_intensive_task, numbers)

    end_time = time.time()
    print("计算完成!")
    print(f"结果: {results}")
    print(f"总耗时: {end_time - start_time:.2f}秒")
# 运行演示
if __name__ == "__main__":
    # With the spawn and forkserver start methods every child process
    # re-imports this module; without the guard each child would start the
    # demos again instead of running its target.
    basic_processes()
    process_pool_demo()
    print("\n" + "=" * 50)
2.2 进程间通信
python
import multiprocessing
import time
print("=== 进程间通信 ===")
# 1. 队列通信
def queue_producer(q, items, delay=0.5):
    """Producer process body: put each of ``items`` on ``q``, pausing ``delay`` seconds after each.

    Module-level so it can be pickled as a ``Process`` target under the
    spawn/forkserver start methods. It sends no end marker; the parent does
    that once the producer has exited.
    """
    for item in items:
        print(f"[生产者] 发送: {item}")
        q.put(item)
        time.sleep(delay)


def queue_consumer(q, name, delay=1):
    """Consumer process body: print items from ``q`` until a ``None`` sentinel arrives.

    ``delay`` is the simulated per-item processing time in seconds.
    """
    while True:
        item = q.get()
        if item is None:
            print(f"[消费者{name}] 结束")
            break
        print(f"[消费者{name}] 收到: {item}")
        time.sleep(delay)


def queue_communication():
    """Pass five items from one producer process to two consumer processes via a queue."""
    print("1. 队列通信演示")
    q = multiprocessing.Queue(maxsize=3)

    producer_p = multiprocessing.Process(
        target=queue_producer,
        args=(q, ["A", "B", "C", "D", "E"]),
    )
    consumers = [
        multiprocessing.Process(target=queue_consumer, args=(q, name))
        for name in ("1", "2")
    ]

    producer_p.start()
    for c in consumers:
        c.start()

    producer_p.join()
    # One sentinel per consumer, sent only after production has finished, so
    # no consumer stops early and no stray sentinel is left in the queue.
    for _ in consumers:
        q.put(None)
    for c in consumers:
        c.join()
    print("队列通信完成")
# 2. 管道通信
def pipe_process_a(conn):
    """Process A body: send a greeting, wait for the reply, print it, close ``conn``.

    Module-level so it can be pickled as a ``Process`` target under the
    spawn/forkserver start methods.
    """
    conn.send("Hello from Process A")
    data = conn.recv()
    print(f"[Process A] 收到: {data}")
    conn.close()


def pipe_process_b(conn):
    """Process B body: wait for a message, print it, reply, close ``conn``."""
    data = conn.recv()
    print(f"[Process B] 收到: {data}")
    conn.send("Hello from Process B")
    conn.close()


def pipe_communication():
    """Exchange one message in each direction between two processes over a duplex pipe."""
    print("\n2. 管道通信演示")
    # Pipe() returns the two connection objects of one pipe; each child gets one.
    parent_conn, child_conn = multiprocessing.Pipe()

    p1 = multiprocessing.Process(target=pipe_process_a, args=(parent_conn,))
    p2 = multiprocessing.Process(target=pipe_process_b, args=(child_conn,))

    p1.start()
    p2.start()
    p1.join()
    p2.join()
    print("管道通信完成")
# 3. 共享内存
def shared_memory_worker(shared_value, shared_array, lock, worker_id):
    """Worker process body: bump the shared counter and overwrite the shared array.

    Module-level so it can be pickled as a ``Process`` target under the
    spawn/forkserver start methods. All updates happen while holding ``lock``
    so the read-modify-write on ``shared_value`` is not interleaved with
    another worker.

    Args:
        shared_value: Object with a mutable integer ``value`` attribute.
        shared_array: Indexable, sized container of integers.
        lock: Context-manager lock shared by all workers.
        worker_id: Integer identifying this worker; slot ``i`` is set to
            ``worker_id * 10 + i``.
    """
    with lock:
        shared_value.value += 1
        print(f"[Worker {worker_id}] 共享值: {shared_value.value}")
        for i in range(len(shared_array)):
            shared_array[i] = worker_id * 10 + i


def shared_memory_demo():
    """Let three processes update a shared integer and a shared five-slot array, then print both."""
    print("\n3. 共享内存演示")
    shared_value = multiprocessing.Value('i', 0)  # 'i' = C int, initial value 0
    shared_array = multiprocessing.Array('i', 5)  # five C ints
    lock = multiprocessing.Lock()

    processes = []
    for worker_id in range(1, 4):
        p = multiprocessing.Process(
            target=shared_memory_worker,
            args=(shared_value, shared_array, lock, worker_id),
        )
        processes.append(p)
        p.start()
    for p in processes:
        p.join()

    print(f"最终共享值: {shared_value.value}")
    print(f"共享数组: {list(shared_array)}")
# 运行演示
if __name__ == "__main__":
    # With the spawn and forkserver start methods every child process
    # re-imports this module; without the guard each child would start the
    # demos again instead of running its target.
    queue_communication()
    pipe_communication()
    shared_memory_demo()
    print("\n" + "=" * 50)
三、选择指南:线程 vs 进程
3.1 性能对比
python
import threading
import multiprocessing
import time
import os
def cpu_bound_task(n):
    """CPU-bound workload: return the sum of i*i for i in 0..n-1."""
    return sum(i * i for i in range(n))
def io_bound_task(duration):
    """I/O-bound stand-in: block for ``duration`` seconds, then hand ``duration`` back."""
    time.sleep(duration)
    waited = duration
    return waited
def _timed_serial(func, arg, count):
    """Call ``func(arg)`` ``count`` times back to back; return the elapsed seconds."""
    started = time.perf_counter()
    for _ in range(count):
        func(arg)
    return time.perf_counter() - started


def _timed_workers(worker_cls, func, arg, count):
    """Run ``func(arg)`` in ``count`` concurrent workers; return the elapsed seconds.

    ``worker_cls`` is ``threading.Thread`` or ``multiprocessing.Process``; both
    accept ``target``/``args`` and provide ``start()``/``join()``.
    """
    started = time.perf_counter()
    workers = []
    for _ in range(count):
        w = worker_cls(target=func, args=(arg,))
        w.start()
        workers.append(w)
    for w in workers:
        w.join()
    return time.perf_counter() - started


def compare_performance():
    """Time CPU-bound and I/O-bound work run serially, on threads and on processes.

    Prints the timings followed by a fixed block of guidance. Intervals are
    measured with ``time.perf_counter()``, a monotonic clock intended for
    benchmarking, so system clock adjustments cannot skew them.
    """
    print("=== 线程 vs 进程 性能对比 ===")

    # Benchmark configuration.
    task_count = 4
    cpu_task_size = 10000000
    io_task_duration = 1

    # 1. CPU-bound comparison (threads are not expected to help because of the GIL).
    print("\n1. CPU密集型任务对比:")
    single_time = _timed_serial(cpu_bound_task, cpu_task_size, task_count)
    print(f" 单线程: {single_time:.2f}秒")
    thread_time = _timed_workers(threading.Thread, cpu_bound_task, cpu_task_size, task_count)
    print(f" 多线程({task_count}线程): {thread_time:.2f}秒")
    process_time = _timed_workers(multiprocessing.Process, cpu_bound_task, cpu_task_size, task_count)
    print(f" 多进程({task_count}进程): {process_time:.2f}秒")
    print(f" CPU任务结论: 多进程比多线程快 {thread_time/process_time:.1f}倍")

    # 2. I/O-bound comparison.
    print("\n2. I/O密集型任务对比:")
    single_io_time = _timed_serial(io_bound_task, io_task_duration, task_count)
    print(f" 单线程: {single_io_time:.2f}秒")
    thread_io_time = _timed_workers(threading.Thread, io_bound_task, io_task_duration, task_count)
    print(f" 多线程({task_count}线程): {thread_io_time:.2f}秒")
    process_io_time = _timed_workers(multiprocessing.Process, io_bound_task, io_task_duration, task_count)
    print(f" 多进程({task_count}进程): {process_io_time:.2f}秒")
    print(f" I/O任务结论: 多线程比单线程快 {single_io_time/thread_io_time:.1f}倍")

    # 3 and 4. Static guidance, printed line by line.
    guidance = (
        "\n3. 内存使用注意事项:",
        " - 线程: 共享内存,内存使用少,但需要处理线程安全问题",
        " - 进程: 独立内存空间,内存使用多,但不需要处理线程安全问题",
        " - 线程创建开销: 小",
        " - 进程创建开销: 大",
        "\n4. 选择指南:",
        " 使用多线程的场景:",
        " ✓ I/O密集型任务(网络请求、文件读写)",
        " ✓ 需要共享数据",
        " ✓ 任务执行时间短",
        "\n 使用多进程的场景:",
        " ✓ CPU密集型任务(科学计算、图像处理)",
        " ✓ 需要进程隔离(一个进程崩溃不影响其他进程)",
        " ✓ 需要利用多核CPU",
        "\n 通用建议:",
        " - 优先使用concurrent.futures模块",
        " - I/O密集型:使用ThreadPoolExecutor",
        " - CPU密集型:使用ProcessPoolExecutor",
        " - 任务数量多但简单:使用线程池/进程池",
    )
    for line in guidance:
        print(line)
# 运行比较
if __name__ == "__main__":
    # compare_performance() starts child processes. With the spawn and
    # forkserver start methods each child re-imports this module, so the call
    # must not run at import time.
    compare_performance()
    print("\n" + "=" * 50)
3.2 实战案例:并行下载管理器
python
import concurrent.futures
import requests
import os
import time
class ParallelDownloader:
    """Download several URLs concurrently into one directory using a thread pool."""

    # File name used when a URL's path has no usable final segment
    # (for example "https://host/").
    DEFAULT_FILENAME = "download"

    def __init__(self, max_workers=5, download_dir="downloads"):
        """Remember the settings and make sure ``download_dir`` exists.

        Args:
            max_workers: Maximum number of concurrent download threads.
            download_dir: Directory files are written to; created if missing.
        """
        self.max_workers = max_workers
        self.download_dir = download_dir
        # exist_ok avoids the race between checking for and creating the directory.
        os.makedirs(download_dir, exist_ok=True)

    @staticmethod
    def _filename_from_url(url):
        """Derive a local file name from the last path segment of ``url``.

        The query string and fragment are ignored, percent-escapes are decoded
        and any directory components are dropped. Falls back to
        ``DEFAULT_FILENAME`` when no usable segment is left.
        """
        from urllib.parse import unquote, urlsplit

        path = unquote(urlsplit(url).path).rstrip("/")
        name = os.path.basename(path)
        if name in ("", ".", ".."):
            return ParallelDownloader.DEFAULT_FILENAME
        return name

    def download_file(self, url, filename=None):
        """Download ``url`` into the download directory.

        Args:
            url: Address to fetch.
            filename: Name of the local file; derived from ``url`` when ``None``.

        Returns:
            A dict with ``url``, ``filename`` and ``success``. On success it also
            holds ``size`` (bytes), ``time`` (seconds) and ``speed`` (KB/s); on
            failure it holds ``error`` (the exception text). Never raises for
            ordinary errors, so one bad URL cannot abort a batch.
        """
        if filename is None:
            filename = self._filename_from_url(url)
        filepath = os.path.join(self.download_dir, filename)

        try:
            print(f"开始下载: {url}")
            start_time = time.time()

            # With stream=True the connection stays open until the body is
            # consumed or the response is closed; the with-block closes it
            # even if an error interrupts the download.
            with requests.get(url, stream=True, timeout=10) as response:
                response.raise_for_status()
                total_size = int(response.headers.get('content-length', 0))
                downloaded = 0

                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if not chunk:
                            continue
                        f.write(chunk)
                        downloaded += len(chunk)
                        # Progress is only meaningful when the server sent a length.
                        if total_size > 0:
                            percent = downloaded * 100 / total_size
                            print(f" {filename}: {percent:.1f}%", end='\r')

            download_time = time.time() - start_time
            file_size = os.path.getsize(filepath)
            # A zero interval is possible on coarse clocks; report 0 KB/s
            # instead of turning a finished download into a failure.
            speed = file_size / download_time / 1024 if download_time > 0 else 0.0

            print(f"下载完成: {filename} ({file_size/1024:.1f}KB, {speed:.1f}KB/s)")
            return {
                'url': url,
                'filename': filename,
                'success': True,
                'size': file_size,
                'time': download_time,
                'speed': speed,
            }
        except Exception as e:
            # Deliberately broad: this is the worker-thread boundary, and every
            # failure is reported to the caller through the result dict.
            print(f"下载失败 {url}: {e}")
            return {
                'url': url,
                'filename': filename,
                'success': False,
                'error': str(e),
            }

    def parallel_download(self, url_list):
        """Download every URL in ``url_list`` concurrently and print a summary.

        Args:
            url_list: Sized collection of URLs (``len()`` is called on it).

        Returns:
            One result dict per URL (see ``download_file``), in completion order.
        """
        print(f"开始并行下载 {len(url_list)} 个文件")
        print(f"工作线程数: {self.max_workers}")
        print("-" * 50)

        start_time = time.time()
        results = []

        # Threads suit this workload: downloads spend their time waiting on the network.
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # download_file derives the file name itself, keeping that logic in one place.
            future_to_url = {
                executor.submit(self.download_file, url): url for url in url_list
            }

            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    results.append(future.result())
                except Exception as e:
                    print(f"任务异常 {url}: {e}")
                    results.append({
                        'url': url,
                        'success': False,
                        'error': str(e),
                    })

        total_time = time.time() - start_time

        successful = [r for r in results if r['success']]
        failed = [r for r in results if not r['success']]

        print("\n" + "=" * 50)
        print("下载完成!")
        print(f"总耗时: {total_time:.2f}秒")
        print(f"成功: {len(successful)} 个")
        print(f"失败: {len(failed)} 个")

        if successful:
            total_size = sum(r['size'] for r in successful)
            avg_speed = sum(r['speed'] for r in successful) / len(successful)
            print(f"总大小: {total_size/1024/1024:.2f}MB")
            print(f"平均速度: {avg_speed:.1f}KB/s")

        if failed:
            print("\n失败的文件:")
            for result in failed:
                print(f" {result['url']}: {result.get('error', '未知错误')}")

        return results
# 模拟使用
def demo_downloader():
    """Download a fixed set of httpbin.org sample resources and return the per-URL results."""
    print("=== 并行下载管理器演示 ===")
    # Public test endpoints: three images, an HTML page, a text file and an
    # XML document. Replace with real URLs for actual use.
    sample_paths = ("image/jpeg", "image/png", "image/svg", "html", "robots.txt", "xml")
    test_urls = ["https://httpbin.org/" + path for path in sample_paths]

    manager = ParallelDownloader(max_workers=3, download_dir="test_downloads")
    return manager.parallel_download(test_urls)
# Closing notes for the downloader section, followed by a separator line.
for banner_line in (
    "实战案例:并行下载管理器",
    "这个管理器可以同时下载多个文件,提高下载效率",
    "注意:演示代码中使用的是测试URL,实际使用时需要替换为真实URL",
    "\n" + "=" * 50,
):
    print(banner_line)
总结:最佳实践指南
关键决策点:
- CPU密集型任务 → 使用多进程
- I/O密集型任务 → 使用多线程
- 需要数据共享 → 多线程或共享内存
- 需要进程隔离 → 多进程
- 不确定任务类型 → 先测试,后选择
最佳实践:
- 使用高层API:优先使用 concurrent.futures 模块
- 控制并发数:合理设置线程/进程数量
- 处理异常:确保异常不会导致程序崩溃
- 资源清理:使用with语句确保资源正确释放
- 性能监控:记录执行时间和资源使用情况
常见陷阱:
- 死锁:多个锁相互等待
- 竞态条件:多个线程同时修改共享数据
- 内存泄漏:忘记释放资源
- 过度并发:创建过多线程/进程导致系统卡顿