TCP 线程池连接耗尽故障排查指导


🔍 一、快速诊断:Python 特有命令

  1. 进程级连接诊断
bash 复制代码
# 查看 Python 进程连接状态(替换 <pid> 为实际进程ID)
ss -tanp | grep python | grep <pid>

# 统计各状态连接数
cat /proc/<pid>/net/tcp | awk '{print $4}' | sort | uniq -c

# 查看文件描述符(socket 也是一种 fd)
ls -l /proc/<pid>/fd | grep socket | wc -l
ls -l /proc/<pid>/fd | awk '{print $NF}' | sort | uniq -c | sort -rn
  2. Python 运行时诊断(无需重启)
python 复制代码
# Inject this diagnostic snippet into a running Python process (via gdb or manhole)
import sys
import threading
import gc
import socket

# Active threads — a steadily growing count usually means an unbounded pool
print(f"活跃线程数: {threading.active_count()}")
for t in threading.enumerate():
    print(f"  - {t.name}: {t.ident}")

# Socket objects still reachable by the GC (may be unreferenced but not yet closed)
sockets = [obj for obj in gc.get_objects() if isinstance(obj, socket.socket)]
print(f"内存中 socket 对象数: {len(sockets)}")
for s in sockets[:10]:  # show only the first 10
    try:
        print(f"  fd={s.fileno()}, {s.getpeername()} -> {s.getsockname()}")
    # Fix: the original bare "except:" also swallowed KeyboardInterrupt/SystemExit.
    # getpeername()/getsockname() raise OSError on unconnected/closed sockets.
    except OSError:
        print(f"  fd={s.fileno()}, [未连接或已关闭]")
  3. 使用 py-spy 进行实时诊断(无需修改代码)
bash 复制代码
# 安装 py-spy
pip install py-spy

# 实时查看线程状态(类似 top)
py-spy top --pid <pid>

# 导出火焰图(查看 CPU 耗时)
py-spy record -o profile.svg --pid <pid>

# 查看线程堆栈(类似 jstack)
py-spy dump --pid <pid>

🎯 二、Python 常见故障模式

故障模式 Python 特有症状 典型场景 GIL 阻塞 CPU 不高但响应慢,线程状态多为 PyEval_RestoreThread 多线程 CPU 密集型任务 + IO 操作 asyncio 事件循环阻塞 协程不切换,大量任务 pending 在 async 函数中调用了同步阻塞 IO requests 会话未复用 大量 TIME_WAIT,每次请求新建 TCP 连接 未使用 requests.Session() 线程池未释放 线程数持续增长,最终 OOM ThreadPoolExecutor 未调用 shutdown() 数据库连接泄漏 CLOSE_WAIT 堆积,连接池耗尽 SQLAlchemy 会话未 close() 或未 remove()


🛠️ 三、分层排查与解决方案

第一层:同步代码(requests + ThreadPoolExecutor)

❌ 错误示范:连接泄漏经典案例

python 复制代码
import requests
from concurrent.futures import ThreadPoolExecutor

# Mistake 1: a fresh connection per call (produces masses of TIME_WAIT sockets)
def fetch_bad(url):
    resp = requests.get(url, timeout=30)  # opens a brand-new TCP connection every time!
    return resp.text

# Mistake 2: thread pool is neither reused nor shut down
def batch_fetch_bad(urls):
    for url in urls:  # sequential execution — slow
        fetch_bad(url)
    
    # Or: a thread pool is created but never closed
    executor = ThreadPoolExecutor(max_workers=100)  # unbounded growth!
    futures = [executor.submit(fetch_bad, url) for url in urls]
    return [f.result() for f in futures]  # executor.shutdown() is never called

# Mistake 3: no timeout (the default waits forever)
requests.get("http://slow-api.com")  # may hang for hours

✅ 正确实践:连接池化 + 资源管理

python 复制代码
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed
import contextlib

# 1. Build a reusable Session (the heart of connection pooling)
def create_session(
    pool_connections=10,
    pool_maxsize=100,
    max_retries=3,
    backoff_factor=0.5
):
    """Return a pooled requests.Session with retries and a default timeout.

    Args:
        pool_connections: number of per-host pools the adapter caches.
        pool_maxsize: maximum connections kept per host pool.
        max_retries: total retry attempts for transient failures.
        backoff_factor: exponential backoff base between retries.
    """
    session = requests.Session()

    # Retry policy — stops transient upstream failures from exhausting
    # the pool with half-dead connections.
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS", "POST"]
    )

    # Connection-pool configuration
    adapter = HTTPAdapter(
        pool_connections=pool_connections,  # number of host pools
        pool_maxsize=pool_maxsize,          # connections per host
        max_retries=retry_strategy,
        pool_block=True  # block when the pool is full instead of raising
    )

    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Default timeout for every request made through this session.
    # Fix: the original lambda unconditionally injected timeout=(5, 30) as a
    # keyword, so any caller passing its own timeout= crashed with
    # "got multiple values for keyword argument 'timeout'".
    original_request = session.request

    def request_with_default_timeout(method, url, **kwargs):
        kwargs.setdefault("timeout", (5, 30))  # (connect, read) seconds
        return original_request(method, url, **kwargs)

    session.request = request_with_default_timeout

    return session

# 2. Use the thread pool correctly (context manager guarantees shutdown)
def batch_fetch_good(urls, max_workers=10):
    """Fetch *urls* concurrently with a bounded pool and a shared Session.

    Returns a dict mapping each url to its body text, or an "Error: ..."
    string when the request failed.
    """
    session = create_session(pool_maxsize=max_workers * 2)

    try:
        # The context manager calls executor.shutdown(wait=True) on exit.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(session.get, url): url
                for url in urls
            }

            results = {}
            for future in as_completed(future_to_url, timeout=60):  # overall deadline
                url = future_to_url[future]
                try:
                    resp = future.result()
                    results[url] = resp.text
                except Exception as e:
                    results[url] = f"Error: {e}"
                    # Close any response attached to the exception so the
                    # underlying connection goes back to the pool.
                    # (The original re-queried future.exception(), which is
                    # just `e` here.)
                    response = getattr(e, 'response', None)
                    if response is not None:
                        response.close()

            return results
    finally:
        # Fix: the original never closed the session, leaking the whole
        # connection pool despite its comment claiming otherwise.
        session.close()

# 3. Safer streaming iteration (keeps memory bounded)
def batch_fetch_streaming(urls, max_workers=10, chunk_size=100):
    """Yield responses in batches of *chunk_size*, bounding memory and concurrency."""
    session = create_session()

    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for start in range(0, len(urls), chunk_size):
                batch = urls[start:start + chunk_size]
                futures = [executor.submit(session.get, url) for url in batch]

                for future in as_completed(futures):
                    yield future.result()
                # Fix: dropped the per-batch gc.collect() — it was optional,
                # expensive, and futures are already released per batch.
    finally:
        # Fix: close the pooled session when the generator is exhausted
        # (or garbage-collected); the original leaked it.
        session.close()

第二层:异步代码(asyncio + aiohttp)

❌ 错误示范:asyncio 阻塞陷阱

python 复制代码
import asyncio
import requests  # fatal: a synchronous library inside async code!

async def bad_async_fetch(url):
    # Mistake: requests blocks the entire event loop!
    resp = requests.get(url)  # stalls every coroutine
    return resp.text

async def main():
    # Mistake: no concurrency cap — thousands of connections open at once
    tasks = [bad_async_fetch(f"http://api.com/{i}") for i in range(10000)]
    await asyncio.gather(*tasks)  # resource explosion!

# Mistake: no timeout (the default None can hang forever)
asyncio.wait_for(some_coroutine, timeout=None)

✅ 正确实践:纯异步 + 信号量控制

python 复制代码
import asyncio
import aiohttp
from aiohttp import ClientTimeout, TCPConnector
import asyncpg  # 异步数据库

# 1. Create a rate-limited ClientSession (the core of the fix)
async def create_async_session(
    limit=100,           # total connection-pool size
    limit_per_host=30,   # connections per host
    ttl_dns_cache=300,   # cache DNS lookups for 5 minutes
    use_dns_cache=True
):
    """Build an aiohttp ClientSession with a bounded connector and layered timeouts.

    Security fix: the original passed ``ssl=False`` to TCPConnector, which
    disables TLS certificate verification for every HTTPS request. Omitting
    it keeps aiohttp's default, verifying behaviour.
    """
    # TCP connector configuration (finer-grained than requests)
    connector = TCPConnector(
        limit=limit,
        limit_per_host=limit_per_host,
        ttl_dns_cache=ttl_dns_cache,
        use_dns_cache=use_dns_cache,
        enable_cleanup_closed=True,  # reap closed connections automatically
        force_close=False            # keep connections alive between requests
    )

    # Timeouts layered for defence in depth
    timeout = ClientTimeout(
        total=30,       # overall deadline (connect + send + read)
        connect=5,      # connection-establishment deadline
        sock_read=10    # per-read deadline
    )

    session = aiohttp.ClientSession(
        connector=connector,
        timeout=timeout,
        headers={"Connection": "keep-alive"}
    )

    return session

# 2. Semaphore-bounded fetching (prevents request avalanches)
async def fetch_with_semaphore(session, url, semaphore):
    """Fetch *url* via *session* while holding *semaphore*.

    Returns the response body, or None on timeout / client error.
    """
    async with semaphore:  # cap the number of in-flight requests
        try:
            async with session.get(url) as response:
                response.raise_for_status()  # surface HTTP 4xx/5xx as exceptions
                body = await response.text()
                return body
        except asyncio.TimeoutError:
            # On timeout the connector reclaims the connection
            return None
        except aiohttp.ClientError as exc:
            # Log and contain the error instead of letting it propagate
            print(f"Request failed: {exc}")
            return None

# 3. Full concurrency-control example
async def controlled_fetch(urls, max_concurrent=50):
    """Async generator: fetch *urls* with a concurrency cap and backpressure.

    Yields one value per url in completion order: the body text, None (from
    fetch_with_semaphore's error paths), or an Exception instance from the
    final gather(return_exceptions=True).
    """
    semaphore = asyncio.Semaphore(max_concurrent)
    session = await create_async_session(limit=max_concurrent * 2)
    
    try:
        # Create tasks eagerly so they start running immediately
        tasks = []
        for url in urls:
            task = asyncio.create_task(
                fetch_with_semaphore(session, url, semaphore)
            )
            tasks.append(task)
            
            # Backpressure: when too many tasks have piled up, drain some first
            if len(tasks) >= 1000:
                done, pending = await asyncio.wait(
                    tasks, 
                    return_when=asyncio.FIRST_COMPLETED
                )
                tasks = list(pending)
                for d in done:
                    yield await d
        
        # Drain the remaining tasks
        if tasks:
            results = await asyncio.gather(*tasks, return_exceptions=True)
            for r in results:
                yield r
                
    finally:
        # Crucial: closing the session also closes the underlying connection pool
        await session.close()

# 4. Async database connection pool (PostgreSQL example)
async def create_db_pool():
    """Create a bounded asyncpg pool; the limits guard against connection leaks."""
    return await asyncpg.create_pool(
        "postgresql://user:pass@localhost/db",
        min_size=5,           # minimum pooled connections
        max_size=20,          # maximum pooled connections (the key bound!)
        max_queries=50000,    # recycle a connection after this many queries
        # Fix: asyncpg has no "max_inactive_time" keyword — an unknown kwarg
        # is forwarded to connect() and fails. The correct parameter is:
        max_inactive_connection_lifetime=300,  # recycle after 5 min idle
        command_timeout=60,   # per-query timeout
        server_settings={
            'jit': 'off'  # disable JIT — it slows down short queries
        }
    )

# Usage example
async def main():
    """Fetch one user row; pool, connection and transaction are all scoped."""
    pool = await create_db_pool()
    try:
        # Both managers on one line: the connection is released and the
        # transaction committed/rolled back automatically on exit.
        async with pool.acquire() as conn, conn.transaction():
            rows = await conn.fetch("SELECT * FROM users WHERE id = $1", 1)
            return rows
    finally:
        await pool.close()

第三层:WSGI/ASGI 服务器配置

Gunicorn + Flask/FastAPI 优化

python 复制代码
# gunicorn.conf.py - production configuration
import multiprocessing
import os

# Worker process model (the key choice)
# - sync: one thread per request — simple, but low concurrency
# - gevent: coroutine model, recommended for high concurrency (requires monkey patching)
# - uvicorn.workers.UvicornWorker: async ASGI
worker_class = "gevent"  # or "uvicorn.workers.UvicornWorker"

# Worker count (CPU-bound: 2-4x CPUs; IO-bound: can go higher)
workers = multiprocessing.cpu_count() * 2 + 1
worker_connections = 1000  # max concurrency per worker in gevent mode

# Thread pool (effective only in sync mode)
threads = 4

# Connection timeouts (stop slow clients from pinning workers)
timeout = 30
graceful_timeout = 10
keepalive = 5  # keep-alive duration in seconds

# Request limits (contain leaks and abuse)
max_requests = 10000       # restart a worker after 10000 requests (bounds memory leaks)
max_requests_jitter = 1000  # random jitter so workers do not all restart at once

# Logging
accesslog = "-"
errorlog = "-"
loglevel = "warning"

# Preload the application (saves memory)
preload_app = True

# Important: cap request sizes (defence against malicious requests)
limit_request_line = 4094
limit_request_fields = 100
limit_request_field_size = 8190

启动命令

bash 复制代码
# 使用 gevent(高并发 HTTP)
gunicorn -c gunicorn.conf.py "app:create_app()"

# 使用 uvicorn(纯异步 ASGI)
gunicorn -k uvicorn.workers.UvicornWorker -w 4 "app:asgi_app"

# 使用 gunicorn + uvicorn(生产推荐)
gunicorn -k uvicorn.workers.UvicornH11Worker -w 4 -b 0.0.0.0:8000 "app:asgi_app"

第四层:监控与可观测性

使用 prometheus_client 暴露指标

python 复制代码
from prometheus_client import Counter, Histogram, Gauge, start_http_server
import functools
import time
import asyncio

# Metric definitions — label sets kept small to limit cardinality
ACTIVE_REQUESTS = Gauge(
    'http_requests_active', 
    '当前处理中的请求数',
    ['method', 'endpoint']
)

# Latency histogram; buckets span 5 ms to 10 s
REQUEST_DURATION = Histogram(
    'http_request_duration_seconds',
    '请求处理耗时',
    ['method', 'endpoint', 'status'],
    buckets=[.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, 10.0]
)

# Connection-pool gauge, labelled per pool
CONNECTION_POOL_SIZE = Gauge(
    'connection_pool_size',
    '连接池大小',
    ['pool_name']
)

# Decorator that records request metrics automatically
def monitor_async(func):
    """Wrap async *func*, tracking an in-flight gauge and a latency histogram.

    NOTE(review): assumes the HTTP method arrives as a ``method`` kwarg
    (defaults to GET) — confirm against callers.
    """
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        method = kwargs.get('method', 'GET')
        endpoint = func.__name__

        # Fix: initialise status before the try block. Previously a
        # BaseException (e.g. asyncio.CancelledError, which "except
        # Exception" does not catch) left `status` unbound, so the finally
        # block raised NameError and masked the real error.
        status = 500
        with ACTIVE_REQUESTS.labels(method=method, endpoint=endpoint).track_inprogress():
            start = time.time()
            try:
                result = await func(*args, **kwargs)
                status = 200
                return result
            finally:
                # Runs on success, exception, and cancellation alike
                duration = time.time() - start
                REQUEST_DURATION.labels(
                    method=method,
                    endpoint=endpoint,
                    status=status
                ).observe(duration)

    return wrapper

# Monitor a connection pool (aiohttp example)
async def monitor_aiohttp_session(session, name="default"):
    """Run forever, exporting the session connector's size and limit every 10 s.

    NOTE(review): reads the private ``session._connector`` attribute — may
    break across aiohttp versions. Intended to run as a background task.
    """
    if hasattr(session, '_connector'):
        connector = session._connector
        while True:
            CONNECTION_POOL_SIZE.labels(pool_name=name).set(connector.size)
            CONNECTION_POOL_SIZE.labels(pool_name=f"{name}_limit").set(connector.limit)
            await asyncio.sleep(10)  # refresh every 10 seconds

# Start the metrics HTTP server (call once during application startup)
def start_metrics_server(port=9090):
    """Expose Prometheus metrics over HTTP on *port*."""
    start_http_server(port)
    print(f"Metrics server started on port {port}")

健康检查端点(Kubernetes 必备)

python 复制代码
from fastapi import FastAPI, HTTPException
import asyncio
import psutil

app = FastAPI()

# 连接池状态检查(自定义)
class ConnectionPoolHealth:
    def __init__(self):
        self.pools = {}
    
    def register_pool(self, name, pool_obj, max_size_attr='maxsize', 
                     used_attr='size', available_attr='available'):
        self.pools[name] = {
            'obj': pool_obj,
            'max': max_size_attr,
            'used': used_attr,
            'avail': available_attr
        }
    
    def check(self):
        status = {}
        for name, config in self.pools.items():
            pool = config['obj']
            try:
                max_size = getattr(pool, config['max'], 0)
                used = getattr(pool, config['used'], 0)
                available = getattr(pool, config['avail'], 0)
                
                usage = used / max_size if max_size > 0 else 0
                status[name] = {
                    'healthy': usage < 0.9,  # 使用率<90%为健康
                    'usage': f"{usage:.1%}",
                    'used': used,
                    'max': max_size,
                    'available': available
                }
            except Exception as e:
                status[name] = {'healthy': False, 'error': str(e)}
        
        return status

# Module-level singleton consulted by the /health endpoint below
pool_health = ConnectionPoolHealth()

@app.get("/health")
async def health_check():
    """Liveness probe: system resources plus registered connection pools.

    Responds 200 when healthy, 503 otherwise.
    """
    # Local import keeps this edit self-contained; fastapi is already a dependency.
    from fastapi.responses import JSONResponse

    # System resource checks
    cpu_percent = psutil.cpu_percent(interval=0.1)
    memory = psutil.virtual_memory()

    # Connection-pool checks
    pool_status = pool_health.check()
    pools_healthy = all(s.get('healthy', False) for s in pool_status.values())

    healthy = (
        cpu_percent < 90 and 
        memory.percent < 90 and 
        pools_healthy
    )

    status_code = 200 if healthy else 503

    # Fix: FastAPI does not support Flask-style "(body, status)" tuple
    # returns — the original serialized the whole tuple as the body and
    # always answered HTTP 200, so Kubernetes would never see unhealthy.
    return JSONResponse(
        status_code=status_code,
        content={
            "status": "healthy" if healthy else "unhealthy",
            "cpu": f"{cpu_percent}%",
            "memory": f"{memory.percent}%",
            "pools": pool_status
        }
    )

@app.get("/ready")
async def readiness_check():
    """Readiness probe: report whether dependent services are reachable."""
    # Verify downstream dependencies (database, caches, ...)
    try:
        # await check_db_connection()
        pass
    except Exception as exc:
        raise HTTPException(status_code=503, detail=str(exc))
    return {"ready": True}

🔧 五、Python 专用应急工具

  1. 现场诊断脚本(直接注入运行进程)
python 复制代码
#!/usr/bin/env python3
# debug_injector.py - 用于诊断运行中的 Python 进程

import sys
import threading
import gc
import asyncio
import inspect

def diagnose():
    """Print a snapshot of thread, event-loop, socket, executor and aiohttp
    session state for the current process.

    Intended to be exec()'d inside a running process (via gdb or manhole);
    it only reads state and writes to stdout.
    """
    print("=" * 50)
    print(f"Python 版本: {sys.version}")
    print(f"活跃线程数: {threading.active_count()}")
    print(f"当前线程: {threading.current_thread().name}")
    
    # Inspect the asyncio event loop, if one is running in this thread
    try:
        loop = asyncio.get_running_loop()
        print(f"事件循环运行中: {loop.is_running()}")
        # NOTE(review): _default_executor and _scheduled are CPython-private
        # attributes and may change between versions.
        print(f"默认执行器: {loop._default_executor}")
        if hasattr(loop, '_scheduled'):
            print(f"计划任务数: {len(loop._scheduled)}")
    except RuntimeError:
        print("无运行中的事件循环")
    
    # Count socket objects still reachable by the GC (possibly unclosed)
    import socket
    sockets = [obj for obj in gc.get_objects() if isinstance(obj, socket.socket)]
    print(f"\n内存中 socket 对象: {len(sockets)}")
    
    # Enumerate live thread pools, their worker counts and queue depth
    from concurrent.futures import ThreadPoolExecutor
    executors = [obj for obj in gc.get_objects() if isinstance(obj, ThreadPoolExecutor)]
    print(f"\nThreadPoolExecutor 实例: {len(executors)}")
    for i, exe in enumerate(executors):
        print(f"  Executor {i}: workers={exe._max_workers}, "
              f"active={len(exe._threads)}, "
              f"queue={exe._work_queue.qsize()}")
    
    # Inspect aiohttp sessions (only when the library is installed)
    try:
        import aiohttp
        sessions = [obj for obj in gc.get_objects() if isinstance(obj, aiohttp.ClientSession)]
        print(f"\naiohttp ClientSession: {len(sessions)}")
        for s in sessions:
            if hasattr(s, '_connector'):
                conn = s._connector
                print(f"  Connector: size={conn.size}, limit={conn.limit}, "
                      f"limit_per_host={conn.limit_per_host}")
    except ImportError:
        pass
    
    print("=" * 50)

if __name__ == "__main__":
    diagnose()

使用方法:

bash 复制代码
# 方法1:使用 gdb 注入运行进程
gdb -p <pid> -ex 'call PyRun_SimpleString("exec(open(\"debug_injector.py\").read())")' -ex 'detach' -ex 'quit'

# 方法2:使用 manhole(需预先安装)
pip install manhole
# 在应用启动时添加:
import manhole
manhole.install()

# 然后连接诊断
python -m manhole <pid>
  2. 连接泄漏检测装饰器
python 复制代码
import functools
import weakref
import logging

logger = logging.getLogger(__name__)

def track_connections(cls):
    """Class decorator: track every live instance of *cls* via weak references.

    Adds two classmethods to the decorated class:
    - ``get_live_instances()`` -> list of instances still alive
    - ``log_leaked(threshold=100)`` -> warn when the live count exceeds the
      threshold, including the creation stack of the first few instances.
    """
    live_set = weakref.WeakSet()
    wrapped_init = cls.__init__

    @functools.wraps(wrapped_init)
    def tracking_init(self, *args, **kwargs):
        wrapped_init(self, *args, **kwargs)
        live_set.add(self)
        # Remember where this instance was created, for leak reports
        import traceback
        self._creation_stack = traceback.format_stack()

    def get_live_instances(klass):
        return list(live_set)

    def log_leaked(klass, threshold=100):
        alive = list(live_set)
        if len(alive) > threshold:
            logger.warning(f"检测到 {klass.__name__} 泄漏: {len(alive)} 个存活实例")
            for idx, inst in enumerate(alive[:5]):  # show only the first 5
                if hasattr(inst, '_creation_stack'):
                    logger.warning(f"实例 {idx} 创建位置:\n{''.join(inst._creation_stack[-3:])}")

    cls.__init__ = tracking_init
    cls.get_live_instances = classmethod(get_live_instances)
    cls.log_leaked = classmethod(log_leaked)

    return cls

# Usage example
@track_connections
class DatabaseConnection:
    """Minimal connection wrapper used to demonstrate leak tracking."""

    def __init__(self, dsn):
        self.dsn = dsn    # data source name
        self.conn = None  # underlying handle; opened elsewhere

    def close(self):
        """Release the underlying handle (safe to call repeatedly)."""
        handle = self.conn
        if handle:
            handle.close()
            self.conn = None

# Periodically check for leaks
import asyncio
async def leak_monitor():
    """Background task: every 60 s, warn when more than 50 DatabaseConnection
    instances are still alive. Runs until cancelled."""
    while True:
        DatabaseConnection.log_leaked(threshold=50)
        await asyncio.sleep(60)

📋 六、Python 排查 checklist

python 复制代码
□ 确认 Python 版本(3.8+ 的 asyncio 更稳定)
□ 检查是否混用同步/异步代码(requests 在 async def 中?)
□ 验证 Session/ClientSession 是否复用(不要每次新建)
□ 确认所有池化资源有界(ThreadPoolExecutor、连接池)
□ 检查超时配置(连接、读取、总超时缺一不可)
□ 验证资源关闭逻辑(try/finally 或 async with)
□ 确认使用健康检查端点(Kubernetes 场景)
□ 部署监控指标(prometheus_client)
□ 配置 Gunicorn/Uvicorn 工作模式与参数
□ 测试故障注入(模拟下游超时、连接拒绝)
相关推荐
zhaoyin19942 小时前
关于文件读取中使用的斜杠问题
python
勇往直前plus3 小时前
从文件到屏幕:Python/java 字符编码、解码、文本处理的底层逻辑解析
java·开发语言·python
~央千澈~3 小时前
优雅草科技2026年2月重磅产品·优雅草·写作中枢 — 产品介绍与发布说明
python
Emotional。7 小时前
2025 年度技术总结与规划:AI 时代的开发者成长之路
人工智能·python·ai·langchain
witAI11 小时前
**AI仿真人剧制作软件2025推荐,解锁沉浸式数字内容创作
人工智能·python
Codefengfeng12 小时前
Python Base环境中加包的方法
开发语言·python
清水白石00812 小时前
《Python 编程全景解析:从核心精要到测试替身(Test Doubles)五大武器的实战淬炼》
开发语言·python
如若12313 小时前
AutoDL云服务器 NVIDIA 570驱动 EGL渲染修复全记录
运维·服务器·python
甲枫叶13 小时前
【claude】Claude Code正式引入Git Worktree原生支持:Agent全面实现并行独立工作
java·人工智能·git·python·ai编程