TCP 线程池连接耗尽故障排查指导


🔍 一、快速诊断:Python 特有命令

  1. 进程级连接诊断
bash 复制代码
# 查看 Python 进程连接状态(替换 <pid> 为实际进程ID)
ss -tanp | grep python | grep <pid>

# 统计各状态连接数
cat /proc/<pid>/net/tcp | awk '{print $4}' | sort | uniq -c

# 查看文件描述符(socket 也是一种 fd)
ls -l /proc/<pid>/fd | grep socket | wc -l
ls -l /proc/<pid>/fd | awk '{print $NF}' | sort | uniq -c | sort -rn
  2. Python 运行时诊断(无需重启)
python 复制代码
# Inject this diagnostic snippet into a running Python process (via gdb or manhole)
import sys
import threading
import gc
import socket

# Active threads — a steadily growing count usually means an unbounded pool
print(f"活跃线程数: {threading.active_count()}")
for t in threading.enumerate():
    print(f"  - {t.name}: {t.ident}")

# Socket objects still reachable by the GC (may be unreferenced but not yet closed)
sockets = [obj for obj in gc.get_objects() if isinstance(obj, socket.socket)]
print(f"内存中 socket 对象数: {len(sockets)}")
for s in sockets[:10]:  # show only the first 10
    try:
        print(f"  fd={s.fileno()}, {s.getpeername()} -> {s.getsockname()}")
    # Fix: the original bare "except:" also swallowed KeyboardInterrupt/SystemExit.
    # getpeername()/getsockname() raise OSError on unconnected/closed sockets.
    except OSError:
        print(f"  fd={s.fileno()}, [未连接或已关闭]")
  3. 使用 py-spy 进行实时诊断(无需修改代码)
bash 复制代码
# 安装 py-spy
pip install py-spy

# 实时查看线程状态(类似 top)
py-spy top --pid <pid>

# 导出火焰图(查看 CPU 耗时)
py-spy record -o profile.svg --pid <pid>

# 查看线程堆栈(类似 jstack)
py-spy dump --pid <pid>

🎯 二、Python 常见故障模式

故障模式 Python 特有症状 典型场景 GIL 阻塞 CPU 不高但响应慢,线程状态多为 PyEval_RestoreThread 多线程 CPU 密集型任务 + IO 操作 asyncio 事件循环阻塞 协程不切换,大量任务 pending 在 async 函数中调用了同步阻塞 IO requests 会话未复用 大量 TIME_WAIT,每次请求新建 TCP 连接 未使用 requests.Session() 线程池未释放 线程数持续增长,最终 OOM ThreadPoolExecutor 未调用 shutdown() 数据库连接泄漏 CLOSE_WAIT 堆积,连接池耗尽 SQLAlchemy 会话未 close() 或未 remove()


🛠️ 三、分层排查与解决方案

第一层:同步代码(requests + ThreadPoolExecutor)

❌ 错误示范:连接泄漏经典案例

python 复制代码
import requests
from concurrent.futures import ThreadPoolExecutor

# Mistake 1: a fresh connection per call (produces masses of TIME_WAIT sockets)
def fetch_bad(url):
    resp = requests.get(url, timeout=30)  # opens a brand-new TCP connection every time!
    return resp.text

# Mistake 2: thread pool is neither reused nor shut down
def batch_fetch_bad(urls):
    for url in urls:  # sequential execution — slow
        fetch_bad(url)
    
    # Or: a thread pool is created but never closed
    executor = ThreadPoolExecutor(max_workers=100)  # unbounded growth!
    futures = [executor.submit(fetch_bad, url) for url in urls]
    return [f.result() for f in futures]  # executor.shutdown() is never called

# Mistake 3: no timeout (the default waits forever)
requests.get("http://slow-api.com")  # may hang for hours

✅ 正确实践:连接池化 + 资源管理

python 复制代码
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from concurrent.futures import ThreadPoolExecutor, as_completed
import contextlib

# 1. Build a reusable Session (the heart of connection pooling)
def create_session(
    pool_connections=10,
    pool_maxsize=100,
    max_retries=3,
    backoff_factor=0.5
):
    """Return a pooled requests.Session with retries and a default timeout.

    Args:
        pool_connections: number of per-host pools the adapter caches.
        pool_maxsize: maximum connections kept per host pool.
        max_retries: total retry attempts for transient failures.
        backoff_factor: exponential backoff base between retries.
    """
    session = requests.Session()

    # Retry policy — stops transient upstream failures from exhausting
    # the pool with half-dead connections.
    retry_strategy = Retry(
        total=max_retries,
        backoff_factor=backoff_factor,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS", "POST"]
    )

    # Connection-pool configuration
    adapter = HTTPAdapter(
        pool_connections=pool_connections,  # number of host pools
        pool_maxsize=pool_maxsize,          # connections per host
        max_retries=retry_strategy,
        pool_block=True  # block when the pool is full instead of raising
    )

    session.mount("http://", adapter)
    session.mount("https://", adapter)

    # Default timeout for every request made through this session.
    # Fix: the original lambda unconditionally injected timeout=(5, 30) as a
    # keyword, so any caller passing its own timeout= crashed with
    # "got multiple values for keyword argument 'timeout'".
    original_request = session.request

    def request_with_default_timeout(method, url, **kwargs):
        kwargs.setdefault("timeout", (5, 30))  # (connect, read) seconds
        return original_request(method, url, **kwargs)

    session.request = request_with_default_timeout

    return session

# 2. Use the thread pool correctly (context manager guarantees shutdown)
def batch_fetch_good(urls, max_workers=10):
    """Fetch *urls* concurrently with a bounded pool and a shared Session.

    Returns a dict mapping each url to its body text, or an "Error: ..."
    string when the request failed.
    """
    session = create_session(pool_maxsize=max_workers * 2)

    try:
        # The context manager calls executor.shutdown(wait=True) on exit.
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_url = {
                executor.submit(session.get, url): url
                for url in urls
            }

            results = {}
            for future in as_completed(future_to_url, timeout=60):  # overall deadline
                url = future_to_url[future]
                try:
                    resp = future.result()
                    results[url] = resp.text
                except Exception as e:
                    results[url] = f"Error: {e}"
                    # Close any response attached to the exception so the
                    # underlying connection goes back to the pool.
                    # (The original re-queried future.exception(), which is
                    # just `e` here.)
                    response = getattr(e, 'response', None)
                    if response is not None:
                        response.close()

            return results
    finally:
        # Fix: the original never closed the session, leaking the whole
        # connection pool despite its comment claiming otherwise.
        session.close()

# 3. Safer streaming iteration (keeps memory bounded)
def batch_fetch_streaming(urls, max_workers=10, chunk_size=100):
    """Yield responses in batches of *chunk_size*, bounding memory and concurrency."""
    session = create_session()

    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for start in range(0, len(urls), chunk_size):
                batch = urls[start:start + chunk_size]
                futures = [executor.submit(session.get, url) for url in batch]

                for future in as_completed(futures):
                    yield future.result()
                # Fix: dropped the per-batch gc.collect() — it was optional,
                # expensive, and futures are already released per batch.
    finally:
        # Fix: close the pooled session when the generator is exhausted
        # (or garbage-collected); the original leaked it.
        session.close()

第二层:异步代码(asyncio + aiohttp)

❌ 错误示范:asyncio 阻塞陷阱

python 复制代码
import asyncio
import requests  # fatal: a synchronous library inside async code!

async def bad_async_fetch(url):
    # Mistake: requests blocks the entire event loop!
    resp = requests.get(url)  # stalls every coroutine
    return resp.text

async def main():
    # Mistake: no concurrency cap — thousands of connections open at once
    tasks = [bad_async_fetch(f"http://api.com/{i}") for i in range(10000)]
    await asyncio.gather(*tasks)  # resource explosion!

# Mistake: no timeout (the default None can hang forever)
asyncio.wait_for(some_coroutine, timeout=None)

✅ 正确实践:纯异步 + 信号量控制

python 复制代码
import asyncio
import aiohttp
from aiohttp import ClientTimeout, TCPConnector
import asyncpg  # 异步数据库

# 1. Create a rate-limited ClientSession (the core of the fix)
async def create_async_session(
    limit=100,           # total connection-pool size
    limit_per_host=30,   # connections per host
    ttl_dns_cache=300,   # cache DNS lookups for 5 minutes
    use_dns_cache=True
):
    """Build an aiohttp ClientSession with a bounded connector and layered timeouts.

    Security fix: the original passed ``ssl=False`` to TCPConnector, which
    disables TLS certificate verification for every HTTPS request. Omitting
    it keeps aiohttp's default, verifying behaviour.
    """
    # TCP connector configuration (finer-grained than requests)
    connector = TCPConnector(
        limit=limit,
        limit_per_host=limit_per_host,
        ttl_dns_cache=ttl_dns_cache,
        use_dns_cache=use_dns_cache,
        enable_cleanup_closed=True,  # reap closed connections automatically
        force_close=False            # keep connections alive between requests
    )

    # Timeouts layered for defence in depth
    timeout = ClientTimeout(
        total=30,       # overall deadline (connect + send + read)
        connect=5,      # connection-establishment deadline
        sock_read=10    # per-read deadline
    )

    session = aiohttp.ClientSession(
        connector=connector,
        timeout=timeout,
        headers={"Connection": "keep-alive"}
    )

    return session

# 2. Semaphore-bounded fetching (prevents request avalanches)
async def fetch_with_semaphore(session, url, semaphore):
    """Fetch *url* via *session* while holding *semaphore*.

    Returns the response body, or None on timeout / client error.
    """
    async with semaphore:  # cap the number of in-flight requests
        try:
            async with session.get(url) as response:
                response.raise_for_status()  # surface HTTP 4xx/5xx as exceptions
                body = await response.text()
                return body
        except asyncio.TimeoutError:
            # On timeout the connector reclaims the connection
            return None
        except aiohttp.ClientError as exc:
            # Log and contain the error instead of letting it propagate
            print(f"Request failed: {exc}")
            return None

# 3. Full concurrency-control example
async def controlled_fetch(urls, max_concurrent=50):
    """Async generator: fetch *urls* with a concurrency cap and backpressure.

    Yields one value per url in completion order: the body text, None (from
    fetch_with_semaphore's error paths), or an Exception instance from the
    final gather(return_exceptions=True).
    """
    semaphore = asyncio.Semaphore(max_concurrent)
    session = await create_async_session(limit=max_concurrent * 2)
    
    try:
        # Create tasks eagerly so they start running immediately
        tasks = []
        for url in urls:
            task = asyncio.create_task(
                fetch_with_semaphore(session, url, semaphore)
            )
            tasks.append(task)
            
            # Backpressure: when too many tasks have piled up, drain some first
            if len(tasks) >= 1000:
                done, pending = await asyncio.wait(
                    tasks, 
                    return_when=asyncio.FIRST_COMPLETED
                )
                tasks = list(pending)
                for d in done:
                    yield await d
        
        # Drain the remaining tasks
        if tasks:
            results = await asyncio.gather(*tasks, return_exceptions=True)
            for r in results:
                yield r
                
    finally:
        # Crucial: closing the session also closes the underlying connection pool
        await session.close()

# 4. Async database connection pool (PostgreSQL example)
async def create_db_pool():
    """Create a bounded asyncpg pool; the limits guard against connection leaks."""
    return await asyncpg.create_pool(
        "postgresql://user:pass@localhost/db",
        min_size=5,           # minimum pooled connections
        max_size=20,          # maximum pooled connections (the key bound!)
        max_queries=50000,    # recycle a connection after this many queries
        # Fix: asyncpg has no "max_inactive_time" keyword — an unknown kwarg
        # is forwarded to connect() and fails. The correct parameter is:
        max_inactive_connection_lifetime=300,  # recycle after 5 min idle
        command_timeout=60,   # per-query timeout
        server_settings={
            'jit': 'off'  # disable JIT — it slows down short queries
        }
    )

# Usage example
async def main():
    """Fetch one user row; pool, connection and transaction are all scoped."""
    pool = await create_db_pool()
    try:
        # Both managers on one line: the connection is released and the
        # transaction committed/rolled back automatically on exit.
        async with pool.acquire() as conn, conn.transaction():
            rows = await conn.fetch("SELECT * FROM users WHERE id = $1", 1)
            return rows
    finally:
        await pool.close()

第三层:WSGI/ASGI 服务器配置

Gunicorn + Flask/FastAPI 优化

python 复制代码
# gunicorn.conf.py - production configuration
import multiprocessing
import os

# Worker process model (the key choice)
# - sync: one thread per request — simple, but low concurrency
# - gevent: coroutine model, recommended for high concurrency (requires monkey patching)
# - uvicorn.workers.UvicornWorker: async ASGI
worker_class = "gevent"  # or "uvicorn.workers.UvicornWorker"

# Worker count (CPU-bound: 2-4x CPUs; IO-bound: can go higher)
workers = multiprocessing.cpu_count() * 2 + 1
worker_connections = 1000  # max concurrency per worker in gevent mode

# Thread pool (effective only in sync mode)
threads = 4

# Connection timeouts (stop slow clients from pinning workers)
timeout = 30
graceful_timeout = 10
keepalive = 5  # keep-alive duration in seconds

# Request limits (contain leaks and abuse)
max_requests = 10000       # restart a worker after 10000 requests (bounds memory leaks)
max_requests_jitter = 1000  # random jitter so workers do not all restart at once

# Logging
accesslog = "-"
errorlog = "-"
loglevel = "warning"

# Preload the application (saves memory)
preload_app = True

# Important: cap request sizes (defence against malicious requests)
limit_request_line = 4094
limit_request_fields = 100
limit_request_field_size = 8190

启动命令

bash 复制代码
# 使用 gevent(高并发 HTTP)
gunicorn -c gunicorn.conf.py "app:create_app()"

# 使用 uvicorn(纯异步 ASGI)
gunicorn -k uvicorn.workers.UvicornWorker -w 4 "app:asgi_app"

# 使用 gunicorn + uvicorn(生产推荐)
gunicorn -k uvicorn.workers.UvicornH11Worker -w 4 -b 0.0.0.0:8000 "app:asgi_app"

第四层:监控与可观测性

使用 prometheus_client 暴露指标

python 复制代码
from prometheus_client import Counter, Histogram, Gauge, start_http_server
import functools
import time
import asyncio

# Metric definitions — label sets kept small to limit cardinality
ACTIVE_REQUESTS = Gauge(
    'http_requests_active', 
    '当前处理中的请求数',
    ['method', 'endpoint']
)

# Latency histogram; buckets span 5 ms to 10 s
REQUEST_DURATION = Histogram(
    'http_request_duration_seconds',
    '请求处理耗时',
    ['method', 'endpoint', 'status'],
    buckets=[.005, .01, .025, .05, .075, .1, .25, .5, .75, 1.0, 2.5, 5.0, 7.5, 10.0]
)

# Connection-pool gauge, labelled per pool
CONNECTION_POOL_SIZE = Gauge(
    'connection_pool_size',
    '连接池大小',
    ['pool_name']
)

# Decorator that records request metrics automatically
def monitor_async(func):
    """Wrap async *func*, tracking an in-flight gauge and a latency histogram.

    NOTE(review): assumes the HTTP method arrives as a ``method`` kwarg
    (defaults to GET) — confirm against callers.
    """
    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        method = kwargs.get('method', 'GET')
        endpoint = func.__name__

        # Fix: initialise status before the try block. Previously a
        # BaseException (e.g. asyncio.CancelledError, which "except
        # Exception" does not catch) left `status` unbound, so the finally
        # block raised NameError and masked the real error.
        status = 500
        with ACTIVE_REQUESTS.labels(method=method, endpoint=endpoint).track_inprogress():
            start = time.time()
            try:
                result = await func(*args, **kwargs)
                status = 200
                return result
            finally:
                # Runs on success, exception, and cancellation alike
                duration = time.time() - start
                REQUEST_DURATION.labels(
                    method=method,
                    endpoint=endpoint,
                    status=status
                ).observe(duration)

    return wrapper

# Monitor a connection pool (aiohttp example)
async def monitor_aiohttp_session(session, name="default"):
    """Run forever, exporting the session connector's size and limit every 10 s.

    NOTE(review): reads the private ``session._connector`` attribute — may
    break across aiohttp versions. Intended to run as a background task.
    """
    if hasattr(session, '_connector'):
        connector = session._connector
        while True:
            CONNECTION_POOL_SIZE.labels(pool_name=name).set(connector.size)
            CONNECTION_POOL_SIZE.labels(pool_name=f"{name}_limit").set(connector.limit)
            await asyncio.sleep(10)  # refresh every 10 seconds

# Start the metrics HTTP server (call once during application startup)
def start_metrics_server(port=9090):
    """Expose Prometheus metrics over HTTP on *port*."""
    start_http_server(port)
    print(f"Metrics server started on port {port}")

健康检查端点(Kubernetes 必备)

python 复制代码
from fastapi import FastAPI, HTTPException
import asyncio
import psutil

app = FastAPI()

# 连接池状态检查(自定义)
class ConnectionPoolHealth:
    def __init__(self):
        self.pools = {}
    
    def register_pool(self, name, pool_obj, max_size_attr='maxsize', 
                     used_attr='size', available_attr='available'):
        self.pools[name] = {
            'obj': pool_obj,
            'max': max_size_attr,
            'used': used_attr,
            'avail': available_attr
        }
    
    def check(self):
        status = {}
        for name, config in self.pools.items():
            pool = config['obj']
            try:
                max_size = getattr(pool, config['max'], 0)
                used = getattr(pool, config['used'], 0)
                available = getattr(pool, config['avail'], 0)
                
                usage = used / max_size if max_size > 0 else 0
                status[name] = {
                    'healthy': usage < 0.9,  # 使用率<90%为健康
                    'usage': f"{usage:.1%}",
                    'used': used,
                    'max': max_size,
                    'available': available
                }
            except Exception as e:
                status[name] = {'healthy': False, 'error': str(e)}
        
        return status

# Module-level singleton consulted by the /health endpoint below
pool_health = ConnectionPoolHealth()

@app.get("/health")
async def health_check():
    """Liveness probe: system resources plus registered connection pools.

    Responds 200 when healthy, 503 otherwise.
    """
    # Local import keeps this edit self-contained; fastapi is already a dependency.
    from fastapi.responses import JSONResponse

    # System resource checks
    cpu_percent = psutil.cpu_percent(interval=0.1)
    memory = psutil.virtual_memory()

    # Connection-pool checks
    pool_status = pool_health.check()
    pools_healthy = all(s.get('healthy', False) for s in pool_status.values())

    healthy = (
        cpu_percent < 90 and 
        memory.percent < 90 and 
        pools_healthy
    )

    status_code = 200 if healthy else 503

    # Fix: FastAPI does not support Flask-style "(body, status)" tuple
    # returns — the original serialized the whole tuple as the body and
    # always answered HTTP 200, so Kubernetes would never see unhealthy.
    return JSONResponse(
        status_code=status_code,
        content={
            "status": "healthy" if healthy else "unhealthy",
            "cpu": f"{cpu_percent}%",
            "memory": f"{memory.percent}%",
            "pools": pool_status
        }
    )

@app.get("/ready")
async def readiness_check():
    """Readiness probe: report whether dependent services are reachable."""
    # Verify downstream dependencies (database, caches, ...)
    try:
        # await check_db_connection()
        pass
    except Exception as exc:
        raise HTTPException(status_code=503, detail=str(exc))
    return {"ready": True}

🔧 五、Python 专用应急工具

  1. 现场诊断脚本(直接注入运行进程)
python 复制代码
#!/usr/bin/env python3
# debug_injector.py - 用于诊断运行中的 Python 进程

import sys
import threading
import gc
import asyncio
import inspect

def diagnose():
    """Print a snapshot of thread, event-loop, socket, executor and aiohttp
    session state for the current process.

    Intended to be exec()'d inside a running process (via gdb or manhole);
    it only reads state and writes to stdout.
    """
    print("=" * 50)
    print(f"Python 版本: {sys.version}")
    print(f"活跃线程数: {threading.active_count()}")
    print(f"当前线程: {threading.current_thread().name}")
    
    # Inspect the asyncio event loop, if one is running in this thread
    try:
        loop = asyncio.get_running_loop()
        print(f"事件循环运行中: {loop.is_running()}")
        # NOTE(review): _default_executor and _scheduled are CPython-private
        # attributes and may change between versions.
        print(f"默认执行器: {loop._default_executor}")
        if hasattr(loop, '_scheduled'):
            print(f"计划任务数: {len(loop._scheduled)}")
    except RuntimeError:
        print("无运行中的事件循环")
    
    # Count socket objects still reachable by the GC (possibly unclosed)
    import socket
    sockets = [obj for obj in gc.get_objects() if isinstance(obj, socket.socket)]
    print(f"\n内存中 socket 对象: {len(sockets)}")
    
    # Enumerate live thread pools, their worker counts and queue depth
    from concurrent.futures import ThreadPoolExecutor
    executors = [obj for obj in gc.get_objects() if isinstance(obj, ThreadPoolExecutor)]
    print(f"\nThreadPoolExecutor 实例: {len(executors)}")
    for i, exe in enumerate(executors):
        print(f"  Executor {i}: workers={exe._max_workers}, "
              f"active={len(exe._threads)}, "
              f"queue={exe._work_queue.qsize()}")
    
    # Inspect aiohttp sessions (only when the library is installed)
    try:
        import aiohttp
        sessions = [obj for obj in gc.get_objects() if isinstance(obj, aiohttp.ClientSession)]
        print(f"\naiohttp ClientSession: {len(sessions)}")
        for s in sessions:
            if hasattr(s, '_connector'):
                conn = s._connector
                print(f"  Connector: size={conn.size}, limit={conn.limit}, "
                      f"limit_per_host={conn.limit_per_host}")
    except ImportError:
        pass
    
    print("=" * 50)

if __name__ == "__main__":
    diagnose()

使用方法:

bash 复制代码
# 方法1:使用 gdb 注入运行进程
gdb -p <pid> -ex 'call PyRun_SimpleString("exec(open(\"debug_injector.py\").read())")' -ex 'detach' -ex 'quit'

# 方法2:使用 manhole(需预先安装)
pip install manhole
# 在应用启动时添加:
import manhole
manhole.install()

# 然后连接诊断
python -m manhole <pid>
  2. 连接泄漏检测装饰器
python 复制代码
import functools
import weakref
import logging

logger = logging.getLogger(__name__)

def track_connections(cls):
    """Class decorator: track every live instance of *cls* via weak references.

    Adds two classmethods to the decorated class:
    - ``get_live_instances()`` -> list of instances still alive
    - ``log_leaked(threshold=100)`` -> warn when the live count exceeds the
      threshold, including the creation stack of the first few instances.
    """
    live_set = weakref.WeakSet()
    wrapped_init = cls.__init__

    @functools.wraps(wrapped_init)
    def tracking_init(self, *args, **kwargs):
        wrapped_init(self, *args, **kwargs)
        live_set.add(self)
        # Remember where this instance was created, for leak reports
        import traceback
        self._creation_stack = traceback.format_stack()

    def get_live_instances(klass):
        return list(live_set)

    def log_leaked(klass, threshold=100):
        alive = list(live_set)
        if len(alive) > threshold:
            logger.warning(f"检测到 {klass.__name__} 泄漏: {len(alive)} 个存活实例")
            for idx, inst in enumerate(alive[:5]):  # show only the first 5
                if hasattr(inst, '_creation_stack'):
                    logger.warning(f"实例 {idx} 创建位置:\n{''.join(inst._creation_stack[-3:])}")

    cls.__init__ = tracking_init
    cls.get_live_instances = classmethod(get_live_instances)
    cls.log_leaked = classmethod(log_leaked)

    return cls

# Usage example
@track_connections
class DatabaseConnection:
    """Minimal connection wrapper used to demonstrate leak tracking."""

    def __init__(self, dsn):
        self.dsn = dsn    # data source name
        self.conn = None  # underlying handle; opened elsewhere

    def close(self):
        """Release the underlying handle (safe to call repeatedly)."""
        handle = self.conn
        if handle:
            handle.close()
            self.conn = None

# Periodically check for leaks
import asyncio
async def leak_monitor():
    """Background task: every 60 s, warn when more than 50 DatabaseConnection
    instances are still alive. Runs until cancelled."""
    while True:
        DatabaseConnection.log_leaked(threshold=50)
        await asyncio.sleep(60)

📋 六、Python 排查 checklist

python 复制代码
□ 确认 Python 版本(3.8+ 的 asyncio 更稳定)
□ 检查是否混用同步/异步代码(requests 在 async def 中?)
□ 验证 Session/ClientSession 是否复用(不要每次新建)
□ 确认所有池化资源有界(ThreadPoolExecutor、连接池)
□ 检查超时配置(连接、读取、总超时缺一不可)
□ 验证资源关闭逻辑(try/finally 或 async with)
□ 确认使用健康检查端点(Kubernetes 场景)
□ 部署监控指标(prometheus_client)
□ 配置 Gunicorn/Uvicorn 工作模式与参数
□ 测试故障注入(模拟下游超时、连接拒绝)
相关推荐
zhaoyin19942 小时前
关于文件读取中使用的斜杠问题
python
勇往直前plus3 小时前
从文件到屏幕:Python/java 字符编码、解码、文本处理的底层逻辑解析
java·开发语言·python
~央千澈~3 小时前
优雅草科技2026年2月重磅产品·优雅草·写作中枢 — 产品介绍与发布说明
python
Emotional。7 小时前
2025 年度技术总结与规划:AI 时代的开发者成长之路
人工智能·python·ai·langchain
witAI11 小时前
**AI仿真人剧制作软件2025推荐,解锁沉浸式数字内容创作
人工智能·python
Codefengfeng12 小时前
Python Base环境中加包的方法
开发语言·python
清水白石00812 小时前
《Python 编程全景解析:从核心精要到测试替身(Test Doubles)五大武器的实战淬炼》
开发语言·python
如若12313 小时前
AutoDL云服务器 NVIDIA 570驱动 EGL渲染修复全记录
运维·服务器·python
甲枫叶13 小时前
【claude】Claude Code正式引入Git Worktree原生支持:Agent全面实现并行独立工作
java·人工智能·git·python·ai编程