从零开始学 Python：自动化 / 运维开发实战（核心库 + 3 大实战场景）

从零开始学 Python：自动化 / 运维开发实战（核心库 + 3 大实战场景）

在运维工作中，重复的服务器巡检、批量部署、日志分析等任务不仅耗时耗力，还容易出现人为失误。而 Python 凭借丰富的运维核心库，能轻松实现这些任务的自动化，大幅提升运维效率、降低操作风险。

本文将从运维开发的核心库入手，详细讲解paramiko（SSH 远程操作）、fabric（批量运维）、ansible（自动化配置）、psutil（系统监控）的使用，再通过 3 个落地实战场景（服务器批量巡检、自动化部署、日志分析工具），帮你从零基础掌握 Python 自动化 / 运维开发，实现运维工作的 "提质增效"。

一、运维开发核心库：开箱即用的运维工具

Python 运维生态成熟，以下核心库覆盖了远程操作、批量管理、系统监控等核心场景，是运维自动化的基石。

1.1 paramiko：SSH 远程操作核心库

paramiko是 Python 实现的 SSH2 协议库，支持 SSH 远程登录、执行命令、上传下载文件，无需依赖系统 SSH 客户端，是实现单台 / 多台服务器远程操作的基础。

前置条件：安装 paramiko

复制代码

pip install paramiko

核心用法

场景 1：SSH 远程执行命令

复制代码

import paramiko

def ssh_execute_command(host, port, username, password, command):
    """
    Log in to a server over SSH, run one command and collect its output.

    :param host: server IP or host name
    :param port: SSH port
    :param username: login user name
    :param password: login password
    :param command: shell command to run on the remote host
    :return: ``{"status": "success", "stdout": ..., "stderr": ..., "exit_code": ...}``
        when the command was run (check ``exit_code`` to see whether the
        command itself succeeded), or ``{"status": "failed", "error": ...}``
        when connecting or executing raised an exception
    """
    # 1. Create the SSH client
    ssh_client = paramiko.SSHClient()

    # 2. Accept unknown host keys automatically (fine for a demo; in
    #    production configure known hosts explicitly instead)
    ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    try:
        # 3. Connect to the server
        ssh_client.connect(
            hostname=host,
            port=port,
            username=username,
            password=password,
            timeout=10
        )

        # 4. Run the command; stdin is not used
        _stdin, stdout, stderr = ssh_client.exec_command(command)

        # 5. Collect the output
        stdout_result = stdout.read().decode("utf-8", errors="ignore")
        stderr_result = stderr.read().decode("utf-8", errors="ignore")

        # Ask for the exit status only after the output has been read:
        # paramiko documents that calling it first can hang on large output.
        exit_code = stdout.channel.recv_exit_status()

        return {
            "status": "success",
            "stdout": stdout_result,
            "stderr": stderr_result,
            "exit_code": exit_code
        }
    except Exception as e:
        return {
            "status": "failed",
            "error": str(e)
        }
    finally:
        # 6. Always release the connection
        ssh_client.close()

# Usage example: check disk usage on the remote host.
# The explanation lives in this comment rather than inside the command
# string, so only the actual command is sent to the remote shell.
result = ssh_execute_command(
    host="192.168.1.100",
    port=22,
    username="root",
    password="your_server_password",
    command="df -h"
)

# Print the outcome
if result["status"] == "success":
    print("命令执行成功，输出结果：")
    print(result["stdout"])
    if result["stderr"]:
        print("命令执行警告/错误：")
        print(result["stderr"])
else:
    print(f"命令执行失败：{result['error']}")

场景 2：SFTP 上传 / 下载文件

复制代码

import paramiko

def sftp_upload_file(host, port, username, password, local_file, remote_file):
    """
    Upload a local file to a remote server over SFTP.

    Prints a success or failure message and returns True on success,
    False when connecting or transferring raised an exception.
    """
    channel = paramiko.Transport((host, port))
    uploaded = False
    try:
        # Authenticate, then open an SFTP session on the same transport
        channel.connect(username=username, password=password)
        client = paramiko.SFTPClient.from_transport(channel)

        client.put(local_file, remote_file)
        print(f"文件 {local_file} 已成功上传到 {remote_file}")
        uploaded = True
    except Exception as exc:
        print(f"文件上传失败：{str(exc)}")
    finally:
        # Closing the transport also tears down the SFTP session
        channel.close()
    return uploaded

def sftp_download_file(host, port, username, password, remote_file, local_file):
    """
    Download a file from a remote server to the local machine over SFTP.

    Prints a success or failure message and returns True on success,
    False when connecting or transferring raised an exception.
    """
    channel = paramiko.Transport((host, port))
    downloaded = False
    try:
        # Authenticate, then open an SFTP session on the same transport
        channel.connect(username=username, password=password)
        client = paramiko.SFTPClient.from_transport(channel)

        client.get(remote_file, local_file)
        print(f"文件 {remote_file} 已成功下载到 {local_file}")
        downloaded = True
    except Exception as exc:
        print(f"文件下载失败：{str(exc)}")
    finally:
        # Closing the transport also tears down the SFTP session
        channel.close()
    return downloaded

# Usage example: upload a file, then download it again.
# Both transfers use the same login details, so they are defined once.
_login = {
    "host": "192.168.1.100",
    "port": 22,
    "username": "root",
    "password": "your_server_password",
}

sftp_upload_file(
    local_file="local_test.txt",
    remote_file="/root/remote_test.txt",
    **_login
)

sftp_download_file(
    remote_file="/root/remote_test.txt",
    local_file="downloaded_test.txt",
    **_login
)

1.2 fabric：批量运维自动化工具

fabric是基于paramiko封装的高级批量运维库，支持批量执行命令、批量上传下载文件、任务编排，语法更简洁，是批量管理服务器的首选工具（当前主流版本为Fabric 3，兼容 Python3）。

前置条件：安装 fabric

复制代码

pip install fabric

核心用法：批量执行运维任务

复制代码

from fabric import Connection
from invoke import Responder

# 1. Server list (could be loaded from a config file instead)
_HOSTS = ("192.168.1.100", "192.168.1.101")
servers = [
    {"host": address, "port": 22, "user": "root", "password": "your_server_password"}
    for address in _HOSTS
]

# 2. 定义批量执行命令的函数
def batch_execute_command(servers, command):
    """
    Run one command on every server in turn and print each result.

    A failure on one server (connection error or non-zero exit) is printed
    and does not stop the remaining servers from being processed.

    :param servers: iterable of dicts with "host", "port", "user" and
        "password" keys
    :param command: shell command to run as the login user (no sudo handling)
    """
    for server in servers:
        print(f"========== 开始处理服务器 {server['host']} ==========")
        try:
            # The context manager closes the SSH connection when the block
            # ends, including when run() raises.
            with Connection(
                host=server["host"],
                port=server["port"],
                user=server["user"],
                connect_kwargs={"password": server["password"]}
            ) as conn:
                # warn=True: a non-zero exit is reported through result.ok
                # instead of raising
                result = conn.run(command, hide=False, warn=True)

            if result.ok:
                print(f"服务器 {server['host']} 命令执行成功，输出：\n{result.stdout}")
            else:
                print(f"服务器 {server['host']} 命令执行失败，错误：\n{result.stderr}")

        except Exception as e:
            print(f"服务器 {server['host']} 连接/执行失败：{str(e)}")
        finally:
            print(f"========== 结束处理服务器 {server['host']} ==========\n")

# 3. 定义批量上传文件的函数
def batch_upload_file(servers, local_file, remote_file):
    """
    Upload one local file to the same remote path on every server.

    A failure on one server is printed and does not stop the remaining
    uploads.

    :param servers: iterable of dicts with "host", "port", "user" and
        "password" keys
    :param local_file: path of the file on this machine
    :param remote_file: destination path on each server
    """
    for server in servers:
        print(f"========== 开始上传文件到服务器 {server['host']} ==========")
        try:
            # The context manager closes the SSH connection when the block
            # ends, including when put() raises.
            with Connection(
                host=server["host"],
                port=server["port"],
                user=server["user"],
                connect_kwargs={"password": server["password"]}
            ) as conn:
                conn.put(local_file, remote_file)
            print(f"文件已成功上传到服务器 {server['host']} 的 {remote_file}")

        except Exception as e:
            print(f"服务器 {server['host']} 文件上传失败：{str(e)}")
        finally:
            print(f"========== 结束上传服务器 {server['host']} ==========\n")

# 4. Run the batch tasks
if __name__ == "__main__":
    # Check memory usage on every server
    batch_execute_command(servers, "free -h")

    # Push a config file to every server
    batch_upload_file(servers, "app.conf", "/etc/app.conf")

1.3 Ansible：自动化配置与运维（Python 脚本集成）

Ansible 是一款强大的 IT 自动化工具，基于 SSH 实现无代理批量运维，支持配置管理、应用部署、任务编排，其核心功能可通过 Python 脚本调用，实现更灵活的自动化流程。

前置条件：安装 Ansible

复制代码

# Ubuntu/Debian
apt install ansible

# CentOS/RHEL
yum install ansible

# 或通过pip安装
pip install ansible

核心用法：Python 脚本调用 Ansible

复制代码

import subprocess

# NOTE: the Ansible 1.x Python API (ansible.runner.Runner,
# ansible.playbook.PlayBook, ansible.inventory.Inventory) was removed in
# Ansible 2.0 and was never available on Python 3, so it cannot be combined
# with the f-strings used in this script. This version drives the `ansible`
# and `ansible-playbook` command-line tools instead.

# 1. Ansible settings
inventory = ["192.168.1.100", "192.168.1.101"]  # managed hosts
private_key_file = "/root/.ssh/id_rsa"  # key for password-less login (recommended)
remote_user = "root"


def _connection_args(hosts):
    """Return the CLI options shared by ad-hoc and playbook runs."""
    # The trailing comma marks the value as an inline host list rather than
    # the path of an inventory file.
    return [
        "-i", ",".join(hosts) + ",",
        "-u", remote_user,
        "--private-key", private_key_file,
    ]


def _run_cli(argv):
    """Run an Ansible CLI command; return its exit code, or None if the tool is missing."""
    try:
        # Output is not captured, so Ansible's per-host results go straight
        # to the terminal. An argument list (no shell) avoids quoting issues.
        return subprocess.run(argv, check=False).returncode
    except FileNotFoundError:
        print(f"未找到命令 {argv[0]}，请先安装 Ansible")
        return None


# 2. Run a single ad-hoc command
def ansible_execute_command(hosts, command):
    """
    Run one command on the given hosts with the `command` module.

    :param hosts: list of host names / IPs to target
    :param command: command line passed to the `command` module
    """
    exit_code = _run_cli(
        ["ansible", "all", *_connection_args(hosts), "-m", "command", "-a", command]
    )
    if exit_code == 0:
        print("所有服务器执行成功")
    elif exit_code is not None:
        print(f"部分服务器执行失败或无法连接，退出码：{exit_code}")


# 3. Run an Ansible playbook (for more complex task orchestration)
def ansible_run_playbook(playbook_path):
    """
    Run a playbook against the hosts listed in ``inventory``.

    :param playbook_path: path of the playbook YAML file
    """
    result = _run_cli(
        ["ansible-playbook", *_connection_args(inventory), playbook_path]
    )
    print(f"Playbook执行完成，结果：{result}")


# 4. Usage example
if __name__ == "__main__":
    # Check disk usage on both servers
    ansible_execute_command(["192.168.1.100", "192.168.1.101"], "df -h")

    # To run a playbook (deploy.yml must exist first):
    # ansible_run_playbook("deploy.yml")

补充：简易 Ansible Playbook 示例（deploy.yml）

复制代码

- hosts: all
  remote_user: root
  tasks:
    - name: 安装nginx
      yum:
        name: nginx
        state: present
    - name: 启动nginx并设置开机自启
      service:
        name: nginx
        state: started
        enabled: yes

1.4 psutil：系统监控与信息采集

psutil是 Python 跨平台系统监控库，支持获取 CPU、内存、磁盘、网络、进程等系统信息，无需调用系统命令，返回结构化数据，是服务器巡检、监控工具开发的核心库。

前置条件：安装 psutil

复制代码

pip install psutil

核心用法：获取系统核心信息

复制代码

import psutil
import datetime

def get_system_info():
    """
    Collect core system metrics of the local machine.

    Blocks for about one second while CPU usage is sampled.

    :return: dict with the keys "boot_time" (formatted string), "cpu",
        "memory", "disk" (keyed by mount point) and "network"
    """
    gib = 1024 ** 3  # bytes per GB as used for all size conversions below
    system_info = {}

    # 1. Basic system information
    system_info["boot_time"] = datetime.datetime.fromtimestamp(
        psutil.boot_time()
    ).strftime("%Y-%m-%d %H:%M:%S")

    # 2. CPU: take one blocking 1-second sample per logical CPU and derive
    #    the overall figure from that same sample. A separate non-blocking
    #    cpu_percent(interval=0) call would compare against the previous call
    #    of that counter, and psutil documents the first such call as not
    #    meaningful.
    per_cpu_percent = psutil.cpu_percent(interval=1, percpu=True)
    if per_cpu_percent:
        total_percent = round(sum(per_cpu_percent) / len(per_cpu_percent), 1)
    else:
        total_percent = 0.0
    system_info["cpu"] = {
        "logical_count": psutil.cpu_count(logical=True),
        "total_percent": total_percent,
        "per_cpu_percent": per_cpu_percent
    }

    # 3. Memory
    mem = psutil.virtual_memory()
    system_info["memory"] = {
        "total_gb": round(mem.total / gib, 2),
        "used_gb": round(mem.used / gib, 2),
        "free_gb": round(mem.free / gib, 2),
        "used_percent": mem.percent
    }

    # 4. Disks (partitions reported with all=False that have a file system)
    disk_info = {}
    for partition in psutil.disk_partitions(all=False):
        if not partition.fstype:
            continue
        try:
            disk_usage = psutil.disk_usage(partition.mountpoint)
        except OSError:
            # An unreadable mount point (e.g. permission denied or an empty
            # removable drive) is skipped so the other metrics still come back.
            continue
        disk_info[partition.mountpoint] = {
            "total_gb": round(disk_usage.total / gib, 2),
            "used_gb": round(disk_usage.used / gib, 2),
            "free_gb": round(disk_usage.free / gib, 2),
            "used_percent": disk_usage.percent
        }
    system_info["disk"] = disk_info

    # 5. Network: total bytes sent / received across all interfaces
    net_io = psutil.net_io_counters()
    system_info["network"] = {
        "sent_gb": round(net_io.bytes_sent / gib, 2),
        "recv_gb": round(net_io.bytes_recv / gib, 2)
    }

    return system_info

# 调用示例：打印系统信息
if __name__ == "__main__":
    sys_info = get_system_info()
    print("========== 服务器系统信息 ==========")
    print(f"开机时间：{sys_info['boot_time']}")
    print(f"CPU使用率：{sys_info['cpu']['total_percent']}%")
    print(f"内存使用率：{sys_info['memory']['used_percent']}%")
    # Print every collected mount point instead of indexing "/" directly:
    # "/" is not guaranteed to be among the reported partitions (for example
    # on Windows), and a direct lookup would raise KeyError.
    for mount_point, usage in sys_info["disk"].items():
        print(f"磁盘使用率（{mount_point}）：{usage['used_percent']}%")

二、运维实战场景 1：服务器批量巡检脚本

场景需求

批量巡检多台服务器的 CPU、内存、磁盘使用率，设置阈值告警（如 CPU 使用率 > 80%、内存 > 90%、磁盘 > 95%），将巡检结果输出到日志文件并生成简易报告。

完整实现代码

复制代码

import psutil
import datetime
import logging
from fabric import Connection

# 1. Logging: INFO and above go to a UTF-8 log file and to the console
_log_handlers = [
    logging.FileHandler("server_inspection.log", encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)

# 2. Settings
# Servers to inspect (all share the same port and login in this example)
SERVERS = [
    dict(host=address, port=22, user="root", password="your_server_password")
    for address in ("192.168.1.100", "192.168.1.101")
]
# Alarm thresholds, in percent
THRESHOLDS = dict(cpu=80, memory=90, disk=95)
# Where the inspection report is written
REPORT_PATH = "server_inspection_report.txt"

# 3. 本地巡检函数（若脚本在目标服务器运行）
def local_inspection():
    """
    Inspect the machine this script is running on.

    The result has the same shape as the one from remote_inspection()
    ("status", "host", plus "data" or "error"), so it can be passed to
    generate_report(), which reads result["host"].
    """
    host = "localhost"
    try:
        sys_info = get_system_info()
        return {"status": "success", "host": host, "data": sys_info}
    except Exception as e:
        logger.error(f"本地巡检失败：{str(e)}")
        return {"status": "failed", "host": host, "error": str(e)}

# 4. 远程巡检函数（通过fabric远程调用psutil，需目标服务器已安装psutil）
def remote_inspection(server):
    """
    Inspect one remote server by running a small psutil script on it.

    The target server needs python3 and psutil installed.

    :param server: dict with "host", "port", "user" and "password" keys
    :return: {"status": "success", "host": ..., "data": ...} or
        {"status": "failed", "host": ..., "error": ...}
    """
    import ast
    import shlex

    logger.info(f"开始巡检远程服务器：{server['host']}")

    # Script executed on the remote host; it prints the metrics dict, which
    # is parsed back from stdout below.
    remote_script = """
import psutil
import datetime

def get_system_info():
    system_info = {}
    system_info["boot_time"] = datetime.datetime.fromtimestamp(psutil.boot_time()).strftime("%Y-%m-%d %H:%M:%S")
    system_info["cpu"] = {"total_percent": psutil.cpu_percent(interval=1)}
    mem = psutil.virtual_memory()
    system_info["memory"] = {"used_percent": mem.percent}
    disk_info = {}
    for p in psutil.disk_partitions(all=False):
        if p.fstype:
            du = psutil.disk_usage(p.mountpoint)
            disk_info[p.mountpoint] = {"used_percent": du.percent}
    system_info["disk"] = disk_info
    return system_info

print(get_system_info())
"""

    try:
        # The context manager closes the SSH connection when the block ends,
        # including when run() raises.
        with Connection(
            host=server["host"],
            port=server["port"],
            user=server["user"],
            connect_kwargs={"password": server["password"]}
        ) as conn:
            # shlex.quote keeps the script a single shell argument even if
            # it is later edited to contain single quotes.
            result = conn.run(
                f"python3 -c {shlex.quote(remote_script)}", hide=True, warn=True
            )

        if not result.ok:
            logger.error(f"服务器 {server['host']} 执行脚本失败：{result.stderr}")
            return {"status": "failed", "host": server["host"], "error": result.stderr}

        # Parse the printed dict (simplified; JSON would be the sturdier choice)
        sys_info = ast.literal_eval(result.stdout)
        logger.info(f"服务器 {server['host']} 巡检成功")
        return {"status": "success", "host": server["host"], "data": sys_info}

    except Exception as e:
        logger.error(f"服务器 {server['host']} 巡检异常：{str(e)}")
        return {"status": "failed", "host": server["host"], "error": str(e)}

# 5. 生成巡检报告
def generate_report(inspection_results):
    """
    Render the inspection results as a text report and write it to REPORT_PATH.

    Metrics above their THRESHOLDS entry are tagged with an alarm marker;
    failed inspections are listed with their error message.
    """
    def alarm_tag(value, metric):
        # Alarm marker when the value exceeds the configured threshold
        return "[告警]" if value > THRESHOLDS[metric] else ""

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = [f"# 服务器批量巡检报告 {timestamp}", "=" * 50, ""]

    for entry in inspection_results:
        if entry["status"] != "success":
            lines.append(f"## 服务器：{entry['host']} [巡检失败]")
            lines.append(f"  错误信息：{entry['error']}")
            lines.append("")
            continue

        host = entry["host"]
        metrics = entry["data"]
        lines.append(f"## 服务器：{host}")
        lines.append(f"  开机时间：{metrics['boot_time']}")

        cpu_pct = metrics["cpu"]["total_percent"]
        lines.append(f"  CPU使用率：{cpu_pct}% {alarm_tag(cpu_pct, 'cpu')}")

        mem_pct = metrics["memory"]["used_percent"]
        lines.append(f"  内存使用率：{mem_pct}% {alarm_tag(mem_pct, 'memory')}")

        lines.append("  磁盘使用率：")
        for mount_point, disk_data in metrics["disk"].items():
            disk_pct = disk_data["used_percent"]
            lines.append(f"    {mount_point}：{disk_pct}% {alarm_tag(disk_pct, 'disk')}")
        lines.append("")

    # Write the whole report in one go
    with open(REPORT_PATH, "w", encoding="utf-8") as report_file:
        report_file.write("\n".join(lines))

    logger.info(f"巡检报告已生成，保存路径：{REPORT_PATH}")

# 6. 核心：系统信息采集（与前文一致，简化版）
def get_system_info():
    """
    Collect boot time, CPU, memory and per-mount-point disk usage locally.

    Blocks for about one second while CPU usage is sampled.

    :return: dict with the keys "boot_time", "cpu", "memory" and "disk"
    """
    system_info = {}
    system_info["boot_time"] = datetime.datetime.fromtimestamp(
        psutil.boot_time()
    ).strftime("%Y-%m-%d %H:%M:%S")
    system_info["cpu"] = {"total_percent": psutil.cpu_percent(interval=1)}
    mem = psutil.virtual_memory()
    system_info["memory"] = {"used_percent": mem.percent}
    disk_info = {}
    for partition in psutil.disk_partitions(all=False):
        if not partition.fstype:
            continue
        try:
            disk_usage = psutil.disk_usage(partition.mountpoint)
        except OSError:
            # An unreadable mount point is skipped so that one bad partition
            # does not make the whole inspection fail.
            continue
        disk_info[partition.mountpoint] = {"used_percent": disk_usage.percent}
    system_info["disk"] = disk_info
    return system_info

# 7. 主函数：批量巡检
def main():
    """Inspect every server in SERVERS, then write the report."""
    logger.info("========== 开始服务器批量巡检 ==========")

    # One result dict per server, in configuration order
    inspection_results = [remote_inspection(server) for server in SERVERS]

    generate_report(inspection_results)
    logger.info("========== 服务器批量巡检结束 ==========")


if __name__ == "__main__":
    main()

使用说明

修改SERVERS配置中的服务器 IP、账号、密码；
目标服务器需安装 Python3 和psutil（可通过批量命令提前安装：pip3 install psutil）；
运行脚本后，生成server_inspection.log（巡检日志）和server_inspection_report.txt（巡检报告）；
超过阈值的指标会标注[告警]，可后续扩展邮件 / 钉钉告警功能。

三、运维实战场景 2：自动化部署脚本

场景需求

实现 Python/Web 项目的自动化部署，核心流程：「从 Git 拉取最新代码」→「安装 / 更新依赖」→「项目打包（可选）」→「停止旧服务」→「启动新服务」→「验证服务可用性」。

完整实现代码

复制代码

import logging
from fabric import Connection
from datetime import datetime

# 1. Logging: INFO and above go to a UTF-8 log file and to the console
_log_handlers = [
    logging.FileHandler("deploy_automation.log", encoding="utf-8"),
    logging.StreamHandler(),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=_log_handlers,
)
logger = logging.getLogger(__name__)

# 2. Deployment settings
# Target server and login
_SERVER_SETTINGS = dict(
    host="192.168.1.100",
    port=22,
    user="root",
    password="your_server_password",
)
# Project layout on the target server
_PROJECT_SETTINGS = dict(
    name="my_python_project",
    git_repo="https://github.com/your-name/your-project.git",
    deploy_path="/opt/projects",
    venv_path="/opt/venv",  # virtual environment path
    requirements_file="requirements.txt",
    service_name="my_project.service",  # systemd unit name
)
DEPLOY_CONFIG = {"server": _SERVER_SETTINGS, "project": _PROJECT_SETTINGS}

# 3. 定义单个部署步骤
def git_pull_code(conn, project_config):
    """Clone the repository on first deploy, otherwise pull the latest code."""
    checkout_dir = "{}/{}".format(project_config["deploy_path"], project_config["name"])
    logger.info(f"开始拉取代码，项目路径：{checkout_dir}")

    # warn=True keeps a non-zero exit of `test -d` (directory missing) from
    # raising; the outcome is read from the result instead.
    already_cloned = conn.run(f"test -d {checkout_dir}", warn=True).ok
    if already_cloned:
        logger.info("项目目录已存在，拉取最新代码")
        conn.run(f"cd {checkout_dir} && git pull", hide=False)
    else:
        logger.info("项目目录不存在，开始克隆仓库")
        conn.run(f"git clone {project_config['git_repo']} {checkout_dir}", hide=False)

    logger.info("代码拉取完成")

def install_dependencies(conn, project_config):
    """
    Install or upgrade the project's requirements with the virtualenv's pip.

    Raises Exception when pip exits with a non-zero status.
    """
    checkout_dir = "{}/{}".format(project_config["deploy_path"], project_config["name"])
    pip_bin = "{}/bin/pip".format(project_config["venv_path"])
    requirements = "{}/{}".format(checkout_dir, project_config["requirements_file"])

    logger.info("开始安装/更新项目依赖")
    outcome = conn.run(
        f"{pip_bin} install -r {requirements} --upgrade",
        hide=False,
        warn=True
    )

    # Guard clause: report and abort on failure, otherwise fall through
    if not outcome.ok:
        logger.error(f"依赖安装失败：{outcome.stderr}")
        raise Exception("依赖安装步骤失败")

    logger.info("依赖安装/更新完成")

def stop_old_service(conn, project_config):
    """
    Stop the currently running service.

    A unit that is not installed yet (first deployment) is not treated as
    an error. Raises Exception when stopping fails for any other reason.
    """
    import re

    logger.info("开始停止旧服务")
    result = conn.run(
        f"systemctl stop {project_config['service_name']}",
        warn=True,
        hide=False
    )

    # systemctl names the unit in its message (e.g. "Unit x.service not
    # loaded."), so a fixed substring containing "*" can never match; a
    # pattern is used instead. NOTE(review): the exact wording varies by
    # systemd version - confirm against the target distribution.
    unit_missing = re.search(
        r"Unit \S+ (?:not loaded|not found|does not exist|could not be found)",
        result.stderr
    ) is not None

    if result.ok or unit_missing:
        logger.info("旧服务已停止（或服务未存在）")
    else:
        logger.error(f"停止旧服务失败：{result.stderr}")
        raise Exception("停止旧服务步骤失败")

def start_new_service(conn, project_config):
    """
    Reload systemd, then start the service and enable it at boot.

    Raises Exception when the start/enable command exits non-zero.
    """
    logger.info("开始启动新服务")

    # Pick up any changes to the unit file before starting
    conn.run("systemctl daemon-reload", hide=False)

    unit = project_config["service_name"]
    start_and_enable = f"systemctl start {unit} && systemctl enable {unit}"
    outcome = conn.run(start_and_enable, hide=False, warn=True)

    # Guard clause: report and abort on failure, otherwise fall through
    if not outcome.ok:
        logger.error(f"启动新服务失败：{outcome.stderr}")
        raise Exception("启动新服务步骤失败")

    logger.info("新服务启动成功并设置开机自启")

def verify_service(conn, project_config):
    """
    Verify the deployed service: systemd state first, then an HTTP health check.

    The health-check URL is read from the optional "health_url" key of
    project_config and defaults to http://127.0.0.1:8000/health.

    Raises Exception when systemd does not report the unit as "active".
    A non-200 health check is only logged as a warning.
    """
    logger.info("开始验证服务可用性")

    # 1. Check the systemd unit state
    service_result = conn.run(
        f"systemctl is-active {project_config['service_name']}",
        hide=True,
        warn=True
    )

    if service_result.stdout.strip() != "active":
        logger.error(f"服务未正常运行，状态：{service_result.stdout}")
        raise Exception("服务验证失败：服务未激活")

    # 2. Optional HTTP check, run on the server itself; curl prints only the
    #    HTTP status code because the response body goes to /dev/null
    health_url = project_config.get("health_url", "http://127.0.0.1:8000/health")
    api_result = conn.run(
        f"curl -s -w '%{{http_code}}' {health_url} -o /dev/null",
        hide=True,
        warn=True
    )

    if api_result.stdout.strip() == "200":
        logger.info("服务接口验证成功，返回200 OK")
    else:
        logger.warning(f"服务接口验证返回非200，状态码：{api_result.stdout}")

# 4. 主部署流程
def automated_deploy():
    """
    Run the whole deployment against the server in DEPLOY_CONFIG.

    Steps run in order: pull code, install dependencies, stop the old
    service, start the new one, verify it. The first failing step aborts
    the deployment.

    :return: True when every step succeeded, False otherwise
    """
    logger.info("========== 开始项目自动化部署 ==========")
    server_config = DEPLOY_CONFIG["server"]
    project_config = DEPLOY_CONFIG["project"]

    try:
        # The context manager closes the SSH connection when the block ends,
        # whether the steps succeed or one of them raises.
        with Connection(
            host=server_config["host"],
            port=server_config["port"],
            user=server_config["user"],
            connect_kwargs={"password": server_config["password"]}
        ) as conn:
            # Deployment steps, in order
            git_pull_code(conn, project_config)
            install_dependencies(conn, project_config)
            stop_old_service(conn, project_config)
            start_new_service(conn, project_config)
            verify_service(conn, project_config)

        logger.info("========== 项目自动化部署全部完成 ==========")
        return True

    except Exception as e:
        logger.error(f"========== 自动化部署失败：{str(e)} ==========")
        return False

if __name__ == "__main__":
    automated_deploy()

补充说明

项目需使用systemd管理服务（创建my_project.service放在/etc/systemd/system/）；
目标服务器需安装 Git、Python3、虚拟环境（可提前批量配置）；
若为 Java 项目，可修改步骤为「打包（mvn package）」→「替换 JAR 包」→「重启服务」；
可扩展回滚功能：部署前备份旧代码 / 旧包，部署失败时恢复。

示例 systemd 服务文件（my_project.service）

复制代码

[Unit]
Description=My Python Project Service
After=network.target

[Service]
User=root
WorkingDirectory=/opt/projects/my_python_project
ExecStart=/opt/venv/bin/python3 app.py
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target

四、运维实战场景 3：日志分析工具

场景需求

开发一款通用日志分析工具，支持：「按关键词过滤日志」→「统计指定报错信息出现次数」→「按时间范围筛选日志」→「生成分析报告」，适用于 Nginx、Python、Java 等各类日志文件。

完整实现代码

复制代码

import logging
import re
from datetime import datetime
from collections import Counter

# 1. Logging: console only
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger(__name__)


# 2. Core log-analysis logic
class LogAnalyzer:
    """Load a text log file into memory and filter, count and report on its lines."""

    def __init__(self, log_file_path):
        """
        :param log_file_path: path of the log file; it is loaded immediately
        :raises FileNotFoundError: when the file does not exist
        """
        self.log_file_path = log_file_path
        self.all_log_lines = []  # stripped, non-empty lines of the file
        self.load_log_file()

    def load_log_file(self):
        """Read the whole file into memory (process line by line for huge files)."""
        logger.info(f"开始加载日志文件：{self.log_file_path}")
        try:
            # errors="ignore" drops undecodable bytes instead of failing
            with open(self.log_file_path, "r", encoding="utf-8", errors="ignore") as f:
                self.all_log_lines = [line.strip() for line in f if line.strip()]
            logger.info(f"日志文件加载完成，共 {len(self.all_log_lines)} 行有效日志")
        except FileNotFoundError:
            logger.error(f"日志文件不存在：{self.log_file_path}")
            raise
        except Exception as e:
            logger.error(f"加载日志文件失败：{str(e)}")
            raise

    def filter_by_keyword(self, keywords, exclude_keywords=None):
        """
        Filter lines by case-insensitive keywords.

        :param keywords: a line is kept when it contains any of these
        :param exclude_keywords: a line is dropped when it contains any of these
        :return: list of matching lines
        """
        # Lower-case the keywords once instead of once per line
        wanted = [keyword.lower() for keyword in keywords]
        unwanted = [keyword.lower() for keyword in (exclude_keywords or [])]

        filtered_lines = []
        for line in self.all_log_lines:
            lowered = line.lower()
            if not any(keyword in lowered for keyword in wanted):
                continue
            if any(keyword in lowered for keyword in unwanted):
                continue
            filtered_lines.append(line)

        logger.info(f"按关键词过滤完成，共 {len(filtered_lines)} 行匹配日志")
        return filtered_lines

    def count_error_messages(self, error_patterns):
        """
        Count how often each matched error text occurs.

        Each line is counted at most once, under the text matched by the
        first pattern that hits (case-insensitive search).

        :param error_patterns: list of regular expressions
        :return: Counter mapping matched text to occurrence count
        """
        # Compile once up front rather than looking patterns up per line
        compiled = [re.compile(pattern, re.IGNORECASE) for pattern in error_patterns]

        error_messages = []
        for line in self.all_log_lines:
            for pattern in compiled:
                match = pattern.search(line)
                if match:
                    # An empty match falls back to the first 100 characters
                    error_messages.append(match.group(0) or line[:100])
                    break

        error_counter = Counter(error_messages)
        logger.info(f"报错信息统计完成，共识别 {len(error_counter)} 种不同报错")
        return error_counter

    def filter_by_time_range(self, start_time, end_time,
                             time_pattern=r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}",
                             time_format="%Y-%m-%d %H:%M:%S"):
        """
        Keep the lines whose timestamp lies within [start_time, end_time].

        :param start_time: range start, as a string in ``time_format``
        :param end_time: range end, as a string in ``time_format``
        :param time_pattern: regex that extracts the timestamp from a line
        :param time_format: strptime format of the extracted timestamp; it
            must describe what ``time_pattern`` matches
        :return: list of matching lines; lines without a parsable timestamp
            are skipped
        :raises ValueError: when start_time or end_time does not match
            ``time_format``
        """
        try:
            start_dt = datetime.strptime(start_time, time_format)
            end_dt = datetime.strptime(end_time, time_format)
        except ValueError:
            logger.error(f"时间格式错误，需符合：{time_format}")
            raise

        timestamp_re = re.compile(time_pattern)
        filtered_lines = []
        for line in self.all_log_lines:
            time_match = timestamp_re.search(line)
            if not time_match:
                continue

            try:
                log_dt = datetime.strptime(time_match.group(0), time_format)
            except ValueError:
                # Looked like a timestamp but is not a valid date/time
                continue
            if start_dt <= log_dt <= end_dt:
                filtered_lines.append(line)

        logger.info(f"按时间范围过滤完成，共 {len(filtered_lines)} 行匹配日志")
        return filtered_lines

    def generate_analysis_report(self, report_path, keywords=None, error_patterns=None, time_range=None):
        """
        Write a text analysis report.

        :param report_path: where the report is saved
        :param keywords: keywords for the optional keyword section
        :param error_patterns: regexes for the error statistics; a default
            set is used when omitted
        :param time_range: optional ``(start_time, end_time)`` tuple for the
            time-range section
        """
        if keywords is None:
            keywords = []
        if error_patterns is None:
            error_patterns = [r"error|exception|fail|fatal", r"500 Internal Server Error", r"404 Not Found"]
        if time_range is None:
            time_range = ("", "")

        logger.info(f"开始生成日志分析报告：{report_path}")
        report = [
            "# 日志分析报告",
            f"生成时间：{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            f"日志文件：{self.log_file_path}",
            f"日志总行数：{len(self.all_log_lines)}",
            "="*60,
            ""
        ]

        # 1. Keyword section (only when keywords were given)
        if keywords:
            keyword_lines = self.filter_by_keyword(keywords)
            report.append(f"## 一、关键词过滤结果（关键词：{','.join(keywords)}）")
            report.append(f"匹配行数：{len(keyword_lines)}")
            report.append("前10条匹配日志：")
            for i, line in enumerate(keyword_lines[:10], 1):
                report.append(f"  {i}. {line}")
            report.append("")

        # 2. Error statistics
        error_counter = self.count_error_messages(error_patterns)
        total_errors = sum(error_counter.values())
        report.append("## 二、报错信息统计")
        report.append(f"报错类型数：{len(error_counter)}")
        report.append(f"报错总次数：{total_errors}")
        report.append("报错Top10排名：")
        for i, (error_msg, count) in enumerate(error_counter.most_common(10), 1):
            report.append(f"  {i}. 「{error_msg}」：{count} 次")
        report.append("")

        # 3. Time-range section (only when both bounds were given)
        if all(time_range):
            time_lines = self.filter_by_time_range(time_range[0], time_range[1])
            report.append(f"## 三、时间范围过滤结果（{time_range[0]} - {time_range[1]}）")
            report.append(f"匹配行数：{len(time_lines)}")
            report.append("")

        # 4. Summary
        report.append("## 四、分析总结")
        if total_errors == 0:
            report.append("未检测到明显报错信息，日志整体正常。")
        else:
            report.append(f"检测到 {total_errors} 次报错，需重点关注Top3报错类型。")

        # Write the report file
        with open(report_path, "w", encoding="utf-8") as f:
            f.write("\n".join(report))

        logger.info(f"日志分析报告生成完成，保存路径：{report_path}")

# 3. 主函数：使用日志分析工具
def main():
    """Analyze the configured log file and write a report; failures are logged."""
    # Settings
    log_file = "/var/log/nginx/access.log"  # log file to analyze
    report_file = "log_analysis_report.txt"

    try:
        # Keywords, error patterns and time range can all be customized
        LogAnalyzer(log_file).generate_analysis_report(
            report_path=report_file,
            keywords=["404", "500"],
            time_range=("2024-01-01 00:00:00", "2024-01-31 23:59:59")
        )
    except Exception as exc:
        logger.error(f"日志分析失败：{str(exc)}")


if __name__ == "__main__":
    main()

使用说明

修改LOG_FILE为目标日志文件路径（Nginx、Python、Java 日志均可）；
支持自定义keywords（过滤关键词）、error_patterns（报错正则）、time_range（时间范围）；
大日志文件可优化为「逐行处理」，避免内存溢出；
生成的log_analysis_report.txt包含详细分析结果，可直接用于运维排查。

五、总结

本文系统讲解了 Python 自动化 / 运维开发的核心库和 3 个落地实战场景，核心要点总结如下：

核心运维库 ：paramiko实现基础 SSH/SFTP 操作，fabric简化批量运维，ansible适合复杂配置编排，psutil实现系统信息采集，四者覆盖运维核心需求；
批量巡检脚本 ：基于fabric+psutil实现多服务器指标采集，设置阈值告警，生成结构化报告，解决重复巡检痛点；
自动化部署脚本：按「拉取代码→安装依赖→启停服务→验证可用性」编排任务，支持扩展回滚、多环境部署，提升部署效率和一致性；
日志分析工具：基于re正则和Counter统计，支持关键词 / 时间过滤、报错统计，生成文本分析报告，辅助快速排查问题。