本文详解如何用Python开发Telegram机器人,实现服务器监控和自动告警。
前言
服务器出问题了,怎么第一时间知道?
- 邮件通知?可能漏看
- 短信通知?要钱
- Telegram Bot?免费、实时、还能远程控制
今天用Python开发一个服务器监控机器人。
一、Telegram Bot简介
1.1 为什么选择Telegram
diff
优点:
- 完全免费
- API简单易用
- 消息实时推送
- 支持群组/频道
- 可发送文件、图片
- 支持命令和按钮交互
适用场景:
- 服务器监控告警
- CI/CD通知
- 定时任务提醒
- 远程执行命令
- 日志推送
1.2 创建Bot
markdown
1. 打开Telegram,搜索 @BotFather
2. 发送 /newbot
3. 输入Bot名称,如:MyServerBot
4. 输入Bot用户名,如:my_server_monitor_bot
5. 获得Token:123456789:ABCdefGHIjklMNOpqrsTUVwxyz
6. 保存好Token!
1.3 获取Chat ID
diff
方法1:使用@userinfobot
- 搜索@userinfobot
- 发送任意消息
- 返回你的Chat ID
方法2:API获取
- 给你的Bot发送消息
- 访问:https://api.telegram.org/bot<TOKEN>/getUpdates
- 在返回的JSON中找到chat.id
二、环境准备
2.1 安装依赖
bash
pip install "python-telegram-bot>=20" # Bot框架(本文代码使用v20+的异步API)
pip install psutil # 系统监控
pip install aiohttp # 异步HTTP
pip install apscheduler # 定时任务
2.2 项目结构
bash
telegram_bot/
├── bot.py # 主程序
├── config.py # 配置
├── monitors/ # 监控模块
│ ├── __init__.py
│ ├── cpu.py
│ ├── memory.py
│ ├── disk.py
│ └── network.py
├── handlers/ # 命令处理
│ ├── __init__.py
│ └── commands.py
└── requirements.txt
三、基础Bot开发
3.1 Hello World
python
# bot.py
from telegram import Update
from telegram.ext import Application, CommandHandler, ContextTypes
# Configuration: bot token obtained from @BotFather (replace the placeholder).
TOKEN = "你的Bot Token"
# Command handlers
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply to /start with a short greeting.

    ``effective_message`` is used instead of ``message``: CommandHandler also
    fires for edited messages, and for those ``update.message`` is ``None``.
    """
    await update.effective_message.reply_text("你好!我是服务器监控机器人。")
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply to /help with the list of available commands.

    ``effective_message`` is used instead of ``message`` so that a command sent
    by editing an earlier message (where ``update.message`` is ``None``) is
    still answered instead of raising AttributeError.
    """
    help_text = """
可用命令:
/start - 开始
/help - 帮助
/status - 服务器状态
/cpu - CPU使用率
/memory - 内存使用
/disk - 磁盘使用
"""
    await update.effective_message.reply_text(help_text)
def main():
    """Build the application, register the command handlers and start polling."""
    application = Application.builder().token(TOKEN).build()

    # Command name -> coroutine that handles it.
    for command, callback in (("start", start), ("help", help_command)):
        application.add_handler(CommandHandler(command, callback))

    print("Bot启动中...")
    # Blocks until the process is interrupted.
    application.run_polling()


if __name__ == "__main__":
    main()
3.2 运行测试
bash
python bot.py
# 在Telegram中:
# 1. 搜索你的Bot
# 2. 发送 /start
# 3. 发送 /help
四、系统监控功能
4.1 CPU监控
python
# monitors/cpu.py
import psutil
def get_cpu_info():
    """Collect a snapshot of CPU usage.

    Blocks for about one second while psutil measures utilisation.

    Returns:
        dict with keys:
            "percent": overall utilisation in percent, the mean of the
                per-core readings rounded to one decimal,
            "count": value of ``psutil.cpu_count()``,
            "freq": current frequency reported by ``psutil.cpu_freq()``,
                or 0 when psutil cannot report one,
            "per_cpu": list of per-core utilisation percentages.
    """
    # A single blocking sample: the overall figure is derived from the
    # per-core readings, so both describe the same 1-second window and the
    # caller is not blocked for a second, redundant measurement.
    per_cpu = psutil.cpu_percent(interval=1, percpu=True)
    cpu_percent = round(sum(per_cpu) / len(per_cpu), 1) if per_cpu else 0.0

    cpu_freq = psutil.cpu_freq()
    return {
        "percent": cpu_percent,
        "count": psutil.cpu_count(),
        # cpu_freq() may return nothing usable; fall back to 0.
        "freq": cpu_freq.current if cpu_freq else 0,
        "per_cpu": per_cpu,
    }
def format_cpu_info():
    """Render the CPU snapshot from get_cpu_info() as a multi-line message.

    The message has a header with total usage, core count and frequency,
    followed by one line per core with a 10-cell usage bar.
    """
    snapshot = get_cpu_info()

    header_lines = [
        "",
        "🖥️ CPU状态",
        "━━━━━━━━━━━━━━━",
        f"总使用率: {snapshot['percent']}%",
        f"核心数量: {snapshot['count']}",
        f"当前频率: {snapshot['freq']:.0f} MHz",
        "各核心使用率:",
        "",
    ]
    pieces = ["\n".join(header_lines)]

    for core_index, usage in enumerate(snapshot['per_cpu']):
        # One filled cell per full 10% of usage, padded to 10 cells.
        filled = int(usage / 10)
        gauge = "█" * filled + "░" * (10 - filled)
        pieces.append(f" 核心{core_index}: [{gauge}] {usage}%\n")

    return "".join(pieces)
4.2 内存监控
python
# monitors/memory.py
import psutil
def get_memory_info():
    """Collect physical-memory and swap usage.

    Returns:
        dict whose size fields ("total", "used", "available", "swap_total",
        "swap_used") are byte counts divided by 1024**3, and whose
        "percent" / "swap_percent" fields are the percentages reported by
        psutil.
    """
    bytes_per_gb = 1024 ** 3
    virtual = psutil.virtual_memory()
    swap_area = psutil.swap_memory()

    stats = {}
    stats["total"] = virtual.total / bytes_per_gb
    stats["used"] = virtual.used / bytes_per_gb
    stats["available"] = virtual.available / bytes_per_gb
    stats["percent"] = virtual.percent
    stats["swap_total"] = swap_area.total / bytes_per_gb
    stats["swap_used"] = swap_area.used / bytes_per_gb
    stats["swap_percent"] = swap_area.percent
    return stats
def format_memory_info():
    """Render the snapshot from get_memory_info() as a multi-line message.

    Shows a 10-cell usage bar, then physical-memory and swap figures with one
    decimal place.
    """
    stats = get_memory_info()

    # One filled cell per full 10% of memory usage.
    filled = int(stats['percent'] / 10)
    gauge = "█" * filled + "░" * (10 - filled)

    lines = [
        "",
        "💾 内存状态",
        "━━━━━━━━━━━━━━━",
        f"使用率: [{gauge}] {stats['percent']}%",
        "物理内存:",
        f"总量: {stats['total']:.1f} GB",
        f"已用: {stats['used']:.1f} GB",
        f"可用: {stats['available']:.1f} GB",
        "交换分区:",
        f"总量: {stats['swap_total']:.1f} GB",
        f"已用: {stats['swap_used']:.1f} GB ({stats['swap_percent']}%)",
        "",
    ]
    return "\n".join(lines)
4.3 磁盘监控
python
# monitors/disk.py
import psutil
def get_disk_info():
    """Collect usage figures for every partition psutil lists.

    Returns:
        list of dicts, one per readable partition, with keys "device",
        "mountpoint", "total", "used", "free" (byte counts divided by
        1024**3) and "percent".

    Partitions whose usage cannot be read are skipped.
    """
    gib = 1024 ** 3
    partitions = []
    for part in psutil.disk_partitions():
        try:
            usage = psutil.disk_usage(part.mountpoint)
        except OSError:
            # Only filesystem errors (e.g. permission denied) mean "skip this
            # partition". A bare ``except`` would also swallow
            # KeyboardInterrupt and genuine programming errors.
            continue
        partitions.append({
            "device": part.device,
            "mountpoint": part.mountpoint,
            "total": usage.total / gib,
            "used": usage.used / gib,
            "free": usage.free / gib,
            "percent": usage.percent,
        })
    return partitions
def format_disk_info():
    """Render the partitions from get_disk_info() as a multi-line message.

    Each partition contributes its mountpoint, a 10-cell usage bar, and the
    used/total and free figures with one decimal place.
    """
    chunks = ["💿 磁盘状态\n━━━━━━━━━━━━━━━\n"]

    for entry in get_disk_info():
        # One filled cell per full 10% of usage.
        filled = int(entry['percent'] / 10)
        gauge = "█" * filled + "░" * (10 - filled)
        section = [
            "",
            f"{entry['mountpoint']}",
            f"[{gauge}] {entry['percent']}%",
            f"已用: {entry['used']:.1f} GB / {entry['total']:.1f} GB",
            f"剩余: {entry['free']:.1f} GB",
            "",
        ]
        chunks.append("\n".join(section))

    return "".join(chunks)
4.4 网络监控
python
# monitors/network.py
import psutil
import socket
def get_network_info():
    """Collect hostname, IP address, traffic counters and connection count.

    Returns:
        dict with keys "hostname", "ip" (or "未知" when the hostname cannot
        be resolved), "bytes_sent" / "bytes_recv" (byte counters divided by
        1024**2), "packets_sent", "packets_recv" and "connections".
    """
    # Network I/O counters
    net_io = psutil.net_io_counters()

    # IP address, resolved from the local hostname
    hostname = socket.gethostname()
    try:
        ip = socket.gethostbyname(hostname)
    except OSError:
        # Resolution errors (socket.gaierror / socket.herror) are OSError
        # subclasses. A bare ``except`` would also hide KeyboardInterrupt.
        ip = "未知"

    # Number of network connections
    # NOTE(review): psutil.net_connections() may need elevated privileges on
    # some platforms; confirm for the deployment target.
    connections = len(psutil.net_connections())

    mib = 1024 ** 2
    return {
        "hostname": hostname,
        "ip": ip,
        "bytes_sent": net_io.bytes_sent / mib,
        "bytes_recv": net_io.bytes_recv / mib,
        "packets_sent": net_io.packets_sent,
        "packets_recv": net_io.packets_recv,
        "connections": connections,
    }
def format_network_info():
    """Render the snapshot from get_network_info() as a multi-line message.

    Lists hostname, IP address and connection count, then the sent/received
    traffic with one decimal place and the packet counters.
    """
    net = get_network_info()
    lines = [
        "",
        "🌐 网络状态",
        "━━━━━━━━━━━━━━━",
        f"主机名: {net['hostname']}",
        f"IP地址: {net['ip']}",
        f"连接数: {net['connections']}",
        "流量统计:",
        f"发送: {net['bytes_sent']:.1f} MB",
        f"接收: {net['bytes_recv']:.1f} MB",
        f"发送包: {net['packets_sent']}",
        f"接收包: {net['packets_recv']}",
        "",
    ]
    return "\n".join(lines)
五、完整Bot实现
5.1 主程序
python
# bot.py
import asyncio
from datetime import datetime

import psutil
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from telegram import Update, InlineKeyboardButton, InlineKeyboardMarkup
from telegram.error import BadRequest
from telegram.ext import (
    Application, CommandHandler, CallbackQueryHandler, ContextTypes
)
# Configuration
# Bot token obtained from @BotFather; replace the placeholder before running.
TOKEN = "你的Bot Token"
# Chat that receives alerts and reports; exec_command also compares the
# sender's user id against this value.
ADMIN_CHAT_ID = 123456789 # Your chat ID
# Alert thresholds, compared against psutil's usage percentages; a reading
# strictly above its threshold is flagged.
THRESHOLDS = {
"cpu": 80,
"memory": 85,
"disk": 90
}
# Monitoring helpers (simplified version that folds in the modules above)
def get_system_status():
    """Take one snapshot of CPU, memory, root-disk usage and uptime.

    Blocks for about one second while psutil samples CPU utilisation.

    Returns:
        dict with "cpu", "memory" and "disk" usage percentages and "uptime"
        as a ``timedelta`` since ``psutil.boot_time()``.
    """
    cpu_usage = psutil.cpu_percent(interval=1)
    memory_usage = psutil.virtual_memory().percent
    root_usage = psutil.disk_usage('/').percent
    # Uptime = current local time minus the boot timestamp.
    since_boot = datetime.now() - datetime.fromtimestamp(psutil.boot_time())

    snapshot = dict(cpu=cpu_usage, memory=memory_usage, disk=root_usage)
    snapshot["uptime"] = since_boot
    return snapshot
def format_status():
    """Render the snapshot from get_system_status() as an overview message.

    Each metric is prefixed with a red icon when it exceeds its entry in
    THRESHOLDS and a green one otherwise. Uptime is shown without fractional
    seconds, and the current local time is appended.
    """
    snapshot = get_system_status()

    def light(metric):
        # Red when the reading is strictly above its configured threshold.
        if snapshot[metric] > THRESHOLDS[metric]:
            return "🔴"
        return "🟢"

    # str(timedelta) may end in ".microseconds"; keep only the part before it.
    uptime_text = str(snapshot['uptime']).split('.')[0]
    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    lines = [
        "",
        "📊 服务器状态概览",
        "━━━━━━━━━━━━━━━━━━",
        f"{light('cpu')} CPU: {snapshot['cpu']}%",
        f"{light('memory')} 内存: {snapshot['memory']}%",
        f"{light('disk')} 磁盘: {snapshot['disk']}%",
        f"⏱️ 运行时间: {uptime_text}",
        f"🕐 更新时间: {stamp}",
        "",
    ]
    return "\n".join(lines)
# Command handlers
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply to /start with the inline keyboard of monitoring views.

    Each button carries a ``callback_data`` value that button_callback
    dispatches on. ``effective_message`` is used instead of ``message``
    because CommandHandler also fires for edited messages, where
    ``update.message`` is ``None``.
    """
    keyboard = [
        [
            InlineKeyboardButton("📊 状态", callback_data="status"),
            InlineKeyboardButton("🖥️ CPU", callback_data="cpu"),
        ],
        [
            InlineKeyboardButton("💾 内存", callback_data="memory"),
            InlineKeyboardButton("💿 磁盘", callback_data="disk"),
        ],
        [
            InlineKeyboardButton("🌐 网络", callback_data="network"),
            InlineKeyboardButton("🔄 刷新", callback_data="refresh"),
        ],
    ]
    reply_markup = InlineKeyboardMarkup(keyboard)
    await update.effective_message.reply_text(
        "🤖 服务器监控机器人\n选择要查看的信息:",
        reply_markup=reply_markup,
    )
async def status(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply to /status with the overview built by format_status()."""
    # format_status() blocks for about a second while psutil samples the CPU.
    # Running it in the default executor keeps the event loop free to serve
    # other updates in the meantime.
    loop = asyncio.get_running_loop()
    text = await loop.run_in_executor(None, format_status)
    # effective_message also covers commands that arrive as edited messages,
    # for which update.message is None.
    await update.effective_message.reply_text(text)
async def button_callback(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handle a press on one of the inline-keyboard buttons.

    Dispatches on ``callback_data`` to the matching formatter and replaces
    the message text with its output. Unknown values produce "未知命令".

    NOTE(review): format_cpu_info / format_memory_info / format_disk_info /
    format_network_info are not defined or imported in this snippet;
    presumably they come from the monitors/ modules. Confirm they are in
    scope.
    """
    query = update.callback_query
    # Acknowledge the press.
    await query.answer()

    if query.data in ("status", "refresh"):
        formatter = format_status
    elif query.data == "cpu":
        formatter = format_cpu_info
    elif query.data == "memory":
        formatter = format_memory_info
    elif query.data == "disk":
        formatter = format_disk_info
    elif query.data == "network":
        formatter = format_network_info
    else:
        formatter = None

    if formatter is None:
        text = "未知命令"
    else:
        # The formatters block on psutil sampling; run them in the default
        # executor so the event loop stays responsive.
        loop = asyncio.get_running_loop()
        text = await loop.run_in_executor(None, formatter)

    try:
        await query.edit_message_text(
            text=text,
            # Re-send the existing keyboard: editing without reply_markup
            # removes the buttons, which would make "refresh" unusable.
            reply_markup=getattr(query.message, "reply_markup", None),
        )
    except BadRequest as exc:
        # Telegram rejects an edit whose content equals the current message
        # (e.g. pressing the same button twice); that is harmless here.
        if "not modified" not in str(exc).lower():
            raise
# Alert check
async def check_alerts(context: ContextTypes.DEFAULT_TYPE):
    """Send an alert to ADMIN_CHAT_ID for every metric above its threshold.

    Args:
        context: any object exposing ``.bot``; main() schedules this job with
            the Application instance as the argument.

    Nothing is sent when all metrics are within their thresholds.
    """
    # get_system_status() blocks for about a second on CPU sampling; keep
    # that off the event loop so update handling is not stalled every minute.
    loop = asyncio.get_running_loop()
    s = await loop.run_in_executor(None, get_system_status)

    alerts = []
    if s['cpu'] > THRESHOLDS['cpu']:
        alerts.append(f"🔴 CPU使用率过高: {s['cpu']}%")
    if s['memory'] > THRESHOLDS['memory']:
        alerts.append(f"🔴 内存使用率过高: {s['memory']}%")
    if s['disk'] > THRESHOLDS['disk']:
        alerts.append(f"🔴 磁盘使用率过高: {s['disk']}%")

    if not alerts:
        return

    alert_text = "⚠️ 服务器告警\n━━━━━━━━━━━━━━━\n" + "\n".join(alerts)
    alert_text += f"\n\n⏰ {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
    await context.bot.send_message(chat_id=ADMIN_CHAT_ID, text=alert_text)
# Scheduled report
async def daily_report(context: ContextTypes.DEFAULT_TYPE):
    """Send the status overview to ADMIN_CHAT_ID as the daily report.

    Args:
        context: any object exposing ``.bot``; main() schedules this job with
            the Application instance as the argument.
    """
    # format_status() blocks for about a second on CPU sampling; run it in
    # the default executor instead of on the event loop.
    loop = asyncio.get_running_loop()
    overview = await loop.run_in_executor(None, format_status)
    text = "📈 每日服务器报告\n" + overview
    await context.bot.send_message(chat_id=ADMIN_CHAT_ID, text=text)
def main():
    """Build the bot, register handlers, schedule the jobs and start polling.

    The APScheduler instance is started from the application's ``post_init``
    hook, not directly in this function. AsyncIOScheduler binds to an asyncio
    event loop when it starts, and at this point no loop is running yet, so
    an early start depends on implicit loop creation, which newer Python
    versions deprecate. ``post_init`` runs inside the loop that
    ``run_polling()`` drives, so the jobs are scheduled on the right loop.
    """
    scheduler = AsyncIOScheduler()

    async def start_scheduler(application):
        # The jobs receive the Application itself; they only use its ``.bot``.
        # Check for alerts every minute.
        scheduler.add_job(
            check_alerts, 'interval', minutes=1,
            args=[application]
        )
        # Send the report every day at 09:00.
        scheduler.add_job(
            daily_report, 'cron', hour=9,
            args=[application]
        )
        scheduler.start()

    async def stop_scheduler(application):
        # Stop firing jobs once the bot itself has shut down.
        if scheduler.running:
            scheduler.shutdown(wait=False)

    # Create the application
    app = (
        Application.builder()
        .token(TOKEN)
        .post_init(start_scheduler)
        .post_shutdown(stop_scheduler)
        .build()
    )

    # Register command and button handlers
    app.add_handler(CommandHandler("start", start))
    app.add_handler(CommandHandler("status", status))
    app.add_handler(CallbackQueryHandler(button_callback))

    # Start polling (blocks until interrupted)
    print("Bot启动中...")
    app.run_polling()


if __name__ == "__main__":
    main()
六、高级功能
6.1 远程执行命令
python
import subprocess
async def exec_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Run the shell command given to ``/exec`` and reply with its output.

    DANGER: the text after ``/exec`` is passed to a shell (``shell=True``)
    with the privileges of the bot process. Only the user whose id equals
    ADMIN_CHAT_ID is allowed; keep that check in place.

    The reply contains stdout, or stderr when stdout is empty, truncated to
    4000 characters. Commands are aborted after 30 seconds.
    """
    import asyncio
    import html

    message = update.effective_message
    user = update.effective_user

    # Permission check. Some update types carry no user at all, so guard
    # against None before reading ``.id``.
    if user is None or user.id != ADMIN_CHAT_ID:
        await message.reply_text("❌ 无权限")
        return
    if not context.args:
        await message.reply_text("用法: /exec <命令>")
        return

    cmd = " ".join(context.args)

    def run_blocking():
        return subprocess.run(
            cmd, shell=True, capture_output=True,
            text=True, timeout=30
        )

    try:
        # subprocess.run() can block for up to 30 s; run it in the default
        # executor so the bot keeps answering other updates meanwhile.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(None, run_blocking)
        output = result.stdout or result.stderr or "(无输出)"
        # Limit the output length
        if len(output) > 4000:
            output = output[:4000] + "\n...(输出过长已截断)"
        # Send as escaped HTML <pre>. Raw output inside a Markdown code
        # fence fails to parse as soon as it contains a backtick.
        await message.reply_text(
            f"<pre>{html.escape(output)}</pre>", parse_mode='HTML'
        )
    except subprocess.TimeoutExpired:
        await message.reply_text("❌ 命令执行超时")
    except Exception as e:
        # Top-level boundary of the handler: report the failure to the admin.
        await message.reply_text(f"❌ 执行失败: {e}")
6.2 进程管理
python
async def processes(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with the ten processes that report the highest CPU usage.

    NOTE(review): psutil documents the first ``cpu_percent`` reading of a
    process as 0.0, so the first invocation may show mostly zeros. Confirm
    that this is acceptable.
    """
    import html

    attrs = ['pid', 'name', 'cpu_percent', 'memory_percent']
    procs = [p.info for p in psutil.process_iter(attrs)]

    # Sort by CPU usage. An attribute that could not be read is None, so it
    # is treated as 0.
    procs.sort(key=lambda info: info['cpu_percent'] or 0, reverse=True)

    lines = ["📋 进程列表 (Top 10 CPU)", "━━━━━━━━━━━━━━━━━━"]
    for info in procs[:10]:
        # Formatting None with ``:>5.1f`` or slicing it raises TypeError;
        # substitute neutral values for unreadable attributes.
        cpu = info['cpu_percent'] or 0.0
        name = (info['name'] or "?")[:20]
        lines.append(f"{info['pid']:>6} | {cpu:>5.1f}% | {name}")
    text = "\n".join(lines)

    # Escaped HTML <pre> keeps the table monospaced without letting process
    # names that contain Markdown characters break message parsing.
    await update.effective_message.reply_text(
        f"<pre>{html.escape(text)}</pre>", parse_mode='HTML'
    )
6.3 Docker容器监控(需先安装Docker SDK:pip install docker)
python
import docker
async def docker_status(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Reply with the name and status of every Docker container.

    Containers are listed with ``all=True``. Running containers get a green
    icon, all others a red one. Any failure is reported back to the chat as
    a Docker connection error.
    """
    message = update.effective_message
    client = None
    try:
        client = docker.from_env()
        containers = client.containers.list(all=True)

        text = "🐳 Docker容器状态\n━━━━━━━━━━━━━━━━━━\n"
        for c in containers:
            status_icon = "🟢" if c.status == "running" else "🔴"
            text += f"{status_icon} {c.name[:20]}: {c.status}\n"
        if not containers:
            text += "没有容器"
        await message.reply_text(text)
    except Exception as e:
        # Top-level boundary of the handler: surface the failure to the user.
        await message.reply_text(f"❌ Docker连接失败: {e}")
    finally:
        # Release the client's connections. Without this, every /docker call
        # leaves a client open.
        if client is not None:
            client.close()
七、跨网络访问
7.1 问题
diff
场景:
- Bot运行在内网服务器
- 想在外面通过Telegram控制服务器
- 服务器没有公网IP
问题:
- Telegram API可以正常调用(服务器能访问外网)
- 但无法直接SSH到服务器进行维护
7.2 解决方案
Bot本身可以正常工作,但如果需要直接访问内网服务器:
markdown
使用组网软件(如星空组网):
1. 内网服务器安装组网客户端
2. 你的手机/电脑安装组网客户端
3. 组建虚拟局域网
4. 通过虚拟IP直接SSH到服务器
优势:
- Bot负责告警和简单查询
- 组网负责需要时的直接访问
- 互相补充,完美配合
八、部署运维
8.1 后台运行
bash
# 使用nohup
nohup python bot.py > bot.log 2>&1 &
# 使用screen
screen -S bot
python bot.py
# Ctrl+A, D 退出
# 使用systemd(推荐)
8.2 Systemd服务
ini
# /etc/systemd/system/telegram-bot.service
[Unit]
Description=Telegram Server Monitor Bot
After=network.target
[Service]
Type=simple
User=root
WorkingDirectory=/opt/telegram_bot
ExecStart=/usr/bin/python3 /opt/telegram_bot/bot.py
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
bash
sudo systemctl daemon-reload
sudo systemctl enable telegram-bot
sudo systemctl start telegram-bot
sudo systemctl status telegram-bot
九、总结
Telegram Bot监控要点:
| 功能 | 实现 |
|---|---|
| 状态查询 | 命令 + 按钮 |
| 定时检查 | APScheduler |
| 告警推送 | 阈值触发 |
| 远程控制 | 命令执行 |
Bot能做的:
- 实时查看服务器状态
- 自动告警通知
- 远程执行命令
- 定时报告
参考资料
- python-telegram-bot文档:https://docs.python-telegram-bot.org/
- psutil文档:https://psutil.readthedocs.io/
- Telegram Bot API:https://core.telegram.org/bots/api
💡 Telegram Bot是服务器监控的好帮手,配合组网软件可以实现更完整的远程运维方案。