~监控节点安装
1. 系统准备
1.1 更新系统并安装依赖
sudo yum install -y httpd php php-cli gcc glibc glibc-common gd gd-devel make net-snmp openssl-devel wget unzip
sudo yum install -y epel-release # 安装 EPEL 仓库
sudo yum install -y automake autoconf libtool #安装 Autotools 和依赖
2. 创建 Nagios 用户和组
sudo useradd nagios
sudo groupadd nagcmd
sudo usermod -a -G nagcmd nagios
sudo usermod -a -G nagcmd apache # 允许 Apache 访问 Nagios
3. 安装 Nagios Core
3.1 下载并编译源码
cd /tmp
wget https://assets.nagios.com/downloads/nagioscore/releases/nagios-4.4.6.tar.gz
tar xzf nagios-4.4.6.tar.gz
cd nagios-4.4.6
# 配置、编译、安装
./configure --with-command-group=nagcmd
make all
sudo make install
sudo make install-config # 安装默认配置文件
sudo make install-commandmode
4. 安装 Nagios 插件
4.1 下载并编译插件
cd /tmp
wget https://nagios-plugins.org/download/nagios-plugins-2.3.3.tar.gz
tar xzf nagios-plugins-2.3.3.tar.gz
cd nagios-plugins-2.3.3
# 配置、编译、安装
./configure --with-nagios-user=nagios --with-nagios-group=nagios
make
sudo make install
5. 配置 Apache 和 Web 界面
5.1 生成 Apache 配置文件
sudo cp /tmp/nagios-4.4.6/sample-config/httpd.conf /etc/httpd/conf.d/nagios.conf
5.2 配置身份验证
# 创建 Web 登录用户(例如 `nagiosadmin`)
sudo htpasswd -c /usr/local/nagios/etc/htpasswd.users nagiosadmin
# 输入两次密码
5.3 重启 Apache
sudo systemctl start httpd
sudo systemctl enable httpd
6. 配置 Systemd 服务
6.1 创建服务文件
sudo vi /etc/systemd/system/nagios.service
添加以下内容:
[Unit]
Description=Nagios
Wants=network-online.target
After=network-online.target
[Service]
# Type=forking expects the started process to daemonize, hence -d on ExecStart.
Type=forking
# Validate the configuration before starting so a broken config fails fast.
ExecStartPre=/usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
ExecStart=/usr/local/nagios/bin/nagios -d /usr/local/nagios/etc/nagios.cfg
ExecStop=/bin/kill -s TERM $MAINPID
# Exec lines are not run through a shell, so "&&" cannot be used here.
# Nagios re-reads its configuration when it receives SIGHUP.
ExecReload=/bin/kill -s HUP $MAINPID
[Install]
WantedBy=multi-user.target
6.2 启动 Nagios 服务
sudo systemctl daemon-reload
sudo systemctl start nagios
sudo systemctl enable nagios
sudo systemctl status nagios # 检查状态应为 active (running)
7. 配置防火墙
sudo firewall-cmd --permanent --add-service=http
sudo firewall-cmd --reload
8. 调整 SELinux 策略(可选)
sudo setenforce 0 # 临时禁用 SELinux(测试用)
# 或永久调整策略
sudo ausearch -c 'nagios' --raw | audit2allow -M my-nagios
sudo semodule -i my-nagios.pp
9. 访问 Nagios Web 界面
通过浏览器访问:
http://<服务器IP>/nagios
sudo chown -R nagios:nagios /usr/local/nagios
sudo chmod -R 755 /usr/local/nagios
1. 检查 Nagios 配置文件
运行配置验证命令,确保无语法错误:
sudo /usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
•关键点:
如果输出显示错误(如 Error processing object config files),需按提示修复配置文件。
常见错误:未闭合的引号、错误的主机/服务定义、插件路径错误。
•2. 调整 Systemd 服务超时时间
编辑服务文件,增加启动超时时间(默认 90 秒):
sudo vi /etc/systemd/system/nagios.service
修改以下参数:
[Service]
# Comments must be on their own line: systemd does not strip a trailing
# "# ..." from a setting, so it would become part of the value.
# Start timeout: 300 seconds (5 minutes).
TimeoutStartSec=300
# Stop timeout: 30 seconds.
TimeoutStopSec=30
重新加载并重启服务:
sudo systemctl daemon-reload
sudo systemctl restart nagios
-
用户名 :
nagiosadmin
-
密码:之前设置的密码
10. 添加被监控节点
10.1 创建主机配置文件
sudo vi /usr/local/nagios/etc/objects/hosts.cfg
添加主机定义(示例):
define host {
host_name node1
alias Node1 Server
# 被监控节点 IP
address 192.168.1.100
check_command check-host-alive
max_check_attempts 5
check_interval 5
check_period 24x7
contacts nagiosadmin
}
10.2 创建服务配置文件
sudo vi /usr/local/nagios/etc/objects/services.cfg
添加服务定义(示例):
sudo cat /usr/local/nagios/etc/objects/templates.cfg
sudo nano /usr/local/nagios/etc/objects/services1.cfg
define service {
use local-service #cat /usr/local/nagios/etc/objects/templates.cfg的service的name
host_name node1
service_description Disk Usage
check_command check_nrpe!check_disk
contacts nagiosadmin
}
sudo nano /usr/local/nagios/etc/objects/services2.cfg
define service {
use local-service #cat /usr/local/nagios/etc/objects/templates.cfg的service的name
host_name node1
service_description CPU Load
check_command check_nrpe!check_load
contacts nagiosadmin
}
10.3 配置 check_nrpe
命令
编辑 /usr/local/nagios/etc/objects/commands.cfg
,添加:
define command {
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -t 30 -c $ARG1$
}
10.4 验证并重启 Nagios
sudo vi /usr/local/nagios/etc/nagios.cfg
cfg_file=/usr/local/nagios/etc/objects/hosts.cfg
cfg_file=/usr/local/nagios/etc/objects/services1.cfg
cfg_file=/usr/local/nagios/etc/objects/services2.cfg
sudo /usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg # 验证配置
sudo systemctl restart nagios
11. 验证监控状态
- Web 界面:登录后查看 Hosts 和 Services 状态是否为绿色(OK)。
- 命令行验证:/usr/local/nagios/libexec/check_nrpe -H <被监控节点IP> -c check_load
在 CentOS 7 主监控节点上安装 check_nrpe
插件的详细步骤
check_nrpe
是 Nagios 主监控节点用来通过 NRPE 协议与被监控节点通信的核心插件。以下是安装和配置的完整流程:
1. 安装依赖
sudo yum update -y
sudo yum install -y gcc make openssl-devel automake autoconf wget
2. 下载并编译 NRPE 源码
2.1 下载源码包
cd /tmp
wget https://githubfast.com/NagiosEnterprises/nrpe/releases/download/nrpe-4.1.0/nrpe-4.1.0.tar.gz
tar xzf nrpe-4.1.0.tar.gz
cd nrpe-4.1.0
2.2 配置并编译插件
仅编译 check_nrpe
插件(无需安装完整的 NRPE 服务):
./configure --with-ssl=/usr/bin/openssl
make check_nrpe # 仅编译插件
3. 安装 check_nrpe
插件
3.1 手动复制插件到 Nagios 插件目录
sudo cp src/check_nrpe /usr/local/nagios/libexec/
3.2 设置权限
sudo chown nagios:nagios /usr/local/nagios/libexec/check_nrpe
sudo chmod 755 /usr/local/nagios/libexec/check_nrpe
4. 验证插件功能
测试插件是否能连接被监控节点:
/usr/local/nagios/libexec/check_nrpe -H <被监控节点IP> -c check_load
5. 配置 Nagios 使用 check_nrpe
5.1 定义命令
编辑 Nagios 命令配置文件:
sudo vi /usr/local/nagios/etc/objects/commands.cfg
添加以下内容:
define command {
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -t 30 -c $ARG1$
}
5.2 添加主机和服务定义
示例主机配置文件(如 hosts.cfg
):
define host {
host_name node1
alias Node1 Server
address 192.168.1.100
check_command check-host-alive
max_check_attempts 5
check_interval 5
check_period 24x7
contacts nagiosadmin
}
define service {
host_name node1
service_description CPU Load
check_command check_nrpe!check_load
check_interval 5
check_period 24x7
notification_interval 30
contacts nagiosadmin
}
5.3 验证并重启 Nagios
sudo /usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
sudo systemctl restart nagios
后台运行
sudo nohup /usr/local/nagios/bin/nagios -d /usr/local/nagios/etc/nagios.cfg >/dev/null 2>&1 &
杀死进程
# Send SIGTERM only to processes named exactly "nagios". Grepping the ps output
# for "nagios" would also match nrpe, editors opened on nagios.cfg, and so on.
sudo pkill -x nagios
~被监控节点
1. 系统准备
1.1 更新系统并安装依赖
sudo yum install -y epel-release
sudo yum install -y gcc make openssl-devel automake autoconf wget
2. 创建 Nagios 用户和组
# 创建 Nagios 用户和组(如果不存在)
sudo useradd -r -s /sbin/nologin nagios
sudo groupadd nagcmd
sudo usermod -a -G nagcmd nagios
3. 安装 Nagios 插件
3.1 下载并编译插件
cd /tmp
wget https://nagios-plugins.org/download/nagios-plugins-2.3.3.tar.gz
tar xzf nagios-plugins-2.3.3.tar.gz
cd nagios-plugins-2.3.3
# 配置、编译、安装
./configure --with-nagios-user=nagios --with-nagios-group=nagios
make
sudo make install
# 验证插件安装
ls /usr/local/nagios/libexec/check_* # 应显示插件列表
4. 安装和配置 NRPE
4.1 下载并编译 NRPE
cd /tmp
wget https://githubfast.com/NagiosEnterprises/nrpe/releases/download/nrpe-4.1.0/nrpe-4.1.0.tar.gz
tar xzf nrpe-4.1.0.tar.gz
cd nrpe-4.1.0
# 配置并编译
./configure \
--with-nagios-user=nagios \
--with-nagios-group=nagios \
--with-ssl=/usr/bin/openssl
make all
sudo make install
4.2 配置 NRPE 服务
编辑配置文件 /usr/local/nagios/etc/nrpe.cfg
:
sudo vi /usr/local/nagios/etc/nrpe.cfg
修改以下参数:
# 替换为实际IP
allowed_hosts=127.0.0.1,192.168.198.206
# 监听所有接口或者指定接口的IP地址
server_address=0.0.0.0
server_port=5666
command[check_disk]=/usr/local/nagios/libexec/check_disk -w 20% -c 10%
command[check_load]=/usr/local/nagios/libexec/check_load -w 15,10,5 -c 30,25,20
command[check_mem]=/usr/local/nagios/libexec/check_mem -w 20% -c 10%
4.3 创建 Systemd 服务文件
sudo vi /etc/systemd/system/nrpe.service
添加以下内容:
[Unit]
Description=NRPE (Nagios Remote Plugin Executor)
After=network.target
[Service]
Type=simple
User=nagios
Group=nagios
# -f keeps nrpe in the foreground, which is what Type=simple expects.
# With -d the process forks away and systemd treats the service as exited.
ExecStart=/usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -f
Restart=on-failure
[Install]
WantedBy=multi-user.target
4.4 启动 NRPE 服务
sudo systemctl daemon-reload
sudo systemctl start nrpe
sudo systemctl enable nrpe
sudo systemctl status nrpe # 检查状态应为 active (running)
5. 配置防火墙
开放 NRPE 默认端口(5666):
sudo firewall-cmd --permanent --add-port=5666/tcp
sudo firewall-cmd --reload
6. 调整 SELinux 策略(可选)
6.1 临时禁用 SELinux 测试
sudo setenforce 0
6.2 永久调整策略(推荐)
sudo yum install -y policycoreutils-python
sudo ausearch -c 'nrpe' --raw | audit2allow -M my-nrpe
sudo semodule -i my-nrpe.pp
7. 验证 NRPE 功能
7.1 在 Nagios 主服务器上测试
# 测试 NRPE 连通性
/usr/local/nagios/libexec/check_nrpe -H <被监控节点IP>
# 测试具体命令(如检查磁盘)
/usr/local/nagios/libexec/check_nrpe -H <被监控节点IP> -c check_disk
~在 CentOS 7 上使用 Grafana + InfluxDB 展示 Nagios 监控数据的详细步骤
以下是将 Nagios 监控数据存储到 InfluxDB,并通过 Grafana 可视化的完整流程:
1. 安装并配置 InfluxDB
1.1 安装 InfluxDB
# 添加 InfluxDB 仓库
cat <<EOF | sudo tee /etc/yum.repos.d/influxdb.repo
[influxdb]
name = InfluxDB Repository
baseurl = https://repos.influxdata.com/rhel/\$releasever/\$basearch/stable
enabled = 1
gpgcheck = 1
gpgkey = https://repos.influxdata.com/influxdb.key
EOF
# 安装并启动服务
sudo yum install influxdb
sudo systemctl start influxdb
sudo systemctl enable influxdb
1.2 创建数据库和用户
# 进入 InfluxDB CLI
influx
# 创建数据库和用户
CREATE DATABASE nagios;
CREATE USER nagios_user WITH PASSWORD 'your_password' WITH ALL PRIVILEGES;
GRANT ALL ON nagios TO nagios_user;
exit
2. 安装并配置 Grafana
2.1 安装 Grafana
# 添加 Grafana 仓库
sudo tee /etc/yum.repos.d/grafana.repo <<EOF
[grafana]
name=grafana
baseurl=https://rpm.grafana.com
repo_gpgcheck=1
enabled=1
gpgcheck=1
gpgkey=https://rpm.grafana.com/gpg.key
sslverify=1
sslcacert=/etc/pki/tls/certs/ca-bundle.crt
EOF
# 安装并启动服务
sudo yum install grafana
sudo systemctl start grafana-server
sudo systemctl enable grafana-server
2.2 访问 Grafana
- 打开浏览器访问:
http://<服务器IP>:3000
,默认账号:admin/admin
。
3. 配置 Nagios 将数据写入 InfluxDB
3.1 安装 NCPA(Nagios Cross-Platform Agent)
# 下载并安装 NCPA
wget https://assets.nagios.com/downloads/ncpa/ncpa-2.4.0-1.el7.x86_64.rpm
sudo rpm -i ncpa-*.rpm
# 配置 NCPA(允许主节点访问)
sudo nano /usr/local/ncpa/etc/ncpa.cfg
# 修改以下参数:
allowed_hosts = 127.0.0.1,<Nagios主节点IP>
community_string = your_community_string
# 重启服务
sudo systemctl restart ncpa_listener
3.2 配置 Nagios 输出到 InfluxDB
# 安装依赖工具(用于发送数据到 InfluxDB)
sudo yum install python3-pip
# 指定安装到系统目录
sudo pip3 install --target=/usr/lib64/python3.6/site-packages influxdb
~被监控节点
脚本内容:
# 创建数据发送脚本
vi /usr/local/nagios/libexec/check_cpu_custom.sh
# cat /usr/local/nagios/libexec/check_cpu_custom.sh
#!/bin/bash
# Nagios/NRPE plugin: samples overall CPU utilisation from /proc/stat over a
# one-second window, pushes the value to InfluxDB (best effort) and reports
# OK / WARNING / CRITICAL through the plugin exit codes 0 / 1 / 2.
#
# Only bash built-in integer arithmetic is used, so no external calculator
# such as bc has to be installed on the monitored node.

# Force the C locale so numbers are always formatted with a decimal point.
export LC_ALL=C

# InfluxDB connection parameters. Each may be overridden from the environment;
# the defaults are the values this check shipped with.
INFLUXDB_HOST="${INFLUXDB_HOST:-192.168.198.206}"   # InfluxDB server address
INFLUXDB_PORT="${INFLUXDB_PORT:-8086}"              # HTTP API port
INFLUXDB_DB="${INFLUXDB_DB:-nagios}"                # database name
INFLUXDB_USER="${INFLUXDB_USER:-nagios_user}"       # user name
INFLUXDB_PASS="${INFLUXDB_PASS:-admin@123}"         # password
HOST_NAME=$(hostname -s)                            # short host name used as tag

# Thresholds in hundredths of a percent (80.00 % and 90.00 %).
readonly WARN_X100=8000
readonly CRIT_X100=9000

#######################################
# Read the aggregate "cpu" line of /proc/stat.
# Outputs: "<total> <idle>" in jiffies on stdout, where idle includes iowait.
#######################################
read_cpu_sample() {
  local _label user nice system idle iowait irq softirq steal _rest
  read -r _label user nice system idle iowait irq softirq steal _rest < /proc/stat
  printf '%s %s\n' \
    "$(( user + nice + system + idle + iowait + irq + softirq + ${steal:-0} ))" \
    "$(( idle + iowait ))"
}

# Two samples one second apart; utilisation is derived from their difference.
read -r total1 idle1 < <(read_cpu_sample)
sleep 1
read -r total2 idle2 < <(read_cpu_sample)

total_diff=$(( total2 - total1 ))
idle_diff=$(( idle2 - idle1 ))

# Utilisation in hundredths of a percent, truncated to two decimals.
if (( total_diff <= 0 )); then
  usage_x100=0
else
  usage_x100=$(( 10000 * (total_diff - idle_diff) / total_diff ))
  if (( usage_x100 < 0 )); then
    usage_x100=0
  fi
fi
printf -v usage '%d.%02d' "$(( usage_x100 / 100 ))" "$(( usage_x100 % 100 ))"

# Map utilisation to a Nagios state.
if (( usage_x100 >= CRIT_X100 )); then
  status_code=2
  status_text="CRITICAL"
elif (( usage_x100 >= WARN_X100 )); then
  status_code=1
  status_text="WARNING"
else
  status_code=0
  status_text="OK"
fi

# Nanosecond timestamp for the InfluxDB line protocol.
timestamp=$(date +%s%N)

# Push the sample to InfluxDB. This is best effort: a failed write must not
# change the check result, so the exit status of curl is deliberately ignored.
# The response is discarded (-o /dev/null) because anything printed to stdout
# before the status line would be taken by Nagios as the plugin output.
# --max-time keeps an unreachable InfluxDB from stalling the check, and
# basic auth (-u) avoids URL-encoding problems with special characters in
# the password.
curl -sS -o /dev/null --max-time 5 -XPOST \
  -u "${INFLUXDB_USER}:${INFLUXDB_PASS}" \
  "http://${INFLUXDB_HOST}:${INFLUXDB_PORT}/write?db=${INFLUXDB_DB}" \
  --data-binary "cpu_usage,host=${HOST_NAME} value=${usage},status=${status_code} ${timestamp}"

# Nagios status line followed by performance data.
echo "${status_text} - CPU使用率: ${usage}% | usage=${usage}%;80;90"
exit "$status_code"
验证脚本
chown nagios:nagios /usr/local/nagios/libexec/check_cpu_custom.sh
chmod 755 /usr/local/nagios/libexec/check_cpu_custom.sh
# Run by absolute path; a leading "./" would resolve relative to the current directory.
/usr/local/nagios/libexec/check_cpu_custom.sh
编辑nrpe.cfg文件,进行追加内容
vi /usr/local/nagios/etc/nrpe.cfg
command[check_cpu_custom]=/usr/local/nagios/libexec/check_cpu_custom.sh
~ 监控节点
设置权限:
3.3 配置 Nagios 命令
# 编辑 commands.cfg
sudo vi /usr/local/nagios/etc/objects/commands.cfg
# 添加以下命令定义
define command {
command_name check_cpu_custom
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}
试验
/usr/local/nagios/libexec/check_nrpe -H 被监控节点IP地址 -c check_cpu_custom
3.4 修改服务模板
在服务定义中添加 submit_to_influxdb
命令:
define service {
use local-service
host_name node1
service_description CPU Usage
#check_command参数对应commands.cfg文件中参数,前面固定填写check_nrpe,后面的$HOSTADDRESS$参数command_line的参数用!隔开,默认被监控IP地址为host_name的node1 不进行填写 ,而node1在/usr/local/nagios/etc/objects/hosts.cfg文件有定义(PS:同时/etc/hosts文件也要对应IP地址和域名,注意统一性),$ARG1$填写command_name的参数
check_command check_nrpe[!被监控IP地址]!check_cpu_custom
contacts nagiosadmin
#event_handler参数填写commands.cfg文件中的command_name
event_handler check_cpu_custom
#event_handler_enabled为开启event_handler
event_handler_enabled 1
}
4. 配置 Grafana 数据源和仪表盘
4.1 添加 InfluxDB 数据源
-
登录 Grafana,点击 Configuration > Data Sources > Add data source。
-
选择
InfluxDB
,填写以下信息:
-
URL :
http://localhost:8086
-
Database :
nagios
-
User :
nagios_user
-
Password :
your_password
-
4.2 导入 Nagios 仪表盘模板
-
访问 Grafana 仪表盘市场,搜索 "Nagios"。
-
选择模板(例如 ID: 14541),下载 JSON 文件。
-
在 Grafana 中点击 Create > Import,上传 JSON 文件。
5. 验证数据流
5.1 检查 InfluxDB 数据
influx -database nagios -execute "SHOW MEASUREMENTS"
5.2 手动触发 Nagios 检查
/usr/local/nagios/libexec/check_nrpe -H node1 -c check_cpu_custom
6. 防火墙配置
# 开放 InfluxDB 和 Grafana 端口
sudo firewall-cmd --permanent --add-port=8086/tcp # InfluxDB
sudo firewall-cmd --permanent --add-port=3000/tcp # Grafana
sudo firewall-cmd --reload