#作者:程宏斌
文章目录
- 首先nomad组件配置meta、host_volume
- 启动nomad并查询meta
- loki配置
- [这是 Loki 的 Nomad Job 定义](#这是-loki-的-nomad-job-定义)
首先nomad组件配置meta、host_volume
# File: /etc/nomad.d/nomad.hcl
# Nomad agent configuration: this node runs as both server and client.
datacenter = "dc1"
data_dir   = "/opt/nomad/data"

# Server mode: expects 3 servers before bootstrapping and keeps retrying
# to join the listed peers.
server {
  enabled          = true
  bootstrap_expect = 3

  server_join {
    retry_join = ["192.168.123.70", "192.168.123.71", "192.168.123.72"]
  }
}

# Client mode: allows this machine to run tasks.
client {
  enabled = true
  servers = ["192.168.123.70:4647", "192.168.123.71:4647", "192.168.123.72:4647"]

  # Host directories exposed to jobs as named host volumes.
  host_volume "loki-config" {
    path      = "/shaxiang/loki/config"
    read_only = false
  }

  host_volume "loki-data" {
    path      = "/shaxiang/loki/data"
    read_only = false
  }

  # node_class = "worker"

  # meta must be declared here, nested inside the client block.
  # Jobs can target this node through ${meta.job_constraint}.
  meta {
    job_constraint = "shard-1"
  }
}

# Bind addresses
addresses {
  http = "0.0.0.0"
  rpc  = "0.0.0.0"
  serf = "0.0.0.0"
}

ports {
  http = 4646
  rpc  = 4647
  serf = 4648
}

# Local Consul agent used for service registration.
consul {
  address        = "127.0.0.1:8500"
  auto_advertise = true
}
启动nomad并查询meta
/etc/systemd/system/nomad.service
[Unit]
Description=Nomad
# Start only once the network is actually up (network.target alone does not
# guarantee configured interfaces), so retry_join and Consul are reachable.
Wants=network-online.target
After=network-online.target

[Service]
Type=simple
# Runs as root, as in the original unit.
User=root
ExecStart=/usr/local/bin/nomad agent -config=/etc/nomad.d
# SIGHUP asks the agent to reload its configuration.
ExecReload=/bin/kill -HUP $MAINPID
# Signal only the main agent process on stop/restart; the systemd default
# (control-group) would also kill the helper processes of running tasks.
KillMode=process
# SIGINT triggers the agent's graceful shutdown path.
KillSignal=SIGINT
# Raise limits so the agent and its tasks are not capped by systemd defaults.
LimitNOFILE=65536
TasksMax=infinity
Restart=on-failure
RestartSec=2

[Install]
WantedBy=multi-user.target
# Pick up the new/changed unit file.
systemctl daemon-reload
# Enable at boot and start immediately (same as `enable` followed by `start`).
systemctl enable --now nomad
# Confirm the agent is running.
systemctl status nomad
查询命令
nomad node status [options] [<node-id>]
不带参数:列出所有节点
带 <node-id>:查看指定节点的详细信息
nomad node status -json [<node-id>] | jq '.Meta.job_constraint'
若能输出所配置的 meta 值(本例为 "shard-1"),即视为正常
nomad node status -verbose [<node-id>] | grep loki #查看 host_volume 是否已声明,正常输出如下
loki-config false /shaxiang/loki/config
loki-data false /shaxiang/loki/data
loki配置
# Loki single-node configuration (filesystem storage, in-memory ring).
# Data paths live under /loki, which is where the Nomad job mounts the
# "loki-data" host volume, so chunks and index survive container restarts.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096
  log_level: info

common:
  path_prefix: /loki
  replication_factor: 1
  # NOTE(review): nothing in this config listens on 9095 (HTTP is 3100,
  # gRPC is 9096) - confirm this address against the Loki config reference.
  compactor_address: 127.0.0.1:9095
  ring:
    kvstore:
      store: inmemory

schema_config:
  configs:
    - from: 2020-10-24
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

storage_config:
  filesystem:
    directory: /loki/chunks

compactor:
  working_directory: /loki/compactor

limits_config:
  reject_old_samples: true
  reject_old_samples_max_age: 168h

chunk_store_config: {}

table_manager:
  retention_deletes_enabled: false
  retention_period: 0s

ruler:
  enable_api: false
配置loki.hcl指定node部署
这是 Loki 的 Nomad Job 定义
# Nomad job that runs a single Loki instance in Docker on nodes whose client
# meta "job_constraint" equals "shard-1".
job "loki" {
  datacenters = ["dc1"]   # datacenter(s) the job may be placed in
  type        = "service" # long-running service
  priority    = 75        # scheduling priority

  group "loki-service" {
    # Restart policy: at most 1 restart per 5s interval, waiting 5s before
    # each restart. With mode = "delay" Nomad does NOT give up after the
    # attempt is used; it waits for the next interval and restarts again.
    # NOTE(review): if the intent is to stop restarting after one failure
    # (avoid a crash loop), mode = "fail" is the setting that does that.
    restart {
      interval = "5s"
      attempts = 1
      delay    = "5s"
      mode     = "delay"
    }

    # Host volumes; each source must match a host_volume declared in the
    # Nomad client configuration.
    volume "loki-config" {
      type      = "host"
      source    = "loki-config"
      read_only = false
    }

    volume "loki-data" {
      type      = "host"
      source    = "loki-data"
      read_only = false
    }

    # Only place this group on nodes with meta.job_constraint = "shard-1".
    constraint {
      attribute = "${meta.job_constraint}"
      value     = "shard-1"
    }

    # The task uses Docker host networking, so no port mapping takes place
    # and Loki listens directly on host port 3100. Reserve that port
    # statically so the "loki" label resolves to the port actually in use.
    network {
      port "loki" {
        static = 3100
      }
    }

    # Service registration (Consul, when enabled on the agent).
    service {
      name = "loki"
      port = "loki"

      check {
        type     = "http"
        path     = "/ready" # Loki readiness endpoint
        interval = "20s"
        timeout  = "10s"
        port     = "loki" # port label; resolves to 3100
      }
    }

    task "loki" {
      driver = "docker"

      config {
        network_mode = "host" # share the host's network namespace
        # NOTE(review): "loki:2.9.8" has no registry/namespace; confirm this
        # tag exists locally or in your registry.
        image = "loki:2.9.8"
        args  = ["-config.file", "/etc/loki/local-config.yaml"] # Loki config file
      }

      resources {
        cpu    = 500
        memory = 512
        # NOTE(review): memory_max presumably only takes effect when memory
        # oversubscription is enabled in the scheduler configuration - confirm.
        memory_max = 768
      }

      # Loki configuration directory
      volume_mount {
        volume      = "loki-config"
        destination = "/etc/loki"
      }

      # Loki data directory
      volume_mount {
        volume      = "loki-data"
        destination = "/loki"
      }
    }
  }
}
nomad job validate
验证Job文件语法和逻辑
作用
检查 .nomad 文件是否符合 HCL 语法
验证资源请求、网络、卷、约束等配置是否合法
会优先连接 Nomad agent 进行校验,但只做检查,不会注册或运行 Job(安全);部分版本在无法连接 agent 时会退回到本地校验
基本用法
nomad job validate loki.hcl
成功输出示例
Job validation successful.
nomad job run(运行前需为 host_volume 对应的宿主机目录 /shaxiang/loki/config、/shaxiang/loki/data 授予容器用户读写权限)
提交并运行Job
作用
将 Job 提交到 Nomad 集群
触发调度器分配任务到 Client 节点
如果 Job 已存在,则执行 滚动更新(默认策略)
基本用法
nomad job run loki.hcl
nomad job status [<job-name>]
用来查看 Nomad 中指定 Job 的整体状态。
输出信息包括:
Job ID、Name、Type、Datacenter 等基本信息
Task Group 的数量和状态(Queued、Starting、Running、Failed 等)
Placement Failure(如果调度失败,会显示具体原因)
最新 Deployment 状态
用途:快速判断整个 Job 是否正常运行或有调度问题。
nomad alloc status [<Allocation-id>]
用来查看某个具体Allocation的详细状态。
Allocation 是 Job 中实际运行的任务实例,每个 Task Group 的每个副本都会生成一个 allocation。
输出信息包括:
Allocation ID、对应 Job ID、Node ID
Client Status(pending、running、lost 等)
资源分配情况(CPU、Memory)
Task 内部状态(task name、driver、port 映射、volume mount)
Allocation Addresses(实际宿主机 IP + 端口)