172.16.0.100 master
172.16.0.101 node01
172.16.0.102 node02
1. 开机卡顿
bash
# 停止+禁用+屏蔽,彻底不启动
sudo systemctl disable --now NetworkManager-wait-online.service
sudo systemctl mask NetworkManager-wait-online.service
sudo systemctl disable --now systemd-networkd-wait-online.service
sudo systemctl mask systemd-networkd-wait-online.service
# 重载systemd配置
sudo systemctl daemon-reload
# Netplan 网卡设置(防止单网卡 DHCP 卡住等待)
sudo vim /etc/netplan/*.yaml
# Netplan config for ens33. `optional: true` is the line being added here.
network:
  ethernets:
    ens33:
      dhcp4: true
      optional: true  # newly added line
  version: 2
sudo netplan apply
# chrony aliyun.com
sudo vim /etc/chrony/sources.d/aliyun.sources
# 阿里云公共 NTP 服务
server ntp.aliyun.com iburst
server ntp1.aliyun.com iburst
server ntp2.aliyun.com iburst
server ntp3.aliyun.com iburst
server ntp4.aliyun.com iburst
server ntp5.aliyun.com iburst
server ntp6.aliyun.com iburst
server ntp7.aliyun.com iburst
systemctl restart chronyd
2. apt 源配置
bash
/etc/apt/sources.list.d/ubuntu.sources
# Release, updates and backports pockets from the Aliyun mirror.
Types: deb
URIs: https://mirrors.aliyun.com/ubuntu
Suites: resolute resolute-updates resolute-backports
Components: main universe restricted multiverse
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

# Security pocket. deb822 stanzas must be separated by a blank line,
# otherwise both entries are read as one stanza with duplicate fields.
Types: deb
URIs: https://mirrors.aliyun.com/ubuntu
Suites: resolute-security
Components: main universe restricted multiverse
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg
3. 添加域名
bash
echo "172.16.0.100 master" >> /etc/hosts
echo "172.16.0.101 node01" >> /etc/hosts
echo "172.16.0.102 node02" >> /etc/hosts
4. 关闭防火墙与 Swap
bash
sudo systemctl stop ufw
sudo systemctl disable ufw
sudo systemctl mask ufw
sudo iptables -F
sudo iptables -X
sudo iptables -t nat -F
sudo iptables -t nat -X
sudo iptables -P INPUT ACCEPT
sudo iptables -P FORWARD ACCEPT
sudo iptables -P OUTPUT ACCEPT
# Turn off all active swap for the running system.
sudo swapoff -a
# Persist it: comment out fstab entries whose filesystem type is swap.
# The pattern skips lines that are already commented, so re-running this
# does not stack extra '#', and it requires "swap" as a whitespace-delimited
# field so unrelated lines that merely contain the word are left alone.
sudo sed -E -i '/^[[:space:]]*[^#[:space:]].*[[:space:]]swap[[:space:]]/s/^/#/' /etc/fstab
# Confirm the Swap row now reports zero.
free -h
5. 加载内核模块并开启网络转发
bash
vim k8s-network.sh
#!/bin/bash
# Persist and immediately activate the kernel modules and sysctl switches
# listed below, then print them back for verification.

kernel_modules=(overlay br_netfilter)
forward_keys=(
  net.ipv4.ip_forward
  net.bridge.bridge-nf-call-iptables
  net.bridge.bridge-nf-call-ip6tables
)

# 1. Modules to load at boot: one module name per line.
printf '%s\n' "${kernel_modules[@]}" > /etc/modules-load.d/k8s-network.conf

# 2. Persistent sysctl settings: every key is set to 1.
printf '%s = 1\n' "${forward_keys[@]}" > /etc/sysctl.d/99-k8s-forward.conf

# 3. Load the modules now, without waiting for a reboot.
for module in "${kernel_modules[@]}"; do
  modprobe "${module}"
done

# 4. Re-read every sysctl configuration file so the new values take effect.
sysctl --system

# 5. Print the resulting values and the loaded modules.
echo "===== 验证参数 ====="
sysctl "${forward_keys[@]}"
echo "===== 验证内核模块 ====="
lsmod | grep -E "overlay|br_netfilter"
# 重启检验
# 重启后执行
sysctl net.ipv4.ip_forward
sysctl net.bridge.bridge-nf-call-iptables
sysctl net.bridge.bridge-nf-call-ip6tables
lsmod | grep overlay
lsmod | grep br_netfilter
6. 安装 containerd
bash
sudo apt install -y containerd
vi fix-containerd2.sh
#!/usr/bin/env bash
# Regenerate containerd's default config, switch it to the systemd cgroup
# driver, and point the sandbox (pause) image at the Aliyun mirror.
# Writes under /etc/containerd and restarts the service, so run it as root.
set -euo pipefail

readonly CONFIG_FILE=/etc/containerd/config.toml
# Sandbox image. The default tag follows the pause version that
# `kubeadm config images list` reports for this release (3.10.2); set
# PAUSE_IMAGE in the environment if your kubeadm lists a different tag.
readonly PAUSE_IMAGE="${PAUSE_IMAGE:-registry.aliyuncs.com/google_containers/pause:3.10.2}"

mkdir -p /etc/containerd

# Start from a pristine default configuration.
containerd config default > "${CONFIG_FILE}"

# Enable the systemd cgroup driver.
sed -i 's/SystemdCgroup = false/SystemdCgroup = true/' "${CONFIG_FILE}"

# Replace the pinned sandbox image with the mirrored pause image.
sed -i "s|sandbox = .*|sandbox = '${PAUSE_IMAGE}'|" "${CONFIG_FILE}"

# sed exits 0 even when nothing matched, so confirm both edits landed
# before restarting the service with an unchanged config.
if ! grep -q 'SystemdCgroup = true' "${CONFIG_FILE}"; then
  echo "SystemdCgroup was not enabled in ${CONFIG_FILE}" >&2
  exit 1
fi
if ! grep -qF "sandbox = '${PAUSE_IMAGE}'" "${CONFIG_FILE}"; then
  echo "sandbox image was not replaced in ${CONFIG_FILE}" >&2
  exit 1
fi

# Reload unit files and restart containerd with the new config.
systemctl daemon-reload
systemctl restart containerd
7. 安装 K8s 组件
bash
mkdir -p /etc/apt/keyrings
curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.36/deb/Release.key | \
gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.36/deb/ /" |\
tee /etc/apt/sources.list.d/kubernetes.list
apt update
apt install -y kubelet kubeadm kubectl
apt-mark hold kubelet kubeadm kubectl
kubeadm version
kubectl version --client
kubelet --version
# Cleanup for a re-install only: this wipes any existing kubeadm state on the
# host. Skip it on a machine that has never run kubeadm init/join.
kubeadm reset -f
rm -rf /etc/kubernetes /var/lib/kubelet /var/lib/etcd
# ${HOME:?} aborts instead of expanding to "/.kube" when HOME is unset or empty.
rm -rf -- "${HOME:?}/.kube"
8. 初始化 master 节点
bash
# 查看镜像
# 检查 containerd 当前配置的 pause(sandbox)镜像:grep sandbox /etc/containerd/config.toml(此前配置为 pause:3.10.1,其版本应与下面 kubeadm 列出的 pause 版本保持一致)
root@ubuntu26:~# kubeadm config images list
registry.k8s.io/kube-apiserver:v1.36.2
registry.k8s.io/kube-controller-manager:v1.36.2
registry.k8s.io/kube-scheduler:v1.36.2
registry.k8s.io/kube-proxy:v1.36.2
registry.k8s.io/coredns/coredns:v1.14.2
registry.k8s.io/pause:3.10.2
registry.k8s.io/etcd:3.6.8-0
# 提前下载并更换 tag 脚本
#!/bin/bash
# Pre-pull the kubeadm control-plane images from the Huawei Cloud SWR mirror
# and retag them as registry.k8s.io/... in containerd's k8s.io namespace.
# Stops at the first failing step instead of reporting success.
set -euo pipefail

K8S_VERSION="v1.36.2"
COREDNS_VERSION="v1.14.2"
ETCD_VERSION="3.6.8-0"
PAUSE_VERSION="3.10.2"
REGISTRY_PREFIX="swr.cn-north-4.myhuaweicloud.com/ddn-k8s/registry.k8s.io"

# Image paths relative to registry.k8s.io, matching `kubeadm config images list`.
images=(
  "kube-apiserver:${K8S_VERSION}"
  "kube-controller-manager:${K8S_VERSION}"
  "kube-scheduler:${K8S_VERSION}"
  "kube-proxy:${K8S_VERSION}"
  "coredns/coredns:${COREDNS_VERSION}"
  "etcd:${ETCD_VERSION}"
  "pause:${PAUSE_VERSION}"
)

for img in "${images[@]}"; do
  source_img="${REGISTRY_PREFIX}/${img}"
  target_img="registry.k8s.io/${img}"

  echo "正在拉取(命名空间 k8s.io): ${source_img}"
  ctr -n k8s.io images pull "${source_img}" \
    || { echo "拉取失败: ${source_img}" >&2; exit 1; }

  echo "正在打标签: ${target_img}"
  # --force overwrites an existing target reference, so a re-run does not
  # fail on a tag created by a previous run.
  ctr -n k8s.io images tag --force "${source_img}" "${target_img}"

  # Remove the mirror-prefixed reference; the registry.k8s.io reference stays.
  # Comment this line out to keep both references.
  ctr -n k8s.io images remove "${source_img}"
done

echo "所有镜像准备完毕!"
apt update
apt install -y cri-tools
cat > /etc/crictl.yaml <<EOF
runtime-endpoint: unix:///run/containerd/containerd.sock
image-endpoint: unix:///run/containerd/containerd.sock
timeout: 10
debug: false
EOF
crictl --version
crictl info
# 查看所有容器,看 apiserver/etcd 是否退出
crictl ps -a
# Show etcd logs. --name filters on the container name and -q prints IDs only.
# `ps -a` can list several etcd containers after restarts, so keep just the
# first ID (presumably the newest - verify with `crictl ps -a`) and quote it,
# because `crictl logs` takes a single container.
ETCD_ID=$(crictl ps -a --name etcd -q | head -n 1)
crictl logs "$ETCD_ID"
# Show kube-apiserver logs, selected the same way.
APISERVER_ID=$(crictl ps -a --name kube-apiserver -q | head -n 1)
crictl logs "$APISERVER_ID"
# 列出 k8s 命名空间所有容器
ctr -n k8s.io c list
# 查看镜像
ctr -n k8s.io images list
bash
# kubeadm 配置文件修改默认仓库
kubeadm config print init-defaults > kubeadm-config.yaml
vim kubeadm-config.yaml
imageRepository: registry.aliyuncs.com/google_containers
advertiseAddress: 172.16.0.100
kubeadm config images pull --config kubeadm-config.yaml
# Alternative to the --config form: pull with explicit flags.
# The version matches the release that `kubeadm config images list` reports
# in this guide (v1.36.2), so the same images are pulled that init will use.
kubeadm config images pull \
  --kubernetes-version=v1.36.2 \
  --image-repository registry.aliyuncs.com/google_containers
kubeadm init --config kubeadm-config.yaml
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
kubeadm join 172.16.0.100:6443 --token abcdef.0123456789abcdef \
--discovery-token-ca-cert-hash sha256:2daec271f14cf5b143ba8ab7ece30c0b21a874942a1bf809b643f934cdd1c433
# Teardown only: run this when `kubeadm init` failed or the control plane must
# be rebuilt. On a healthy master it destroys the cluster that was just created.
kubeadm reset -f
rm -rf /etc/kubernetes /var/lib/kubelet /var/lib/etcd
# ${HOME:?} aborts instead of expanding to "/.kube" when HOME is unset or empty.
rm -rf -- "${HOME:?}/.kube"
9. Node 节点加入
bash
kubeadm join 172.16.0.100:6443 --token abcdef.0123456789abcdef \
--discovery-token-ca-cert-hash sha256:7538b5a6f63256963831309c85ef60ec1a4fa7855144cbaa9a1ca2ff0286caa1
# Optional: make kubectl usable on this worker node.
# /etc/kubernetes/admin.conf is written by `kubeadm init` on the control plane
# and is not present on a node that only ran `kubeadm join`, so fetch it from
# master. Assumes root SSH access to master - adjust the user if needed.
mkdir -p "$HOME/.kube"
scp root@master:/etc/kubernetes/admin.conf "$HOME/.kube/config"
# The file holds cluster-admin credentials; keep it private to this user.
chmod 600 "$HOME/.kube/config"
10. 安装calico网络插件
bash
wget https://raw.githubusercontent.com/projectcalico/calico/v3.30.3/manifests/calico.yaml
wget https://fastgit.org/projectcalico/calico/raw/v3.30.3/manifests/calico.yaml
wget https://cdn.jsdelivr.net/gh/projectcalico/calico@v3.30.3/manifests/calico.yaml
bash
grep "image:" calico.yaml
image: docker.io/calico/cni:v3.30.3
image: docker.io/calico/node:v3.30.3
image: docker.io/calico/kube-controllers:v3.30.3
vim pull_calico_images.sh
#!/bin/bash
# Fetch the Calico images from the Huawei Cloud SWR mirror into containerd's
# k8s.io namespace and expose them under their docker.io/calico names.

# Calico release to fetch (adjust as needed).
CALICO_VERSION="v3.30.3"
# Mirror location that stands in for docker.io/calico.
REGISTRY_PREFIX="swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/calico"

# Pull one component from the mirror, retag it as docker.io/calico/<name>,
# then drop the mirror-prefixed reference.
# Arguments: $1 - component name without a version.
# Exits the script with status 1 if the pull fails.
mirror_component() {
  local component=$1
  local mirror_ref="${REGISTRY_PREFIX}/${component}:${CALICO_VERSION}"
  local upstream_ref="docker.io/calico/${component}:${CALICO_VERSION}"

  echo ">>> 正在从华为云拉取源镜像(命名空间 k8s.io): ${mirror_ref}"
  if ! ctr -n k8s.io images pull "${mirror_ref}"; then
    echo "拉取失败,请检查镜像是否存在或网络连接"
    exit 1
  fi

  echo ">>> 打标签为(命名空间 k8s.io): ${upstream_ref}"
  ctr -n k8s.io images tag "${mirror_ref}" "${upstream_ref}"

  echo ">>> 删除源镜像(华为云前缀): ${mirror_ref}"
  ctr -n k8s.io images remove "${mirror_ref}"

  echo ">>> 完成 ${component}"
  echo
}

# Component names, without version.
for component in cni node kube-controllers; do
  mirror_component "${component}"
done

echo "所有 Calico 镜像已准备在 k8s.io 命名空间!"
echo "当前 k8s.io 命名空间中的 Calico 镜像列表:"
ctr -n k8s.io images list | grep calico
# The master is initialised with pod CIDR 10.244.0.0/16
# (kubeadm init --pod-network-cidr=10.244.0.0/16).
# CALICO_IPV4POOL_CIDR must be set to the same range.
- name: CALICO_IPV4POOL_CIDR
  value: "10.244.0.0/16"
# Switch to BGP mode.
# Enable IPIP
- name: CALICO_IPV4POOL_IPIP
  value: "Always"  # change to "Off"
kubectl apply -f calico.yaml
kubectl get pod -A
# Force-delete a pod in kube-system without waiting for graceful termination.
# NOTE(review): the second `pod` is presumably a placeholder for the real pod
# name taken from `kubectl get pod -A` - replace it before running.
kubectl delete pod -n kube-system pod --force --grace-period=0
11. nginx
bash
# 创建 nginx 部署
kubectl create deployment nginx --image=docker.m.daocloud.io/library/nginx:alpine
# 暴露端口,外部可访问(NodePort)
kubectl expose deployment nginx --port=80 --type=NodePort
nginx-deploy.yaml
# Single-replica nginx Deployment plus a NodePort Service exposing it on 30080.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: docker.m.daocloud.io/library/nginx:alpine
        ports:
        - containerPort: 80
---
apiVersion: v1
kind: Service
metadata:
  name: nginx-svc
spec:
  type: NodePort
  # Selects the pods created by the Deployment above.
  selector:
    app: nginx
  ports:
  - port: 80
    targetPort: 80
    nodePort: 30080
12. Ceph
bash
lsblk -f
# 清除硬盘数据
# 请将 /dev/sdX 替换为你的实际设备名,例如 /dev/sdb
sudo wipefs -a /dev/sdX
# 部署Rook Operator
# 克隆Rook仓库
git clone --single-branch --branch v1.15.1 https://gitee.com/mirrors/ROOK.git
cd ROOK/deploy/examples
# 一键拉取脚本
vim pull_ceph_images.sh
#!/bin/bash
# Pre-pull script for the Rook v1.15.1 + Ceph v18.2.4 images.
# Pulls each image from the Huawei Cloud SWR mirror into containerd's k8s.io
# namespace and retags it with the reference the manifests ask for.
set -e

# Associative array: key = mirror (source) reference, value = target reference.
# When key and value are identical, the image is pulled but not retagged.
declare -A images=(
  ["swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/rook/ceph:v1.15.1"]="docker.io/rook/ceph:v1.15.1"
  # The CephCluster spec in this guide references quay.io/ceph/ceph:v18.2.4,
  # so the retag must use the quay.io name or the pre-pulled image is not used.
  ["swr.cn-north-4.myhuaweicloud.com/ddn-k8s/quay.io/ceph/ceph:v18.2.4"]="quay.io/ceph/ceph:v18.2.4"
  # If quay.io is reachable, the official image can be pulled directly instead:
  # ["quay.io/ceph/ceph:v18.2.4"]="quay.io/ceph/ceph:v18.2.4"
)

# Pull with ctr (containerd) and retag.
for src in "${!images[@]}"; do
  target="${images[$src]}"

  echo ">>> 正在拉取源镜像: ${src}"
  ctr -n k8s.io images pull "${src}" || { echo "拉取 ${src} 失败,请检查网络"; exit 1; }

  # Retag only when the source and target references differ.
  if [ "${src}" != "${target}" ]; then
    echo ">>> 打标签为: ${target}"
    # --force overwrites a target reference left by a previous run; without
    # it a re-run fails here and set -e aborts the script.
    ctr -n k8s.io images tag --force "${src}" "${target}"

    # (Optional) remove the source reference to save space.
    # echo ">>> 删除源镜像: ${src}"
    # ctr -n k8s.io images remove "${src}"
  fi

  echo ">>> 完成 ${target}"
  echo
done

echo "所有 Ceph 镜像已准备完毕!"
echo "当前 k8s.io 命名空间中的相关镜像列表:"
ctr -n k8s.io images list | grep -E "rook/ceph|ceph/ceph"
# 部署
kubectl create -f crds.yaml -f common.yaml -f operator.yaml
kubectl -n rook-ceph get pod
# 创建 Ceph 集群
# 获取并修改集群配置文件
# The shell is already inside ROOK/deploy/examples (from the earlier cd), so
# cluster.yaml is in the current directory and the ROOK/deploy/examples/ prefix
# would not resolve. Keep a pristine copy, then edit the file in place.
cp cluster.yaml cluster.yaml.orig
vim cluster.yaml
# 修改 cephClusterSpec 中的版本为 v18.2.4
# CephCluster definition for the rook-ceph namespace.
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: rook-ceph
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v18.2.4  # Ceph release to run
  dataDirHostPath: /var/lib/rook      # host directory that holds the Ceph configuration
  mon:
    count: 3                          # three monitors for high availability
    allowMultiplePerNode: false
  mgr:
    count: 2                          # two managers
  dashboard:
    enabled: true                     # enable the Ceph dashboard
  storage:
    useAllNodes: true                 # use devices on every node
    useAllDevices: false              # must stay false so the system disk is never consumed
    # Key setting: select disks precisely with deviceFilter.
    deviceFilter: "^sd[bc]"           # regex: device names starting with 'sd' followed by 'b' or 'c'
    config:
      osdsPerDevice: "1"              # one OSD per disk
  # Resource limits (optional, but recommended).
  resources:
    mon:
      limits:
        memory: "2Gi"
      requests:
        memory: "1Gi"
        cpu: "500m"
    osd:
      limits:
        memory: "4Gi"
      requests:
        memory: "2Gi"
        cpu: "500m"
  placement:
    all:  # applies to all Ceph components (mon, mgr, osd, ...)
      tolerations:
      - effect: NoSchedule
        key: node-role.kubernetes.io/control-plane
      - effect: NoSchedule
        key: node-role.kubernetes.io/master
# Apply the edited cluster.yaml to create the CephCluster, then check that the
# mon/mgr/osd pods come up before continuing.
kubectl create -f cluster.yaml
kubectl -n rook-ceph get pod
# Start the toolbox (if it is not already running).
kubectl create -f toolbox.yaml
# 进入工具箱 Pod
kubectl -n rook-ceph exec -it deploy/rook-ceph-tools -- bash
# 在工具箱内执行ceph命令
ceph status
ceph osd status
ceph df
# 创建存储类(StorageClass)以供应用使用
# 创建块存储(RBD)池和StorageClass:
kubectl create -f csi/rbd/storageclass.yaml
kubectl get sc
# dashboard
kubectl edit cephcluster rook-ceph -n rook-ceph
# Fragment to set while editing the CephCluster resource.
spec:
  dashboard:
    enabled: true  # turn the dashboard on
    ssl: true      # HTTPS is the default; recommended to keep enabled
    port: 8443     # default port
kubectl get svc -n rook-ceph | grep dashboard
dashboard-nodeport.yaml
# NodePort Service that exposes the dashboard port 8443 on node port 30443.
apiVersion: v1
kind: Service
metadata:
  name: rook-ceph-mgr-dashboard-np
  namespace: rook-ceph
spec:
  type: NodePort
  selector:
    app: rook-ceph-mgr
    rook_cluster: rook-ceph
    mgr_role: active
  ports:
  - port: 8443
    targetPort: 8443
    nodePort: 30443  # custom port (30000-32767)
# 查看密码
kubectl -n rook-ceph get secret rook-ceph-dashboard-password \
-o jsonpath='{.data.password}' | base64 --decode; echo
# 进入工具箱
kubectl -n rook-ceph exec -it deploy/rook-ceph-tools -- bash
# Change the dashboard admin password (run inside the toolbox).
# Replace 你的新密码 with your own password. It is handed to ceph through a
# temporary file (-i) instead of an argument, so it does not show up in the
# process list.
# NOTE(review): this swaps the legacy `set-login-credentials` call for
# `ac-user-set-password` - confirm the subcommand on your Ceph release.
pw_file=$(mktemp)
printf '%s' '你的新密码' > "$pw_file"
ceph dashboard ac-user-set-password admin -i "$pw_file"
rm -f -- "$pw_file"