Prepare two VMs for the experiment
shell
172.16.157.129 // master
172.16.157.130 // node
1. Disable SELinux
shell
vim /etc/selinux/config
shell
SELINUX=disabled
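SELinux only reads the config file at boot; to turn enforcement off for the current session too and confirm the state, a quick sketch using the standard SELinux tools:
shell
# switch to permissive mode immediately (lasts until the next reboot)
sudo setenforce 0
# verify the runtime mode and the configured mode
getenforce
sestatus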
2. Disable firewalld
shell
# stop firewalld now and keep it disabled across reboots
systemctl disable --now firewalld.service
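A quick check that the firewall is really stopped and will stay off after a reboot:
shell
systemctl is-active firewalld   # expect: inactive
systemctl is-enabled firewalld  # expect: disabled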
3. Edit hosts
shell
tee -a /etc/hosts <<EOF
172.16.157.129 k8s-master-01
172.16.157.130 k8s-worker-01
EOF
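To confirm the new entries resolve on both machines, a simple lookup (getent reads /etc/hosts):
shell
getent hosts k8s-master-01 k8s-worker-01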
4. Disable swap
shell
vim /etc/fstab
shell
# /dev/mapper/rl_172-swap none swap defaults 0 0
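Commenting out the fstab entry only takes effect after a reboot; to turn swap off right away and verify, you can run:
shell
# disable all swap devices immediately
sudo swapoff -a
# verify: the Swap line should read 0B and swapon should print nothing
free -h
swapon --show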
5. Load kernel modules
shell
# 1. Load the required kernel modules
cat <<EOF | sudo tee /etc/modules-load.d/k8s.conf
overlay
br_netfilter
EOF
# Load the modules immediately
sudo modprobe overlay
sudo modprobe br_netfilter
# 2. Configure bridged traffic and IP forwarding
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
net.ipv4.ip_forward = 1
EOF
# 3. Apply the settings
sudo sysctl --system
Verify all settings
shell
# Check the modules
lsmod | grep -E "(overlay|br_netfilter)"
# Check the network parameters
sysctl net.bridge.bridge-nf-call-iptables net.bridge.bridge-nf-call-ip6tables net.ipv4.ip_forward
shell
reboot
6. Docker CE repository
(/etc/yum.repos.d/docker-ce.repo)
shell
sudo tee /etc/yum.repos.d/docker-ce.repo <<EOF
[docker-ce-stable]
name=Docker CE Stable
baseurl=https://mirrors.aliyun.com/docker-ce/linux/centos/$(rpm -E %rhel)/x86_64/stable
enabled=1
gpgcheck=1
gpgkey=https://mirrors.aliyun.com/docker-ce/linux/centos/gpg
EOF
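Before installing anything, it's worth refreshing the metadata and confirming the new repo resolves (a quick sanity check):
shell
sudo yum makecache
sudo yum repolist | grep docker-ce
yum list containerd.io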
shell
# Install containerd.io
sudo yum install -y containerd.io
# Generate the default containerd config
sudo containerd config default | sudo tee /etc/containerd/config.toml
# Switch to the systemd cgroup driver (important!)
sudo sed -i 's/SystemdCgroup = false/SystemdCgroup = true/g' /etc/containerd/config.toml
# Point the sandbox (pause) image at a domestic mirror (faster downloads)
sudo sed -i 's|registry.k8s.io/pause|registry.aliyuncs.com/google_containers/pause|g' /etc/containerd/config.toml
# Restart containerd
systemctl restart containerd
systemctl status containerd
systemctl enable containerd
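A quick way to confirm the two sed edits above actually landed in the config:
shell
grep -n 'SystemdCgroup' /etc/containerd/config.toml   # expect: SystemdCgroup = true
grep -n 'sandbox' /etc/containerd/config.toml         # expect: the aliyuncs pause image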
Since Docker is not installed, create (or replace) a crictl config file so containers can be debugged directly against containerd:
shell
sudo tee /etc/crictl.yaml <<'EOF'
runtime-endpoint: unix:///run/containerd/containerd.sock
image-endpoint: unix:///run/containerd/containerd.sock
timeout: 10
debug: false
EOF
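With this file in place crictl talks to containerd directly; a couple of standard subcommands to sanity-check the runtime:
shell
# should print runtime and image service status without errors
sudo crictl info | head -n 20
# list cached images and all containers (will be empty before kubeadm init)
sudo crictl images
sudo crictl ps -a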
7. Kubernetes repository
(/etc/yum.repos.d/kubernetes.repo)
shell
cat <<EOF | sudo tee /etc/yum.repos.d/kubernetes.repo
[kubernetes]
name=Kubernetes
baseurl=https://mirrors.aliyun.com/kubernetes-new/core/stable/v1.28/rpm/
enabled=1
gpgcheck=1
gpgkey=https://mirrors.aliyun.com/kubernetes-new/core/stable/v1.28/rpm/repodata/repomd.xml.key
EOF
shell
# Install the Kubernetes components
yum install -y kubelet kubeadm kubectl
# Enable and start kubelet
sudo systemctl enable kubelet && sudo systemctl start kubelet
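Confirm the installed versions line up (all three should report v1.28.x with this repo):
shell
kubeadm version -o short
kubelet --version
kubectl version --client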
Pre-pull all images needed for Kubernetes cluster initialization
shell
kubeadm config images pull --image-repository=registry.aliyuncs.com/google_containers
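If you want to see which images kubeadm is going to pull before actually pulling them:
shell
kubeadm config images list --image-repository=registry.aliyuncs.com/google_containers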
shell
kubeadm init \
--image-repository=registry.aliyuncs.com/google_containers \
--pod-network-cidr=10.244.0.0/16
# Or initialize with a config file you created yourself
#sudo kubeadm init --config=kubeadm-config.yaml
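For reference, a kubeadm-config.yaml roughly equivalent to the flags above might look like the sketch below; the version number and advertise address are assumptions based on this walkthrough, so adjust them to your environment:
yaml
# kubeadm-config.yaml - example sketch, values assumed from this guide
apiVersion: kubeadm.k8s.io/v1beta3
kind: InitConfiguration
localAPIEndpoint:
  advertiseAddress: 172.16.157.129
---
apiVersion: kubeadm.k8s.io/v1beta3
kind: ClusterConfiguration
kubernetesVersion: v1.28.0
imageRepository: registry.aliyuncs.com/google_containers
networking:
  podSubnet: 10.244.0.0/16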
If initialization fails:
shell
# 1. Reset kubeadm
sudo kubeadm reset -f
# 2. Clean up CNI network configuration
sudo rm -rf /etc/cni/net.d
# 3. Flush iptables rules
sudo iptables -F && sudo iptables -t nat -F && sudo iptables -t mangle -F && sudo iptables -X
# 4. Restart containerd and kubelet
sudo systemctl restart containerd
sudo systemctl restart kubelet
# 5. Check kubelet status (it should be healthy now)
sudo systemctl status kubelet
On success, the output ends like this:
shell
[addons] Applied essential addon: CoreDNS
[addons] Applied essential addon: kube-proxy
Your Kubernetes control-plane has initialized successfully!
To start using your cluster, you need to run the following as a regular user:
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
Alternatively, if you are the root user, you can run:
export KUBECONFIG=/etc/kubernetes/admin.conf
You should now deploy a pod network to the cluster.
Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at:
https://kubernetes.io/docs/concepts/cluster-administration/addons/
Then you can join any number of worker nodes by running the following on each as root:
kubeadm join 172.16.157.129:6443 --token dlzfpi.q0prlokpqwa97hx4 \
--discovery-token-ca-cert-hash sha256:c944839158db334e6bc806f4806d3227ddb778245a5e1eeda40fc0f198f0899a
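The join token above expires after 24 hours by default; if the worker joins later or the command is lost, regenerate it on the master:
shell
kubeadm token create --print-join-command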
8. Install the Calico network plugin
shell
curl -O https://raw.githubusercontent.com/projectcalico/calico/v3.27.0/manifests/calico.yaml
shell
# Back up the original file
cp calico.yaml calico.yaml.bak
# Replace every docker.io/calico with quay.io/calico
sed -i 's|docker.io/calico|quay.io/calico|g' calico.yaml
Manually pull the images
shell
# Core images:
crictl pull quay.io/calico/node:v3.27.0
crictl pull quay.io/calico/cni:v3.27.0
crictl pull quay.io/calico/kube-controllers:v3.27.0
# Additional images:
crictl pull quay.io/calico/pod2daemon-flexvol:v3.27.0
crictl pull quay.io/calico/csi:v3.27.0
crictl pull quay.io/calico/typha:v3.27.0
crictl pull quay.io/calico/node-driver-registrar:v3.27.0
shell
# Apply the updated manifest
kubectl apply -f calico.yaml
# Watch the pod status
kubectl get pods -n kube-system -w
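Once the Calico pods are Running, the node should switch to Ready; a quick way to confirm:
shell
kubectl get nodes -o wide
kubectl get pods -n kube-system -l k8s-app=calico-node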
9. Install the Harbor certificate
Use containerd's certs.d mechanism
shell
# Create a certificate directory dedicated to this registry
sudo mkdir -p /etc/containerd/certs.d/192.168.31.130
# Copy in the certificate; /tmp/harbor-ca.crt was uploaded manually from the Harbor machine
sudo cp /tmp/harbor-ca.crt /etc/containerd/certs.d/192.168.31.130/ca.crt
📁 Path format:
/etc/containerd/certs.d/<registry-host>/ca.crt
Make sure containerd has config_path enabled (critical!)
shell
vim /etc/containerd/config.toml
toml
[plugins.'io.containerd.cri.v1.images'.registry]
config_path = ''
Change it to:
toml
[plugins.'io.containerd.cri.v1.images'.registry]
config_path = '/etc/containerd/certs.d'
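containerd only reads config.toml at startup, so restart it after this change. Optionally you can also add an explicit hosts.toml next to the CA (certs.d also accepts a bare ca.crt); the snippet below is a sketch assuming Harbor is served over HTTPS at 192.168.31.130, and it ends with a pull to verify the trust chain:
shell
# reload the registry configuration
sudo systemctl restart containerd
# optional: explicit hosts.toml pointing at the CA placed above
sudo tee /etc/containerd/certs.d/192.168.31.130/hosts.toml <<'EOF'
server = "https://192.168.31.130"

[host."https://192.168.31.130"]
  ca = "/etc/containerd/certs.d/192.168.31.130/ca.crt"
EOF
# verify: pulling an image from Harbor should now succeed
sudo crictl pull 192.168.31.130/development/dashboard:v2.7.0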
10. Install the Dashboard
shell
curl -LO https://raw.githubusercontent.com/kubernetes/dashboard/v2.7.0/aio/deploy/recommended.yaml
vim recommended.yaml
Find this block:
yaml
kind: Service
apiVersion: v1
metadata:
  labels:
    k8s-app: kubernetes-dashboard
  name: kubernetes-dashboard
  namespace: kubernetes-dashboard
spec:
  ports:
    - port: 443
      targetPort: 8443
  selector:
    k8s-app: kubernetes-dashboard
Modify it as follows:
yaml
kind: Service
apiVersion: v1
metadata:
  labels:
    k8s-app: kubernetes-dashboard
  name: kubernetes-dashboard
  namespace: kubernetes-dashboard
spec:
  type: NodePort                 # NodePort mode
  ports:
    - port: 443
      targetPort: 8443
      nodePort: 30443            # ← set explicitly (must be in the 30000-32767 range)
  selector:
    k8s-app: kubernetes-dashboard
Two image references also need to be changed to ones that are actually reachable:
yaml
image: kubernetesui/dashboard:v2.7.0
# I changed this to my own local registry image
image: 192.168.31.130/development/dashboard:v2.7.0
yaml
image: kubernetesui/metrics-scraper:v1.0.8
# I changed this to my own local registry image
image: 192.168.31.130/development/metrics-scraper:v1.0.8
Tolerate the control-plane taints; update both tolerations blocks as follows:
yaml
tolerations:
  - key: node-role.kubernetes.io/master
    effect: NoSchedule
  - key: node-role.kubernetes.io/control-plane   # 👈 add these two lines
    effect: NoSchedule
Install
shell
kubectl apply -f recommended.yaml
Check
shell
[root@172 192.168.31.130]# kubectl get pod -A
NAMESPACE NAME READY STATUS RESTARTS AGE
kube-system calico-kube-controllers-86778b9f8c-8ctqw 1/1 Running 0 22h
kube-system calico-node-bnvgn 1/1 Running 0 22h
kube-system coredns-66f779496c-6cfqj 1/1 Running 0 22h
kube-system coredns-66f779496c-8j7j2 1/1 Running 0 22h
kube-system etcd-172.16.157.129 1/1 Running 0 22h
kube-system kube-apiserver-172.16.157.129 1/1 Running 0 22h
kube-system kube-controller-manager-172.16.157.129 1/1 Running 0 22h
kube-system kube-proxy-9cddd 1/1 Running 0 22h
kube-system kube-scheduler-172.16.157.129 1/1 Running 0 22h
kubernetes-dashboard dashboard-metrics-scraper-79db99bf88-x2znj 1/1 Running 0 22m
kubernetes-dashboard kubernetes-dashboard-5f6f8d5bdb-bqmkz 1/1 Running 0 22m
[root@172 192.168.31.130]# kubectl -n kubernetes-dashboard get svc kubernetes-dashboard
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
kubernetes-dashboard NodePort 10.99.14.197 <none> 443:30443/TCP 35m
Open in a browser: https://<node-ip>:30443 (for example https://172.16.157.129:30443)
Create an admin user: admin-user.yaml
yaml
# admin-user.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: admin-user
  namespace: kubernetes-dashboard
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: admin-user
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - kind: ServiceAccount
    name: admin-user
    namespace: kubernetes-dashboard
Apply it and get a token:
shell
kubectl apply -f admin-user.yaml
kubectl -n kubernetes-dashboard create token admin-user
Copy the token from the output, choose the Token option on the Dashboard login page, and paste it in.
How to delete admin-user
shell
kubectl -n kubernetes-dashboard delete sa admin-user
kubectl delete clusterrolebinding admin-user
Create a developer account: dev-user.yaml
yaml
---
# 1. Create the ServiceAccount (in the default namespace)
apiVersion: v1
kind: ServiceAccount
metadata:
  name: dev-user
  namespace: default
---
# 2. Define a Role (limited to the default namespace)
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: default
  name: dev-role
rules:
  - apiGroups: [""]
    resources: ["pods", "services", "configmaps", "persistentvolumeclaims"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  - apiGroups: ["apps"]
    resources: ["deployments", "replicasets", "daemonsets", "statefulsets"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  - apiGroups: ["batch"]
    resources: ["jobs", "cronjobs"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
  - apiGroups: ["networking.k8s.io"]
    resources: ["ingresses"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
---
# 3. Bind the Role to the ServiceAccount
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: dev-rolebinding
  namespace: default
subjects:
  - kind: ServiceAccount
    name: dev-user
    namespace: default
roleRef:
  kind: Role
  name: dev-role
  apiGroup: rbac.authorization.k8s.io
Apply the configuration
shell
kubectl apply -f dev-user.yaml
Get a login token (for the Dashboard or kubectl)
shell
kubectl -n default create token dev-user
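If the developer works with kubectl instead of the Dashboard, the token can be packaged into a kubeconfig; this is a sketch, the file name dev-user.kubeconfig is arbitrary and --insecure-skip-tls-verify is only used to keep the example short:
shell
TOKEN=$(kubectl -n default create token dev-user)
kubectl config set-cluster k8s --server=https://172.16.157.129:6443 \
  --insecure-skip-tls-verify=true --kubeconfig=dev-user.kubeconfig
kubectl config set-credentials dev-user --token="$TOKEN" --kubeconfig=dev-user.kubeconfig
kubectl config set-context dev --cluster=k8s --user=dev-user --namespace=default \
  --kubeconfig=dev-user.kubeconfig
kubectl config use-context dev --kubeconfig=dev-user.kubeconfig
# quick check: the developer should only see resources in the default namespace
kubectl --kubeconfig=dev-user.kubeconfig get pods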
11. Startup script
All-in-One self-healing startup script: startup-k8s.sh
shell
vim /usr/local/bin/startup-k8s.sh
bash
#!/bin/bash
# startup-k8s.sh - All-in-One Kubernetes self-healing startup script
# Place it at /usr/local/bin/startup-k8s.sh and optionally run it at boot
set -e
echo "🚀 Starting the Kubernetes boot-time self-healing flow..."
# Make sure kubectl has a kubeconfig when run as root at boot (assumes the kubeadm admin.conf path)
export KUBECONFIG=${KUBECONFIG:-/etc/kubernetes/admin.conf}
# Get the node name
NODE_NAME=$(hostname)
# 1. Make sure containerd and kubelet are running
echo "✅ Ensuring the container runtime and kubelet are started..."
sudo systemctl enable --now containerd kubelet
# 2. Wait for containerd to become ready (up to 60 seconds)
echo "⏳ Waiting for containerd to become ready..."
for i in {1..30}; do
  if sudo crictl info &>/dev/null; then
    echo " ✅ containerd is ready"
    break
  fi
  sleep 2
done
# 3. Wait for the API server to become available
echo "⏳ Waiting for the Kubernetes API to become ready..."
for i in {1..60}; do
  if kubectl get nodes &>/dev/null; then
    echo " ✅ API server is ready"
    break
  fi
  sleep 2
done
# 4. Automatically uncordon the node (clear SchedulingDisabled)
echo "🔧 Checking whether the node needs to be uncordoned..."
if kubectl get node "$NODE_NAME" -o jsonpath='{.spec.unschedulable}' 2>/dev/null | grep -q 'true'; then
  kubectl uncordon "$NODE_NAME"
  echo " ✅ Node scheduling restored"
else
  echo " ℹ️ Node is not cordoned, skipping"
fi
# 5. Automatically remove the control-plane taint (required for All-in-One!)
echo "🧹 Checking for and removing the control-plane taint..."
if kubectl describe node "$NODE_NAME" | grep -q 'node-role.kubernetes.io/control-plane:NoSchedule'; then
  kubectl taint nodes "$NODE_NAME" node-role.kubernetes.io/control-plane:NoSchedule- 2>/dev/null || true
  echo " ✅ Control-plane taint removed"
else
  echo " ℹ️ Taint not present, skipping"
fi
# 6. Wait for CoreDNS to become ready (optional, confirms the cluster is fully usable)
echo "⏳ Waiting for CoreDNS to become ready..."
for i in {1..30}; do
  if kubectl get pod -n kube-system -l k8s-app=kube-dns --field-selector=status.phase=Running --no-headers | grep -q 'Running'; then
    echo " ✅ CoreDNS is running, the cluster is fully ready!"
    break
  fi
  sleep 3
done
echo "🎉 Kubernetes All-in-One cluster self-healing complete!"
echo "👉 Run 'kubectl get pod -A' to check the status"
shell
sudo chmod +x /usr/local/bin/startup-k8s.sh
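The script header mentions that running it at boot is optional; one way to wire that up is a small systemd oneshot unit. The unit name startup-k8s.service is just an example:
shell
sudo tee /etc/systemd/system/startup-k8s.service <<'EOF'
[Unit]
Description=Kubernetes All-in-One self-healing on boot
After=network-online.target containerd.service kubelet.service
Wants=network-online.target

[Service]
Type=oneshot
ExecStart=/usr/local/bin/startup-k8s.sh
RemainAfterExit=true

[Install]
WantedBy=multi-user.target
EOF
sudo systemctl daemon-reload
sudo systemctl enable startup-k8s.service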
12. Shutdown script
All-in-One shutdown script (generic version)
shell
vim /usr/local/bin/shutdown-k8s.sh
bash
#!/bin/bash
# shutdown-k8s.sh - for kubeadm / k3s single-node clusters
set -e
echo "🔄 Starting the graceful Kubernetes shutdown flow..."
# 1. Get the current node name (works for kubeadm and k3s)
NODE_NAME=$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || hostname)
echo "📍 Node name: $NODE_NAME"
# 2. Mark the node as unschedulable (cordon)
echo "⏸️ Cordoning the node..."
kubectl cordon "$NODE_NAME" 2>/dev/null || echo "⚠️ kubectl unavailable, skipping cordon"
# 3. Evict all pods (keep DaemonSets, delete emptyDir data)
echo "🧹 Draining workload pods..."
kubectl drain "$NODE_NAME" \
  --ignore-daemonsets \
  --delete-emptydir-data \
  --force \
  --grace-period=30 \
  2>/dev/null || echo "⚠️ drain failed, continuing shutdown (there may be no workloads)"
# 4. (Optional) stop kubelet and the container runtime (avoid writes during power-off)
echo "⏹️ Stopping kubelet and containerd..."
sudo systemctl stop kubelet containerd 2>/dev/null || echo "⚠️ Failed to stop services, continuing shutdown"
# 5. Power off
echo "🔌 Shutting down..."
sudo shutdown -h now
shell
sudo chmod +x /usr/local/bin/shutdown-k8s.sh