ubuntu 26.04 k8s 1.36 ceph

172.16.0.100 master

172.16.0.101 node01

172.16.0.102 node02

1. 开机卡顿

bash 复制代码
# Stop, disable and mask both "wait-online" units so they never start again.
for unit in NetworkManager-wait-online.service systemd-networkd-wait-online.service; do
  sudo systemctl disable --now "${unit}"
  sudo systemctl mask "${unit}"
done

# Have systemd re-read its unit configuration.
sudo systemctl daemon-reload

# Netplan 网卡设置(防止单网卡 DHCP 卡住等待)
sudo vim /etc/netplan/*.yaml

network:
  ethernets:
    ens33:
      dhcp4: true
      optional: true   # 新增这一行
  version: 2

sudo netplan apply

# chrony aliyun.com
sudo vim /etc/chrony/sources.d/aliyun.sources
# 阿里云公共 NTP 服务
server ntp.aliyun.com iburst
server ntp1.aliyun.com iburst
server ntp2.aliyun.com iburst
server ntp3.aliyun.com iburst
server ntp4.aliyun.com iburst
server ntp5.aliyun.com iburst
server ntp6.aliyun.com iburst
server ntp7.aliyun.com iburst

systemctl restart chronyd

2. apt 源配置

bash 复制代码
/etc/apt/sources.list.d/ubuntu.sources

Types: deb
URIs: https://mirrors.aliyun.com/ubuntu
Suites: resolute resolute-updates resolute-backports
Components: main universe restricted multiverse
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

Types: deb
URIs: https://mirrors.aliyun.com/ubuntu
Suites: resolute-security
Components: main universe restricted multiverse
Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg

3. 添加域名

bash 复制代码
# Register the cluster hosts in /etc/hosts. Each entry is appended only when
# the exact line is not already present, so re-running these commands does
# not pile up duplicate lines.
for entry in "172.16.0.100 master" "172.16.0.101 node01" "172.16.0.102 node02"; do
  grep -qxF -- "${entry}" /etc/hosts || echo "${entry}" >> /etc/hosts
done

4. 关闭防火墙与 Swap

bash 复制代码
# Stop ufw and make sure it cannot be started again.
sudo systemctl stop ufw
sudo systemctl disable ufw
sudo systemctl mask ufw

# Open the default policies BEFORE flushing: when a chain's policy is DROP,
# flushing its rules first would cut off the current (e.g. SSH) session.
sudo iptables -P INPUT ACCEPT
sudo iptables -P FORWARD ACCEPT
sudo iptables -P OUTPUT ACCEPT
sudo iptables -F
sudo iptables -X
sudo iptables -t nat -F
sudo iptables -t nat -X

# Turn swap off now, then comment out the swap entries in /etc/fstab so it
# stays off after a reboot. Only lines that are not already comments and that
# carry "swap" as a whitespace-delimited field are touched, so running this
# twice does not stack extra '#' characters.
sudo swapoff -a
sudo sed -i -E '/^[[:space:]]*#/! s/^(.*[[:space:]]swap[[:space:]].*)$/#\1/' /etc/fstab
free -h

5. 加载内核模块并开启网络转发

bash 复制代码
vim k8s-network.sh
#!/bin/bash
# Persist and immediately apply the kernel modules and sysctl settings
# written by this script, then print them for verification.

modules=(overlay br_netfilter)
sysctl_keys=(
  net.ipv4.ip_forward
  net.bridge.bridge-nf-call-iptables
  net.bridge.bridge-nf-call-ip6tables
)

# 1. Modules to load at boot (one module name per line).
printf '%s\n' "${modules[@]}" > /etc/modules-load.d/k8s-network.conf

# 2. Persist the forwarding parameters (each key set to 1).
printf '%s = 1\n' "${sysctl_keys[@]}" > /etc/sysctl.d/99-k8s-forward.conf

# 3. Load the modules right away.
for mod in "${modules[@]}"; do
  modprobe "${mod}"
done

# 4. Apply the sysctl configuration right away.
sysctl --system

# 5. Print the resulting values and loaded modules.
echo "===== 验证参数 ====="
sysctl "${sysctl_keys[@]}"
echo "===== 验证内核模块 ====="
lsmod | grep -E "overlay|br_netfilter"





# 重启检验
# 重启后执行
sysctl net.ipv4.ip_forward
sysctl net.bridge.bridge-nf-call-iptables
sysctl net.bridge.bridge-nf-call-ip6tables
lsmod | grep overlay
lsmod | grep br_netfilter

6. 安装 containerd

bash 复制代码
sudo apt install -y containerd


vi fix-containerd2.sh


#!/usr/bin/env bash
# Regenerate containerd's default configuration, switch it to the systemd
# cgroup driver, point the sandbox (pause) image at the Aliyun mirror, and
# restart containerd. Writes under /etc/containerd, so it must run as root.
set -euo pipefail

readonly config_file=/etc/containerd/config.toml
readonly pause_image='registry.aliyuncs.com/google_containers/pause:3.10.1'

mkdir -p /etc/containerd
# Start from a pristine default configuration.
containerd config default > "${config_file}"

# Enable the systemd cgroup driver.
sed -i 's/SystemdCgroup = false/SystemdCgroup = true/' "${config_file}"

# Replace the pause image. `sandbox = ...` is the key this script has always
# rewritten; `sandbox_image = ...` is rewritten as well.
# NOTE(review): `sandbox_image` is believed to be the key used by
# containerd 1.x config layouts - confirm against the installed version.
sed -i \
  -e "s|sandbox = .*|sandbox = '${pause_image}'|" \
  -e "s|sandbox_image = .*|sandbox_image = \"${pause_image}\"|" \
  "${config_file}"

# sed exits 0 even when nothing matched, so check that both edits landed
# instead of restarting containerd with an unchanged configuration.
grep -q 'SystemdCgroup = true' "${config_file}" \
  || { echo "SystemdCgroup was not enabled in ${config_file}" >&2; exit 1; }
grep -qF -- "${pause_image}" "${config_file}" \
  || { echo "pause image was not set in ${config_file}" >&2; exit 1; }

# Reload unit files and restart the service with the new configuration.
systemctl daemon-reload
systemctl restart containerd

7. 安装 K8s 组件

bash 复制代码
# Install kubelet, kubeadm and kubectl from the pkgs.k8s.io v1.36 repository
# and put them on hold so routine upgrades do not change their version.
k8s_repo="https://pkgs.k8s.io/core:/stable:/v1.36/deb"
keyring=/etc/apt/keyrings/kubernetes-apt-keyring.gpg

mkdir -p /etc/apt/keyrings

# --yes lets gpg overwrite an existing keyring, so this step can be re-run
# without stopping at an overwrite prompt.
curl -fsSL "${k8s_repo}/Release.key" \
  | gpg --yes --dearmor -o "${keyring}"

echo "deb [signed-by=${keyring}] ${k8s_repo}/ /" \
  | tee /etc/apt/sources.list.d/kubernetes.list

apt update
apt install -y kubelet kubeadm kubectl
apt-mark hold kubelet kubeadm kubectl

# Print the installed versions.
kubeadm version
kubectl version --client
kubelet --version




# CAUTION: destructive. Run these only to wipe a previous or failed cluster
# setup on this host before starting over; skip them on a fresh install.
kubeadm reset -f
rm -rf /etc/kubernetes /var/lib/kubelet /var/lib/etcd
# ${HOME:?} aborts the command when HOME is unset or empty instead of
# letting the path collapse to "/.kube".
rm -rf -- "${HOME:?}/.kube"

8. 初始化 master 节点

bash 复制代码
# 查看镜像
# containerd 镜像 /etc/containerd/config.toml grep pause:3.10.1
root@ubuntu26:~# kubeadm config images list
registry.k8s.io/kube-apiserver:v1.36.2
registry.k8s.io/kube-controller-manager:v1.36.2
registry.k8s.io/kube-scheduler:v1.36.2
registry.k8s.io/kube-proxy:v1.36.2
registry.k8s.io/coredns/coredns:v1.14.2
registry.k8s.io/pause:3.10.2
registry.k8s.io/etcd:3.6.8-0

# 提前下载并更换 tag 脚本
#!/bin/bash
# Pre-pull the control-plane images from the Huawei Cloud SWR mirror into
# containerd's k8s.io namespace and retag them as registry.k8s.io/*.
# Stops at the first failure so a failed pull is never followed by a
# tag/remove on a missing image.
set -euo pipefail

K8S_VERSION="v1.36.2"
COREDNS_VERSION="v1.14.2"
ETCD_VERSION="3.6.8-0"
PAUSE_VERSION="3.10.2"
REGISTRY_PREFIX="swr.cn-north-4.myhuaweicloud.com/ddn-k8s/registry.k8s.io"

images=(
  "kube-apiserver:${K8S_VERSION}"
  "kube-controller-manager:${K8S_VERSION}"
  "kube-scheduler:${K8S_VERSION}"
  "kube-proxy:${K8S_VERSION}"
  "coredns/coredns:${COREDNS_VERSION}"
  "etcd:${ETCD_VERSION}"
  "pause:${PAUSE_VERSION}"
)

for img in "${images[@]}"; do
  source_img="${REGISTRY_PREFIX}/${img}"
  target_img="registry.k8s.io/${img}"

  echo "正在拉取(命名空间 k8s.io): ${source_img}"
  ctr -n k8s.io images pull "${source_img}" \
    || { echo "拉取 ${source_img} 失败,请检查镜像是否存在或网络连接" >&2; exit 1; }

  echo "正在打标签: ${target_img}"
  # --force replaces an existing target reference, so the script can be re-run.
  ctr -n k8s.io images tag --force "${source_img}" "${target_img}"

  # Remove the mirror-prefixed reference; the retagged reference remains.
  # Comment this line out to keep the mirror reference as well.
  ctr -n k8s.io images remove "${source_img}"
done

echo "所有镜像准备完毕!"







apt update
apt install -y cri-tools

cat > /etc/crictl.yaml <<EOF
runtime-endpoint: unix:///run/containerd/containerd.sock
image-endpoint: unix:///run/containerd/containerd.sock
timeout: 10
debug: false
EOF


crictl --version
crictl info


# List every container, including exited ones, to see whether
# apiserver/etcd have quit.
crictl ps -a

# etcd logs. --name filters by container name and -q prints only IDs.
# `head -n 1` keeps the first ID listed: restarts can leave several matching
# containers behind, and `crictl logs` takes a single container ID.
ETCD_ID=$(crictl ps -a --name etcd -q | head -n 1)
crictl logs "${ETCD_ID}"

# kube-apiserver logs, selected the same way.
APISERVER_ID=$(crictl ps -a --name kube-apiserver -q | head -n 1)
crictl logs "${APISERVER_ID}"


# 列出 k8s 命名空间所有容器
ctr -n k8s.io c list
# 查看镜像
ctr -n k8s.io images list
bash 复制代码
# kubeadm 配置文件修改默认仓库
kubeadm config print init-defaults > kubeadm-config.yaml
vim kubeadm-config.yaml
imageRepository: registry.aliyuncs.com/google_containers
advertiseAddress: 172.16.0.100



# Pull the control-plane images using the edited config file ...
kubeadm config images pull --config kubeadm-config.yaml

# ... or, without a config file, name the version and mirror explicitly.
# The version is v1.36.2 to match the images reported by
# `kubeadm config images list`.
kubeadm config images pull \
  --kubernetes-version=v1.36.2 \
  --image-repository registry.aliyuncs.com/google_containers



kubeadm init --config kubeadm-config.yaml


  mkdir -p $HOME/.kube
  sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
  sudo chown $(id -u):$(id -g) $HOME/.kube/config

kubeadm join 172.16.0.100:6443 --token abcdef.0123456789abcdef \
        --discovery-token-ca-cert-hash sha256:2daec271f14cf5b143ba8ab7ece30c0b21a874942a1bf809b643f934cdd1c433






# CAUTION: destructive. Run these ONLY when `kubeadm init` / `kubeadm join`
# failed and the node has to be set up again; do not run them after a
# successful init, they wipe the cluster state on this host.
kubeadm reset -f
rm -rf /etc/kubernetes /var/lib/kubelet /var/lib/etcd
# ${HOME:?} aborts the command when HOME is unset or empty instead of
# letting the path collapse to "/.kube".
rm -rf -- "${HOME:?}/.kube"

9. Node 节点加入

bash 复制代码
kubeadm join 172.16.0.100:6443 --token abcdef.0123456789abcdef \
        --discovery-token-ca-cert-hash sha256:7538b5a6f63256963831309c85ef60ec1a4fa7855144cbaa9a1ca2ff0286caa1




mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config

10. 安装calico网络插件

bash 复制代码
wget https://raw.githubusercontent.com/projectcalico/calico/v3.30.3/manifests/calico.yaml
wget https://fastgit.org/projectcalico/calico/raw/v3.30.3/manifests/calico.yaml
wget https://cdn.jsdelivr.net/gh/projectcalico/calico@v3.30.3/manifests/calico.yaml
bash 复制代码
grep "image:" calico.yaml
image: docker.io/calico/cni:v3.30.3
image: docker.io/calico/node:v3.30.3
image: docker.io/calico/kube-controllers:v3.30.3


vim pull_calico_images.sh



#!/bin/bash
# Pull the Calico images from the Huawei Cloud SWR mirror into containerd's
# k8s.io namespace, retag them as docker.io/calico/*, and drop the mirror
# references.

# Calico version (adjust as needed).
CALICO_VERSION="v3.30.3"

# Huawei Cloud SWR registry prefix (pull source).
REGISTRY_PREFIX="swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/calico"

# Pull, retag and clean up a single Calico component.
# Arguments: $1 - component name without a version (e.g. "cni")
# Exits the script with status 1 when the pull fails.
mirror_image() {
  local component=$1
  local mirror_ref="${REGISTRY_PREFIX}/${component}:${CALICO_VERSION}"
  local upstream_ref="docker.io/calico/${component}:${CALICO_VERSION}"

  echo ">>> 正在从华为云拉取源镜像(命名空间 k8s.io): ${mirror_ref}"
  if ! ctr -n k8s.io images pull "${mirror_ref}"; then
    echo "拉取失败,请检查镜像是否存在或网络连接"
    exit 1
  fi

  echo ">>> 打标签为(命名空间 k8s.io): ${upstream_ref}"
  ctr -n k8s.io images tag "${mirror_ref}" "${upstream_ref}"

  echo ">>> 删除源镜像(华为云前缀): ${mirror_ref}"
  ctr -n k8s.io images remove "${mirror_ref}"

  echo ">>> 完成 ${component}"
  echo
}

for component in cni node kube-controllers; do
  mirror_image "${component}"
done

echo "所有 Calico 镜像已准备在 k8s.io 命名空间!"
echo "当前 k8s.io 命名空间中的 Calico 镜像列表:"
ctr -n k8s.io images list | grep calico




# The pod network CIDR must be identical on both sides: the value given to
# kubeadm init on the master (--pod-network-cidr=10.244.0.0/16) and
# CALICO_IPV4POOL_CIDR below.
- name: CALICO_IPV4POOL_CIDR
  value: "10.244.0.0/16"

# 修改为 BGP 模式
# Enable IPIP
- name: CALICO_IPV4POOL_IPIP
  value: "Always"  #改成Off







kubectl apply -f calico.yaml
kubectl get pod -A


kubectl delete pod -n kube-system pod --force --grace-period=0

11. nginx

bash 复制代码
# 创建 nginx 部署
kubectl create deployment nginx --image=docker.m.daocloud.io/library/nginx:alpine

# 暴露端口,外部可访问(NodePort)
kubectl expose deployment nginx --port=80 --type=NodePort



nginx-deploy.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: docker.m.daocloud.io/library/nginx:alpine
        ports:
        - containerPort: 80
---
apiVersion: v1
kind: Service
metadata:
  name: nginx-svc
spec:
  type: NodePort
  selector:
    app: nginx
  ports:
  - port: 80
    targetPort: 80
    nodePort: 30080

12. Ceph

bash 复制代码
lsblk -f


# 清除硬盘数据
# 请将 /dev/sdX 替换为你的实际设备名,例如 /dev/sdb
sudo wipefs -a /dev/sdX
# 部署Rook Operator
# 克隆Rook仓库
git clone --single-branch --branch v1.15.1 https://gitee.com/mirrors/ROOK.git
cd ROOK/deploy/examples


# 一键拉取脚本
vim pull_ceph_images.sh
#!/bin/bash
# Pre-pull the Rook v1.15.1 and Ceph v18.2.4 images from the Huawei Cloud SWR
# mirror into containerd's k8s.io namespace and retag them.

set -e

# Image map: key = mirror (source) reference, value = reference to tag it as.
# When key and value are identical the image is pulled but not retagged.
declare -A images=(
    ["swr.cn-north-4.myhuaweicloud.com/ddn-k8s/docker.io/rook/ceph:v1.15.1"]="docker.io/rook/ceph:v1.15.1"
    # The CephCluster spec requests quay.io/ceph/ceph:v18.2.4, so the mirror
    # copy is retagged under quay.io; a docker.io/ceph/ceph tag would not be
    # used by that spec.
    ["swr.cn-north-4.myhuaweicloud.com/ddn-k8s/quay.io/ceph/ceph:v18.2.4"]="quay.io/ceph/ceph:v18.2.4"
    # If quay.io is reachable, the official image can be pulled directly:
    # ["quay.io/ceph/ceph:v18.2.4"]="quay.io/ceph/ceph:v18.2.4"
)

# Pull with ctr (containerd) and retag.
for src in "${!images[@]}"; do
    target="${images[$src]}"

    echo ">>> 正在拉取源镜像: ${src}"
    ctr -n k8s.io images pull "${src}" || { echo "拉取 ${src} 失败,请检查网络"; exit 1; }

    # Retag only when the source and target references differ.
    if [ "${src}" != "${target}" ]; then
        echo ">>> 打标签为: ${target}"
        # --force replaces an existing target reference, so a re-run does not
        # abort under `set -e` because the tag already exists.
        ctr -n k8s.io images tag --force "${src}" "${target}"

        # (Optional) remove the source reference to save space:
        # echo ">>> 删除源镜像: ${src}"
        # ctr -n k8s.io images remove "${src}"
    fi

    echo ">>> 完成 ${target}"
    echo
done

echo "所有 Ceph 镜像已准备完毕!"
echo "当前 k8s.io 命名空间中的相关镜像列表:"
ctr -n k8s.io images list | grep -E "rook/ceph|ceph/ceph"

# 部署
kubectl create -f crds.yaml -f common.yaml -f operator.yaml

kubectl -n rook-ceph get pod



# 创建 Ceph 集群
# 获取并修改集群配置文件
cp ROOK/deploy/examples/cluster.yaml cluster.yaml

vim cluster.yaml
# Set the Ceph version in the CephCluster spec to v18.2.4
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: rook-ceph
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v18.2.4 # Ceph image (version) to run
  dataDirHostPath: /var/lib/rook # host directory where Ceph configuration is stored
  mon:
    count: 3 # three monitors, for high availability
    allowMultiplePerNode: false
  mgr:
    count: 2 # two managers
  dashboard:
    enabled: true # enable the Ceph dashboard
  storage:
    useAllNodes: true # use the devices on every node
    useAllDevices: false # must be false, so the system disk is never used
    # Key setting: deviceFilter selects exactly which disks are used
    deviceFilter: "^sd[bc]" # regex: device names starting with 'sd' followed by 'b' or 'c'
    config:
      osdsPerDevice: "1" # create one OSD per disk
  # Resource requests/limits (optional, but recommended)
  resources:
    mon:
      limits:
        memory: "2Gi"
      requests:
        memory: "1Gi"
        cpu: "500m"
    osd:
      limits:
        memory: "4Gi"
      requests:
        memory: "2Gi"
        cpu: "500m"
  placement:
    all:  # applies to all Ceph components (mon, mgr, osd, ...)
      tolerations:
        - effect: NoSchedule
          key: node-role.kubernetes.io/control-plane
        - effect: NoSchedule
          key: node-role.kubernetes.io/master



# 启动工具箱(如果尚未运行)
kubectl create -f toolbox.yaml

# 进入工具箱 Pod
kubectl -n rook-ceph exec -it deploy/rook-ceph-tools -- bash

# 在工具箱内执行ceph命令
ceph status
ceph osd status
ceph df


# 创建存储类(StorageClass)以供应用使用
# 创建块存储(RBD)池和StorageClass:
kubectl create -f csi/rbd/storageclass.yaml
kubectl get sc



# dashboard
kubectl edit cephcluster rook-ceph -n rook-ceph
spec:
  dashboard:
    enabled: true       # 开启面板
    ssl: true            # 默认HTTPS,建议开启
    port: 8443           # 默认端口

kubectl get svc -n rook-ceph | grep dashboard

dashboard-nodeport.yaml
apiVersion: v1
kind: Service
metadata:
  name: rook-ceph-mgr-dashboard-np
  namespace: rook-ceph
spec:
  type: NodePort
  selector:
    app: rook-ceph-mgr
    rook_cluster: rook-ceph
    mgr_role: active
  ports:
  - port: 8443
    targetPort: 8443
    nodePort: 30443  # 自定义端口(30000-32767)


# 查看密码
kubectl -n rook-ceph get secret rook-ceph-dashboard-password \
-o jsonpath='{.data.password}' | base64 --decode; echo
# 进入工具箱
kubectl -n rook-ceph exec -it deploy/rook-ceph-tools -- bash
# Change the dashboard password (replace the placeholder with your own).
# `ceph dashboard ac-user-set-password` reads the password from a file (-i),
# which keeps it out of the process argument list.
# NOTE(review): the older `set-login-credentials` subcommand is believed to be
# unavailable in recent Ceph releases - confirm for the version in use.
pw_file=$(mktemp)
printf '%s' '你的新密码' > "${pw_file}"
ceph dashboard ac-user-set-password admin -i "${pw_file}"
rm -f -- "${pw_file}"
相关推荐
爱吃龙利鱼2 小时前
k8s指定命名空间kubeconfig文件生成教程
容器·kubernetes
蜀道山老天师15 小时前
K8s 数据存储全解析:从 EmptyDir 到 PV/PVC
云原生·容器·kubernetes
创世宇图16 小时前
【Python工程化实战】Kubernetes 中 Python 应用的优雅启停与健康检查:零停机滚动更新实战
python·云原生·kubernetes·优雅停机
小二·20 小时前
Docker+K8s生产级部署实战:从0到1打造高可用微服务集群
docker·微服务·kubernetes
运维开发故事8 天前
基于 Arthas 的多集群在线诊断系统设计与实现
kubernetes
Patrick_Wilson10 天前
从「改个端口」到 502:Next.js on k8s 的容器端口、Service 映射与 env 覆盖
docker·kubernetes·next.js
探索云原生10 天前
K8s 1.36 这个 GA 特性,把 initContainer 拉模型的 hack 干掉了
ai·云原生·kubernetes
Java之美11 天前
一次k8s升级引发的DevicePlugin注册失败
云原生·kubernetes