Docker环境部署Apache Hadoop3.1定制版

说明

在最新版本的 Docker 环境中部署 Hadoop 定制版。本文使用的容器镜像仅适用于 AMD64 架构的 CPU 环境;ARM 环境需要自行定制镜像,可参考历史文章记录。

创建资源

bash 复制代码
# Create the project directory together with its config/ subdirectory
# (-p also creates missing parents), then change into config/.
mkdir -p hadoop/config
cd hadoop/config

创建文件capacity-scheduler.xml

bash 复制代码
<?xml version="1.0"?>
<!-- CapacityScheduler configuration: one queue, root.default, owning all capacity. -->
<configuration>

  <!-- ===== Scheduler-wide settings ===== -->
  <property>
    <name>yarn.scheduler.capacity.resource-calculator</name>
    <value>org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.maximum-applications</name>
    <value>10000</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
    <value>0.1</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.node-locality-delay</name>
    <value>40</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.rack-locality-additional-delay</name>
    <value>-1</value>
  </property>

  <!-- ===== Queue layout: "default" is the only child of root ===== -->
  <property>
    <name>yarn.scheduler.capacity.root.queues</name>
    <value>default</value>
  </property>

  <!-- ===== root.default: capacity and lifecycle ===== -->
  <property>
    <name>yarn.scheduler.capacity.root.default.state</name>
    <value>RUNNING</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.capacity</name>
    <value>100</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
    <value>100</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.maximum-application-lifetime</name>
    <value>-1</value>
  </property>

  <!-- ===== root.default: ACLs ("*" is a wildcard value) ===== -->
  <property>
    <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
    <value>*</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
    <value>*</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.acl_application_max_priority</name>
    <value>*</value>
  </property>

</configuration>

创建文件core-site.xml

bash 复制代码
<?xml version="1.0"?>
<!-- Hadoop core settings shared by every daemon in the compose stack. -->
<configuration>
  <!-- Base directory for Hadoop's temporary data inside the container. -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/opt/hadoop/tmpdata</value>
  </property>
  <!-- Default filesystem URI; host and port match dfs.namenode.rpc-address in hdfs-site.xml. -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://namenode:9000</value> <!-- matches the namenode hostname set in compose.yaml -->
  </property>
  <!--
    Proxy-user (impersonation) rules for the hadoop, hue, hive and root users.
    NOTE(review): "*" for both hosts and groups places no restriction on
    impersonation; this looks intended for a sandbox. Confirm before reuse elsewhere.
  -->
  <property>
    <name>hadoop.proxyuser.hadoop.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hadoop.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hive.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hive.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
  </property>
</configuration>

创建文件hdfs-site.xml

bash 复制代码
<?xml version="1.0"?>
<!-- HDFS settings for a one-NameNode / one-DataNode compose deployment. -->
<configuration>
  <!-- NameNode metadata directory; compose.yaml mounts the hadoop-nn-data volume here. -->
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/opt/hadoop/data/nn</value>
  </property>
  <!-- DataNode block directory; compose.yaml mounts the hadoop-dn-data volume here. -->
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/opt/hadoop/data/dn</value>
  </property>
  <!-- Address of the NameNode RPC endpoint; same host:port as fs.defaultFS in core-site.xml. -->
  <property>
    <name>dfs.namenode.rpc-address</name>
    <value>namenode:9000</value> <!-- matches the namenode hostname set in compose.yaml -->
  </property>
  <!-- Bind the RPC and HTTP servers to all interfaces. -->
  <property>
    <name>dfs.namenode.rpc-bind-host</name>
    <value>0.0.0.0</value>
  </property>
  <property>
    <name>dfs.namenode.http-bind-host</name>
    <value>0.0.0.0</value>
  </property>
  <!-- Single replica: the compose stack defines only one datanode service. -->
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>
  <!-- Clients connect to DataNodes by hostname rather than by IP address. -->
  <property>
    <name>dfs.client.use.datanode.hostname</name>
    <value>true</value>
  </property>
  <!-- NOTE(review): HDFS permission checking is switched off; confirm this is acceptable outside a sandbox. -->
  <property>
    <name>dfs.permissions.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
  </property>
  <!-- Disabled by default; uncomment the property below if DataNode registration fails on reverse DNS. -->
  <!-- <property>
    <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
    <value>false</value>
    <description>Set to false when the NameNode fails to reverse-resolve the DataNode.</description>
  </property> -->
</configuration>

创建文件mapred-site.xml

bash 复制代码
<?xml version="1.0"?>
<!-- MapReduce settings: jobs are submitted to YARN. -->
<configuration>
  <!-- Run MapReduce jobs on YARN. -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>
  <!--
    Export HADOOP_MAPRED_HOME to the ApplicationMaster, map and reduce
    processes; all three point at the same /opt/hadoop installation.
  -->
  <property>
    <name>yarn.app.mapreduce.am.env</name>
    <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
  </property>
  <property>
    <name>mapreduce.map.env</name>
    <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
  </property>
  <property>
    <name>mapreduce.reduce.env</name>
    <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
  </property>
</configuration>

创建文件yarn-site.xml

bash 复制代码
<?xml version="1.0"?>
<!-- YARN settings for the ResourceManager and NodeManager containers. -->
<configuration>
  <!-- Hostname other daemons use to reach the ResourceManager. -->
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>resourcemanager</value> <!-- matches the resourcemanager hostname set in compose.yaml -->
  </property>
  <!-- Listen on all interfaces; this controls binding, independent of the advertised addresses. -->
  <property>
    <name>yarn.resourcemanager.bind-host</name>
    <value>0.0.0.0</value>
  </property>
  <!--
    Advertised address of the ResourceManager web UI. It must be a name that
    NodeManagers, ApplicationMasters and clients can resolve, so it is not
    0.0.0.0: binding to every interface is already handled by
    yarn.resourcemanager.bind-host above.
  -->
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>resourcemanager:8088</value>
  </property>
  <property>
    <name>yarn.nodemanager.bind-host</name>
    <value>0.0.0.0</value>
  </property>
  <!-- Physical and virtual memory limit checks for containers are both disabled. -->
  <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>
  <!-- Delay (seconds) before a finished application's local files and logs are deleted, to ease debugging. -->
  <property>
    <name>yarn.nodemanager.delete.debug-delay-sec</name>
    <value>600</value>
  </property>
  <!-- Shuffle auxiliary service needed by MapReduce on YARN. -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>
  <property>
    <name>yarn.acl.enable</name>
    <value>false</value>
  </property>
  <!-- Environment variables that containers may inherit from the NodeManager. -->
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ,HADOOP_MAPRED_HOME</value>
  </property>
</configuration>

返回上一级目录 hadoop(执行 cd ..),在该目录下创建 compose.yaml(文件中的 ./config/ 路径相对于 compose.yaml 所在目录)

bash 复制代码
# Docker Compose stack for a small Hadoop cluster: one NameNode, one DataNode,
# one ResourceManager and one NodeManager on a shared bridge network.
# The ./config/*.xml bind mounts are resolved relative to this file's directory.

# Named volumes that persist HDFS metadata and block data across restarts.
volumes:
  hadoop-nn-data:
    name: hadoop-nn-data
  hadoop-dn-data:
    name: hadoop-dn-data

networks:
  hadoop:
    name: hadoop-net
    driver: bridge

services:
  namenode:
    image: zhuyifeiruichuang/hadoop:3.1.1
    hostname: namenode
    container_name: hadoop-namenode
    # Prepare the metadata directory, format HDFS only on first start (no
    # "current" directory yet), then start the NameNode. `set -e` stops at the
    # first failing step; `exec` replaces the shell so stop signals reach the
    # NameNode process directly.
    command:
      - sh
      - "-c"
      - |
        set -e
        sudo mkdir -p /opt/hadoop/data/nn
        sudo chown -R hadoop:hadoop /opt/hadoop/data
        if [ ! -d /opt/hadoop/data/nn/current ]; then
          echo 'Formatting NameNode...'
          hdfs namenode -format
        fi
        exec hdfs namenode
    ports:
      - "9870:9870"  # HTTP endpoint probed by the healthcheck below
      - "9000:9000"  # RPC port used by fs.defaultFS (hdfs://namenode:9000)
    environment:
      # NOTE(review): presumably consumed by the image's entrypoint script; verify.
      ENSURE_NAMENODE_DIR: "/opt/hadoop/data/nn"
      HADOOP_CONF_DIR: "/opt/hadoop/etc/hadoop"
    volumes:
      - hadoop-nn-data:/opt/hadoop/data/nn
      - /etc/localtime:/etc/localtime:ro
      - ./config/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml
      - ./config/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml
      - ./config/yarn-site.xml:/opt/hadoop/etc/hadoop/yarn-site.xml
      - ./config/mapred-site.xml:/opt/hadoop/etc/hadoop/mapred-site.xml
      - ./config/capacity-scheduler.xml:/opt/hadoop/etc/hadoop/capacity-scheduler.xml
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9870 || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 60s
    networks:
      - hadoop
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G

  datanode:
    image: zhuyifeiruichuang/hadoop:3.1.1
    hostname: datanode
    container_name: hadoop-datanode
    # Prepare the block directory, then start the DataNode as the main process.
    command:
      - sh
      - "-c"
      - |
        set -e
        sudo mkdir -p /opt/hadoop/data/dn
        sudo chown -R hadoop:hadoop /opt/hadoop/data
        exec hdfs datanode
    ports:
      - "9866:9866"
      - "9864:9864"  # HTTP endpoint probed by the healthcheck below
    environment:
      HADOOP_CONF_DIR: "/opt/hadoop/etc/hadoop"
    volumes:
      - hadoop-dn-data:/opt/hadoop/data/dn
      - /etc/localtime:/etc/localtime:ro
      - ./config/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml
      - ./config/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml
      - ./config/yarn-site.xml:/opt/hadoop/etc/hadoop/yarn-site.xml
      - ./config/mapred-site.xml:/opt/hadoop/etc/hadoop/mapred-site.xml
      - ./config/capacity-scheduler.xml:/opt/hadoop/etc/hadoop/capacity-scheduler.xml
    # Start only after the NameNode healthcheck passes, not merely after its
    # container has been created.
    depends_on:
      namenode:
        condition: service_healthy
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9864 || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 40s
    networks:
      - hadoop
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G

  resourcemanager:
    image: zhuyifeiruichuang/hadoop:3.1.1
    hostname: resourcemanager
    container_name: hadoop-resourcemanager
    command: ["yarn", "resourcemanager"]
    ports:
      - "8088:8088"  # HTTP endpoint probed by the healthcheck below
      - "8032:8032"
    environment:
      HADOOP_CONF_DIR: "/opt/hadoop/etc/hadoop"
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./config/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml
      - ./config/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml
      - ./config/yarn-site.xml:/opt/hadoop/etc/hadoop/yarn-site.xml
      - ./config/mapred-site.xml:/opt/hadoop/etc/hadoop/mapred-site.xml
      - ./config/capacity-scheduler.xml:/opt/hadoop/etc/hadoop/capacity-scheduler.xml
    depends_on:
      namenode:
        condition: service_healthy
    restart: unless-stopped
    healthcheck:
      # Alternative probe, kept for reference:
      # test: ["CMD-SHELL", "yarn rmadmin -checkHealth || exit 1"]
      test: ["CMD-SHELL", "curl -f http://localhost:8088 || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 60s
    networks:
      - hadoop
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G

  nodemanager:
    image: zhuyifeiruichuang/hadoop:3.1.1
    hostname: nodemanager
    container_name: hadoop-nodemanager
    command: ["yarn", "nodemanager"]
    ports:
      - "8042:8042"  # HTTP endpoint probed by the healthcheck below
    environment:
      HADOOP_CONF_DIR: "/opt/hadoop/etc/hadoop"
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./config/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml
      - ./config/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml
      - ./config/yarn-site.xml:/opt/hadoop/etc/hadoop/yarn-site.xml
      - ./config/mapred-site.xml:/opt/hadoop/etc/hadoop/mapred-site.xml
      - ./config/capacity-scheduler.xml:/opt/hadoop/etc/hadoop/capacity-scheduler.xml
    # Wait for both the ResourceManager and the NameNode to report healthy.
    depends_on:
      resourcemanager:
        condition: service_healthy
      namenode:
        condition: service_healthy
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8042 || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 40s
    networks:
      - hadoop
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G

部署

bash 复制代码
# Run from the directory that contains compose.yaml; -d starts the services
# in detached (background) mode.
docker compose up -d
相关推荐
AI服务老曹1 小时前
统一接入百家私有协议:基于 Docker 容器化的 GB28181/RTSP 边缘计算视频中台架构解析(附全源码交付)
人工智能·docker·边缘计算
小此方2 小时前
Re:Linux系统篇(二十九)文件篇·二:深度解析Linux文件描述符、dup2指针覆盖与内建命令重定向完全解析
linux·运维·驱动开发
Cosolar5 小时前
LlamaIndex索引类型全解析:原理与实战指南
运维·服务器
方便面不加香菜8 小时前
Linux--基础IO(一)
linux·运维·服务器
鼎讯信通10 小时前
风电光缆运维提质增效:G-4000A 光缆故障追踪仪破解风场巡检难题
运维·网络·数据库
三十..11 小时前
MySQL 从入门到高可用架构实战精要
运维·数据库·mysql
杨浦老苏11 小时前
开源多用户图书追踪系统LibrisLog
docker·群晖·收藏管理
跨境数据猎手11 小时前
大数据在电商行业的应用
大数据·运维·爬虫