说明
在最新版本的 Docker 环境中部署定制版 Hadoop。本文使用的容器镜像仅适用于 AMD64 架构的 CPU 环境;ARM 环境需要自行定制镜像,可参考历史文章记录。
创建资源
bash
# Create the project directory together with its config/ subdirectory.
mkdir -p hadoop/config
# Enter config/; the Hadoop XML files created in the following steps live here.
cd hadoop/config
创建文件capacity-scheduler.xml
xml
<?xml version="1.0"?>
<!--
  CapacityScheduler configuration: a single "default" queue under root
  that owns the whole cluster and is open to every user.
-->
<configuration>

  <!-- ===== Scheduler-wide settings ===== -->

  <!-- Upper bound on applications that may be pending or running at once. -->
  <property>
    <name>yarn.scheduler.capacity.maximum-applications</name>
    <value>10000</value>
  </property>

  <!-- Share of cluster resources ApplicationMasters may occupy (0.1 = 10%). -->
  <property>
    <name>yarn.scheduler.capacity.maximum-am-resource-percent</name>
    <value>0.1</value>
  </property>

  <!-- Calculator used to compare resources when scheduling. -->
  <property>
    <name>yarn.scheduler.capacity.resource-calculator</name>
    <value>org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator</value>
  </property>

  <!-- Missed scheduling opportunities tolerated before node locality is relaxed. -->
  <property>
    <name>yarn.scheduler.capacity.node-locality-delay</name>
    <value>40</value>
  </property>

  <!-- Extra delay before rack locality is relaxed; -1 leaves it unset. -->
  <property>
    <name>yarn.scheduler.capacity.rack-locality-additional-delay</name>
    <value>-1</value>
  </property>

  <!-- ===== Queue hierarchy: root -> default ===== -->

  <!-- The only child queue of root. -->
  <property>
    <name>yarn.scheduler.capacity.root.queues</name>
    <value>default</value>
  </property>

  <!-- The default queue is guaranteed 100% of the cluster... -->
  <property>
    <name>yarn.scheduler.capacity.root.default.capacity</name>
    <value>100</value>
  </property>

  <!-- ...and may never use more than 100%. -->
  <property>
    <name>yarn.scheduler.capacity.root.default.maximum-capacity</name>
    <value>100</value>
  </property>

  <!-- Multiplier of the queue capacity a single user may consume. -->
  <property>
    <name>yarn.scheduler.capacity.root.default.user-limit-factor</name>
    <value>1</value>
  </property>

  <!-- The queue accepts new applications. -->
  <property>
    <name>yarn.scheduler.capacity.root.default.state</name>
    <value>RUNNING</value>
  </property>

  <!-- ACLs: "*" lets everyone submit, administer and set any priority. -->
  <property>
    <name>yarn.scheduler.capacity.root.default.acl_submit_applications</name>
    <value>*</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.acl_administer_queue</name>
    <value>*</value>
  </property>
  <property>
    <name>yarn.scheduler.capacity.root.default.acl_application_max_priority</name>
    <value>*</value>
  </property>

  <!-- Maximum application lifetime; -1 means no lifetime limit. -->
  <property>
    <name>yarn.scheduler.capacity.root.default.maximum-application-lifetime</name>
    <value>-1</value>
  </property>

</configuration>
创建文件core-site.xml
xml
<?xml version="1.0"?>
<!--
  Hadoop core settings shared by every daemon and client:
  default filesystem, temp directory and proxy-user (impersonation) rules.
-->
<configuration>

  <!-- Default filesystem URI; "namenode" matches the Compose service hostname. -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://namenode:9000</value>
  </property>

  <!-- Base directory for Hadoop temporary data. -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/opt/hadoop/tmpdata</value>
  </property>

  <!--
    Proxy-user rules: the users below (hadoop, hue, hive, root) may
    impersonate any group's users from any host ("*").
  -->
  <property>
    <name>hadoop.proxyuser.hadoop.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hadoop.groups</name>
    <value>*</value>
  </property>

  <property>
    <name>hadoop.proxyuser.hue.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.groups</name>
    <value>*</value>
  </property>

  <property>
    <name>hadoop.proxyuser.hive.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hive.groups</name>
    <value>*</value>
  </property>

  <property>
    <name>hadoop.proxyuser.root.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.root.groups</name>
    <value>*</value>
  </property>

</configuration>
创建文件hdfs-site.xml
xml
<?xml version="1.0"?>
<!--
  HDFS settings for a one-NameNode / one-DataNode cluster running in
  Docker Compose.
-->
<configuration>

  <!-- ===== Storage locations (backed by volumes in compose.yaml) ===== -->

  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/opt/hadoop/data/nn</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/opt/hadoop/data/dn</value>
  </property>

  <!-- ===== NameNode endpoints ===== -->

  <!-- Advertised RPC address; "namenode" matches the Compose service hostname. -->
  <property>
    <name>dfs.namenode.rpc-address</name>
    <value>namenode:9000</value>
  </property>

  <!-- Bind the RPC and HTTP servers to all interfaces inside the container. -->
  <property>
    <name>dfs.namenode.rpc-bind-host</name>
    <value>0.0.0.0</value>
  </property>
  <property>
    <name>dfs.namenode.http-bind-host</name>
    <value>0.0.0.0</value>
  </property>

  <!-- ===== Cluster behaviour ===== -->

  <!-- Only one DataNode exists, so keep a single replica per block. -->
  <property>
    <name>dfs.replication</name>
    <value>1</value>
  </property>

  <!-- Clients reach DataNodes by hostname rather than by reported IP. -->
  <property>
    <name>dfs.client.use.datanode.hostname</name>
    <value>true</value>
  </property>

  <!-- HDFS permission checking is switched off. -->
  <property>
    <name>dfs.permissions.enabled</name>
    <value>false</value>
  </property>

  <!-- Expose the WebHDFS REST API. -->
  <property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
  </property>

  <!-- Optional, disabled by default:
  <property>
    <name>dfs.namenode.datanode.registration.ip-hostname-check</name>
    <value>false</value>
    <description>Set to false when the NameNode fails to reverse-resolve the DataNode.</description>
  </property>
  -->

</configuration>
创建文件mapred-site.xml
xml
<?xml version="1.0"?>
<!-- MapReduce settings: run jobs on YARN and tell every task where Hadoop lives. -->
<configuration>

  <!-- Execute MapReduce jobs through YARN. -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>

  <!--
    HADOOP_MAPRED_HOME is exported to the ApplicationMaster, map tasks and
    reduce tasks; all three point at the same installation directory.
  -->
  <property>
    <name>yarn.app.mapreduce.am.env</name>
    <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
  </property>
  <property>
    <name>mapreduce.map.env</name>
    <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
  </property>
  <property>
    <name>mapreduce.reduce.env</name>
    <value>HADOOP_MAPRED_HOME=/opt/hadoop</value>
  </property>

</configuration>
创建文件yarn-site.xml
xml
<?xml version="1.0"?>
<!--
  YARN settings for a one-ResourceManager / one-NodeManager cluster running
  in Docker Compose.
-->
<configuration>

  <!-- ===== ResourceManager ===== -->

  <!-- Advertised RM hostname; matches the Compose service hostname. -->
  <property>
    <name>yarn.resourcemanager.hostname</name>
    <value>resourcemanager</value>
  </property>

  <!-- Make every RM server (RPC and web UI) listen on all interfaces. -->
  <property>
    <name>yarn.resourcemanager.bind-host</name>
    <value>0.0.0.0</value>
  </property>

  <!--
    Advertised web UI address. This must be a name other containers can
    reach: it is used to build application tracking and proxy URLs.
    Listening on 0.0.0.0 is already handled by yarn.resourcemanager.bind-host,
    so the wildcard address must not be advertised here.
  -->
  <property>
    <name>yarn.resourcemanager.webapp.address</name>
    <value>${yarn.resourcemanager.hostname}:8088</value>
  </property>

  <!-- ACL checks are disabled. -->
  <property>
    <name>yarn.acl.enable</name>
    <value>false</value>
  </property>

  <!-- ===== NodeManager ===== -->

  <!-- Make the NM servers listen on all interfaces. -->
  <property>
    <name>yarn.nodemanager.bind-host</name>
    <value>0.0.0.0</value>
  </property>

  <!-- Do not kill containers for exceeding physical / virtual memory limits. -->
  <property>
    <name>yarn.nodemanager.pmem-check-enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.nodemanager.vmem-check-enabled</name>
    <value>false</value>
  </property>

  <!-- Keep finished containers' files for 600 seconds to ease debugging. -->
  <property>
    <name>yarn.nodemanager.delete.debug-delay-sec</name>
    <value>600</value>
  </property>

  <!-- Shuffle service required by MapReduce jobs. -->
  <property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle</value>
  </property>
  <property>
    <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
    <value>org.apache.hadoop.mapred.ShuffleHandler</value>
  </property>

  <!-- Environment variables containers may inherit from the NodeManager. -->
  <property>
    <name>yarn.nodemanager.env-whitelist</name>
    <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME,PATH,LANG,TZ,HADOOP_MAPRED_HOME</value>
  </property>

</configuration>
创建compose.yaml(先执行 cd .. 返回上级目录 hadoop;compose.yaml 必须与 config 目录同级,因为其中通过 ./config/ 相对路径挂载上面创建的配置文件)
yaml
# Docker Compose definition of a small Hadoop 3.1.1 cluster:
# HDFS (namenode + datanode) and YARN (resourcemanager + nodemanager).
# This file must sit next to the config/ directory, because every service
# bind-mounts its Hadoop XML files from ./config/.

# Named volumes keep HDFS metadata and block data across container re-creation.
volumes:
  hadoop-nn-data:
    name: hadoop-nn-data
  hadoop-dn-data:
    name: hadoop-dn-data

# One bridge network shared by all services; hostnames resolve inside it.
networks:
  hadoop:
    name: hadoop-net
    driver: bridge

services:
  namenode:
    image: zhuyifeiruichuang/hadoop:3.1.1
    hostname: namenode
    container_name: hadoop-namenode
    # Prepare the data directory, format HDFS only on first start (no
    # "current" directory yet), then run the NameNode in the foreground.
    command: ["sh", "-c", "sudo mkdir -p /opt/hadoop/data/nn && sudo chown -R hadoop:hadoop /opt/hadoop/data && if [ ! -d /opt/hadoop/data/nn/current ]; then echo 'Formatting NameNode...' && hdfs namenode -format; fi && hdfs namenode"]
    ports:
      # NameNode web UI
      - "9870:9870"
      # HDFS RPC (fs.defaultFS)
      - "9000:9000"
    environment:
      ENSURE_NAMENODE_DIR: "/opt/hadoop/data/nn"
      HADOOP_CONF_DIR: "/opt/hadoop/etc/hadoop"
    volumes:
      - hadoop-nn-data:/opt/hadoop/data/nn
      - /etc/localtime:/etc/localtime:ro
      - ./config/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml
      - ./config/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml
      - ./config/yarn-site.xml:/opt/hadoop/etc/hadoop/yarn-site.xml
      - ./config/mapred-site.xml:/opt/hadoop/etc/hadoop/mapred-site.xml
      - ./config/capacity-scheduler.xml:/opt/hadoop/etc/hadoop/capacity-scheduler.xml
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9870 || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 60s
    networks:
      - hadoop
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G

  datanode:
    image: zhuyifeiruichuang/hadoop:3.1.1
    hostname: datanode
    container_name: hadoop-datanode
    # Prepare the data directory, then run the DataNode in the foreground.
    command: ["sh", "-c", "sudo mkdir -p /opt/hadoop/data/dn && sudo chown -R hadoop:hadoop /opt/hadoop/data && hdfs datanode"]
    ports:
      # DataNode data transfer
      - "9866:9866"
      # DataNode web UI
      - "9864:9864"
    environment:
      HADOOP_CONF_DIR: "/opt/hadoop/etc/hadoop"
    volumes:
      - hadoop-dn-data:/opt/hadoop/data/dn
      - /etc/localtime:/etc/localtime:ro
      - ./config/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml
      - ./config/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml
      - ./config/yarn-site.xml:/opt/hadoop/etc/hadoop/yarn-site.xml
      - ./config/mapred-site.xml:/opt/hadoop/etc/hadoop/mapred-site.xml
      - ./config/capacity-scheduler.xml:/opt/hadoop/etc/hadoop/capacity-scheduler.xml
    depends_on:
      # Wait until the NameNode passes its healthcheck, not merely until its
      # container has been started, before registering with it.
      namenode:
        condition: service_healthy
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:9864 || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 40s
    networks:
      - hadoop
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G

  resourcemanager:
    image: zhuyifeiruichuang/hadoop:3.1.1
    hostname: resourcemanager
    container_name: hadoop-resourcemanager
    command: ["yarn", "resourcemanager"]
    ports:
      # ResourceManager web UI
      - "8088:8088"
      # ResourceManager client RPC
      - "8032:8032"
    environment:
      HADOOP_CONF_DIR: "/opt/hadoop/etc/hadoop"
    depends_on:
      # Start ordering only, as in the short form "- namenode".
      namenode:
        condition: service_started
    restart: unless-stopped
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./config/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml
      - ./config/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml
      - ./config/yarn-site.xml:/opt/hadoop/etc/hadoop/yarn-site.xml
      - ./config/mapred-site.xml:/opt/hadoop/etc/hadoop/mapred-site.xml
      - ./config/capacity-scheduler.xml:/opt/hadoop/etc/hadoop/capacity-scheduler.xml
    healthcheck:
      # Alternative check kept for reference:
      # test: ["CMD-SHELL", "yarn rmadmin -checkHealth || exit 1"]
      test: ["CMD-SHELL", "curl -f http://localhost:8088 || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 60s
    networks:
      - hadoop
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G

  nodemanager:
    hostname: nodemanager
    image: zhuyifeiruichuang/hadoop:3.1.1
    container_name: hadoop-nodemanager
    command: ["yarn", "nodemanager"]
    ports:
      # NodeManager web UI
      - "8042:8042"
    environment:
      HADOOP_CONF_DIR: "/opt/hadoop/etc/hadoop"
    depends_on:
      # The NodeManager registers with the ResourceManager, so wait until the
      # latter passes its healthcheck.
      resourcemanager:
        condition: service_healthy
      # Start ordering only, as in the short form "- namenode".
      namenode:
        condition: service_started
    restart: unless-stopped
    volumes:
      - /etc/localtime:/etc/localtime:ro
      - ./config/core-site.xml:/opt/hadoop/etc/hadoop/core-site.xml
      - ./config/hdfs-site.xml:/opt/hadoop/etc/hadoop/hdfs-site.xml
      - ./config/yarn-site.xml:/opt/hadoop/etc/hadoop/yarn-site.xml
      - ./config/mapred-site.xml:/opt/hadoop/etc/hadoop/mapred-site.xml
      - ./config/capacity-scheduler.xml:/opt/hadoop/etc/hadoop/capacity-scheduler.xml
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:8042 || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 40s
    networks:
      - hadoop
    deploy:
      resources:
        limits:
          cpus: '4'
          memory: 8G
部署(在 compose.yaml 所在的 hadoop 目录下执行)
bash
# Create and start every service from compose.yaml in detached (background) mode.
docker compose up -d