#!/bin/bash
software_dir=/root/hadoop/
install_dir=/opt/
hostname=(master node1 node2)
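# Derive the top-level directory name of each tarball from its first entry (trailing slash stripped).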
jdk_name=$(tar -tf "$software_dir"jdk*|head -n 1|sed 's|\/||')
hadoop_name=$(tar -tf "$software_dir"hadoop*|head -n 1|sed 's|\/||')
zookeeper_name=$(tar -tf "$software_dir"zookeeper*|head -n 1|sed 's|\/||')
### Set up the chrony time-synchronization cluster
echo =================== Start Chrony Install =====================
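# On every node: install chrony, comment out the stock NTP servers, and sync to the master; the master itself serves its local clock at stratum 10.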
for host in ${hostname[@]}
do
ssh -T $host <<-EOF
[ ! \$(rpm -qa chrony) ] && yum install -y chrony &>/dev/null
[ ! \$(rpm -qa chrony) ] && echo "============ install chrony fail ==============" && exit
echo -e >> /etc/chrony.conf
sed -i 's/^server/#server/' /etc/chrony.conf
echo -e "server master iburst\nbindaddress ::\nallow" >> /etc/chrony.conf
[ ${hostname[0]} = \$(hostname) ] && echo "local stratum 10" >> /etc/chrony.conf && sed -i 's/master/127\.0\.0\.1/' /etc/chrony.conf
systemctl enable --now chronyd
timedatectl set-timezone Asia/Shanghai
chronyc -a makestep
exit
EOF
done
[ ! $(rpm -qa chrony) ] && exit
### xsync script
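# Make sure rsync exists on every node, then install /usr/local/bin/xsync: a wrapper that pushes each argument to all other nodes, recreating the parent directory remotely first.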
for host in ${hostname[@]}
do
ssh -T $host <<-EOF
[ ! \$(rpm -qa rsync) ] && yum install -y rsync &>/dev/null
[ ! \$(rpm -qa rsync) ] && echo "============ install rsync fail ==============" && exit
exit
EOF
done
[ ! $(rpm -qa rsync) ] && exit
cat > /usr/local/bin/xsync << EOF
#!/bin/bash
pcount=\$#
if [ \$pcount -lt 1 ]
then
echo "Not Enough Arguement !"
exit
fi
for host in ${hostname[@]}
do
if [ "\$host" = \$(hostname) ]
then
continue
fi
echo ======================== Start \$host File Sync =======================
for file in \$@
do
if [ -e \$file ]
then
pdir=\$(cd -P \$(dirname \$file); pwd)
echo fileDir=\$pdir
fname=\$(basename \$file)
echo fileName=\$fname
ssh \$host "mkdir -p \$pdir"
rsync -av \$pdir/\$fname \$host:\$pdir &>/dev/null
else
echo "\$file does not exists"
fi
done
done
EOF
chmod +x /usr/local/bin/xsync
### Install Component(Jdk Hadoop Zookeeper)
echo =================== Start Install Component =====================
tar -xf "$software_dir"jdk* -C $install_dir
tar -xf "$software_dir"hadoop* -C $install_dir
tar -xf "$software_dir"zookeeper* -C $install_dir
rm -rf $install_dir$hadoop_name/share/doc
xsync ${install_dir}{$jdk_name,$hadoop_name,$zookeeper_name}
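# Append JAVA/HADOOP/ZOOKEEPER/HIVE environment variables to /etc/profile on every node; the triple backslashes survive the heredoc and the remote quoting so a literal $PATH ends up in the file.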
for host in ${hostname[@]}
do
ssh -T $host <<EOF
echo -e "\
#java
export JAVA_HOME=$install_dir${jdk_name}\n\
export PATH=\\\$PATH:\\\$JAVA_HOME/bin\n\
#hadoop
export HADOOP_HOME=$install_dir${hadoop_name}\n\
export PATH=\\\$PATH:\\\$HADOOP_HOME/bin:\\\$HADOOP_HOME/sbin\n\
#zookeeper
export ZOOKEEPER_HOME=$install_dir${zookeeper_name}\n\
export PATH=\\\$PATH:\\\$ZOOKEEPER_HOME/bin\n\
#hive
export HIVE_HOME=${install_dir}hive\n\
export PATH=\\\$PATH:\\\$HIVE_HOME/bin\
" >> /etc/profile
source /etc/profile
exit
EOF
done
source /etc/profile
java -version &>/dev/null
[ $? -eq 0 ] && echo "========= java/hadoop/zookeeper/hive installation complete !========"
### jpsall script
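# jpsall: run jps (or any command passed as arguments) on every node over ssh.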
cat > /usr/local/bin/jpsall << EOF
#!/bin/bash
for host in ${hostname[@]}
do
echo -e "\033[32m======================== \$host =======================\033[0m"
if [ \$# -gt 0 ]
then
ssh \$host "source /etc/profile;\$*"
else
ssh \$host "source /etc/profile;jps|grep -v Jps"
fi
done
EOF
chmod +x /usr/local/bin/jpsall
### Zookeeper Configuration
echo =================== Start Zookeeper Configuration =====================
zookeeper_path=$install_dir$zookeeper_name
rm -rf $zookeeper_path
tar -xf "$software_dir"zookeeper* -C $install_dir
mkdir -p $zookeeper_path/{data,logs}
mv $install_dir$zookeeper_name/conf/{zoo_sample.cfg,zoo.cfg}
sed -i "/^dataDir=/c\dataDir=$zookeeper_path/data" $zookeeper_path/conf/zoo.cfg
count=1
for host in ${hostname[@]}
do
zookeeper_host+="server.$count=$host:2888:3888"
if [ $count -lt $(( ${#hostname[@]} )) ]
then
zookeeper_host+="\n"
fi
((count++))
done
echo -e "$zookeeper_host" >> $zookeeper_path/conf/zoo.cfg
sed -i "s|ZOO_LOG_DIR=\".\"|ZOO_LOG_DIR=\"$zookeeper_path/logs\"|" $zookeeper_path/bin/zkEnv.sh
cat > /usr/lib/systemd/system/zookeeper.service <<EOF
[Unit]
Description=Zookeeper Service
After=network.target syslog.target
[Service]
Type=forking
User=root
Group=root
Environment=JAVA_HOME=$install_dir$jdk_name
PIDFile=$zookeeper_path/data/zookeeper_server.pid
ExecStart=$zookeeper_path/bin/zkServer.sh start
ExecStop=$zookeeper_path/bin/zkServer.sh stop
ExecReload=$zookeeper_path/bin/zkServer.sh restart
Restart=always
TimeoutSec=30
SuccessExitStatus=130 143
[Install]
WantedBy=multi-user.target
EOF
chown -R root:root $zookeeper_path
xsync $zookeeper_path
xsync /usr/lib/systemd/system/zookeeper.service
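# Give each node a unique myid matching its server.N entry, then enable and start ZooKeeper.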
count=1
for host in ${hostname[@]}
do
ssh -T $host <<EOF
echo $count > $zookeeper_path/data/myid
systemctl daemon-reload
systemctl enable --now zookeeper
exit
EOF
((count++))
done
jpsall "zkServer.sh status"
### Hadoop Configuration
echo =================== Start Hadoop Configuration =====================
hadoop_path=$install_dir$hadoop_name
hadoop_conf_path=$hadoop_path/etc/hadoop
stop-all.sh
jpsall "rm -rf $hadoop_path" &>/dev/null
tar -xf "$software_dir"hadoop* -C $install_dir
rm -rf $hadoop_path/share/doc/
cp $hadoop_conf_path/{mapred-site.xml.template,mapred-site.xml}
sed -i '/configuration>$/d' $hadoop_conf_path/{core-site,hdfs-site,mapred-site,yarn-site}.xml
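# Build the comma-separated ZooKeeper quorum, the semicolon-separated JournalNode quorum, and the newline-separated slaves list.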
for ((i=0; i<${#hostname[@]}; i++))
do
zookeeper_address+=${hostname[$i]}:2181
qjournal_address+=${hostname[$i]}:8485
cluster_node+=${hostname[$i]}
[ $i -lt $(( ${#hostname[@]} -1 )) ] && zookeeper_address+="," && qjournal_address+=";" && cluster_node+="\n"
done
cat >> $hadoop_conf_path/core-site.xml <<EOF
<configuration>
<!-- HDFS nameservice (default filesystem) -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://hacluster</value>
</property>
<!-- Base directory for Hadoop runtime data -->
<property>
<name>hadoop.tmp.dir</name>
<value>$hadoop_path/data</value>
</property>
<!-- ZooKeeper quorum addresses -->
<property>
<name>ha.zookeeper.quorum</name>
<value>$zookeeper_address</value>
</property>
<!-- Allow the root proxy user full access (needed by HiveServer2) -->
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
</configuration>
EOF
cat >> $hadoop_conf_path/hdfs-site.xml <<EOF
<configuration>
<!-- HA nameservice name -->
<property>
<name>dfs.nameservices</name>
<value>hacluster</value>
</property>
<!-- Number of block replicas -->
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<!-- NameNode IDs within the nameservice -->
<property>
<name>dfs.ha.namenodes.hacluster</name>
<value>nn1,nn2</value>
</property>
<!-- RPC address of nn1 -->
<property>
<name>dfs.namenode.rpc-address.hacluster.nn1</name>
<value>${hostname[0]}:9000</value>
</property>
<!-- RPC address of nn2 -->
<property>
<name>dfs.namenode.rpc-address.hacluster.nn2</name>
<value>${hostname[1]}:9000</value>
</property>
<!-- HTTP address of nn1 -->
<property>
<name>dfs.namenode.http-address.hacluster.nn1</name>
<value>${hostname[0]}:50070</value>
</property>
<!-- HTTP address of nn2 -->
<property>
<name>dfs.namenode.http-address.hacluster.nn2</name>
<value>${hostname[1]}:50070</value>
</property>
<!-- JournalNode quorum for shared edits -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://$qjournal_address/hacluster</value>
</property>
<!-- Local directory where JournalNodes store edits -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>$hadoop_path/data/dfs/journal</value>
</property>
<!-- Fence the old active NameNode over SSH during failover -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- SSH private key used for fencing -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
<!-- Disable permission checking -->
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<!-- Client failover proxy provider class -->
<property>
<name>dfs.client.failover.proxy.provider.hacluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- Enable automatic failover -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
EOF
cat >> $hadoop_conf_path/mapred-site.xml <<EOF
<configuration>
<!-- Run MapReduce on YARN (the default is local mode) -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- JobHistoryServer IPC address -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>${hostname[2]}:10020</value>
</property>
<!-- JobHistoryServer web UI address -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>${hostname[2]}:19888</value>
</property>
</configuration>
EOF
cat >> $hadoop_conf_path/yarn-site.xml <<EOF
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- Enable YARN ResourceManager HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- YARN cluster ID -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>cluster-yarn</value>
</property>
<!-- ResourceManager IDs -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<!-- Hosts that run the ResourceManagers -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>${hostname[1]}</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>${hostname[2]}</value>
</property>
<!-- ZooKeeper addresses used by the ResourceManagers -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>$zookeeper_address</value>
</property>
<!-- Enable ResourceManager recovery -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- Store ResourceManager state in ZooKeeper -->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<!-- Disable NodeManager virtual-memory checking -->
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<!-- Memory available to each NodeManager (MB) -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>3072</value>
</property>
<!-- Enable log aggregation -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- Retain aggregated logs for 7 days -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<!-- Remote log directory on HDFS (the default is /tmp/logs) -->
<property>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>/tmp/jobhistory/log</value>
</property>
</configuration>
EOF
echo -e $cluster_node > $hadoop_conf_path/slaves
chown -R root:root $hadoop_path
sed -i "/^export JAVA_HOME=/c\export JAVA_HOME=$install_dir$jdk_name" $hadoop_conf_path/hadoop-env.sh
xsync $hadoop_path
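# psmisc provides fuser, which the sshfence fencing method relies on.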
for host in ${hostname[@]}
do
ssh -T $host <<EOF
[ ! \$(rpm -qa psmisc) ] && yum install -y psmisc &>/dev/null
EOF
done
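# Format the HA state znode in ZooKeeper; expect auto-answers the confirmation prompt.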
expect <<-EOF
spawn hdfs zkfc -formatZK
expect {
"(Y or N)" {send "Y\r"; exp_continue}
eof
}
EOF
echo ============ ZookeeperCluster Formatting Complete =============
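# Start a JournalNode on every host and wait until port 8485 accepts connections, so the NameNode format below can reach a quorum.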
for host in ${hostname[@]}
do
ssh -T $host <<EOF
echo ============ Start $host Journalnode =============
hadoop-daemon.sh start journalnode
while true
do
curl $host:8485 &>/dev/null
if [ \$? -eq 0 ]
then
exit
fi
sleep 4
done
EOF
done
hdfs namenode -format
echo ============ HadoopCluster Formatting Complete =============
hadoop-daemon.sh start namenode
echo ============ Start SecondaryNamenode Data Sync =============
ssh -T ${hostname[1]} <<EOF
hdfs namenode -bootstrapStandby
hadoop-daemon.sh start namenode
exit
EOF
### custom_script
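# Patch start/stop-yarn.sh ($1): when RM HA is enabled, apply $2 (start/stop) to the ResourceManagers on every rm-id host resolved via getconf, then to the NodeManagers, and, if log aggregation is on, to the JobHistoryServer on its configured host.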
yarn_custom_script(){
sed -i -E '/resourcemanager|nodemanager/s/^/#/' $hadoop_path/sbin/$1
cat >> $hadoop_path/sbin/$1 <<EOF
# $2 resourceManager
AUTOHA_ENABLED=\$(\$HADOOP_PREFIX/bin/hdfs getconf -confKey yarn.resourcemanager.ha.enabled)
if [ "\$(echo "\$AUTOHA_ENABLED" | tr A-Z a-z)" = "true" ]; then
IDS=\$(\$HADOOP_PREFIX/bin/hdfs getconf -confKey yarn.resourcemanager.ha.rm-ids|tr "," " ")
IDS=(\$IDS)
for ((i=0; i<\${#IDS[@]}; i++))
do
NODES+=\$(\$HADOOP_PREFIX/bin/hdfs getconf -confKey yarn.resourcemanager.hostname.\${IDS[\$i]})
if [ \$i -lt \$(( \${#IDS[@]} -1 )) ]
then
NODES+=" "
fi
done
echo "$3 Resourcemanager HA on [\$NODES]"
"\$bin"/yarn-daemons.sh --config \$YARN_CONF_DIR --hostnames "\$NODES" $2 resourcemanager
else
"\$bin"/yarn-daemon.sh --config \$YARN_CONF_DIR $2 resourcemanager
fi
# $2 nodeManager
"\$bin"/yarn-daemons.sh --config \$YARN_CONF_DIR $2 nodemanager
# $2 historyserver
HISTORYSERVER_ENABLE=\$(\$HADOOP_PREFIX/bin/hdfs getconf -confKey yarn.log-aggregation-enable)
REMOTE=\$(\$HADOOP_PREFIX/bin/hdfs getconf -confKey mapreduce.jobhistory.webapp.address|cut -d ":" -f1)
if [ "\$(echo "\$HISTORYSERVER_ENABLE" | tr A-Z a-z)" = "true" ]; then
echo "$3 Historyserver on [\$REMOTE]"
ssh -T \$REMOTE "\$bin"/mr-jobhistory-daemon.sh $2 historyserver 2>&1 | sed "s/^/\$REMOTE: /"
fi
EOF
}
yarn_custom_script start-yarn.sh start Starting
yarn_custom_script stop-yarn.sh stop Stopping
#### The stock start-all.sh has a startup-order bug for HA clusters: start-dfs.sh starts namenode -> datanode -> journal nodes ...
#### so start-all.sh has to be run twice before the cluster comes up.
#### The correct startup order for an HA cluster is journal nodes -> namenode -> datanode ...
#### Fix: move the journal-node startup block in start-dfs.sh in front of the namenode block so the HA cluster starts successfully every time.
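#### The sed below cuts the JournalNode block (from the '# quor...' line to the '#----' divider) out of start-dfs.sh and re-reads it from fd 3 after line 49, which sits before the namenode section in this Hadoop release.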
sed -i '/^# quor/,/^#------/d;49 r /dev/fd/3' $hadoop_path/sbin/start-dfs.sh \
3< <(sed -n '/^# quor/,/^#-----/p' $hadoop_path/sbin/start-dfs.sh)
sed -i '/Deprecated/s/^/#/' $hadoop_path/sbin/start-all.sh
sed -i '/Deprecated/s/^/#/' $hadoop_path/sbin/stop-all.sh
start-all.sh
jpsall
hdfs dfs -chmod -R 777 /tmp
echo ============ HadoopCluster Startup Complete =============
#### hive configuration
echo ============ Start Hive Configuration =============
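# Extract Tez and Hive, normalize the directories to ${install_dir}tez and ${install_dir}hive, and wire Hive to the MySQL metastore running in the 'mysql' Docker container on this host.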
hive_path=${install_dir}hive
hive_conf_path=$hive_path/conf
tez_path=${install_dir}tez
rm -rf $tez_path
tar xf ${software_dir}*tez* -C $install_dir
mv ${install_dir}*tez* $tez_path
chown -R root:root $tez_path
rm -rf $hive_path
tar xf ${software_dir}*hive* -C $install_dir
mv ${install_dir}*hive* ${install_dir}hive
mysql_user=root
mysql_password=1234
if [ -f "$hive_conf_path/hive-log4j2.properties.template" ]
then
mv $hive_conf_path/hive-log4j2.properties.template $hive_conf_path/hive-log4j2.properties
fi
if [ -f "$hive_conf_path/hive-exec-log4j2.properties.template" ]
then
mv $hive_conf_path/hive-exec-log4j2.properties.template $hive_conf_path/hive-exec-log4j2.properties
fi
cp ${software_dir}mysql-connector-java-5.1.44-bin.jar $hive_path/lib
sed -i "/property.hive.log.dir/c\property.hive.log.dir=$hive_path/logs" $hive_conf_path/hive-log4j2.properties
sed -i "/property.hive.log.dir/c\property.hive.log.dir=$hive_path/logs" $hive_conf_path/hive-exec-log4j2.properties
cat > $hive_conf_path/hive-site.xml <<EOF
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<!-- Hive warehouse location on HDFS -->
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/hive/database</value>
</property>
<!-- For small inputs, automatically run the MR job in local mode to speed it up (default: false) -->
<property>
<name>hive.exec.mode.local.auto</name>
<value>true</value>
</property>
<!-- Drop the table-name prefix from column names in query results -->
<property>
<name>hive.resultset.use.unique.column.names</name>
<value>false</value>
</property>
<!-- Execution engine (the default is MR) -->
<property>
<name>hive.execution.engine</name>
<value>tez</value>
</property>
<!-- Disable metastore schema verification -->
<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://${HOSTNAME}:3306/hive?createDatabaseIfNotExist=true&amp;useUnicode=true&amp;characterEncoding=UTF-8&amp;useSSL=false</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>$mysql_user</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>$mysql_password</value>
</property>
</configuration>
EOF
rm -rf ${hive_path}/lib/log4j-slf4j-impl-*.jar
docker exec -it mysql mysql -u $mysql_user -p$mysql_password -e "drop database if exists hive;" &>/dev/null
schematool -dbType mysql -initSchema
hdfs dfs -rm -r /tez
hdfs dfs -mkdir /tez
tez_name=$(ls $tez_path/share)
hdfs dfs -put $tez_path/share/$tez_name /tez
rm -rf ${tez_path}/lib/slf4j-log4j12-*.jar
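# tez-site.xml points Tez at the tarball just uploaded to /tez on HDFS and reuses the cluster's Hadoop libraries.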
cat > $hive_conf_path/tez-site.xml <<EOF
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>tez.lib.uris</name>
<value>\${fs.defaultFS}/tez/$tez_name</value>
</property>
<property>
<name>tez.use.cluster.hadoop-libs</name>
<value>true</value>
</property>
<property>
<name>tez.history.logging.service.class</name>
<value>org.apache.tez.dag.history.logging.ats.ATSHistoryLoggingService</value>
</property>
</configuration>
EOF
mv $hive_conf_path/hive-env.sh.template $hive_conf_path/hive-env.sh
cat >> $hive_conf_path/hive-env.sh <<EOF
export TEZ_HOME=$tez_path
export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:\$TEZ_HOME/*.jar:\$TEZ_HOME/lib/*
for jar in \$(ls \$TEZ_HOME|grep jar)
do
export TEZ_JARS=\$TEZ_JARS:\$TEZ_HOME/\$jar
done
for jar in \$(ls \$TEZ_HOME/lib)
do
export TEZ_JARS=\$TEZ_JARS:\$TEZ_HOME/lib/\$jar
done
EOF