9-node cluster: hadoop 2.7.7 + hbase 1.4.13 + hive 2.3.9 + spark 2.4.8
- Management
node1: Active NameNode + Active ResourceManager
- Standby management
node2: Standby NameNode + Standby ResourceManager
- Storage (NameNode edit log + HDFS data + HBase regions)
node3, node4, node5: JournalNode + DataNode + HRegionServer
- Metadata services (Hive metastore + HBase masters)
node6: hive-metastore + Active HMaster
node7: hive-metastore + Standby HMaster
- Compute
node8, node9: NodeManager (Spark on YARN)
Initialization
bash
ufw disable
systemctl stop ufw
systemctl stop apparmor
systemctl disable apparmor
# set each node's own hostname (node1 ... node9)
vim /etc/hostname
node1
vim /etc/hosts
ip1 node1
ip2 node2
ip3 node3
ip4 node4
ip5 node5
ip6 node6
ip7 node7
ip8 node8
ip9 node9
ip11 zk1
ip12 zk2
ip13 zk3
ip14 mysql-vip
ssh-keygen -t rsa
ssh-copy-id it@node1
ssh-copy-id it@node2
ssh-copy-id it@node3
ssh-copy-id it@node4
ssh-copy-id it@node5
ssh-copy-id it@node6
ssh-copy-id it@node7
ssh-copy-id it@node8
ssh-copy-id it@node9
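A quick optional check (sketch) that passwordless SSH now works from node1 to every node:
bash
# each command should print the remote hostname without asking for a password
for i in $(seq 1 9); do ssh -o BatchMode=yes it@node$i hostname; done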
Files
- All nodes
bash
apt install -y openjdk-8-jdk
vim /etc/profile
# other entries omitted
# JDK Environment
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export JRE_HOME=$JAVA_HOME/jre
export CLASSPATH=.:$JAVA_HOME/lib:$JRE_HOME/lib
# Hadoop Environment
export HADOOP_HOME=/opt/modules/hadoop
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export HADOOP_YARN_HOME=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
# HBase Environment
export HBASE_HOME=/opt/modules/hbase
# Hive Environment
export HIVE_HOME=/opt/modules/hive
# Spark Environment
export SPARK_HOME=/opt/modules/spark
export SPARK_DIST_CLASSPATH=$(/opt/modules/hadoop/bin/hadoop classpath)
#
export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HBASE_HOME/bin:$HIVE_HOME/bin:$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH
source /etc/profile
bash
# create these directories on every node
# installation packages
mkdir -p /opt/software
# component directory
mkdir -p /opt/modules
# HDFS data directories
mkdir -p /opt/data/hadoop
mkdir -p /opt/data/hadoop/journalnode
# HBase data directory
mkdir -p /opt/data/hbase
chown -R it:it /opt/software /opt/modules /opt/data
wget https://archive.apache.org/dist/hadoop/common/hadoop-2.7.7/hadoop-2.7.7.tar.gz
wget https://archive.apache.org/dist/hbase/1.4.13/hbase-1.4.13-bin.tar.gz
wget https://archive.apache.org/dist/hive/hive-2.3.9/apache-hive-2.3.9-bin.tar.gz
wget https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-hadoop2.7.tgz
tar -zxvf hadoop-2.7.7.tar.gz -C /opt/modules/
tar -zxvf hbase-1.4.13-bin.tar.gz -C /opt/modules/
tar -zxvf apache-hive-2.3.9-bin.tar.gz -C /opt/modules/
tar -zxvf spark-2.4.8-bin-hadoop2.7.tgz -C /opt/modules/
ln -s /opt/modules/hadoop-2.7.7 /opt/modules/hadoop
ln -s /opt/modules/hbase-1.4.13 /opt/modules/hbase
ln -s /opt/modules/apache-hive-2.3.9-bin /opt/modules/hive
ln -s /opt/modules/spark-2.4.8-bin-hadoop2.7 /opt/modules/spark
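Before syncing, the unpacked releases can be spot-checked (a small sketch; version banners only):
bash
/opt/modules/hadoop/bin/hadoop version
/opt/modules/hbase/bin/hbase version
/opt/modules/spark/bin/spark-submit --version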
Then sync to the other nodes
bash
# sync the Hadoop files
for i in {2..9}; do
scp -r /opt/modules/hadoop it@node$i:/opt/modules
done
# sync the HBase files
for i in {2..9}; do
scp -r /opt/modules/hbase it@node$i:/opt/modules/
done
# sync the Hive files
for i in {2..9}; do
scp -r /opt/modules/hive it@node$i:/opt/modules/
done
# sync the Spark files
for i in {2..9}; do
scp -r /opt/modules/spark it@node$i:/opt/modules/
done
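A quick sanity check (sketch) that every node received the four component directories:
bash
for i in {2..9}; do
echo "== node$i =="
ssh it@node$i 'ls -d /opt/modules/hadoop /opt/modules/hbase /opt/modules/hive /opt/modules/spark'
done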
HDFS+YARN
bash
cd /opt/modules/hadoop/etc/hadoop
# environment variables for the deployment
vim hadoop-env.sh
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export HADOOP_JOURNALNODE_OPTS="-Xmx1g"
export HADOOP_ZKFC_OPTS="-Xmx512m"
# nameservice address and temp directory
vim core-site.xml
xml
<configuration>
<!-- nameservice name -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://ns1</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/opt/data/hadoop/tmp</value>
</property>
<!-- proxy-user settings for the it account -->
<property>
<name>hadoop.proxyuser.it.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.it.groups</name>
<value>*</value>
</property>
<!-- external ZooKeeper ensemble, used for automatic failover -->
<property>
<name>ha.zookeeper.quorum</name>
<value>zk1:2181,zk2:2181,zk3:2181</value>
</property>
</configuration>
bash
# active/standby NameNode HA
vim hdfs-site.xml
xml
<configuration>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>/opt/data/hadoop/namenode</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>/opt/data/hadoop/datanode</value>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<!-- external ZooKeeper ensemble, used for automatic failover -->
<property>
<name>ha.zookeeper.quorum</name>
<value>zk1:2181,zk2:2181,zk3:2181</value>
</property>
<!-- nameservice name -->
<property>
<name>dfs.nameservices</name>
<value>ns1</value>
</property>
<!-- aliases of the active/standby NameNodes in ns1 -->
<property>
<name>dfs.ha.namenodes.ns1</name>
<value>nn1,nn2</value>
</property>
<!-- nn1 runs on node1 -->
<property>
<name>dfs.namenode.rpc-address.ns1.nn1</name>
<value>node1:9000</value>
</property>
<property>
<name>dfs.namenode.http-address.ns1.nn1</name>
<value>node1:50070</value>
</property>
<!-- nn2 runs on node2 -->
<property>
<name>dfs.namenode.rpc-address.ns1.nn2</name>
<value>node2:9000</value>
</property>
<property>
<name>dfs.namenode.http-address.ns1.nn2</name>
<value>node2:50070</value>
</property>
<!-- JournalNodes: shared storage for the NameNode edit log -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://node3:8485;node4:8485;node5:8485/ns1</value>
</property>
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/opt/data/hadoop/journalnode</value>
</property>
<!-- client failover proxy provider -->
<property>
<name>dfs.client.failover.proxy.provider.ns1</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- fencing method sshfence: log in over passwordless SSH and kill the NameNode process on the previously active node -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- private key used by sshfence -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/home/it/.ssh/id_rsa</value>
</property>
<!-- enable automatic failover -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
bash
# active/standby ResourceManager HA
vim yarn-site.xml
xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>2048</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>2048</value>
</property>
<!-- enable ResourceManager HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>yarn-cluster-1</value>
</property>
<!-- aliases of the active/standby ResourceManagers in yarn-cluster-1 -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<!-- rm1 runs on node1 -->
<property>
<name>yarn.resourcemanager.address.rm1</name>
<value>node1:8032</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>node1:8088</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm1</name>
<value>node1:8031</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm1</name>
<value>node1:8030</value>
</property>
<!-- rm2 runs on node2 -->
<property>
<name>yarn.resourcemanager.address.rm2</name>
<value>node2:8032</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>node2:8088</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm2</name>
<value>node2:8031</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address.rm2</name>
<value>node2:8030</value>
</property>
<!-- ZooKeeper ensemble used for automatic failover -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>zk1:2181,zk2:2181,zk3:2181</value>
</property>
<!-- enable automatic failover -->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- use the embedded leader elector for automatic failover -->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.embedded</name>
<value>true</value>
</property>
</configuration>
bash
# run MapReduce on YARN
cp mapred-site.xml.template mapred-site.xml
vim mapred-site.xml
xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>node1:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>node1:19888</value>
</property>
</configuration>
bash
# Note: slaves is read by both start-dfs.sh (DataNodes) and start-yarn.sh (NodeManagers),
# so the hosts listed here get both daemons; to keep DataNodes on node3-node5 only,
# start those daemons per node with hadoop-daemon.sh instead of relying on start-dfs.sh.
vim slaves
node8
node9
bash
# on node3-node5: start the JournalNodes
hadoop-daemon.sh start journalnode
# on node1: format the NameNode
hdfs namenode -format
# on node1: start the NameNode
hadoop-daemon.sh start namenode
# on node2: sync the standby NameNode from the active one
hdfs namenode -bootstrapStandby
# on node1: initialize the failover state in ZooKeeper
hdfs zkfc -formatZK
# on node1: start HDFS
start-dfs.sh
# on node3-node5: make sure a DataNode is running (see the note on slaves above)
hadoop-daemon.sh start datanode
# on node1: start YARN (NodeManagers on the hosts listed in slaves)
start-yarn.sh
# on node2: start the standby ResourceManager (start-yarn.sh only starts the local RM)
yarn-daemon.sh start resourcemanager
# active NameNode:  http://node1:50070
# standby NameNode: http://node2:50070
# active ResourceManager:  http://node1:8088
# standby ResourceManager: http://node2:8088
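Once HDFS and YARN are up, the HA state can be verified and a sample job submitted from node1 (a sketch; the examples jar ships with Hadoop 2.7.7):
bash
# one NameNode should report "active", the other "standby"
hdfs haadmin -getServiceState nn1
hdfs haadmin -getServiceState nn2
# likewise for the ResourceManagers
yarn rmadmin -getServiceState rm1
yarn rmadmin -getServiceState rm2
# DataNode overview
hdfs dfsadmin -report
# MapReduce-on-YARN smoke test
hadoop jar /opt/modules/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.7.jar pi 2 10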
hbase
bash
cd /opt/modules/hbase/conf
# environment variables for the deployment
vim hbase-env.sh
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
# do not let HBase manage its own ZooKeeper
export HBASE_MANAGES_ZK=false
# let HBase find the HDFS (ns1) configuration
export HBASE_CLASSPATH=/opt/modules/hadoop/etc/hadoop
vim hbase-site.xml
xml
<configuration>
<property>
<name>hbase.rootdir</name>
<value>hdfs://ns1/hbase</value>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
</property>
<property>
<name>hbase.zookeeper.quorum</name>
<!-- the external ZooKeeper nodes -->
<value>zk1,zk2,zk3</value>
</property>
<property>
<name>hbase.zookeeper.property.clientPort</name>
<value>2181</value>
</property>
<property>
<name>hbase.zookeeper.property.dataDir</name>
<value>/opt/data/hbase/zookeeper</value>
</property>
<property>
<name>hbase.unsafe.stream.capability.enforce</name>
<value>false</value>
</property>
</configuration>
bash
# RegionServer nodes
vim regionservers
node3
node4
node5
# standby HMaster (started by start-hbase.sh as a backup master)
vim backup-masters
node7
bash
# start HBase on node6 (the active HMaster); node7 comes up as the backup master
start-hbase.sh
hbase shell
hbase(main):001:0> list
TABLE
0 row(s) in 1.234 seconds
=> []
# HMaster web UI: http://node6:16010
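A quick non-interactive smoke test of HBase (sketch; the table name test_ha is arbitrary):
bash
echo "create 'test_ha','cf'" | hbase shell
echo "put 'test_ha','r1','cf:c1','v1'" | hbase shell
echo "scan 'test_ha'" | hbase shell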
hive
bash
cd /opt/modules/hive/lib
# Hive 2.3.9 and Hadoop 2.7.7 each bundle an SLF4J binding, which causes a
# "multiple SLF4J bindings" conflict; move Hive's binding aside and keep Hadoop's
mv log4j-slf4j-impl-*.jar log4j-slf4j-impl.jar.bak
# download the JDBC driver, compatible with Hive 2.3.9 and MySQL 5.x/8.x
wget https://repo1.maven.org/maven2/mysql/mysql-connector-java/5.1.49/mysql-connector-java-5.1.49.jar
chmod 644 mysql-connector-java-5.1.49.jar
# environment variables for the deployment
cd /opt/modules/hive/conf
cp hive-env.sh.template hive-env.sh
vim hive-env.sh
export HADOOP_HOME=/opt/modules/hadoop
export HIVE_CONF_DIR=/opt/modules/hive/conf
export HIVE_AUX_JARS_PATH=/opt/modules/hive/lib
export HIVE_CLASSPATH=/opt/modules/hive/lib/mysql-connector-java-5.1.49.jar
bash
# create hive-site.xml
vim hive-site.xml
xml
<configuration>
<!-- store the metastore schema in MySQL -->
<property>
<name>hive.metastore.db.type</name>
<value>mysql</value>
</property>
<!-- MySQL connection URL -->
<!-- the hive database must exist in advance; useSSL=false disables SSL -->
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://mysql-vip:3306/hive?createDatabaseIfNotExist=true&amp;useSSL=false&amp;characterEncoding=UTF-8</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<!-- database credentials -->
<property>
<name>hive.metastore.db.name</name>
<value>hive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<!-- placeholder: replace with the hive user's MySQL password -->
<value>hive_password</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>
<property>
<name>hive.exec.scratchdir</name>
<value>/tmp/hive</value>
</property>
<property>
<name>hive.mapred.mode</name>
<value>nonstrict</value>
</property>
<property>
<name>hive.cli.print.header</name>
<value>true</value>
</property>
<property>
<name>hive.cli.print.current.db</name>
<value>true</value>
</property>
<!-- disable local metastore mode; use the remote metastore service -->
<property>
<name>hive.metastore.local</name>
<value>false</value>
</property>
<!-- schema auto-creation for the metastore -->
<property>
<name>datanucleus.schema.autoCreateAll</name>
<value>true</value>
</property>
<property>
<name>datanucleus.autoCreateSchema</name>
<value>true</value>
</property>
<property>
<name>datanucleus.fixedDatastore</name>
<value>false</value>
</property>
<!-- metastore HA: clients try the listed URIs in turn -->
<property>
<name>hive.metastore.uris</name>
<value>thrift://node6:9083,thrift://node7:9083</value>
</property>
<property>
<name>hive.metastore.ha.enabled</name>
<value>true</value>
</property>
<property>
<name>hive.metastore.ha.zk.quorum</name>
<value>zk1:2181,zk2:2181,zk3:2181</value>
</property>
</configuration>
bash
# warehouse directory
hdfs dfs -mkdir -p /user/hive/warehouse
# scratch directory
hdfs dfs -mkdir -p /tmp/hive
# permissions
hdfs dfs -chmod 744 /user/hive/warehouse
hdfs dfs -chmod 744 /tmp/hive
# initialize the metastore schema in MySQL (run once)
schematool -dbType mysql -initSchema
# expected: Schema initialization completed successfully
# on node6 and node7: start the metastore service (hive.metastore.uris points at both)
nohup hive --service metastore > /tmp/metastore.log 2>&1 &
hive
hive (default)> create database test_db;
hive (default)> show databases;
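With hive.metastore.uris pointing at node6 and node7, a rough failover check looks like this (a sketch; assumes the metastore was started on both nodes as above and that nc is installed):
bash
# both metastore ports should be listening
nc -z node6 9083 && echo "node6 metastore up"
nc -z node7 9083 && echo "node7 metastore up"
# stop the metastore on node6; the Hive CLI should still answer via node7
ssh it@node6 "pkill -f HiveMetaStore"
hive -e "show databases;"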
spark
bash
cd /opt/modules/spark/conf
# environment variables for the deployment
cp spark-env.sh.template spark-env.sh
vim spark-env.sh
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
# let Spark pick up the Hadoop configuration
export HADOOP_CONF_DIR=/opt/modules/hadoop/etc/hadoop
# let Spark pick up the Hive configuration
export HIVE_CONF_DIR=/opt/modules/hive/conf
export SPARK_DRIVER_MEMORY=1g
export SPARK_EXECUTOR_MEMORY=1g
bash
# optional: start a standalone master/worker (the scripts live in sbin, not bin);
# with Spark on YARN, as in this topology, this step can be skipped
/opt/modules/spark/sbin/start-all.sh
# Spark Master UI (standalone mode only): http://ip1:8080
# make the Hive metastore configuration visible to Spark SQL
cp /opt/modules/hive/conf/hive-site.xml /opt/modules/spark/conf/
spark-sql
sql
create table test_tb (id int, name string);
insert into test_tb values (1, 'hive_test');
select * from test_tb;
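Since node8/node9 run NodeManagers for Spark on YARN, a cluster-mode job can also be submitted as a smoke test (sketch; the examples jar ships with spark-2.4.8-bin-hadoop2.7):
bash
spark-submit \
--master yarn \
--deploy-mode cluster \
--class org.apache.spark.examples.SparkPi \
/opt/modules/spark/examples/jars/spark-examples_2.11-2.4.8.jar 100
# the application shows up in the YARN UI: http://node1:8088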
Example jps output
bash
jps
7618 HRegionServer
5989 NameNode
6566 NodeManager
19398 Jps
7466 HMaster
6445 ResourceManager
6141 DataNode
6319 SecondaryNameNode
4567 JournalNode
# shutdown
/opt/modules/spark/sbin/stop-all.sh
# stop the metastore on node6/node7 (there is no built-in stop command; kill the RunJar process)
ssh it@node6 "pkill -f HiveMetaStore"
ssh it@node7 "pkill -f HiveMetaStore"
stop-hbase.sh
stop-yarn.sh
stop-dfs.sh
mariadb galera cluster + unified access endpoint for hive
- Initialization
bash
ufw disable
systemctl stop ufw
systemctl stop apparmor
systemctl disable apparmor
# set each node's own hostname (node1 / node2 / node3)
vim /etc/hostname
node1
vim /etc/hosts
ip1 node1
ip2 node2
ip3 node3
galera
- All nodes
bash
apt install -y mariadb-server galera-4 mariadb-client
# main config: /etc/mysql/mariadb.conf.d/50-server.cnf
#   (change bind-address there from 127.0.0.1 to 0.0.0.0 so HAProxy and remote clients can connect)
# dedicated config: /etc/mysql/conf.d/galera.cnf
vim /etc/mysql/conf.d/galera.cnf
[mysqld]
# leave binary logging disabled (the default); do not set "log_bin = OFF",
# which would actually enable it with "OFF" as the log basename
log_slave_updates = OFF
# storage engine (Galera only replicates InnoDB tables)
default_storage_engine = InnoDB
# required by Galera: row-based replication events and interleaved auto-increment locking
binlog_format = ROW
innodb_autoinc_lock_mode = 2
# transaction isolation level, avoids lock conflicts
transaction_isolation = READ-COMMITTED
# autocommit enabled
autocommit = 1
# character set
character_set_server = utf8mb4
collation_server = utf8mb4_unicode_ci
# cluster name
wsrep_cluster_name = "mariadb-galera-cluster"
# cluster node addresses
wsrep_cluster_address = "gcomm://ip1,ip2,ip3"
# this node's own address (adjust on node2/node3)
wsrep_node_address = "ip1"
# this node's own name (adjust on node2/node3)
wsrep_node_name = "node1"
# Galera replication provider
wsrep_provider = /usr/lib/galera4/libgalera_smm.so
# state snapshot transfer (SST) method; rsync is simple and fast
wsrep_sst_method = rsync
# enable write-set replication
wsrep_on = ON
- Node 1
bash
# bootstrap the cluster
galera_new_cluster
systemctl enable mariadb
# verify; root initially has no password
mysql -u root
sql
SHOW STATUS LIKE 'wsrep_%';
- Nodes 2 and 3
bash
systemctl start mariadb
systemctl enable mariadb
# verify; root initially has no password
mysql -u root
sql
SHOW STATUS LIKE 'wsrep_cluster_size';
- All nodes
bash
# harden the installation
mysql_secure_installation
# press Enter (current root password is empty)
# set a root password
# Y
# Y
# n
# Y
# Y
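hive-site.xml above expects a hive database and a hive user; a minimal sketch of creating them on any Galera node (the password is a placeholder and must match javax.jdo.option.ConnectionPassword):
bash
mysql -u root -p <<'SQL'
CREATE DATABASE IF NOT EXISTS hive;
CREATE USER IF NOT EXISTS 'hive'@'%' IDENTIFIED BY 'hive_password';
GRANT ALL PRIVILEGES ON hive.* TO 'hive'@'%';
FLUSH PRIVILEGES;
SQL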
keepalived + haproxy
- All nodes
bash
apt install -y keepalived haproxy
bash
vim /etc/haproxy/haproxy.cfg
# global settings
global
log /dev/log local0
log /dev/log local1 notice
chroot /var/lib/haproxy
stats socket /run/haproxy/admin.sock mode 660 level admin expose-fd listeners
stats timeout 30s
user haproxy
group haproxy
daemon
# tuning: global maximum number of connections
maxconn 4096
# defaults
defaults
log global
mode tcp
option tcplog
option dontlognull
# connect timeout (ms)
timeout connect 5000
# client timeout (ms)
timeout client 10000
# server timeout (ms)
timeout server 10000
# number of retries on failure
retries 3
# redispatch to another server after retries are exhausted
option redispatch
# maximum connections per proxy
maxconn 100
# frontend listener
# HAProxy runs on the same nodes as MariaDB, so binding *:3306 would collide with mysqld;
# bind only to the keepalived VIP (requires net.ipv4.ip_nonlocal_bind=1, set below)
frontend mariadb_frontend
bind VIP:3306
mode tcp
default_backend mariadb_backend
# backend servers
backend mariadb_backend
mode tcp
# load-balancing algorithm
balance leastconn
# TCP health check
option tcp-check
# checked every "inter" ms; "rise" consecutive successes bring a server up, "fall" failures take it down
server node1 ip1:3306 check inter 5000 rise 2 fall 3 weight 10
server node2 ip2:3306 check inter 5000 rise 2 fall 3 weight 10
server node3 ip3:3306 check inter 5000 rise 2 fall 3 weight 10
# allow binding to the VIP even when it is not currently assigned to this node
echo "net.ipv4.ip_nonlocal_bind = 1" >> /etc/sysctl.conf
sysctl -p
# validate the configuration
haproxy -c -f /etc/haproxy/haproxy.cfg
systemctl start haproxy
systemctl enable haproxy
systemctl status haproxy
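To confirm that HAProxy sees all three MariaDB backends as UP, the stats socket configured above can be queried (a sketch; socat may need to be installed first):
bash
apt install -y socat
echo "show stat" | socat stdio /run/haproxy/admin.sock | grep mariadb_backend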
- Node 1
bash
vim /etc/keepalived/keepalived.conf
! Configuration File for keepalived
global_defs {
# router identifier
router_id GALERA_HAPROXY
script_user root
enable_script_security
}
# check whether HAProxy is still running
vrrp_script check_haproxy {
script "/usr/bin/systemctl is-active haproxy"
interval 2
# the priority penalty must be larger than the gap between node priorities (100/90/80),
# otherwise a dead HAProxy never triggers a failover
weight -30
fall 3
rise 2
}
# VRRP instance
vrrp_instance VI_1 {
# this node starts as MASTER
state MASTER
# network interface
interface ens33
# virtual router ID
virtual_router_id 51
# priority of the master node
priority 100
advert_int 1
# non-preemptive: a recovered master does not take the VIP back
# (keepalived only honours nopreempt when the initial state is BACKUP on every node)
nopreempt
authentication {
# authentication type: password
auth_type PASS
auth_pass Ab#192837
}
# bind the VIP
virtual_ipaddress {
VIP/24 dev ens33
}
# run the health-check script
track_script {
check_haproxy
}
}
- Nodes 2 and 3
bash
# only state and priority differ from node1
vim /etc/keepalived/keepalived.conf
# node2
state BACKUP
priority 90
# node3
state BACKUP
priority 80
- All nodes
bash
systemctl start keepalived
systemctl enable keepalived
systemctl status keepalived
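Once keepalived is running everywhere, the VIP should sit on exactly one node and MariaDB should be reachable through it (a sketch; replace VIP with the real address and use the hive credentials created earlier):
bash
# the VIP appears on ens33 of the current MASTER only
ip addr show ens33 | grep -w inet
# connect through HAProxy via the VIP; the answering Galera node is printed
mysql -h VIP -P 3306 -u hive -p -e "SELECT @@hostname, @@wsrep_node_name;"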
zookeeper distributed coordination
- Initialization
bash
ufw disable
systemctl stop ufw
systemctl stop apparmor
systemctl disable apparmor
# set each node's own hostname (zk1 / zk2 / zk3)
vim /etc/hostname
zk1
vim /etc/hosts
ip1 zk1
ip2 zk2
ip3 zk3
bash
apt install -y openjdk-8-jdk
vim /etc/profile
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export JRE_HOME=$JAVA_HOME/jre
export CLASSPATH=.:$JAVA_HOME/lib:$JRE_HOME/lib
export PATH=$JAVA_HOME/bin:$PATH
source /etc/profile
cd /usr/local
wget https://archive.apache.org/dist/zookeeper/zookeeper-3.4.14/zookeeper-3.4.14.tar.gz
tar -zxvf zookeeper-3.4.14.tar.gz
ln -s zookeeper-3.4.14 zookeeper
# data directory: holds the myid file and cluster data
mkdir -p /usr/local/zookeeper/data
# transaction log directory
mkdir -p /usr/local/zookeeper/logs
chown -R it:it /usr/local/zookeeper-3.4.14
chown -R it:it /usr/local/zookeeper
bash
cd /usr/local/zookeeper/conf
cp zoo_sample.cfg zoo.cfg
vim zoo.cfg
# heartbeat interval between nodes (ms)
tickTime=2000
# max ticks for a follower to connect and sync with the leader at startup
initLimit=10
# max ticks a follower may lag behind the leader
syncLimit=5
# data directory
dataDir=/usr/local/zookeeper/data
# transaction log directory
dataLogDir=/usr/local/zookeeper/logs
clientPort=2181
server.1=ip1:2888:3888
server.2=ip2:2888:3888
server.3=ip3:2888:3888
# optional tuning
maxClientCnxns=60
autopurge.snapRetainCount=3
autopurge.purgeInterval=1
bash
# node 1
echo "1" > /usr/local/zookeeper/data/myid
# node 2
echo "2" > /usr/local/zookeeper/data/myid
# node 3
echo "3" > /usr/local/zookeeper/data/myid
# start on all nodes (the scripts live in bin)
cd /usr/local/zookeeper/bin
./zkServer.sh start
# check
./zkServer.sh status
./zkCli.sh -server ip:2181
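A few extra checks (a sketch; nc may need to be installed, and the HA znodes only appear after the Hadoop/HBase services have started):
bash
cd /usr/local/zookeeper/bin
# liveness probe via the "ruok" four-letter word (answers "imok")
echo ruok | nc ip1 2181
# list the root znodes; /hadoop-ha, /yarn-leader-election and /hbase show up once those services are running
./zkCli.sh -server ip1:2181 ls /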