Hadoop HA + Hive on Spark: installation and configuration files for a 4-VM cluster

Versions

apache-hive-3.1.3-bin.tar.gz

spark-3.0.0-bin-hadoop3.2.tgz

hadoop-3.1.3.tar.gz

Steps

Create two directories on HDFS: spark-history (set its permissions to 777) and spark-jars.
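
A minimal sketch of this step, assuming the Hadoop client commands are on the PATH and HDFS is already running:

sh
hdfs dfs -mkdir -p /spark-history /spark-jars   # create both directories
hdfs dfs -chmod 777 /spark-history              # history directory must be world-writable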

Upload the Spark jars to HDFS:

sh
hdfs dfs -D dfs.replication=1 -put ./* /spark-jars
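
The `./*` glob assumes the command is run from Spark's bundled jars directory (the install path used later in this document). In other words, roughly:

sh
cd /opt/spark-3.0.0-bin-hadoop3.2/jars
hdfs dfs -D dfs.replication=1 -put ./* /spark-jars
hdfs dfs -ls /spark-jars | head    # spot-check that the jars arrived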

hadoop

core-site.xml

xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
        <!-- Name under which the Hadoop cluster is registered in ZooKeeper -->
        <property>
                <name>fs.defaultFS</name>
                <value>hdfs://hacluster</value>
        </property>

        <!-- Directory for temporary files generated by Hadoop at runtime -->
        <property>
                <name>hadoop.tmp.dir</name>
                <value>file:///opt/hadoop-3.1.3/tmp</value>
        </property>

        <!-- I/O buffer size (default 4 KB) -->
        <property>
                <name>io.file.buffer.size</name>
                <value>4096</value>
        </property>

        <!-- ZooKeeper quorum addresses -->
        <property>
                <name>ha.zookeeper.quorum</name>
                <value>node15:2181,node16:2181,node17:2181,node18:2181</value>
        </property>

        <!-- Hosts from which the root user may act as a proxy -->
        <property>
                <name>hadoop.proxyuser.root.hosts</name>
                <value>*</value>
        </property>

        <!-- Groups whose users root is allowed to impersonate -->
        <property>
                <name>hadoop.proxyuser.root.groups</name>
                <value>*</value>
        </property>
        <!-- Static user used by the HDFS web UI (set to root here) -->
        <property>
                <name>hadoop.http.staticuser.user</name>
                <value>root</value>
        </property>
</configuration>
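
After the file is in place on every node, a quick sanity check that the merged configuration is being read, assuming the Hadoop binaries are on the PATH:

sh
hdfs getconf -confKey fs.defaultFS          # should print hdfs://hacluster
hdfs getconf -confKey ha.zookeeper.quorum   # should list the four ZooKeeper nodes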

hdfs-site.xml

xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
	<property> 
		<!-- HDFS block size: 128 MB -->
		<name>dfs.blocksize</name>
		<value>134217728</value> 
	</property> 

	<property> 
		<!-- Replication factor (default 3) -->
		<name>dfs.replication</name> 
		<value>3</value> 
	</property> 
	
	<property> 
		<!-- Where the NameNode stores its metadata -->
		<name>dfs.namenode.name.dir</name>
		<value>file:///opt/hadoop-3.1.3/dfs/namenode_data</value> 
	</property>
	
	<property> 
		<!-- Where the DataNode stores its data blocks -->
		<name>dfs.datanode.data.dir</name>
		<value>file:///opt/hadoop-3.1.3/dfs/datanode_data</value> 
	</property>
	
	<property>
		<!-- Enable WebHDFS (REST/web UI access) -->
		<name>dfs.webhdfs.enabled</name> 
		<value>true</value> 
	</property> 
	
	<property> 
		<!-- Number of threads a DataNode uses for data transfer -->
		<name>dfs.datanode.max.transfer.threads</name> 
		<value>4096</value> 
	</property> 
	
	<property> 
		<!-- Nameservice ID; must match fs.defaultFS in core-site.xml -->
		<name>dfs.nameservices</name> 
		<value>hacluster</value> 
	</property> 
	
	<property> 
		<!-- The hacluster nameservice has two NameNodes, nn1 and nn2 -->
		<name>dfs.ha.namenodes.hacluster</name> 
		<value>nn1,nn2</value> 
	</property> 
	
	<!-- RPC, service RPC and HTTP addresses of nn1 -->
	<property> 
		<name>dfs.namenode.rpc-address.hacluster.nn1</name> 
		<value>node15:9000</value> 
	</property>
	
	<property> 
		<name>dfs.namenode.servicerpc-address.hacluster.nn1</name>
		<value>node15:53310</value> 
	</property> 
	
	<property> 
		<name>dfs.namenode.http-address.hacluster.nn1</name> 
		<value>node15:50070</value> 
	</property> 
	
	<!-- RPC, service RPC and HTTP addresses of nn2 -->
	<property> 
		<name>dfs.namenode.rpc-address.hacluster.nn2</name> 
		<value>node16:9000</value> 
	</property> 
	
	<property> 
		<name>dfs.namenode.servicerpc-address.hacluster.nn2</name>
		<value>node16:53310</value> 
	</property> 
	
	<property> 
		<name>dfs.namenode.http-address.hacluster.nn2</name> 
		<value>node16:50070</value> 
	</property> 
	
	<property> 
		<!-- Where the NameNode edit log is shared via the JournalNodes -->
		<name>dfs.namenode.shared.edits.dir</name> 
		<value>qjournal://node15:8485;node16:8485;node17:8485;node18:8485/hacluster</value> 
	</property> 
	
	<property> 
		<!-- Local directory where each JournalNode stores its data -->
		<name>dfs.journalnode.edits.dir</name> 
		<value>/opt/hadoop-3.1.3/dfs/journalnode_data</value> 
	</property> 
	
	<property> 
		<!-- Local directory for the NameNode edit logs -->
		<name>dfs.namenode.edits.dir</name> 
		<value>/opt/hadoop-3.1.3/dfs/edits</value> 
	</property> 
	
	<property> 
		<!-- Enable automatic NameNode failover -->
		<name>dfs.ha.automatic-failover.enabled</name> 
		<value>true</value> 
	</property> 
	
	<property> 
		<!-- Client-side proxy provider used for failover -->
		<name>dfs.client.failover.proxy.provider.hacluster</name> 
		<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value> 
	</property> 
	
	<property> 
		<!-- Fencing method -->
		<name>dfs.ha.fencing.methods</name> 
		<value>sshfence</value> 
	</property> 
	
	<property> 
		<!-- SSH private key used by sshfence (passwordless SSH required) -->
		<name>dfs.ha.fencing.ssh.private-key-files</name> 
		<value>/root/.ssh/id_rsa</value>
	</property> 
	
	<property> 
		<!-- HDFS permission checking; false disables it -->
		<name>dfs.permissions.enabled</name>
		<value>false</value> 
	</property> 

</configuration>
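
For reference, a typical first-start sequence for an HA cluster with the settings above; the commands are stock Hadoop 3, and the host assignments in the comments follow the nn1/nn2 layout used in this document:

sh
# on every JournalNode host (node15..node18)
hdfs --daemon start journalnode

# on nn1 (node15): format and start the first NameNode
hdfs namenode -format
hdfs --daemon start namenode

# on nn2 (node16): pull the metadata over and start the standby
hdfs namenode -bootstrapStandby
hdfs --daemon start namenode

# on nn1: create the failover znode in ZooKeeper, then bring up the rest of HDFS
hdfs zkfc -formatZK
start-dfs.sh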

mapred-site.xml

xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
	<!-- Run MapReduce on YARN -->
	<property>
		<name>mapreduce.framework.name</name>
		<value>yarn</value>
	</property>
	<!-- MapReduce JobHistory server address -->
	<property>
		<name>mapreduce.jobhistory.address</name>
		<value>node15:10020</value>
	</property>
	<!-- JobHistory server web UI address -->
	<property>
		<name>mapreduce.jobhistory.webapp.address</name>
		<value>node15:19888</value>
	</property>
	<property>
		<name>yarn.application.classpath</name>
		<value>/opt/hadoop-3.1.3/etc/hadoop:/opt/hadoop-3.1.3/share/hadoop/common/lib/*:/opt/hadoop-3.1.3/share/hadoop/common/*:/opt/hadoop-3.1.3/share/hadoop/hdfs:/opt/hadoop-3.1.3/share/hadoop/hdfs/lib/*:/opt/hadoop-3.1.3/share/hadoop/hdfs/*:/opt/hadoop-3.1.3/share/hadoop/mapreduce/lib/*:/opt/hadoop-3.1.3/share/hadoop/mapreduce/*:/opt/hadoop-3.1.3/share/hadoop/yarn:/opt/hadoop-3.1.3/share/hadoop/yarn/lib/*:/opt/hadoop-3.1.3/share/hadoop/yarn/*</value>
	</property>

	<property>
		<name>mapreduce.map.memory.mb</name>
		<value>1500</value>
		<description>Physical memory limit for each map task</description>
	</property>
	 
	<property>
		<name>mapreduce.reduce.memory.mb</name>
		<value>3000</value>
		<description>Physical memory limit for each reduce task</description>
	</property>
	 
	<property>
		<name>mapreduce.map.java.opts</name>
		<value>-Xmx1200m</value>
	</property>
	 
	<property>
		<name>mapreduce.reduce.java.opts</name>
		<value>-Xmx2600m</value>
	</property>

</configuration>
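
The JobHistory addresses above point at node15, so the history server has to be started there; in Hadoop 3 the command is:

sh
mapred --daemon start historyserver   # serves RPC on 10020 and the web UI on 19888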

slaves

node15
node16
node17
node18

workers

node15
node16
node17
node18
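
All of the files above live in /opt/hadoop-3.1.3/etc/hadoop and must be identical on every node. A simple way to push them out from node15, assuming the passwordless SSH that the fencing configuration already requires:

sh
# copy the whole Hadoop config directory to the other three nodes
for host in node16 node17 node18; do
  scp -r /opt/hadoop-3.1.3/etc/hadoop/* "$host":/opt/hadoop-3.1.3/etc/hadoop/
done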

yarn-site.xml

xml
<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<configuration>
	<property>
		<!-- Whether to enforce virtual memory limits on containers -->
		<name>yarn.nodemanager.vmem-check-enabled</name>
		<value>false</value>
		<description>Whether virtual memory limits will be enforced for containers</description>
	</property>
	<property>
		<!-- Virtual-to-physical memory ratio allowed for containers -->
		<name>yarn.nodemanager.vmem-pmem-ratio</name>
		<value>4</value>
		<description>Ratio between virtual memory to physical memory when setting memory limits for containers</description>
	</property>

	<property> 
		<!-- Enable ResourceManager HA -->
		<name>yarn.resourcemanager.ha.enabled</name> 
		<value>true</value> 
	</property> 
	
	<property> 
		<!-- Cluster ID under which YARN registers in ZooKeeper -->
		<name>yarn.resourcemanager.cluster-id</name> 
		<value>hayarn</value> 
	</property> 
	
	<property> 
		<!-- Logical IDs of the two ResourceManagers -->
		<name>yarn.resourcemanager.ha.rm-ids</name> 
		<value>rm1,rm2</value> 
	</property> 
	
	<property> 
		<!-- Host of rm1 -->
		<name>yarn.resourcemanager.hostname.rm1</name> 
		<value>node15</value> 
	</property>
	
	<property> 
		<!-- Host of rm2 -->
		<name>yarn.resourcemanager.hostname.rm2</name> 
		<value>node16</value> 
	</property> 

	<property>
		<!-- rm1 web UI address (default: ${yarn.resourcemanager.hostname}:8088) -->
		<name>yarn.resourcemanager.webapp.address.rm1</name>
		<value>node15:8088</value>
	</property>
	<property>
		<!-- rm2 web UI address (default: ${yarn.resourcemanager.hostname}:8088) -->
		<name>yarn.resourcemanager.webapp.address.rm2</name>
		<value>node16:8088</value>
	</property>
	
	<property> 
		<!-- ZooKeeper addresses used by the ResourceManagers -->
		<name>yarn.resourcemanager.zk-address</name> 
		<value>node15:2181,node16:2181,node17:2181</value> 
	</property> 
	
	<property> 
		<!-- Enable ResourceManager state recovery -->
		<name>yarn.resourcemanager.recovery.enabled</name> 
		<value>true</value> 
	</property> 
	
	<property> 
		<!-- State store implementation used for recovery -->
		<name>yarn.resourcemanager.store.class</name> 
		<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value> 
	</property> 
	
	<property> 
		<!-- Default ResourceManager host -->
		<name>yarn.resourcemanager.hostname</name> 
		<value>node18</value> 
	</property> 
	
	<property> 
		<!-- Auxiliary shuffle service for NodeManagers -->
		<name>yarn.nodemanager.aux-services</name> 
		<value>mapreduce_shuffle</value> 
	</property> 
	
	<property> 
		<!-- Enable log aggregation -->
		<name>yarn.log-aggregation-enable</name> 
		<value>true</value> 
	</property> 
	
	<property> 
		<!-- Keep aggregated logs for 7 days -->
		<name>yarn.log-aggregation.retain-seconds</name> 
		<value>604800</value> 
	</property> 
	<property>
    		<name>yarn.log.server.url</name>
    		<value>http://node15:19888/jobhistory/logs</value>
	</property>
</configuration>
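
Once YARN is up, the HA state of the two ResourceManagers can be checked with yarn rmadmin; with automatic failover one should report active and the other standby:

sh
start-yarn.sh                        # on node15; if the standby RM on node16 does not come up, start it with: yarn --daemon start resourcemanager
yarn rmadmin -getServiceState rm1    # expect "active" or "standby"
yarn rmadmin -getServiceState rm2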

hive

hive-site.xml

xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <!-- JDBC URL of the metastore database -->
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://node15:3306/metastore?useSSL=false</value>
    </property>

    <!-- JDBC driver class -->
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>

    <!-- JDBC connection username -->
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
    </property>

    <!-- JDBC connection password -->
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>hadoop</value>
    </property>

    <!-- Default Hive warehouse directory on HDFS -->
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive/warehouse</value>
    </property>

    <!-- Metastore schema version verification -->
    <property>
        <name>hive.metastore.schema.verification</name>
        <value>false</value>
    </property>

    <!-- Metastore event DB notification API authentication -->
    <property>
        <name>hive.metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>

    <!-- Host that HiveServer2 binds to -->
    <property>
        <name>hive.server2.thrift.bind.host</name>
        <value>node15</value>
    </property>

    <!-- Port that HiveServer2 listens on -->
    <property>
        <name>hive.server2.thrift.port</name>
        <value>10000</value>
    </property>

    <property>
        <name>spark.yarn.jars</name>
        <value>hdfs://node15:9000/spark-jars/*</value>
    </property>

    <!-- Hive execution engine -->
    <property>
        <name>hive.execution.engine</name>
        <value>spark</value>
    </property>
    <property>
        <name>spark.home</name>
        <value>/opt/spark-3.0.0-bin-hadoop3.2/</value>
    </property>
</configuration>
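
Before Hive can start, the MySQL database named in the JDBC URL (metastore) must exist, the MySQL JDBC driver jar must be in $HIVE_HOME/lib, and the schema has to be initialized. A typical sequence, assuming $HIVE_HOME/bin is on the PATH:

sh
schematool -dbType mysql -initSchema   # create the metastore tables in MySQL
nohup hive --service metastore &       # optional standalone metastore service
nohup hiveserver2 &                    # Thrift/JDBC endpoint on node15:10000 as configured above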

spark-defaults.conf

properties
spark.master              yarn
spark.eventLog.enabled    true
spark.eventLog.dir        hdfs://node15:9000/spark-history
spark.executor.memory     600m
spark.driver.memory       600m
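
With the event log pointed at hdfs://node15:9000/spark-history, the Spark history server can read it, and a small YARN job makes a convenient smoke test. Paths assume the install location used in this document; the exact examples jar name may differ:

sh
/opt/spark-3.0.0-bin-hadoop3.2/sbin/start-history-server.sh
/opt/spark-3.0.0-bin-hadoop3.2/bin/spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master yarn --deploy-mode cluster \
  /opt/spark-3.0.0-bin-hadoop3.2/examples/jars/spark-examples_2.12-3.0.0.jar 100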

spark

hdfs-site.xml

Symlink this file from the Hadoop configuration directory:

ln -s <source file> <link name>

hive-site.xml

Symlink this file from the Hive configuration directory:

ln -s <source file> <link name>

slaves

node15
node16
node17
node18

yarn-site.xml

Symlink this file from the Hadoop configuration directory (a concrete example of all three links follows below):

ln -s <source file> <link name>
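
A concrete version of the three links, assuming the install paths used elsewhere in this document; the Hive location is only an assumption and should be adjusted to the actual install directory:

sh
cd /opt/spark-3.0.0-bin-hadoop3.2/conf
ln -s /opt/hadoop-3.1.3/etc/hadoop/hdfs-site.xml hdfs-site.xml
ln -s /opt/hadoop-3.1.3/etc/hadoop/yarn-site.xml yarn-site.xml
ln -s /opt/apache-hive-3.1.3-bin/conf/hive-site.xml hive-site.xml   # assumed Hive install path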

spark-env.sh

sh
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


export SCALA_HOME=/usr/share/scala
export JAVA_HOME=/usr/java/jdk1.8.0_241-amd64
export SPARK_HOME=/opt/spark-3.0.0-bin-hadoop3.2
export SPARK_MASTER_IP=192.168.206.215
export SPARK_MASTER_PORT=7077
export SPARK_MASTER_WEBUI_PORT=7080     # Spark's web UI defaults to port 8080; changed here to avoid a possible port conflict
export SPARK_WORKER_CORES=1
export SPARK_WORKER_INSTANCES=1
export SPARK_EXECUTOR_MEMORY=512M
export SPARK_WORKER_MEMORY=1G
export SPARK_DIST_CLASSPATH=$(/opt/hadoop-3.1.3/bin/hadoop classpath)
export HADOOP_CONF_DIR=/opt/hadoop-3.1.3/etc/hadoop


# This file is sourced when running various Spark programs.
# Copy it as spark-env.sh and edit that to configure Spark for your site.

# Options read when launching programs locally with
# ./bin/run-example or ./bin/spark-submit
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program

# Options read by executors and drivers running inside the cluster
# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program
# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data
# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos

# Options read in YARN client/cluster mode
# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf)
# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN
# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1).
# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G)
# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G)

# Options for the daemons used in the standalone deploy mode
# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname
# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master
# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y")
# - SPARK_WORKER_CORES, to set the number of cores to use on this machine
# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g)
# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker
# - SPARK_WORKER_DIR, to set the working directory of worker processes
# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y")
# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g).
# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y")
# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y")
# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y")
# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons
# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers

# Options for launcher
# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y")

# Generic options for the daemons used in the standalone deploy mode
# - SPARK_CONF_DIR      Alternate conf dir. (Default: ${SPARK_HOME}/conf)
# - SPARK_LOG_DIR       Where log files are stored.  (Default: ${SPARK_HOME}/logs)
# - SPARK_PID_DIR       Where the pid file is stored. (Default: /tmp)
# - SPARK_IDENT_STRING  A string representing this instance of spark. (Default: $USER)
# - SPARK_NICENESS      The scheduling priority for daemons. (Default: 0)
# - SPARK_NO_DAEMONIZE  Run the proposed command in the foreground. It will not output a PID file.
# Options for native BLAS, like Intel MKL, OpenBLAS, and so on.
# You might get better performance to enable these options if using native BLAS (see SPARK-21305).
# - MKL_NUM_THREADS=1        Disable multi-threading of Intel MKL
# - OPENBLAS_NUM_THREADS=1   Disable multi-threading of OpenBLAS
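
Finally, an end-to-end check of Hive on Spark: connect through Beeline to the HiveServer2 configured earlier and run statements that trigger a Spark job on YARN (the table name is only an example):

sh
beeline -u "jdbc:hive2://node15:10000" -n root \
  -e "create table if not exists smoke_test(id int)" \
  -e "insert into smoke_test values (1)" \
  -e "select count(*) from smoke_test"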