Installing Spark
Upload the installation packages to /opt/software and extract them
```bash
[bigdata@node101 software]$ tar -xvf spark-3.3.1-bin-hadoop3.tgz -C /opt/services/
[bigdata@node101 software]$ tar -xvf spark-3.3.1-bin-without-hadoop.tgz -C /opt/services/
```
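Two builds are unpacked on purpose: spark-3.3.1-bin-hadoop3 becomes the runtime install (renamed in the next step), while the "pure" spark-3.3.1-bin-without-hadoop build is only used later as a source of Hadoop-free jars. A quick check that both landed:

```bash
[bigdata@node101 software]$ ls /opt/services/ | grep spark
spark-3.3.1-bin-hadoop3
spark-3.3.1-bin-without-hadoop
```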
Rename the directory
```bash
[bigdata@node101 services]$ mv spark-3.3.1-bin-hadoop3 spark-3.3.1
```
Configure environment variables
```bash
[bigdata@node101 ~]$ sudo vim /etc/profile.d/bigdata_env.sh
```
```bash
export SPARK_HOME=/opt/services/spark-3.3.1
export PATH=$PATH:$JAVA_HOME/bin:$ZK_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$HIVE_HOME/bin:$SPARK_HOME/bin
```
Distribute the environment variables
```bash
[bigdata@node101 ~]$ sudo ./bin/xsync /etc/profile.d/bigdata_env.sh
```
Refresh the environment variables (run on all 5 machines)
```bash
[bigdata@node101 ~]$ source /etc/profile
```
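A quick sanity check that the variables are visible in the new shell:

```bash
[bigdata@node101 ~]$ echo $SPARK_HOME
/opt/services/spark-3.3.1
[bigdata@node101 ~]$ spark-submit --version
```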
Configure spark-env.sh (under $SPARK_HOME/conf)
```bash
export SPARK_DIST_CLASSPATH=$(hadoop classpath)
export SPARK_HISTORY_OPTS="
-Dspark.history.ui.port=18080
-Dspark.history.fs.logDirectory=hdfs://mycluster:8020/spark-history
-Dspark.history.retainedApplications=30"
```
Configure spark-defaults.conf
```bash
[bigdata@node101 conf]$ cp spark-defaults.conf.template spark-defaults.conf
```
```bash
spark.master yarn
spark.eventLog.enabled true
spark.eventLog.dir hdfs://mycluster:8020/spark-history
spark.serializer org.apache.spark.serializer.KryoSerializer
# Enable dynamic allocation
spark.dynamicAllocation.enabled true
# Enable the external Spark shuffle service
spark.shuffle.service.enabled true
# Initial number of executors
spark.dynamicAllocation.initialExecutors 1
# Minimum number of executors
spark.dynamicAllocation.minExecutors 1
# Maximum number of executors
spark.dynamicAllocation.maxExecutors 6
# Executor idle timeout: an executor idle longer than this is shut down
spark.dynamicAllocation.executorIdleTimeout 60s
# Backlog timeout: if tasks have been pending longer than this, request new executors
spark.dynamicAllocation.schedulerBacklogTimeout 1s
# Legacy shuffle fetch protocol (uncomment for compatibility with old external shuffle services)
#spark.shuffle.useOldFetchProtocol true
spark.yarn.historyServer.address node101:18080
spark.history.ui.port 18080
spark.history.fs.logDirectory hdfs://mycluster:8020/spark-history
```
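Since the Spark jars are uploaded to HDFS in a later step, a spark.yarn.jars entry is commonly added here as well so YARN containers fetch them from /spark-jars instead of re-uploading them on every submit. The original does not show it, but assuming that intent:

```bash
spark.yarn.jars hdfs://mycluster:8020/spark-jars/*
```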
Create the event log directory on HDFS
```bash
[bigdata@node101 conf]$ hdfs dfs -mkdir /spark-history
```
Copy spark-defaults.conf into the Hive conf directory
```bash
[bigdata@node101 conf]$ cp $SPARK_HOME/conf/spark-defaults.conf $HIVE_HOME/conf/
```
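These Hive-side steps are the usual Hive-on-Spark wiring; if that is the goal here (an assumption, not stated in the original), hive-site.xml also needs the execution engine switched:

```xml
<property>
    <name>hive.execution.engine</name>
    <value>spark</value>
</property>
```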
Copy hive-exec-3.1.3.jar into the pure (without-hadoop) Spark jars directory
```bash
[bigdata@node101 conf]$ cp $HIVE_HOME/lib/hive-exec-3.1.3.jar /opt/services/spark-3.3.1-bin-without-hadoop/jars/
```
Copy spark-3.3.1-yarn-shuffle.jar into Hadoop's YARN lib directory
```bash
[bigdata@node101 conf]$ cp $SPARK_HOME/yarn/spark-3.3.1-yarn-shuffle.jar /opt/services/hadoop-3.3.5/share/hadoop/yarn/lib/
```
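Dropping the jar into YARN's lib only stages the class; for spark.shuffle.service.enabled above to take effect, the shuffle service also has to be registered in yarn-site.xml on every NodeManager (and the NodeManagers restarted). A sketch of the standard properties, keeping any existing mapreduce_shuffle entry:

```xml
<property>
    <name>yarn.nodemanager.aux-services</name>
    <value>mapreduce_shuffle,spark_shuffle</value>
</property>
<property>
    <name>yarn.nodemanager.aux-services.spark_shuffle.class</name>
    <value>org.apache.spark.network.yarn.YarnShuffleService</value>
</property>
```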
Upload the Spark jars to HDFS
```bash
[bigdata@node101 spark-3.3.1-bin-without-hadoop]$ hdfs dfs -mkdir /spark-jars
[bigdata@node101 spark-3.3.1-bin-without-hadoop]$ hdfs dfs -put ./jars/* /spark-jars
```
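A quick listing confirms the upload:

```bash
[bigdata@node101 spark-3.3.1-bin-without-hadoop]$ hdfs dfs -ls /spark-jars | head
```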
Start the history server
```bash
[bigdata@node101 spark-3.3.1]$ sbin/start-history-server.sh
```
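To verify, check for the HistoryServer process and browse the UI on the port configured above:

```bash
[bigdata@node101 spark-3.3.1]$ jps | grep HistoryServer
# UI: http://node101:18080
```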