Local run + local data
import os
import re
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
"""
数据在本地
代码在本地
使用的是windows的资源
"""
if __name__ == '__main__':
    # Configure the environment
    os.environ['JAVA_HOME'] = 'E:/java-configuration/jdk-8'
    # Path to Hadoop, i.e. the directory unpacked earlier
    os.environ['HADOOP_HOME'] = 'E:/applications/bigdata_config/hadoop-3.3.1/hadoop-3.3.1'
    # Path to the Python interpreter of the base environment (used by the executors)
    os.environ['PYSPARK_PYTHON'] = 'C:/Users/35741/miniconda3/python.exe'
    # Path to the Python interpreter of the base environment (used by the driver)
    os.environ['PYSPARK_DRIVER_PYTHON'] = 'C:/Users/35741/miniconda3/python.exe'

    conf = SparkConf().setMaster("local[*]").setAppName("First Spark Project")
    sc = SparkContext(conf=conf)

    fileRdd = sc.textFile("../data/wordcount/input")
    # split() with no arguments splits on any run of whitespace, so multiple
    # consecutive spaces are handled correctly
    fileRdd.filter(lambda line: len(line) > 0) \
        .flatMap(lambda line: line.strip().split()) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda sum, tmp: sum + tmp) \
        .saveAsTextFile("../data/wordcount/output3")
    # Equivalent variant using a regular expression:
    # fileRdd.filter(lambda line: len(line) > 0) \
    #     .flatMap(lambda line: re.split(r"\s+", line.strip())) \
    #     .map(lambda word: (word, 1)) \
    #     .reduceByKey(lambda sum, tmp: sum + tmp) \
    #     .saveAsTextFile("../data/wordcount/output2")
    sc.stop()
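To answer the question raised in the comment above: str.split() with no arguments already collapses runs of whitespace, so it behaves the same as the regular-expression variant. A quick check in plain Python (a standalone snippet, not part of the script above):

import re

line = "hello   world\tspark"
print(line.split())            # ['hello', 'world', 'spark']
print(re.split(r"\s+", line))  # ['hello', 'world', 'spark']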
Local run + cluster data
import os
import re
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
"""
数据在hdfs
代码在本地
资源使用的是windows的
"""
if __name__ == '__main__':
    # Configure the environment
    os.environ['JAVA_HOME'] = 'E:/java-configuration/jdk-8'
    # Path to Hadoop, i.e. the directory unpacked earlier
    os.environ['HADOOP_HOME'] = 'E:/applications/bigdata_config/hadoop-3.3.1/hadoop-3.3.1'
    # Path to the Python interpreter of the base environment (used by the executors)
    os.environ['PYSPARK_PYTHON'] = 'C:/Users/35741/miniconda3/python.exe'
    # Path to the Python interpreter of the base environment (used by the driver)
    os.environ['PYSPARK_DRIVER_PYTHON'] = 'C:/Users/35741/miniconda3/python.exe'
    # Access HDFS as the root user
    os.environ['HADOOP_USER_NAME'] = 'root'

    conf = SparkConf().setMaster("local[*]").setAppName("First Spark Project")
    sc = SparkContext(conf=conf)

    fileRdd = sc.textFile("hdfs://bigdata01:9820/spark/wordcount/input")
    fileRdd.filter(lambda line: len(line) > 0) \
        .flatMap(lambda line: re.split(r"\s+", line.strip())) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda sum, tmp: sum + tmp) \
        .saveAsTextFile("hdfs://bigdata01:9820/spark/wordcount/output2")
    sc.stop()
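Note that saveAsTextFile fails if the output directory already exists. A minimal sketch of one common workaround is to delete the old output through the JVM's Hadoop FileSystem API before writing; this relies on PySpark's internal py4j gateway (sc._jvm / sc._jsc), and the URI below simply reuses the output path from the script above:

output = "hdfs://bigdata01:9820/spark/wordcount/output2"
hadoop_conf = sc._jsc.hadoopConfiguration()
Path = sc._jvm.org.apache.hadoop.fs.Path
fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jvm.java.net.URI(output), hadoop_conf)
if fs.exists(Path(output)):
    fs.delete(Path(output), True)  # True = delete recursively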
External arguments + server mode
import os
import re
import sys
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
"""
数据在hdfs
代码在本地
资源使用的是windows的
"""
if __name__ == '__main__':
    inputPath = sys.argv[1]
    outputPath = sys.argv[2]

    # Configure the environment
    os.environ['JAVA_HOME'] = '/opt/installs/jdk'
    # Path to Hadoop, i.e. the directory unpacked earlier
    os.environ['HADOOP_HOME'] = '/opt/installs/hadoop'
    # Path to the Python interpreter of the base environment (used by the executors)
    os.environ['PYSPARK_PYTHON'] = '/opt/installs/anaconda3/bin/python3'
    # Path to the Python interpreter of the base environment (used by the driver)
    os.environ['PYSPARK_DRIVER_PYTHON'] = '/opt/installs/anaconda3/bin/python3'

    # No setMaster() here: the master is passed on the spark-submit command line
    conf = SparkConf().setAppName("First Spark Project")
    sc = SparkContext(conf=conf)

    fileRdd = sc.textFile(inputPath)
    fileRdd \
        .filter(lambda line: len(line) > 0) \
        .flatMap(lambda line: re.split(r"\s+", line.strip())) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda sum, tmp: sum + tmp) \
        .saveAsTextFile(outputPath)

    sc.stop()
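Since the input and output paths come from sys.argv and no master is set in code, they are all supplied at submission time. A hedged example of how the script might be submitted on the server (the script location, the master, and the HDFS paths are placeholders for your own setup):

spark-submit --master yarn /opt/scripts/wordcount.py \
    hdfs://bigdata01:9820/spark/wordcount/input \
    hdfs://bigdata01:9820/spark/wordcount/output3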