OK, this time I've put together a complete version (directly runnable + organized by chapter + exercises + a final comprehensive problem), exactly as you asked:
👉 5.2 / 5.3 / 5.4: cases extracted → complete code → comments → exercises → final comprehensive problem
(All code is filled in and runnable as-is.)
1. 5.2 Pair RDDs (complete example code)
✅ Example 1: word count with reduceByKey (complete version)
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    # 1. Initialize Spark
    conf = SparkConf().setMaster("local[*]").setAppName("WordCount")
    sc = SparkContext(conf=conf)

    # 2. Raw data
    data = ["Hadoop", "Spark", "Hive", "Spark"]

    # 3. Create the RDD
    rdd = sc.parallelize(data)

    # 4. Map each word to a (word, 1) pair
    pairRDD = rdd.map(lambda word: (word, 1))

    # 5. Aggregate the counts by key
    resultRDD = pairRDD.reduceByKey(lambda a, b: a + b)

    # 6. Print, e.g. ('Hadoop', 1), ('Spark', 2), ('Hive', 1)
    resultRDD.foreach(print)

    sc.stop()

if __name__ == "__main__":
    main()
```
✅ Example 2: word count with groupByKey
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("GroupByKey")
    sc = SparkContext(conf=conf)

    data = ["one", "two", "two", "three", "three", "three"]
    rdd = sc.parallelize(data)

    # Map each word to a (word, 1) pair
    pairRDD = rdd.map(lambda word: (word, 1))

    # Group all values for the same key: (word, iterable of 1s)
    groupedRDD = pairRDD.groupByKey()

    # Sum the grouped 1s to get the count per word
    resultRDD = groupedRDD.map(lambda x: (x[0], sum(x[1])))

    resultRDD.foreach(print)
    sc.stop()

if __name__ == "__main__":
    main()
```
✅ Example 3: per-key average with combineByKey
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("CombineByKey")
    sc = SparkContext(conf=conf)

    # Data: (company, income)
    data = [
        ("company-1", 88), ("company-1", 96),
        ("company-2", 94), ("company-2", 86)
    ]
    rdd = sc.parallelize(data)

    # combineByKey keeps a (sum, count) accumulator per key
    resultRDD = rdd.combineByKey(
        lambda income: (income, 1),                                 # createCombiner: first value seen in a partition
        lambda acc, income: (acc[0] + income, acc[1] + 1),          # mergeValue: fold a value into the accumulator (within a partition)
        lambda acc1, acc2: (acc1[0] + acc2[0], acc1[1] + acc2[1])   # mergeCombiners: merge accumulators across partitions
    ).map(lambda x: (x[0], x[1][0], x[1][0] / x[1][1]))             # (company, total, average)

    # e.g. ('company-1', 184, 92.0), ('company-2', 180, 90.0)
    resultRDD.foreach(print)
    sc.stop()

if __name__ == "__main__":
    main()
```
2. 5.3 Reading and writing data (complete code)
✅ Example 1: read and write local files
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("FileIO")
    sc = SparkContext(conf=conf)

    # Read a local text file
    rdd = sc.textFile("file:///home/hadoop/input.txt")

    # Simple transformation: upper-case every line
    result = rdd.map(lambda x: x.upper())

    # Write out (the output directory must not already exist)
    result.saveAsTextFile("file:///home/hadoop/output")

    sc.stop()

if __name__ == "__main__":
    main()
```
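Note that saveAsTextFile writes one part-XXXXX file per partition, not a single file. If one output file is needed, a minimal option is to collapse to a single partition first; the sketch below reuses `result` from the example above and a hypothetical output path:
```python
# Collapse to one partition so the output directory contains a single part file.
# Trade-off: all data flows through one task, so only do this for small results.
result.coalesce(1).saveAsTextFile("file:///home/hadoop/output_single")  # hypothetical path
```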
✅ Example 2: reading from MySQL (complete version)
```python
# coding:utf8
from pyspark import SparkConf, SparkContext
import pymysql

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("ReadMySQL")
    sc = SparkContext(conf=conf)

    # Connect to the database on the driver
    conn = pymysql.connect(
        host="localhost",
        user="root",
        password="123456",
        database="spark"
    )
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM student")

    # Turn the fetched rows into an RDD
    rdd = sc.parallelize(cursor.fetchall())
    rdd.foreach(print)

    cursor.close()
    conn.close()
    sc.stop()

if __name__ == "__main__":
    main()
```
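The example above only covers reading. Since 5.3 is about reading *and* writing, here is a minimal sketch of writing an RDD back to MySQL with pymysql. It assumes the same connection settings as above and a hypothetical table `student(id, name)`; `foreachPartition` is used so each partition opens one connection instead of one per record:
```python
import pymysql

def save_partition(records):
    # One connection per partition (opened where the partition is processed)
    conn = pymysql.connect(host="localhost", user="root",
                           password="123456", database="spark")
    cursor = conn.cursor()
    for row in records:
        # Hypothetical schema: student(id, name)
        cursor.execute("INSERT INTO student (id, name) VALUES (%s, %s)", row)
    conn.commit()
    cursor.close()
    conn.close()

# `sc` is the SparkContext created in the example above; the rows are illustrative
sc.parallelize([(5, "Alice"), (6, "Bob")]).foreachPartition(save_partition)
```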
3. 5.4 Comprehensive examples (complete code)
✅ Example 1: TopN (complete version 🔥)
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("TopN")
    sc = SparkContext(conf=conf)

    rdd = sc.textFile("file:///home/hadoop/orders.txt")

    # Keep only well-formed lines (exactly 4 comma-separated fields)
    rdd1 = rdd.filter(lambda line: len(line.split(",")) == 4)

    # Extract the payment field (3rd column) as an integer
    rdd2 = rdd1.map(lambda x: int(x.split(",")[2]))

    # Sort descending and take the top 5
    result = rdd2.sortBy(lambda x: x, ascending=False).take(5)
    print(result)

    sc.stop()

if __name__ == "__main__":
    main()
```
✅ Example 2: file sort
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    conf = SparkConf().setMaster("local[1]").setAppName("FileSort")
    sc = SparkContext(conf=conf)

    # Read every matching file
    rdd = sc.textFile("file:///home/hadoop/file*.txt")

    # Drop blank lines, parse integers, sort ascending
    result = rdd.filter(lambda x: x.strip() != "") \
                .map(lambda x: int(x.strip())) \
                .sortBy(lambda x: x)

    result.foreach(print)
    sc.stop()

if __name__ == "__main__":
    main()
```
✅ Example 3: secondary sort (complete version 🔥)
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

class SecondarySortKey:
    """Composite key: sort by the first column, break ties with the second."""
    def __init__(self, k):
        self.first = k[0]
        self.second = k[1]

    def __gt__(self, other):
        # __gt__ alone is enough: when __lt__ is missing, Python falls back to
        # the reflected __gt__ of the other operand during sorting.
        if self.first == other.first:
            return self.second > other.second
        else:
            return self.first > other.first

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("SecondarySort")
    sc = SparkContext(conf=conf)

    rdd = sc.textFile("file:///home/hadoop/data.txt")
    rdd1 = rdd.filter(lambda x: x.strip() != "")

    # Build ((col1, col2), original line) pairs
    rdd2 = rdd1.map(lambda x: ((int(x.split(" ")[0]), int(x.split(" ")[1])), x))

    # Wrap the tuple in the custom sort key
    rdd3 = rdd2.map(lambda x: (SecondarySortKey(x[0]), x[1]))

    # Sort descending on the composite key
    rdd4 = rdd3.sortByKey(False)

    # Keep only the original lines
    result = rdd4.map(lambda x: x[1])
    result.foreach(print)

    sc.stop()

if __name__ == "__main__":
    main()
```
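A side note, not from the original material: because Python tuples of integers already compare element by element, the same descending two-column ordering can be obtained without a custom class by sorting on the tuple key directly. The custom-key approach becomes necessary when the two columns need different sort directions. A minimal sketch reusing `rdd2` from above:
```python
# rdd2 holds ((col1, col2), original_line); tuples compare lexicographically,
# so sortByKey alone already gives "first column, then second column" ordering.
result = rdd2.sortByKey(ascending=False).map(lambda x: x[1])
result.foreach(print)
```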
4. Five practice exercises (mapped to the chapters)
🧠 Exercise 1 (5.2)
👉 Count word occurrences with reduceByKey
🧠 Exercise 2 (5.2)
👉 Compute per-key averages with groupByKey
🧠 Exercise 3 (5.3)
👉 Read a file → deduplicate → write the result to a file
🧠 Exercise 4 (5.4)
👉 Find the Top 3 largest values
🧠 Exercise 5 (5.4)
👉 Sort file contents and number each line
5. Final comprehensive exercise
🎯 Problem
👉 Input (one record per line):
```text
timestamp,province,city,user,product
```
👉 Requirements:
- Count the sales of each product in each province
- Find the Top 3 products for each province
- Order them by sales count, descending
6. Final answer code
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("FinalExam")
    sc = SparkContext(conf=conf)

    # 1. Read the data
    rdd = sc.textFile("file:///home/hadoop/sales.txt")

    # 2. Clean: keep only lines with exactly 5 fields
    rdd1 = rdd.filter(lambda line: len(line.split(",")) == 5)

    # 3. Map to ((province, product), 1)
    rdd2 = rdd1.map(lambda x: ((x.split(",")[1], x.split(",")[4]), 1))

    # 4. Aggregate the counts
    rdd3 = rdd2.reduceByKey(lambda a, b: a + b)

    # 5. Re-shape to (province, (product, count))
    rdd4 = rdd3.map(lambda x: (x[0][0], (x[0][1], x[1])))

    # 6. Group by province
    rdd5 = rdd4.groupByKey()

    # 7. Top 3 products per province, sorted by count descending
    rdd6 = rdd5.mapValues(
        lambda items: sorted(list(items), key=lambda x: x[1], reverse=True)[:3]
    )

    # 8. Print the results
    rdd6.foreach(print)

    sc.stop()

if __name__ == "__main__":
    main()
```
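If flat rows are preferred over one list per province (for example to save the result with saveAsTextFile), an optional follow-up step can flatten `rdd6`. This sketch assumes the variables from the answer above and a hypothetical output path:
```python
# Flatten (province, [(product, count), ...]) into (province, product, count) rows
rows = rdd6.flatMap(lambda x: [(x[0], p, c) for p, c in x[1]])
rows.saveAsTextFile("file:///home/hadoop/topn_output")  # hypothetical output directory
```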
Below are the reference answers for the five exercises (directly runnable, interview-grade style, with comments); each one maps to a topic in your slides.
🧠 Exercise 1 (5.2)
👉 Count word occurrences with reduceByKey
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("Exercise1")
    sc = SparkContext(conf=conf)

    # Raw data
    data = ["a", "b", "a", "c", "b", "a"]

    # 1. Create the RDD
    rdd = sc.parallelize(data)

    # 2. Map each word to (word, 1)
    pairRDD = rdd.map(lambda word: (word, 1))

    # 3. Count with reduceByKey
    result = pairRDD.reduceByKey(lambda a, b: a + b)

    # 4. Print, e.g. ('a', 3), ('b', 2), ('c', 1)
    result.foreach(print)

    sc.stop()

if __name__ == "__main__":
    main()
```
🧠 Exercise 2 (5.2)
👉 Compute per-key averages with groupByKey
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("Exercise2")
    sc = SparkContext(conf=conf)

    # Data: (key, value)
    data = [("A", 10), ("A", 20), ("B", 30), ("B", 40)]
    rdd = sc.parallelize(data)

    # 1. Group the values by key
    grouped = rdd.groupByKey()

    # 2. Average the values of each group
    result = grouped.map(lambda x: (x[0], sum(x[1]) / len(list(x[1]))))

    # e.g. ('A', 15.0), ('B', 35.0)
    result.foreach(print)
    sc.stop()

if __name__ == "__main__":
    main()
```
👉 Interview tip:
groupByKey is not recommended for large datasets: all values for a key are shuffled and held together, which can cause OOM.
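A hedged alternative sketch (not part of the original exercise): the same averages can be computed without groupByKey by carrying a (sum, count) pair through reduceByKey, which pre-aggregates on the map side. It assumes `rdd` from Exercise 2:
```python
# Avoid groupByKey: aggregate (sum, count) pairs, then divide
pairs = rdd.mapValues(lambda v: (v, 1))
sums = pairs.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
avgs = sums.mapValues(lambda t: t[0] / t[1])
avgs.foreach(print)  # e.g. ('A', 15.0), ('B', 35.0)
```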
🧠 Exercise 3 (5.3)
👉 Read a file → deduplicate → write the result to a file
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("Exercise3")
    sc = SparkContext(conf=conf)

    # 1. Read the file
    rdd = sc.textFile("file:///home/hadoop/input.txt")

    # 2. Remove duplicate lines
    distinctRDD = rdd.distinct()

    # 3. Write out (the output directory must not already exist)
    distinctRDD.saveAsTextFile("file:///home/hadoop/output")

    sc.stop()

if __name__ == "__main__":
    main()
```
👉 Interview point:
- Under the hood, distinct is essentially map + reduceByKey: map each element to a key, reduce the duplicates away, then map back.
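A minimal sketch of that equivalence, assuming any RDD `rdd` (this mirrors the idea behind PySpark's own implementation):
```python
# distinct expressed with map + reduceByKey
deduped = (rdd.map(lambda x: (x, None))     # element -> (element, None)
              .reduceByKey(lambda a, b: a)  # keep one value per key
              .map(lambda x: x[0]))         # drop the dummy value
```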
🧠 Exercise 4 (5.4)
👉 Find the Top 3 largest values
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

def main():
    conf = SparkConf().setMaster("local[*]").setAppName("Exercise4")
    sc = SparkContext(conf=conf)

    data = [10, 5, 30, 20, 50, 40]
    rdd = sc.parallelize(data)

    # Approach 1: sort descending, then take the first 3
    top3 = rdd.sortBy(lambda x: x, ascending=False).take(3)
    print("Top3:", top3)  # Top3: [50, 40, 30]

    sc.stop()

if __name__ == "__main__":
    main()
```
👉 Bonus interview answer:
- You can use `takeOrdered(3, key=lambda x: -x)` instead, which avoids sorting the entire RDD.
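A small runnable sketch of that alternative, on the same `rdd` as Exercise 4 (`rdd.top(3)` is a third equivalent option for numeric data):
```python
# takeOrdered keeps only the N "smallest" elements per partition (here smallest
# of -x, i.e. the largest x), so no full sort of the RDD is needed
print(rdd.takeOrdered(3, key=lambda x: -x))  # [50, 40, 30]
print(rdd.top(3))                            # [50, 40, 30]
```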
🧠 Exercise 5 (5.4)
👉 Sort file contents and number each line
```python
# coding:utf8
from pyspark import SparkConf, SparkContext

# Global counter used to number the sorted values
index = 0

def getIndex():
    global index
    index += 1
    return index

def main():
    conf = SparkConf().setMaster("local[1]").setAppName("Exercise5")
    sc = SparkContext(conf=conf)

    # 1. Read several files at once
    rdd = sc.textFile("file:///home/hadoop/file*.txt")

    # 2. Drop blank lines
    rdd1 = rdd.filter(lambda x: x.strip() != "")

    # 3. Parse integers
    rdd2 = rdd1.map(lambda x: int(x.strip()))

    # 4. Sort ascending into a single partition, so the global counter below
    #    produces one continuous numbering (it would restart per partition)
    rdd3 = rdd2.sortBy(lambda x: x, numPartitions=1)

    # 5. Number the sorted values
    rdd4 = rdd3.map(lambda x: (getIndex(), x))

    # 6. Print (index, value) pairs
    rdd4.foreach(print)

    sc.stop()

if __name__ == "__main__":
    main()
```
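A side note beyond the slides: `zipWithIndex` avoids the global counter entirely and works with any number of partitions (indices start at 0, so 1 is added to match the numbering above). A minimal sketch assuming `rdd2` from the answer:
```python
# Alternative numbering without a global variable
numbered = rdd2.sortBy(lambda x: x).zipWithIndex() \
               .map(lambda pair: (pair[1] + 1, pair[0]))
numbered.foreach(print)
```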
Core concepts at a glance:

| Exercise | Core concept |
|---|---|
| Exercise 1 | reduceByKey |
| Exercise 2 | groupByKey |
| Exercise 3 | file I/O + distinct |
| Exercise 4 | TopN |
| Exercise 5 | sorting + numbering |