一、RDD 分区器简介
- Spark 分区器的父类是 Partitioner 抽象类
- 分区器直接决定了 RDD 中分区的个数、RDD 中每条数据经过 Shuffle 后进入哪个分区,进而决定了 Reduce 的个数
- 只有 Key-Value 类型的 RDD 才有分区器,非 Key-Value 类型的 RDD 的分区器(partitioner)值是 None
- 每个 RDD 的分区索引的范围:0~(numPartitions - 1)
二、HashPartitioner
默认的分区器,对于给定的 key,计算其 hashCode 对分区个数取余(结果取非负值),得到数据所在的分区索引;key 为 null 时固定进入 0 号分区
scala
/**
 * A [[Partitioner]] that picks the partition from the key's `hashCode`.
 *
 * @param partitions number of partitions; must not be negative
 */
class HashPartitioner(partitions: Int) extends Partitioner {
  require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")

  /** Number of partitions, exactly as passed to the constructor. */
  def numPartitions: Int = partitions

  /**
   * Returns the partition index for `key`.
   *
   * A `null` key always goes to partition 0; any other key is mapped through
   * `Utils.nonNegativeMod(key.hashCode, numPartitions)`.
   */
  def getPartition(key: Any): Int =
    if (key == null) {
      0
    } else {
      // NOTE(review): nonNegativeMod is defined elsewhere; presumably a modulo
      // whose result is never negative -- confirm against Utils.
      Utils.nonNegativeMod(key.hashCode, numPartitions)
    }

  /** Two HashPartitioners are equal when they have the same number of partitions. */
  override def equals(other: Any): Boolean = other match {
    case that: HashPartitioner => that.numPartitions == this.numPartitions
    case _ => false
  }

  /** Consistent with `equals`: derived from the partition count only. */
  override def hashCode: Int = numPartitions
}
三、RangePartitioner
将一定范围内的数据映射到一个分区中,尽量保证每个分区数据均匀,而且分区间有序
scala
/**
 * A [[Partitioner]] that places each key into a partition by comparing it against an
 * array of range boundaries (`rangeBounds`).
 *
 * Several member bodies are elided (`...`) in this excerpt; they are kept as-is.
 *
 * @param partitions requested number of partitions; must not be negative
 * @param rdd        the key-value RDD this partitioner is built for
 * @param ascending  when false, the computed partition index is mirrored
 *                   (`rangeBounds.length - index`)
 * @tparam K key type; requires an implicit `Ordering` and `ClassTag`
 * @tparam V value type
 */
class RangePartitioner[K: Ordering: ClassTag, V](
    partitions: Int,
    rdd: RDD[_ <: Product2[K, V]],
    private var ascending: Boolean = true)
  extends Partitioner {

  // We allow partitions = 0, which happens when sorting an empty RDD under the default settings.
  // The message must stay on a single line: an s"..." literal cannot span lines.
  require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions.")

  // NOTE(review): these members are vars; presumably so that readObject can restore
  // them after deserialization -- the readObject body is elided here, confirm.
  private var ordering = implicitly[Ordering[K]]

  // An array of upper bounds for the first (partitions - 1) partitions
  private var rangeBounds: Array[K] = {
    ...
  }

  /** One more than the number of range boundaries. */
  def numPartitions: Int = rangeBounds.length + 1

  private var binarySearch: ((Array[K], K) => Int) = CollectionsUtils.makeBinarySearch[K]

  /**
   * Returns the partition index for `key`.
   *
   * The key is cast to `K`, so passing a key of another type fails at runtime.
   */
  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[K]
    var partition = 0
    if (rangeBounds.length <= 128) {
      // If we have less than 128 partitions naive search
      while (partition < rangeBounds.length && ordering.gt(k, rangeBounds(partition))) {
        partition += 1
      }
    } else {
      // Determine which binary search method to use only once.
      partition = binarySearch(rangeBounds, k)
      // binarySearch either returns the match location or -[insertion point]-1
      if (partition < 0) {
        partition = -partition-1
      }
      // Clamp to the last partition index.
      if (partition > rangeBounds.length) {
        partition = rangeBounds.length
      }
    }
    // For descending order the index is mirrored across the boundary array.
    if (ascending) {
      partition
    } else {
      rangeBounds.length - partition
    }
  }

  override def equals(other: Any): Boolean = other match {
    ...
  }

  override def hashCode(): Int = {
    ...
  }

  // The opening brace must follow tryOrIOException on the same line; otherwise the
  // block is not passed to it as its argument.
  @throws(classOf[IOException])
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    ...
  }

  @throws(classOf[IOException])
  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    ...
  }
}
四、自定义 Partitioner
scala
/**
1.继承 Partitioner 抽象类
2.重写 numPartitions: Int 和 getPartition(key: Any): Int 方法
*/
/**
 * Demo entry point: builds a small key-value RDD, repartitions it with
 * `MyPartitioner`, and writes the result to the `output` path.
 */
object TestRDDPartitioner {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("partition")
    val sc = new SparkContext(conf)
    // Always stop the context, even if the job fails, so its resources are released.
    try {
      val rdd = sc.makeRDD(List(
        ("nba", "xxxxxxxxxxx"),
        ("cba", "xxxxxxxxxxx"),
        ("nba", "xxxxxxxxxxx"),
        ("ncaa", "xxxxxxxxxxx"),
        ("cuba", "xxxxxxxxxxx")
      ))
      val partRdd = rdd.partitionBy(new MyPartitioner)
      partRdd.saveAsTextFile("output")
    } finally {
      sc.stop()
    }
  }
}
/**
 * Custom [[Partitioner]] with three fixed partitions: key "nba" goes to
 * partition 0, key "cba" to partition 1, and every other key to partition 2.
 */
class MyPartitioner extends Partitioner {
  /** Number of partitions produced by this partitioner. */
  override def numPartitions: Int = 3

  /** Returns the partition index for the given key. */
  override def getPartition(key: Any): Int = key match {
    case "nba" => 0
    case "cba" => 1
    case _ => 2
  }

  // The partitioner carries no state, so every instance behaves identically.
  // Without equals/hashCode, two instances would only compare by reference.
  override def equals(other: Any): Boolean = other match {
    case _: MyPartitioner => true
    case _ => false
  }

  /** Consistent with `equals`: the same value for every instance. */
  override def hashCode: Int = numPartitions
}