赞
踩
RDD的依赖内部解密
视频学习来源:DT-大数据梦工厂 IMF传奇行动视频(后附王家林老师联系方式)
DAG逻辑图:
- /**
- * Generic function to combine the elements for each key using a custom set of aggregation
- * functions. This method is here for backward compatibility. It does not provide combiner
- * classtag information to the shuffle.
- *
- * Every argument is forwarded unchanged to `combineByKeyWithClassTag`, and the call runs
- * inside `self.withScope`. The trailing `(null)` fills that method's second parameter list
- * explicitly (presumably the combiner ClassTag, which would match the note above; confirm
- * against the signature of `combineByKeyWithClassTag`).
- *
- * @param createCombiner function of type `V => C` that builds a combiner from a value
- * @param mergeValue function of type `(C, V) => C` that merges a value into a combiner
- * @param mergeCombiners function of type `(C, C) => C` that merges two combiners
- * @param partitioner partitioner forwarded to `combineByKeyWithClassTag`
- * @param mapSideCombine flag forwarded to `combineByKeyWithClassTag`; defaults to `true`
- * @param serializer serializer forwarded to `combineByKeyWithClassTag`; defaults to `null`
- * @return the `RDD[(K, C)]` returned by `combineByKeyWithClassTag`
- * @see [[combineByKeyWithClassTag]]
- */
- def combineByKey[C](
- createCombiner: V => C,
- mergeValue: (C, V) => C,
- mergeCombiners: (C, C) => C,
- partitioner: Partitioner,
- mapSideCombine: Boolean = true,
- serializer: Serializer = null): RDD[(K, C)] = self.withScope {
- combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners,
- partitioner, mapSideCombine, serializer)(null)
- }
- /**
- * Simplified version of combineByKeyWithClassTag that hash-partitions the output RDD.
- * This method is here for backward compatibility. It does not provide combiner
- * classtag information to the shuffle.
- *
- * Forwards the three combiner functions and `numPartitions` to the matching
- * `combineByKeyWithClassTag` overload inside `self.withScope`, passing `null` explicitly
- * for that method's second parameter list (presumably the combiner ClassTag; confirm
- * against the signature of `combineByKeyWithClassTag`).
- *
- * @param createCombiner function of type `V => C` that builds a combiner from a value
- * @param mergeValue function of type `(C, V) => C` that merges a value into a combiner
- * @param mergeCombiners function of type `(C, C) => C` that merges two combiners
- * @param numPartitions partition count forwarded to `combineByKeyWithClassTag`
- * @return the `RDD[(K, C)]` returned by `combineByKeyWithClassTag`
- * @see [[combineByKeyWithClassTag]]
- */
- def combineByKey[C](
- createCombiner: V => C,
- mergeValue: (C, V) => C,
- mergeCombiners: (C, C) => C,
- numPartitions: Int): RDD[(K, C)] = self.withScope {
- combineByKeyWithClassTag(createCombiner, mergeValue, mergeCombiners, numPartitions)(null)
- }
- @DeveloperApi
- abstract class Dependency[T] extends Serializable { // serializable base type that the dependency classes below extend
- def rdd: RDD[T] // abstract accessor: the RDD this dependency refers to
- }
-
- /**
- * :: DeveloperApi ::
- * Base class for dependencies where each partition of the child RDD depends on a small number
- * of partitions of the parent RDD. Narrow dependencies allow for pipelined execution.
- *
- * Subclasses supply the child-to-parent partition mapping by implementing `getParents`.
- *
- * @param _rdd the RDD that this dependency exposes through `rdd`
- */
- @DeveloperApi
- abstract class NarrowDependency[T](_rdd: RDD[T]) extends Dependency[T] {
- /**
- * Get the parent partitions for a child partition.
- * @param partitionId a partition of the child RDD
- * @return the partitions of the parent RDD that the child partition depends upon
- */
- def getParents(partitionId: Int): Seq[Int]
- override def rdd: RDD[T] = _rdd // returns the RDD supplied to the constructor
- }
两种RDD的窄依赖:
- /**
- * :: DeveloperApi ::
- * Represents a one-to-one dependency between partitions of the parent and child RDDs.
- *
- * `getParents` returns a single-element list holding the same partition id it was given,
- * so child partition `i` depends only on parent partition `i`.
- *
- * @param rdd the RDD passed on to the `NarrowDependency` constructor
- */
- @DeveloperApi
- class OneToOneDependency[T](rdd: RDD[T]) extends NarrowDependency[T](rdd) {
- override def getParents(partitionId: Int): List[Int] = List(partitionId)
- }
- /**
- * :: DeveloperApi ::
- * Represents a one-to-one dependency between ranges of partitions in the parent and child RDDs.
- *
- * A child partition id in the half-open range `[outStart, outStart + length)` maps to the
- * parent partition at the same offset from `inStart`; any other id has no parent here and
- * `getParents` returns `Nil`.
- *
- * @param rdd the parent RDD
- * @param inStart the start of the range in the parent RDD
- * @param outStart the start of the range in the child RDD
- * @param length the length of the range
- */
- @DeveloperApi
- class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int)
- extends NarrowDependency[T](rdd) {
- override def getParents(partitionId: Int): List[Int] = {
- if (partitionId >= outStart && partitionId < outStart + length) { // id lies inside the child range
- List(partitionId - outStart + inStart) // same offset, rebased onto the parent range
- } else {
- Nil // outside the range: no parent partition through this dependency
- }
- }
- }
- class ShuffleDependency[K: ClassTag, V: ClassTag, C: ClassTag](
- @transient private val _rdd: RDD[_ <: Product2[K, V]],
- val partitioner: Partitioner,
- val serializer: Option[Serializer] = None,
- val keyOrdering: Option[Ordering[K]] = None,
- val aggregator: Option[Aggregator[K, V, C]] = None,
- val mapSideCombine: Boolean = false)
- extends Dependency[Product2[K, V]] {
注:Hadoop MapReduce中的mapper和reducer,相当于Spark中的map和reduceByKey算子。
DT大数据梦工厂
新浪微博:www.weibo.com/ilovepains/
微信公众号:DT_Spark
博客:http://blog.sina.com.cn/ilovepains
TEL:18610086859
Email:18610086859@vip.126.com
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。