





spark@SparkSingleNode:/usr/local/hadoop/hadoop-2.6.0$ sbin/start-dfs.sh


spark@SparkSingleNode:/usr/local/spark/spark-1.5.2-bin-hadoop2.6$ sbin/start-all.sh


spark@SparkSingleNode:/usr/local/spark/spark-1.5.2-bin-hadoop2.6/bin$ ./spark-shell --master spark://SparkSingleNode:7077 --executor-memory 1g


scala> sc
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@3bcc8f13

scala> val numbers = sc.parallelize
<console>:21: error: missing arguments for method parallelize in class SparkContext;
follow this method with `_' if you want to treat it as a partially applied function
val numbers = sc.parallelize

scala> val numbers = sc.parallelize(1 to 100)
numbers: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:21

scala> numbers.reduce(_+_)

took 11.790246 s
res1: Int = 5050


scala> val result = numbers.map(2*_)
result: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[1] at map at <console>:23

scala> val data = result.collect


/**
 * Reduces the elements of this RDD using the specified commutative and
 * associative binary operator.
 *
 * @param f binary operator used to combine two elements into one
 * @return the value obtained by combining every element with `f`
 * @throws UnsupportedOperationException if no task produced a value (empty RDD)
 */
def reduce(f: (T, T) => T): T = withScope {
  val cleanF = sc.clean(f)
  // Fold a single iterator; an empty iterator contributes nothing (None).
  val reducePartition: Iterator[T] => Option[T] = iter => {
    if (iter.hasNext) {
      Some(iter.reduceLeft(cleanF))
    } else {
      None
    }
  }
  var jobResult: Option[T] = None
  // Fold each task's partial result into the running total as results arrive.
  val mergeResult = (index: Int, taskResult: Option[T]) => {
    if (taskResult.isDefined) {
      jobResult = jobResult match {
        case Some(value) => Some(f(value, taskResult.get))
        case None => taskResult
      }
    }
  }
  sc.runJob(this, reducePartition, mergeResult)
  // Get the final result out of our Option, or throw an exception if the RDD was empty
  jobResult.getOrElse(throw new UnsupportedOperationException("empty collection"))
}


data: Array[Int] = Array(2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 160, 162, 164, 166, 168, 170, 172, 174, 176, 178, 180, 182, 184, 186, 188, 190, 192, 194, 196, 198, 200)



/**
 * Return an array that contains all of the elements in this RDD.
 *
 * @return a single array built by concatenating the arrays returned by the job
 */
def collect(): Array[T] = withScope {
  // Each task materialises its iterator as an array ...
  val results = sc.runJob(this, (iter: Iterator[T]) => iter.toArray)
  // ... and the per-task arrays are concatenated into one.
  Array.concat(results: _*)
}




scala> numbers
res2: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:21

scala> 1 to 100
res3: scala.collection.immutable.Range.Inclusive = Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100)

scala> numbers.count

took 0.649005 s
res4: Long = 100


/**
 * Return the number of elements in the RDD.
 *
 * Runs a job that applies `Utils.getIteratorSize` to each iterator handed to it
 * and adds up the sizes that come back.
 */
def count(): Long = {
  val sizes = sc.runJob(this, Utils.getIteratorSize _)
  sizes.sum
}


scala> val topN = numbers.take(5)

topN: Array[Int] = Array(1, 2, 3, 4, 5)


/**
 * Take the first num elements of the RDD. It works by first scanning one partition, and use the
 * results from that partition to estimate the number of additional partitions needed to satisfy
 * the limit.
 *
 * @note due to complications in the internal implementation, this method will raise
 * an exception if called on an RDD of `Nothing` or `Null`.
 *
 * @param num maximum number of elements to return
 * @return an array holding at most `num` elements
 */
def take(num: Int): Array[T] = withScope {
  if (num == 0) {
    new Array[T](0)
  } else {
    val buf = new ArrayBuffer[T]
    val totalParts = this.partitions.length
    var partsScanned = 0
    while (buf.size < num && partsScanned < totalParts) {
      // The number of partitions to try in this iteration. It is ok for this number to be
      // greater than totalParts because we actually cap it at totalParts in runJob.
      var numPartsToTry = 1
      if (partsScanned > 0) {
        // If we didn't find any rows after the previous iteration, quadruple and retry.
        // Otherwise, interpolate the number of partitions we need to try, but overestimate
        // it by 50%. We also cap the estimation in the end.
        if (buf.size == 0) {
          numPartsToTry = partsScanned * 4
        } else {
          // the left side of max is >=1 whenever partsScanned >= 2
          numPartsToTry = Math.max((1.5 * num * partsScanned / buf.size).toInt - partsScanned, 1)
          numPartsToTry = Math.min(numPartsToTry, partsScanned * 4)
        }
      }

      // Only ask each task for as many elements as are still missing.
      val left = num - buf.size
      val p = partsScanned until math.min(partsScanned + numPartsToTry, totalParts)
      val res = sc.runJob(this, (it: Iterator[T]) => it.take(left).toArray, p)

      res.foreach(buf ++= _.take(num - buf.size))
      partsScanned += numPartsToTry
    }

    buf.toArray
  }
}


scala> val scores = Array(Tuple2(1,100),Tuple2(1,100),Tuple2(2,100),Tuple2(2,100),Tuple2(3,100))
scores: Array[(Int, Int)] = Array((1,100), (1,100), (2,100), (2,100), (3,100))

scala> val content = sc.parallelize
<console>:21: error: missing arguments for method parallelize in class SparkContext;
follow this method with `_' if you want to treat it as a partially applied function
val content = sc.parallelize

scala> val content = sc.parallelize(scores)
content: org.apache.spark.rdd.RDD[(Int, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:23

scala> val data = content.countByKey()

took 10.556634 s
data: scala.collection.Map[Int,Long] = Map(2 -> 2, 1 -> 2, 3 -> 1)


/**
 * Count the number of elements for each key, collecting the results to a local Map.
 *
 * Note that this method should only be used if the resulting map is expected to be small, as
 * the whole thing is loaded into the driver's memory.
 * To handle very large results, consider using rdd.mapValues(_ => 1L).reduceByKey(_ + _), which
 * returns an RDD[T, Long] instead of a map.
 *
 * @return a map from each key to the number of elements that carry it
 */
def countByKey(): Map[K, Long] = self.withScope {
  // Replace every value with 1L, then add the ones up per key.
  val onePerElement = self.mapValues(_ => 1L)
  val totalPerKey = onePerElement.reduceByKey(_ + _)
  totalPerKey.collect().toMap
}

之前,在  rdd实战(rdd基本操作实战及transformation和action流程图)(源码)
scala> val partitionsReadmeRdd =  sc.textFile("hdfs://SparkSingleNode:9000/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).saveAsTextFile("~/partition1README.txt")


scala> val partitionsReadmeRdd =  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).saveAsTextFile("/partition1README.txt")

scala> val partitionsReadmeRdd =  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).saveAsTextFile("/partition1README.txt")


/**
 * Save this RDD as a text file, using string representations of elements.
 *
 * @param path destination passed to `saveAsHadoopFile`
 */
def saveAsTextFile(path: String): Unit = withScope {
  // https://issues.apache.org/jira/browse/SPARK-2075
  //
  // NullWritable is a `Comparable` in Hadoop 1.+, so the compiler cannot find an implicit
  // Ordering for it and will use the default `null`. However, it's a `Comparable[NullWritable]`
  // in Hadoop 2.+, so the compiler will call the implicit `Ordering.ordered` method to create an
  // Ordering for `NullWritable`. That's why the compiler will generate different anonymous
  // classes for `saveAsTextFile` in Hadoop 1.+ and Hadoop 2.+.
  //
  // Therefore, here we provide an explicit Ordering `null` to make sure the compiler generate
  // same bytecodes for `saveAsTextFile`.
  val nullWritableClassTag = implicitly[ClassTag[NullWritable]]
  val textClassTag = implicitly[ClassTag[Text]]
  val r = this.mapPartitions { iter =>
    // A single Text instance is reused for every element of the iterator.
    val text = new Text()
    iter.map { x =>
      text.set(x.toString)
      (NullWritable.get(), text)
    }
  }
  RDD.rddToPairRDDFunctions(r)(nullWritableClassTag, textClassTag, null)
    .saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path)
}

/**
 * Save this RDD as a compressed text file, using string representations of elements.
 *
 * @param path destination passed to `saveAsHadoopFile`
 * @param codec compression codec class passed to `saveAsHadoopFile`
 */
def saveAsTextFile(path: String, codec: Class[_ <: CompressionCodec]): Unit = withScope {
  // https://issues.apache.org/jira/browse/SPARK-2075
  val nullWritableClassTag = implicitly[ClassTag[NullWritable]]
  val textClassTag = implicitly[ClassTag[Text]]
  val r = this.mapPartitions { iter =>
    // A single Text instance is reused for every element of the iterator.
    val text = new Text()
    iter.map { x =>
      text.set(x.toString)
      (NullWritable.get(), text)
    }
  }
  RDD.rddToPairRDDFunctions(r)(nullWritableClassTag, textClassTag, null)
    .saveAsHadoopFile[TextOutputFormat[NullWritable, Text]](path, codec)
}


1、在某个步骤非常费时的情况下,不好使                                    (手动)
2、计算链条特别长的情况下                                           (手动)
3、checkpoint所在的rdd也一定要持久化数据      (注意:在checkpoint之前,进行persist)          (手动)
  先写,某个具体rdd.checkpoint  或   某个具体rdd.cache ,再写,  某个具体rdd.persist
4、shuffle之后   (因为shuffle之后,要网络传输,风险大)                          (手动)
5、shuffle之前    (框架,默认给我们做的,把数据持久化到本地磁盘)


/**
 * Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint
 * directory set with `SparkContext#setCheckpointDir` and all references to its parent
 * RDDs will be removed. This function must be called before any job has been
 * executed on this RDD. It is strongly recommended that this RDD is persisted in
 * memory, otherwise saving it on a file will require recomputation.
 *
 * @throws SparkException if no checkpoint directory has been set on the context
 */
def checkpoint(): Unit = RDDCheckpointData.synchronized {
  // NOTE: we use a global lock here due to complexities downstream with ensuring
  // children RDD partitions point to the correct parent partitions. In the future
  // we should revisit this consideration.
  if (context.checkpointDir.isEmpty) {
    throw new SparkException("Checkpoint directory has not been set in the SparkContext")
  } else if (checkpointData.isEmpty) {
    // Only the first call installs checkpoint data; later calls are no-ops.
    checkpointData = Some(new ReliableRDDCheckpointData(this))
  }
}
/**
 * Mark this RDD for persisting using the specified level.
 *
 * @param newLevel the target storage level
 * @param allowOverride whether to override any existing level with the new one
 * @return this RDD, with `storageLevel` set to `newLevel`
 * @throws UnsupportedOperationException if a different level is already assigned and
 *                                       `allowOverride` is false
 */
private def persist(newLevel: StorageLevel, allowOverride: Boolean): this.type = {
  // TODO: Handle changes of StorageLevel
  if (storageLevel != StorageLevel.NONE && newLevel != storageLevel && !allowOverride) {
    throw new UnsupportedOperationException(
      "Cannot change storage level of an RDD after it was already assigned a level")
  }
  // If this is the first time this RDD is marked for persisting, register it
  // with the SparkContext for cleanups and accounting. Do this only once.
  if (storageLevel == StorageLevel.NONE) {
    sc.cleaner.foreach(_.registerRDDForCleanup(this))
    sc.persistRDD(this)
  }
  storageLevel = newLevel
  this
}

/**
 * Set this RDD's storage level to persist its values across operations after the first time
 * it is computed. This can only be used to assign a new storage level if the RDD does not
 * have a storage level set yet. Local checkpointing is an exception.
 *
 * @param newLevel the storage level requested by the caller
 * @return this RDD
 */
def persist(newLevel: StorageLevel): this.type = {
  if (isLocallyCheckpointed) {
    // This means the user previously called localCheckpoint(), which should have already
    // marked this RDD for persisting. Here we should override the old storage level with
    // one that is explicitly requested by the user (after adapting it to use disk).
    persist(LocalRDDCheckpointData.transformStorageLevel(newLevel), allowOverride = true)
  } else {
    persist(newLevel, allowOverride = false)
  }
}

/**
 * Persist this RDD with the default storage level (`MEMORY_ONLY`).
 *
 * @return this RDD
 */
def persist(): this.type = {
  persist(StorageLevel.MEMORY_ONLY)
}

/**
 * Persist this RDD with the default storage level (`MEMORY_ONLY`).
 * Simply forwards to the no-argument `persist()`.
 *
 * @return this RDD
 */
def cache(): this.type = {
  persist()
}





MEMORY_AND_DISK 假设,我们指定的数据存储方式是 MEMORY_AND_DISK。那么,是不是同时存储到内存和磁盘呢?答:不是啊,亲。Spark一定是优先考虑内存的,只要内存足够,就不会考虑磁盘;只有内存不够了,才把部分数据放到磁盘。





scala> val partitionsReadmeRdd = sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).count

took 6.270138 s

scala> val partitionsReadmeRdd =  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache.count

took 4.147545 s

scala> val partitionsReadmeRdd = sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache.count

took 4.914212 s

scala> val cacheRdd = sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache

scala> cacheRdd.count

took 3.371621

scala> val cacheRdd = sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache

scala> cacheRdd.count

took 0.943499 s


scala>  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache.count

took 5.603903

scala>  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache.count

took 4.146627

scala>  sc.textFile("/README.md").flatMap(_.split(" ")).map(word =>(word,1)).reduceByKey(_+_,1).cache.count

took 3.071122


实际工程中, cache之后,如果有其他算子,则会,重新触发这个工作过程。












  小知识:cache之后,一定不能立即有其他算子!实际工程中, cache之后,如果有其他算子,则会,重新触发这个工作过程。







task在读取数据的时候,会拷贝一份数据副本(变量)。因为函数式编程要求变量不变,如果不拷贝,状态容易被改变。数据量小的时候(1、引用较小;2、数据本身小)问题不大;但变量大的时候,每个task都拷贝一份数据在内存中运行,容易产生OOM,网络传输也慢。所以要提前广播,通过冗余、共享来减少通信。






参考: http://blog.csdn.net/kxr0502/article/details/50574561







scala> val number = 10
number: Int = 10

scala> val broadcastNumber = sc.broadcast(number)

16/09/29 17:26:47 INFO storage.MemoryStore: ensureFreeSpace(40) called with curMem=1782734, maxMem=560497950
16/09/29 17:26:47 INFO storage.MemoryStore: Block broadcast_38 stored as values in memory (estimated size 40.0 B, free 532.8 MB)
16/09/29 17:26:48 INFO storage.MemoryStore: ensureFreeSpace(97) called with curMem=1782774, maxMem=560497950
16/09/29 17:26:48 INFO storage.MemoryStore: Block broadcast_38_piece0 stored as bytes in memory (estimated size 97.0 B, free 532.8 MB)
16/09/29 17:26:48 INFO storage.BlockManagerInfo: Added broadcast_38_piece0 in memory on (size: 97.0 B, free: 534.4 MB)
16/09/29 17:26:48 INFO spark.SparkContext: Created broadcast 38 from broadcast at <console>:23
broadcastNumber: org.apache.spark.broadcast.Broadcast[Int] = Broadcast(38)

scala> val data = sc.parallelize
<console>:21: error: missing arguments for method parallelize in class SparkContext;
follow this method with `_' if you want to treat it as a partially applied function
val data = sc.parallelize

scala> val data = sc.parallelize(1 to 100)
data: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[61] at parallelize at <console>:21

scala> val bn = data.map(_* broadcastNumber.value)
bn: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[62] at map at <console>:27





答:当然可以有很多,用 Java Bean 或 Scala 类封装起来,就可以了。

  如,在这里。广播变量是,broadcastNumber, 里,有变量value等。

scala> val broadcastNumber = sc.broadcast(number)

scala> val bn = data.map(_* broadcastNumber.value)

scala> bn.collect

res12: Array[Int] = Array(10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500, 510, 520, 530, 540, 550, 560, 570, 580, 590, 600, 610, 620, 630, 640, 650, 660, 670, 680, 690, 700, 710, 720, 730, 740, 750, 760, 770, 780, 790, 800, 810, 820, 830, 840, 850, 860, 870, 880, 890, 900, 910, 920, 930, 940, 950, 960, 970, 980, 990, 1000)




 参考: http://www.cnblogs.com/seaspring/p/5682053.html


package org.apache.spark.broadcast

import java.util.concurrent.atomic.AtomicLong

import scala.reflect.ClassTag

import org.apache.spark._import org.apache.spark.util.Utils

/**
 * Creates and removes broadcast variables by delegating to a [[BroadcastFactory]].
 *
 * The factory class is read from the `spark.broadcast.factory` configuration key and is
 * instantiated once, when this manager is constructed.
 *
 * @param isDriver forwarded to the factory's `initialize`
 * @param conf configuration the factory class name is read from; also forwarded to the factory
 * @param securityManager forwarded to the factory's `initialize`
 */
private[spark] class BroadcastManager(
    val isDriver: Boolean,
    conf: SparkConf,
    securityManager: SecurityManager)
  extends Logging {

  private var initialized = false
  private var broadcastFactory: BroadcastFactory = null

  // Must run at construction time: every other method dereferences broadcastFactory,
  // which stays null until initialize() has been called.
  initialize()

  // Called by SparkContext or Executor before using Broadcast
  private def initialize(): Unit = {
    synchronized {
      if (!initialized) {
        val broadcastFactoryClass =
          conf.get("spark.broadcast.factory", "org.apache.spark.broadcast.TorrentBroadcastFactory")

        broadcastFactory =
          Utils.classForName(broadcastFactoryClass).newInstance.asInstanceOf[BroadcastFactory]

        // Initialize appropriate BroadcastFactory and BroadcastObject
        broadcastFactory.initialize(isDriver, conf, securityManager)

        initialized = true
      }
    }
  }

  /** Stops the underlying factory. */
  def stop(): Unit = {
    broadcastFactory.stop()
  }

  // Source of the ids handed to newBroadcast; starts at 0 and increases by one per call.
  private val nextBroadcastId = new AtomicLong(0)

  /**
   * Creates a broadcast variable for `value_` under the next free id.
   *
   * @param value_ the value to broadcast
   * @param isLocal forwarded unchanged to the factory
   */
  def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean): Broadcast[T] = {
    broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement())
  }

  /**
   * Asks the factory to remove the broadcast with the given id.
   *
   * @param id id of the broadcast to remove
   * @param removeFromDriver forwarded unchanged to the factory
   * @param blocking forwarded unchanged to the factory
   */
  def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean): Unit = {
    broadcastFactory.unbroadcast(id, removeFromDriver, blocking)
  }
}


package org.apache.spark.broadcast

import java.io.Serializable

import org.apache.spark.SparkExceptionimport org.apache.spark.Loggingimport org.apache.spark.util.Utils

import scala.reflect.ClassTag

/**
 * A broadcast variable. Broadcast variables allow the programmer to keep a read-only variable
 * cached on each machine rather than shipping a copy of it with tasks. They can be used, for
 * example, to give every node a copy of a large input dataset in an efficient manner. Spark also
 * attempts to distribute broadcast variables using efficient broadcast algorithms to reduce
 * communication cost.
 *
 * Broadcast variables are created from a variable `v` by calling
 * [[org.apache.spark.SparkContext#broadcast]].
 * The broadcast variable is a wrapper around `v`, and its value can be accessed by calling the
 * `value` method. The interpreter session below shows this:
 *
 * {{{
 * scala> val broadcastVar = sc.broadcast(Array(1, 2, 3))
 * broadcastVar: org.apache.spark.broadcast.Broadcast[Array[Int]] = Broadcast(0)
 *
 * scala> broadcastVar.value
 * res0: Array[Int] = Array(1, 2, 3)
 * }}}
 *
 * After the broadcast variable is created, it should be used instead of the value `v` in any
 * functions run on the cluster so that `v` is not shipped to the nodes more than once.
 * In addition, the object `v` should not be modified after it is broadcast in order to ensure
 * that all nodes get the same value of the broadcast variable (e.g. if the variable is shipped
 * to a new node later).
 *
 * @param id A unique identifier for the broadcast variable.
 * @tparam T Type of the data contained in the broadcast variable.
 */
abstract class Broadcast[T: ClassTag](val id: Long) extends Serializable with Logging {

  /**
   * Flag signifying whether the broadcast variable is valid
   * (that is, not already destroyed) or not.
   */
  @volatile private var _isValid = true

  // Short form of the call site that destroyed this broadcast; empty until destroy() runs.
  private var _destroySite = ""

  /** Get the broadcasted value. */
  def value: T = {
    assertValid()
    getValue()
  }

  /**
   * Asynchronously delete cached copies of this broadcast on the executors.
   * If the broadcast is used after this is called, it will need to be re-sent to each executor.
   */
  def unpersist(): Unit = {
    unpersist(blocking = false)
  }

  /**
   * Delete cached copies of this broadcast on the executors. If the broadcast is used after
   * this is called, it will need to be re-sent to each executor.
   * @param blocking Whether to block until unpersisting has completed
   */
  def unpersist(blocking: Boolean): Unit = {
    assertValid()
    doUnpersist(blocking)
  }

  /**
   * Destroy all data and metadata related to this broadcast variable. Use this with caution;
   * once a broadcast variable has been destroyed, it cannot be used again.
   * This method blocks until destroy has completed
   */
  def destroy(): Unit = {
    destroy(blocking = true)
  }

  /**
   * Destroy all data and metadata related to this broadcast variable. Use this with caution;
   * once a broadcast variable has been destroyed, it cannot be used again.
   * @param blocking Whether to block until destroy has completed
   */
  private[spark] def destroy(blocking: Boolean): Unit = {
    assertValid()
    // Mark invalid and remember who destroyed it before delegating, so that any later
    // use fails in assertValid() with the recorded call site.
    _isValid = false
    _destroySite = Utils.getCallSite().shortForm
    logInfo("Destroying %s (from %s)".format(toString, _destroySite))
    doDestroy(blocking)
  }

  /**
   * Whether this Broadcast is actually usable. This should be false once persisted state is
   * removed from the driver.
   */
  private[spark] def isValid: Boolean = {
    _isValid
  }

  /**
   * Actually get the broadcasted value. Concrete implementations of Broadcast class must
   * define their own way to get the value.
   */
  protected def getValue(): T

  /**
   * Actually unpersist the broadcasted value on the executors. Concrete implementations of
   * Broadcast class must define their own logic to unpersist their own data.
   */
  protected def doUnpersist(blocking: Boolean): Unit

  /**
   * Actually destroy all data and metadata related to this broadcast variable.
   * Implementation of Broadcast class must define their own logic to destroy their own
   * state.
   */
  protected def doDestroy(blocking: Boolean): Unit

  /** Check if this broadcast is valid. If not valid, exception is thrown. */
  protected def assertValid(): Unit = {
    if (!_isValid) {
      throw new SparkException(
        "Attempted to use %s after it was destroyed (%s) ".format(toString, _destroySite))
    }
  }

  override def toString: String = "Broadcast(" + id + ")"
}









  累加器的特征:全局的,Accumulator:对于Executor只能修改但不可读,只对Driver可读(因为通过Driver控制整个集群的状态),不同的executor 修改,不会彼此覆盖(加锁机制)



scala> val sum = sc.accumulator(0)
sum: org.apache.spark.Accumulator[Int] = 0

scala> val data = sc.parallelize(1 to 100)
data: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[63] at parallelize at <console>:21

scala> val result = data.foreach(item =>sum += item)

took 6.548568 s
result: Unit = ()

scala> println(sum)

累加器 在记录集群全局唯一的状态的时候极其重要,保持唯一的全局状态的变量,所以其重要性不言而喻。


package org.apache.spark

import java.io.{ObjectInputStream, Serializable}

import scala.collection.generic.Growableimport scala.collection.Mapimport scala.collection.mutableimport scala.ref.WeakReferenceimport scala.reflect.ClassTag

import org.apache.spark.serializer.JavaSerializerimport org.apache.spark.util.Utils

/**
 * A data type that can be accumulated, ie has an commutative and associative "add" operation,
 * but where the result type, `R`, may be different from the element type being added, `T`.
 *
 * You must define how to add data, and how to merge two of these together.  For some data types,
 * such as a counter, these might be the same operation. In that case, you can use the simpler
 * [[org.apache.spark.Accumulator]]. They won't always be the same, though -- e.g., imagine you are
 * accumulating a set. You will add items to the set, and you will union two sets together.
 *
 * @param initialValue initial value of accumulator
 * @param param helper object defining how to add elements of type `R` and `T`
 * @param name human-readable name for use in Spark's web UI
 * @param internal if this [[Accumulable]] is internal. Internal [[Accumulable]]s will be reported
 *                 to the driver via heartbeats. For internal [[Accumulable]]s, `R` must be
 *                 thread safe so that they can be reported correctly.
 * @tparam R the full accumulated data (result type)
 * @tparam T partial data that can be added in
 */
class Accumulable[R, T] private[spark] (
    @transient initialValue: R,
    param: AccumulableParam[R, T],
    val name: Option[String],
    internal: Boolean)
  extends Serializable {

  private[spark] def this(
      @transient initialValue: R, param: AccumulableParam[R, T], internal: Boolean) = {
    this(initialValue, param, None, internal)
  }

  def this(@transient initialValue: R, param: AccumulableParam[R, T], name: Option[String]) =
    this(initialValue, param, name, false)

  def this(@transient initialValue: R, param: AccumulableParam[R, T]) =
    this(initialValue, param, None)

  val id: Long = Accumulators.newId

  @volatile @transient private var value_ : R = initialValue // Current value on master
  val zero = param.zero(initialValue)  // Zero value to be passed to workers
  private var deserialized = false

  // Record this instance in the global registry so that Accumulators.add can find it by id;
  // without this, updates are dropped with an "unknown accumulator id" warning.
  Accumulators.register(this)

  /**
   * If this [[Accumulable]] is internal. Internal [[Accumulable]]s will be reported to the driver
   * via heartbeats. For internal [[Accumulable]]s, `R` must be thread safe so that they can be
   * reported correctly.
   */
  private[spark] def isInternal: Boolean = internal

  /**
   * Add more data to this accumulator / accumulable
   * @param term the data to add
   */
  def += (term: T): Unit = { value_ = param.addAccumulator(value_, term) }

  /**
   * Add more data to this accumulator / accumulable
   * @param term the data to add
   */
  def add(term: T): Unit = { value_ = param.addAccumulator(value_, term) }

  /**
   * Merge two accumulable objects together
   *
   * Normally, a user will not want to use this version, but will instead call `+=`.
   * @param term the other `R` that will get merged with this
   */
  def ++= (term: R): Unit = { value_ = param.addInPlace(value_, term) }

  /**
   * Merge two accumulable objects together
   *
   * Normally, a user will not want to use this version, but will instead call `add`.
   * @param term the other `R` that will get merged with this
   */
  def merge(term: R): Unit = { value_ = param.addInPlace(value_, term) }

  /**
   * Access the accumulator's current value; only allowed on master.
   *
   * @throws UnsupportedOperationException if this instance was created by deserialization
   */
  def value: R = {
    if (!deserialized) {
      value_
    } else {
      throw new UnsupportedOperationException("Can't read accumulator value in task")
    }
  }

  /**
   * Get the current value of this accumulator from within a task.
   *
   * This is NOT the global value of the accumulator.  To get the global value after a
   * completed operation on the dataset, call `value`.
   *
   * The typical use of this method is to directly mutate the local value, eg., to add
   * an element to a Set.
   */
  def localValue: R = value_

  /**
   * Set the accumulator's value; only allowed on master.
   *
   * @throws UnsupportedOperationException if this instance was created by deserialization
   */
  def value_= (newValue: R): Unit = {
    if (!deserialized) {
      value_ = newValue
    } else {
      throw new UnsupportedOperationException("Can't assign accumulator value in task")
    }
  }

  /**
   * Set the accumulator's value; only allowed on master
   */
  def setValue(newValue: R): Unit = {
    this.value = newValue
  }

  // Called by Java when deserializing an object
  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    // A deserialized copy starts from zero and is flagged so that `value` refuses reads.
    value_ = zero
    deserialized = true
    // Automatically register the accumulator when it is deserialized with the task closure.
    //
    // Note internal accumulators sent with task are deserialized before the TaskContext is created
    // and are registered in the TaskContext constructor. Other internal accumulators, such SQL
    // metrics, still need to register here.
    val taskContext = TaskContext.get()
    if (taskContext != null) {
      taskContext.registerAccumulator(this)
    }
  }

  override def toString: String = if (value_ == null) "null" else value_.toString
}

/**
 * Helper object defining how to accumulate values of a particular type. An implicit
 * AccumulableParam needs to be available when you create [[Accumulable]]s of a specific type.
 *
 * @tparam R the full accumulated data (result type)
 * @tparam T partial data that can be added in
 */
trait AccumulableParam[R, T] extends Serializable {

  /**
   * Add additional data to the accumulator value. Is allowed to modify and return `r`
   * for efficiency (to avoid allocating objects).
   *
   * @param r the current value of the accumulator
   * @param t the data to be added to the accumulator
   * @return the new value of the accumulator
   */
  def addAccumulator(r: R, t: T): R

  /**
   * Merge two accumulated values together. Is allowed to modify and return the first value
   * for efficiency (to avoid allocating objects).
   *
   * @param r1 one set of accumulated data
   * @param r2 another set of accumulated data
   * @return both data sets merged together
   */
  def addInPlace(r1: R, r2: R): R

  /**
   * Return the "zero" (identity) value for an accumulator type, given its initial value. For
   * example, if R was a vector of N dimensions, this would return a vector of N zeroes.
   *
   * @param initialValue the value the accumulator was created with
   */
  def zero(initialValue: R): R
}

/**
 * [[AccumulableParam]] for result types that can be viewed as a growable collection:
 * single elements are appended with `+=` and whole collections are merged with `++=`.
 *
 * @tparam R the collection type being accumulated into
 * @tparam T the element type added to the collection
 */
private[spark] class
GrowableAccumulableParam[R <% Growable[T] with TraversableOnce[T] with Serializable: ClassTag, T]
  extends AccumulableParam[R, T] {

  /** Appends `elem` to `growable` in place and returns the same collection. */
  def addAccumulator(growable: R, elem: T): R = {
    growable += elem
    growable
  }

  /** Appends every element of `t2` to `t1` in place and returns `t1`. */
  def addInPlace(t1: R, t2: R): R = {
    t1 ++= t2
    t1
  }

  /** Returns an emptied copy of `initialValue`; `initialValue` itself is not cleared. */
  def zero(initialValue: R): R = {
    // We need to clone initialValue, but it's hard to specify that R should also be Cloneable.
    // Instead we'll serialize it to a buffer and load it back.
    val ser = new JavaSerializer(new SparkConf(false)).newInstance()
    val copy = ser.deserialize[R](ser.serialize(initialValue))
    copy.clear()   // In case it contained stuff
    copy
  }
}

/**
 * A simpler value of [[Accumulable]] where the result type being accumulated is the same
 * as the types of elements being merged, i.e. variables that are only "added" to through an
 * associative operation and can therefore be efficiently supported in parallel. They can be used
 * to implement counters (as in MapReduce) or sums. Spark natively supports accumulators of numeric
 * value types, and programmers can add support for new types.
 *
 * An accumulator is created from an initial value `v` by calling [[SparkContext#accumulator]].
 * Tasks running on the cluster can then add to it using the [[Accumulable#+=]] operator.
 * However, they cannot read its value. Only the driver program can read the accumulator's value,
 * using its value method.
 *
 * The interpreter session below shows an accumulator being used to add up the elements of an array:
 *
 * {{{
 * scala> val accum = sc.accumulator(0)
 * accum: spark.Accumulator[Int] = 0
 *
 * scala> sc.parallelize(Array(1, 2, 3, 4)).foreach(x => accum += x)
 * ...
 * 10/09/29 18:41:08 INFO SparkContext: Tasks finished in 0.317106 s
 *
 * scala> accum.value
 * res2: Int = 10
 * }}}
 *
 * @param initialValue initial value of accumulator
 * @param param helper object defining how to add elements of type `T`
 * @tparam T result type
 */
class Accumulator[T] private[spark] (
    @transient private[spark] val initialValue: T,
    param: AccumulatorParam[T],
    name: Option[String],
    internal: Boolean)
  extends Accumulable[T, T](initialValue, param, name, internal) {

  /** Named accumulator; the `internal` flag is passed as false. */
  def this(initialValue: T, param: AccumulatorParam[T], name: Option[String]) =
    this(initialValue, param, name, false)

  /** Unnamed accumulator; the `internal` flag is passed as false. */
  def this(initialValue: T, param: AccumulatorParam[T]) =
    this(initialValue, param, None, false)
}

/**
 * A simpler version of [[org.apache.spark.AccumulableParam]] where the only data type you can add
 * in is the same type as the accumulated value. An implicit AccumulatorParam object needs to be
 * available when you create Accumulators of a specific type.
 *
 * @tparam T type of value to accumulate
 */
trait AccumulatorParam[T] extends AccumulableParam[T, T] {

  /** Adding a single element is the same operation as merging two accumulated values. */
  def addAccumulator(t1: T, t2: T): T = addInPlace(t1, t2)
}

/** Implicit [[AccumulatorParam]] instances for `Double`, `Int`, `Long` and `Float`. */
object AccumulatorParam {

  // The following implicit objects were in SparkContext before 1.2 and users had to
  // `import SparkContext._` to enable them. Now we move them here to make the compiler find
  // them automatically. However, as there are duplicate codes in SparkContext for backward
  // compatibility, please update them accordingly if you modify the following implicit objects.

  /** Sums `Double` values; the zero value is `0.0`. */
  implicit object DoubleAccumulatorParam extends AccumulatorParam[Double] {
    def addInPlace(t1: Double, t2: Double): Double = t1 + t2
    def zero(initialValue: Double): Double = 0.0
  }

  /** Sums `Int` values; the zero value is `0`. */
  implicit object IntAccumulatorParam extends AccumulatorParam[Int] {
    def addInPlace(t1: Int, t2: Int): Int = t1 + t2
    def zero(initialValue: Int): Int = 0
  }

  /** Sums `Long` values; the zero value is `0L`. */
  implicit object LongAccumulatorParam extends AccumulatorParam[Long] {
    def addInPlace(t1: Long, t2: Long): Long = t1 + t2
    def zero(initialValue: Long): Long = 0L
  }

  /** Sums `Float` values; the zero value is `0f`. */
  implicit object FloatAccumulatorParam extends AccumulatorParam[Float] {
    def addInPlace(t1: Float, t2: Float): Float = t1 + t2
    def zero(initialValue: Float): Float = 0f
  }

  // TODO: Add AccumulatorParams for other types, e.g. lists and strings
}

// TODO: The multi-thread support in accumulators is kind of lame; check
// if there's a more intuitive way of doing it right
/**
 * Registry of the accumulators created in this JVM, keyed by id. It hands out ids and
 * merges updates into the registered accumulators.
 */
private[spark] object Accumulators extends Logging {
  /**
   * This global map holds the original accumulator objects that are created on the driver.
   * It keeps weak references to these objects so that accumulators can be garbage-collected
   * once the RDDs and user-code that reference them are cleaned up.
   */
  val originals = mutable.Map[Long, WeakReference[Accumulable[_, _]]]()

  private var lastId: Long = 0

  /** Returns the next id; the first id handed out is 1. */
  def newId(): Long = synchronized {
    lastId += 1
    lastId
  }

  /** Stores a weak reference to `a` under its id, replacing any previous entry. */
  def register(a: Accumulable[_, _]): Unit = synchronized {
    originals(a.id) = new WeakReference[Accumulable[_, _]](a)
  }

  /** Drops the entry for `accId`, if there is one. */
  def remove(accId: Long): Unit = {
    synchronized {
      originals.remove(accId)
    }
  }

  // Add values to the original accumulators with some given IDs
  def add(values: Map[Long, Any]): Unit = synchronized {
    for ((id, value) <- values) {
      if (originals.contains(id)) {
        // Since we are now storing weak references, we must check whether the underlying data
        // is valid.
        originals(id).get match {
          case Some(accum) => accum.asInstanceOf[Accumulable[Any, Any]] ++= value
          case None =>
            throw new IllegalAccessError("Attempted to access garbage collected Accumulator.")
        }
      } else {
        logWarning(s"Ignoring accumulator update for unknown accumulator id $id")
      }
    }
  }

}


/** Names and factory for the accumulators used to track internal metrics. */
private[spark] object InternalAccumulator {
  val PEAK_EXECUTION_MEMORY = "peakExecutionMemory"
  val TEST_ACCUMULATOR = "testAccumulator"

  // For testing only.
  // This needs to be a def since we don't want to reuse the same accumulator across stages.
  private def maybeTestAccumulator: Option[Accumulator[Long]] = {
    if (sys.props.contains("spark.testing")) {
      Some(new Accumulator(
        0L, AccumulatorParam.LongAccumulatorParam, Some(TEST_ACCUMULATOR), internal = true))
    } else {
      None
    }
  }

  /**
   * Accumulators for tracking internal metrics.
   *
   * These accumulators are created with the stage such that all tasks in the stage will
   * add to the same set of accumulators. We do this to report the distribution of accumulator
   * values across all tasks within each stage.
   *
   * @param sc context whose cleaner, if any, each new accumulator is registered with
   * @return the newly created accumulators
   */
  def create(sc: SparkContext): Seq[Accumulator[Long]] = {
    val internalAccumulators = Seq(
        // Execution memory refers to the memory used by internal data structures created
        // during shuffles, aggregations and joins. The value of this accumulator should be
        // approximately the sum of the peak sizes across all such data structures created
        // in this task. For SQL jobs, this only tracks all unsafe operators and ExternalSort.
        new Accumulator(
          0L, AccumulatorParam.LongAccumulatorParam, Some(PEAK_EXECUTION_MEMORY), internal = true)
      ) ++ maybeTestAccumulator.toSeq
    internalAccumulators.foreach { accumulator =>
      sc.cleaner.foreach(_.registerAccumulatorForCleanup(accumulator))
    }
    internalAccumulators
  }
}













