Source code analysis of the Spark job scheduling flow
High-level flow of job submission (the original post shows a flow diagram here).
SparkContext: creating the scheduler
// Create and start the scheduler
val (sched, ts) = SparkContext.createTaskScheduler(this, master)
_schedulerBackend = sched
_taskScheduler = ts
_dagScheduler = new DAGScheduler(this)
_heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)

private def createTaskScheduler(
    sc: SparkContext,
    master: String): (SchedulerBackend, TaskScheduler) = {
  // Regular expression used for local[N] and local[*] master formats
  val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r
  // Regular expression for local[N, maxRetries], used in tests with failing tasks
  val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+|\*)\s*,\s*([0-9]+)\]""".r
  // Regular expression for simulating a Spark cluster of [N, cores, memory] locally
  val LOCAL_CLUSTER_REGEX = """local-cluster\[\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*]""".r
  // Regular expression for connecting to Spark deploy clusters
  val SPARK_REGEX = """spark://(.*)""".r
  // Regular expression for connection to Mesos cluster by mesos:// or zk:// url
  val MESOS_REGEX = """(mesos|zk)://.*""".r
  // Regular expression for connection to Simr cluster
  val SIMR_REGEX = """simr://(.*)""".r

  // When running locally, don't try to re-execute tasks on failure.
  val MAX_LOCAL_TASK_FAILURES = 1

  master match {
    case "local" =>
      val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
      val backend = new LocalBackend(sc.getConf, scheduler, 1)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_N_REGEX(threads) =>
      def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
      // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads.
      val threadCount = if (threads == "*") localCpuCount else threads.toInt
      if (threadCount <= 0) {
        throw new SparkException(s"Asked to run locally with $threadCount threads")
      }
      val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
      val backend = new LocalBackend(sc.getConf, scheduler, threadCount)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
      def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
      // local[*, M] means the number of cores on the computer with M failures
      // local[N, M] means exactly N threads with M failures
      val threadCount = if (threads == "*") localCpuCount else threads.toInt
      val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
      val backend = new LocalBackend(sc.getConf, scheduler, threadCount)
      scheduler.initialize(backend)
      (backend, scheduler)

    case SPARK_REGEX(sparkUrl) =>
      val scheduler = new TaskSchedulerImpl(sc)
      val masterUrls = sparkUrl.split(",").map("spark://" + _)
      val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
      // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
      val memoryPerSlaveInt = memoryPerSlave.toInt
      if (sc.executorMemory > memoryPerSlaveInt) {
        throw new SparkException(
          "Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format(
            memoryPerSlaveInt, sc.executorMemory))
      }

      val scheduler = new TaskSchedulerImpl(sc)
      val localCluster = new LocalSparkCluster(
        numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt, sc.conf)
      val masterUrls = localCluster.start()
      val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
      scheduler.initialize(backend)
      backend.shutdownCallback = (backend: SparkDeploySchedulerBackend) => {
        localCluster.stop()
      }
      (backend, scheduler)

    case "yarn-standalone" | "yarn-cluster" =>
      if (master == "yarn-standalone") {
        logWarning(
          "\"yarn-standalone\" is deprecated as of Spark 1.0. Use \"yarn-cluster\" instead.")
      }
      val scheduler = try {
        val clazz = Utils.classForName("org.apache.spark.scheduler.cluster.YarnClusterScheduler")
        val cons = clazz.getConstructor(classOf[SparkContext])
        cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
      } catch {
        // TODO: Enumerate the exact reasons why it can fail
        // But irrespective of it, it means we cannot proceed !
        case e: Exception => {
          throw new SparkException("YARN mode not available ?", e)
        }
      }
      val backend = try {
        val clazz =
          Utils.classForName("org.apache.spark.scheduler.cluster.YarnClusterSchedulerBackend")
        val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
        cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
      } catch {
        case e: Exception => {
          throw new SparkException("YARN mode not available ?", e)
        }
      }
      scheduler.initialize(backend)
      (backend, scheduler)

    case "yarn-client" =>
      val scheduler = try {
        val clazz = Utils.classForName("org.apache.spark.scheduler.cluster.YarnScheduler")
        val cons = clazz.getConstructor(classOf[SparkContext])
        cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
      } catch {
        case e: Exception => {
          throw new SparkException("YARN mode not available ?", e)
        }
      }

      val backend = try {
        val clazz =
          Utils.classForName("org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend")
        val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
        cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
      } catch {
        case e: Exception => {
          throw new SparkException("YARN mode not available ?", e)
        }
      }

      scheduler.initialize(backend)
      (backend, scheduler)

    case mesosUrl @ MESOS_REGEX(_) =>
      MesosNativeLibrary.load()
      val scheduler = new TaskSchedulerImpl(sc)
      val coarseGrained = sc.conf.getBoolean("spark.mesos.coarse", false)
      val url = mesosUrl.stripPrefix("mesos://") // strip scheme from raw Mesos URLs
      val backend = if (coarseGrained) {
        new CoarseMesosSchedulerBackend(scheduler, sc, url, sc.env.securityManager)
      } else {
        new MesosSchedulerBackend(scheduler, sc, url)
      }
      scheduler.initialize(backend)
      (backend, scheduler)

    case SIMR_REGEX(simrUrl) =>
      val scheduler = new TaskSchedulerImpl(sc)
      val backend = new SimrSchedulerBackend(scheduler, sc, simrUrl)
      scheduler.initialize(backend)
      (backend, scheduler)

    case _ =>
      throw new SparkException("Could not parse Master URL: '" + master + "'")
  }
}
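As a quick illustration of how the match above dispatches on the master string, here is a small self-contained sketch (illustrative only, not Spark code; the object name and descriptions are mine) that runs a few master URLs through the same regular expressions:

object MasterUrlDemo {
  // Same patterns as in SparkContext.createTaskScheduler above.
  val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r
  val SPARK_REGEX = """spark://(.*)""".r

  def describe(master: String): String = master match {
    case "local"                => "TaskSchedulerImpl + LocalBackend with 1 thread"
    case LOCAL_N_REGEX(threads) => s"TaskSchedulerImpl + LocalBackend with $threads thread(s)"
    case SPARK_REGEX(sparkUrl)  => s"TaskSchedulerImpl + SparkDeploySchedulerBackend (master list: $sparkUrl)"
    case _                      => "other deploy mode (yarn / mesos / simr / ...)"
  }

  def main(args: Array[String]): Unit = {
    Seq("local", "local[4]", "local[*]", "spark://node1:7077,node2:7077").foreach { m =>
      println(s"$m => ${describe(m)}")
    }
  }
}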
Every RDD action ends up calling SparkContext.runJob; for example, count() is defined directly on top of it, as the sketch below shows.
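The count() definition is quoted again later in a comment inside handleJobSubmitted; a plain driver program like the following is all it takes to kick off the whole flow (a minimal local-mode sketch, object and app names are mine):

import org.apache.spark.{SparkConf, SparkContext}

// In RDD.scala: def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum
object CountDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("count-demo").setMaster("local[2]"))
    val n = sc.parallelize(1 to 100, 4).count() // action -> SparkContext.runJob -> DAGScheduler.runJob
    println(s"count = $n")
    sc.stop()
  }
}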
1. SparkContext's runJob method:
def runJob[T, U: ClassTag](
    rdd: RDD[T],                               // the concrete RDD instance the action was called on
    func: (TaskContext, Iterator[T]) => U,     // the per-partition computation executed for this action
    partitions: Seq[Int],                      // partition indices to run on, from 0 to partitions.size - 1
    resultHandler: (Int, U) => Unit): Unit = { // callback invoked on the driver with each task's result
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}
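A small aside on the spark.logLineage check above: setting spark.logLineage=true on the SparkConf makes runJob log the lineage for every job, and you can inspect the same information yourself with RDD.toDebugString. A short sketch, assuming an existing SparkContext named sc (as in the earlier driver example):

// toDebugString prints the same recursive dependency chain that runJob logs when spark.logLineage=true.
val rdd = sc.parallelize(1 to 10, 2).map(_ * 2).filter(_ > 5)
println(rdd.toDebugString) // shows the RDD lineage
rdd.count()                // triggers SparkContext.runJob shown above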
2. DAGScheduler's runJob method:
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val start = System.nanoTime
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  // Block until the job has finished.
  waiter.awaitResult() match {
    case JobSucceeded =>
      logInfo("Job %d finished: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
    case JobFailed(exception: Exception) =>
      logInfo("Job %d failed: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
      val callerStackTrace = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
      throw exception
  }
}

def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
        "Total number of partitions: " + maxPartitions)
  }

  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    // Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }

  assert(partitions.size > 0)
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  // Post a JobSubmitted event onto the DAGScheduler's event queue.
  eventProcessLoop.post(JobSubmitted(
    jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
  waiter
}
submitJob posts a JobSubmitted event onto an in-process queue (the DAGScheduler does not use the Akka actor model here; since version 1.4 it went back to the plain event-queue approach used before 1.0). So where is that queue consumed? See below.
At the end of the DAGScheduler class there is this snippet:
// Start the event thread and register the metrics source at the end of the constructor
env.metricsSystem.registerSource(metricsSource)
eventProcessLoop.start()
DAGSchedulerEventProcessLoop extends EventLoop; let's look at EventLoop's event thread and its start method:
// A daemon thread keeps draining the event queue and calls onReceive() for each event.
private val eventThread = new Thread(name) {
  setDaemon(true)

  override def run(): Unit = {
    try {
      while (!stopped.get) {
        val event = eventQueue.take()
        try {
          onReceive(event)
        } catch {
          case NonFatal(e) => {
            try {
              onError(e)
            } catch {
              case NonFatal(e) => logError("Unexpected error in " + name, e)
            }
          }
        }
      }
    } catch {
      case ie: InterruptedException => // exit even if eventQueue is not empty
      case NonFatal(e) => logError("Unexpected error in " + name, e)
    }
  }
}

def start(): Unit = {
  if (stopped.get) {
    throw new IllegalStateException(name + " has already been stopped")
  }
  // Call onStart before starting the event thread to make sure it happens before onReceive
  onStart()
  eventThread.start()
}
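The pattern above is a plain single-consumer blocking queue. As a self-contained illustration (not Spark's EventLoop, just the same idea; class and names are mine), a minimal version looks like this:

import java.util.concurrent.LinkedBlockingDeque
import java.util.concurrent.atomic.AtomicBoolean

// Minimal sketch of the producer/consumer pattern used by the DAGScheduler's event loop.
class MiniEventLoop[E](name: String)(handler: E => Unit) {
  private val queue = new LinkedBlockingDeque[E]()
  private val stopped = new AtomicBoolean(false)

  private val thread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = {
      try {
        while (!stopped.get) {
          val event = queue.take() // blocks until an event is posted
          try handler(event) catch { case e: Exception => e.printStackTrace() }
        }
      } catch {
        case _: InterruptedException => () // stop() interrupts the thread to break out of take()
      }
    }
  }

  def start(): Unit = thread.start()
  def post(event: E): Unit = queue.put(event)
  def stop(): Unit = { stopped.set(true); thread.interrupt() }
}

// Usage: events posted from any thread are handled sequentially on the loop's own thread.
// val loop = new MiniEventLoop[String]("demo-loop")(e => println(s"handled $e"))
// loop.start(); loop.post("JobSubmitted"); loop.stop()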
Next, look at the onReceive() method, implemented by DAGSchedulerEventProcessLoop:
/**
 * The main event loop of the DAG scheduler.
 */
override def onReceive(event: DAGSchedulerEvent): Unit = {
  val timerContext = timer.time()
  try {
    doOnReceive(event)
  } finally {
    timerContext.stop()
  }
}

private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
  case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
    dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)

  case MapStageSubmitted(jobId, dependency, callSite, listener, properties) =>
    dagScheduler.handleMapStageSubmitted(jobId, dependency, callSite, listener, properties)

  case StageCancelled(stageId) =>
    dagScheduler.handleStageCancellation(stageId)

  case JobCancelled(jobId) =>
    dagScheduler.handleJobCancellation(jobId)

  case JobGroupCancelled(groupId) =>
    dagScheduler.handleJobGroupCancelled(groupId)

  case AllJobsCancelled =>
    dagScheduler.doCancelAllJobs()

  case ExecutorAdded(execId, host) =>
    dagScheduler.handleExecutorAdded(execId, host)

  case ExecutorLost(execId) =>
    dagScheduler.handleExecutorLost(execId, fetchFailed = false)

  case BeginEvent(task, taskInfo) =>
    dagScheduler.handleBeginEvent(task, taskInfo)

  case GettingResultEvent(taskInfo) =>
    dagScheduler.handleGetTaskResult(taskInfo)

  case completion @ CompletionEvent(task, reason, _, _, taskInfo, taskMetrics) =>
    dagScheduler.handleTaskCompletion(completion)

  case TaskSetFailed(taskSet, reason, exception) =>
    dagScheduler.handleTaskSetFailed(taskSet, reason, exception)

  case ResubmitFailedStages =>
    dagScheduler.resubmitFailedStages()
}
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  var finalStage: ResultStage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    // Create the finalStage: every job has exactly one, and the remaining stages are derived
    // from it. The RDD passed in here is the last RDD of the job, so it is wrapped as a ResultStage.
    // (In RDD: def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum)
    finalStage = newResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }

  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    job.jobId, callSite.shortForm, partitions.length))
  logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
  logInfo("Parents of final stage: " + finalStage.parents)
  logInfo("Missing parents: " + getMissingParentStages(finalStage))

  val jobSubmissionTime = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = job
  activeJobs += job
  finalStage.resultOfJob = Some(job)
  val stageIds = jobIdToStageIds(jobId).toArray
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  listenerBus.post(
    SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
  submitStage(finalStage)

  submitWaitingStages()
}
/** Submits stage, but first recursively submits any missing parents. */
// The first call comes in with the ResultStage; the ShuffleMapStages built from its
// parent RDDs are submitted recursively from here.
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id) // find the missing parent stages
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)
      } else {
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
Next, step into the submitMissingTasks method:
/** Called when stage's parents are available and we can now do its task. */
// Builds the tasks for the stage and ends by handing a TaskSet to taskScheduler.submitTasks.
private def submitMissingTasks(stage: Stage, jobId: Int) {
  logDebug("submitMissingTasks(" + stage + ")")
  // Get our pending tasks and remember them in our pendingTasks entry
  stage.pendingTasks.clear()

  // First figure out the indexes of partition ids to compute.
  val (allPartitions: Seq[Int], partitionsToCompute: Seq[Int]) = {
    stage match { // a stage is either a ShuffleMapStage or a ResultStage
      case stage: ShuffleMapStage =>
        val allPartitions = 0 until stage.numPartitions
        val filteredPartitions = allPartitions.filter { id => stage.outputLocs(id).isEmpty }
        (allPartitions, filteredPartitions)
      case stage: ResultStage =>
        val job = stage.resultOfJob.get
        val allPartitions = 0 until job.numPartitions
        val filteredPartitions = allPartitions.filter { id => !job.finished(id) }
        (allPartitions, filteredPartitions)
    }
  }

  // Create internal accumulators if the stage has no accumulators initialized.
  // Reset internal accumulators only if this stage is not partially submitted
  // Otherwise, we may override existing accumulator values from some tasks
  if (stage.internalAccumulators.isEmpty || allPartitions == partitionsToCompute) {
    stage.resetInternalAccumulators()
  }

  val properties = jobIdToActiveJob.get(stage.firstJobId).map(_.properties).orNull

  runningStages += stage
  // SparkListenerStageSubmitted should be posted before testing whether tasks are
  // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
  // will be posted, which should always come after a corresponding SparkListenerStageSubmitted
  // event.
  outputCommitCoordinator.stageStart(stage.id)
  val taskIdToLocations = try {
    stage match {
      case s: ShuffleMapStage =>
        partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
      case s: ResultStage =>
        val job = s.resultOfJob.get
        partitionsToCompute.map { id =>
          val p = s.partitions(id)
          (id, getPreferredLocs(stage.rdd, p))
        }.toMap
    }
  } catch {
    case NonFatal(e) =>
      stage.makeNewStageAttempt(partitionsToCompute.size)
      listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
      abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
      runningStages -= stage
      return
  }

  stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
  listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

  // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
  // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
  // the serialized copy of the RDD and for each task we will deserialize it, which means each
  // task gets a different copy of the RDD. This provides stronger isolation between tasks that
  // might modify state of objects referenced in their closures. This is necessary in Hadoop
  // where the JobConf/Configuration object is not thread-safe.
  var taskBinary: Broadcast[Array[Byte]] = null
  try {
    // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
    // For ResultTask, serialize and broadcast (rdd, func).
    val taskBinaryBytes: Array[Byte] = stage match {
      case stage: ShuffleMapStage =>
        closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
      case stage: ResultStage =>
        closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array()
    }

    taskBinary = sc.broadcast(taskBinaryBytes)
  } catch {
    // In the case of a failure during serialization, abort the stage.
    case e: NotSerializableException =>
      abortStage(stage, "Task not serializable: " + e.toString, Some(e))
      runningStages -= stage
      // Abort execution
      return
    case NonFatal(e) =>
      abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}", Some(e))
      runningStages -= stage
      return
  }

  val tasks: Seq[Task[_]] = try {
    stage match {
      case stage: ShuffleMapStage =>
        partitionsToCompute.map { id =>
          val locs = taskIdToLocations(id)
          val part = stage.rdd.partitions(id)
          new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, stage.internalAccumulators)
        }

      case stage: ResultStage =>
        val job = stage.resultOfJob.get
        partitionsToCompute.map { id =>
          val p: Int = stage.partitions(id)
          val part = stage.rdd.partitions(p)
          val locs = taskIdToLocations(id)
          new ResultTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, id, stage.internalAccumulators)
        }
    }
  } catch {
    case NonFatal(e) =>
      abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
      runningStages -= stage
      return
  }

  if (tasks.size > 0) {
    logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
    stage.pendingTasks ++= tasks
    logDebug("New pending tasks: " + stage.pendingTasks)
    // Wrap the stage's tasks (one per partition to compute) into a TaskSet and hand it
    // to the TaskScheduler.
    taskScheduler.submitTasks(new TaskSet(
      tasks.toArray, stage.id, stage.latestInfo.attemptId, stage.firstJobId, properties))
    stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
  } else {
    // Because we posted SparkListenerStageSubmitted earlier, we should mark
    // the stage as completed here in case there are no tasks to run
    markStageAsFinished(stage, None)

    val debugString = stage match {
      case stage: ShuffleMapStage =>
        s"Stage ${stage} is actually done; " +
          s"(available: ${stage.isAvailable}," +
          s"available outputs: ${stage.numAvailableOutputs}," +
          s"partitions: ${stage.numPartitions})"
      case stage : ResultStage =>
        s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
    }
    logDebug(debugString)
  }
}
TaskSchedulerImpl's submitTasks method:

override def submitTasks(taskSet: TaskSet) {
  val tasks = taskSet.tasks
  logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
  this.synchronized {
    // Create a TaskSetManager. It tracks every task in this TaskSet; when a task finishes,
    // the taskScheduler removes it from the manager.
    val manager = createTaskSetManager(taskSet, maxTaskFailures)
    val stage = taskSet.stageId
    val stageTaskSets =
      taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
    stageTaskSets(taskSet.stageAttemptId) = manager
    val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
      ts.taskSet != taskSet && !ts.isZombie
    }
    if (conflictingTaskSet) {
      throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
        s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
    }
    // Add the manager to the rootPool; the pool (FIFO or FAIR) was attached to the
    // schedulableBuilder when the scheduler was initialized.
    schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)

    if (!isLocal && !hasReceivedTask) {
      starvationTimer.scheduleAtFixedRate(new TimerTask() {
        override def run() {
          if (!hasLaunchedTask) {
            logWarning("Initial job has not accepted any resources; " +
              "check your cluster UI to ensure that workers are registered " +
              "and have sufficient resources")
          } else {
            this.cancel()
          }
        }
      }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
    }
    hasReceivedTask = true
  }
  // Ask the backend to start scheduling. backend is a concrete SchedulerBackend
  // implementation; in yarn-cluster mode it is a CoarseGrainedSchedulerBackend.
  backend.reviveOffers()
}
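Regarding the FIFO/FAIR pool mentioned above: which schedulableBuilder (and thus how the rootPool orders TaskSetManagers) is used is controlled by spark.scheduler.mode. A short, hedged sketch of switching to fair scheduling; the allocation-file path and pool name are illustrative, not part of the Spark source analysed here:

val conf = new SparkConf()
  .setAppName("fair-scheduling-demo")
  .set("spark.scheduler.mode", "FAIR")                                   // default is FIFO
  .set("spark.scheduler.allocation.file", "/path/to/fairscheduler.xml")  // illustrative path
val sc = new SparkContext(conf)

// Jobs submitted from this thread go into the named pool (pool name is illustrative).
sc.setLocalProperty("spark.scheduler.pool", "etl_pool")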
CoarseGrainedSchedulerBackend:
override def reviveOffers() {
  driverEndpoint.send(ReviveOffers)
}

override def start() {
  val properties = new ArrayBuffer[(String, String)]
  for ((key, value) <- scheduler.sc.conf.getAll) {
    if (key.startsWith("spark.")) {
      properties += ((key, value))
    }
  }

  // TODO (prashant) send conf instead of properties
  // Set up the driver endpoint used for RPC communication with the executors.
  driverEndpoint = rpcEnv.setupEndpoint(ENDPOINT_NAME, createDriverEndpoint(properties))
}

// Create the DriverEndpoint (an RpcEndpoint, playing the role the actor used to).
protected def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
  new DriverEndpoint(rpcEnv, properties)
}
Let's look at DriverEndpoint's receive method:
override def receive: PartialFunction[Any, Unit] = {
  case StatusUpdate(executorId, taskId, state, data) =>
    scheduler.statusUpdate(taskId, state, data.value)
    if (TaskState.isFinished(state)) {
      executorDataMap.get(executorId) match {
        case Some(executorInfo) =>
          executorInfo.freeCores += scheduler.CPUS_PER_TASK
          makeOffers(executorId)
        case None =>
          // Ignoring the update since we don't know about the executor.
          logWarning(s"Ignored task status update ($taskId state $state) " +
            s"from unknown executor with ID $executorId")
      }
    }

  // The driver also sends ReviveOffers to itself periodically
  // (every spark.scheduler.revive.interval, 1s by default), so makeOffers runs regularly.
  case ReviveOffers =>
    makeOffers()

  case KillTask(taskId, executorId, interruptThread) =>
    executorDataMap.get(executorId) match {
      case Some(executorInfo) =>
        executorInfo.executorEndpoint.send(KillTask(taskId, executorId, interruptThread))
      case None =>
        // Ignoring the task kill since the executor is not registered.
        logWarning(s"Attempted to kill task $taskId for unknown executor $executorId.")
    }
}

// Make fake resource offers on all executors
private def makeOffers() {
  // Filter out executors under killing; keep only the active executors.
  val activeExecutors = executorDataMap.filterKeys(!executorsPendingToRemove.contains(_))
  val workOffers = activeExecutors.map { case (id, executorData) =>
    new WorkerOffer(id, executorData.executorHost, executorData.freeCores) // one offer per executor
  }.toSeq
  launchTasks(scheduler.resourceOffers(workOffers))
}
Back in TaskSchedulerImpl, resourceOffers is what makeOffers calls:

// Called by the SchedulerBackend: builds the TaskDescriptions that will be sent to the executors.
def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
  // Mark each slave as alive and remember its hostname
  // Also track if new executor is added
  var newExecAvail = false
  // Walk over the offered resources and update the executor-related bookkeeping maps.
  for (o <- offers) {
    executorIdToHost(o.executorId) = o.host
    activeExecutorIds += o.executorId
    if (!executorsByHost.contains(o.host)) {
      executorsByHost(o.host) = new HashSet[String]()
      executorAdded(o.executorId, o.host)
      newExecAvail = true
    }
    for (rack <- getRackForHost(o.host)) {
      hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
    }
  }

  // Randomly shuffle offers to avoid always placing tasks on the same set of workers.
  val shuffledOffers = Random.shuffle(offers)
  // Build a list of tasks to assign to each worker.
  val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
  val availableCpus = shuffledOffers.map(o => o.cores).toArray
  // Get the TaskSetManagers in scheduling order from the root pool (an ArrayBuffer[TaskSetManager]).
  val sortedTaskSets = rootPool.getSortedTaskSetQueue
  for (taskSet <- sortedTaskSets) {
    logDebug("parentName: %s, name: %s, runningTasks: %s".format(
      taskSet.parent.name, taskSet.name, taskSet.runningTasks))
    if (newExecAvail) {
      taskSet.executorAdded()
    }
  }

  // Take each TaskSet in our scheduling order, and then offer it each node in increasing order
  // of locality levels so that it gets a chance to launch local tasks on all of them.
  // NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
  var launchedTask = false
  for (taskSet <- sortedTaskSets; maxLocality <- taskSet.myLocalityLevels) {
    do {
      launchedTask = resourceOfferSingleTaskSet(
        taskSet, maxLocality, shuffledOffers, availableCpus, tasks)
    } while (launchedTask)
  }

  if (tasks.size > 0) {
    hasLaunchedTask = true
  }
  return tasks
}
private def resourceOfferSingleTaskSet(
    taskSet: TaskSetManager,
    maxLocality: TaskLocality,
    shuffledOffers: Seq[WorkerOffer],
    availableCpus: Array[Int],
    tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = {
  var launchedTask = false
  for (i <- 0 until shuffledOffers.size) { // loop over every executor's offer
    val execId = shuffledOffers(i).executorId
    val host = shuffledOffers(i).host
    if (availableCpus(i) >= CPUS_PER_TASK) { // enough free cores left on this executor
      try {
        // resourceOffer: given a single offered resource (host, executor) and the current
        // locality constraint, the TaskSetManager returns the most suitable task, if any.
        // Internally it adjusts its locality threshold based on how long ago it last launched
        // a task successfully: if a long time has passed, it relaxes the requirement (e.g. from
        // PROCESS_LOCAL down to NODE_LOCAL); otherwise it keeps it strict. This delay-scheduling
        // heuristic gives tasks a better chance to run at their best locality, because resources
        // usually arrive at the TaskSetManager in small batches over time, and a dynamic
        // threshold improves the overall locality distribution.
        for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
          tasks(i) += task
          val tid = task.taskId
          taskIdToTaskSetManager(tid) = taskSet
          taskIdToExecutorId(tid) = execId
          executorsByHost(host) += execId
          availableCpus(i) -= CPUS_PER_TASK
          assert(availableCpus(i) >= 0)
          launchedTask = true
        }
      } catch {
        case e: TaskNotSerializableException =>
          logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
          // Do not offer resources for this task, but don't throw an error to allow other
          // task sets to be submitted.
          return launchedTask
      }
    }
  }
  return launchedTask
}
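The delay-scheduling behaviour described in the comments above is tunable through the spark.locality.wait family of settings. A short, hedged sketch; the values are illustrative, not recommendations, and must be set before the SparkContext is created:

val conf = new SparkConf()
  .setAppName("locality-demo")
  .set("spark.locality.wait", "3s")          // how long to wait before relaxing locality by one level
  .set("spark.locality.wait.process", "3s")  // wait for PROCESS_LOCAL before falling back to NODE_LOCAL
  .set("spark.locality.wait.node", "3s")     // wait for NODE_LOCAL before falling back to RACK_LOCAL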
The launchTasks() method:
// Launch tasks returned by a set of resource offers
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
  for (task <- tasks.flatten) {
    val serializedTask = ser.serialize(task)
    if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
      scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
        try {
          var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
            "spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
            "spark.akka.frameSize or using broadcast variables for large values."
          msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
            AkkaUtils.reservedSizeBytes)
          taskSetMgr.abort(msg)
        } catch {
          case e: Exception => logError("Exception in error callback", e)
        }
      }
    }
    else {
      val executorData = executorDataMap(task.executorId)
      executorData.freeCores -= scheduler.CPUS_PER_TASK
      // Look up the executor's endpoint and ship the serialized task to the ExecutorBackend.
      executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
    }
  }
}
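The "use broadcast variables for large values" advice in the error message above refers to the usual fix for oversized task closures: ship large read-only data once per executor instead of inside every serialized task. A minimal sketch, assuming an existing SparkContext named sc; the lookup table is illustrative:

// Anti-pattern: a large map captured in the closure is serialized into every task.
val bigLookup: Map[Int, String] = (1 to 1000000).map(i => i -> s"value$i").toMap

// Better: broadcast it once; each task only carries a small handle.
val bcLookup = sc.broadcast(bigLookup)
val resolved = sc.parallelize(1 to 100, 4).map { k =>
  bcLookup.value.getOrElse(k, "missing") // read from the broadcast value on the executor
}
resolved.count()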
On the executor side, CoarseGrainedExecutorBackend's receive method handles these messages:

override def receive: PartialFunction[Any, Unit] = {
  case RegisteredExecutor =>
    logInfo("Successfully registered with driver")
    val (hostname, _) = Utils.parseHostPort(hostPort)
    executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false)

  case RegisterExecutorFailed(message) =>
    logError("Slave registration failed: " + message)
    System.exit(1)

  case LaunchTask(data) => // a task arrives from the driver
    if (executor == null) {
      logError("Received LaunchTask command but executor was null")
      System.exit(1)
    } else {
      val taskDesc = ser.deserialize[TaskDescription](data.value)
      logInfo("Got assigned task " + taskDesc.taskId)
      // Hand the task over to the Executor, which runs it in its thread pool.
      executor.launchTask(this, taskId = taskDesc.taskId, attemptNumber = taskDesc.attemptNumber,
        taskDesc.name, taskDesc.serializedTask)
    }

  case KillTask(taskId, _, interruptThread) =>
    if (executor == null) {
      logError("Received KillTask command but executor was null")
      System.exit(1)
    } else {
      executor.killTask(taskId, interruptThread)
    }

  case StopExecutor =>
    logInfo("Driver commanded a shutdown")
    executor.stop()
    stop()
    rpcEnv.shutdown()
}
Brief summary:
DAGScheduler:
1. handleJobSubmitted(): wraps the final RDD handed over by the job into a ResultStage.
2. submitStage(): starting from that ResultStage, recursively submits the ShuffleMapStages formed by its parent RDDs first.
3. submitMissingTasks(): turns a stage into ShuffleMapTasks or ResultTasks, wraps them in a TaskSet and submits it through the TaskScheduler.
TaskScheduler (TaskSchedulerImpl):
1. submitTasks(): wraps the TaskSet in a TaskSetManager, adds it to the scheduling pool and asks the backend to revive offers.
2. resourceOffers(): shuffles the WorkerOffers, takes the TaskSetManagers from the pool in scheduling order and calls each manager's resourceOffer() to pick the most suitable task for every executor, producing TaskDescriptions.
SchedulerBackend (CoarseGrainedSchedulerBackend):
1. makeOffers(): builds WorkerOffers from the executors registered by the ExecutorBackends and passes them to the TaskScheduler's resourceOffers().
2. launchTasks(): serializes the TaskDescriptions returned by resourceOffers() and sends them to the corresponding ExecutorBackends.
Executor:
1. A worker thread deserializes the Task and calls its run() method.
2. run() invokes the runTask() of the concrete subclass (ShuffleMapTask or ResultTask).
3. runTask() calls the RDD's iterator() method, which computes the parent RDDs recursively.
4. Inside iterator(), if the RDD's storage level is not NONE the RDD has been cached and should be available from the BlockManager, so CacheManager.getOrCompute() is used to fetch or compute the partition.
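To tie the summary together, here is a tiny driver program and how it maps onto the concepts above (a hedged sketch assuming an existing SparkContext sc; the input path is illustrative, and the stage numbering is what the DAGScheduler would typically produce for this lineage):

val lines = sc.textFile("hdfs:///tmp/input.txt") // path is illustrative
val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
val result = counts.collect()                    // action -> SparkContext.runJob

// reduceByKey introduces a ShuffleDependency, so handleJobSubmitted builds:
//   Stage 0: ShuffleMapStage (textFile -> flatMap -> map), run as ShuffleMapTasks
//   Stage 1: ResultStage (reduceByKey -> collect), run as ResultTasks
// submitStage submits Stage 0 first; once its outputs are available, Stage 1 follows.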
Source: ITPUB blog, http://blog.itpub.net/29754888/viewspace-1813916/ (please credit the original when reposting).