Source code analysis of the Spark job scheduling flow
High-level flow of job submission (the original post shows a flow diagram here).
SparkContext: creating the scheduler
// Create and start the scheduler
val (sched, ts) = SparkContext.createTaskScheduler(this, master)
_schedulerBackend = sched
_taskScheduler = ts
_dagScheduler = new DAGScheduler(this)
_heartbeatReceiver.ask[Boolean](TaskSchedulerIsSet)

private def createTaskScheduler(
    sc: SparkContext,
    master: String): (SchedulerBackend, TaskScheduler) = {
  // Regular expression used for local[N] and local[*] master formats
  val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r
  // Regular expression for local[N, maxRetries], used in tests with failing tasks
  val LOCAL_N_FAILURES_REGEX = """local\[([0-9]+|\*)\s*,\s*([0-9]+)\]""".r
  // Regular expression for simulating a Spark cluster of [N, cores, memory] locally
  val LOCAL_CLUSTER_REGEX = """local-cluster\[\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*]""".r
  // Regular expression for connecting to Spark deploy clusters
  val SPARK_REGEX = """spark://(.*)""".r
  // Regular expression for connection to Mesos cluster by mesos:// or zk:// url
  val MESOS_REGEX = """(mesos|zk)://.*""".r
  // Regular expression for connection to Simr cluster
  val SIMR_REGEX = """simr://(.*)""".r

  // When running locally, don't try to re-execute tasks on failure.
  val MAX_LOCAL_TASK_FAILURES = 1

  master match {
    case "local" =>
      val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
      val backend = new LocalBackend(sc.getConf, scheduler, 1)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_N_REGEX(threads) =>
      def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
      // local[*] estimates the number of cores on the machine; local[N] uses exactly N threads.
      val threadCount = if (threads == "*") localCpuCount else threads.toInt
      if (threadCount <= 0) {
        throw new SparkException(s"Asked to run locally with $threadCount threads")
      }
      val scheduler = new TaskSchedulerImpl(sc, MAX_LOCAL_TASK_FAILURES, isLocal = true)
      val backend = new LocalBackend(sc.getConf, scheduler, threadCount)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_N_FAILURES_REGEX(threads, maxFailures) =>
      def localCpuCount: Int = Runtime.getRuntime.availableProcessors()
      // local[*, M] means the number of cores on the computer with M failures
      // local[N, M] means exactly N threads with M failures
      val threadCount = if (threads == "*") localCpuCount else threads.toInt
      val scheduler = new TaskSchedulerImpl(sc, maxFailures.toInt, isLocal = true)
      val backend = new LocalBackend(sc.getConf, scheduler, threadCount)
      scheduler.initialize(backend)
      (backend, scheduler)

    case SPARK_REGEX(sparkUrl) =>
      val scheduler = new TaskSchedulerImpl(sc)
      val masterUrls = sparkUrl.split(",").map("spark://" + _)
      val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
      scheduler.initialize(backend)
      (backend, scheduler)

    case LOCAL_CLUSTER_REGEX(numSlaves, coresPerSlave, memoryPerSlave) =>
      // Check to make sure memory requested <= memoryPerSlave. Otherwise Spark will just hang.
      val memoryPerSlaveInt = memoryPerSlave.toInt
      if (sc.executorMemory > memoryPerSlaveInt) {
        throw new SparkException(
          "Asked to launch cluster with %d MB RAM / worker but requested %d MB/worker".format(
            memoryPerSlaveInt, sc.executorMemory))
      }

      val scheduler = new TaskSchedulerImpl(sc)
      val localCluster = new LocalSparkCluster(
        numSlaves.toInt, coresPerSlave.toInt, memoryPerSlaveInt, sc.conf)
      val masterUrls = localCluster.start()
      val backend = new SparkDeploySchedulerBackend(scheduler, sc, masterUrls)
      scheduler.initialize(backend)
      backend.shutdownCallback = (backend: SparkDeploySchedulerBackend) => {
        localCluster.stop()
      }
      (backend, scheduler)

    case "yarn-standalone" | "yarn-cluster" =>
      if (master == "yarn-standalone") {
        logWarning(
          "\"yarn-standalone\" is deprecated as of Spark 1.0. Use \"yarn-cluster\" instead.")
      }
      val scheduler = try {
        val clazz = Utils.classForName("org.apache.spark.scheduler.cluster.YarnClusterScheduler")
        val cons = clazz.getConstructor(classOf[SparkContext])
        cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
      } catch {
        // TODO: Enumerate the exact reasons why it can fail
        // But irrespective of it, it means we cannot proceed !
        case e: Exception => {
          throw new SparkException("YARN mode not available ?", e)
        }
      }
      val backend = try {
        val clazz =
          Utils.classForName("org.apache.spark.scheduler.cluster.YarnClusterSchedulerBackend")
        val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
        cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
      } catch {
        case e: Exception => {
          throw new SparkException("YARN mode not available ?", e)
        }
      }
      scheduler.initialize(backend)
      (backend, scheduler)

    case "yarn-client" =>
      val scheduler = try {
        val clazz = Utils.classForName("org.apache.spark.scheduler.cluster.YarnScheduler")
        val cons = clazz.getConstructor(classOf[SparkContext])
        cons.newInstance(sc).asInstanceOf[TaskSchedulerImpl]
      } catch {
        case e: Exception => {
          throw new SparkException("YARN mode not available ?", e)
        }
      }

      val backend = try {
        val clazz =
          Utils.classForName("org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend")
        val cons = clazz.getConstructor(classOf[TaskSchedulerImpl], classOf[SparkContext])
        cons.newInstance(scheduler, sc).asInstanceOf[CoarseGrainedSchedulerBackend]
      } catch {
        case e: Exception => {
          throw new SparkException("YARN mode not available ?", e)
        }
      }

      scheduler.initialize(backend)
      (backend, scheduler)

    case mesosUrl @ MESOS_REGEX(_) =>
      MesosNativeLibrary.load()
      val scheduler = new TaskSchedulerImpl(sc)
      val coarseGrained = sc.conf.getBoolean("spark.mesos.coarse", false)
      val url = mesosUrl.stripPrefix("mesos://") // strip scheme from raw Mesos URLs
      val backend = if (coarseGrained) {
        new CoarseMesosSchedulerBackend(scheduler, sc, url, sc.env.securityManager)
      } else {
        new MesosSchedulerBackend(scheduler, sc, url)
      }
      scheduler.initialize(backend)
      (backend, scheduler)

    case SIMR_REGEX(simrUrl) =>
      val scheduler = new TaskSchedulerImpl(sc)
      val backend = new SimrSchedulerBackend(scheduler, sc, simrUrl)
      scheduler.initialize(backend)
      (backend, scheduler)

    case _ =>
      throw new SparkException("Could not parse Master URL: '" + master + "'")
  }
}
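As a quick illustration of how the match above dispatches on the master string, here is a small self-contained sketch (illustrative only, not Spark code; the object name and descriptions are mine) that runs a few master URLs through the same regular expressions:

object MasterUrlDemo {
  // Same patterns as in SparkContext.createTaskScheduler above.
  val LOCAL_N_REGEX = """local\[([0-9]+|\*)\]""".r
  val SPARK_REGEX = """spark://(.*)""".r

  def describe(master: String): String = master match {
    case "local"                => "TaskSchedulerImpl + LocalBackend with 1 thread"
    case LOCAL_N_REGEX(threads) => s"TaskSchedulerImpl + LocalBackend with $threads thread(s)"
    case SPARK_REGEX(sparkUrl)  => s"TaskSchedulerImpl + SparkDeploySchedulerBackend (master list: $sparkUrl)"
    case _                      => "other deploy mode (yarn / mesos / simr / ...)"
  }

  def main(args: Array[String]): Unit = {
    Seq("local", "local[4]", "local[*]", "spark://node1:7077,node2:7077").foreach { m =>
      println(s"$m => ${describe(m)}")
    }
  }
}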
Every RDD action ends up calling SparkContext.runJob; for example, count() is defined directly on top of it, as the sketch below shows.
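The count() definition is quoted again later in a comment inside handleJobSubmitted; a plain driver program like the following is all it takes to kick off the whole flow (a minimal local-mode sketch, object and app names are mine):

import org.apache.spark.{SparkConf, SparkContext}

// In RDD.scala: def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum
object CountDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("count-demo").setMaster("local[2]"))
    val n = sc.parallelize(1 to 100, 4).count() // action -> SparkContext.runJob -> DAGScheduler.runJob
    println(s"count = $n")
    sc.stop()
  }
}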
1. SparkContext's runJob method:
def runJob[T, U: ClassTag](
    rdd: RDD[T],                               // the concrete RDD instance the action was called on
    func: (TaskContext, Iterator[T]) => U,     // the per-partition computation executed for this action
    partitions: Seq[Int],                      // partition indices to run on, from 0 to partitions.size - 1
    resultHandler: (Int, U) => Unit): Unit = { // callback invoked on the driver with each task's result
  if (stopped.get()) {
    throw new IllegalStateException("SparkContext has been shutdown")
  }
  val callSite = getCallSite
  val cleanedFunc = clean(func)
  logInfo("Starting job: " + callSite.shortForm)
  if (conf.getBoolean("spark.logLineage", false)) {
    logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
  }
  dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)
  progressBar.foreach(_.finishAll())
  rdd.doCheckpoint()
}
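A small aside on the spark.logLineage check above: setting spark.logLineage=true on the SparkConf makes runJob log the lineage for every job, and you can inspect the same information yourself with RDD.toDebugString. A short sketch, assuming an existing SparkContext named sc (as in the earlier driver example):

// toDebugString prints the same recursive dependency chain that runJob logs when spark.logLineage=true.
val rdd = sc.parallelize(1 to 10, 2).map(_ * 2).filter(_ > 5)
println(rdd.toDebugString) // shows the RDD lineage
rdd.count()                // triggers SparkContext.runJob shown above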
2. DAGScheduler's runJob method:
def runJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): Unit = {
  val start = System.nanoTime
  val waiter = submitJob(rdd, func, partitions, callSite, resultHandler, properties)
  // Block until the job has finished.
  waiter.awaitResult() match {
    case JobSucceeded =>
      logInfo("Job %d finished: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
    case JobFailed(exception: Exception) =>
      logInfo("Job %d failed: %s, took %f s".format
        (waiter.jobId, callSite.shortForm, (System.nanoTime - start) / 1e9))
      // SPARK-8644: Include user stack trace in exceptions coming from DAGScheduler.
      val callerStackTrace = Thread.currentThread().getStackTrace.tail
      exception.setStackTrace(exception.getStackTrace ++ callerStackTrace)
      throw exception
  }
}

def submitJob[T, U](
    rdd: RDD[T],
    func: (TaskContext, Iterator[T]) => U,
    partitions: Seq[Int],
    callSite: CallSite,
    resultHandler: (Int, U) => Unit,
    properties: Properties): JobWaiter[U] = {
  // Check to make sure we are not launching a task on a partition that does not exist.
  val maxPartitions = rdd.partitions.length
  partitions.find(p => p >= maxPartitions || p < 0).foreach { p =>
    throw new IllegalArgumentException(
      "Attempting to access a non-existent partition: " + p + ". " +
        "Total number of partitions: " + maxPartitions)
  }

  val jobId = nextJobId.getAndIncrement()
  if (partitions.size == 0) {
    // Return immediately if the job is running 0 tasks
    return new JobWaiter[U](this, jobId, 0, resultHandler)
  }

  assert(partitions.size > 0)
  val func2 = func.asInstanceOf[(TaskContext, Iterator[_]) => _]
  val waiter = new JobWaiter(this, jobId, partitions.size, resultHandler)
  // Post a JobSubmitted event onto the DAGScheduler's event queue.
  eventProcessLoop.post(JobSubmitted(
    jobId, rdd, func2, partitions.toArray, callSite, waiter,
    SerializationUtils.clone(properties)))
  waiter
}
submitJob posts a JobSubmitted event onto an in-process queue (the DAGScheduler does not use the Akka actor model here; since version 1.4 it went back to the plain event-queue approach used before 1.0). So where is that queue consumed? See below.
At the end of the DAGScheduler class there is this snippet:
// Start the event thread and register the metrics source at the end of the constructor
env.metricsSystem.registerSource(metricsSource)
eventProcessLoop.start()
DAGSchedulerEventProcessLoop extends EventLoop; let's look at EventLoop's event thread and its start method:
// A daemon thread keeps draining the event queue and calls onReceive() for each event.
private val eventThread = new Thread(name) {
  setDaemon(true)

  override def run(): Unit = {
    try {
      while (!stopped.get) {
        val event = eventQueue.take()
        try {
          onReceive(event)
        } catch {
          case NonFatal(e) => {
            try {
              onError(e)
            } catch {
              case NonFatal(e) => logError("Unexpected error in " + name, e)
            }
          }
        }
      }
    } catch {
      case ie: InterruptedException => // exit even if eventQueue is not empty
      case NonFatal(e) => logError("Unexpected error in " + name, e)
    }
  }
}

def start(): Unit = {
  if (stopped.get) {
    throw new IllegalStateException(name + " has already been stopped")
  }
  // Call onStart before starting the event thread to make sure it happens before onReceive
  onStart()
  eventThread.start()
}
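The pattern above is a plain single-consumer blocking queue. As a self-contained illustration (not Spark's EventLoop, just the same idea; class and names are mine), a minimal version looks like this:

import java.util.concurrent.LinkedBlockingDeque
import java.util.concurrent.atomic.AtomicBoolean

// Minimal sketch of the producer/consumer pattern used by the DAGScheduler's event loop.
class MiniEventLoop[E](name: String)(handler: E => Unit) {
  private val queue = new LinkedBlockingDeque[E]()
  private val stopped = new AtomicBoolean(false)

  private val thread = new Thread(name) {
    setDaemon(true)
    override def run(): Unit = {
      try {
        while (!stopped.get) {
          val event = queue.take() // blocks until an event is posted
          try handler(event) catch { case e: Exception => e.printStackTrace() }
        }
      } catch {
        case _: InterruptedException => () // stop() interrupts the thread to break out of take()
      }
    }
  }

  def start(): Unit = thread.start()
  def post(event: E): Unit = queue.put(event)
  def stop(): Unit = { stopped.set(true); thread.interrupt() }
}

// Usage: events posted from any thread are handled sequentially on the loop's own thread.
// val loop = new MiniEventLoop[String]("demo-loop")(e => println(s"handled $e"))
// loop.start(); loop.post("JobSubmitted"); loop.stop()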
Next, look at the onReceive() method, implemented by DAGSchedulerEventProcessLoop:
/**
 * The main event loop of the DAG scheduler.
 */
override def onReceive(event: DAGSchedulerEvent): Unit = {
  val timerContext = timer.time()
  try {
    doOnReceive(event)
  } finally {
    timerContext.stop()
  }
}

private def doOnReceive(event: DAGSchedulerEvent): Unit = event match {
  case JobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties) =>
    dagScheduler.handleJobSubmitted(jobId, rdd, func, partitions, callSite, listener, properties)

  case MapStageSubmitted(jobId, dependency, callSite, listener, properties) =>
    dagScheduler.handleMapStageSubmitted(jobId, dependency, callSite, listener, properties)

  case StageCancelled(stageId) =>
    dagScheduler.handleStageCancellation(stageId)

  case JobCancelled(jobId) =>
    dagScheduler.handleJobCancellation(jobId)

  case JobGroupCancelled(groupId) =>
    dagScheduler.handleJobGroupCancelled(groupId)

  case AllJobsCancelled =>
    dagScheduler.doCancelAllJobs()

  case ExecutorAdded(execId, host) =>
    dagScheduler.handleExecutorAdded(execId, host)

  case ExecutorLost(execId) =>
    dagScheduler.handleExecutorLost(execId, fetchFailed = false)

  case BeginEvent(task, taskInfo) =>
    dagScheduler.handleBeginEvent(task, taskInfo)

  case GettingResultEvent(taskInfo) =>
    dagScheduler.handleGetTaskResult(taskInfo)

  case completion @ CompletionEvent(task, reason, _, _, taskInfo, taskMetrics) =>
    dagScheduler.handleTaskCompletion(completion)

  case TaskSetFailed(taskSet, reason, exception) =>
    dagScheduler.handleTaskSetFailed(taskSet, reason, exception)

  case ResubmitFailedStages =>
    dagScheduler.resubmitFailedStages()
}
private[scheduler] def handleJobSubmitted(jobId: Int,
    finalRDD: RDD[_],
    func: (TaskContext, Iterator[_]) => _,
    partitions: Array[Int],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties) {
  var finalStage: ResultStage = null
  try {
    // New stage creation may throw an exception if, for example, jobs are run on a
    // HadoopRDD whose underlying HDFS files have been deleted.
    // Create the finalStage: every job has exactly one, and the remaining stages are derived
    // from it. The RDD passed in here is the last RDD of the job, so it is wrapped as a ResultStage.
    // (In RDD: def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum)
    finalStage = newResultStage(finalRDD, func, partitions, jobId, callSite)
  } catch {
    case e: Exception =>
      logWarning("Creating new stage failed due to exception - job: " + jobId, e)
      listener.jobFailed(e)
      return
  }

  val job = new ActiveJob(jobId, finalStage, callSite, listener, properties)
  clearCacheLocs()
  logInfo("Got job %s (%s) with %d output partitions".format(
    job.jobId, callSite.shortForm, partitions.length))
  logInfo("Final stage: " + finalStage + " (" + finalStage.name + ")")
  logInfo("Parents of final stage: " + finalStage.parents)
  logInfo("Missing parents: " + getMissingParentStages(finalStage))

  val jobSubmissionTime = clock.getTimeMillis()
  jobIdToActiveJob(jobId) = job
  activeJobs += job
  finalStage.resultOfJob = Some(job)
  val stageIds = jobIdToStageIds(jobId).toArray
  val stageInfos = stageIds.flatMap(id => stageIdToStage.get(id).map(_.latestInfo))
  listenerBus.post(
    SparkListenerJobStart(job.jobId, jobSubmissionTime, stageInfos, properties))
  submitStage(finalStage)

  submitWaitingStages()
}
/** Submits stage, but first recursively submits any missing parents. */
// The first call comes in with the ResultStage; the ShuffleMapStages built from its
// parent RDDs are submitted recursively from here.
private def submitStage(stage: Stage) {
  val jobId = activeJobForStage(stage)
  if (jobId.isDefined) {
    logDebug("submitStage(" + stage + ")")
    if (!waitingStages(stage) && !runningStages(stage) && !failedStages(stage)) {
      val missing = getMissingParentStages(stage).sortBy(_.id) // find the missing parent stages
      logDebug("missing: " + missing)
      if (missing.isEmpty) {
        logInfo("Submitting " + stage + " (" + stage.rdd + "), which has no missing parents")
        submitMissingTasks(stage, jobId.get)
      } else {
        for (parent <- missing) {
          submitStage(parent)
        }
        waitingStages += stage
      }
    }
  } else {
    abortStage(stage, "No active job for stage " + stage.id, None)
  }
}
Next, step into the submitMissingTasks method:
/** Called when stage's parents are available and we can now do its task. */
// Builds the tasks for the stage and ends by handing a TaskSet to taskScheduler.submitTasks.
private def submitMissingTasks(stage: Stage, jobId: Int) {
  logDebug("submitMissingTasks(" + stage + ")")
  // Get our pending tasks and remember them in our pendingTasks entry
  stage.pendingTasks.clear()

  // First figure out the indexes of partition ids to compute.
  val (allPartitions: Seq[Int], partitionsToCompute: Seq[Int]) = {
    stage match { // a stage is either a ShuffleMapStage or a ResultStage
      case stage: ShuffleMapStage =>
        val allPartitions = 0 until stage.numPartitions
        val filteredPartitions = allPartitions.filter { id => stage.outputLocs(id).isEmpty }
        (allPartitions, filteredPartitions)
      case stage: ResultStage =>
        val job = stage.resultOfJob.get
        val allPartitions = 0 until job.numPartitions
        val filteredPartitions = allPartitions.filter { id => !job.finished(id) }
        (allPartitions, filteredPartitions)
    }
  }

  // Create internal accumulators if the stage has no accumulators initialized.
  // Reset internal accumulators only if this stage is not partially submitted
  // Otherwise, we may override existing accumulator values from some tasks
  if (stage.internalAccumulators.isEmpty || allPartitions == partitionsToCompute) {
    stage.resetInternalAccumulators()
  }

  val properties = jobIdToActiveJob.get(stage.firstJobId).map(_.properties).orNull

  runningStages += stage
  // SparkListenerStageSubmitted should be posted before testing whether tasks are
  // serializable. If tasks are not serializable, a SparkListenerStageCompleted event
  // will be posted, which should always come after a corresponding SparkListenerStageSubmitted
  // event.
  outputCommitCoordinator.stageStart(stage.id)
  val taskIdToLocations = try {
    stage match {
      case s: ShuffleMapStage =>
        partitionsToCompute.map { id => (id, getPreferredLocs(stage.rdd, id))}.toMap
      case s: ResultStage =>
        val job = s.resultOfJob.get
        partitionsToCompute.map { id =>
          val p = s.partitions(id)
          (id, getPreferredLocs(stage.rdd, p))
        }.toMap
    }
  } catch {
    case NonFatal(e) =>
      stage.makeNewStageAttempt(partitionsToCompute.size)
      listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))
      abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
      runningStages -= stage
      return
  }

  stage.makeNewStageAttempt(partitionsToCompute.size, taskIdToLocations.values.toSeq)
  listenerBus.post(SparkListenerStageSubmitted(stage.latestInfo, properties))

  // TODO: Maybe we can keep the taskBinary in Stage to avoid serializing it multiple times.
  // Broadcasted binary for the task, used to dispatch tasks to executors. Note that we broadcast
  // the serialized copy of the RDD and for each task we will deserialize it, which means each
  // task gets a different copy of the RDD. This provides stronger isolation between tasks that
  // might modify state of objects referenced in their closures. This is necessary in Hadoop
  // where the JobConf/Configuration object is not thread-safe.
  var taskBinary: Broadcast[Array[Byte]] = null
  try {
    // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep).
    // For ResultTask, serialize and broadcast (rdd, func).
    val taskBinaryBytes: Array[Byte] = stage match {
      case stage: ShuffleMapStage =>
        closureSerializer.serialize((stage.rdd, stage.shuffleDep): AnyRef).array()
      case stage: ResultStage =>
        closureSerializer.serialize((stage.rdd, stage.func): AnyRef).array()
    }

    taskBinary = sc.broadcast(taskBinaryBytes)
  } catch {
    // In the case of a failure during serialization, abort the stage.
    case e: NotSerializableException =>
      abortStage(stage, "Task not serializable: " + e.toString, Some(e))
      runningStages -= stage
      // Abort execution
      return
    case NonFatal(e) =>
      abortStage(stage, s"Task serialization failed: $e\n${e.getStackTraceString}", Some(e))
      runningStages -= stage
      return
  }

  val tasks: Seq[Task[_]] = try {
    stage match {
      case stage: ShuffleMapStage =>
        partitionsToCompute.map { id =>
          val locs = taskIdToLocations(id)
          val part = stage.rdd.partitions(id)
          new ShuffleMapTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, stage.internalAccumulators)
        }

      case stage: ResultStage =>
        val job = stage.resultOfJob.get
        partitionsToCompute.map { id =>
          val p: Int = stage.partitions(id)
          val part = stage.rdd.partitions(p)
          val locs = taskIdToLocations(id)
          new ResultTask(stage.id, stage.latestInfo.attemptId,
            taskBinary, part, locs, id, stage.internalAccumulators)
        }
    }
  } catch {
    case NonFatal(e) =>
      abortStage(stage, s"Task creation failed: $e\n${e.getStackTraceString}", Some(e))
      runningStages -= stage
      return
  }

  if (tasks.size > 0) {
    logInfo("Submitting " + tasks.size + " missing tasks from " + stage + " (" + stage.rdd + ")")
    stage.pendingTasks ++= tasks
    logDebug("New pending tasks: " + stage.pendingTasks)
    // Wrap the stage's tasks (one per partition to compute) into a TaskSet and hand it
    // to the TaskScheduler.
    taskScheduler.submitTasks(new TaskSet(
      tasks.toArray, stage.id, stage.latestInfo.attemptId, stage.firstJobId, properties))
    stage.latestInfo.submissionTime = Some(clock.getTimeMillis())
  } else {
    // Because we posted SparkListenerStageSubmitted earlier, we should mark
    // the stage as completed here in case there are no tasks to run
    markStageAsFinished(stage, None)

    val debugString = stage match {
      case stage: ShuffleMapStage =>
        s"Stage ${stage} is actually done; " +
          s"(available: ${stage.isAvailable}," +
          s"available outputs: ${stage.numAvailableOutputs}," +
          s"partitions: ${stage.numPartitions})"
      case stage : ResultStage =>
        s"Stage ${stage} is actually done; (partitions: ${stage.numPartitions})"
    }
    logDebug(debugString)
  }
}
TaskSchedulerImpl's submitTasks method:

override def submitTasks(taskSet: TaskSet) {
  val tasks = taskSet.tasks
  logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks")
  this.synchronized {
    // Create a TaskSetManager. It tracks every task in this TaskSet; when a task finishes,
    // the taskScheduler removes it from the manager.
    val manager = createTaskSetManager(taskSet, maxTaskFailures)
    val stage = taskSet.stageId
    val stageTaskSets =
      taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager])
    stageTaskSets(taskSet.stageAttemptId) = manager
    val conflictingTaskSet = stageTaskSets.exists { case (_, ts) =>
      ts.taskSet != taskSet && !ts.isZombie
    }
    if (conflictingTaskSet) {
      throw new IllegalStateException(s"more than one active taskSet for stage $stage:" +
        s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}")
    }
    // Add the manager to the rootPool; the pool (FIFO or FAIR) was attached to the
    // schedulableBuilder when the scheduler was initialized.
    schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties)

    if (!isLocal && !hasReceivedTask) {
      starvationTimer.scheduleAtFixedRate(new TimerTask() {
        override def run() {
          if (!hasLaunchedTask) {
            logWarning("Initial job has not accepted any resources; " +
              "check your cluster UI to ensure that workers are registered " +
              "and have sufficient resources")
          } else {
            this.cancel()
          }
        }
      }, STARVATION_TIMEOUT_MS, STARVATION_TIMEOUT_MS)
    }
    hasReceivedTask = true
  }
  // Ask the backend to start scheduling. backend is a concrete SchedulerBackend
  // implementation; in yarn-cluster mode it is a CoarseGrainedSchedulerBackend.
  backend.reviveOffers()
}
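Regarding the FIFO/FAIR pool mentioned above: which schedulableBuilder (and thus how the rootPool orders TaskSetManagers) is used is controlled by spark.scheduler.mode. A short, hedged sketch of switching to fair scheduling; the allocation-file path and pool name are illustrative, not part of the Spark source analysed here:

val conf = new SparkConf()
  .setAppName("fair-scheduling-demo")
  .set("spark.scheduler.mode", "FAIR")                                   // default is FIFO
  .set("spark.scheduler.allocation.file", "/path/to/fairscheduler.xml")  // illustrative path
val sc = new SparkContext(conf)

// Jobs submitted from this thread go into the named pool (pool name is illustrative).
sc.setLocalProperty("spark.scheduler.pool", "etl_pool")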
CoarseGrainedSchedulerBackend:
override def reviveOffers() {
  driverEndpoint.send(ReviveOffers)
}

override def start() {
  val properties = new ArrayBuffer[(String, String)]
  for ((key, value) <- scheduler.sc.conf.getAll) {
    if (key.startsWith("spark.")) {
      properties += ((key, value))
    }
  }

  // TODO (prashant) send conf instead of properties
  // Set up the driver endpoint used for RPC communication with the executors.
  driverEndpoint = rpcEnv.setupEndpoint(ENDPOINT_NAME, createDriverEndpoint(properties))
}

// Create the DriverEndpoint (an RpcEndpoint, playing the role the actor used to).
protected def createDriverEndpoint(properties: Seq[(String, String)]): DriverEndpoint = {
  new DriverEndpoint(rpcEnv, properties)
}
Let's look at DriverEndpoint's receive method:
override def receive: PartialFunction[Any, Unit] = {
  case StatusUpdate(executorId, taskId, state, data) =>
    scheduler.statusUpdate(taskId, state, data.value)
    if (TaskState.isFinished(state)) {
      executorDataMap.get(executorId) match {
        case Some(executorInfo) =>
          executorInfo.freeCores += scheduler.CPUS_PER_TASK
          makeOffers(executorId)
        case None =>
          // Ignoring the update since we don't know about the executor.
          logWarning(s"Ignored task status update ($taskId state $state) " +
            s"from unknown executor with ID $executorId")
      }
    }

  // The driver also sends ReviveOffers to itself periodically
  // (every spark.scheduler.revive.interval, 1s by default), so makeOffers runs regularly.
  case ReviveOffers =>
    makeOffers()

  case KillTask(taskId, executorId, interruptThread) =>
    executorDataMap.get(executorId) match {
      case Some(executorInfo) =>
        executorInfo.executorEndpoint.send(KillTask(taskId, executorId, interruptThread))
      case None =>
        // Ignoring the task kill since the executor is not registered.
        logWarning(s"Attempted to kill task $taskId for unknown executor $executorId.")
    }
}

// Make fake resource offers on all executors
private def makeOffers() {
  // Filter out executors under killing; keep only the active executors.
  val activeExecutors = executorDataMap.filterKeys(!executorsPendingToRemove.contains(_))
  val workOffers = activeExecutors.map { case (id, executorData) =>
    new WorkerOffer(id, executorData.executorHost, executorData.freeCores) // one offer per executor
  }.toSeq
  launchTasks(scheduler.resourceOffers(workOffers))
}
Back in TaskSchedulerImpl, resourceOffers is what makeOffers calls:

// Called by the SchedulerBackend: builds the TaskDescriptions that will be sent to the executors.
def resourceOffers(offers: Seq[WorkerOffer]): Seq[Seq[TaskDescription]] = synchronized {
  // Mark each slave as alive and remember its hostname
  // Also track if new executor is added
  var newExecAvail = false
  // Walk over the offered resources and update the executor-related bookkeeping maps.
  for (o <- offers) {
    executorIdToHost(o.executorId) = o.host
    activeExecutorIds += o.executorId
    if (!executorsByHost.contains(o.host)) {
      executorsByHost(o.host) = new HashSet[String]()
      executorAdded(o.executorId, o.host)
      newExecAvail = true
    }
    for (rack <- getRackForHost(o.host)) {
      hostsByRack.getOrElseUpdate(rack, new HashSet[String]()) += o.host
    }
  }

  // Randomly shuffle offers to avoid always placing tasks on the same set of workers.
  val shuffledOffers = Random.shuffle(offers)
  // Build a list of tasks to assign to each worker.
  val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores))
  val availableCpus = shuffledOffers.map(o => o.cores).toArray
  // Get the TaskSetManagers in scheduling order from the root pool (an ArrayBuffer[TaskSetManager]).
  val sortedTaskSets = rootPool.getSortedTaskSetQueue
  for (taskSet <- sortedTaskSets) {
    logDebug("parentName: %s, name: %s, runningTasks: %s".format(
      taskSet.parent.name, taskSet.name, taskSet.runningTasks))
    if (newExecAvail) {
      taskSet.executorAdded()
    }
  }

  // Take each TaskSet in our scheduling order, and then offer it each node in increasing order
  // of locality levels so that it gets a chance to launch local tasks on all of them.
  // NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
  var launchedTask = false
  for (taskSet <- sortedTaskSets; maxLocality <- taskSet.myLocalityLevels) {
    do {
      launchedTask = resourceOfferSingleTaskSet(
        taskSet, maxLocality, shuffledOffers, availableCpus, tasks)
    } while (launchedTask)
  }

  if (tasks.size > 0) {
    hasLaunchedTask = true
  }
  return tasks
}
private def resourceOfferSingleTaskSet(
    taskSet: TaskSetManager,
    maxLocality: TaskLocality,
    shuffledOffers: Seq[WorkerOffer],
    availableCpus: Array[Int],
    tasks: Seq[ArrayBuffer[TaskDescription]]) : Boolean = {
  var launchedTask = false
  for (i <- 0 until shuffledOffers.size) { // loop over every executor's offer
    val execId = shuffledOffers(i).executorId
    val host = shuffledOffers(i).host
    if (availableCpus(i) >= CPUS_PER_TASK) { // enough free cores left on this executor
      try {
        // resourceOffer: given a single offered resource (host, executor) and the current
        // locality constraint, the TaskSetManager returns the most suitable task, if any.
        // Internally it adjusts its locality threshold based on how long ago it last launched
        // a task successfully: if a long time has passed, it relaxes the requirement (e.g. from
        // PROCESS_LOCAL down to NODE_LOCAL); otherwise it keeps it strict. This delay-scheduling
        // heuristic gives tasks a better chance to run at their best locality, because resources
        // usually arrive at the TaskSetManager in small batches over time, and a dynamic
        // threshold improves the overall locality distribution.
        for (task <- taskSet.resourceOffer(execId, host, maxLocality)) {
          tasks(i) += task
          val tid = task.taskId
          taskIdToTaskSetManager(tid) = taskSet
          taskIdToExecutorId(tid) = execId
          executorsByHost(host) += execId
          availableCpus(i) -= CPUS_PER_TASK
          assert(availableCpus(i) >= 0)
          launchedTask = true
        }
      } catch {
        case e: TaskNotSerializableException =>
          logError(s"Resource offer failed, task set ${taskSet.name} was not serializable")
          // Do not offer resources for this task, but don't throw an error to allow other
          // task sets to be submitted.
          return launchedTask
      }
    }
  }
  return launchedTask
}
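The delay-scheduling behaviour described in the comments above is tunable through the spark.locality.wait family of settings. A short, hedged sketch; the values are illustrative, not recommendations, and must be set before the SparkContext is created:

val conf = new SparkConf()
  .setAppName("locality-demo")
  .set("spark.locality.wait", "3s")          // how long to wait before relaxing locality by one level
  .set("spark.locality.wait.process", "3s")  // wait for PROCESS_LOCAL before falling back to NODE_LOCAL
  .set("spark.locality.wait.node", "3s")     // wait for NODE_LOCAL before falling back to RACK_LOCAL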
The launchTasks() method:
// Launch tasks returned by a set of resource offers
private def launchTasks(tasks: Seq[Seq[TaskDescription]]) {
  for (task <- tasks.flatten) {
    val serializedTask = ser.serialize(task)
    if (serializedTask.limit >= akkaFrameSize - AkkaUtils.reservedSizeBytes) {
      scheduler.taskIdToTaskSetManager.get(task.taskId).foreach { taskSetMgr =>
        try {
          var msg = "Serialized task %s:%d was %d bytes, which exceeds max allowed: " +
            "spark.akka.frameSize (%d bytes) - reserved (%d bytes). Consider increasing " +
            "spark.akka.frameSize or using broadcast variables for large values."
          msg = msg.format(task.taskId, task.index, serializedTask.limit, akkaFrameSize,
            AkkaUtils.reservedSizeBytes)
          taskSetMgr.abort(msg)
        } catch {
          case e: Exception => logError("Exception in error callback", e)
        }
      }
    }
    else {
      val executorData = executorDataMap(task.executorId)
      executorData.freeCores -= scheduler.CPUS_PER_TASK
      // Look up the executor's endpoint and ship the serialized task to the ExecutorBackend.
      executorData.executorEndpoint.send(LaunchTask(new SerializableBuffer(serializedTask)))
    }
  }
}
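The "use broadcast variables for large values" advice in the error message above refers to the usual fix for oversized task closures: ship large read-only data once per executor instead of inside every serialized task. A minimal sketch, assuming an existing SparkContext named sc; the lookup table is illustrative:

// Anti-pattern: a large map captured in the closure is serialized into every task.
val bigLookup: Map[Int, String] = (1 to 1000000).map(i => i -> s"value$i").toMap

// Better: broadcast it once; each task only carries a small handle.
val bcLookup = sc.broadcast(bigLookup)
val resolved = sc.parallelize(1 to 100, 4).map { k =>
  bcLookup.value.getOrElse(k, "missing") // read from the broadcast value on the executor
}
resolved.count()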
On the executor side, CoarseGrainedExecutorBackend's receive method handles these messages:

override def receive: PartialFunction[Any, Unit] = {
  case RegisteredExecutor =>
    logInfo("Successfully registered with driver")
    val (hostname, _) = Utils.parseHostPort(hostPort)
    executor = new Executor(executorId, hostname, env, userClassPath, isLocal = false)

  case RegisterExecutorFailed(message) =>
    logError("Slave registration failed: " + message)
    System.exit(1)

  case LaunchTask(data) => // a task arrives from the driver
    if (executor == null) {
      logError("Received LaunchTask command but executor was null")
      System.exit(1)
    } else {
      val taskDesc = ser.deserialize[TaskDescription](data.value)
      logInfo("Got assigned task " + taskDesc.taskId)
      // Hand the task over to the Executor, which runs it in its thread pool.
      executor.launchTask(this, taskId = taskDesc.taskId, attemptNumber = taskDesc.attemptNumber,
        taskDesc.name, taskDesc.serializedTask)
    }

  case KillTask(taskId, _, interruptThread) =>
    if (executor == null) {
      logError("Received KillTask command but executor was null")
      System.exit(1)
    } else {
      executor.killTask(taskId, interruptThread)
    }

  case StopExecutor =>
    logInfo("Driver commanded a shutdown")
    executor.stop()
    stop()
    rpcEnv.shutdown()
}
Brief summary:
DAGScheduler:
1. handleJobSubmitted(): wraps the final RDD handed over by the job into a ResultStage.
2. submitStage(): starting from that ResultStage, recursively submits the ShuffleMapStages formed by its parent RDDs first.
3. submitMissingTasks(): turns a stage into ShuffleMapTasks or ResultTasks, wraps them in a TaskSet and submits it through the TaskScheduler.
TaskScheduler (TaskSchedulerImpl):
1. submitTasks(): wraps the TaskSet in a TaskSetManager, adds it to the scheduling pool and asks the backend to revive offers.
2. resourceOffers(): shuffles the WorkerOffers, takes the TaskSetManagers from the pool in scheduling order and calls each manager's resourceOffer() to pick the most suitable task for every executor, producing TaskDescriptions.
SchedulerBackend (CoarseGrainedSchedulerBackend):
1. makeOffers(): builds WorkerOffers from the executors registered by the ExecutorBackends and passes them to the TaskScheduler's resourceOffers().
2. launchTasks(): serializes the TaskDescriptions returned by resourceOffers() and sends them to the corresponding ExecutorBackends.
Executor:
1. A worker thread deserializes the Task and calls its run() method.
2. run() invokes the runTask() of the concrete subclass (ShuffleMapTask or ResultTask).
3. runTask() calls the RDD's iterator() method, which computes the parent RDDs recursively.
4. Inside iterator(), if the RDD's storage level is not NONE the RDD has been cached and should be available from the BlockManager, so CacheManager.getOrCompute() is used to fetch or compute the partition.
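To tie the summary together, here is a tiny driver program and how it maps onto the concepts above (a hedged sketch assuming an existing SparkContext sc; the input path is illustrative, and the stage numbering is what the DAGScheduler would typically produce for this lineage):

val lines = sc.textFile("hdfs:///tmp/input.txt") // path is illustrative
val counts = lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
val result = counts.collect()                    // action -> SparkContext.runJob

// reduceByKey introduces a ShuffleDependency, so handleJobSubmitted builds:
//   Stage 0: ShuffleMapStage (textFile -> flatMap -> map), run as ShuffleMapTasks
//   Stage 1: ResultStage (reduceByKey -> collect), run as ResultTasks
// submitStage submits Stage 0 first; once its outputs are available, Stage 1 follows.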
Source: ITPUB blog, http://blog.itpub.net/29754888/viewspace-1813916/ (please credit the original when reposting).