Spark on Yarn Job Submission Flow: Source Code Analysis
A walkthrough of the submission flow, kept here as notes.
The shell script invokes org.apache.spark.deploy.SparkSubmit, which in yarn-cluster mode hands off to org.apache.spark.deploy.yarn.Client.
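For context, a hedged sketch of that hand-off: SparkSubmit swaps the user's main class for the YARN Client and invokes it reflectively (Spark 1.x shape; childArgs stands in for the prepared argument list and is an assumption here, not a quoted identifier).

// Simplified sketch of SparkSubmit's dispatch in yarn-cluster mode.
// childArgs (hypothetical here) carries the user's jar, class, and arguments.
val childMainClass = "org.apache.spark.deploy.yarn.Client"
val mainClass = Class.forName(childMainClass)
val mainMethod = mainClass.getMethod("main", classOf[Array[String]])
mainMethod.invoke(null, childArgs.toArray) // lands in Client.main below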
Start with the main method of org.apache.spark.deploy.yarn.Client:
def main(argStrings: Array[String]) {
  if (!sys.props.contains("SPARK_SUBMIT")) {
    logWarning("WARNING: This client is deprecated and will be removed in a " +
      "future version of Spark. Use ./bin/spark-submit with \"--master yarn\"")
  }

  // Set an env variable indicating we are running in YARN mode.
  // Note that any env variable with the SPARK_ prefix gets propagated to all (remote) processes
  System.setProperty("SPARK_YARN_MODE", "true")
  val sparkConf = new SparkConf

  val args = new ClientArguments(argStrings, sparkConf)
  // to maintain backwards-compatibility
  if (!Utils.isDynamicAllocationEnabled(sparkConf)) {
    sparkConf.setIfMissing("spark.executor.instances", args.numExecutors.toString)
  }
  new Client(args, sparkConf).run()
}
def run(): Unit = {
  val appId = submitApplication()
  // Whether we wait for completion depends on the deploy mode (client vs. cluster):
  // private val fireAndForget = isClusterMode && !sparkConf.getBoolean("spark.yarn.submit.waitAppCompletion", true)
  if (fireAndForget) {
    val report = getApplicationReport(appId) // ApplicationReport describes the application (its user, queue, name, etc.)
    val state = report.getYarnApplicationState // the application's current state
    logInfo(s"Application report for $appId (state: $state)")
    logInfo(formatReportDetails(report))
    if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) { // check the state
      throw new SparkException(s"Application $appId finished with status: $state")
    }
  } else {
    // Client-mode handling (monitorApplication is examined in detail later in this post).
    // Two kinds of state are involved: YarnApplicationState (the task's state as YARN sees it)
    // and FinalApplicationStatus (the task's own final execution status).
    val (yarnApplicationState, finalApplicationStatus) = monitorApplication(appId)
    if (yarnApplicationState == YarnApplicationState.FAILED ||
        finalApplicationStatus == FinalApplicationStatus.FAILED) {
      throw new SparkException(s"Application $appId finished with failed status")
    }
    if (yarnApplicationState == YarnApplicationState.KILLED ||
        finalApplicationStatus == FinalApplicationStatus.KILLED) {
      throw new SparkException(s"Application $appId is killed")
    }
    if (finalApplicationStatus == FinalApplicationStatus.UNDEFINED) {
      throw new SparkException(s"The final status of application $appId is undefined")
    }
  }
}
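As a usage note, the fire-and-forget branch is reached in cluster mode by disabling spark.yarn.submit.waitAppCompletion (a real configuration key); a minimal illustration:

// Minimal sketch: make a cluster-mode submission fire-and-forget,
// so run() returns right after submitApplication() instead of polling.
val conf = new SparkConf()
  .set("spark.yarn.submit.waitAppCompletion", "false")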
Next, the submitApplication() method:

def submitApplication(): ApplicationId = {
  var appId: ApplicationId = null
  try {
    // Setup the credentials before doing anything else,
    // so we don't have issues at any point.
    setupCredentials()
    yarnClient.init(yarnConf)
    yarnClient.start()

    logInfo("Requesting a new application from cluster with %d NodeManagers"
      .format(yarnClient.getYarnClusterMetrics.getNumNodeManagers))

    // Get a new application from our RM
    val newApp = yarnClient.createApplication()
    val newAppResponse = newApp.getNewApplicationResponse()
    appId = newAppResponse.getApplicationId()

    // Verify whether the cluster has enough resources for our AM
    verifyClusterResources(newAppResponse) // memory checks (examined in detail later in this post)

    // Set up the appropriate contexts to launch our AM
    val containerContext = createContainerLaunchContext(newAppResponse) // builds the ApplicationMaster's container (jar paths, userClass, etc.)
    val appContext = createApplicationSubmissionContext(newApp, containerContext)

    // Finally, submit and monitor the application
    logInfo(s"Submitting application ${appId.getId} to ResourceManager")
    yarnClient.submitApplication(appContext)
    appId
  } catch {
    case e: Throwable =>
      if (appId != null) {
        cleanupStagingDir(appId)
      }
      throw e
  }
}
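createApplicationSubmissionContext itself is mostly plumbing over YARN records. A hedged sketch of its essentials (shape as in Spark 1.5; the field values are illustrative):

import org.apache.hadoop.yarn.api.records.Resource
import org.apache.hadoop.yarn.util.Records

// Sketch: what the submission context carries to the ResourceManager.
val appContext = newApp.getApplicationSubmissionContext
appContext.setApplicationName(sparkConf.get("spark.app.name", "Spark"))
appContext.setQueue(args.amQueue)               // --queue
appContext.setAMContainerSpec(containerContext) // AM launch command, env, local resources
appContext.setApplicationType("SPARK")          // marks the job type as spark
val capability = Records.newRecord(classOf[Resource])
capability.setMemory(args.amMemory + amMemoryOverhead)
appContext.setResource(capability)              // AM container size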
The yarnClient field used above is Hadoop's YarnClient, an AbstractService obtained through a factory method:

import org.apache.hadoop.yarn.client.api.{YarnClient, YarnClientApplication}

private val yarnClient = YarnClient.createYarnClient

@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class YarnClient extends AbstractService {

  @InterfaceAudience.Public
  public static YarnClient createYarnClient() {
    YarnClient client = new YarnClientImpl();
    return client;
  }

  @InterfaceAudience.Private
  protected YarnClient(String name) {
    super(name);
  }
  ......
}
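To see that lifecycle in isolation, here is a minimal, self-contained sketch of the same handshake against the plain Hadoop API (standard YarnClient calls; error handling omitted):

import org.apache.hadoop.yarn.client.api.YarnClient
import org.apache.hadoop.yarn.conf.YarnConfiguration

object YarnClientHandshake {
  def main(args: Array[String]): Unit = {
    val client = YarnClient.createYarnClient()
    client.init(new YarnConfiguration()) // AbstractService lifecycle: init ...
    client.start()                       // ... then start
    val app = client.createApplication() // RPC to the ResourceManager
    val appId = app.getNewApplicationResponse.getApplicationId
    println(s"New application id from the RM: $appId")
    client.stop()
  }
}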
Back in YarnClientImpl, createApplication wraps a freshly issued ApplicationId in a YarnClientApplication:

public YarnClientApplication createApplication()
    throws YarnException, IOException {
  ApplicationSubmissionContext context =
      (ApplicationSubmissionContext) Records.newRecord(ApplicationSubmissionContext.class);
  GetNewApplicationResponse newApp = getNewApplication();
  ApplicationId appId = newApp.getApplicationId();
  context.setApplicationId(appId);
  return new YarnClientApplication(newApp, context);
}
which delegates to the private getNewApplication():

private GetNewApplicationResponse getNewApplication() throws YarnException, IOException {
  GetNewApplicationRequest request =
      (GetNewApplicationRequest) Records.newRecord(GetNewApplicationRequest.class);
  return this.rmClient.getNewApplication(request);
}
Here rmClient is the ApplicationClientProtocol proxy to the ResourceManager; its getNewApplication(request) method is declared as:
@InterfaceAudience.Public
@InterfaceStability.Stable
@Idempotent
public abstract GetNewApplicationResponse getNewApplication(GetNewApplicationRequest paramGetNewApplicationRequest)
    throws YarnException, IOException;
Finally, return to the run() method and look at how a client-mode submission is handled, stepping into monitorApplication():
def monitorApplication(
    appId: ApplicationId,
    returnOnRunning: Boolean = false,
    logApplicationReport: Boolean = true): (YarnApplicationState, FinalApplicationStatus) = {
  val interval = sparkConf.getLong("spark.yarn.report.interval", 1000) // polling interval for application reports, in ms
  var lastState: YarnApplicationState = null
  while (true) { // hard-coded loop: keeps polling until the application reaches a terminal state
    Thread.sleep(interval)
    val report: ApplicationReport =
      try {
        getApplicationReport(appId)
      } catch {
        case e: ApplicationNotFoundException =>
          logError(s"Application $appId not found.")
          return (YarnApplicationState.KILLED, FinalApplicationStatus.KILLED)
        case NonFatal(e) =>
          logError(s"Failed to contact YARN for application $appId.", e)
          return (YarnApplicationState.FAILED, FinalApplicationStatus.FAILED)
      }
    val state = report.getYarnApplicationState

    if (logApplicationReport) {
      logInfo(s"Application report for $appId (state: $state)")

      // If DEBUG is enabled, log report details every iteration
      // Otherwise, log them every time the application changes state
      if (log.isDebugEnabled) {
        logDebug(formatReportDetails(report))
      } else if (lastState != state) {
        logInfo(formatReportDetails(report))
      }
    }

    if (state == YarnApplicationState.FINISHED ||
        state == YarnApplicationState.FAILED ||
        state == YarnApplicationState.KILLED) {
      cleanupStagingDir(appId)
      return (state, report.getFinalApplicationStatus) // return the final result
    }

    if (returnOnRunning && state == YarnApplicationState.RUNNING) {
      return (state, report.getFinalApplicationStatus)
    }

    lastState = state
  }

  // Never reached, but keeps compiler happy
  throw new SparkException("While loop is depleted! This should never happen...")
}
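Note that client mode does not block here until completion: YarnClientSchedulerBackend polls only until YARN reports RUNNING. A hedged sketch of that call site (Spark 1.x shape):

// Sketch: client mode waits only until the application is RUNNING.
val (state, _) = client.monitorApplication(appId, returnOnRunning = true)
if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) {
  throw new SparkException("Yarn application has already ended! " +
    "It might have been killed or unable to launch application master.")
}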
At this point the submission flow is complete; YARN then uses its distributed-cache mechanism to deploy the application's resources to the compute nodes.
Now a deeper look at the verifyClusterResources(newAppResponse) method:
private def verifyClusterResources(newAppResponse: GetNewApplicationResponse): Unit = {
  val maxMem = newAppResponse.getMaximumResourceCapability().getMemory() // the maximum memory a single container may request
  logInfo("Verifying our application has not requested more than the maximum " +
    s"memory capability of the cluster ($maxMem MB per container)")
  val executorMem = args.executorMemory + executorMemoryOverhead // configured executor memory plus overhead; args.executorMemory defaults to a hard-coded 1024 MB in Spark 1.5
  if (executorMem > maxMem) { // the executor needs more memory than a container can provide
    throw new IllegalArgumentException(s"Required executor memory (${args.executorMemory}" +
      s"+$executorMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster! " +
      "Please increase the value of 'yarn.scheduler.maximum-allocation-mb'.")
  }
  val amMem = args.amMemory + amMemoryOverhead
  // args.amMemory defaults to a hard-coded 512 MB in Spark 1.5; amMemoryOverhead is read from
  //   if (isClusterMode) driverMemOverheadKey else amMemOverheadKey
  // where:
  //   val driverMemOverheadKey = "spark.yarn.driver.memoryOverhead"
  //   val amMemOverheadKey = "spark.yarn.am.memoryOverhead"
  if (amMem > maxMem) {
    throw new IllegalArgumentException(s"Required AM memory (${args.amMemory}" +
      s"+$amMemoryOverhead MB) is above the max threshold ($maxMem MB) of this cluster! " +
      "Please increase the value of 'yarn.scheduler.maximum-allocation-mb'.")
  }
  logInfo("Will allocate AM container, with %d MB memory including %d MB overhead".format(
    amMem,
    amMemoryOverhead))

  // We could add checks to make sure the entire cluster has enough resources but that involves
  // getting all the node reports and computing ourselves.
}
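To make the check concrete, a worked example, assuming the Spark 1.5 overhead rule overhead = max(0.10 * memory, 384 MB):

// Worked example of the verifyClusterResources arithmetic (values illustrative, in MB).
val maxMem = 8192                                           // yarn.scheduler.maximum-allocation-mb
val executorMemory = 6144                                   // --executor-memory 6g
val overhead = math.max((0.10 * executorMemory).toInt, 384) // 614 MB
val executorMem = executorMemory + overhead                 // 6758 MB <= 8192: passes
// With --executor-memory 8g instead: 8192 + 819 = 9011 MB > 8192,
// so the IllegalArgumentException above would be thrown.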
Summary:
Cluster mode:
Client-side steps:
1. Inside SparkSubmit, yarnClient is initialized from yarnConf and started;
2. A client-side Application is created and its Application ID obtained, then the cluster is checked for enough resources for the executors and the ApplicationMaster, with an IllegalArgumentException thrown if it falls short;
3. Resources and environment variables are set up: the Application's staging directory, local resources (jar files, log4j.properties), the Application's environment variables, the Container launch context, and so on;
4. The Application submission context is set up: the application name, the queue, the Container requested for the AM, and the job type marked as spark;
5. Memory is requested, and the Application is finally submitted to the ResourceManager through submitApplication.
Once the job has been submitted to YARN the client is done and its process exits, because the whole job runs on the YARN cluster and its results are written to HDFS or to the logs.
YARN-side steps:
1. Run the ApplicationMaster's run method;
2. Set up the relevant environment variables;
3. Create amClient and start it;
4. Install the Spark UI's AmIpFilter before the Spark UI starts;
5. In the startUserClass function, start a dedicated thread (named Driver) that runs the user-submitted Application, i.e. start the Driver, inside which the SparkContext is initialized;
6. Wait for SparkContext initialization to complete, for at most spark.yarn.applicationMaster.waitTries attempts (default 10); if the wait exceeds the configured count the process exits, otherwise the SparkContext is used to initialize yarnAllocator.
How does the AM know that SparkContext initialization has completed?
Starting the Application in step 5 initializes the SparkContext, and that initialization creates a YarnClusterScheduler; when SparkContext initialization finishes, the postStartHook method of YarnClusterScheduler is invoked, and that method notifies the ApplicationMaster that the SparkContext is ready.
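The hook itself is tiny; a sketch that follows the Spark 1.x YarnClusterScheduler closely (treat it as illustrative rather than an exact copy):

// Sketch: the TaskScheduler's start hook notifies the ApplicationMaster.
private[spark] class YarnClusterScheduler(sc: SparkContext) extends YarnScheduler(sc) {
  override def postStartHook() {
    ApplicationMaster.sparkContextInitialized(sc) // unblocks the AM's wait from step 6
    super.postStartHook()
    logInfo("YarnClusterScheduler.postStartHook done")
  }
}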
Why wait for SparkContext initialization to complete?
Because each CoarseGrainedExecutorBackend must register with the CoarseGrainedSchedulerBackend once it starts.
7. When SparkContext initialization completes, the ApplicationMaster is registered with the ResourceManager through amClient;
8. Executors are allocated and launched. Before launching them, yarnAllocator must first obtain numExecutors Containers, and the executors are then started inside those Containers. If launching executors fails maxNumExecutorFailures times, the Application fails: its status is marked FAILED and the SparkContext is shut down. Executors are actually launched through ExecutorRunnable, which internally starts CoarseGrainedExecutorBackend, and CoarseGrainedExecutorBackend registers with the SchedulerBackend once it is up. (How does the ResourceManager decide how many containers to allocate? The count follows the arguments passed at spark-submit time; by default two executors are started, as the sketch below shows.)
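A minimal sketch of where that executor count comes from (the default of 2 matches the text above; the key is the real spark.executor.instances, settable via --num-executors):

// Sketch: the number of executor containers the AM will request.
val numExecutors = sparkConf.getInt("spark.executor.instances", 2)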
9. Finally, Tasks run inside CoarseGrainedExecutorBackend, and their progress is reported to CoarseGrainedScheduler over Akka until the job completes.
Client mode:
Client-side steps:
1. The launch function of the SparkSubmit class calls the job's main function directly (via reflection); in cluster mode it would call Client's main function instead.
2. The application's main function always holds a SparkContext and initializes it;
3. SparkContext initialization does the following in order: applies the relevant configuration; registers MapOutputTracker, BlockManagerMaster, and BlockManager; and creates the taskScheduler and dagScheduler, the last two being the important part. When the taskScheduler is created, the Scheduler and SchedulerBackend are chosen according to the master passed in. Since yarn-client mode was chosen, the program picks YarnClientClusterScheduler and YarnClientSchedulerBackend and initializes the YarnClientClusterScheduler with the YarnClientSchedulerBackend instance; both instances are obtained via reflection. YarnClientSchedulerBackend is a subclass of CoarseGrainedSchedulerBackend, while YarnClientClusterScheduler is a subclass of TaskSchedulerImpl that merely overrides TaskSchedulerImpl's getRackForHost method.
4. After the taskScheduler is initialized, the dagScheduler is created and taskScheduler.start() starts the taskScheduler; starting the taskScheduler also calls the SchedulerBackend's start method. During SchedulerBackend startup, a set of parameters is initialized and wrapped in ClientArguments, the wrapped ClientArguments is passed into the Client class, and client.submitApplication() returns the Application ID, as the sketch below shows.
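A hedged sketch of that SchedulerBackend start-up, abbreviated from the Spark 1.x YarnClientSchedulerBackend (names follow that version; details trimmed):

import scala.collection.mutable.ArrayBuffer

// Sketch: client mode packages the driver's coordinates into ClientArguments,
// then reuses the same Client.submitApplication() analyzed above.
override def start() {
  super.start()
  val driverHost = conf.get("spark.driver.host")
  val driverPort = conf.get("spark.driver.port")
  val argsArrayBuf = new ArrayBuffer[String]()
  argsArrayBuf += ("--arg", s"$driverHost:$driverPort") // tells the AM where the driver lives
  val args = new ClientArguments(argsArrayBuf.toArray, conf)
  client = new Client(args, conf)
  appId = client.submitApplication()
  waitForApplication() // returns once YARN reports RUNNING
}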
YARN-side steps:
1. Run the ApplicationMaster's run method (runExecutorLauncher);
2. There is no need to wait for SparkContext initialization (the YarnClientClusterScheduler has already started); the Application is registered with sparkYarnAM;
3. Executors are allocated; the allocation logic is similar to yarn-cluster's and is not repeated here.
4. Tasks then run inside CoarseGrainedExecutorBackend, and their progress is reported to CoarseGrainedScheduler over Akka until the job completes.
5. While the job is running, YarnClientSchedulerBackend fetches the job's status through the client every second and prints the corresponding progress information; once the Application's state is one of FINISHED, FAILED, or KILLED, the program stops waiting and exits.
6. Finally, a thread confirms the Application's state one more time; when it is FINISHED, FAILED, or KILLED, the run is complete and the SparkContext is stopped. That concludes the whole flow.
From the ITPUB blog: http://blog.itpub.net/29754888/viewspace-1815323/ (please credit the source when reproducing).