原始碼位置:org.apache.spark.deploy.master.Master.scala
一、main主方法:
-
/**
 * Entry point of the standalone Master process.
 *
 * Registers the logger with SignalLogger, builds a SparkConf, parses the
 * command-line arguments, starts the actor system together with the Master
 * actor, and then blocks until that actor system terminates.
 *
 * @param argStrings raw command-line arguments, handed to MasterArguments
 */
def main(argStrings: Array[String]): Unit = {
  SignalLogger.register(log)
  val conf = new SparkConf
  val args = new MasterArguments(argStrings, conf)
  // Only the actor system is needed here; the other three tuple members are ignored.
  val (actorSystem, _, _, _) = startSystemAndActor(args.host, args.port, args.webUiPort, conf)
  actorSystem.awaitTermination()
}
main 方法會解析 Spark 相關的環境變數及命令列引數,並建立 Akka 的 ActorSystem 與 ActorRef,用於與其他節點互動;負責處理訊息的類別是 Master。
二、Actor preStart方法
-
Master 本身是一個 Akka Actor;Actor 建立並啟動後,Akka 會在它處理任何訊息之前先呼叫其 preStart 方法。
-
/**
 * Actor lifecycle hook for the Master.
 *
 * In order, it: subscribes to remoting lifecycle events, binds the web UI and
 * records its URL, schedules the periodic worker-timeout check, starts both
 * metrics systems and attaches their servlet handlers to the web UI, and
 * finally picks the persistence engine and leader election agent that match
 * RECOVERY_MODE.
 */
override def preStart(): Unit = {
  logInfo("Starting Spark master at " + masterUrl)
  logInfo(s"Running Spark version ${org.apache.spark.SPARK_VERSION}")

  // Receive RemotingLifecycleEvent messages published on the actor system's event stream.
  context.system.eventStream.subscribe(self, classOf[RemotingLifecycleEvent])
  // The UI has to be bound first: the URL below is built from its bound port.
  webUi.bind()
  masterWebUiUrl = "http://" + masterPublicAddress + ":" + webUi.boundPort

  // Send CheckForWorkerTimeOut to ourselves immediately, then every WORKER_TIMEOUT ms.
  context.system.scheduler.schedule(0.millis, WORKER_TIMEOUT.millis, self, CheckForWorkerTimeOut)

  masterMetricsSystem.registerSource(masterSource)
  masterMetricsSystem.start()
  applicationMetricsSystem.start()

  // The handlers are attached only after both metrics systems have been started.
  masterMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)
  applicationMetricsSystem.getServletHandlers.foreach(webUi.attachHandler)

  // Each recovery mode supplies its own (persistence engine, leader election agent) pair.
  val (persistenceEngine_, leaderElectionAgent_) = RECOVERY_MODE match {
    case "ZOOKEEPER" =>
      logInfo("Persisting recovery state to ZooKeeper")
      val zkFactory =
        new ZooKeeperRecoveryModeFactory(conf, SerializationExtension(context.system))
      (zkFactory.createPersistenceEngine(), zkFactory.createLeaderElectionAgent(this))
    case "FILESYSTEM" =>
      val fsFactory =
        new FileSystemRecoveryModeFactory(conf, SerializationExtension(context.system))
      (fsFactory.createPersistenceEngine(), fsFactory.createLeaderElectionAgent(this))
    case "CUSTOM" =>
      // The factory class is named by configuration and instantiated reflectively through
      // its (SparkConf, Serialization) constructor.
      val clazz = Class.forName(conf.get("spark.deploy.recoveryMode.factory"))
      val factory = clazz.getConstructor(classOf[SparkConf], classOf[Serialization])
        .newInstance(conf, SerializationExtension(context.system))
        .asInstanceOf[StandaloneRecoveryModeFactory]
      (factory.createPersistenceEngine(), factory.createLeaderElectionAgent(this))
    case _ =>
      // Any other mode value ends up here.
      // NOTE(review): presumably a no-op engine plus a single-master agent - confirm.
      (new BlackHolePersistenceEngine(), new MonarchyLeaderAgent(this))
  }
  persistenceEngine = persistenceEngine_
  leaderElectionAgent = leaderElectionAgent_
}
至此,Master 在啟動階段主動執行的流程就結束了;之後它只會接收其他節點送來的請求,並被動地加以處理。
三、接受worker節點的註冊
-
case RegisterWorker(id, workerHost, workerPort, cores, memory, workerUiPort, publicAddress) =>
{
  // Handles a worker's registration request. The outcome depends on the master's
  // recovery state and on whether the worker id is already known.
  logInfo("Registering worker %s:%d with %d cores, %s RAM".format(
    workerHost, workerPort, cores, Utils.megabytesToString(memory)))
  state match {
    case RecoveryState.STANDBY =>
      // A standby master does nothing here and sends no reply.

    case _ if idToWorker.contains(id) =>
      sender ! RegisterWorkerFailed("Duplicate worker ID")

    case _ =>
      val worker = new WorkerInfo(id, workerHost, workerPort, cores, memory,
        sender, workerUiPort, publicAddress)
      val accepted = registerWorker(worker)
      if (!accepted) {
        val workerAddress = worker.actor.path.address
        logWarning("Worker registration failed. Attempted to re-register worker at same " +
          "address: " + workerAddress)
        sender ! RegisterWorkerFailed("Attempted to re-register worker at same address: "
          + workerAddress)
      } else {
        // Record the worker in the persistence engine before acknowledging it, then run a
        // scheduling pass now that a new worker is registered.
        persistenceEngine.addWorker(worker)
        sender ! RegisteredWorker(masterUrl, masterWebUiUrl)
        schedule()
      }
  }
}
-
-
/**
 * Runs one scheduling pass: places waiting drivers on workers first, then
 * starts executors for the waiting applications.
 *
 * Does nothing unless this master's state is ALIVE.
 */
private def schedule(): Unit = {
  if (state == RecoveryState.ALIVE) {
    // Visit the workers in random order.
    val shuffledWorkers = Random.shuffle(workers)
    for (worker <- shuffledWorkers if worker.state == WorkerState.ALIVE) {
      // Iterate over a snapshot: a launched driver is removed from waitingDrivers inside
      // the loop, and removing from a mutable buffer while iterating over it can skip
      // the element that follows the removed one.
      for (driver <- waitingDrivers.toList) {
        if (worker.memoryFree >= driver.desc.mem && worker.coresFree >= driver.desc.cores) {
          launchDriver(worker, driver)
          waitingDrivers -= driver
        }
      }
    }
    startExecutorsOnWorkers()
  }
}
-
-
/**
 * Hands out executors to the waiting applications, in the order of waitingApps.
 *
 * With spreadOutApps set, each application's cores are dealt round-robin over
 * its usable workers (ALIVE, with enough free memory and free cores for one
 * executor), visiting the workers with the most free cores first. Otherwise
 * the workers are visited one by one and every waiting application takes as
 * many of the current worker's free cores as it still needs.
 */
private def startExecutorsOnWorkers(): Unit = {
  if (spreadOutApps) {
    for (app <- waitingApps if app.coresLeft > 0) {
      val usableWorkers = workers.toArray
        .filter(_.state == WorkerState.ALIVE)
        .filter(worker => worker.memoryFree >= app.desc.memoryPerExecutorMB &&
          worker.coresFree >= app.desc.coresPerExecutor.getOrElse(1))
        .sortBy(_.coresFree).reverse
      val numUsable = usableWorkers.length
      // assigned(i) is the number of cores granted on usableWorkers(i).
      val assigned = new Array[Int](numUsable)
      // Deal cores in whole-executor steps. Dealing single cores while an explicit
      // executor size is configured can leave a worker with fewer cores than one
      // executor needs, in which case nothing is launched for that share.
      // The lower bound of 1 keeps the loop below terminating for sizes <= 0.
      val coreStep = math.max(1, app.desc.coresPerExecutor.getOrElse(1))
      var toAssign = math.min(app.coresLeft, usableWorkers.map(_.coresFree).sum)
      // Stop once a full round over the workers could not place another step.
      var progressed = true
      while (toAssign >= coreStep && progressed) {
        progressed = false
        for (pos <- 0 until numUsable) {
          if (toAssign >= coreStep && usableWorkers(pos).coresFree - assigned(pos) >= coreStep) {
            toAssign -= coreStep
            assigned(pos) += coreStep
            progressed = true
          }
        }
      }

      // The split is decided; now actually allocate on each worker that got a share.
      for (pos <- 0 until numUsable if assigned(pos) > 0) {
        allocateWorkerResourceToExecutors(app, assigned(pos), usableWorkers(pos))
      }
    }
  } else {
    for (worker <- workers if worker.coresFree > 0 && worker.state == WorkerState.ALIVE) {
      for (app <- waitingApps if app.coresLeft > 0) {
        // Never hand out more cores than this worker still has free.
        // NOTE(review): assumes worker.coresFree reflects executors launched for earlier
        // apps in this loop - confirm against WorkerInfo.
        val coresToUse = math.min(worker.coresFree, app.coresLeft)
        if (coresToUse > 0) {
          allocateWorkerResourceToExecutors(app, coresToUse, worker)
        }
      }
    }
  }
}
-
-
/**
 * Launches executors for `app` on `worker` out of a budget of cores.
 *
 * Every executor gets the application's configured cores-per-executor, or the
 * whole budget when none is configured. Executors keep being launched while
 * the remaining budget covers one more executor and the worker still has
 * enough free memory for it.
 *
 * @param app             application receiving the executors
 * @param coresToAllocate core budget granted to the application on this worker
 * @param worker          worker on which the executors are launched
 */
private def allocateWorkerResourceToExecutors(
    app: ApplicationInfo,
    coresToAllocate: Int,
    worker: WorkerInfo): Unit = {
  val executorMemory = app.desc.memoryPerExecutorMB
  val executorCores = app.desc.coresPerExecutor.getOrElse(coresToAllocate)

  // Launch one executor per step, carrying the remaining core budget along.
  @scala.annotation.tailrec
  def launchWhilePossible(remainingCores: Int): Unit = {
    if (remainingCores >= executorCores && worker.memoryFree >= executorMemory) {
      val exec = app.addExecutor(worker, executorCores)
      launchExecutor(worker, exec)
      app.state = ApplicationState.RUNNING
      launchWhilePossible(remainingCores - executorCores)
    }
  }

  launchWhilePossible(coresToAllocate)
}
除 HA 相關訊息之外,Master 還可接收如下訊息。這些訊息的處理邏輯都比較簡單,後續會結合 Job 的提交過程逐步分析。
case RequestSubmitDriver(description)//請求提交Driver訊息,記錄Driver的資訊並排程
case RequestKillDriver(driverId)
case RequestDriverStatus(driverId)
case RegisterApplication(description)//提交Application,記錄Application的資訊並排程
case ExecutorStateChanged(appId, execId, state, message, exitStatus)
case DriverStateChanged(driverId, state, exception)
case Heartbeat(workerId)//心跳,用於worker節點的保活
轉載:http://blog.csdn.net/yueqian_zhu/article/details/47907095