引言
隨著大資料的發展,任務排程系統成為了資料處理和管理中至關重要的部分。Apache DolphinScheduler 是一款優秀的開源分散式工作流排程平臺,在大資料場景中得到廣泛應用。
在本文中,我們將對 Apache DolphinScheduler 1.3.9 版本的原始碼進行深入分析,介紹 Master 啟動以及排程流程。
透過這些分析,開發者可以更好地理解 DolphinScheduler 的工作機制,並在實際使用中更高效地進行二次開發或最佳化。
Master Server啟動
啟動流程圖
Master排程工作流流程圖
MasterServer啟動方法
/**
 * Starts the master server.
 *
 * <p>In order: starts the Netty remoting server with its three command processors, starts the
 * ZooKeeper master client, starts the scheduler service, starts Quartz, and finally registers
 * a JVM shutdown hook.
 */
public void run() {
    // remoting server listening on the configured master port
    NettyServerConfig nettyConfig = new NettyServerConfig();
    nettyConfig.setListenPort(masterConfig.getListenPort());
    this.nettyRemotingServer = new NettyRemotingServer(nettyConfig);
    // one processor per command type handled by the master
    this.nettyRemotingServer.registerProcessor(CommandType.TASK_EXECUTE_RESPONSE, new TaskResponseProcessor());
    this.nettyRemotingServer.registerProcessor(CommandType.TASK_EXECUTE_ACK, new TaskAckProcessor());
    this.nettyRemotingServer.registerProcessor(CommandType.TASK_KILL_RESPONSE, new TaskKillResponseProcessor());
    this.nettyRemotingServer.start();

    // ZooKeeper client; this server is handed over as the "stoppable" target
    this.zkMasterClient.start();
    this.zkMasterClient.setStoppable(this);

    // scheduler
    this.masterSchedulerService.start();

    // Quartz: a start failure is logged (after a best-effort shutdown), never rethrown
    try {
        logger.info("start Quartz server...");
        QuartzExecutors.getInstance().start();
    } catch (Exception startFailure) {
        try {
            QuartzExecutors.getInstance().shutdown();
        } catch (SchedulerException shutdownFailure) {
            logger.error("QuartzExecutors shutdown failed : " + shutdownFailure.getMessage(), shutdownFailure);
        }
        logger.error("start Quartz failed", startFailure);
    }

    // shutdown hook: close the server if it is still flagged as running when the JVM exits
    Thread shutdownHook = new Thread(() -> {
        if (Stopper.isRunning()) {
            close("shutdownHook");
        }
    });
    Runtime.getRuntime().addShutdownHook(shutdownHook);
}
- nettyServer會註冊三種Command
- TASK_EXECUTE_ACK:Worker在接收到Master執行任務的請求後,會給Master傳送一條Ack Command,告訴Master已經開始執行Task了。
- TASK_EXECUTE_RESPONSE:Worker在執行完Task之後,會給Master傳送一條Response Command,告訴Master任務排程/執行結果。
- TASK_KILL_RESPONSE:Master接收到Task停止的請求後,會給Worker傳送TASK_KILL_REQUEST Command,之後Worker會把TASK_KILL_RESPONSE Command返回給Master。
- 啟動排程和定時器。
- 新增ShutdownHook,關閉資源。
Master 配置檔案
master.listen.port=5678
# 限制Process Instance併發排程的執行緒數
master.exec.threads=100
# 限制每個ProcessInstance可以執行的任務數
master.exec.task.num=20
# 每一批次可以分發的任務數
master.dispatch.task.num=3
# master需要選擇一個穩定的worker去執行任務
# 演算法有:Random,RoundRobin,LowerWeight。預設是LowerWeight
master.host.selector=LowerWeight
# master需要向Zookeeper傳送心跳,單位:秒
master.heartbeat.interval=10
# master提交任務失敗,重試次數
master.task.commit.retryTimes=5
# master提交任務失敗,重試時間間隔
master.task.commit.interval=1000
# master最大cpu平均負載,只有當系統cpu平均負載還沒有達到這個值,master才能排程任務
# 預設值為-1,表示使用系統cpu核數 * 2
master.max.cpuload.avg=-1
# master為其他程序保留記憶體,只有當系統可用記憶體大於這個值,master才能排程
# 預設值0.3G
master.reserved.memory=0.3
Master Scheduler啟動
MasterSchedulerService初始化方法
/**
 * Creates the executor that runs process instances and the Netty client used by this service.
 */
public void init() {
    // pool size comes from masterConfig.getMasterExecThreads()
    // (master.exec.threads, 100 in the sample master.properties)
    this.masterExecService = (ThreadPoolExecutor) ThreadUtils.newDaemonFixedThreadExecutor(
            "Master-Exec-Thread", masterConfig.getMasterExecThreads());
    this.nettyRemotingClient = new NettyRemotingClient(new NettyClientConfig());
}
MasterSchedulerService啟動方法
/**
 * Scheduler loop: while the server is running, repeatedly checks local resources and, when the
 * ZooKeeper client is started, runs one scheduling round. Any exception is logged and the loop
 * continues.
 */
public void run() {
    logger.info("master scheduler started");
    while (Stopper.isRunning()) {
        try {
            // resource check against the configured max CPU load average and reserved memory;
            // when it fails, back off for SLEEP_TIME_MILLIS and try again
            boolean resourcesAvailable = OSUtils.checkResource(
                    masterConfig.getMasterMaxCpuloadAvg(), masterConfig.getMasterReservedMemory());
            if (!resourcesAvailable) {
                Thread.sleep(Constants.SLEEP_TIME_MILLIS);
                continue;
            }
            boolean zkStarted = zkMasterClient.getZkClient().getState() == CuratorFrameworkState.STARTED;
            if (zkStarted) {
                // the actual scheduling work happens here
                scheduleProcess();
            }
        } catch (Exception e) {
            logger.error("master scheduler thread error", e);
        }
    }
}
MasterSchedulerService排程方法
/**
 * Runs one scheduling round under the distributed mutex: fetches at most one command, turns it
 * into a process instance and hands that instance to the master executor.
 *
 * @throws Exception if acquiring the mutex, reading the command, sleeping, or releasing the
 *                   mutex fails; failures while handling a found command are logged and the
 *                   command is moved to the error table instead
 */
private void scheduleProcess() throws Exception {
    InterProcessMutex lock = null;
    try {
        // blocks until the distributed lock is acquired
        lock = zkMasterClient.blockAcquireMutex();
        // snapshot of the currently active executor threads
        int busyThreads = masterExecService.getActiveCount();
        // make sure to scan and delete command table in one transaction
        Command command = processService.findOneCommand();
        if (command == null) {
            // indicate that no command, sleep before the next round
            Thread.sleep(Constants.SLEEP_TIME_MILLIS);
            return;
        }
        logger.info("find one command: id: {}, type: {}", command.getId(),command.getCommandType());
        try {
            String host = getLocalAddress();
            // threads still free = configured master exec threads minus the active ones
            int freeThreads = this.masterConfig.getMasterExecThreads() - busyThreads;
            ProcessInstance processInstance = processService.handleCommand(logger, host, freeThreads, command);
            if (processInstance != null) {
                logger.info("start master exec thread , split DAG ...");
                MasterExecThread execThread =
                        new MasterExecThread(processInstance, processService, nettyRemotingClient);
                masterExecService.execute(execThread);
            }
        } catch (Exception e) {
            logger.error("scan command error ", e);
            processService.moveToErrorCommand(command, e.toString());
        }
    } finally {
        // always release the distributed lock
        zkMasterClient.releaseMutex(lock);
    }
}
ProcessService處理Command的方法
/**
 * Builds and persists the process instance for a command, then deletes the command.
 *
 * @param logger         logger used for the messages emitted here
 * @param host           host passed on to constructProcessInstance
 * @param validThreadNum number of threads still available, passed to checkThreadNum
 * @param command        the command to handle
 * @return the saved process instance; {@code null} when no instance could be constructed; or
 *         the result of setWaitingThreadProcess when the thread check fails
 */
public ProcessInstance handleCommand(Logger logger, String host, int validThreadNum, Command command) {
    ProcessInstance instance = constructProcessInstance(command, host);
    if (instance == null) {
        // cannot construct process instance: record the command as failed and give up
        logger.error("scan command, command parameter is error: {}", command);
        moveToErrorCommand(command, "process instance is null");
        return null;
    }

    // thread check (per the original note: enough threads for the definition and its sub-processes)
    boolean enoughThreads = checkThreadNum(command, validThreadNum);
    if (!enoughThreads) {
        logger.info("there is not enough thread for this command: {}", command);
        return setWaitingThreadProcess(command, instance);
    }

    instance.setCommandType(command.getCommandType());
    instance.addHistoryCmd(command.getCommandType());
    saveProcessInstance(instance);
    this.setSubProcessParam(instance);
    delCommandByid(command.getId());
    return instance;
}
MasterExecThread初始化方法
/**
 * Creates the executor thread for one process instance.
 *
 * @param processInstance     the process instance to run
 * @param processService      service used for persistence
 * @param nettyRemotingClient Netty client kept for later use
 */
public MasterExecThread(ProcessInstance processInstance, ProcessService processService, NettyRemotingClient nettyRemotingClient) {
    this.processService = processService;
    this.processInstance = processInstance;
    this.masterConfig = SpringApplicationContext.getBean(MasterConfig.class);
    // pool size is masterConfig.getMasterExecTaskNum()
    // (listed as master.exec.task.num in the sample master.properties)
    this.taskExecService = ThreadUtils.newDaemonFixedThreadExecutor(
            "Master-Task-Exec-Thread", masterConfig.getMasterExecTaskNum());
    this.nettyRemotingClient = nettyRemotingClient;
}
MasterExecThread啟動方法
/**
 * Runs the process instance. On any exception the instance is marked FAILURE with an end time
 * and persisted; the task executor is shut down in all cases.
 */
public void run() {
    // ... (leading code omitted in this excerpt)
    try {
        boolean complementRun =
                processInstance.isComplementData() && Flag.NO == processInstance.getIsSubProcess();
        if (!complementRun) {
            // regular execution
            executeProcess();
        } else {
            // complement-data execution (not covered here)
            executeComplementProcess();
        }
    } catch (Exception e) {
        logger.error("master exec thread exception", e);
        logger.error("process execute failed, process id:{}", processInstance.getId());
        processInstance.setState(ExecutionStatus.FAILURE);
        processInstance.setEndTime(new Date());
        processService.updateProcessInstance(processInstance);
    } finally {
        taskExecService.shutdown();
    }
}
/**
 * Executes the process in three phases: prepare, run, end.
 *
 * @throws Exception propagated from any of the three phases
 */
private void executeProcess() throws Exception {
    prepareProcess(); // phase 1: preparation
    runProcess();     // phase 2: execution
    endProcess();     // phase 3: wrap-up
}
/**
 * Drives the process instance: submits the first nodes, then polls once per SLEEP_TIME_MILLIS
 * until the instance reports that it has stopped or the server is no longer running.
 */
private void runProcess(){
// submit starting from the root task (no parent node name is passed)
submitPostNode(null);
// not used in the visible code; presumably consumed by the part omitted below
boolean sendTimeWarning = false;
while(!processInstance.isProcessInstanceStop() && Stopper.isRunning()){
// part of the code is omitted in this excerpt...
// per the original note, this check is based on CPU load average and memory (not visible here)
if(canSubmitTaskToQueue()){
submitStandByTask();
}
try {
Thread.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (InterruptedException e) {
// NOTE(review): the interruption is only logged; the interrupt flag is not restored
logger.error(e.getMessage(),e);
}
updateProcessInstanceState();
}
logger.info("process:{} end, state :{}", processInstance.getId(), processInstance.getState());
}
/**
 * Collects the nodes that follow {@code parentNodeName} and queues the ones that can run.
 *
 * <p>Example from the original note: with the chains
 * <pre>
 * task 1 -> task 2 -> task3
 * task 4 -> task 5
 * task 6
 * </pre>
 * task 1, task 4 and task 6 can run in parallel.
 *
 * @param parentNodeName name of the parent node, or {@code null} to start from the root
 */
private void submitPostNode(String parentNodeName) {
    Set<String> postNodeNames = DagHelper.parsePostNodes(parentNodeName, skipTaskNodeList, dag, completeTaskList);

    // build all task instances first, then decide which ones to queue
    List<TaskInstance> candidates = new ArrayList<>();
    for (String nodeName : postNodeNames) {
        candidates.add(createTaskInstance(processInstance, nodeName, dag.getNode(nodeName)));
    }

    // if previous node success , post node submit
    for (TaskInstance candidate : candidates) {
        if (readyToSubmitTaskQueue.contains(candidate)) {
            // already waiting in the queue
            continue;
        }
        if (completeTaskList.containsKey(candidate.getName())) {
            logger.info("task {} has already run success", candidate.getName());
        } else if (candidate.getState().typeIsPause() || candidate.getState().typeIsCancel()) {
            logger.info("task {} stopped, the state is {}", candidate.getName(), candidate.getState());
        } else {
            // queue the task for submission
            addTaskToStandByList(candidate);
        }
    }
}
/**
 * handling the list of tasks to be submitted
 *
 * <p>Looks at the head of readyToSubmitTaskQueue once per element that was queued on entry and,
 * depending on that task's dependency result, submits it, records it as dependency-failed, or
 * drops it. Any exception is logged and ends the current pass.
 */
private void submitStandByTask(){
try {
int length = readyToSubmitTaskQueue.size();
for (int i=0;i<length;i++) {
// take the head task without removing it; removal happens explicitly below
TaskInstance task = readyToSubmitTaskQueue.peek();
// check the dependency result first: the task is submitted only when it is SUCCESS;
// FAILED and NON_EXEC results remove the task without submitting it
DependResult dependResult = getDependResultForTask(task);
if(DependResult.SUCCESS == dependResult){
// NOTE(review): when retryTaskIntervalOverTime(task) is false the task is not removed,
// so the next iteration presumably peeks the same task again — confirm
if(retryTaskIntervalOverTime(task)){
submitTaskExec(task);
removeTaskFromStandbyList(task);
}
}else if(DependResult.FAILED == dependResult){
// if the dependency fails, the current node is not submitted and the state changes to failure.
dependFailedTask.put(task.getName(), task);
removeTaskFromStandbyList(task);
logger.info("task {},id:{} depend result : {}",task.getName(), task.getId(), dependResult);
} else if (DependResult.NON_EXEC == dependResult) {
// for some reasons(depend task pause/stop) this task would not be submit
removeTaskFromStandbyList(task);
logger.info("remove task {},id:{} , because depend result : {}", task.getName(), task.getId(), dependResult);
}
}
} catch (Exception e) {
logger.error("submit standby task error",e);
}
}
/**
 * Creates the exec thread matching the task type, submits it to the task executor and records
 * its future.
 *
 * @param taskInstance the task to run
 * @return the task instance held by the created exec thread
 */
private TaskInstance submitTaskExec(TaskInstance taskInstance) {
    final MasterBaseTaskExecThread execThread;
    if (taskInstance.isSubProcess()) {
        execThread = new SubProcessTaskExecThread(taskInstance);
    } else if (taskInstance.isDependTask()) {
        execThread = new DependentTaskExecThread(taskInstance);
    } else if (taskInstance.isConditionsTask()) {
        execThread = new ConditionsTaskExecThread(taskInstance);
    } else {
        // every other task type
        execThread = new MasterTaskExecThread(taskInstance);
    }

    Future<Boolean> pending = taskExecService.submit(execThread);
    activeTaskNode.putIfAbsent(execThread, pending);
    return execThread.getTaskInstance();
}
MasterBaseTaskExecThread
MasterBaseTaskExecThread 是 SubProcessTaskExecThread、DependentTaskExecThread、ConditionsTaskExecThread、MasterTaskExecThread 的父類,實現 Callable 介面。
-
SubProcessTaskExecThread
任務例項不會下發到worker節點執行,在submitTask(TaskInstance taskInstance)方法中,針對子流程,會增加一條子流程例項命令,然後在waitTaskQuit方法中迴圈等待子流程執行完成。在當前工作流執行結束後會繼續執行子工作流並做相關狀態更新,子工作流完全完成才同步狀態為子工作流的狀態。
-
DependentTaskExecThread
Dependent 節點,就是依賴檢查節點。比如 A 流程依賴昨天的 B 流程執行成功,依賴節點會去檢查 B 流程在昨天是否有執行成功的例項。
-
ConditionsTaskExecThread
Conditions 是一個條件節點,根據上游任務執行狀態,判斷應該執行哪個下游任務。截止目前 Conditions 支援多個上游任務,但只支援兩個下游任務。當上遊任務數超過一個時,可以透過且以及或運算子實現複雜上游依賴。
-
MasterTaskExecThread
將任務例項下發到worker節點執行,並在waitTaskQuit方法中迴圈等待任務例項執行完成,任務完成後則退出。例如SQL,Shell等任務型別。
MasterBaseTaskExecThread初始化方法
/**
 * Base constructor shared by all master-side task exec threads: looks up the collaborating
 * beans from the Spring context, stores the task instance and initializes the task parameters.
 *
 * @param taskInstance the task instance this thread is responsible for
 */
public MasterBaseTaskExecThread(TaskInstance taskInstance) {
    // collaborators resolved from the Spring application context
    this.processService = SpringApplicationContext.getBean(ProcessService.class);
    this.alertDao = SpringApplicationContext.getBean(AlertDao.class);

    // the thread starts in the "not cancelled" state
    this.cancel = false;
    this.taskInstance = taskInstance;

    this.masterConfig = SpringApplicationContext.getBean(MasterConfig.class);
    this.taskUpdateQueue = SpringApplicationContext.getBean(TaskPriorityQueueImpl.class);

    // runs last, after all fields above are assigned
    initTaskParams();
}
MasterBaseTaskExecThread執行方法
/**
 * Loads the owning process instance and delegates to {@code submitWaitComplete()}.
 *
 * @return the result of {@code submitWaitComplete()}
 * @throws Exception propagated from the lookup or from {@code submitWaitComplete()}
 */
@Override
public Boolean call() throws Exception {
    int processInstanceId = taskInstance.getProcessInstanceId();
    this.processInstance = processService.findProcessInstanceById(processInstanceId);
    // implemented by each subclass
    return submitWaitComplete();
}
MasterBaseTaskExecThread公共方法
submit()
/**
 * Persists the task instance and dispatches it, retrying up to the configured number of times.
 *
 * <p>Each attempt first saves the task (until that succeeded once) and then dispatches it.
 * After a failed attempt the thread sleeps for the configured interval; an exception inside an
 * attempt is logged and the next attempt starts without that sleep.
 *
 * @return the persisted task once both steps succeeded; otherwise whatever the last
 *         {@code processService.submitTask} call returned (possibly {@code null})
 */
protected TaskInstance submit() {
    // retry count (master.task.commit.retryTimes, 5 in the sample config)
    Integer maxAttempts = masterConfig.getMasterTaskCommitRetryTimes();
    // pause between attempts (master.task.commit.interval, 1000 in the sample config)
    Integer retryInterval = masterConfig.getMasterTaskCommitInterval();

    boolean persisted = false;
    boolean dispatched = false;
    TaskInstance task = null;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            if (!persisted) {
                // step 1: save the task instance to the database
                task = processService.submitTask(taskInstance);
                persisted = task != null && task.getId() != 0;
            }
            if (persisted && !dispatched) {
                // step 2: hand the task over for dispatching to a worker
                dispatched = dispatchTask(task);
            }
            if (persisted && dispatched) {
                return task;
            }
            if (!persisted) {
                logger.error("task commit to db failed , taskId {} has already retry {} times, please check the database", taskInstance.getId(), attempt);
            } else if (!dispatched) {
                logger.error("task commit failed , taskId {} has already retry {} times, please check", taskInstance.getId(), attempt);
            }
            Thread.sleep(retryInterval);
        } catch (Exception e) {
            logger.error("task commit to mysql and dispatcht task failed",e);
        }
    }
    return task;
}
dispatchTask(TaskInstance task)
/**
 * Puts a task on the priority queue so that it can be dispatched to a worker.
 *
 * <p>Nothing is queued, and {@code true} is returned, when the task is a conditions, dependent
 * or sub-process task, when its state is already finished, or when it is already running.
 *
 * @param taskInstance the task to dispatch
 * @return {@code true} when the task was queued or needs no queuing; {@code false} when an
 *         exception occurred (the exception and the task are logged)
 */
public Boolean dispatchTask(TaskInstance taskInstance) {
    try {
        // sub-process, conditions and dependent tasks are not sent to a worker
        if (taskInstance.isConditionsTask()
                || taskInstance.isDependTask()
                || taskInstance.isSubProcess()) {
            return true;
        }
        if (taskInstance.getState().typeIsFinished()) {
            logger.info("submit task , but task [{}] state [{}] is already finished. ",
                    taskInstance.getName(), taskInstance.getState());
            return true;
        }
        // task cannot submit when running
        if (taskInstance.getState() == ExecutionStatus.RUNNING_EXECUTION) {
            logger.info("submit to task, but task [{}] state already be running. ", taskInstance.getName());
            return true;
        }
        logger.info("task ready to submit: {}", taskInstance);

        // priority key built from process/task priority and ids, with the default worker group
        TaskPriority taskPriority = buildTaskPriority(processInstance.getProcessInstancePriority().getCode(),
                processInstance.getId(),
                taskInstance.getProcessInstancePriority().getCode(),
                taskInstance.getId(),
                org.apache.dolphinscheduler.common.Constants.DEFAULT_WORKER_GROUP);
        // queued here; per the original note a consumer of the task priority queue takes
        // entries from it and submits them to a worker
        taskUpdateQueue.put(taskPriority);
        logger.info("master submit success, task : {}", taskInstance.getName());
        return true;
    } catch (Exception e) {
        logger.error("submit task Exception: ", e);
        // SLF4J substitutes "{}" placeholders only; the former "%s" left the task JSON unlogged
        logger.error("task error : {}", JSONUtils.toJson(taskInstance));
        return false;
    }
}
MasterTaskExecThread
submitWaitComplete()
/**
 * Submits the task, waits for it to finish unless it is already in a finished state, then
 * stamps the end time and persists the task instance.
 *
 * @return {@code false} when submission returned no task or the task was already finished;
 *         otherwise the result of {@code waitTaskQuit()}
 */
public Boolean submitWaitComplete() {
    // submit the task
    this.taskInstance = submit();
    if (this.taskInstance == null) {
        logger.error("submit task instance to mysql and queue failed , please check and fix it");
        return false;
    }

    Boolean result = false;
    boolean alreadyFinished = this.taskInstance.getState().typeIsFinished();
    if (!alreadyFinished) {
        // wait for the execution result
        result = waitTaskQuit();
    }

    taskInstance.setEndTime(new Date());
    processService.updateTaskInstance(taskInstance);
    logger.info("task :{} id:{}, process id:{}, exec thread completed ",
            this.taskInstance.getName(),taskInstance.getId(), processInstance.getId() );
    return result;
}
waitTaskQuit()
/**
 * Polls the task instance until it reaches a finished state, the process instance can no longer
 * be found, or the server stops. On each pass it also reacts to cancel/stop and pause requests
 * and checks for a task timeout.
 *
 * @return always {@code true}
 */
public Boolean waitTaskQuit(){
// query new state
taskInstance = processService.findTaskInstanceById(taskInstance.getId());
logger.info("wait task: process id: {}, task id:{}, task name:{} complete",
this.taskInstance.getProcessInstanceId(), this.taskInstance.getId(), this.taskInstance.getName());
while (Stopper.isRunning()){
try {
if(this.processInstance == null){
logger.error("process instance not exists , master task exec thread exit");
return true;
}
// task instance add queue , waiting worker to kill
if(this.cancel || this.processInstance.getState() == ExecutionStatus.READY_STOP){
cancelTaskInstance();
}
if(processInstance.getState() == ExecutionStatus.READY_PAUSE){
pauseTask();
}
// task instance finished
if (taskInstance.getState().typeIsFinished()){
// if task is final result , then remove taskInstance from cache
// per the original note, taskInstanceCacheManager is implemented by
// org.apache.dolphinscheduler.server.master.cache.impl.TaskInstanceCacheManagerImpl and task
// instances are added to that cache when ack/response commands arrive (not visible here)
taskInstanceCacheManager.removeByTaskInstanceId(taskInstance.getId());
break;
}
if (checkTaskTimeout()) {
this.checkTimeoutFlag = !alertTimeout();
}
// updateProcessInstance task instance
taskInstance = processService.findTaskInstanceById(taskInstance.getId());
processInstance = processService.findProcessInstanceById(processInstance.getId());
Thread.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (Exception e) {
// NOTE(review): the loop continues after any exception, and the sleep above is skipped
// on that pass
logger.error("exception",e);
if (processInstance != null) {
logger.error("wait task quit failed, instance id:{}, task id:{}",
processInstance.getId(), taskInstance.getId());
}
}
}
return true;
}
SubProcessTaskExecThread
submitWaitComplete()
/**
 * Submits the sub-process task, waits for the sub workflow to end and copies its final state
 * onto the task instance (a STOP sub workflow becomes a KILL task).
 *
 * @return {@code true} when the whole sequence completed; {@code false} when submission
 *         returned no task or an exception occurred
 */
public Boolean submitWaitComplete() {
    try {
        // submit task instance
        this.taskInstance = submit();
        if (taskInstance == null) {
            logger.error("sub work flow submit task instance to mysql and queue failed , please check and fix it");
            return false;
        }
        setTaskInstanceState();
        waitTaskQuit();

        // at the end of the subflow , the task state is changed to the subflow state
        subProcessInstance = processService.findSubProcessInstance(processInstance.getId(), taskInstance.getId());
        if (subProcessInstance != null) {
            ExecutionStatus subState = subProcessInstance.getState();
            this.taskInstance.setState(subState == ExecutionStatus.STOP ? ExecutionStatus.KILL : subState);
        }

        taskInstance.setEndTime(new Date());
        processService.updateTaskInstance(taskInstance);
        logger.info("subflow task :{} id:{}, process id:{}, exec thread completed ",
                this.taskInstance.getName(),taskInstance.getId(), processInstance.getId() );
        return true;
    } catch (Exception e) {
        logger.error("exception: ",e);
        if (null != taskInstance) {
            logger.error("wait task quit failed, instance id:{}, task id:{}",
                    processInstance.getId(), taskInstance.getId());
        }
    }
    return false;
}
waitTaskQuit()
/**
 * Waits for the sub workflow to finish. Returns immediately when the task is already in a
 * finished state; otherwise polls once per SLEEP_TIME_MILLIS while the server is running,
 * forwarding pause/stop requests from the parent process to the sub workflow.
 *
 * @throws InterruptedException if the thread is interrupted while sleeping
 */
private void waitTaskQuit() throws InterruptedException {
logger.info("wait sub work flow: {} complete", this.taskInstance.getName());
if (taskInstance.getState().typeIsFinished()) {
logger.info("sub work flow task {} already complete. task state:{}, parent work flow instance state:{}",
this.taskInstance.getName(),
this.taskInstance.getState(),
this.processInstance.getState());
return;
}
while (Stopper.isRunning()) {
// waiting for subflow process instance establishment
if (subProcessInstance == null) {
Thread.sleep(Constants.SLEEP_TIME_MILLIS);
// presumably setTaskInstanceState() assigns subProcessInstance when it returns true — confirm
if(!setTaskInstanceState()){
continue;
}
}
// refresh the sub workflow instance
subProcessInstance = processService.findProcessInstanceById(subProcessInstance.getId());
if (checkTaskTimeout()) {
this.checkTimeoutFlag = !alertTimeout();
handleTimeoutFailed();
}
updateParentProcessState();
if (subProcessInstance.getState().typeIsFinished()){
break;
}
if(this.processInstance.getState() == ExecutionStatus.READY_PAUSE){
// parent process "ready to pause" , child process "pause"
pauseSubProcess();
}else if(this.cancel || this.processInstance.getState() == ExecutionStatus.READY_STOP){
// parent Process "Ready to Cancel" , subflow "Cancel"
stopSubProcess();
}
Thread.sleep(Constants.SLEEP_TIME_MILLIS);
}
}
ConditionsTaskExecThread
submitWaitComplete()
/**
 * Submits the conditions task, switches to a task-specific logger and thread name, evaluates
 * the conditions and stores the resulting task state. Exceptions are logged, not rethrown.
 *
 * @return always {@code true}
 */
public Boolean submitWaitComplete() {
    try {
        this.taskInstance = submit();

        // task-scoped logger
        String taskLoggerName = LoggerUtils.buildTaskId(LoggerUtils.TASK_LOGGER_INFO_PREFIX,
                taskInstance.getProcessDefinitionId(),
                taskInstance.getProcessInstanceId(),
                taskInstance.getId());
        logger = LoggerFactory.getLogger(taskLoggerName);

        // name the current thread after the task
        String taskAppId = processService.formatTaskAppId(this.taskInstance);
        Thread.currentThread().setName(String.format(Constants.TASK_LOG_INFO_FORMAT, taskAppId));

        initTaskParameters();
        logger.info("dependent task start");
        // evaluate the conditions
        waitTaskQuit();
        // persist the final result
        updateTaskState();
    } catch (Exception e) {
        logger.error("conditions task run exception" , e);
    }
    return true;
}
waitTaskQuit
/**
 * Evaluates the condition: records the state of the process instance's valid tasks, computes
 * one result per dependent-task model, and combines those results into {@code conditionResult}.
 */
private void waitTaskQuit() {
    // remember the state of every valid task of this process instance (first value wins)
    List<TaskInstance> validTasks =
            processService.findValidTaskListByProcessId(taskInstance.getProcessInstanceId());
    for (TaskInstance validTask : validTasks) {
        completeTaskList.putIfAbsent(validTask.getName(), validTask.getState());
    }

    // one combined result per dependent-task model
    List<DependResult> perModelResults = new ArrayList<>();
    for (DependentTaskModel model : dependentParameters.getDependTaskList()) {
        List<DependResult> perItemResults = new ArrayList<>();
        for (DependentItem dependItem : model.getDependItemList()) {
            perItemResults.add(getDependResultForItem(dependItem));
        }
        perModelResults.add(DependentUtils.getDependResultForRelation(model.getRelation(), perItemResults));
    }

    // merge the model results with the top-level relation
    conditionResult = DependentUtils.getDependResultForRelation(dependentParameters.getRelation(), perModelResults);
    logger.info("the conditions task depend result : {}", conditionResult);
}
DependentTaskExecThread
submitWaitComplete()
/**
 * Submits the dependent task, switches to a task-specific logger and thread name, waits for the
 * dependencies and stores the resulting task state. Exceptions are logged, not rethrown.
 *
 * @return always {@code true}
 */
public Boolean submitWaitComplete() {
    try {
        logger.info("dependent task start");
        this.taskInstance = submit();

        // task-scoped logger
        String taskLoggerName = LoggerUtils.buildTaskId(LoggerUtils.TASK_LOGGER_INFO_PREFIX,
                taskInstance.getProcessDefinitionId(),
                taskInstance.getProcessInstanceId(),
                taskInstance.getId());
        logger = LoggerFactory.getLogger(taskLoggerName);

        // name the current thread after the task
        String taskAppId = processService.formatTaskAppId(this.taskInstance);
        Thread.currentThread().setName(String.format(Constants.TASK_LOG_INFO_FORMAT, taskAppId));

        initTaskParameters();
        initDependParameters();
        waitTaskQuit();
        updateTaskState();
    } catch (Exception e) {
        logger.error("dependent task run exception" , e);
    }
    return true;
}
waitTaskQuit()
/**
 * Waits until all dependent tasks have finished or this task reaches a finished state, polling
 * once per SLEEP_TIME_MILLIS while the server is running.
 *
 * @return always {@code true}
 */
private Boolean waitTaskQuit() {
logger.info("wait depend task : {} complete", this.taskInstance.getName());
if (taskInstance.getState().typeIsFinished()) {
logger.info("task {} already complete. task state:{}",
this.taskInstance.getName(),
this.taskInstance.getState());
return true;
}
while (Stopper.isRunning()) {
try{
if(this.processInstance == null){
logger.error("process instance not exists , master task exec thread exit");
return true;
}
// part of the code is omitted in this excerpt
// leave the loop once allDependentTaskFinish() reports true or the task itself is finished
if ( allDependentTaskFinish() || taskInstance.getState().typeIsFinished()){
break;
}
// update process task
taskInstance = processService.findTaskInstanceById(taskInstance.getId());
processInstance = processService.findProcessInstanceById(processInstance.getId());
Thread.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (Exception e) {
// NOTE(review): the loop continues after any exception, and the sleep above is skipped
// on that pass
logger.error("exception",e);
if (processInstance != null) {
logger.error("wait task quit failed, instance id:{}, task id:{}",
processInstance.getId(), taskInstance.getId());
}
}
}
return true;
}
TaskPriorityQueueConsumer
/**
 * Consumer loop: while the server is running, takes up to getMasterDispatchTaskNumber() tasks
 * per batch from the priority queue, dispatches each one, and puts the tasks whose dispatch
 * failed back on the queue. Exceptions are logged and the loop continues.
 */
@Override
public void run() {
List<TaskPriority> failedDispatchTasks = new ArrayList<>();
while (Stopper.isRunning()){
try {
// batch size per round (master.dispatch.task.num, 3 in the sample config)
int fetchTaskNum = masterConfig.getMasterDispatchTaskNumber();
failedDispatchTasks.clear();
for(int i = 0; i < fetchTaskNum; i++){
// empty queue: sleep, then move on to the next slot of this batch
if(taskPriorityQueue.size() <= 0){
Thread.sleep(Constants.SLEEP_TIME_MILLIS);
continue;
}
// if not task , blocking here
// take the next task from the queue
TaskPriority taskPriority = taskPriorityQueue.take();
// dispatch it to a worker
boolean dispatchResult = dispatch(taskPriority);
if(!dispatchResult){
failedDispatchTasks.add(taskPriority);
}
}
if (!failedDispatchTasks.isEmpty()) {
// tasks that failed to dispatch go back on the queue to be dispatched again later
for (TaskPriority dispatchFailedTask : failedDispatchTasks) {
taskPriorityQueue.put(dispatchFailedTask);
}
// If there are tasks in a cycle that cannot find the worker group,
// sleep for 1 second
if (taskPriorityQueue.size() <= failedDispatchTasks.size()) {
TimeUnit.MILLISECONDS.sleep(Constants.SLEEP_TIME_MILLIS);
}
}
}catch (Exception e){
// NOTE(review): an exception here skips the requeue step above for this batch — confirm
// whether tasks already taken can be lost
logger.error("dispatcher task error",e);
}
}
}
/**
 * Dispatches one task to a worker.
 *
 * <p>The execution context is built first; a task that is already in a final state is then
 * skipped and reported as dispatched.
 *
 * @param taskPriority queue entry identifying the task to dispatch
 * @return {@code true} when the task was dispatched or is already in a final state;
 *         {@code false} when the dispatcher reported failure or threw an ExecuteException
 */
protected boolean dispatch(TaskPriority taskPriority) {
    boolean result = false;
    try {
        int taskInstanceId = taskPriority.getTaskId();
        TaskExecutionContext context = getTaskExecutionContext(taskInstanceId);
        ExecutionContext executionContext = new ExecutionContext(context.toCommand(), ExecutorType.WORKER, context.getWorkerGroup());
        if (taskInstanceIsFinalState(taskInstanceId)) {
            // when task finish, ignore this task, there is no need to dispatch anymore
            return true;
        }
        // hand the task to the dispatcher; per the original note the host selection supports
        // lower-weight-first, random and round-robin
        result = dispatcher.dispatch(executionContext);
    } catch (ExecuteException e) {
        // pass the throwable as the last argument so the stack trace is logged, not just the message
        logger.error("dispatch error: {}", e.getMessage(), e);
    }
    return result;
}
透過對 Apache DolphinScheduler 1.3.9 的原始碼分析,我們深入瞭解了其核心模組的設計和實現。DolphinScheduler 的 Master 架構充分保證了任務排程的高可用性和擴充套件性,而透過 Zookeeper 實現的叢集協調則為系統提供了強大的容錯機制。
如果你對 Apache DolphinScheduler 的原始碼有興趣,可以深入研究其任務排程策略的細節部分,或者根據自身業務場景進行二次開發,充分發揮 DolphinScheduler 的排程能力。
本文完!
本文由 白鯨開源 提供釋出支援!