序
本文主要研究一下PowerJob的InstanceStatusCheckService
InstanceStatus
tech/powerjob/common/enums/InstanceStatus.java
/**
 * Status of a job instance. Every constant carries a numeric code ({@code v})
 * and a human-readable description ({@code des}).
 */
@Getter
@AllArgsConstructor
public enum InstanceStatus {

    /** Waiting to be dispatched. */
    WAITING_DISPATCH(1, "等待派發"),
    /** Dispatched, waiting for a worker to acknowledge receipt. */
    WAITING_WORKER_RECEIVE(2, "等待Worker接收"),
    /** Currently running. */
    RUNNING(3, "執行中"),
    /** Finished with failure. */
    FAILED(4, "失敗"),
    /** Finished successfully. */
    SUCCEED(5, "成功"),
    /** Canceled. */
    CANCELED(9, "取消"),
    /** Stopped manually. */
    STOPPED(10, "手動停止");

    /** Numeric code of the status. */
    private final int v;
    /** Description of the status. */
    private final String des;

    /**
     * Codes regarded as "running" in the broad sense:
     * waiting for dispatch, waiting for worker receipt, and running.
     */
    public static final List<Integer> GENERALIZED_RUNNING_STATUS =
            Lists.newArrayList(WAITING_DISPATCH.v, WAITING_WORKER_RECEIVE.v, RUNNING.v);

    /**
     * Codes of the terminal states: failed, succeeded, canceled and stopped.
     */
    public static final List<Integer> FINISHED_STATUS =
            Lists.newArrayList(FAILED.v, SUCCEED.v, CANCELED.v, STOPPED.v);

    /**
     * Resolves a numeric code to its status constant.
     *
     * @param v numeric status code
     * @return the constant whose code equals {@code v}
     * @throws IllegalArgumentException if no constant has that code
     */
    public static InstanceStatus of(int v) {
        final InstanceStatus[] candidates = values();
        for (int i = 0; i < candidates.length; i++) {
            if (candidates[i].v == v) {
                return candidates[i];
            }
        }
        throw new IllegalArgumentException("InstanceStatus has no item for value " + v);
    }
}
InstanceStatus定義了任務例項的狀態,廣義執行中的狀態為WAITING_DISPATCH、WAITING_WORKER_RECEIVE、RUNNING;終態為FAILED、SUCCEED、CANCELED、STOPPED
InstanceStatusCheckService
tech/powerjob/server/core/scheduler/InstanceStatusCheckService.java
/**
 * Excerpt of the service that checks job instances and workflow instances which
 * have stayed in a non-terminal state for too long. Only the constants and the
 * collaborators are shown here; the check methods appear separately in this file.
 * All final fields are supplied through the constructor generated by
 * {@code @RequiredArgsConstructor}, so their declaration order is significant.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class InstanceStatusCheckService {
// Number of app ids handled per partition when the check methods split the app id list.
private static final int MAX_BATCH_NUM_APP = 10;
// Page size used when querying instances in a given status.
private static final int MAX_BATCH_NUM_INSTANCE = 3000;
// Number of instances passed to a single batch redispatch call.
private static final int MAX_BATCH_UPDATE_NUM = 500;
// WAITING_DISPATCH instances whose expected trigger time is older than now minus this value (ms) are picked up.
private static final long DISPATCH_TIMEOUT_MS = 30000;
// WAITING_WORKER_RECEIVE instances whose actual trigger time is older than now minus this value (ms) are picked up.
private static final long RECEIVE_TIMEOUT_MS = 60000;
// RUNNING instances whose gmtModified is older than now minus this value (ms) are picked up.
private static final long RUNNING_TIMEOUT_MS = 60000;
// WAITING workflow instances whose expected trigger time is older than now minus this value (ms) are picked up.
private static final long WORKFLOW_WAITING_TIMEOUT_MS = 60000;
// NOTE(review): presumably the interval in ms between two check rounds; its usage is not visible in this excerpt.
public static final long CHECK_INTERVAL = 10000;
// Provides the default protocol address used to look up the apps owned by the current server.
private final TransportService transportService;
// Dispatches and redispatches job instances.
private final DispatchService dispatchService;
// Not referenced by the methods shown in this file.
private final InstanceManager instanceManager;
// Restarts workflow instances that are stuck in WAITING.
private final WorkflowInstanceManager workflowInstanceManager;
// Lists the app ids assigned to the current server.
private final AppInfoRepository appInfoRepository;
// Loads job definitions by id.
private final JobInfoRepository jobInfoRepository;
// Queries job instances by app, status and time.
private final InstanceInfoRepository instanceInfoRepository;
// Loads workflow definitions by id.
private final WorkflowInfoRepository workflowInfoRepository;
// Queries workflow instances by app, status and expected trigger time.
private final WorkflowInstanceInfoRepository workflowInstanceInfoRepository;
// ... (remaining members omitted in this excerpt)
}
InstanceStatusCheckService提供了checkRunningInstance、checkWaitingDispatchInstance、checkWaitingWorkerReceiveInstance、checkWorkflowInstance方法
checkRunningInstance
/**
 * Checks RUNNING instances of every app owned by the current server.
 * The app ids are processed in partitions of {@code MAX_BATCH_NUM_APP};
 * any exception is logged and aborts the remaining partitions of this round.
 */
public void checkRunningInstance() {
    final Stopwatch timer = Stopwatch.createStarted();
    // Ask the DB which apps the current server is responsible for.
    final List<Long> ownedAppIds = appInfoRepository.listAppIdByCurrentServer(transportService.defaultProtocol().getAddress());
    if (CollectionUtils.isEmpty(ownedAppIds)) {
        log.info("[InstanceStatusChecker] current server has no app's job to check");
        return;
    }
    try {
        // RUNNING instances without a status report for a while are handled per app batch.
        for (List<Long> appIdBatch : Lists.partition(ownedAppIds, MAX_BATCH_NUM_APP)) {
            handleRunningInstance(appIdBatch);
        }
    } catch (Exception e) {
        log.error("[InstanceStatusChecker] RunningInstance status check failed.", e);
    }
    log.info("[InstanceStatusChecker] RunningInstance status check using {}.", timer.stop());
}
/**
 * Handles RUNNING instances of the given apps whose {@code gmtModified} is older
 * than {@code RUNNING_TIMEOUT_MS}. Pages of at most {@code MAX_BATCH_NUM_INSTANCE}
 * instances are fetched repeatedly until the query returns nothing.
 * <p>
 * An instance is redispatched only when its job exists, the job is enabled, the
 * job is not of a frequent time-expression type, and the instance has run fewer
 * times than the job's retry number. In every other case it is marked as failed
 * with {@code SystemInstanceResult.REPORT_TIMEOUT}.
 *
 * @param partAppIds app ids to check
 */
private void handleRunningInstance(List<Long> partAppIds) {
    // NOTE(review): the loop ends only when the query comes back empty, so it relies on
    // redispatch / failure updates moving instances out of the query's result set — confirm.
    while (true) {
        final Date staleBefore = new Date(System.currentTimeMillis() - RUNNING_TIMEOUT_MS);
        final List<BriefInstanceInfo> staleInstances = instanceInfoRepository.selectBriefInfoByAppIdInAndStatusAndGmtModifiedBefore(
                partAppIds, InstanceStatus.RUNNING.getV(), staleBefore, PageRequest.of(0, MAX_BATCH_NUM_INSTANCE));
        if (staleInstances.isEmpty()) {
            return;
        }

        // Load all referenced jobs in one query and index them by id.
        final Set<Long> referencedJobIds = staleInstances.stream()
                .map(BriefInstanceInfo::getJobId)
                .collect(Collectors.toSet());
        final Map<Long, JobInfoDO> jobsById = jobInfoRepository.findByIdIn(referencedJobIds).stream()
                .collect(Collectors.toMap(JobInfoDO::getId, job -> job));
        log.warn("[InstanceStatusCheckService] find some instances have not received status report for a long time : {}",
                staleInstances.stream().map(BriefInstanceInfo::getInstanceId).collect(Collectors.toList()));

        for (BriefInstanceInfo stale : staleInstances) {
            final JobInfoDO job = jobsById.get(stale.getJobId());
            boolean retry = false;
            if (job != null) {
                final TimeExpressionType expressionType = TimeExpressionType.of(job.getTimeExpressionType());
                final SwitchableStatus jobSwitch = SwitchableStatus.of(job.getStatus());
                // Disabled jobs and frequent jobs are never retried here; the rest
                // (e.g. CRON and API) retry while the retry budget is not exhausted.
                retry = jobSwitch == SwitchableStatus.ENABLE
                        && !TimeExpressionType.FREQUENT_TYPES.contains(expressionType.getV())
                        && stale.getRunningTimes() < job.getInstanceRetryNum();
            }
            if (retry) {
                dispatchService.redispatchAsync(stale.getInstanceId(), InstanceStatus.RUNNING.getV());
            } else {
                instanceInfoRepository.findById(stale.getId())
                        .ifPresent(timedOut -> updateFailedInstance(timedOut, SystemInstanceResult.REPORT_TIMEOUT));
            }
        }
    }
}
checkRunningInstance查詢該server負責的appId,然後按批次(每批最多MAX_BATCH_NUM_APP個appId)執行handleRunningInstance;handleRunningInstance查詢一定時間沒收到TaskTracker狀態報告的任務例項,若找不到對應的任務資訊、任務已經關閉或是秒級任務,則不進行重試,直接更新為失敗;其他的則判斷執行次數是否小於重試次數,是則透過dispatchService.redispatchAsync重試,否則更新為失敗
checkWaitingDispatchInstance
/**
 * Checks WAITING_DISPATCH instances of every app owned by the current server.
 * The app ids are processed in partitions of {@code MAX_BATCH_NUM_APP};
 * any exception is logged and aborts the remaining partitions of this round.
 */
public void checkWaitingDispatchInstance() {
    final Stopwatch timer = Stopwatch.createStarted();
    // Ask the DB which apps the current server is responsible for.
    final List<Long> ownedAppIds = appInfoRepository.listAppIdByCurrentServer(transportService.defaultProtocol().getAddress());
    if (CollectionUtils.isEmpty(ownedAppIds)) {
        log.info("[InstanceStatusChecker] current server has no app's job to check");
        return;
    }
    try {
        // Instances still waiting for dispatch are handled per app batch.
        for (List<Long> appIdBatch : Lists.partition(ownedAppIds, MAX_BATCH_NUM_APP)) {
            handleWaitingDispatchInstance(appIdBatch);
        }
    } catch (Exception e) {
        log.error("[InstanceStatusChecker] WaitingDispatchInstance status check failed.", e);
    }
    log.info("[InstanceStatusChecker] WaitingDispatchInstance status check using {}.", timer.stop());
}
/**
 * Dispatches WAITING_DISPATCH instances of the given apps whose expected trigger
 * time is older than {@code DISPATCH_TIMEOUT_MS}. Pages of at most
 * {@code MAX_BATCH_NUM_INSTANCE} instances are fetched repeatedly until the query
 * returns nothing or every app of this batch has been flagged as overloaded.
 * <p>
 * Instances whose job can no longer be found are marked as failed with
 * {@code SystemInstanceResult.CAN_NOT_FIND_JOB_INFO}.
 *
 * @param partAppIds app ids to check; this list is never modified
 */
private void handleWaitingDispatchInstance(List<Long> partAppIds) {
    // Work on a private copy. The caller passes a sublist view created by Lists.partition(...);
    // removing overloaded apps from that view would write through to the shared backing list,
    // shifting the following partitions (so some apps get skipped) or throwing on an
    // unmodifiable list.
    final List<Long> pendingAppIds = new ArrayList<>(partAppIds);

    // 1. Look for instances stuck in WAITING_DISPATCH.
    long threshold = System.currentTimeMillis() - DISPATCH_TIMEOUT_MS;
    List<InstanceInfoDO> waitingDispatchInstances = instanceInfoRepository.findAllByAppIdInAndStatusAndExpectedTriggerTimeLessThan(pendingAppIds, InstanceStatus.WAITING_DISPATCH.getV(), threshold, PageRequest.of(0, MAX_BATCH_NUM_INSTANCE));
    while (!waitingDispatchInstances.isEmpty()) {
        List<Long> overloadAppIdList = new ArrayList<>();
        long startTime = System.currentTimeMillis();
        // Group by appId so that overload can be handled per app.
        Map<Long, List<InstanceInfoDO>> waitingDispatchInstancesMap = waitingDispatchInstances.stream().collect(Collectors.groupingBy(InstanceInfoDO::getAppId));
        for (Map.Entry<Long, List<InstanceInfoDO>> entry : waitingDispatchInstancesMap.entrySet()) {
            final Long currentAppId = entry.getKey();
            final List<InstanceInfoDO> currentAppWaitingDispatchInstances = entry.getValue();
            // Collect the job ids, then load the jobs in one query and index them by id.
            Set<Long> jobIds = currentAppWaitingDispatchInstances.stream().map(InstanceInfoDO::getJobId).collect(Collectors.toSet());
            Map<Long, JobInfoDO> jobInfoMap = jobInfoRepository.findByIdIn(jobIds).stream().collect(Collectors.toMap(JobInfoDO::getId, e -> e));
            log.warn("[InstanceStatusChecker] find some instance in app({}) which is not triggered as expected: {}", currentAppId, currentAppWaitingDispatchInstances.stream().map(InstanceInfoDO::getInstanceId).collect(Collectors.toList()));
            // Shared with dispatchService.dispatch(...); presumably set to true by the dispatcher
            // when the app is overloaded — confirm against DispatchService.
            final Holder<Boolean> overloadFlag = new Holder<>(false);
            // Simple handling is acceptable: this is the only place that uses parallelStream.
            currentAppWaitingDispatchInstances.parallelStream().forEach(instance -> {
                if (overloadFlag.get()) {
                    // App already flagged as overloaded: skip the remaining instances.
                    return;
                }
                Optional<JobInfoDO> jobInfoOpt = Optional.ofNullable(jobInfoMap.get(instance.getJobId()));
                if (jobInfoOpt.isPresent()) {
                    // The loaded instance is handed over directly, so the status need not be reset again (fewer IO round trips).
                    dispatchService.dispatch(jobInfoOpt.get(), instance.getInstanceId(), Optional.of(instance), Optional.of(overloadFlag));
                } else {
                    log.warn("[InstanceStatusChecker] can't find job by jobId[{}], so redispatch failed, failed instance: {}", instance.getJobId(), instance);
                    final Optional<InstanceInfoDO> opt = instanceInfoRepository.findById(instance.getId());
                    opt.ifPresent(instanceInfoDO -> updateFailedInstance(instanceInfoDO, SystemInstanceResult.CAN_NOT_FIND_JOB_INFO));
                }
            });
            if (overloadFlag.get()) {
                overloadAppIdList.add(currentAppId);
            }
        }
        log.info("[InstanceStatusChecker] process {} task,use {} ms", waitingDispatchInstances.size(), System.currentTimeMillis() - startTime);
        if (!overloadAppIdList.isEmpty()) {
            log.warn("[InstanceStatusChecker] app[{}] is overload, so skip check waiting dispatch instance", overloadAppIdList);
            pendingAppIds.removeAll(overloadAppIdList);
        }
        if (pendingAppIds.isEmpty()) {
            break;
        }
        // Refresh the threshold right before querying again, as the sibling handlers do.
        threshold = System.currentTimeMillis() - DISPATCH_TIMEOUT_MS;
        waitingDispatchInstances = instanceInfoRepository.findAllByAppIdInAndStatusAndExpectedTriggerTimeLessThan(pendingAppIds, InstanceStatus.WAITING_DISPATCH.getV(), threshold, PageRequest.of(0, MAX_BATCH_NUM_INSTANCE));
    }
}
checkWaitingDispatchInstance查詢待派發的任務例項,透過dispatchService.dispatch進行派發
checkWaitingWorkerReceiveInstance
/**
 * Checks WAITING_WORKER_RECEIVE instances of every app owned by the current server.
 * The app ids are processed in partitions of {@code MAX_BATCH_NUM_APP};
 * any exception is logged and aborts the remaining partitions of this round.
 */
public void checkWaitingWorkerReceiveInstance() {
    final Stopwatch timer = Stopwatch.createStarted();
    // Ask the DB which apps the current server is responsible for.
    final List<Long> ownedAppIds = appInfoRepository.listAppIdByCurrentServer(transportService.defaultProtocol().getAddress());
    if (CollectionUtils.isEmpty(ownedAppIds)) {
        log.info("[InstanceStatusChecker] current server has no app's job to check");
        return;
    }
    try {
        // Instances that no worker has acknowledged are handled per app batch.
        for (List<Long> appIdBatch : Lists.partition(ownedAppIds, MAX_BATCH_NUM_APP)) {
            handleWaitingWorkerReceiveInstance(appIdBatch);
        }
    } catch (Exception e) {
        log.error("[InstanceStatusChecker] WaitingWorkerReceiveInstance status check failed.", e);
    }
    log.info("[InstanceStatusChecker] WaitingWorkerReceiveInstance status check using {}.", timer.stop());
}
/**
 * Redispatches WAITING_WORKER_RECEIVE instances of the given apps whose actual
 * trigger time is older than {@code RECEIVE_TIMEOUT_MS}. Pages of at most
 * {@code MAX_BATCH_NUM_INSTANCE} instances are fetched repeatedly until the query
 * returns nothing; each page is redispatched in chunks of {@code MAX_BATCH_UPDATE_NUM}.
 *
 * @param partAppIds app ids to check
 */
private void handleWaitingWorkerReceiveInstance(List<Long> partAppIds) {
    while (true) {
        // Recompute the deadline for every query so each page uses the current time.
        final long triggeredBefore = System.currentTimeMillis() - RECEIVE_TIMEOUT_MS;
        final List<BriefInstanceInfo> unacknowledged = instanceInfoRepository.selectBriefInfoByAppIdInAndStatusAndActualTriggerTimeLessThan(
                partAppIds, InstanceStatus.WAITING_WORKER_RECEIVE.getV(), triggeredBefore, PageRequest.of(0, MAX_BATCH_NUM_INSTANCE));
        if (unacknowledged.isEmpty()) {
            return;
        }
        log.warn("[InstanceStatusChecker] find some instance didn't receive any reply from worker, try to redispatch: {}",
                unacknowledged.stream().map(BriefInstanceInfo::getInstanceId).collect(Collectors.toList()));
        for (List<BriefInstanceInfo> chunk : Lists.partition(unacknowledged, MAX_BATCH_UPDATE_NUM)) {
            dispatchService.redispatchBatchAsyncLockFree(
                    chunk.stream().map(BriefInstanceInfo::getInstanceId).collect(Collectors.toList()),
                    InstanceStatus.WAITING_WORKER_RECEIVE.getV());
        }
    }
}
checkWaitingWorkerReceiveInstance檢查等待worker接收但超時未收到回覆的例項,按批次(每批最多MAX_BATCH_UPDATE_NUM個)執行dispatchService.redispatchBatchAsyncLockFree重新派發
checkWorkflowInstance
/**
 * Checks workflow instances of every app owned by the current server.
 * Delegates to {@link #checkWorkflowInstance(List)}; any exception is logged
 * and ends this round.
 */
public void checkWorkflowInstance() {
    final Stopwatch timer = Stopwatch.createStarted();
    // Ask the DB which apps the current server is responsible for.
    final List<Long> ownedAppIds = appInfoRepository.listAppIdByCurrentServer(transportService.defaultProtocol().getAddress());
    if (CollectionUtils.isEmpty(ownedAppIds)) {
        log.info("[InstanceStatusChecker] current server has no app's job to check");
        return;
    }
    try {
        checkWorkflowInstance(ownedAppIds);
    } catch (Exception e) {
        log.error("[InstanceStatusChecker] WorkflowInstance status check failed.", e);
    }
    log.info("[InstanceStatusChecker] WorkflowInstance status check using {}.", timer.stop());
}
/**
 * Restarts workflow instances that have stayed in WAITING for longer than
 * {@code WORKFLOW_WAITING_TIMEOUT_MS} past their expected trigger time.
 * The deadline is computed once and shared by all app partitions. Instances
 * whose workflow definition cannot be found are silently skipped.
 *
 * @param allAppIds app ids to check, processed in partitions of {@code MAX_BATCH_NUM_APP}
 */
private void checkWorkflowInstance(List<Long> allAppIds) {
    final long triggerDeadline = System.currentTimeMillis() - WORKFLOW_WAITING_TIMEOUT_MS;
    for (List<Long> appIdBatch : Lists.partition(allAppIds, MAX_BATCH_NUM_APP)) {
        final List<WorkflowInstanceInfoDO> stalledInstances = workflowInstanceInfoRepository.findByAppIdInAndStatusAndExpectedTriggerTimeLessThan(
                appIdBatch, WorkflowInstanceStatus.WAITING.getV(), triggerDeadline);
        if (CollectionUtils.isEmpty(stalledInstances)) {
            continue;
        }
        final List<Long> stalledIds = stalledInstances.stream()
                .map(WorkflowInstanceInfoDO::getWfInstanceId)
                .collect(Collectors.toList());
        log.warn("[WorkflowInstanceChecker] wfInstance({}) is not started as expected, oms try to restart these workflowInstance.", stalledIds);
        for (WorkflowInstanceInfoDO stalled : stalledInstances) {
            final Optional<WorkflowInfoDO> definition = workflowInfoRepository.findById(stalled.getWorkflowId());
            if (!definition.isPresent()) {
                continue;
            }
            final WorkflowInfoDO workflowInfo = definition.get();
            workflowInstanceManager.start(workflowInfo, stalled.getWfInstanceId());
            log.info("[Workflow-{}|{}] restart workflowInstance successfully~", workflowInfo.getId(), stalled.getWfInstanceId());
        }
    }
}
checkWorkflowInstance定期檢測工作流例項的狀態,針對WAITING的挨個執行workflowInstanceManager.start
小結
InstanceStatusCheckService提供了checkRunningInstance、checkWaitingDispatchInstance、checkWaitingWorkerReceiveInstance、checkWorkflowInstance方法,他們分別用於檢查狀態是執行中但是上報超時的任務例項、等待派發但超時未派發的任務例項、等待worker接收處理的任務例項、等待排程的工作流例項。