序
本文主要研究一下PowerJob Server的高可用
PowerJobSpringWorker
tech/powerjob/worker/PowerJobSpringWorker.java
public class PowerJobSpringWorker implements ApplicationContextAware, InitializingBean, DisposableBean {
/**
* 組合優於繼承,持有 PowerJobWorker,內部重新設定 ProcessorFactory 更優雅
*/
private PowerJobWorker powerJobWorker;
private final PowerJobWorkerConfig config;
public PowerJobSpringWorker(PowerJobWorkerConfig config) {
this.config = config;
}
@Override
public void afterPropertiesSet() throws Exception {
powerJobWorker = new PowerJobWorker(config);
powerJobWorker.init();
}
@Override
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
BuiltInSpringProcessorFactory springProcessorFactory = new BuiltInSpringProcessorFactory(applicationContext);
BuildInSpringMethodProcessorFactory springMethodProcessorFactory = new BuildInSpringMethodProcessorFactory(applicationContext);
// append BuiltInSpringProcessorFactory
List<ProcessorFactory> processorFactories = Lists.newArrayList(
Optional.ofNullable(config.getProcessorFactoryList())
.orElse(Collections.emptyList()));
processorFactories.add(springProcessorFactory);
processorFactories.add(springMethodProcessorFactory);
config.setProcessorFactoryList(processorFactories);
}
@Override
public void destroy() throws Exception {
powerJobWorker.destroy();
}
}
PowerJobSpringWorker實現了InitializingBean介面,其afterPropertiesSet會建立powerJobWorker,然後執行其init方法
PowerJobWorker.init
tech/powerjob/worker/PowerJobWorker.java
public void init() throws Exception {
if (!initialized.compareAndSet(false, true)) {
log.warn("[PowerJobWorker] please do not repeat the initialization");
return;
}
Stopwatch stopwatch = Stopwatch.createStarted();
log.info("[PowerJobWorker] start to initialize PowerJobWorker...");
PowerJobWorkerConfig config = workerRuntime.getWorkerConfig();
CommonUtils.requireNonNull(config, "can't find PowerJobWorkerConfig, please set PowerJobWorkerConfig first");
ServerDiscoveryService serverDiscoveryService = new PowerJobServerDiscoveryService(config);
workerRuntime.setServerDiscoveryService(serverDiscoveryService);
try {
PowerBannerPrinter.print();
// 校驗 appName
WorkerAppInfo appInfo = serverDiscoveryService.assertApp();
workerRuntime.setAppInfo(appInfo);
// 初始化網路資料,區別對待上報地址和本機繫結地址(對外統一使用上報地址)
String localBindIp = NetUtils.getLocalHost();
int localBindPort = config.getPort();
String externalIp = PropertyUtils.readProperty(PowerJobDKey.NT_EXTERNAL_ADDRESS, localBindIp);
String externalPort = PropertyUtils.readProperty(PowerJobDKey.NT_EXTERNAL_PORT, String.valueOf(localBindPort));
log.info("[PowerJobWorker] [ADDRESS_INFO] localBindIp: {}, localBindPort: {}; externalIp: {}, externalPort: {}", localBindIp, localBindPort, externalIp, externalPort);
workerRuntime.setWorkerAddress(Address.toFullAddress(externalIp, Integer.parseInt(externalPort)));
// 初始化 執行緒池
final ExecutorManager executorManager = new ExecutorManager(workerRuntime.getWorkerConfig());
workerRuntime.setExecutorManager(executorManager);
// 初始化 ProcessorLoader
ProcessorLoader processorLoader = buildProcessorLoader(workerRuntime);
workerRuntime.setProcessorLoader(processorLoader);
// 初始化 actor
TaskTrackerActor taskTrackerActor = new TaskTrackerActor(workerRuntime);
ProcessorTrackerActor processorTrackerActor = new ProcessorTrackerActor(workerRuntime);
WorkerActor workerActor = new WorkerActor(workerRuntime, taskTrackerActor);
// 初始化通訊引擎
EngineConfig engineConfig = new EngineConfig()
.setType(config.getProtocol().name())
.setServerType(ServerType.WORKER)
.setBindAddress(new Address().setHost(localBindIp).setPort(localBindPort))
.setActorList(Lists.newArrayList(taskTrackerActor, processorTrackerActor, workerActor));
EngineOutput engineOutput = remoteEngine.start(engineConfig);
workerRuntime.setTransporter(engineOutput.getTransporter());
// 連線 server
serverDiscoveryService.timingCheck(workerRuntime.getExecutorManager().getCoreExecutor());
log.info("[PowerJobWorker] PowerJobRemoteEngine initialized successfully.");
// 初始化日誌系統
OmsLogHandler omsLogHandler = new OmsLogHandler(workerRuntime.getWorkerAddress(), workerRuntime.getTransporter(), serverDiscoveryService);
workerRuntime.setOmsLogHandler(omsLogHandler);
// 初始化儲存
TaskPersistenceService taskPersistenceService = new TaskPersistenceService(workerRuntime.getWorkerConfig().getStoreStrategy());
taskPersistenceService.init();
workerRuntime.setTaskPersistenceService(taskPersistenceService);
log.info("[PowerJobWorker] local storage initialized successfully.");
// 初始化定時任務
workerRuntime.getExecutorManager().getCoreExecutor().scheduleAtFixedRate(new WorkerHealthReporter(workerRuntime), 0, config.getHealthReportInterval(), TimeUnit.SECONDS);
workerRuntime.getExecutorManager().getCoreExecutor().scheduleWithFixedDelay(omsLogHandler.logSubmitter, 0, 5, TimeUnit.SECONDS);
log.info("[PowerJobWorker] PowerJobWorker initialized successfully, using time: {}, congratulations!", stopwatch);
}catch (Exception e) {
log.error("[PowerJobWorker] initialize PowerJobWorker failed, using {}.", stopwatch, e);
throw e;
}
}
PowerJobWorker的init方法會執行serverDiscoveryService.timingCheck(workerRuntime.getExecutorManager().getCoreExecutor())排程timingCheck
timingCheck
tech/powerjob/worker/background/discovery/PowerJobServerDiscoveryService.java
public void timingCheck(ScheduledExecutorService timingPool) {
this.currentServerAddress = discovery();
if (StringUtils.isEmpty(this.currentServerAddress) && !config.isAllowLazyConnectServer()) {
throw new PowerJobException("can't find any available server, this worker has been quarantined.");
}
// 這裡必須保證成功
timingPool.scheduleAtFixedRate(() -> {
try {
this.currentServerAddress = discovery();
} catch (Exception e) {
log.error("[PowerDiscovery] fail to discovery server!", e);
}
}
, 10, 10, TimeUnit.SECONDS);
}
PowerJobServerDiscoveryService的timingCheck會使用timingPool定時每隔10s排程執行discovery()來更新當前worker的server地址
discovery
private String discovery() {
// 只有允許延遲載入模式下,appId 才可能為空。每次服務發現前,都重新嘗試獲取 appInfo。由於是懶載入鏈路,此處完全忽略異常
if (appInfo.getAppId() == null || appInfo.getAppId() < 0) {
try {
assertApp0();
} catch (Exception e) {
log.warn("[PowerDiscovery] assertAppName in discovery stage failed, msg: {}", e.getMessage());
return null;
}
}
if (ip2Address.isEmpty()) {
config.getServerAddress().forEach(x -> ip2Address.put(x.split(":")[0], x));
}
String result = null;
// 先對當前機器發起請求
String currentServer = currentServerAddress;
if (!StringUtils.isEmpty(currentServer)) {
String ip = currentServer.split(":")[0];
// 直接請求當前Server的HTTP服務,可以少一次網路開銷,減輕Server負擔
String firstServerAddress = ip2Address.get(ip);
if (firstServerAddress != null) {
result = acquire(firstServerAddress);
}
}
for (String httpServerAddress : config.getServerAddress()) {
if (StringUtils.isEmpty(result)) {
result = acquire(httpServerAddress);
}else {
break;
}
}
if (StringUtils.isEmpty(result)) {
log.warn("[PowerDiscovery] can't find any available server, this worker has been quarantined.");
// 在 Server 高可用的前提下,連續失敗多次,說明該節點與外界失聯,Server已經將秒級任務轉移到其他Worker,需要殺死本地的任務
if (FAILED_COUNT++ > MAX_FAILED_COUNT) {
log.warn("[PowerDiscovery] can't find any available server for 3 consecutive times, It's time to kill all frequent job in this worker.");
List<Long> frequentInstanceIds = HeavyTaskTrackerManager.getAllFrequentTaskTrackerKeys();
if (!CollectionUtils.isEmpty(frequentInstanceIds)) {
frequentInstanceIds.forEach(instanceId -> {
HeavyTaskTracker taskTracker = HeavyTaskTrackerManager.removeTaskTracker(instanceId);
taskTracker.destroy();
log.warn("[PowerDiscovery] kill frequent instance(instanceId={}) due to can't find any available server.", instanceId);
});
}
FAILED_COUNT = 0;
}
return null;
} else {
// 重置失敗次數
FAILED_COUNT = 0;
log.debug("[PowerDiscovery] current server is {}.", result);
return result;
}
}
discovery方法就是定時遍歷配置的serverAddress地址列表,呼叫server端的acquire方法來獲取可用的server
acquireServer
tech/powerjob/server/web/controller/ServerController.java
@GetMapping("/acquire")
public ResultDTO<String> acquireServer(ServerDiscoveryRequest request) {
return ResultDTO.success(serverElectionService.elect(request));
}
ServerController提供了acquire介面,它執行的是serverElectionService.elect(request)
elect
tech/powerjob/server/remote/server/election/ServerElectionService.java
public String elect(ServerDiscoveryRequest request) {
if (!accurate()) {
final String currentServer = request.getCurrentServer();
// 如果是本機,就不需要查資料庫那麼複雜的操作了,直接返回成功
Optional<ProtocolInfo> localProtocolInfoOpt = Optional.ofNullable(transportService.allProtocols().get(request.getProtocol()));
if (localProtocolInfoOpt.isPresent()) {
if (localProtocolInfoOpt.get().getExternalAddress().equals(currentServer) || localProtocolInfoOpt.get().getAddress().equals(currentServer)) {
log.info("[ServerElection] this server[{}] is worker[appId={}]'s current server, skip check", currentServer, request.getAppId());
return currentServer;
}
}
}
return getServer0(request);
}
ServerElectionService的elect方法主要是執行getServer0
getServer0
private String getServer0(ServerDiscoveryRequest discoveryRequest) {
final Long appId = discoveryRequest.getAppId();
final String protocol = discoveryRequest.getProtocol();
Set<String> downServerCache = Sets.newHashSet();
for (int i = 0; i < RETRY_TIMES; i++) {
// 無鎖獲取當前資料庫中的Server
Optional<AppInfoDO> appInfoOpt = appInfoRepository.findById(appId);
if (!appInfoOpt.isPresent()) {
throw new PowerJobException(appId + " is not registered!");
}
String appName = appInfoOpt.get().getAppName();
String originServer = appInfoOpt.get().getCurrentServer();
String activeAddress = activeAddress(originServer, downServerCache, protocol);
if (StringUtils.isNotEmpty(activeAddress)) {
return activeAddress;
}
// 無可用Server,重新進行Server選舉,需要加鎖
String lockName = String.format(SERVER_ELECT_LOCK, appId);
boolean lockStatus = lockService.tryLock(lockName, 30000);
if (!lockStatus) {
try {
Thread.sleep(500);
}catch (Exception ignore) {
}
continue;
}
try {
// 可能上一臺機器已經完成了Server選舉,需要再次判斷
AppInfoDO appInfo = appInfoRepository.findById(appId).orElseThrow(() -> new RuntimeException("impossible, unless we just lost our database."));
String address = activeAddress(appInfo.getCurrentServer(), downServerCache, protocol);
if (StringUtils.isNotEmpty(address)) {
return address;
}
// 篡位,如果本機存在協議,則作為Server排程該 worker
final ProtocolInfo targetProtocolInfo = transportService.allProtocols().get(protocol);
if (targetProtocolInfo != null) {
// 注意,寫入 AppInfoDO#currentServer 的永遠是 default 的繫結地址,僅在返回的時候特殊處理為協議地址
appInfo.setCurrentServer(transportService.defaultProtocol().getAddress());
appInfo.setGmtModified(new Date());
appInfoRepository.saveAndFlush(appInfo);
log.info("[ServerElection] this server({}) become the new server for app(appId={}).", appInfo.getCurrentServer(), appId);
return targetProtocolInfo.getExternalAddress();
}
}catch (Exception e) {
log.error("[ServerElection] write new server to db failed for app {}.", appName, e);
} finally {
lockService.unlock(lockName);
}
}
throw new PowerJobException("server elect failed for app " + appId);
}
getServer0方法會重試10次,它先針對discoveryRequest指定的currentServer進行activeAddress,成功則返回,沒有可用server則加鎖進行重新分配,這裡優先本機判斷
activeAddress
private String activeAddress(String serverAddress, Set<String> downServerCache, String protocol) {
if (downServerCache.contains(serverAddress)) {
return null;
}
if (StringUtils.isEmpty(serverAddress)) {
return null;
}
Ping ping = new Ping();
ping.setCurrentTime(System.currentTimeMillis());
URL targetUrl = ServerURLFactory.ping2Friend(serverAddress);
try {
AskResponse response = transportService.ask(Protocol.HTTP.name(), targetUrl, ping, AskResponse.class)
.toCompletableFuture()
.get(PING_TIMEOUT_MS, TimeUnit.MILLISECONDS);
if (response.isSuccess()) {
// 檢測透過的是遠端 server 的暴露地址,需要返回 worker 需要的協議地址
final JSONObject protocolInfo = JsonUtils.parseObject(response.getData(), JSONObject.class).getJSONObject(protocol);
if (protocolInfo != null) {
downServerCache.remove(serverAddress);
ProtocolInfo remoteProtocol = protocolInfo.toJavaObject(ProtocolInfo.class);
log.info("[ServerElection] server[{}] is active, it will be the master, final protocol={}", serverAddress, remoteProtocol);
// 4.3.3 升級 4.3.4 過程中,未升級的 server 還不存在 externalAddress,需要使用 address 相容
return Optional.ofNullable(remoteProtocol.getExternalAddress()).orElse(remoteProtocol.getAddress());
} else {
log.warn("[ServerElection] server[{}] is active but don't have target protocol", serverAddress);
}
}
} catch (TimeoutException te) {
log.warn("[ServerElection] server[{}] was down due to ping timeout!", serverAddress);
} catch (Exception e) {
log.warn("[ServerElection] server[{}] was down with unknown case!", serverAddress, e);
}
downServerCache.add(serverAddress);
return null;
}
activeAddress方法主要是對目標server發起ping請求,超時時間為1s,若目標server掛了,則丟擲TimeoutException,將目標server加入到downServerCache中;若目標server響應成功,則從downServerCache中移除
小結
PowerJob的worker在初始化的時候會啟動一個定時任務,每隔10s排程執行discovery()來更新當前worker的server地址;discovery方法就是定時遍歷配置的serverAddress地址列表,呼叫server端的acquire方法來獲取可用的server;ServerController提供了acquire介面,它執行的是serverElectionService.elect(request),ServerElectionService的elect方法主要是執行getServer0,getServer0方法會重試10次,它先針對discoveryRequest指定的currentServer進行activeAddress,成功則返回,沒有可用server則加鎖進行重新分配,這裡優先本機判斷。activeAddress方法主要是對目標server發起ping請求,超時時間為1s,若目標server掛了,則丟擲TimeoutException,將目標server加入到downServerCache中;若目標server響應成功,則從downServerCache中移除。
worker定時任務 --> 輪詢serverAddress請求acquire --> server端判斷目標server的ping是否成功,不成功則加鎖優先使用本機作為替代server。