fix: split brain #102

This commit is contained in:
KFCFans 2020-11-21 13:32:43 +08:00
parent 4be5b01ba1
commit 5a50d933a8
5 changed files with 39 additions and 23 deletions

View File

@ -12,6 +12,7 @@ import com.github.kfcfans.powerjob.server.service.lock.LockService;
import com.google.common.collect.Sets;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import javax.annotation.Resource;
@ -20,6 +21,7 @@ import java.util.Date;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletionStage;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
/**
@ -37,17 +39,25 @@ public class ServerSelectService {
@Resource
private AppInfoRepository appInfoRepository;
@Value("${oms.accurate.select.server.percentage}")
private int accurateSelectServerPercentage;
private static final int RETRY_TIMES = 10;
private static final long PING_TIMEOUT_MS = 1000;
private static final String SERVER_ELECT_LOCK = "server_elect_%d";
/**
* 获取某个应用对应的Server
*
* @param appId 应用ID
* @return 当前可用的Server
*/
public String getServer(Long appId) {
public String getServer(Long appId, String currentServer) {
if (!accurate()) {
// 如果是本机就不需要查数据库那么复杂的操作了直接返回成功
if (OhMyServer.getActorSystemAddress().equals(currentServer)) {
return currentServer;
}
}
return getServer0(appId);
}
private String getServer0(Long appId) {
Set<String> downServerCache = Sets.newHashSet();
@ -95,7 +105,7 @@ public class ServerSelectService {
lockService.unlock(lockName);
}
}
throw new RuntimeException("server elect failed for app " + appId);
throw new PowerJobException("server elect failed for app " + appId);
}
/**
@ -113,6 +123,10 @@ public class ServerSelectService {
return false;
}
if (OhMyServer.getActorSystemAddress().equals(serverAddress)) {
return true;
}
Ping ping = new Ping();
ping.setCurrentTime(System.currentTimeMillis());
@ -128,4 +142,8 @@ public class ServerSelectService {
downServerCache.add(serverAddress);
return false;
}
private boolean accurate() {
return ThreadLocalRandom.current().nextInt(100) < accurateSelectServerPercentage;
}
}

View File

@ -2,26 +2,21 @@ package com.github.kfcfans.powerjob.server.web.controller;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.github.kfcfans.powerjob.common.PowerJobException;
import com.github.kfcfans.powerjob.common.response.ResultDTO;
import com.github.kfcfans.powerjob.common.utils.CommonUtils;
import com.github.kfcfans.powerjob.common.utils.NetUtils;
import com.github.kfcfans.powerjob.server.akka.OhMyServer;
import com.github.kfcfans.powerjob.server.persistence.core.model.AppInfoDO;
import com.github.kfcfans.powerjob.server.persistence.core.model.JobInfoDO;
import com.github.kfcfans.powerjob.server.persistence.core.repository.AppInfoRepository;
import com.github.kfcfans.powerjob.server.persistence.core.repository.JobInfoRepository;
import com.github.kfcfans.powerjob.server.service.ha.ClusterStatusHolder;
import com.github.kfcfans.powerjob.server.service.ha.ServerSelectService;
import com.github.kfcfans.powerjob.server.service.ha.WorkerManagerService;
import com.taobao.api.internal.cluster.ClusterManager;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import javax.annotation.Resource;
import java.util.List;
import java.util.Optional;
import java.util.TimeZone;
@ -52,13 +47,7 @@ public class ServerController {
@GetMapping("/acquire")
public ResultDTO<String> acquireServer(Long appId, String currentServer) {
// 如果是本机就不需要查数据库那么复杂的操作了直接返回成功
if (OhMyServer.getActorSystemAddress().equals(currentServer)) {
return ResultDTO.success(currentServer);
}
String server = serverSelectService.getServer(appId);
return ResultDTO.success(server);
return ResultDTO.success(serverSelectService.getServer(appId, currentServer));
}
@GetMapping("/hello")

View File

@ -33,3 +33,6 @@ oms.container.retention.remote=-1
####### 缓存配置 #######
oms.instance.metadata.cache.size=1024
####### 精确获取 server 的百分比0100100代表每次 worker 获取 server 都会进行完整的探活流程,不存在脑裂问题,但有性能开销 #######
oms.accurate.select.server.percentage = 50

View File

@ -33,3 +33,6 @@ oms.container.retention.remote=-1
####### 缓存配置 #######
oms.instance.metadata.cache.size=1024
####### 精确获取 server 的百分比0100100代表每次 worker 获取 server 都会进行完整的探活流程,不存在脑裂问题,但有性能开销 #######
oms.accurate.select.server.percentage = 50

View File

@ -33,3 +33,6 @@ oms.container.retention.remote=-1
####### 缓存配置 #######
oms.instance.metadata.cache.size=2048
####### 精确获取 server 的百分比0100100代表每次 worker 获取 server 都会进行完整的探活流程,不存在脑裂问题,但有性能开销 #######
oms.accurate.select.server.percentage = 50