fix: concurrency problem when processing workflow instances

Echo009 2021-03-08 16:14:08 +08:00
parent e6127f1dba
commit afff77b540
11 changed files with 179 additions and 142 deletions

View File: InstanceLogService.java

@@ -55,24 +55,36 @@ public class InstanceLogService {
private InstanceMetadataService instanceMetadataService;
@Resource
private GridFsManager gridFsManager;
// Bean for local database operations
/**
* Bean for local database operations
*/
@Resource(name = "localTransactionTemplate")
private TransactionTemplate localTransactionTemplate;
@Resource
private LocalInstanceLogRepository localInstanceLogRepository;
// Task instance IDs whose online logs are maintained locally
/**
* Task instance IDs whose online logs are maintained locally
*/
private final Map<Long, Long> instanceId2LastReportTime = Maps.newConcurrentMap();
private final ExecutorService workerPool;
// Segment lock
/**
* Segment lock
*/
private final SegmentLock segmentLock = new SegmentLock(8);
// Timestamp formatter
private static final FastDateFormat dateFormat = FastDateFormat.getInstance(OmsConstant.TIME_PATTERN_PLUS);
// Number of lines displayed per page
/**
* Timestamp formatter
*/
private static final FastDateFormat DATE_FORMAT = FastDateFormat.getInstance(OmsConstant.TIME_PATTERN_PLUS);
/**
* Number of lines displayed per page
*/
private static final int MAX_LINE_COUNT = 100;
// Expiration time
/**
* Expiration time
*/
private static final long EXPIRE_INTERVAL_MS = 60000;
public InstanceLogService() {
@@ -110,7 +122,7 @@ public class InstanceLogService {
* @param index page number, starting from 0
* @return text string
*/
@DesignateServer(appIdParameterName = "appId")
@DesignateServer
public StringPage fetchInstanceLog(Long appId, Long instanceId, Long index) {
try {
Future<File> fileFuture = prepareLogFile(instanceId);
@@ -154,7 +166,7 @@ public class InstanceLogService {
* @param instanceId task instance ID
* @return download URL
*/
@DesignateServer(appIdParameterName = "appId")
@DesignateServer
public String fetchDownloadUrl(Long appId, Long instanceId) {
String url = "http://" + NetUtils.getLocalHost() + ":" + port + "/instance/downloadLog?instanceId=" + instanceId;
log.info("[InstanceLog-{}] downloadURL for appId[{}]: {}", instanceId, appId, url);
@@ -326,7 +338,7 @@ public class InstanceLogService {
*/
private static String convertLog(LocalInstanceLogDO instanceLog) {
return String.format("%s [%s] %s %s",
dateFormat.format(instanceLog.getLogTime()),
DATE_FORMAT.format(instanceLog.getLogTime()),
instanceLog.getWorkerAddress(),
LogLevel.genLogLevelString(instanceLog.getLogLevel()),
instanceLog.getLogContent());

View File: InstanceService.java

@@ -104,7 +104,7 @@ public class InstanceService {
*
* @param instanceId task instance ID
*/
@DesignateServer(appIdParameterName = "appId")
@DesignateServer
public void stopInstance(Long appId, Long instanceId) {
log.info("[Instance-{}] try to stop the instance in appId: {}", instanceId, appId);
@@ -152,7 +152,7 @@ public class InstanceService {
*
* @param instanceId task instance ID
*/
@DesignateServer(appIdParameterName = "appId")
@DesignateServer
public void retryInstance(Long appId, Long instanceId) {
log.info("[Instance-{}] retry instance in appId: {}", instanceId, appId);
@@ -186,6 +186,7 @@ public class InstanceService {
*
* @param instanceId task instance ID
*/
@DesignateServer
public void cancelInstance(Long instanceId) {
log.info("[Instance-{}] try to cancel the instance.", instanceId);

View File: UseSegmentLockAspect.java

@@ -1,6 +1,7 @@
package tech.powerjob.server.core.lock;
import com.github.kfcfans.powerjob.common.utils.SegmentLock;
import org.springframework.core.annotation.Order;
import tech.powerjob.server.common.utils.AOPUtils;
import com.google.common.collect.Maps;
import lombok.extern.slf4j.Slf4j;
@@ -20,6 +21,7 @@ import java.util.Map;
@Slf4j
@Aspect
@Component
@Order(1)
public class UseSegmentLockAspect {
private final Map<String, SegmentLock> lockStore = Maps.newConcurrentMap();
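For orientation, a minimal sketch of the advice such an aspect typically wraps around @UseSegmentLock methods is shown below; the helper signatures (AOPUtils.parseSpEl, SegmentLock.lockInterruptibleSafe) are assumptions inferred from the imports above, not the verbatim implementation:

@Around(value = "@annotation(useSegmentLock)")
public Object execute(ProceedingJoinPoint point, UseSegmentLock useSegmentLock) throws Throwable {
    // One SegmentLock per logical lock type, so every caller that declares
    // type = "processWfInstance" contends on the same lock instance.
    SegmentLock segmentLock = lockStore.computeIfAbsent(useSegmentLock.type(),
            ignore -> new SegmentLock(useSegmentLock.concurrencyLevel()));
    // Evaluate the SpEL key (e.g. #wfInstanceId.intValue()) against the call arguments.
    int index = AOPUtils.parseSpEl(AOPUtils.parseMethod(point), point.getArgs(),
            useSegmentLock.key(), Integer.class, 1);
    try {
        segmentLock.lockInterruptibleSafe(index);
        return point.proceed();
    } finally {
        segmentLock.unlock(index);
    }
}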

View File: JobService.java

@@ -145,7 +145,7 @@ public class JobService {
* @param delay delay time, in milliseconds
* @return task instance ID
*/
@DesignateServer(appIdParameterName = "appId")
@DesignateServer
public long runJob(Long appId, Long jobId, String instanceParams, Long delay) {
delay = delay == null ? 0 : delay;

View File: WorkflowInstanceManager.java

@@ -5,7 +5,6 @@ import com.alibaba.fastjson.TypeReference;
import com.github.kfcfans.powerjob.common.*;
import com.github.kfcfans.powerjob.common.model.PEWorkflowDAG;
import com.github.kfcfans.powerjob.common.utils.JsonUtils;
import com.github.kfcfans.powerjob.common.utils.SegmentLock;
import tech.powerjob.server.common.constants.SwitchableStatus;
import tech.powerjob.server.core.workflow.algorithm.WorkflowDAGUtils;
import tech.powerjob.server.persistence.remote.model.*;
@@ -63,8 +62,6 @@ public class WorkflowInstanceManager {
@Resource
private WorkflowNodeInfoRepository workflowNodeInfoRepository;
private final SegmentLock segmentLock = new SegmentLock(16);
/**
* Create a workflow instance
* ********************************************
@@ -101,7 +98,7 @@ public class WorkflowInstanceManager {
newWfInstance.setGmtModified(now);
// Validate the DAG
PEWorkflowDAG dag = null;
PEWorkflowDAG dag;
try {
dag = JSON.parseObject(wfInfo.getPeDAG(), PEWorkflowDAG.class);
// validate
@@ -151,6 +148,7 @@ public class WorkflowInstanceManager {
* @param wfInfo workflow info
* @param wfInstanceId workflow instance ID
*/
@UseSegmentLock(type = "startWfInstance", key = "#wfInfo.getId().intValue()", concurrencyLevel = 1024)
public void start(WorkflowInfoDO wfInfo, Long wfInstanceId) {
Optional<WorkflowInstanceInfoDO> wfInstanceInfoOpt = workflowInstanceInfoRepository.findByWfInstanceId(wfInstanceId);
@@ -230,12 +228,9 @@ public class WorkflowInstanceManager {
* @param result result of the completed task instance
*/
@SuppressWarnings({"squid:S3776", "squid:S2142", "squid:S1141"})
@UseSegmentLock(type = "processWfInstance", key = "#wfInstanceId.intValue()", concurrencyLevel = 1024)
public void move(Long wfInstanceId, Long instanceId, InstanceStatus status, String result) {
int lockId = wfInstanceId.hashCode();
try {
segmentLock.lockInterruptible(lockId);
Optional<WorkflowInstanceInfoDO> wfInstanceInfoOpt = workflowInstanceInfoRepository.findByWfInstanceId(wfInstanceId);
if (!wfInstanceInfoOpt.isPresent()) {
log.error("[WorkflowInstanceManager] can't find metadata by workflowInstanceId({}).", wfInstanceId);
@@ -337,21 +332,17 @@ public class WorkflowInstanceManager {
log.error("[Workflow-{}|{}] update failed.", wfId, wfInstanceId, e);
}
} catch (InterruptedException ignore) {
// ignore
} finally {
segmentLock.unlock(lockId);
}
}
/**
* Update the workflow context
* fix: this must take the same lock as the other methods that operate on the workflow instance, otherwise concurrent updates race and node status can be overwritten
*
* @param wfInstanceId workflow instance ID
* @param appendedWfContextData context data to append
* @since 2021/02/05
*/
@UseSegmentLock(type = "updateWfContext", key = "#wfInstanceId.intValue()", concurrencyLevel = 1024)
@UseSegmentLock(type = "processWfInstance", key = "#wfInstanceId.intValue()", concurrencyLevel = 1024)
public void updateWorkflowContext(Long wfInstanceId, Map<String, String> appendedWfContextData) {
try {

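The heart of the fix: the two critical sections that used to race now declare the same lock type and key. Shown side by side with bodies elided (signatures as in this diff), any pair of concurrent calls for one wfInstanceId now serializes on the same segment of the shared "processWfInstance" lock:

@UseSegmentLock(type = "processWfInstance", key = "#wfInstanceId.intValue()", concurrencyLevel = 1024)
public void move(Long wfInstanceId, Long instanceId, InstanceStatus status, String result) {
    // load workflow instance -> update DAG node status -> save
}

@UseSegmentLock(type = "processWfInstance", key = "#wfInstanceId.intValue()", concurrencyLevel = 1024)
public void updateWorkflowContext(Long wfInstanceId, Map<String, String> appendedWfContextData) {
    // load workflow instance -> append context data -> save
}

Previously move() synchronized on a private SegmentLock field while updateWorkflowContext() used the separate lock type "updateWfContext", so the two read-modify-write sequences could interleave and the later save could overwrite the other thread's node-status update.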
View File: WorkflowInstanceService.java

@@ -8,6 +8,7 @@ import com.github.kfcfans.powerjob.common.WorkflowInstanceStatus;
import com.github.kfcfans.powerjob.common.model.PEWorkflowDAG;
import com.github.kfcfans.powerjob.common.response.WorkflowInstanceInfoDTO;
import tech.powerjob.server.common.constants.SwitchableStatus;
import tech.powerjob.server.core.lock.UseSegmentLock;
import tech.powerjob.server.core.workflow.algorithm.WorkflowDAGUtils;
import tech.powerjob.server.persistence.remote.model.WorkflowInfoDO;
import tech.powerjob.server.persistence.remote.model.WorkflowInstanceInfoDO;
@@ -53,7 +54,8 @@ public class WorkflowInstanceService {
* @param wfInstanceId workflow instance ID
* @param appId ID of the owning app
*/
@DesignateServer(appIdParameterName = "appId")
@DesignateServer
@UseSegmentLock(type = "processWfInstance", key = "#wfInstanceId.intValue()", concurrencyLevel = 1024)
public void stopWorkflowInstance(Long wfInstanceId, Long appId) {
WorkflowInstanceInfoDO wfInstance = fetchWfInstance(wfInstanceId, appId);
if (!WorkflowInstanceStatus.GENERALIZED_RUNNING_STATUS.contains(wfInstance.getStatus())) {
@@ -92,7 +94,8 @@ public class WorkflowInstanceService {
* @param wfInstanceId workflow instance ID
* @param appId app ID
*/
@DesignateServer(appIdParameterName = "appId")
@DesignateServer
@UseSegmentLock(type = "processWfInstance", key = "#wfInstanceId.intValue()", concurrencyLevel = 1024)
public void retryWorkflowInstance(Long wfInstanceId, Long appId) {
WorkflowInstanceInfoDO wfInstance = fetchWfInstance(wfInstanceId, appId);
// only failed workflow instances may be retried
@@ -107,7 +110,7 @@ public class WorkflowInstanceService {
throw new PowerJobException("you can't retry the workflow instance which is missing job info!");
}
// Validate the DAG
PEWorkflowDAG dag = null;
PEWorkflowDAG dag;
try {
dag = JSON.parseObject(wfInstance.getDag(), PEWorkflowDAG.class);
if (!WorkflowDAGUtils.valid(dag)) {
@@ -161,13 +164,17 @@ public class WorkflowInstanceService {
* and it only touches the node info (status, result) in the workflow instance's DAG,
* without changing any information of the corresponding task instance.
*
* Take the lock anyway, just to be safe ~
*
* @param wfInstanceId workflow instance ID
* @param nodeId node ID
*/
@DesignateServer
@UseSegmentLock(type = "processWfInstance", key = "#wfInstanceId.intValue()", concurrencyLevel = 1024)
public void markNodeAsSuccess(Long appId, Long wfInstanceId, Long nodeId) {
WorkflowInstanceInfoDO wfInstance = fetchWfInstance(wfInstanceId, appId);
// Validate the workflow instance status: instances in a running state may not be processed
if (WorkflowInstanceStatus.GENERALIZED_RUNNING_STATUS.contains(wfInstance.getStatus())) {
throw new PowerJobException("you can't mark the node in a running workflow!");
}

View File: WorkflowService.java

@@ -265,7 +265,7 @@ public class WorkflowService {
* @param delay delay time
* @return instanceId of the workflow instance (wfInstanceId)
*/
@DesignateServer(appIdParameterName = "appId")
@DesignateServer
public Long runWorkflow(Long wfId, Long appId, String initParams, Long delay) {
delay = delay == null ? 0 : delay;

View File: WorkflowInstanceInfoRepository.java

@@ -18,17 +18,39 @@ import java.util.Optional;
*/
public interface WorkflowInstanceInfoRepository extends JpaRepository<WorkflowInstanceInfoDO, Long> {
/**
* Find the corresponding workflow instance
* @param wfInstanceId instance ID
* @return workflow instance
*/
Optional<WorkflowInstanceInfoDO> findByWfInstanceId(Long wfInstanceId);
// Delete historical data. JPA's built-in delete loops over IDs and removes rows one by one; deleting 2000 rows took several seconds, which is dreadful...
// The result can only be received as int
/**
* Delete historical data. JPA's built-in delete loops over IDs and removes rows one by one; deleting 2000 rows took several seconds, which is dreadful...
* The result can only be received as int
* @param time update-time threshold
* @param status status list
* @return number of deleted records
*/
@Modifying
@Transactional
@Transactional(rollbackOn = Exception.class)
@Query(value = "delete from WorkflowInstanceInfoDO where gmtModified < ?1 and status in ?2")
int deleteAllByGmtModifiedBeforeAndStatusIn(Date time, List<Integer> status);
/**
* Count the instances of this workflow that are in the given statuses
* @param workflowId workflow ID
* @param status status list
* @return number of matching records
*/
int countByWorkflowIdAndStatusIn(Long workflowId, List<Integer> status);
// status check
/**
* Load workflow instances whose expected trigger time is below the given threshold
* @param appIds app ID list
* @param status status
* @param time expected-trigger-time threshold
* @return list of workflow instances
*/
List<WorkflowInstanceInfoDO> findByAppIdInAndStatusAndExpectedTriggerTimeLessThan(List<Long> appIds, int status, long time);
}
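To make the comment on the bulk delete concrete, here is a hypothetical contrast of the two deletion styles in Spring Data JPA; the derived-query variant is illustrative only and not part of this repository:

// Derived delete: Spring Data first SELECTs the matching entities and then
// issues one DELETE per row, which is what made the 2000-row cleanup so slow.
long deleteByGmtModifiedBefore(Date time);

// Bulk JPQL delete as declared above: a single DELETE statement whose
// affected row count comes back as int.
@Modifying
@Transactional(rollbackOn = Exception.class)
@Query(value = "delete from WorkflowInstanceInfoDO where gmtModified < ?1 and status in ?2")
int deleteAllByGmtModifiedBeforeAndStatusIn(Date time, List<Integer> status);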

View File: DesignateServer.java

@@ -17,8 +17,8 @@ import java.lang.annotation.Target;
public @interface DesignateServer {
/**
* Forwarding a request requires the currentServer info under AppInfo, so an appId argument is mandatory; this field specifies the name of the appId parameter
* Forwarding a request requires the currentServer info under AppInfo, so an appId argument is mandatory; this field specifies the name of the appId parameter, which defaults to "appId"
* @return name of the appId parameter
*/
String appIdParameterName();
String appIdParameterName() default "appId";
}
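A quick before/after usage sketch (method body elided): with the new default, the call sites touched by this commit can drop the explicit attribute as long as the parameter is literally named appId:

// before
@DesignateServer(appIdParameterName = "appId")
public void stopInstance(Long appId, Long instanceId) { /* ... */ }

// after, equivalent
@DesignateServer
public void stopInstance(Long appId, Long instanceId) { /* ... */ }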

View File: DesignateServerAspect.java

@@ -7,6 +7,7 @@ import com.fasterxml.jackson.databind.type.TypeFactory;
import com.github.kfcfans.powerjob.common.PowerJobException;
import com.github.kfcfans.powerjob.common.RemoteConstant;
import com.github.kfcfans.powerjob.common.response.AskResponse;
import org.springframework.core.annotation.Order;
import tech.powerjob.server.persistence.remote.model.AppInfoDO;
import tech.powerjob.server.persistence.remote.repository.AppInfoRepository;
import tech.powerjob.server.remote.transport.starter.AkkaStarter;
@@ -37,12 +38,13 @@ import java.util.concurrent.CompletionStage;
@Slf4j
@Aspect
@Component
@Order(0)
public class DesignateServerAspect {
@Resource
private AppInfoRepository appInfoRepository;
private static final ObjectMapper objectMapper = new ObjectMapper();
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@Around(value = "@annotation(designateServer)")
public Object execute(ProceedingJoinPoint point, DesignateServer designateServer) throws Throwable {
@@ -99,7 +101,7 @@ public class DesignateServerAspect {
Method method = methodSignature.getMethod();
JavaType returnType = getMethodReturnJavaType(method);
return objectMapper.readValue(askResponse.getData(), returnType);
return OBJECT_MAPPER.readValue(askResponse.getData(), returnType);
}
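The new @Order values pin the aspect chain: the forwarding advice gets the lower order and therefore wraps the locking advice, so a call that is forwarded to the server owning the app returns before any local segment lock is taken, and only locally executed calls reach the lock. A sketch of the resulting nesting (class bodies elided):

@Aspect @Component @Order(0)   // outermost: may forward to the owning server and return early
public class DesignateServerAspect { /* ... */ }

@Aspect @Component @Order(1)   // inner: takes the segment lock only for calls executed locally
public class UseSegmentLockAspect { /* ... */ }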

View File: WorkerClusterQueryService.java

@@ -53,7 +53,7 @@ public class WorkerClusterQueryService {
return workers;
}
@DesignateServer(appIdParameterName = "appId")
@DesignateServer
public List<WorkerInfo> getAllWorkers(Long appId) {
List<WorkerInfo> workers = Lists.newLinkedList(getWorkerInfosByAppId(appId).values());
workers.sort((o1, o2) -> o2.getSystemMetrics().calculateScore() - o1.getSystemMetrics().calculateScore());