Name
global.RetrySolutionUtil
Description
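Utility that scans ml_solution records whose training (or update-training) job has not reached a terminal state, reconciles them with the job states reported by the training scheduler, and cancels and resubmits jobs that have timed out or are flagged for retry, up to the limit configured in glide.platform_ml.training.retry.limit.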
Script
// Constructor for RetrySolutionUtil, produced by Class.create(); its prototype
// (constants and methods) is assigned by the statement that follows.
var RetrySolutionUtil = Class.create();
/**
 * RetrySolutionUtil
 *
 * Scans ml_solution records whose `state` or `update_state` is not terminal,
 * reconciles them with the job states reported by the training scheduler
 * (sn_ml.TrainingRequest#getTrainingJobsState) and cancels / resubmits trainings
 * that have timed out or are flagged for retry.
 *
 * Entry point: retrySolutions().
 */
RetrySolutionUtil.prototype = {
    initialize: function() {},

    // ---- ml_solution field names -------------------------------------------
    STATE: 'state',
    UPDATE_STATE: 'update_state',
    SYS_UPDATED_ON: 'sys_updated_on',
    SYS_CREATED_ON: 'sys_created_on',
    TOKEN_HASH: 'token_hash',
    PROGRESS: 'progress',
    PROGRESS_TRACKER: 'progress_tracker',
    WORKER_ID: 'worker_id',
    ACTIVE: 'active',
    VERSION: 'version',
    WORKER: 'worker',

    // ---- state values --------------------------------------------------------
    WAITING_FOR_TRAINING_STATE: 'waiting_for_training',
    RETRY_STATE: 'retry',
    RUNNING_CLASSIFICATION: 'running_classification',
    RUNNING_CLUSTERING: 'running_clustering',
    TRAINING_SOLUTION: 'training_solution',
    SOLUTION_COMPLETE_STATE: 'solution_complete',
    SOLUTION_CANCELLED_STATE: 'solution_cancelled',
    SOLUTION_ERROR_STATE: 'solution_error',
    TIMED_OUT: 'timed_out',
    UNAUTHORIZED: 'unauthorized',
    EMPTY: '',
    NOT_IN: 'NOT IN',

    // ---- tables --------------------------------------------------------------
    ML_CAPABILITY_DEFINITION_BASE_TABLE: 'ml_capability_definition_base',
    ML_CAPABILITY_DEFINITION_TABLE: 'ml_capability_definition',
    ML_SOLUTION: 'ml_solution',

    // ---- system properties ---------------------------------------------------
    TRAINER_RESPONSE_TIMEOUT_PROPERTY: 'glide.platform_ml.trainer_response_timeout_threshold',
    TRAINING_TIMEOUT_PROPERTY: 'glide.platform_ml.training_timeout',
    TRAINING_RETRY_LIMIT_PROPERTY: 'glide.platform_ml.training.retry.limit',
    JOB_STATE_WINDOW_HOURS: 'glide.platform_ml.status_api.time_window',

    // ---- capabilities that support update jobs ------------------------------
    DATA_ANALYSIS_UPDATER: 'data_analysis_updater',
    DATA_ANALYSIS_TRAINER: 'data_analysis_trainer',
    CLUSTERING_TRAINER: 'clustering_trainer',

    // Maximum number of solution ids sent in one getTrainingJobsState request.
    JOB_REQUEST_COUNT_LIMIT: 40,

    // Retry limit used when the retry-limit property is unset or not a number.
    DEFAULT_TRAINING_RETRY_LIMIT: 3,

    // Scheduler job states that may be written to ml_solution as-is.
    VALID_JOB_STATES: ["fetching_files_for_training", "preparing_data", "retry", "solution_cancelled", "solution_complete", "solution_error", "solution_error_missing_artifacts", "timed_out", "training_request_received", "training_solution", "unauthorized", "uploading_solution", "waiting_for_training"],

    /**
     * Walks every ml_solution whose state or update_state is not terminal and
     * either reconciles it with the scheduler (when it is waiting for training)
     * or cancels and retries it (when it has exceeded the trainer response
     * timeout or is in the retry state).
     */
    retrySolutions: function() {
        // -1 (the default) disables the corresponding timeout check.
        var timeoutDuration = gs.getProperty(this.TRAINER_RESPONSE_TIMEOUT_PROPERTY, -1);
        var waitingTimeoutDuration = gs.getProperty(this.TRAINING_TIMEOUT_PROPERTY, -1);
        var terminalStates = [this.SOLUTION_CANCELLED_STATE, this.SOLUTION_COMPLETE_STATE, this.SOLUTION_ERROR_STATE];

        var solution = new GlideRecord(this.ML_SOLUTION);
        var stateQuery = solution.addQuery(this.STATE, this.NOT_IN, terminalStates);
        stateQuery.addOrCondition(this.UPDATE_STATE, this.NOT_IN, terminalStates);
        solution.query();

        var result = this.fetchJobStatesFromScheduler();

        while (solution.next()) {
            if (new MLGroupbyUtils().isChildGroupbySol(solution))
                continue;
            // We do not want to pick the trainings that we marked as timedOut in previous run.
            // Processing this can result in creation of duplicate solution records.
            if (solution.getValue(this.STATE) == this.TIMED_OUT || solution.getValue(this.UPDATE_STATE) == this.TIMED_OUT)
                continue;

            var isUpdateJob = this.isUpdateTraining(solution);
            var timeSinceUpdate = this.calculateTimeSinceUpdate(solution);
            var currentState = this.getState(solution, isUpdateJob);

            if (currentState == this.WAITING_FOR_TRAINING_STATE) {
                this.handleWaitingForTrainingState(waitingTimeoutDuration, timeSinceUpdate, solution, isUpdateJob, result, terminalStates);
                continue;
            }

            // Solution has state (or update_state) other than waiting_for_training
            var responseTimedOut = timeoutDuration != -1 && timeSinceUpdate >= timeoutDuration;
            if (responseTimedOut || currentState == this.RETRY_STATE) {
                this._logTimedOut(solution, isUpdateJob, timeSinceUpdate);
                this.cancelAndRetry(solution, isUpdateJob);
            }
        }
    },

    /**
     * Writes the "timed out" log entry for a solution that is about to be
     * cancelled and retried.
     * @param {GlideRecord} solution ml_solution record
     * @param {boolean} isUpdateJob true when the update_state drives the job
     * @param {number} timeSinceUpdate value passed to GlideDuration for display
     */
    _logTimedOut: function(solution, isUpdateJob, timeSinceUpdate) {
        var lastUpdated = this.getLastUpdatedTime(solution);
        var prefix = isUpdateJob ? "Training update for solution definition " : "Training for solution definition ";
        var stateDetail = isUpdateJob ? ", update_state: " + solution.update_state : ", state: " + solution.state;
        gs.log(prefix + solution.solution_definition + " timed out - sys_id: " + solution.sys_id + stateDetail + ", created time: " + solution.sys_created_on + ", last updated time: " + lastUpdated + ", calculated time since last update: " + new GlideDuration(timeSinceUpdate).getDurationValue());
    },

    /**
     * Reconciles a solution that is waiting for training with the state the
     * scheduler reports for it.
     *  - scheduler has no state / is also waiting: cancel and retry once the
     *    waiting timeout elapsed (skipped on the last retry attempt);
     *  - scheduler reports a terminal state: mark timed out and retry;
     *  - any other scheduler state: copy it onto the solution.
     * Every write is preceded by a fresh read of the record so a solution that
     * moved on in the meantime is left alone.
     * @param {number|string} waitingTimeoutDuration waiting timeout, -1 disables it
     * @param {number} timeSinceUpdate time since the last update of the solution
     * @param {GlideRecord} solution ml_solution record
     * @param {boolean} isUpdateJob true when the update_state drives the job
     * @param {Array} result job states returned by the scheduler
     * @param {Array} terminalStates list of terminal state values
     */
    handleWaitingForTrainingState: function(waitingTimeoutDuration, timeSinceUpdate, solution, isUpdateJob, result, terminalStates) {
        var jobStateFromScheduler = this.getJobStatus(result, solution.getUniqueValue());
        var schedulerStillWaiting = jobStateFromScheduler == null || jobStateFromScheduler == this.EMPTY || jobStateFromScheduler == this.WAITING_FOR_TRAINING_STATE;

        if (schedulerStillWaiting) {
            var waitedTooLong = waitingTimeoutDuration != -1 && timeSinceUpdate > waitingTimeoutDuration;
            // For Last retry attempt alone - we should not consider waiting timeout duration.
            // This is to honor exceptional cases where scheduler is flooded with trainings and not responding
            if (waitedTooLong && !this.isLastRetry(solution, isUpdateJob) && this._isStillWaiting(solution, isUpdateJob))
                this.cancelAndRetry(solution, isUpdateJob);
            return;
        }

        if (!this._isStillWaiting(solution, isUpdateJob))
            return;

        if (terminalStates.includes(jobStateFromScheduler)) {
            // set the state of solution to Timed out and perform retry
            this.setState(solution, isUpdateJob, this.TIMED_OUT);
            this.performRetry(solution, isUpdateJob);
        } else {
            this.checkAndUpdateStateOnSolution(solution, jobStateFromScheduler, isUpdateJob);
        }
    },

    /**
     * Re-reads the solution and tells whether it is still waiting for training.
     * @param {GlideRecord} solution ml_solution record
     * @param {boolean} isUpdateJob true to look at update_state instead of state
     * @returns {boolean}
     */
    _isStillWaiting: function(solution, isUpdateJob) {
        return this.getSolutionState(solution.getUniqueValue(), isUpdateJob) == this.WAITING_FOR_TRAINING_STATE;
    },

    /**
     * @param {GlideRecord} solution ml_solution record
     * @param {boolean} isUpdateJob true for an update training
     * @returns {boolean} true when the current execution is the last allowed retry
     */
    isLastRetry: function(solution, isUpdateJob) {
        var retryLimit = this.getRetryLimit();
        return isUpdateJob ? this.isLastRetryUpdateTraining(solution, retryLimit) : this.isLastRetryTraining(solution, retryLimit);
    },

    /**
     * @param {GlideRecord} solution ml_solution record
     * @param {number} retryLimit configured retry limit
     * @returns {boolean} true when the failed attempts of the solution's
     *     definition equal the retry limit
     */
    isLastRetryTraining: function(solution, retryLimit) {
        var solutionDefinition = new GlideRecord(this.ML_CAPABILITY_DEFINITION_BASE_TABLE);
        solutionDefinition.get(solution.ml_capability_definition);
        // If the failed attempts is equal to retryLimit - it means the current execution is the last retry.
        return retryLimit == this.getFailedAttempts(solutionDefinition);
    },

    /**
     * @param {GlideRecord} solution ml_solution record
     * @param {number} retryLimit configured retry limit
     * @returns {boolean} true when update_config.retry_count equals the retry limit
     */
    isLastRetryUpdateTraining: function(solution, retryLimit) {
        return this.getFailedUpdateAttempts(solution) == retryLimit;
    },

    /**
     * Reads the current state of a solution straight from the database.
     * @param {string} solutionSysId sys_id of the ml_solution record
     * @param {boolean} isUpdateJob true to return update_state instead of state
     * @returns {string} the stored value, or EMPTY when the record is not found
     */
    getSolutionState: function(solutionSysId, isUpdateJob) {
        var solutionGR = new GlideRecord(this.ML_SOLUTION);
        // Rely on the result of get(): isValid() only reports that the table
        // exists, not that a record was loaded.
        if (!solutionGR.get(solutionSysId))
            return this.EMPTY;
        return solutionGR.getValue(isUpdateJob ? this.UPDATE_STATE : this.STATE);
    },

    /**
     * @param {GlideRecord} solution ml_solution record (already loaded)
     * @param {boolean} isUpdateJob true to return update_state instead of state
     * @returns {string} state value held by the given record
     */
    getState: function(solution, isUpdateJob) {
        return solution.getValue(isUpdateJob ? this.UPDATE_STATE : this.STATE);
    },

    /**
     * Sets state (or update_state) on the solution, logs it and saves the record.
     * @param {GlideRecord} solution ml_solution record
     * @param {boolean} isUpdateJob true to write update_state instead of state
     * @param {string} state new state value
     */
    setState: function(solution, isUpdateJob, state) {
        if (isUpdateJob) {
            gs.info("Update state of solution: " + solution.getUniqueValue() + " is updated to " + state);
            solution.setValue(this.UPDATE_STATE, state);
        } else {
            gs.info("State of solution: " + solution.getUniqueValue() + " is updated to " + state);
            solution.setValue(this.STATE, state);
        }
        solution.update();
    },

    /**
     * @param {GlideRecord} solution ml_solution record
     * @returns {boolean} true when update_state is populated
     */
    isUpdateTraining: function(solution) {
        return solution.update_state != this.EMPTY;
    },

    /**
     * Compare progress tracker and solution updated times and pick latest time.
     * When only one of the two is populated that one is used.
     * @param {GlideRecord} solution ml_solution record
     * @returns {GlideDateTime}
     */
    getLastUpdatedTime: function(solution) {
        var trackerLastUpdated = solution.progress_tracker.sys_updated_on;
        var solutionLastUpdated = solution.getValue(this.SYS_UPDATED_ON);
        var hasTrackerTime = !GlideStringUtil.nil(trackerLastUpdated);
        var hasSolutionTime = !GlideStringUtil.nil(solutionLastUpdated);

        if (hasTrackerTime && (!hasSolutionTime || trackerLastUpdated > solutionLastUpdated))
            return new GlideDateTime(trackerLastUpdated);
        return new GlideDateTime(solutionLastUpdated);
    },

    /**
     * @param {GlideRecord} solution ml_solution record
     * @returns {number} difference between now and getLastUpdatedTime(solution),
     *     both taken from GlideDateTime#getNumericValue
     */
    calculateTimeSinceUpdate: function(solution) {
        var lastUpdated = this.getLastUpdatedTime(solution);
        return new GlideDateTime().getNumericValue() - lastUpdated.getNumericValue();
    },

    /**
     * Cancels the running job, marks the solution timed out and resubmits it.
     * @param {GlideRecord} solution ml_solution record
     * @param {boolean} isUpdateJob true for an update training
     */
    cancelAndRetry: function(solution, isUpdateJob) {
        if (isUpdateJob) {
            this.cancelUpdateTraining(solution);
            this.retryUpdateTraining(solution);
        } else {
            this.cancelTraining(solution);
            this.retryTraining(solution);
        }
    },

    /**
     * Asks the trainer to cancel the training and sets state to timed_out.
     * @param {GlideRecord} solution ml_solution record
     */
    cancelTraining: function(solution) {
        new sn_ml.TrainingRequest().cancelTraining(solution.getUniqueValue());
        solution.setValue(this.STATE, this.TIMED_OUT);
        solution.update();
    },

    /**
     * Starts a new training request for the solution's definition unless the
     * retry limit has been reached.
     * @param {GlideRecord} solution ml_solution record that failed
     */
    retryTraining: function(solution) {
        var solutionDefinition = new GlideRecord(this.ML_CAPABILITY_DEFINITION_BASE_TABLE);
        if (!solutionDefinition.get(solution.ml_capability_definition)) {
            // Without a definition there is nothing to count attempts against or to resubmit.
            gs.error("Cannot retry training for solution " + solution.getUniqueValue() + ": solution definition not found");
            return;
        }
        // Failed Attempts include the initial submission also. So reducing count by 1 for failedRetryAttempts
        var failedRetryAttempts = this.getFailedAttempts(solutionDefinition) - 1;
        var trainingRetryLimit = this.getRetryLimit();
        if (failedRetryAttempts < trainingRetryLimit)
            this.startNewServiceRequest(solution, failedRetryAttempts, solutionDefinition, trainingRetryLimit);
        else
            gs.log("Training for solution definition " + solutionDefinition.getUniqueValue() + " has hit the max number of retries and timed out");
    },

    /**
     * Retries without cancelling first.
     * @param {GlideRecord} solution ml_solution record
     * @param {boolean} isUpdateJob true for an update training
     */
    performRetry: function(solution, isUpdateJob) {
        if (isUpdateJob)
            this.retryUpdateTraining(solution);
        else
            this.retryTraining(solution);
    },

    /**
     * Reads the retry limit property as an integer. gs.getProperty hands back
     * the stored value as a string, so it is parsed here to keep arithmetic on
     * the limit (e.g. limit + 1) numeric.
     * @returns {number} configured limit, or DEFAULT_TRAINING_RETRY_LIMIT when
     *     the property is unset or not a number
     */
    getRetryLimit: function() {
        var limit = parseInt(gs.getProperty(this.TRAINING_RETRY_LIMIT_PROPERTY, this.DEFAULT_TRAINING_RETRY_LIMIT), 10);
        return isNaN(limit) ? this.DEFAULT_TRAINING_RETRY_LIMIT : limit;
    },

    /**
     * Asks the trainer to cancel the update job and sets update_state to timed_out.
     * @param {GlideRecord} solution ml_solution record
     */
    cancelUpdateTraining: function(solution) {
        new sn_ml.TrainingRequest().cancelUpdateJob(solution.getUniqueValue());
        solution.setValue(this.UPDATE_STATE, this.TIMED_OUT);
        solution.update();
    },

    /**
     * Resubmits the update job unless the retry limit has been reached.
     * @param {GlideRecord} solution ml_solution record
     */
    retryUpdateTraining: function(solution) {
        // retry_count starts with 0 for initial submission. It increases with every retry
        var failedRetryAttempts = this.getFailedUpdateAttempts(solution);
        var trainingRetryLimit = this.getRetryLimit();
        if (failedRetryAttempts < trainingRetryLimit)
            this.submitUpdateJob(solution);
        else
            gs.log("Update training for solution " + solution.getUniqueValue() + " has hit the max number of retries and timed out");
    },

    /**
     * Looks up the solution in the store matching its capability and submits
     * an update job on its active version using the stored update_config.
     * Capabilities other than the data-analysis and clustering ones are not
     * supported; they are logged and skipped.
     * @param {GlideRecord} solution ml_solution record
     */
    submitUpdateJob: function(solution) {
        var capability = solution.capability;
        var solutionName = solution.solution_name;
        var mlSolution = null;

        if (capability == this.DATA_ANALYSIS_UPDATER || capability == this.DATA_ANALYSIS_TRAINER)
            mlSolution = sn_ml.DataAnalysisStore.get(solutionName);
        else if (capability == this.CLUSTERING_TRAINER)
            mlSolution = sn_ml.ClusteringSolutionStore.get(solutionName);

        if (mlSolution == null) {
            gs.error("Cannot submit update job for solution " + solution.getUniqueValue() + ": no solution store entry for capability '" + capability + "'");
            return;
        }

        var solutionVersion = mlSolution.getActiveVersion();
        var updateConfig = JSON.parse(solution.update_config);
        solutionVersion.submitUpdateJob(updateConfig);
    },

    /**
     * Finds the scheduler state of one solution in the scheduler response.
     * @param {Array} result entries with `solution_id` and `state`
     * @param {string} solutionId sys_id of the ml_solution record
     * @returns {?string} EMPTY when there is no entry for the solution,
     *     training_solution for the running_* scheduler states, the state
     *     itself when it is a valid ml_solution state, otherwise null
     */
    getJobStatus: function(result, solutionId) {
        if (result == null || result.length == 0)
            return this.EMPTY;

        for (var i = 0; i < result.length; i++) {
            var job = result[i];
            if (job == null || job.solution_id != solutionId)
                continue;
            // running_classification and running_clustering are not ml_solution states.
            // Returning training_solution instead.
            if (job.state == this.RUNNING_CLASSIFICATION || job.state == this.RUNNING_CLUSTERING)
                return this.TRAINING_SOLUTION;
            return this.isValidState(job.state) ? job.state : null;
        }
        return this.EMPTY;
    },

    /**
     * @param {string} state scheduler job state
     * @returns {boolean} true when the state is listed in VALID_JOB_STATES
     */
    isValidState: function(state) {
        return this.VALID_JOB_STATES.includes(state);
    },

    /**
     * Copies the scheduler state onto the solution and saves it. Does nothing
     * when the scheduler state is null.
     * @param {GlideRecord} solution ml_solution record
     * @param {?string} jobStateFromScheduler state reported by the scheduler
     * @param {boolean} isUpdateJob true to write update_state instead of state
     */
    checkAndUpdateStateOnSolution: function(solution, jobStateFromScheduler, isUpdateJob) {
        if (jobStateFromScheduler == null)
            return;

        if (isUpdateJob) {
            solution.setValue(this.UPDATE_STATE, jobStateFromScheduler);
            gs.info("Setting update_state of solution " + solution.getUniqueValue() + " to " + jobStateFromScheduler);
        } else {
            solution.setValue(this.STATE, jobStateFromScheduler);
            gs.info("Setting state of solution " + solution.getUniqueValue() + " to " + jobStateFromScheduler);
        }
        solution.update();
    },

    /**
     * @returns {string[]} sys_ids of every ml_solution whose state or
     *     update_state is waiting_for_training
     */
    getSolutionIds: function() {
        var solution = new GlideRecord(this.ML_SOLUTION);
        var stateQuery = solution.addQuery(this.STATE, this.WAITING_FOR_TRAINING_STATE);
        stateQuery.addOrCondition(this.UPDATE_STATE, this.WAITING_FOR_TRAINING_STATE);
        solution.query();

        var solutionIds = [];
        while (solution.next())
            solutionIds.push(solution.getUniqueValue());
        return solutionIds;
    },

    /**
     * Fetches the scheduler job states of all solutions waiting for training.
     * @returns {Array} job state entries; empty when nothing is waiting
     */
    fetchJobStatesFromScheduler: function() {
        var solutionIds = this.getSolutionIds();
        if (solutionIds.length == 0)
            return [];
        return this.fetchJobStatesForSolutionIds(solutionIds);
    },

    /**
     * Queries getTrainingJobsState for the given solutions, at most
     * JOB_REQUEST_COUNT_LIMIT ids per request. The input array is left
     * untouched. When a request or its parsing fails the error is logged and
     * the states collected so far are returned.
     * @param {string[]} solutionIds sys_ids of ml_solution records
     * @returns {Array} concatenated `result` entries of all responses
     */
    fetchJobStatesForSolutionIds: function(solutionIds) {
        var jobStates = [];
        try {
            for (var start = 0; start < solutionIds.length; start += this.JOB_REQUEST_COUNT_LIMIT) {
                var batch = solutionIds.slice(start, start + this.JOB_REQUEST_COUNT_LIMIT);
                var body = new sn_ml.TrainingRequest().getTrainingJobsState(this.WORKER, this.EMPTY, batch.toString()).getBody();
                var batchStates = JSON.parse(body).result;
                // A response without a result list contributes nothing (and must not add an undefined entry).
                if (Array.isArray(batchStates))
                    jobStates = jobStates.concat(batchStates);
            }
        } catch (e) {
            gs.error("Unable to fetch Job states from getTrainingJobsState API for solution Ids: " + solutionIds + ". Exception details : " + e);
        }
        return jobStates;
    },

    /**
     * Removes and returns the next batch (at most JOB_REQUEST_COUNT_LIMIT ids)
     * from the given array.
     * @param {string[]} solutionIds remaining ids; shortened by this call
     * @returns {string[]} the extracted ids
     */
    getSolutionIdsForCurrentRequest: function(solutionIds) {
        return this.extractItems(solutionIds, Math.min(solutionIds.length, this.JOB_REQUEST_COUNT_LIMIT));
    },

    /**
     * Pops up to `count` ids off the end of the given array.
     * @param {string[]} solutionIds source array; shortened by this call
     * @param {number} count number of ids to extract
     * @returns {string[]} extracted ids, last element of the source first
     */
    extractItems: function(solutionIds, count) {
        var extractedIds = [];
        while (count > 0 && solutionIds.length > 0) {
            extractedIds.push(solutionIds.pop());
            count--;
        }
        return extractedIds;
    },

    /**
     * Counts how many of the newest (retry limit + 1) solutions of a definition
     * are in the timed_out or retry state.
     * @param {GlideRecord} solutionDefinition definition record
     * @returns {number}
     */
    getFailedAttempts: function(solutionDefinition) {
        var trainingRetryLimit = this.getRetryLimit();
        var gr = new GlideRecord(this.ML_SOLUTION);
        gr.addQuery(this.ML_CAPABILITY_DEFINITION_TABLE, solutionDefinition.getUniqueValue());
        gr.orderByDesc(this.SYS_CREATED_ON);
        // The initial submission plus every allowed retry.
        gr.setLimit(trainingRetryLimit + 1);
        gr.query();

        var count = 0;
        while (gr.next()) {
            var state = gr.getValue(this.STATE);
            if (state == this.TIMED_OUT || state == this.RETRY_STATE)
                count++;
        }
        return count;
    },

    /**
     * @param {GlideRecord} solution ml_solution record with a JSON update_config
     * @returns {number} update_config.retry_count, or 0 when it is absent
     */
    getFailedUpdateAttempts: function(solution) {
        var updateConfig = JSON.parse(solution.update_config);
        // Loose comparisons on purpose: they cover both null and undefined.
        if (updateConfig == null || updateConfig.retry_count == undefined)
            return 0;
        return updateConfig.retry_count;
    },

    /**
     * Clones the failed solution into a new version, posts a service request
     * for it and records the outcome in the new solution's state:
     * timed_out when the attempts exceed the limit, unauthorized on HTTP 401,
     * waiting_for_training on HTTP 200, retry otherwise. The token hash is
     * cleared in every case except success.
     * @param {GlideRecord} solution ml_solution record that failed
     * @param {number} failedRetryAttempts retries already made
     * @param {GlideRecord} solutionDefinition definition record
     * @param {number} trainingRetryLimit configured retry limit
     */
    startNewServiceRequest: function(solution, failedRetryAttempts, solutionDefinition, trainingRetryLimit) {
        var token = GlideSecureRandomUtil.getSecureRandomString(32);
        var newSolutionId = this.insertNewSolution(solution, token);
        if (newSolutionId == this.EMPTY || newSolutionId == null) {
            gs.print("Could not create new solution for solution definition " + solutionDefinition.getUniqueValue());
            return;
        }

        var newSolution = new GlideRecord(this.ML_SOLUTION);
        newSolution.get(newSolutionId);
        gs.print("Attempt number " + (failedRetryAttempts + 1) + " to train for the solution definition: " + solutionDefinition.getUniqueValue());

        var response = new sn_ml.TrainingRequest(solutionDefinition).postServiceRequest(newSolutionId, token);
        var hasResponse = !gs.nil(response);
        var newState;

        if (failedRetryAttempts > trainingRetryLimit) {
            newState = this.TIMED_OUT;
            gs.log("Training for solution definition " + solutionDefinition.getUniqueValue() + " has hit the max number of retries and timed out");
        } else if (hasResponse && response.getStatusCode() === 401) {
            newState = this.UNAUTHORIZED;
        } else if (hasResponse && response.getStatusCode() === 200) {
            newState = this.WAITING_FOR_TRAINING_STATE;
        } else {
            newState = this.RETRY_STATE;
        }

        newSolution.setValue(this.STATE, newState);
        // The token is only kept while a request is actually pending.
        if (newState != this.WAITING_FOR_TRAINING_STATE)
            newSolution.setValue(this.TOKEN_HASH, this.EMPTY);
        newSolution.update();
    },

    /**
     * Inserts a copy of the solution as the next version of its definition.
     * All fields are copied except progress, progress_tracker, worker_id and
     * active; token_hash is set to the MD5 digest of the token.
     * @param {GlideRecord} solution ml_solution record to copy
     * @param {string} token plain token for the new training request
     * @returns {string} sys_id returned by insert(), or "" when no next version
     *     number is available
     */
    insertNewSolution: function(solution, token) {
        var version = sn_ml.MLRecordUtil.getNextSolutionVersion(solution.getValue(this.ML_CAPABILITY_DEFINITION_TABLE));
        if (version < 0)
            return "";

        var skippedFields = [this.PROGRESS, this.PROGRESS_TRACKER, this.WORKER_ID, this.ACTIVE];
        var newSolution = new GlideRecord(this.ML_SOLUTION);
        newSolution.initialize();

        var elements = solution.getFields();
        for (var i = 0; i < elements.size(); i++) {
            var element = elements.get(i);
            var elementName = element.getName();
            if (elementName != skippedFields[0] && elementName != skippedFields[1] && elementName != skippedFields[2] && elementName != skippedFields[3])
                newSolution.setValue(elementName, element.getValue());
        }

        newSolution.setValue(this.TOKEN_HASH, new GlideDigest().md5_digest(token));
        newSolution.setValue(this.VERSION, version);
        return newSolution.insert();
    },

    type: 'RetrySolutionUtil'
};
Sys ID
5ac31e4b532121106e33ddeeff7b120f