Name

global.LongRunner

Description

Base class for long runner handlers, specified by glide.eccprobe.longrunner.class property. start() processes the initial SSHCommandLong input. error() handles any errors that come back.

Script

var LongRunner = Class.create();

//
// This is a more evolved version of DiscoveryLongRunner that avoids polling via repeating probes
//  and allows customers to customize without modifying OOB scripts and complicating future updates.
//
// Customers may extend this class to customize long running command handling, for example,
//  overriding error() to implement a different retry scheme that discriminates between
//  retryable and fatal errors. You might also override _nextPollTime() to come up with
//  a more sophisticated polling frequency rolloff scheme. ...or override _error(), _warn(), and _info()
//  and rework the logging.
//
// Alternately, the whole class may be replaced by creating a new class that implements
//  the start() and error() methods.
//
// When customizing, set glide.eccprobe.longrunner.class to the name of the new class.
//

LongRunner.prototype = {
  // class - LongRunner class (allows extending LongRunner)
  PROP_CLASS : "glide.eccprobe.longrunner.class",
  DEFAULT_CLASS : "LongRunner",
  // debug - "True" to report non-error events in the log
  PROP_DEBUG : "glide.eccprobe.longrunner.debug",
  // retry_minutes - Give up on retrying failures that have persisted longer than retry_minutes
  PROP_RETRY_M : "glide.eccprobe.longrunner.retry_minutes",
  DEFAULT_RETRY_M : "10",
  // interval.initial_seconds - Set the initial polling period to interval.initial_seconds
  PROP_INTERVAL_INITIAL_S : "glide.eccprobe.longrunner.interval.initial_seconds",
  DEFAULT_INTERVAL_INITIAL_S : "20",
  // interval.backoff_percent - Increase polling interval by this percentage per poll
  PROP_INTERVAL_BACKOFF_PCT : "glide.eccprobe.longrunner.interval.backoff_percent",
  DEFAULT_INTERVAL_BACKOFF_PCT : "15",
  // interval.max_seconds - Cap the polling interval at this value
  PROP_INTERVAL_MAX_S : "glide.eccprobe.longrunner.interval.max_seconds",
  DEFAULT_INTERVAL_MAX_S : "300",
  // max_poll_count - Limit the number of polling probes launched per LongRunner.poll()
  PROP_MAX_PROBES_PER_POLL : "glide.eccprobe.longrunner.max_poll_count",

  initialize: function() {
  	this.longRunnerClass = gs.getProperty(this.PROP_CLASS, this.DEFAULT_CLASS);
  	this.debug = gs.getProperty(this.PROP_DEBUG);
  	this.debug = !!(this.debug && this.debug.toLowerCase() == 'true'); // String (or nil) to boolean
  },

/*
* The methods in this section get called in the context of a sensor.
* The prior probe is accessed via the g_probe global.
*/

  // API entry point, called once on successful SSHCommandLong completion.
  //
  // Should initiate "polling," a series of SSHCommand operations with long_sensor pointing to
  //  our sensor() method.
  //
  // Return boolean false if processing is complete, bypassing main sensor.
  // Return boolean true or throw things if processing is incomplete,
  //  allowing main sensor to run and terminating the command.
  start: function() {
  	this._scheduleNextPoll();
  	return true;
  },

  // API entry point, called on probe error response (be it start, or poll.)
  //
  // Should determine if the error is retryable or fatal.
  //
  // Return boolean true if processing is complete, bypassing main sensor. (Retryable)
  // Return boolean false (or throw things) if processing is incomplete, allowing main
  // sensor to run. (Fatal)
  //
  // This implementation retries a fixed number of times before giving up.
  //
  // Note that error() is a public function to handle ecc_queue input with an error in the result and
  // _error() is a local function to log an error message.

  error: function() {
  	// If the command never started, just report the problem.
  	if (g_probe.topic == 'SSHCommandLong') {
  		this._error('Not retrying: failed before polling started.');
  		return false;
  	}

  	// Tolerate bursts of failures up to glide.eccprobe.longrunner.retry_minutes long.
  	// This is measured from the time of the first failing ecc_queue input, failing_since
  	// to the most recent, g_probe's sys_created_on.
  	if (typeof(failing_since) !== 'undefined') {
  		var gdtFailingSince = failing_since && GlideDateTime(failing_since);
  		var retryMinutes = parseInt(gs.getProperty(this.PROP_RETRY_M, this.DEFAULT_RETRY_M));
  		var gdtEarliestTime = new GlideDateTime(g_probe.getEccQueueRecord().sys_created_on);
  		gdtEarliestTime.addSeconds(-retryMinutes * 60);

  		// We've been trying for glide.eccprobe.longrunner.retry_minutes and no joy. Time to pack it in.
  		if (gdtFailingSince && gdtFailingSince.before(gdtEarliestTime)) {
  			this._error('Giving up on retries.\n' +
  					this.PROP_RETRY_M + ' = {0}\n' +
  					'Failing since {1}', retryMinutes, '' + gdtFailingSince.getDisplayValue());
  			return false;
  		}

  		this._warn('Retrying on repeated error.\n' +
  				this.PROP_RETRY_M + ' = {0}\n' +
  				'Failing since {1}', retryMinutes, '' + gdtFailingSince.getDisplayValue());
  	} else
  		this._warn('Retrying on initial error.');

  	// Ask again later.
  	this._scheduleNextPoll();
  	return true;
  },

  // Non-API entry point.
  //
  // We set long_sensor in our polling probes to cause this to be called as each
  //  polling probe result comes in, if successful.
  //
  // Returns true if polling should continue, false if the command is complete.
  sensor: function() {
  	if (!output) {
  		// Should *never* happen. Not going to bother to retry it with this.error().
  		throw this._getFailureHeader() + 'Missing output field in a non-error result.';
  	}

  	// Still running? Ask again later.
  	if (this._stillRunning(output)) {
  		this._scheduleNextPoll();
  		return true;
  	}

  	// Not still running? Create a new complete probe.
  	if (this._completeProbe() == null) {
  		this._error('Failed to create complete probe.');
  		return false;
  	} else {
  		this._info('Command complete. Collecting results.');
  		return true;
  	}
  },

  // Issue a new probe to complete the long running command
  // This is done as a separate probe from polling so that we can discriminate between
  //  failed polling and failed completion.
  _completeProbe: function() {
  	var subdir = '.run.' + g_probe.getParameter('ssh_long_id');
  	var path = '/tmp/' + subdir + '/';
  	var probeOut = new SncProbe();
  	probeOut.setSource(g_probe.getParameter('source')); // SncProbe.getSource() not scriptable. PRB1284487
  	var correlator = g_probe.getCorrelator();
  	probeOut.setCorrelator(g_probe.getCorrelator());
  	// "probe" parameter does not copy for free as it is in the Probe.NOT_TO_COPY list.
  	probeOut.addParameter("probe", g_probe.getParameter("probe"));
  	probeOut.copy(g_probe);
  	probeOut.setTopic('SSHCommand');
  	var completeMustSudo = probeOut.getBooleanParameter('complete_must_sudo', false);
  	probeOut.setName('sh ' + path + 'complete');
  	probeOut.addParameter('must_sudo', completeMustSudo);
  	probeOut.addParameter('run_directory', subdir);
  	probeOut.setParameter('failing_since', null);
  	probeOut.setParameter('long_error_handler', null);

  	return probeOut.create(g_probe.agent);
  },

  // Future API entry point, called on probe error response (be it start, or poll.)
  //
  // Tear down a potentially incomplete LRC
  cancel: function() {
  	throw this._getFailureHeader() + 'LongRunner.cancel() not yet implemented';
  },

  // Queue up the next poll for this LRC
  _scheduleNextPoll: function() {
  	var gr = new GlideRecord('long_runner_poll');
  	// Must use sudo on complete if we used sudo to create files. This
  	//  will appear in must_sudo in start or complete_must_sudo in sensor
  	gr.next_poll = this._nextPollTime();
  	gr.previous_probe_response = g_probe.getEccQueueId();
  	gr.insert();
  },

  // Find next poll time based on poll_interval from probe, or if not from initial poll time
  _nextPollTime: function() {
  	var gdt = new GlideDateTime();
  	var pollInterval = parseInt(g_probe.getParameter('poll_interval') ||
  		gs.getProperty(this.PROP_INTERVAL_INITIAL_S, this.DEFAULT_INTERVAL_INITIAL_S));
  	gdt.add(pollInterval * 1000);
  	this._info('poll interval {0}, next poll {1}', pollInterval, gdt.getDisplayValue());
  	return gdt;
  },

  // Find next polling interval based on current (in ms units)
  _nextPollInterval: function(previousInterval) {
  	if (!previousInterval)
  		previousInterval = parseInt(gs.getProperty(this.PROP_INTERVAL_INITIAL_S,
  								this.DEFAULT_INTERVAL_INITIAL_S));

  	var backoffPercent = parseInt(gs.getProperty(this.PROP_INTERVAL_BACKOFF_PCT,
  								this.DEFAULT_INTERVAL_BACKOFF_PCT)) / 100;
  	var maxInterval = parseInt(gs.getProperty(this.PROP_INTERVAL_MAX_S, this.DEFAULT_INTERVAL_MAX_S));

  	return Math.min(maxInterval, Math.round(100 * previousInterval * (1 + backoffPercent)) / 100);
  },

  // Return true if probe is still running
  _stillRunning: function(output) {
  	return output.startsWith("still running");
  },

  _info: function(msg, arg1, arg2) {
  	if (this.debug)
  		gs.info(this._getCommonHeader() + ' info:\n' + msg, arg1, arg2);
  },

  _warn: function(msg, arg1, arg2) {
  	gs.warn(this._getFailureHeader() + msg, arg1, arg2);
  },

  // Note that error() is a public function to handle ecc_queue input with an error in the result and
  // _error() is a local function to log an error message.
  _error: function(msg, arg1, arg2) {
  	gs.error(this._getFailureHeader() + msg, arg1, arg2);
  },

  _getFailureHeader: function() {
  	return this._getCommonHeader() + ' failed:\n';
  },

  _getCommonHeader: function() {
  	return 'LongRunner: ssh_long_id=' + ssh_long_id + ', sys_id=' + g_probe.getEccQueueId();
  },

/*
* The methods in this section get called in the context of polling.
* The prior probe is accessed via the previous_probe_response field
*  of a long_runner_poll record.
*/

  // API entry point, called to service the polling loop.
  //
  // Query for scheduled probes and execute them.
  poll: function() {
  	var pollGR = new GlideRecord('long_runner_poll');
  	pollGR.addActiveQuery();
  	pollGR.addQuery('next_pollRELATIVELE@minute@ago@0');
  	var maxPoll = gs.getProperty(this.PROP_MAX_PROBES_PER_POLL);
  	if (maxPoll) {
  		pollGR.orderBy('next_poll');
  		pollGR.setLimit(parseInt(maxPoll));
  	}
  	pollGR.query();
  	while (pollGR.next()) {
  		var probeResponseGR = new GlideRecord('ecc_queue');
  		if (pollGR.previous_probe_response) {
  			probeResponseGR.get(pollGR.previous_probe_response);
  			this._pollProbe(probeResponseGR);
  		}
  		pollGR.active = false;
  		pollGR.update();
  	}
  },

  // Issue a new polling probe for a long_runner_poll GlideRecord
  _pollProbe: function(probeResponseGR) {
  	// Get SncProbe object containing response to previous probe.
  	var probeIn = SncProbe.createProbeResponse(probeResponseGR);
  	var subdir = '.run.' + probeIn.getParameter('ssh_long_id');
  	var path = '/tmp/' + subdir + '/';
  	var probeOut = new SncProbe();
  	probeOut.setSource(probeIn.getParameter('source')); // SncProbe.getSource() not scriptable. PRB1284487
  	probeOut.setCorrelator(probeIn.getCorrelator());
  	// "probe" parameter does not copy for free as it is in the Probe.NOT_TO_COPY list.
  	probeOut.addParameter("probe", probeIn.getParameter("probe"));
  	probeOut.copy(probeIn);
  	var params = probeOut.getParametersMap();
  	if (JSUtil.toBoolean(probeIn.getParameter('use_snc_ssh')))
  		params.remove('must_sudo'); // Don't need root just to poll for running
  	probeOut.setTopic('SSHCommand');
  	var completeMustSudo = probeOut.getBooleanParameter('complete_must_sudo', false);
  	var sudoString = completeMustSudo ? 'sudo ' : '';
  	probeOut.setName('sh -c "cd ' + path + ';if [ -f running ];then echo still running;' +
  				'[ -s nohup.out ] && echo tail of stdout:;' + sudoString + 'tail -5 nohup.out;' +
  				'[ -s nohup.out2 ] && echo tail of stderr:;' + sudoString + 'tail -5 nohup.out2;' +
  				'else echo not running;fi"');
  	probeOut.addParameter('run_directory', subdir);
  	probeOut.addParameter('long_sensor', 'new ' + this.longRunnerClass + '().sensor()');
  	if (probeIn.getError()) {
  		if (!probeIn.hasParameter('failing_since'))
  			probeOut.addParameter('failing_since', probeResponseGR.sys_created_on);
  	} else
  		probeOut.setParameter('failing_since', null);
  	// Displace any old repeating commands. (Only necessary at first installation.)
  	if (probeIn.hasParameter('repeat_correlator'))
  		probeOut.addParameter('repeat_cancel', 'true');

  	var previousInterval = probeIn.getParameter('poll_interval') ||
  		parseInt(gs.getProperty(this.PROP_INTERVAL_INITIAL_S, this.DEFAULT_INTERVAL_INITIAL_S));
  	var nextInterval = this._nextPollInterval(previousInterval);
  	probeOut.addParameter('poll_interval', nextInterval);

  	return probeOut.create(probeIn.agent);
  },

  type: 'LongRunner'
};

Sys ID

a4a1b8ae9351130078b870b8b47ffbe0

Offical Documentation

Official Docs: