Name

global.MonitorMIDServer

Description

Monitors MID servers, using the heartbeat probe.

Script

/*
* Monitors MID servers, using the heartbeat probe.
*/

var MonitorMIDServer = Class.create();

MonitorMIDServer.STATUS_RECORD = 'mid.monitor.heartbeat_sent';

MonitorMIDServer.prototype = {
  
  /*
   * Do everything required for a scheduled MID server monitor cycle.
   */
  monitor: function() {
      this.killOldRequests();
      this.markDowners();
      this.sendHeartbeatRequests();
  },
  
  /*
   * Mark any non-responding MID servers as being down.
   *
   * There are two timestamps that are updated separately:
   *   1. last_heartbeat_sent is the last time an output heartbeat message was sent by this monitor.
   *   2. last_refreshed is the last time the mid server status was updated.
   *
   * The monitor looks for any mid server activity (last_refreshed is updated or any ecc input message)
   * between now and last_heartbeat_sent, and if there is nothing, it will mark the mid server as Down.
   *
   * Because of the time drift issue when moving between nodes, last_refreshed and last_heartbeat_sent
   * can be few seconds before or after the actual time.  To solve the issue, we extends the query window
   * to the past by substracting some extra time from last_heartbeat_sent.
   */
  markDowners: function() {
      // find the last time we sent a heartbeat...
      var gr = new GlideRecord('sys_status');
      gr.addQuery('name', MonitorMIDServer.STATUS_RECORD);
      gr.query();
      if (!gr.next())
          return;
      var last_heartbeat_sent = new GlideDateTime(gr.value);
      // add some extra time to query condition to address the time drift issue when moving between instance nodes
      last_heartbeat_sent.addSeconds(-5);

      // find all the non-responding MID servers...
      gr = new GlideRecord('ecc_agent');
      gr.addQuery('last_refreshed', '<', last_heartbeat_sent);
      gr.addQuery('status', '!=', 'Down');
      gr.query();
      while (gr.next()) {
          if (! this.hasRecentActivity('' + gr.name, last_heartbeat_sent)) {
              gr.status = 'Down';
              gr.update();
          }
      }
  },
  
  /*
   * Check if the given agent name has written to the ecc_queue more recently than the given time.
   * This protects against marking an overloaded (but functioning) mid server as 'down' inappropriately.
   */
  hasRecentActivity: function(agentName, time) {
      var gr = new GlideRecord('ecc_queue');
      gr.addQuery('sys_created_on', '>=', time);
      // time bound is an optimization to tell GR to focus on one or two ecc_queue shards and not join all shards
      gr.addQuery('sys_created_on', '<=', gs.minutesAgo(0));
      gr.addQuery('agent', 'mid.server.' + agentName);
      gr.addQuery('queue', 'input');
      gr.query();
      if (gr.hasNext())
          return true;
      
      return false;
  },

  /*
   * Send heartbeat requests to all MID servers, and note when we did it.
   */
  sendHeartbeatRequests: function() {
      // first send the requests...
      var now = gs.nowNoTZ();
      var gr = new GlideRecord('ecc_queue');
      gr.initialize();
      gr.agent = 'mid.server.*';  // a business rule turns this into individual requests...
      gr.queue = 'output';
  	gr.priority = 0;
      gr.topic = 'HeartbeatProbe';
      gr.state = 'ready';
      gr.insert();
      
      // then update or insert our status record to show when we sent them...
      gr = new GlideRecord('sys_status');
      gr.addQuery('name', MonitorMIDServer.STATUS_RECORD);
      gr.query();
      if (gr.next()) {
          gr.value = now;
          gr.update();
      } else {
          gr.initialize();
          gr.value = now;
          gr.name = MonitorMIDServer.STATUS_RECORD;
          gr.description = 'Last time MID server heartbeat probes were launched';
          gr.insert();
      }
  },
  
  /*
   * If there are any heartbeat probe requests that haven't been processed, cancel them so that they don't accumulate when
   * a MID server is down for a while...
   */
  killOldRequests: function() {
      var gr = new GlideRecord('ecc_queue');
      gr.addQuery('agent', 'STARTSWITH', 'mid.server.');
      gr.addQuery('queue', 'output');
      gr.addQuery('state', '!=', 'processed');
      gr.addQuery('topic', 'HeartbeatProbe');
      gr.addQuery("sys_created_on", ">", gs.minutesAgo(10));
      gr.addQuery("sys_created_on", "<=", gs.minutesAgo(0)); // Bound the ecc_queue because of potential table rotation schedules in the future
      gr.query();
      while (gr.next()) {
          gr.state = 'processed';
          gr.processed = gs.nowDateTime();
          gr.update();
      }
  },
  
  type: 'MonitorMIDServer'
};

Sys ID

979a5dde0ab3015252383b43d9a73436

Offical Documentation

Official Docs: