Name

sn_app_insights.Kmeans

Description

No description available

Script

/**
   * Kmeans
   * This is a clustering model following Kmean++ algorithm.
   * Usage:
   * 1. The data needs to be passed to processAndClusterData.
   *    The data (passed in parameter) needs to have a unique structure:
   *        It needs to contain two keys ["seriesConfig, "seriesData"] and each key
   *        should have values similarly to the example bellow.
   * read the data in the format given eg:
   data = {
          "seriesConfig": {
              "line0": {
                  "x": "timeStamp",
                  "y": "y0",
                  "type": "line",
                  "label": "ML-G1-AG25-1"
              },
              "line1": {
                  "x": "timeStamp",
                  "y": "y1",
                  "type": "line",
                  "label": "BU-MN4-AG27-4"
              },
          },
          "seriesData": [
              {
                  "timeStamp": 1603899000000,
                  "y0": 43.694744,
                  "y1": 49.635857,
              },
              {
                  "timeStamp": 1603899300000,
                  "y0": 47.725418,
                  "y1": 54.03098,
              },
              {
                  "timeStamp": 1603899600000,
                  "y0": null,
                  "y1": null,
              },
              {
                  "timeStamp": 1603899900000,
                  "y0": null,
                  "y1": null,
              }
          ]
   }
   * 2. Return value:
   *      After calling processAndClusterData function on the data it will return the clusters in the same
   *      format that was requested for the input. The return object will contain ["seriesConfig, "seriesData"]
   *      keys but the difference would be the number of lines in values since the new values are the values
   *      of the clusters.
   * 3. Disclaimer:
   *      1. If the passed in object contains any other keys than ["seriesConfig, "seriesData"] the return object
   *         will not contain those keys with their values; it will only contain these two keys with the calculated
   *         values
   *      2. If the passed in object has missing keys or values that are stated in the example above, the script would
   *         fail and result in errors.
   */
var Kmeans = Class.create();
Kmeans.prototype = {
  initialize: function(opts) {
      //If not parameter was provided set parameters to default empty dictionary
      if (opts === undefined)
          opts = {};

      // Maximum number of iterations for training
      if (opts.maxIteration)
          this.maxIteration = opts.maxIteration;
      else
          this.maxIteration = 300;

      // Number of cluster centroids.
      if (opts.numberOfClusters)
          this.numberOfClusters = opts.numberOfClusters;
      else
          this.numberOfClusters = 3;

      // Function used for distance metric
      if (opts.distanceFunction)
          this.distanceFunction = opts.distanceFunction;
      else
          this.distanceFunction = this.euclidean;

      // For speeding up the process on the bigger data sets where each input is very long we need to decrease the
      // dimensionality
      if (opts.maxNumOfDimensions)
          this.maxNumOfDimensions = opts.maxNumOfDimensions;
      else
          this.maxNumOfDimensions = 10000;

      // Keeps track of which cluster centroid index each data point belongs to.
      this.assignments = [];

      // Keep track of number of times centroids move.
      this.iterations = 0;
  },

  getClusters: function() {
      // Generate random cluster centroid points.
      this.means = this.seedsPlusPlus();

      // Perform work.
      this.run();

      return this.assignments;
  },

  /**
   * seedsPlusPlus
   *  Kmean Plus Plus algorithm for initializing the centroids:
   *  1. choose one center uniformly at random among the data points.
   *  2. For each data point x, compute D(x), the distance between x and the nearest center that has already been
   *     chosen.
   *  3. choose one new data point at random as a new center, using a weighted probability distribution where a point
   *     x is chosen with probability proportional to D(x)2.
   *  4. Repeat Steps 2 and 3 until numberOfClusters centers have been chosen.
   *  5. Now that the initial centers have been chosen, proceed using standard numberOfClusters-means clustering.
   * **/
  seedsPlusPlus: function() {
      var means = [];
      var randomIndex = Math.floor(Math.random() * this.data.length);
      var newMean = this.data[randomIndex];

      means.push(newMean);
      //Assign the first cluster centroid
      // means.push(mean)
      //Loop numberOfClusters-1 to assign the rest of the cluster centroids
      for (var seedCount = 0; seedCount < this.numberOfClusters - 1; seedCount++) {
          // var assignments = [];
          var closestDistancesSquare = [];
          var distance;
          for (var i = 0; i < this.data.length; i++) {
              var point = this.data[i];
              var distances = [];

              for (var j = 0; j < means.length; j++) {
                  var mean = means[j];
                  distances[j] = this.distanceFunction(point, mean);
              }

              // After calculating all the distances from the data point to each cluster centroid,
              // we pick the closest (smallest) distances.
              distance = Math.min.apply(null, distances);
              closestDistancesSquare[i] = Math.pow(distance, 2);
          }
          var sum = closestDistancesSquare.reduce(function(a, b) {
              return a + b;
          });
          var probabilities = [];
          for (i = 0; i < this.data.length; i++) {
              if (sum == 0)
                  probabilities[i] = .01;
              else
                  probabilities[i] = (closestDistancesSquare[i]) / (sum);
          }
          newMean = this.probabilityBasedChoice(this.data, probabilities);
          means.push(newMean);
      }
      return means;
  },

  /**
   * probabilityBasedChoice
   * Select and return an element in the given list of options randomly based on probability distributions received
   * **/
  probabilityBasedChoice: function(options, probabilities) {
      var choices = [];
      for (var i = 0; i < probabilities.length; i++) {
          for (var j = 0; j < Math.round(probabilities[i] * 100); j++) {
              choices.push(options[i]);
          }
      }
      var idx = Math.floor(Math.random() * choices.length);
      return choices[idx];
  },

  /**
   * assignClusterToDataPoints
   * Calculate distance (based on the distance metric given) between each point and the cluster center.
   * Assigns each point to closest mean point.
   */
  assignClusterToDataPoints: function() {
      var assignments = [];
      var point;
      var distances;
      var mean;

      for (var i = 0; i < this.data.length; i++) {
          point = this.data[i];
          distances = [];

          for (var j = 0; j < this.means.length; j++) {
              mean = this.means[j];
              distances[j] = this.distanceFunction(point, mean);
          }
          // After calculating all the distances from the data point to each cluster centroid,
          // we pick the closest (smallest) distances.
          assignments[i] = distances.indexOf(Math.min.apply(null, distances));
      }
      return assignments;
  },

  /**
   * updateMeans
   * Update the positions of the the cluster centroids (means) to the average positions
   * of all data points that belong to that mean.
   */
  updateMeans: function() {
      var newMeans = this.calculateMeans(this.data, this.assignments);

      if (this.means.toString() != newMeans.toString()) {
          this.means = newMeans;
          return true;
      }
      return false;
  },

  /**
   * calculateMeans
   * Get the average of the points assigned under the same clusters.
   */
  calculateMeans: function(points, assignments) {
      var sums = [];
      var counts = this.fillArray(this.means.length, 0);
      var dim;
      var numOfDims = points[0].length;
      var numOfClusters = this.numberOfClusters;
      var point;

      // Clear location sums for each dimension.
      for (var c = 0; c < numOfClusters; c++)
          sums[c] = this.fillArray(numOfDims, 0);

      // For each cluster, get sum of point coordinates in every dimension.
      for (var index = 0; index < assignments.length; index++) {
          c = this.assignments[index];
          point = points[index];
          counts[c]++;
          for (dim = 0; dim < numOfDims; dim++)
              sums[c][dim] += point[dim];

      }
      for (c = 0; c < numOfClusters; c++) {
          if (counts[c] === 0)
              counts[c] = 1;
          for (dim = 0; dim < numOfDims; dim++) {
              sums[c][dim] /= counts[c];
              sums[c][dim] = Math.round(100 * sums[c][dim]) / 100;
          }
      }
      return sums;
  },

  /**
   * run
   * Executing model training over multiple cycles until the model converges
   * or the number of iteration reaches the maximum number of iterations
   */
  run: function() {
      ++this.iterations;
      // Reassign points to nearest cluster centroids.
      this.assignments = this.assignClusterToDataPoints();
      // Returns true if the cluster centroids have moved location since the last iteration.
      var meansUpdated = this.updateMeans();

      if (meansUpdated && (this.iterations < this.maxIteration))
          this.run();
  },

  /**
   * fillArray
   * return an array of size length initialized all to the value of the val
   */
  fillArray: function(length, val) {
      return Array.apply(null, Array(length)).map(function() {
          return val;
      });
  },

  /**
   * euclidean
   * Custom and the default metric distance used to calculate the distance between each data points
   */
  euclidean: function(point, mean) {
      var sum = 0;
      var difference;

      for (var dim = 0; dim < point.length; dim++) {
          difference = point[dim] - mean[dim];
          difference = Math.pow(difference, 2);
          sum += difference;
      }
      return Math.sqrt(sum);
  },

  /**
   * processAndClusterData
   * read the data in the format given eg:
   data = {
          "seriesConfig": {
              "line0": {
                  "x": "timeStamp",
                  "y": "y0",
                  "type": "line",
                  "label": "ML-G1-AG25-1"
              },
              "line1": {
                  "x": "timeStamp",
                  "y": "y1",
                  "type": "line",
                  "label": "BU-MN4-AG27-4"
              },
          },
          "seriesData": [
              {
                  "timeStamp": 1603899000000,
                  "y0": 43.694744,
                  "y1": 49.635857,
              },
              {
                  "timeStamp": 1603899300000,
                  "y0": 47.725418,
                  "y1": 54.03098,

              },
              {
                  "timeStamp": 1603899600000,
                  "y0": null,
                  "y1": null,

              },
              {
                  "timeStamp": 1603899900000,
                  "y0": null,
                  "y1": null,
              }
          ]
   }
   * and create a 2d array that the rows present each line (data point) and the columns
   * represent the values through time then the newly generated 2d array (values) has reduced
   * to less dimensions based on the maximum dimensions assigned and then this is passed to the model
   * for clustering purposes. Then the centroids are selected from the model and formatted in the original
   * format of the data received and then it is returned
   */
  processAndClusterData: function(data, exclusionLineList) {
      if (!exclusionLineList)
          exclusionLineList = [];
      if ("seriesConfig" in data && "seriesData" in data) {
          if (Object.keys(data["seriesConfig"]).length > this.numberOfClusters) {
              if (data["seriesData"].length === 0)
                  throw new Error('seriesData has no elements!!!');
          } else
              throw new Error('seriesConfig has less data points to be clustered than the number of clusters!!!');
      } else
          throw new Error('Argument passed in does not contain either seriesConfig or seriesData');

      var seriesConfig = JSON.parse(JSON.stringify(data["seriesConfig"]));
      var excluded = {};
      for (var i = 0; i < exclusionLineList.length; i++) {
          var key = exclusionLineList[i];
          excluded[key] = seriesConfig[key];
          delete seriesConfig[key];
      }
      var excludedKeys = Object.keys(excluded);
      var keys = Object.keys(seriesConfig);
      var seriesData = data["seriesData"];
      var lines = Object.keys(seriesConfig);
      var numberOfLinesToBeClustered = lines.length;
      var xTitle = seriesConfig[lines[0]]["x"];
      var labels = new Array(numberOfLinesToBeClustered);
      var types = new Array(numberOfLinesToBeClustered);
      var yTitles = new Array(numberOfLinesToBeClustered);
      var times = new Array(seriesData.length);
      var values = new Array(numberOfLinesToBeClustered);
      for (i = 0; i < numberOfLinesToBeClustered; i++) {
          yTitles[i] = seriesConfig[lines[i]]["y"];
          labels[i] = seriesConfig[lines[i]]["label"];
          types[i] = seriesConfig[lines[i]]["type"];
      }
      for (i = 0; i < numberOfLinesToBeClustered; i++) {
          values[i] = new Array(times.length);
          for (j = 0; j < times.length; j++)
              values[i][j] = -1;

      }
      for (i = 0; i < times.length; i++) {
          times[i] = seriesData[i][xTitle];
          for (var j = 0; j < numberOfLinesToBeClustered; j++) {
              var y = yTitles[j];
              if (seriesData[i][y] != null)
                  values[j][i] = seriesData[i][y];
          }
      }
      this.originalData = values;
      this.data = this.dimensionReduction(values, this.maxNumOfDimensions);
      this.originalMeans = [];
      this.getClusters();
      this.originalMeans = this.calculateMeans(this.originalData, this.assignments);
      this.means = this.originalMeans;

      // put the mean results back in the original format of the data provided.
      returnData = JSON.parse(JSON.stringify(data));
      returnData["allConfigs"] = {};
      var groupLines = {};
      var label;
      for (i = 0; i < this.means.length; i++) {
  		var groupNumberTemp1 = parseInt(i + 1).toFixed(0);     //Using temp variable because of L10N concatenation warnings
          label = gs.getMessage("Group {0}", groupNumberTemp1);  //Not using a constant for group string as it causes L10N warning
          groupLines[label] = JSON.parse(JSON.stringify(exclusionLineList));
      }
      for (i = 0; i < this.assignments.length; i++) {
  		var groupNumberTemp2 = parseInt(this.assignments[i] + 1).toFixed(0);  //Using temp variable because of L10N concatenation warnings
          label = gs.getMessage("Group {0}", groupNumberTemp2);                 //Not using a constant for group string as it causes L10N warning
          groupLines[label].push(keys[i]);
      }
      returnData["seriesConfig"] = {};
      returnData["allConfigs"]["ungrouped"] = data["seriesConfig"];
      returnData["allConfigs"]["grouped"] = {};
      returnData["seriesData"] = seriesData;
      for (i = 0; i < this.means.length; i++) {
          mean = this.means[i];
  		var groupNumberTemp3 = parseInt(i + 1).toFixed(0);    //Using temp variable because of L10N concatenation warnings
          label = gs.getMessage("Group {0}", groupNumberTemp3); //Not using a constant for group string as it causes L10N warning
          returnData["allConfigs"]["grouped"][label] = {
              "x": xTitle,
              "y": "g" + i,
              "type": types[i],
              "label": label,
              "lines": groupLines[label]
          };
          returnData["seriesConfig"][label] = {
              "x": xTitle,
              "y": "g" + i,
              "type": types[i],
              "label": label
          };
          for (j = 0; j < times.length; j++)
              returnData["seriesData"][j]["g" + i] = mean[j];

      }
      for (i = 0; i < excludedKeys.length; i++) {
          var lineName = excludedKeys[i];
          returnData["allConfigs"]["grouped"][lineName] = excluded[lineName];
          returnData["seriesConfig"][key] = excluded[key];
      }
      return returnData;
  },

  /**
   * dimensionReduction
   * reduce the dimensions of the columns of the rows in the 2d array passed in to the
   * new dimension (newD) by taking the average of every two consecutive points and replacing them
   * with the average
   */
  dimensionReduction: function(originals, newD) {
      var original;
      var newArrays = [];
      var newList;

      function innerFunc(list, num2) {
          list[0].push(num2);
          if (list[0].length == 2)
              list[1].push((list[0].pop() + list[0].pop()) / 2);
          return list;
      }
      for (var index = 0; index < originals.length; index++) {
          original = [
              [
                  [],
                  []
              ]
          ].concat(originals[index]);
          while (original.length - 1 > newD) {
              newList = original.reduce(innerFunc);
              // If there is an extra value in the end (the number of elements was not divisible by two) then
              // take its average with the new previous value and add the average back.
              if (newList[0].length === 1)
                  newList[1].push((newList[0][0] + newList[1][newList[1].length - 1]) / 2);
              original = [
                  [
                      [],
                      []
                  ]
              ].concat(newList[1]);
          }
          original.splice(0, 1);
          newArrays.push(original);
      }
      return newArrays;
  },

  type: 'Kmeans'
};

Sys ID

746ff9af1b982010593876a61a4bcb0c

Offical Documentation

Official Docs: