283 lines
12 KiB
JavaScript
283 lines
12 KiB
JavaScript
'use strict';
|
|
|
|
var assert = require('assert-plus');
|
|
var pidusage = require('pidusage');
|
|
var errors = require('restify-errors');
|
|
var EWMA = require('ewma');
|
|
|
|
/**
|
|
* cpuUsageThrottle is a middleware that rejects a variable number of requests
|
|
* (between 0% and 100%) based on a historical view of CPU utilization of a
|
|
* Node.js process. Essentially, this plugin allows you to define what
|
|
* constitutes a saturated Node.js process via CPU utilization and it will
|
|
* handle dropping a % of requests based on that definiton. This is useful when
|
|
* you would like to keep CPU bound tasks from piling up causing an increased
|
|
* per-request latency.
|
|
*
|
|
* The algorithm asks you for a maximum CPU utilization rate, which it uses to
|
|
* determine at what point it should be rejecting 100% of traffic. For a normal
|
|
* Node.js service, this is 1 since Node is single threaded. It uses this,
|
|
* paired with a limit that you provide to determine the total % of traffic it
|
|
* should be rejecting. For example, if you specify a limit of .5 and a max of
|
|
* 1, and the current EWMA (next paragraph) value reads .75, this plugin will
|
|
* reject approximately 50% of all requests.
|
|
*
|
|
* When looking at the process' CPU usage, this algorithm will take a load
|
|
* average over a user specified interval. example, if given an interval of
|
|
* 250ms, this plugin will attempt to record the average CPU utilization over
|
|
* 250ms intervals. Due to contention for resources, the duration of each
|
|
* average may be wider or narrower than 250ms. To compensate for this, we use
|
|
* an exponentially weighted moving average. The EWMA algorithm is provided by
|
|
* the ewma module. The parameter for configuring the EWMA is halfLife. This
|
|
* value controls how quickly each load average measurment decays to half it's
|
|
* value when being represented in the current average. For example, if you
|
|
* have an interval of 250, and a halfLife of 250, you will take the previous
|
|
* ewma value multiplied by 0.5 and add it to the new CPU utilization average
|
|
* measurement multiplied by 0.5. The previous value and the new measurement
|
|
* would each represent 50% of the new value. A good way of thinking about the
|
|
* halfLife is in terms of how responsive this plugin will be to spikes in CPU
|
|
* utilization. The higher the halfLife, the longer CPU utilization will have
|
|
* to remain above your defined limit before this plugin begins rejecting
|
|
* requests and, converserly, the longer it will have to drop below your limit
|
|
* before the plugin begins accepting requests again. This is a knob you will
|
|
* want to with play when trying to determine the ideal value for your use
|
|
* case.
|
|
*
|
|
* For a better understanding of the EWMA algorithn, refer to the documentation
|
|
* for the ewma module.
|
|
*
|
|
* @public
|
|
* @function cpuUsageThrottle
|
|
* @param {Object} opts - Configure this plugin.
|
|
* @param {Number} [opts.limit] - The point at which restify will begin
|
|
* rejecting a % of all requests at the front door.
|
|
* This value is a percentage.
|
|
* For example 0.8 === 80% average CPU utilization. Defaults to 0.75.
|
|
* @param {Number} [opts.max] - The point at which restify will reject 100% of
|
|
* all requests at the front door. This is used in conjunction with limit to
|
|
* determine what % of traffic restify needs to reject when attempting to
|
|
* bring the average load back to the user requested values. Since Node.js is
|
|
* single threaded, the default for this is 1. In some rare cases, a Node.js
|
|
* process can exceed 100% CPU usage and you will want to update this value.
|
|
* @param {Number} [opts.interval] - How frequently we calculate the average CPU
|
|
* utilization. When we calculate an average CPU utilization, we calculate it
|
|
* over this interval, and this drives whether or not we should be shedding
|
|
* load. This can be thought of as a "resolution" where the lower this value,
|
|
* the higher the resolution our load average will be and the more frequently
|
|
* we will recalculate the % of traffic we should be shedding. This check
|
|
* is rather lightweight, while the default is 250ms, you should be able to
|
|
* decrease this value without seeing a significant impact to performance.
|
|
* @param {Number} [opts.halfLife] - When we sample the CPU usage on an
|
|
* interval, we create a series of data points.
|
|
* We take these points and calculate a
|
|
* moving average. The halfLife indicates how quickly a point "decays" to
|
|
* half it's value in the moving average. The lower the halfLife, the more
|
|
* impact newer data points have on the average. If you want to be extremely
|
|
* responsive to spikes in CPU usage, set this to a lower value. If you want
|
|
* your process to put more emphasis on recent historical CPU usage when
|
|
* determininng whether it should shed load, set this to a higher value. The
|
|
* unit is in ms. Defaults to 250.
|
|
* @returns {Function} middleware to be registered on server.pre
|
|
* @example
|
|
* var restify = require('restify');
|
|
*
|
|
* var server = restify.createServer();
|
|
* const options = {
|
|
* limit: .75,
|
|
* max: 1,
|
|
* interval: 250,
|
|
* halfLife: 500,
|
|
* }
|
|
*
|
|
* server.pre(restify.plugins.cpuUsageThrottle(options));
|
|
* @example
|
|
* <caption>
|
|
* You can also update the plugin during runtime using the `.update()` function.
|
|
* This function accepts the same `opts` object as a constructor.
|
|
* </caption>
|
|
* var plugin = restify.plugins.cpuUsageThrottle(options);
|
|
* server.pre(plugin);
|
|
*
|
|
* plugin.update({ limit: .4, halfLife: 5000 });
|
|
*/
|
|
function cpuUsageThrottlePlugin(opts) {
|
|
// Scrub input and populate our configuration
|
|
assert.object(opts, 'opts');
|
|
assert.optionalNumber(opts.limit, 'opts.limit');
|
|
assert.optionalNumber(opts.max, 'opts.max');
|
|
assert.optionalNumber(opts.interval, 'opts.interval');
|
|
assert.optionalNumber(opts.halfLife, 'opts.halfLife');
|
|
|
|
var plugin = {};
|
|
plugin._limit = typeof opts.limit === 'number' ? opts.limit : 0.75;
|
|
plugin._max = opts.max || 1;
|
|
plugin._interval = opts.interval || 250;
|
|
plugin._halfLife = typeof opts.halfLife === 'number' ? opts.halfLife : 250;
|
|
assert.ok(plugin._max > plugin._limit, 'limit must be less than max');
|
|
|
|
plugin._ewma = new EWMA(plugin._halfLife);
|
|
|
|
// plugin._reject represents the % of traffic that we should reject at the
|
|
// current point in time based on how much over our limit we are. This is
|
|
// updated on an interval by updateReject().
|
|
plugin._reject = 0;
|
|
|
|
// plugin._timeout keeps track of the current handle for the setTimeout we
|
|
// use to gather CPU load averages, this allows us to cancel the timeout
|
|
// when shutting down restify.
|
|
plugin._timeout = null;
|
|
|
|
// plugin._timeoutDelta represents the amount of time between when we
|
|
// _should_ have run updateReject and the actual time it was invoked.
|
|
// This allows us to monitor lag caused by both the event loop
|
|
// and pidusage
|
|
plugin._timeoutDelta = 0;
|
|
plugin._timeoutStart = Date.now();
|
|
|
|
// updateReject should be called on an interval, it checks the average CPU
|
|
// usage between two invocations of updateReject.
|
|
function updateReject() {
|
|
pidusage(process.pid, function pidusageStat(e, stat) {
|
|
// Requeue an updateReject irrespective of whether or not pidusage
|
|
// encountered an error
|
|
plugin._timeout = setTimeout(updateReject, plugin._interval);
|
|
|
|
// If we were unable to get cpu usage, don't make any new decisions.
|
|
if (
|
|
!stat ||
|
|
typeof stat.cpu !== 'number' ||
|
|
Number.isNaN(stat.cpu)
|
|
) {
|
|
return;
|
|
}
|
|
|
|
// Divide by 100 to match Linux's `top` format
|
|
plugin._ewma.insert(stat.cpu / 100);
|
|
plugin._cpu = plugin._ewma.value();
|
|
|
|
// Update reject with the % of traffic we should be rejecting. This
|
|
// is safe since max > limit so the denominator can never be 0. If
|
|
// the current cpu usage is less that the limit, _reject will be
|
|
// negative and we will never shed load
|
|
plugin._reject =
|
|
(plugin._cpu - plugin._limit) / (plugin._max - plugin._limit);
|
|
|
|
// Calculate how long it took between when our interval should have
|
|
// updated the _reject value and how long it actually took. This
|
|
// metric accounts for the misbehaviour of pidusage
|
|
var now = Date.now();
|
|
plugin._timeoutDelta =
|
|
now - plugin._timeoutStart - plugin._interval;
|
|
plugin._timeoutStart = now;
|
|
});
|
|
}
|
|
|
|
// Kick off updating our _reject value
|
|
updateReject();
|
|
|
|
function cpuUsageThrottle(req, res, next) {
|
|
// Check to see if this request gets rejected. Since, in updateReject,
|
|
// we calculate a percentage of traffic we are planning to reject, we
|
|
// can use Math.random() (which picks from a uniform distribution in
|
|
// [0,1)) to give us a `plugin._reject`% chance of dropping any given
|
|
// request. This is a stateless was to drop approximatly
|
|
// `plugin._reject`% of traffic.
|
|
var probabilityDraw = Math.random();
|
|
|
|
if (probabilityDraw >= plugin._reject) {
|
|
return next(); // Don't reject this request
|
|
}
|
|
|
|
var err = new errors.ServiceUnavailableError({
|
|
context: {
|
|
plugin: 'cpuUsageThrottle',
|
|
cpuUsage: plugin._cpu,
|
|
limit: plugin._limit,
|
|
max: plugin._max,
|
|
reject: plugin._reject,
|
|
halfLife: plugin._halfLife,
|
|
interval: plugin._interval,
|
|
probabilityDraw: probabilityDraw,
|
|
lag: plugin._timeoutDelta
|
|
}
|
|
});
|
|
|
|
return next(err);
|
|
}
|
|
|
|
// Allow the app to clear the timeout for this plugin if necessary, without
|
|
// this we would never be able to clear the event loop when letting Node
|
|
// shut down gracefully
|
|
function close() {
|
|
clearTimeout(plugin._timeout);
|
|
}
|
|
cpuUsageThrottle.close = close;
|
|
|
|
// Expose internal plugin state for introspection
|
|
Object.defineProperty(cpuUsageThrottle, 'state', {
|
|
get: function get() {
|
|
// We intentionally do not expose ewma since we don't want the user
|
|
// to be able to update it's configuration, the current state of
|
|
// ewma is represented in plugin._cpu
|
|
return {
|
|
limit: plugin._limit,
|
|
max: plugin._max,
|
|
interval: plugin._interval,
|
|
halfLife: plugin._halfLife,
|
|
cpuUsage: plugin._cpu,
|
|
reject: plugin._reject,
|
|
lag: plugin._timeoutDelta
|
|
};
|
|
}
|
|
});
|
|
|
|
/**
|
|
* cpuUsageThrottle.update
|
|
*
|
|
* Allow the plugin's configuration to be updated during runtime.
|
|
*
|
|
* @private
|
|
* @param {Object} newOpts - The opts object for reconfiguring this plugin,
|
|
* it follows the same format as the constructor for this plugin.
|
|
* @returns {undefined} no return value
|
|
*/
|
|
cpuUsageThrottle.update = function update(newOpts) {
|
|
assert.object(newOpts, 'newOpts');
|
|
assert.optionalNumber(newOpts.limit, 'newOpts.limit');
|
|
assert.optionalNumber(newOpts.max, 'newOpts.max');
|
|
assert.optionalNumber(newOpts.interval, 'newOpts.interval');
|
|
assert.optionalNumber(newOpts.halfLife, 'newOpts.halfLife');
|
|
|
|
if (newOpts.limit !== undefined) {
|
|
plugin._limit = newOpts.limit;
|
|
}
|
|
|
|
if (newOpts.max !== undefined) {
|
|
plugin._max = newOpts.max;
|
|
}
|
|
|
|
if (newOpts.interval !== undefined) {
|
|
plugin._interval = newOpts.interval;
|
|
}
|
|
|
|
if (newOpts.halfLife !== undefined) {
|
|
plugin._halfLife = newOpts.halfLife;
|
|
// update our ewma with the new halfLife, we use the previous known
|
|
// state as the initial state for our new halfLife in lieu of
|
|
// having access to true historical data.
|
|
plugin._ewma = new EWMA(plugin._halfLife, plugin._cpu);
|
|
}
|
|
|
|
// Ensure new values are still valid
|
|
assert.ok(plugin._max > plugin._limit, 'limit must be less than max');
|
|
|
|
// Update _reject with the new settings
|
|
plugin._reject =
|
|
(plugin._cpu - plugin._limit) / (plugin._max - plugin._limit);
|
|
};
|
|
|
|
return cpuUsageThrottle;
|
|
}
|
|
|
|
module.exports = cpuUsageThrottlePlugin;
|