mirror of
https://github.com/TryGhost/Ghost.git
synced 2025-01-20 22:42:53 -05:00
Configured prometheus client to reuse TCP connections to the pushgateway (#21695)
ref https://linear.app/ghost/issue/ENG-1796/reuse-tcp-connections-when-sending-metrics-to-the-pushgateway - When we rolled out the prometheus metrics collection, it overwhelmed the pushgateway. Our hypothesis is that Ghost was creating too many new TCP connections to the pushgateway. - The prometheus client was creating a new connection with the pushgateway each time it pushed metrics every 15 seconds. - This commit changes the prometheus client to keep the connection alive, and re-use it instead of creating a new one. - It also limits the number of retries if pushing the metrics fails — after 3 consecutive failures, Ghost will stop retrying and log an error.
This commit is contained in:
parent
829a0fc7e9
commit
2ff82c7ac0
2 changed files with 50 additions and 8 deletions
|
@ -1,4 +1,5 @@
|
|||
import {Request, Response} from 'express';
|
||||
import http from 'http';
|
||||
import client from 'prom-client';
|
||||
import type {Metric, MetricObjectWithValues, MetricValue} from 'prom-client';
|
||||
import type {Knex} from 'knex';
|
||||
|
@ -37,6 +38,7 @@ export class PrometheusClient {
|
|||
private config: PrometheusClientConfig;
|
||||
private prefix;
|
||||
private pushInterval: ReturnType<typeof setInterval> | undefined;
|
||||
private pushRetries: number = 0;
|
||||
private logger: any;
|
||||
|
||||
/**
|
||||
|
@ -46,8 +48,14 @@ export class PrometheusClient {
|
|||
this.collectDefaultMetrics();
|
||||
if (this.config.pushgateway?.enabled) {
|
||||
const gatewayUrl = this.config.pushgateway.url || 'http://localhost:9091';
|
||||
const interval = this.config.pushgateway.interval || 5000;
|
||||
this.gateway = new client.Pushgateway(gatewayUrl);
|
||||
const interval = this.config.pushgateway.interval || 15000;
|
||||
this.gateway = new client.Pushgateway(gatewayUrl, {
|
||||
timeout: 5000,
|
||||
agent: new http.Agent({
|
||||
keepAlive: true,
|
||||
maxSockets: 1
|
||||
})
|
||||
});
|
||||
this.pushMetrics();
|
||||
this.pushInterval = setInterval(() => {
|
||||
this.pushMetrics();
|
||||
|
@ -59,11 +67,17 @@ export class PrometheusClient {
|
|||
* Pushes metrics to the pushgateway, if enabled
|
||||
*/
|
||||
async pushMetrics() {
|
||||
if (this.pushRetries >= 3) {
|
||||
this.logger.error('Failed to push metrics to pushgateway 3 times in a row, giving up');
|
||||
this.stop();
|
||||
return;
|
||||
}
|
||||
if (this.config.pushgateway?.enabled && this.gateway) {
|
||||
const jobName = this.config.pushgateway?.jobName || 'ghost';
|
||||
try {
|
||||
await this.gateway.pushAdd({jobName});
|
||||
this.logger.debug('Metrics pushed to pushgateway - jobName: ', jobName);
|
||||
this.pushRetries = 0;
|
||||
} catch (err) {
|
||||
let error;
|
||||
if (typeof err === 'object' && err !== null && 'code' in err) {
|
||||
|
@ -72,6 +86,7 @@ export class PrometheusClient {
|
|||
error = 'Error pushing metrics to pushgateway: Unknown error';
|
||||
}
|
||||
this.logger.error(error);
|
||||
this.pushRetries = this.pushRetries + 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -47,10 +47,10 @@ describe('Prometheus Client', function () {
|
|||
const clock = sinon.useFakeTimers();
|
||||
nock('http://localhost:9091')
|
||||
.persist()
|
||||
.post('/metrics/job/ghost')
|
||||
.post('/metrics/job/ghost-test')
|
||||
.reply(200);
|
||||
|
||||
instance = new PrometheusClient({pushgateway: {enabled: true, interval: 20}});
|
||||
instance = new PrometheusClient({pushgateway: {enabled: true, interval: 20, jobName: 'ghost-test'}});
|
||||
const pushMetricsStub = sinon.stub(instance, 'pushMetrics').resolves();
|
||||
instance.init();
|
||||
assert.ok(instance.gateway);
|
||||
|
@ -59,6 +59,12 @@ describe('Prometheus Client', function () {
|
|||
assert.ok(pushMetricsStub.calledTwice, 'pushMetrics should be called again after the interval');
|
||||
clock.restore();
|
||||
});
|
||||
|
||||
it('should not create the pushgateway client if the pushgateway is disabled', function () {
|
||||
instance = new PrometheusClient({pushgateway: {enabled: false}});
|
||||
instance.init();
|
||||
assert.equal(instance.gateway, undefined);
|
||||
});
|
||||
});
|
||||
|
||||
describe('collectDefaultMetrics', function () {
|
||||
|
@ -74,16 +80,16 @@ describe('Prometheus Client', function () {
|
|||
it('should push metrics to the pushgateway', async function () {
|
||||
const scope = nock('http://localhost:9091')
|
||||
.persist()
|
||||
.post('/metrics/job/ghost')
|
||||
.post('/metrics/job/ghost-test')
|
||||
.reply(200);
|
||||
instance = new PrometheusClient({pushgateway: {enabled: true}});
|
||||
instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}});
|
||||
instance.init();
|
||||
await instance.pushMetrics();
|
||||
scope.done();
|
||||
});
|
||||
|
||||
it('should log an error with error code if pushing metrics to the gateway fails', async function () {
|
||||
instance = new PrometheusClient({pushgateway: {enabled: true}}, logger);
|
||||
instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}}, logger);
|
||||
instance.init();
|
||||
instance.gateway = {
|
||||
pushAdd: sinon.stub().rejects({code: 'ECONNRESET'})
|
||||
|
@ -95,7 +101,7 @@ describe('Prometheus Client', function () {
|
|||
});
|
||||
|
||||
it('should log a generic error if the error is unknown', async function () {
|
||||
instance = new PrometheusClient({pushgateway: {enabled: true}}, logger);
|
||||
instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}}, logger);
|
||||
instance.init();
|
||||
instance.gateway = {
|
||||
pushAdd: sinon.stub().rejects()
|
||||
|
@ -105,6 +111,27 @@ describe('Prometheus Client', function () {
|
|||
const [[error]] = logger.error.args;
|
||||
assert.match(error, /Unknown error/);
|
||||
});
|
||||
|
||||
it('should give up after 3 retries in a row', async function () {
|
||||
instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}}, logger);
|
||||
instance.init();
|
||||
const pushAddStub = sinon.stub().rejects();
|
||||
instance.gateway = {
|
||||
pushAdd: pushAddStub
|
||||
} as unknown as Pushgateway<RegistryContentType>;
|
||||
|
||||
// Simulate failing to push metrics multiple times in a row
|
||||
// It should give up after the 3rd attempt in a row
|
||||
await instance.pushMetrics();
|
||||
await instance.pushMetrics();
|
||||
await instance.pushMetrics();
|
||||
await instance.pushMetrics();
|
||||
await instance.pushMetrics();
|
||||
await instance.pushMetrics();
|
||||
await instance.pushMetrics();
|
||||
assert.ok(pushAddStub.calledThrice);
|
||||
assert.ok(logger.error.calledWith('Failed to push metrics to pushgateway 3 times in a row, giving up'));
|
||||
});
|
||||
});
|
||||
|
||||
describe('handleMetricsRequest', function () {
|
||||
|
|
Loading…
Add table
Reference in a new issue