
Configured prometheus client to reuse TCP connections to the pushgateway (#21695)

ref https://linear.app/ghost/issue/ENG-1796/reuse-tcp-connections-when-sending-metrics-to-the-pushgateway

- When we rolled out the Prometheus metrics collection, it overwhelmed
the pushgateway. Our hypothesis is that Ghost was opening too many new
TCP connections to the pushgateway.
- The Prometheus client was opening a new TCP connection to the
pushgateway on every push, i.e. every 15 seconds.
- This commit configures the Prometheus client to keep the connection
alive and reuse it instead of opening a new one for each push (see the
sketch below).
- It also limits the number of retries if pushing the metrics fails:
after 3 consecutive failures, Ghost stops retrying and logs an error.
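A minimal sketch of the new behaviour, stripped of the PrometheusClient
class wrapper and using the same prom-client Pushgateway API as the diff
below (the pushgateway URL, job name, and console logging here are
illustrative placeholders rather than Ghost's actual wiring):

import http from 'http';
import client from 'prom-client';

// Keep-alive agent so every push reuses a single TCP connection to the
// pushgateway instead of opening a new one each time.
const agent = new http.Agent({keepAlive: true, maxSockets: 1});
const gateway = new client.Pushgateway('http://localhost:9091', {timeout: 5000, agent});

let pushRetries = 0;

const pushInterval = setInterval(async () => {
    try {
        await gateway.pushAdd({jobName: 'ghost'});
        pushRetries = 0; // any successful push resets the failure counter
    } catch (err) {
        pushRetries += 1;
        console.error('Error pushing metrics to pushgateway:', err);
        if (pushRetries >= 3) {
            // After 3 consecutive failures, stop the push loop and log once.
            console.error('Failed to push metrics to pushgateway 3 times in a row, giving up');
            clearInterval(pushInterval);
        }
    }
}, 15000);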
Chris Raible 2024-11-21 17:43:33 -08:00 committed by GitHub
parent 829a0fc7e9
commit 2ff82c7ac0
2 changed files with 50 additions and 8 deletions

@@ -1,4 +1,5 @@
 import {Request, Response} from 'express';
+import http from 'http';
 import client from 'prom-client';
 import type {Metric, MetricObjectWithValues, MetricValue} from 'prom-client';
 import type {Knex} from 'knex';
@@ -37,6 +38,7 @@ export class PrometheusClient {
     private config: PrometheusClientConfig;
     private prefix;
     private pushInterval: ReturnType<typeof setInterval> | undefined;
+    private pushRetries: number = 0;
     private logger: any;
     /**
@@ -46,8 +48,14 @@ export class PrometheusClient {
         this.collectDefaultMetrics();
         if (this.config.pushgateway?.enabled) {
             const gatewayUrl = this.config.pushgateway.url || 'http://localhost:9091';
-            const interval = this.config.pushgateway.interval || 5000;
-            this.gateway = new client.Pushgateway(gatewayUrl);
+            const interval = this.config.pushgateway.interval || 15000;
+            this.gateway = new client.Pushgateway(gatewayUrl, {
+                timeout: 5000,
+                agent: new http.Agent({
+                    keepAlive: true,
+                    maxSockets: 1
+                })
+            });
             this.pushMetrics();
             this.pushInterval = setInterval(() => {
                 this.pushMetrics();
@@ -59,11 +67,17 @@
      * Pushes metrics to the pushgateway, if enabled
      */
     async pushMetrics() {
+        if (this.pushRetries >= 3) {
+            this.logger.error('Failed to push metrics to pushgateway 3 times in a row, giving up');
+            this.stop();
+            return;
+        }
         if (this.config.pushgateway?.enabled && this.gateway) {
             const jobName = this.config.pushgateway?.jobName || 'ghost';
             try {
                 await this.gateway.pushAdd({jobName});
                 this.logger.debug('Metrics pushed to pushgateway - jobName: ', jobName);
+                this.pushRetries = 0;
             } catch (err) {
                 let error;
                 if (typeof err === 'object' && err !== null && 'code' in err) {
@@ -72,6 +86,7 @@ export class PrometheusClient {
                     error = 'Error pushing metrics to pushgateway: Unknown error';
                 }
                 this.logger.error(error);
+                this.pushRetries = this.pushRetries + 1;
             }
         }
     }

@@ -47,10 +47,10 @@ describe('Prometheus Client', function () {
             const clock = sinon.useFakeTimers();
             nock('http://localhost:9091')
                 .persist()
-                .post('/metrics/job/ghost')
+                .post('/metrics/job/ghost-test')
                 .reply(200);
-            instance = new PrometheusClient({pushgateway: {enabled: true, interval: 20}});
+            instance = new PrometheusClient({pushgateway: {enabled: true, interval: 20, jobName: 'ghost-test'}});
             const pushMetricsStub = sinon.stub(instance, 'pushMetrics').resolves();
             instance.init();
             assert.ok(instance.gateway);
@@ -59,6 +59,12 @@ describe('Prometheus Client', function () {
             assert.ok(pushMetricsStub.calledTwice, 'pushMetrics should be called again after the interval');
             clock.restore();
         });
+        it('should not create the pushgateway client if the pushgateway is disabled', function () {
+            instance = new PrometheusClient({pushgateway: {enabled: false}});
+            instance.init();
+            assert.equal(instance.gateway, undefined);
+        });
     });
     describe('collectDefaultMetrics', function () {
@@ -74,16 +80,16 @@ describe('Prometheus Client', function () {
         it('should push metrics to the pushgateway', async function () {
             const scope = nock('http://localhost:9091')
                 .persist()
-                .post('/metrics/job/ghost')
+                .post('/metrics/job/ghost-test')
                 .reply(200);
-            instance = new PrometheusClient({pushgateway: {enabled: true}});
+            instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}});
             instance.init();
             await instance.pushMetrics();
             scope.done();
         });
         it('should log an error with error code if pushing metrics to the gateway fails', async function () {
-            instance = new PrometheusClient({pushgateway: {enabled: true}}, logger);
+            instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}}, logger);
             instance.init();
             instance.gateway = {
                 pushAdd: sinon.stub().rejects({code: 'ECONNRESET'})
@@ -95,7 +101,7 @@ describe('Prometheus Client', function () {
         });
         it('should log a generic error if the error is unknown', async function () {
-            instance = new PrometheusClient({pushgateway: {enabled: true}}, logger);
+            instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}}, logger);
             instance.init();
             instance.gateway = {
                 pushAdd: sinon.stub().rejects()
@@ -105,6 +111,27 @@ describe('Prometheus Client', function () {
             const [[error]] = logger.error.args;
             assert.match(error, /Unknown error/);
         });
+        it('should give up after 3 retries in a row', async function () {
+            instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}}, logger);
+            instance.init();
+            const pushAddStub = sinon.stub().rejects();
+            instance.gateway = {
+                pushAdd: pushAddStub
+            } as unknown as Pushgateway<RegistryContentType>;
+            // Simulate failing to push metrics multiple times in a row
+            // It should give up after the 3rd attempt in a row
+            await instance.pushMetrics();
+            await instance.pushMetrics();
+            await instance.pushMetrics();
+            await instance.pushMetrics();
+            await instance.pushMetrics();
+            await instance.pushMetrics();
+            await instance.pushMetrics();
+            assert.ok(pushAddStub.calledThrice);
+            assert.ok(logger.error.calledWith('Failed to push metrics to pushgateway 3 times in a row, giving up'));
+        });
     });
     describe('handleMetricsRequest', function () {