From 2ff82c7ac0fa421cef2f5634dc912ddb48bcf3b9 Mon Sep 17 00:00:00 2001 From: Chris Raible Date: Thu, 21 Nov 2024 17:43:33 -0800 Subject: [PATCH] Configured prometheus client to reuse TCP connections to the pushgateway (#21695) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ref https://linear.app/ghost/issue/ENG-1796/reuse-tcp-connections-when-sending-metrics-to-the-pushgateway - When we rolled out the prometheus metrics collection, it overwhelmed the pushgateway. Our hypothesis is that Ghost was creating too many new TCP connections to the pushgateway. - The prometheus client was creating a new connection with the pushgateway each time it pushed metrics every 15 seconds. - This commit changes the prometheus client to keep the connection alive, and re-use it instead of creating a new one. - It also limits the number of retries if pushing the metrics fails — after 3 consecutive failures, Ghost will stop retrying and log an error. --- .../src/PrometheusClient.ts | 19 ++++++++- .../test/prometheus-client.test.ts | 39 ++++++++++++++++--- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/ghost/prometheus-metrics/src/PrometheusClient.ts b/ghost/prometheus-metrics/src/PrometheusClient.ts index 948d1d84ee..f658296f9c 100644 --- a/ghost/prometheus-metrics/src/PrometheusClient.ts +++ b/ghost/prometheus-metrics/src/PrometheusClient.ts @@ -1,4 +1,5 @@ import {Request, Response} from 'express'; +import http from 'http'; import client from 'prom-client'; import type {Metric, MetricObjectWithValues, MetricValue} from 'prom-client'; import type {Knex} from 'knex'; @@ -37,6 +38,7 @@ export class PrometheusClient { private config: PrometheusClientConfig; private prefix; private pushInterval: ReturnType | undefined; + private pushRetries: number = 0; private logger: any; /** @@ -46,8 +48,14 @@ export class PrometheusClient { this.collectDefaultMetrics(); if (this.config.pushgateway?.enabled) { const gatewayUrl = this.config.pushgateway.url || 'http://localhost:9091'; - const interval = this.config.pushgateway.interval || 5000; - this.gateway = new client.Pushgateway(gatewayUrl); + const interval = this.config.pushgateway.interval || 15000; + this.gateway = new client.Pushgateway(gatewayUrl, { + timeout: 5000, + agent: new http.Agent({ + keepAlive: true, + maxSockets: 1 + }) + }); this.pushMetrics(); this.pushInterval = setInterval(() => { this.pushMetrics(); @@ -59,11 +67,17 @@ export class PrometheusClient { * Pushes metrics to the pushgateway, if enabled */ async pushMetrics() { + if (this.pushRetries >= 3) { + this.logger.error('Failed to push metrics to pushgateway 3 times in a row, giving up'); + this.stop(); + return; + } if (this.config.pushgateway?.enabled && this.gateway) { const jobName = this.config.pushgateway?.jobName || 'ghost'; try { await this.gateway.pushAdd({jobName}); this.logger.debug('Metrics pushed to pushgateway - jobName: ', jobName); + this.pushRetries = 0; } catch (err) { let error; if (typeof err === 'object' && err !== null && 'code' in err) { @@ -72,6 +86,7 @@ export class PrometheusClient { error = 'Error pushing metrics to pushgateway: Unknown error'; } this.logger.error(error); + this.pushRetries = this.pushRetries + 1; } } } diff --git a/ghost/prometheus-metrics/test/prometheus-client.test.ts b/ghost/prometheus-metrics/test/prometheus-client.test.ts index 2a69907966..57b244edbc 100644 --- a/ghost/prometheus-metrics/test/prometheus-client.test.ts +++ b/ghost/prometheus-metrics/test/prometheus-client.test.ts @@ -47,10 +47,10 @@ describe('Prometheus Client', function () { const clock = sinon.useFakeTimers(); nock('http://localhost:9091') .persist() - .post('/metrics/job/ghost') + .post('/metrics/job/ghost-test') .reply(200); - instance = new PrometheusClient({pushgateway: {enabled: true, interval: 20}}); + instance = new PrometheusClient({pushgateway: {enabled: true, interval: 20, jobName: 'ghost-test'}}); const pushMetricsStub = sinon.stub(instance, 'pushMetrics').resolves(); instance.init(); assert.ok(instance.gateway); @@ -59,6 +59,12 @@ describe('Prometheus Client', function () { assert.ok(pushMetricsStub.calledTwice, 'pushMetrics should be called again after the interval'); clock.restore(); }); + + it('should not create the pushgateway client if the pushgateway is disabled', function () { + instance = new PrometheusClient({pushgateway: {enabled: false}}); + instance.init(); + assert.equal(instance.gateway, undefined); + }); }); describe('collectDefaultMetrics', function () { @@ -74,16 +80,16 @@ describe('Prometheus Client', function () { it('should push metrics to the pushgateway', async function () { const scope = nock('http://localhost:9091') .persist() - .post('/metrics/job/ghost') + .post('/metrics/job/ghost-test') .reply(200); - instance = new PrometheusClient({pushgateway: {enabled: true}}); + instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}}); instance.init(); await instance.pushMetrics(); scope.done(); }); it('should log an error with error code if pushing metrics to the gateway fails', async function () { - instance = new PrometheusClient({pushgateway: {enabled: true}}, logger); + instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}}, logger); instance.init(); instance.gateway = { pushAdd: sinon.stub().rejects({code: 'ECONNRESET'}) @@ -95,7 +101,7 @@ describe('Prometheus Client', function () { }); it('should log a generic error if the error is unknown', async function () { - instance = new PrometheusClient({pushgateway: {enabled: true}}, logger); + instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}}, logger); instance.init(); instance.gateway = { pushAdd: sinon.stub().rejects() @@ -105,6 +111,27 @@ describe('Prometheus Client', function () { const [[error]] = logger.error.args; assert.match(error, /Unknown error/); }); + + it('should give up after 3 retries in a row', async function () { + instance = new PrometheusClient({pushgateway: {enabled: true, jobName: 'ghost-test'}}, logger); + instance.init(); + const pushAddStub = sinon.stub().rejects(); + instance.gateway = { + pushAdd: pushAddStub + } as unknown as Pushgateway; + + // Simulate failing to push metrics multiple times in a row + // It should give up after the 3rd attempt in a row + await instance.pushMetrics(); + await instance.pushMetrics(); + await instance.pushMetrics(); + await instance.pushMetrics(); + await instance.pushMetrics(); + await instance.pushMetrics(); + await instance.pushMetrics(); + assert.ok(pushAddStub.calledThrice); + assert.ok(logger.error.calledWith('Failed to push metrics to pushgateway 3 times in a row, giving up')); + }); }); describe('handleMetricsRequest', function () {