From 826b5be00229e72728c31cf766d1700abbcb7f40 Mon Sep 17 00:00:00 2001 From: naz Date: Tue, 19 Sep 2023 16:02:14 +0800 Subject: [PATCH 1/3] Optimized collection repository queries (#18194) refs https://github.com/TryGhost/Arch/issues/86 - Creating bookshelf models for each collection_post relation created a massive overhead. On a dataset with 500k collections_posts records the timing was roughly 7s comparing to 810ms after the optimization. - Optimized memory and performance of collections fetching by querying post ids only by default --- .../core/server/api/endpoints/collections.js | 6 +- .../BookshelfCollectionsRepository.js | 60 ++++++++++++++++--- .../test/e2e-api/admin/collections.test.js | 6 +- 3 files changed, 60 insertions(+), 12 deletions(-) diff --git a/ghost/core/core/server/api/endpoints/collections.js b/ghost/core/core/server/api/endpoints/collections.js index 92e6ffb06b..2bf31facbe 100644 --- a/ghost/core/core/server/api/endpoints/collections.js +++ b/ghost/core/core/server/api/endpoints/collections.js @@ -29,7 +29,11 @@ module.exports = { }, permissions: true, query(frame) { - return collectionsService.api.getAll(frame.options); + return collectionsService.api.getAll({ + filter: frame.options.filter, + limit: frame.options.limit, + page: frame.options.page + }); } }, diff --git a/ghost/core/core/server/services/collections/BookshelfCollectionsRepository.js b/ghost/core/core/server/services/collections/BookshelfCollectionsRepository.js index 3a05bfccf1..855226d756 100644 --- a/ghost/core/core/server/services/collections/BookshelfCollectionsRepository.js +++ b/ghost/core/core/server/services/collections/BookshelfCollectionsRepository.js @@ -28,12 +28,14 @@ module.exports = class BookshelfCollectionsRepository { async getById(id, options = {}) { const model = await this.#model.findOne({id}, { require: false, - withRelated: ['collectionPosts'], transacting: options.transaction }); if (!model) { return null; } + + model.collectionPostIds = await this.#fetchCollectionPostIds(model.id, options); + return this.#modelToCollection(model); } @@ -44,28 +46,63 @@ module.exports = class BookshelfCollectionsRepository { async getBySlug(slug, options = {}) { const model = await this.#model.findOne({slug}, { require: false, - withRelated: ['collectionPosts'], transacting: options.transaction }); + if (!model) { return null; } + + model.collectionPostIds = await this.#fetchCollectionPostIds(model.id, options); + return this.#modelToCollection(model); } + /** + * NOTE: we are only fetching post_id column here to save memory on + * instances with a large amount of posts + * + * The method could be further optimized to fetch posts for + * multiple collections at a time. + * + * @param {string} collectionId collection to fetch post ids for + * @param {Object} options bookshelf options + * + * @returns {Promise>} + */ + async #fetchCollectionPostIds(collectionId, options = {}) { + const toSelect = options.columns || ['post_id']; + + const query = this.#relationModel.query(); + if (options.transaction) { + query.transacting(options.transaction); + } + + return await query + .select(toSelect) + .where('collection_id', collectionId); + } + /** * @param {object} [options] * @param {string} [options.filter] * @param {string} [options.order] + * @param {string[]} [options.withRelated] * @param {import('knex').Transaction} [options.transaction] */ async getAll(options = {}) { const models = await this.#model.findAll({ ...options, - transacting: options.transaction, - withRelated: ['collectionPosts'] + transacting: options.transaction }); + for (const model of models) { + // NOTE: Ideally collection posts would be fetching as a part of findAll query. + // Because bookshelf introduced a massive processing and memory overhead + // we are fetching collection post ids separately using raw knex query + model.collectionPostIds = await this.#fetchCollectionPostIds(model.id, options); + } + return (await Promise.all(models.map(model => this.#modelToCollection(model)))).filter(entity => !!entity); } @@ -78,6 +115,10 @@ module.exports = class BookshelfCollectionsRepository { } try { + // NOTE: collectionPosts are not a part of serialized model + // and are fetched separately to save memory + const posts = model.collectionPostIds; + return await Collection.create({ id: json.id, slug: json.slug, @@ -86,7 +127,7 @@ module.exports = class BookshelfCollectionsRepository { filter: filter, type: json.type, featureImage: json.feature_image, - posts: json.collectionPosts.map(collectionPost => collectionPost.post_id), + posts: posts.map(collectionPost => collectionPost.post_id), createdAt: json.created_at, updatedAt: json.updated_at }); @@ -135,8 +176,7 @@ module.exports = class BookshelfCollectionsRepository { {id: data.id}, { require: false, - transacting: options.transaction, - withRelated: ['collectionPosts'] + transacting: options.transaction } ); @@ -175,7 +215,11 @@ module.exports = class BookshelfCollectionsRepository { if (collection.type === 'manual') { await this.#relationModel.query().delete().where('collection_id', collection.id).transacting(options.transaction); } else { - const existingRelations = existing.toJSON().collectionPosts; + const collectionPostsOptions = { + transaction: options.transaction, + columns: ['id', 'post_id'] + }; + const existingRelations = await this.#fetchCollectionPostIds(collection.id, collectionPostsOptions); for (const existingRelation of existingRelations) { const found = collectionPostsRelations.find((thing) => { diff --git a/ghost/core/test/e2e-api/admin/collections.test.js b/ghost/core/test/e2e-api/admin/collections.test.js index 917832e154..96d362a171 100644 --- a/ghost/core/test/e2e-api/admin/collections.test.js +++ b/ghost/core/test/e2e-api/admin/collections.test.js @@ -117,7 +117,7 @@ describe('Collections API', function () { }); }, this.skip.bind(this)); const collectionRelatedQueries = queries.filter(query => query.sql.includes('collection')); - assert(collectionRelatedQueries.length === 2); + assert(collectionRelatedQueries.length === 3); }); it('Can browse Collections and include the posts count', async function () { @@ -572,7 +572,7 @@ describe('Collections API', function () { }, this.skip.bind(this)); const collectionRelatedQueries = queries.filter(query => query.sql.includes('collection')); - assert.equal(collectionRelatedQueries.length, 8); + assert.equal(collectionRelatedQueries.length, 13); } await agent @@ -604,7 +604,7 @@ describe('Collections API', function () { }, this.skip.bind(this)); const collectionRelatedQueries = queries.filter(query => query.sql.includes('collection')); - assert.equal(collectionRelatedQueries.length, 14); + assert.equal(collectionRelatedQueries.length, 18); } await agent From 00e5f84d88488a5b915d191093151e7dff208465 Mon Sep 17 00:00:00 2001 From: Naz Date: Tue, 15 Aug 2023 12:52:53 +0800 Subject: [PATCH 2/3] Repopulated collections_posts built-in collections data closes https://github.com/TryGhost/Arch/issues/74 refs https://github.com/TryGhost/Ghost/commit/b5d1245be1bfbbcb8bde9d16eedb039b511fc892 - We have turned off the collections feature flag after a unsuccessful attempt to make collections GA. With the flag turned off, collections_posts data has gone stale and needs repopulation to function properly again. - This migration is meant to clear the data on collections_posts table and repopulated it again the same way initial migration did in 5.5/2023-07-10-05-16-55-add-built-in-collection-posts.js script. --- ...uncate-stale-built-in-collections-posts.js | 12 ++++ ...10-repopulate-built-in-collection-posts.js | 69 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 ghost/core/core/server/data/migrations/versions/5.64/2023-09-19-04-25-40-truncate-stale-built-in-collections-posts.js create mode 100644 ghost/core/core/server/data/migrations/versions/5.64/2023-09-19-04-34-10-repopulate-built-in-collection-posts.js diff --git a/ghost/core/core/server/data/migrations/versions/5.64/2023-09-19-04-25-40-truncate-stale-built-in-collections-posts.js b/ghost/core/core/server/data/migrations/versions/5.64/2023-09-19-04-25-40-truncate-stale-built-in-collections-posts.js new file mode 100644 index 0000000000..f59076b22e --- /dev/null +++ b/ghost/core/core/server/data/migrations/versions/5.64/2023-09-19-04-25-40-truncate-stale-built-in-collections-posts.js @@ -0,0 +1,12 @@ +const logging = require('@tryghost/logging'); +const {createNonTransactionalMigration} = require('../../utils'); + +module.exports = createNonTransactionalMigration( + async function up(knex) { + logging.info('Clearing collections_posts table'); + await knex('collections_posts').truncate(); + }, + async function down() { + logging.info('Not doing anything - collections_posts table has been truncated'); + } +); diff --git a/ghost/core/core/server/data/migrations/versions/5.64/2023-09-19-04-34-10-repopulate-built-in-collection-posts.js b/ghost/core/core/server/data/migrations/versions/5.64/2023-09-19-04-34-10-repopulate-built-in-collection-posts.js new file mode 100644 index 0000000000..6f699083e9 --- /dev/null +++ b/ghost/core/core/server/data/migrations/versions/5.64/2023-09-19-04-34-10-repopulate-built-in-collection-posts.js @@ -0,0 +1,69 @@ +const logging = require('@tryghost/logging'); +const {default: ObjectID} = require('bson-objectid'); +const {createTransactionalMigration} = require('../../utils'); + +const insertPostCollections = async (knex, collectionId, postIds) => { + logging.warn(`Batch inserting ${postIds.length} collection posts for collection ${collectionId}`); + + const collectionPosts = postIds.map((postId) => { + return { + id: (new ObjectID()).toHexString(), + collection_id: collectionId, + post_id: postId, + sort_order: 0 + }; + }); + + await knex.batchInsert('collections_posts', collectionPosts, 1000); +}; + +module.exports = createTransactionalMigration( + async function up(knex) { + logging.info('Populating built-in collections'); + + const existingLatestCollection = await knex('collections') + .where({ + slug: 'latest' + }) + .first(); + + if (!existingLatestCollection) { + logging.warn('Latest collection does not exists, skipping'); + } else { + const latestPostsRows = await knex('posts') + .select('id') + .where({ + type: 'post' + }); + + const latestPostsIds = latestPostsRows.map(row => row.id); + + await insertPostCollections(knex, existingLatestCollection.id, latestPostsIds); + } + + const existingFeaturedCollection = await knex('collections') + .where({ + slug: 'featured' + }) + .first(); + + if (!existingFeaturedCollection) { + logging.warn('Featured collection does not exist, skipping'); + } else { + const featuredPostsRows = await knex('posts') + .select('id') + .where({ + featured: true, + type: 'post' + }); + + const featuredPostsIds = featuredPostsRows.map(row => row.id); + + await insertPostCollections(knex, existingFeaturedCollection.id, featuredPostsIds); + } + }, + async function down(knex) { + logging.info('Clearing collections_posts table'); + await knex('collections_posts').truncate(); + } +); From 24bebc1ace6bcbdf2c1b329e75e420b9de8dcea4 Mon Sep 17 00:00:00 2001 From: Ghost CI <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 03:59:17 +0000 Subject: [PATCH 3/3] v5.64.0 --- ghost/admin/package.json | 2 +- ghost/core/package.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ghost/admin/package.json b/ghost/admin/package.json index d90c1bb5db..ccf8e66a46 100644 --- a/ghost/admin/package.json +++ b/ghost/admin/package.json @@ -1,6 +1,6 @@ { "name": "ghost-admin", - "version": "5.63.0", + "version": "5.64.0", "description": "Ember.js admin client for Ghost", "author": "Ghost Foundation", "homepage": "http://ghost.org", diff --git a/ghost/core/package.json b/ghost/core/package.json index fc2597aee8..3881832d9c 100644 --- a/ghost/core/package.json +++ b/ghost/core/package.json @@ -1,6 +1,6 @@ { "name": "ghost", - "version": "5.63.0", + "version": "5.64.0", "description": "The professional publishing platform", "author": "Ghost Foundation", "homepage": "https://ghost.org",