From 01e833376b42c0bd22e9f80fb4da0167cae0ea58 Mon Sep 17 00:00:00 2001 From: Thibaut Patel Date: Wed, 5 Jan 2022 11:54:35 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20Added=20pagination=20to=20sitema?= =?UTF-8?q?p.xml=20to=20avoid=20max=2050,000=20entries=20limit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit refs https://github.com/TryGhost/Team/issues/1044 refs https://github.com/TryGhost/Ghost/pull/13298 - This splits the sitemaps according to the limit set by Google https://developers.google.com/search/docs/advanced/sitemaps/large-sitemaps Co-authored-by: - Kevin Ansfield (@kevinansfield) --- .../services/sitemap/base-generator.js | 39 ++++++------ core/frontend/services/sitemap/handler.js | 17 +++-- .../services/sitemap/index-generator.js | 28 ++++++--- core/frontend/services/sitemap/manager.js | 13 ++-- .../services/sitemap/generator.test.js | 62 ++++++++++++++++--- 5 files changed, 113 insertions(+), 46 deletions(-) diff --git a/core/frontend/services/sitemap/base-generator.js b/core/frontend/services/sitemap/base-generator.js index c880b6bf19..261e02e3c0 100644 --- a/core/frontend/services/sitemap/base-generator.js +++ b/core/frontend/services/sitemap/base-generator.js @@ -17,12 +17,12 @@ class BaseSiteMapGenerator { constructor() { this.nodeLookup = {}; this.nodeTimeLookup = {}; - this.siteMapContent = null; + this.siteMapContent = new Map(); this.lastModified = 0; - this.maxNodes = 50000; + this.maxPerPage = 50000; } - generateXmlFromNodes() { + generateXmlFromNodes(page) { // Get a mapping of node to timestamp let nodesToProcess = _.map(this.nodeLookup, (node, id) => { return { @@ -33,20 +33,23 @@ class BaseSiteMapGenerator { }; }); - // Limit to 50k nodes - this is a quick fix to prevent errors in google console - if (this.maxNodes) { - nodesToProcess = nodesToProcess.slice(0, this.maxNodes); - } - // Sort nodes by timestamp nodesToProcess = _.sortBy(nodesToProcess, 'ts'); + // Get the page of nodes that was requested + nodesToProcess = nodesToProcess.slice((page - 1) * this.maxPerPage, page * this.maxPerPage); + + // Do not generate empty sitemaps + if (nodesToProcess.length === 0) { + return null; + } + // Grab just the nodes - nodesToProcess = _.map(nodesToProcess, 'node'); + const nodes = _.map(nodesToProcess, 'node'); const data = { // Concat the elements to the _attr declaration - urlset: [XMLNS_DECLS].concat(nodesToProcess) + urlset: [XMLNS_DECLS].concat(nodes) }; // Generate full xml @@ -67,7 +70,7 @@ class BaseSiteMapGenerator { this.updateLastModified(datum); this.updateLookups(datum, node); // force regeneration of xml - this.siteMapContent = null; + this.siteMapContent.clear(); } } @@ -75,7 +78,7 @@ class BaseSiteMapGenerator { this.removeFromLookups(datum); // force regeneration of xml - this.siteMapContent = null; + this.siteMapContent.clear(); this.lastModified = Date.now(); } @@ -152,13 +155,13 @@ class BaseSiteMapGenerator { return !!imageUrl; } - getXml() { - if (this.siteMapContent) { - return this.siteMapContent; + getXml(page = 1) { + if (this.siteMapContent.has(page)) { + return this.siteMapContent.get(page); } - const content = this.generateXmlFromNodes(); - this.siteMapContent = content; + const content = this.generateXmlFromNodes(page); + this.siteMapContent.set(page, content); return content; } @@ -181,7 +184,7 @@ class BaseSiteMapGenerator { reset() { this.nodeLookup = {}; this.nodeTimeLookup = {}; - this.siteMapContent = null; + this.siteMapContent.clear(); } } diff --git a/core/frontend/services/sitemap/handler.js b/core/frontend/services/sitemap/handler.js index c4a01ff497..817eee1e46 100644 --- a/core/frontend/services/sitemap/handler.js +++ b/core/frontend/services/sitemap/handler.js @@ -5,7 +5,8 @@ const manager = new Manager(); // Responsible for handling requests for sitemap files module.exports = function handler(siteApp) { const verifyResourceType = function verifyResourceType(req, res, next) { - if (!Object.prototype.hasOwnProperty.call(manager, req.params.resource)) { + const resourceWithoutPage = req.params.resource.replace(/-\d+$/, ''); + if (!Object.prototype.hasOwnProperty.call(manager, resourceWithoutPage)) { return res.sendStatus(404); } @@ -22,14 +23,22 @@ module.exports = function handler(siteApp) { }); siteApp.get('/sitemap-:resource.xml', verifyResourceType, function sitemapResourceXML(req, res) { - const type = req.params.resource; - const page = 1; + const type = req.params.resource.replace(/-\d+$/, ''); + const pageParam = (req.params.resource.match(/-(\d+)$/) || [null, null])[1]; + const page = pageParam ? parseInt(pageParam, 10) : 1; + + const content = manager.getSiteMapXml(type, page); + // Prevent x-1.xml as it is a duplicate of x.xml and empty sitemaps + // (except for the first page so that at least one sitemap exists per type) + if (pageParam === '1' || (!content && page !== 1)) { + return res.sendStatus(404); + } res.set({ 'Cache-Control': 'public, max-age=' + config.get('caching:sitemap:maxAge'), 'Content-Type': 'text/xml' }); - res.send(manager.getSiteMapXml(type, page)); + res.send(content); }); }; diff --git a/core/frontend/services/sitemap/index-generator.js b/core/frontend/services/sitemap/index-generator.js index 9005c81697..d0c1d107d4 100644 --- a/core/frontend/services/sitemap/index-generator.js +++ b/core/frontend/services/sitemap/index-generator.js @@ -14,6 +14,7 @@ class SiteMapIndexGenerator { constructor(options) { options = options || {}; this.types = options.types; + this.maxPerPage = options.maxPerPage; } getXml() { @@ -30,16 +31,25 @@ class SiteMapIndexGenerator { generateSiteMapUrlElements() { return _.map(this.types, (resourceType) => { - const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + '.xml'}, true); - const lastModified = resourceType.lastModified; + // `|| 1` = even if there are no items we still have an empty sitemap file + const noOfPages = Math.ceil(Object.keys(resourceType.nodeLookup).length / this.maxPerPage) || 1; + const pages = []; - return { - sitemap: [ - {loc: url}, - {lastmod: moment(lastModified).toISOString()} - ] - }; - }); + for (let i = 0; i < noOfPages; i++) { + const page = i === 0 ? '' : `-${i + 1}`; + const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + page + '.xml'}, true); + const lastModified = resourceType.lastModified; + + pages.push({ + sitemap: [ + {loc: url}, + {lastmod: moment(lastModified).toISOString()} + ] + }); + } + + return pages; + }).flat(); } } diff --git a/core/frontend/services/sitemap/manager.js b/core/frontend/services/sitemap/manager.js index ecf81b7bd6..78a719d38c 100644 --- a/core/frontend/services/sitemap/manager.js +++ b/core/frontend/services/sitemap/manager.js @@ -11,11 +11,13 @@ class SiteMapManager { constructor(options) { options = options || {}; + options.maxPerPage = options.maxPerPage || 50000; + this.pages = options.pages || this.createPagesGenerator(options); this.posts = options.posts || this.createPostsGenerator(options); this.users = this.authors = options.authors || this.createUsersGenerator(options); this.tags = options.tags || this.createTagsGenerator(options); - this.index = options.index || this.createIndexGenerator(); + this.index = options.index || this.createIndexGenerator(options); events.on('router.created', (router) => { if (router.name === 'StaticRoutesRouter') { @@ -43,14 +45,15 @@ class SiteMapManager { }); } - createIndexGenerator() { + createIndexGenerator(options) { return new IndexMapGenerator({ types: { pages: this.pages, posts: this.posts, authors: this.authors, tags: this.tags - } + }, + maxPerPage: options.maxPerPage }); } @@ -74,8 +77,8 @@ class SiteMapManager { return this.index.getXml(); } - getSiteMapXml(type) { - return this[type].getXml(); + getSiteMapXml(type, page) { + return this[type].getXml(page); } } diff --git a/test/unit/frontend/services/sitemap/generator.test.js b/test/unit/frontend/services/sitemap/generator.test.js index 177fb3c513..6231ee5fcd 100644 --- a/test/unit/frontend/services/sitemap/generator.test.js +++ b/test/unit/frontend/services/sitemap/generator.test.js @@ -53,7 +53,7 @@ describe('Generators', function () { }); it('max node setting results in the right number of nodes', function () { - generator = new PostGenerator({maxNodes: 5}); + generator = new PostGenerator({maxPerPage: 5}); for (let i = 0; i < 10; i++) { generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({ @@ -70,12 +70,12 @@ describe('Generators', function () { Object.keys(generator.nodeLookup).should.be.Array().with.lengthOf(10); // But only 5 are output in the xml - generator.siteMapContent.match(//g).should.be.Array().with.lengthOf(5); + generator.siteMapContent.get(1).match(//g).should.be.Array().with.lengthOf(5); }); it('default is 50k', function () { generator = new PostGenerator(); - generator.maxNodes.should.eql(50000); + generator.maxPerPage.should.eql(50000); }); describe('IndexGenerator', function () { @@ -86,7 +86,8 @@ describe('Generators', function () { pages: new PageGenerator(), tags: new TagGenerator(), authors: new UserGenerator() - } + }, + maxPerPage: 5 }); }); @@ -99,6 +100,21 @@ describe('Generators', function () { xml.should.match(/sitemap-pages.xml/); xml.should.match(/sitemap-authors.xml/); }); + + it('creates multiple pages when there are too many posts', function () { + for (let i = 0; i < 10; i++) { + generator.types.posts.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({ + created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200, + updated_at: null, + published_at: null, + slug: `episode-${i}` + })); + } + const xml = generator.getXml(); + + xml.should.match(/sitemap-posts.xml/); + xml.should.match(/sitemap-posts-2.xml/); + }); }); }); @@ -126,9 +142,9 @@ describe('Generators', function () { it('get cached xml', function () { sinon.spy(generator, 'generateXmlFromNodes'); - generator.siteMapContent = 'something'; + generator.siteMapContent.set(1, 'something'); generator.getXml().should.eql('something'); - generator.siteMapContent = null; + generator.siteMapContent.clear(); generator.generateXmlFromNodes.called.should.eql(false); }); @@ -184,6 +200,32 @@ describe('Generators', function () { idxFirst.should.be.below(idxSecond); idxSecond.should.be.below(idxThird); }); + + it('creates multiple pages when there are too many posts', function () { + generator.maxPerPage = 5; + urlUtils.urlFor.withArgs('sitemap_xsl', true).returns('http://my-ghost-blog.com/sitemap.xsl'); + for (let i = 0; i < 10; i++) { + generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({ + created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200, + updated_at: null, + published_at: null, + slug: `episode-${i}` + })); + } + + const pages = [generator.getXml(), generator.getXml(2)]; + + for (let i = 0; i < 10; i++) { + const pageIndex = Math.floor(i / 5); + pages[pageIndex].should.containEql(`http://my-ghost-blog.com/episode-${i}/`); + } + }); + + it('shouldn\'t break with out of bounds pages', function () { + should.not.exist(generator.getXml(-1)); + should.not.exist(generator.getXml(99999)); + should.not.exist(generator.getXml(0)); + }); }); describe('fn: removeUrl', function () { @@ -224,12 +266,12 @@ describe('Generators', function () { generator.getXml(); - generator.siteMapContent.should.containEql('http://my-ghost-blog.com/home/'); - generator.siteMapContent.should.containEql('http://my-ghost-blog.com/magic/'); - generator.siteMapContent.should.containEql('http://my-ghost-blog.com/subscribe/'); + generator.siteMapContent.get(1).should.containEql('http://my-ghost-blog.com/home/'); + generator.siteMapContent.get(1).should.containEql('http://my-ghost-blog.com/magic/'); + generator.siteMapContent.get(1).should.containEql('http://my-ghost-blog.com/subscribe/'); // should exist exactly one time - generator.siteMapContent.match(//g).length.should.eql(3); + generator.siteMapContent.get(1).match(//g).length.should.eql(3); }); }); });