0
Fork 0
mirror of https://github.com/TryGhost/Ghost.git synced 2025-01-20 22:42:53 -05:00

🐛 Added pagination to sitemap.xml to avoid the 50,000-entry limit

refs https://github.com/TryGhost/Team/issues/1044
refs https://github.com/TryGhost/Ghost/pull/13298

- This splits each sitemap into multiple pages so that no single file exceeds the limit set by Google: https://developers.google.com/search/docs/advanced/sitemaps/large-sitemaps

Co-authored-by:
- Kevin Ansfield (@kevinansfield)
This commit is contained in:
Thibaut Patel 2022-01-05 11:54:35 +01:00 committed by GitHub
parent 9427886610
commit 01e833376b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 113 additions and 46 deletions

View file

@ -17,12 +17,12 @@ class BaseSiteMapGenerator {
constructor() { constructor() {
this.nodeLookup = {}; this.nodeLookup = {};
this.nodeTimeLookup = {}; this.nodeTimeLookup = {};
this.siteMapContent = null; this.siteMapContent = new Map();
this.lastModified = 0; this.lastModified = 0;
this.maxNodes = 50000; this.maxPerPage = 50000;
} }
generateXmlFromNodes() { generateXmlFromNodes(page) {
// Get a mapping of node to timestamp // Get a mapping of node to timestamp
let nodesToProcess = _.map(this.nodeLookup, (node, id) => { let nodesToProcess = _.map(this.nodeLookup, (node, id) => {
return { return {
@ -33,20 +33,23 @@ class BaseSiteMapGenerator {
}; };
}); });
// Limit to 50k nodes - this is a quick fix to prevent errors in google console
if (this.maxNodes) {
nodesToProcess = nodesToProcess.slice(0, this.maxNodes);
}
// Sort nodes by timestamp // Sort nodes by timestamp
nodesToProcess = _.sortBy(nodesToProcess, 'ts'); nodesToProcess = _.sortBy(nodesToProcess, 'ts');
// Get the page of nodes that was requested
nodesToProcess = nodesToProcess.slice((page - 1) * this.maxPerPage, page * this.maxPerPage);
// Do not generate empty sitemaps
if (nodesToProcess.length === 0) {
return null;
}
// Grab just the nodes // Grab just the nodes
nodesToProcess = _.map(nodesToProcess, 'node'); const nodes = _.map(nodesToProcess, 'node');
const data = { const data = {
// Concat the elements to the _attr declaration // Concat the elements to the _attr declaration
urlset: [XMLNS_DECLS].concat(nodesToProcess) urlset: [XMLNS_DECLS].concat(nodes)
}; };
// Generate full xml // Generate full xml
@ -67,7 +70,7 @@ class BaseSiteMapGenerator {
this.updateLastModified(datum); this.updateLastModified(datum);
this.updateLookups(datum, node); this.updateLookups(datum, node);
// force regeneration of xml // force regeneration of xml
this.siteMapContent = null; this.siteMapContent.clear();
} }
} }
@ -75,7 +78,7 @@ class BaseSiteMapGenerator {
this.removeFromLookups(datum); this.removeFromLookups(datum);
// force regeneration of xml // force regeneration of xml
this.siteMapContent = null; this.siteMapContent.clear();
this.lastModified = Date.now(); this.lastModified = Date.now();
} }
@ -152,13 +155,13 @@ class BaseSiteMapGenerator {
return !!imageUrl; return !!imageUrl;
} }
getXml() { getXml(page = 1) {
if (this.siteMapContent) { if (this.siteMapContent.has(page)) {
return this.siteMapContent; return this.siteMapContent.get(page);
} }
const content = this.generateXmlFromNodes(); const content = this.generateXmlFromNodes(page);
this.siteMapContent = content; this.siteMapContent.set(page, content);
return content; return content;
} }
@ -181,7 +184,7 @@ class BaseSiteMapGenerator {
reset() { reset() {
this.nodeLookup = {}; this.nodeLookup = {};
this.nodeTimeLookup = {}; this.nodeTimeLookup = {};
this.siteMapContent = null; this.siteMapContent.clear();
} }
} }

View file

@ -5,7 +5,8 @@ const manager = new Manager();
// Responsible for handling requests for sitemap files // Responsible for handling requests for sitemap files
module.exports = function handler(siteApp) { module.exports = function handler(siteApp) {
const verifyResourceType = function verifyResourceType(req, res, next) { const verifyResourceType = function verifyResourceType(req, res, next) {
if (!Object.prototype.hasOwnProperty.call(manager, req.params.resource)) { const resourceWithoutPage = req.params.resource.replace(/-\d+$/, '');
if (!Object.prototype.hasOwnProperty.call(manager, resourceWithoutPage)) {
return res.sendStatus(404); return res.sendStatus(404);
} }
@ -22,14 +23,22 @@ module.exports = function handler(siteApp) {
}); });
siteApp.get('/sitemap-:resource.xml', verifyResourceType, function sitemapResourceXML(req, res) { siteApp.get('/sitemap-:resource.xml', verifyResourceType, function sitemapResourceXML(req, res) {
const type = req.params.resource; const type = req.params.resource.replace(/-\d+$/, '');
const page = 1; const pageParam = (req.params.resource.match(/-(\d+)$/) || [null, null])[1];
const page = pageParam ? parseInt(pageParam, 10) : 1;
const content = manager.getSiteMapXml(type, page);
// Prevent x-1.xml as it is a duplicate of x.xml and empty sitemaps
// (except for the first page so that at least one sitemap exists per type)
if (pageParam === '1' || (!content && page !== 1)) {
return res.sendStatus(404);
}
res.set({ res.set({
'Cache-Control': 'public, max-age=' + config.get('caching:sitemap:maxAge'), 'Cache-Control': 'public, max-age=' + config.get('caching:sitemap:maxAge'),
'Content-Type': 'text/xml' 'Content-Type': 'text/xml'
}); });
res.send(manager.getSiteMapXml(type, page)); res.send(content);
}); });
}; };

View file

@ -14,6 +14,7 @@ class SiteMapIndexGenerator {
constructor(options) { constructor(options) {
options = options || {}; options = options || {};
this.types = options.types; this.types = options.types;
this.maxPerPage = options.maxPerPage;
} }
getXml() { getXml() {
@ -30,16 +31,25 @@ class SiteMapIndexGenerator {
generateSiteMapUrlElements() { generateSiteMapUrlElements() {
return _.map(this.types, (resourceType) => { return _.map(this.types, (resourceType) => {
const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + '.xml'}, true); // `|| 1` = even if there are no items we still have an empty sitemap file
const lastModified = resourceType.lastModified; const noOfPages = Math.ceil(Object.keys(resourceType.nodeLookup).length / this.maxPerPage) || 1;
const pages = [];
return { for (let i = 0; i < noOfPages; i++) {
sitemap: [ const page = i === 0 ? '' : `-${i + 1}`;
{loc: url}, const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + page + '.xml'}, true);
{lastmod: moment(lastModified).toISOString()} const lastModified = resourceType.lastModified;
]
}; pages.push({
}); sitemap: [
{loc: url},
{lastmod: moment(lastModified).toISOString()}
]
});
}
return pages;
}).flat();
} }
} }

View file

@ -11,11 +11,13 @@ class SiteMapManager {
constructor(options) { constructor(options) {
options = options || {}; options = options || {};
options.maxPerPage = options.maxPerPage || 50000;
this.pages = options.pages || this.createPagesGenerator(options); this.pages = options.pages || this.createPagesGenerator(options);
this.posts = options.posts || this.createPostsGenerator(options); this.posts = options.posts || this.createPostsGenerator(options);
this.users = this.authors = options.authors || this.createUsersGenerator(options); this.users = this.authors = options.authors || this.createUsersGenerator(options);
this.tags = options.tags || this.createTagsGenerator(options); this.tags = options.tags || this.createTagsGenerator(options);
this.index = options.index || this.createIndexGenerator(); this.index = options.index || this.createIndexGenerator(options);
events.on('router.created', (router) => { events.on('router.created', (router) => {
if (router.name === 'StaticRoutesRouter') { if (router.name === 'StaticRoutesRouter') {
@ -43,14 +45,15 @@ class SiteMapManager {
}); });
} }
createIndexGenerator() { createIndexGenerator(options) {
return new IndexMapGenerator({ return new IndexMapGenerator({
types: { types: {
pages: this.pages, pages: this.pages,
posts: this.posts, posts: this.posts,
authors: this.authors, authors: this.authors,
tags: this.tags tags: this.tags
} },
maxPerPage: options.maxPerPage
}); });
} }
@ -74,8 +77,8 @@ class SiteMapManager {
return this.index.getXml(); return this.index.getXml();
} }
getSiteMapXml(type) { getSiteMapXml(type, page) {
return this[type].getXml(); return this[type].getXml(page);
} }
} }

View file

@ -53,7 +53,7 @@ describe('Generators', function () {
}); });
it('max node setting results in the right number of nodes', function () { it('max node setting results in the right number of nodes', function () {
generator = new PostGenerator({maxNodes: 5}); generator = new PostGenerator({maxPerPage: 5});
for (let i = 0; i < 10; i++) { for (let i = 0; i < 10; i++) {
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({ generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
@ -70,12 +70,12 @@ describe('Generators', function () {
Object.keys(generator.nodeLookup).should.be.Array().with.lengthOf(10); Object.keys(generator.nodeLookup).should.be.Array().with.lengthOf(10);
// But only 5 are output in the xml // But only 5 are output in the xml
generator.siteMapContent.match(/<loc>/g).should.be.Array().with.lengthOf(5); generator.siteMapContent.get(1).match(/<loc>/g).should.be.Array().with.lengthOf(5);
}); });
it('default is 50k', function () { it('default is 50k', function () {
generator = new PostGenerator(); generator = new PostGenerator();
generator.maxNodes.should.eql(50000); generator.maxPerPage.should.eql(50000);
}); });
describe('IndexGenerator', function () { describe('IndexGenerator', function () {
@ -86,7 +86,8 @@ describe('Generators', function () {
pages: new PageGenerator(), pages: new PageGenerator(),
tags: new TagGenerator(), tags: new TagGenerator(),
authors: new UserGenerator() authors: new UserGenerator()
} },
maxPerPage: 5
}); });
}); });
@ -99,6 +100,21 @@ describe('Generators', function () {
xml.should.match(/sitemap-pages.xml/); xml.should.match(/sitemap-pages.xml/);
xml.should.match(/sitemap-authors.xml/); xml.should.match(/sitemap-authors.xml/);
}); });
it('creates multiple pages when there are too many posts', function () {
for (let i = 0; i < 10; i++) {
generator.types.posts.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200,
updated_at: null,
published_at: null,
slug: `episode-${i}`
}));
}
const xml = generator.getXml();
xml.should.match(/sitemap-posts.xml/);
xml.should.match(/sitemap-posts-2.xml/);
});
}); });
}); });
@ -126,9 +142,9 @@ describe('Generators', function () {
it('get cached xml', function () { it('get cached xml', function () {
sinon.spy(generator, 'generateXmlFromNodes'); sinon.spy(generator, 'generateXmlFromNodes');
generator.siteMapContent = 'something'; generator.siteMapContent.set(1, 'something');
generator.getXml().should.eql('something'); generator.getXml().should.eql('something');
generator.siteMapContent = null; generator.siteMapContent.clear();
generator.generateXmlFromNodes.called.should.eql(false); generator.generateXmlFromNodes.called.should.eql(false);
}); });
@ -184,6 +200,32 @@ describe('Generators', function () {
idxFirst.should.be.below(idxSecond); idxFirst.should.be.below(idxSecond);
idxSecond.should.be.below(idxThird); idxSecond.should.be.below(idxThird);
}); });
it('creates multiple pages when there are too many posts', function () {
generator.maxPerPage = 5;
urlUtils.urlFor.withArgs('sitemap_xsl', true).returns('http://my-ghost-blog.com/sitemap.xsl');
for (let i = 0; i < 10; i++) {
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200,
updated_at: null,
published_at: null,
slug: `episode-${i}`
}));
}
const pages = [generator.getXml(), generator.getXml(2)];
for (let i = 0; i < 10; i++) {
const pageIndex = Math.floor(i / 5);
pages[pageIndex].should.containEql(`<loc>http://my-ghost-blog.com/episode-${i}/</loc>`);
}
});
it('shouldn\'t break with out of bounds pages', function () {
should.not.exist(generator.getXml(-1));
should.not.exist(generator.getXml(99999));
should.not.exist(generator.getXml(0));
});
}); });
describe('fn: removeUrl', function () { describe('fn: removeUrl', function () {
@ -224,12 +266,12 @@ describe('Generators', function () {
generator.getXml(); generator.getXml();
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/home/</loc>'); generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/home/</loc>');
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/magic/</loc>'); generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/magic/</loc>');
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/subscribe/</loc>'); generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/subscribe/</loc>');
// <loc> should exist exactly one time // <loc> should exist exactly one time
generator.siteMapContent.match(/<loc>/g).length.should.eql(3); generator.siteMapContent.get(1).match(/<loc>/g).length.should.eql(3);
}); });
}); });
}); });