mirror of
https://github.com/TryGhost/Ghost.git
synced 2025-01-06 22:40:14 -05:00
🐛 Added pagination to sitemap.xml to avoid max 50,000 entries limit
refs https://github.com/TryGhost/Team/issues/1044 refs https://github.com/TryGhost/Ghost/pull/13298 - This splits the sitemaps according to the limit set by Google https://developers.google.com/search/docs/advanced/sitemaps/large-sitemaps Co-authored-by: - Kevin Ansfield (@kevinansfield)
This commit is contained in:
parent
9427886610
commit
01e833376b
5 changed files with 113 additions and 46 deletions
|
@ -17,12 +17,12 @@ class BaseSiteMapGenerator {
|
|||
constructor() {
|
||||
this.nodeLookup = {};
|
||||
this.nodeTimeLookup = {};
|
||||
this.siteMapContent = null;
|
||||
this.siteMapContent = new Map();
|
||||
this.lastModified = 0;
|
||||
this.maxNodes = 50000;
|
||||
this.maxPerPage = 50000;
|
||||
}
|
||||
|
||||
generateXmlFromNodes() {
|
||||
generateXmlFromNodes(page) {
|
||||
// Get a mapping of node to timestamp
|
||||
let nodesToProcess = _.map(this.nodeLookup, (node, id) => {
|
||||
return {
|
||||
|
@ -33,20 +33,23 @@ class BaseSiteMapGenerator {
|
|||
};
|
||||
});
|
||||
|
||||
// Limit to 50k nodes - this is a quick fix to prevent errors in google console
|
||||
if (this.maxNodes) {
|
||||
nodesToProcess = nodesToProcess.slice(0, this.maxNodes);
|
||||
}
|
||||
|
||||
// Sort nodes by timestamp
|
||||
nodesToProcess = _.sortBy(nodesToProcess, 'ts');
|
||||
|
||||
// Get the page of nodes that was requested
|
||||
nodesToProcess = nodesToProcess.slice((page - 1) * this.maxPerPage, page * this.maxPerPage);
|
||||
|
||||
// Do not generate empty sitemaps
|
||||
if (nodesToProcess.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Grab just the nodes
|
||||
nodesToProcess = _.map(nodesToProcess, 'node');
|
||||
const nodes = _.map(nodesToProcess, 'node');
|
||||
|
||||
const data = {
|
||||
// Concat the elements to the _attr declaration
|
||||
urlset: [XMLNS_DECLS].concat(nodesToProcess)
|
||||
urlset: [XMLNS_DECLS].concat(nodes)
|
||||
};
|
||||
|
||||
// Generate full xml
|
||||
|
@ -67,7 +70,7 @@ class BaseSiteMapGenerator {
|
|||
this.updateLastModified(datum);
|
||||
this.updateLookups(datum, node);
|
||||
// force regeneration of xml
|
||||
this.siteMapContent = null;
|
||||
this.siteMapContent.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -75,7 +78,7 @@ class BaseSiteMapGenerator {
|
|||
this.removeFromLookups(datum);
|
||||
|
||||
// force regeneration of xml
|
||||
this.siteMapContent = null;
|
||||
this.siteMapContent.clear();
|
||||
this.lastModified = Date.now();
|
||||
}
|
||||
|
||||
|
@ -152,13 +155,13 @@ class BaseSiteMapGenerator {
|
|||
return !!imageUrl;
|
||||
}
|
||||
|
||||
getXml() {
|
||||
if (this.siteMapContent) {
|
||||
return this.siteMapContent;
|
||||
getXml(page = 1) {
|
||||
if (this.siteMapContent.has(page)) {
|
||||
return this.siteMapContent.get(page);
|
||||
}
|
||||
|
||||
const content = this.generateXmlFromNodes();
|
||||
this.siteMapContent = content;
|
||||
const content = this.generateXmlFromNodes(page);
|
||||
this.siteMapContent.set(page, content);
|
||||
return content;
|
||||
}
|
||||
|
||||
|
@ -181,7 +184,7 @@ class BaseSiteMapGenerator {
|
|||
reset() {
|
||||
this.nodeLookup = {};
|
||||
this.nodeTimeLookup = {};
|
||||
this.siteMapContent = null;
|
||||
this.siteMapContent.clear();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -5,7 +5,8 @@ const manager = new Manager();
|
|||
// Responsible for handling requests for sitemap files
|
||||
module.exports = function handler(siteApp) {
|
||||
const verifyResourceType = function verifyResourceType(req, res, next) {
|
||||
if (!Object.prototype.hasOwnProperty.call(manager, req.params.resource)) {
|
||||
const resourceWithoutPage = req.params.resource.replace(/-\d+$/, '');
|
||||
if (!Object.prototype.hasOwnProperty.call(manager, resourceWithoutPage)) {
|
||||
return res.sendStatus(404);
|
||||
}
|
||||
|
||||
|
@ -22,14 +23,22 @@ module.exports = function handler(siteApp) {
|
|||
});
|
||||
|
||||
siteApp.get('/sitemap-:resource.xml', verifyResourceType, function sitemapResourceXML(req, res) {
|
||||
const type = req.params.resource;
|
||||
const page = 1;
|
||||
const type = req.params.resource.replace(/-\d+$/, '');
|
||||
const pageParam = (req.params.resource.match(/-(\d+)$/) || [null, null])[1];
|
||||
const page = pageParam ? parseInt(pageParam, 10) : 1;
|
||||
|
||||
const content = manager.getSiteMapXml(type, page);
|
||||
// Prevent x-1.xml as it is a duplicate of x.xml and empty sitemaps
|
||||
// (except for the first page so that at least one sitemap exists per type)
|
||||
if (pageParam === '1' || (!content && page !== 1)) {
|
||||
return res.sendStatus(404);
|
||||
}
|
||||
|
||||
res.set({
|
||||
'Cache-Control': 'public, max-age=' + config.get('caching:sitemap:maxAge'),
|
||||
'Content-Type': 'text/xml'
|
||||
});
|
||||
|
||||
res.send(manager.getSiteMapXml(type, page));
|
||||
res.send(content);
|
||||
});
|
||||
};
|
||||
|
|
|
@ -14,6 +14,7 @@ class SiteMapIndexGenerator {
|
|||
constructor(options) {
|
||||
options = options || {};
|
||||
this.types = options.types;
|
||||
this.maxPerPage = options.maxPerPage;
|
||||
}
|
||||
|
||||
getXml() {
|
||||
|
@ -30,16 +31,25 @@ class SiteMapIndexGenerator {
|
|||
|
||||
generateSiteMapUrlElements() {
|
||||
return _.map(this.types, (resourceType) => {
|
||||
const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + '.xml'}, true);
|
||||
const lastModified = resourceType.lastModified;
|
||||
// `|| 1` = even if there are no items we still have an empty sitemap file
|
||||
const noOfPages = Math.ceil(Object.keys(resourceType.nodeLookup).length / this.maxPerPage) || 1;
|
||||
const pages = [];
|
||||
|
||||
return {
|
||||
sitemap: [
|
||||
{loc: url},
|
||||
{lastmod: moment(lastModified).toISOString()}
|
||||
]
|
||||
};
|
||||
});
|
||||
for (let i = 0; i < noOfPages; i++) {
|
||||
const page = i === 0 ? '' : `-${i + 1}`;
|
||||
const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + page + '.xml'}, true);
|
||||
const lastModified = resourceType.lastModified;
|
||||
|
||||
pages.push({
|
||||
sitemap: [
|
||||
{loc: url},
|
||||
{lastmod: moment(lastModified).toISOString()}
|
||||
]
|
||||
});
|
||||
}
|
||||
|
||||
return pages;
|
||||
}).flat();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -11,11 +11,13 @@ class SiteMapManager {
|
|||
constructor(options) {
|
||||
options = options || {};
|
||||
|
||||
options.maxPerPage = options.maxPerPage || 50000;
|
||||
|
||||
this.pages = options.pages || this.createPagesGenerator(options);
|
||||
this.posts = options.posts || this.createPostsGenerator(options);
|
||||
this.users = this.authors = options.authors || this.createUsersGenerator(options);
|
||||
this.tags = options.tags || this.createTagsGenerator(options);
|
||||
this.index = options.index || this.createIndexGenerator();
|
||||
this.index = options.index || this.createIndexGenerator(options);
|
||||
|
||||
events.on('router.created', (router) => {
|
||||
if (router.name === 'StaticRoutesRouter') {
|
||||
|
@ -43,14 +45,15 @@ class SiteMapManager {
|
|||
});
|
||||
}
|
||||
|
||||
createIndexGenerator() {
|
||||
createIndexGenerator(options) {
|
||||
return new IndexMapGenerator({
|
||||
types: {
|
||||
pages: this.pages,
|
||||
posts: this.posts,
|
||||
authors: this.authors,
|
||||
tags: this.tags
|
||||
}
|
||||
},
|
||||
maxPerPage: options.maxPerPage
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -74,8 +77,8 @@ class SiteMapManager {
|
|||
return this.index.getXml();
|
||||
}
|
||||
|
||||
getSiteMapXml(type) {
|
||||
return this[type].getXml();
|
||||
getSiteMapXml(type, page) {
|
||||
return this[type].getXml(page);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -53,7 +53,7 @@ describe('Generators', function () {
|
|||
});
|
||||
|
||||
it('max node setting results in the right number of nodes', function () {
|
||||
generator = new PostGenerator({maxNodes: 5});
|
||||
generator = new PostGenerator({maxPerPage: 5});
|
||||
|
||||
for (let i = 0; i < 10; i++) {
|
||||
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
|
||||
|
@ -70,12 +70,12 @@ describe('Generators', function () {
|
|||
Object.keys(generator.nodeLookup).should.be.Array().with.lengthOf(10);
|
||||
|
||||
// But only 5 are output in the xml
|
||||
generator.siteMapContent.match(/<loc>/g).should.be.Array().with.lengthOf(5);
|
||||
generator.siteMapContent.get(1).match(/<loc>/g).should.be.Array().with.lengthOf(5);
|
||||
});
|
||||
|
||||
it('default is 50k', function () {
|
||||
generator = new PostGenerator();
|
||||
generator.maxNodes.should.eql(50000);
|
||||
generator.maxPerPage.should.eql(50000);
|
||||
});
|
||||
|
||||
describe('IndexGenerator', function () {
|
||||
|
@ -86,7 +86,8 @@ describe('Generators', function () {
|
|||
pages: new PageGenerator(),
|
||||
tags: new TagGenerator(),
|
||||
authors: new UserGenerator()
|
||||
}
|
||||
},
|
||||
maxPerPage: 5
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -99,6 +100,21 @@ describe('Generators', function () {
|
|||
xml.should.match(/sitemap-pages.xml/);
|
||||
xml.should.match(/sitemap-authors.xml/);
|
||||
});
|
||||
|
||||
it('creates multiple pages when there are too many posts', function () {
|
||||
for (let i = 0; i < 10; i++) {
|
||||
generator.types.posts.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
|
||||
created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200,
|
||||
updated_at: null,
|
||||
published_at: null,
|
||||
slug: `episode-${i}`
|
||||
}));
|
||||
}
|
||||
const xml = generator.getXml();
|
||||
|
||||
xml.should.match(/sitemap-posts.xml/);
|
||||
xml.should.match(/sitemap-posts-2.xml/);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
@ -126,9 +142,9 @@ describe('Generators', function () {
|
|||
|
||||
it('get cached xml', function () {
|
||||
sinon.spy(generator, 'generateXmlFromNodes');
|
||||
generator.siteMapContent = 'something';
|
||||
generator.siteMapContent.set(1, 'something');
|
||||
generator.getXml().should.eql('something');
|
||||
generator.siteMapContent = null;
|
||||
generator.siteMapContent.clear();
|
||||
generator.generateXmlFromNodes.called.should.eql(false);
|
||||
});
|
||||
|
||||
|
@ -184,6 +200,32 @@ describe('Generators', function () {
|
|||
idxFirst.should.be.below(idxSecond);
|
||||
idxSecond.should.be.below(idxThird);
|
||||
});
|
||||
|
||||
it('creates multiple pages when there are too many posts', function () {
|
||||
generator.maxPerPage = 5;
|
||||
urlUtils.urlFor.withArgs('sitemap_xsl', true).returns('http://my-ghost-blog.com/sitemap.xsl');
|
||||
for (let i = 0; i < 10; i++) {
|
||||
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
|
||||
created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200,
|
||||
updated_at: null,
|
||||
published_at: null,
|
||||
slug: `episode-${i}`
|
||||
}));
|
||||
}
|
||||
|
||||
const pages = [generator.getXml(), generator.getXml(2)];
|
||||
|
||||
for (let i = 0; i < 10; i++) {
|
||||
const pageIndex = Math.floor(i / 5);
|
||||
pages[pageIndex].should.containEql(`<loc>http://my-ghost-blog.com/episode-${i}/</loc>`);
|
||||
}
|
||||
});
|
||||
|
||||
it('shouldn\'t break with out of bounds pages', function () {
|
||||
should.not.exist(generator.getXml(-1));
|
||||
should.not.exist(generator.getXml(99999));
|
||||
should.not.exist(generator.getXml(0));
|
||||
});
|
||||
});
|
||||
|
||||
describe('fn: removeUrl', function () {
|
||||
|
@ -224,12 +266,12 @@ describe('Generators', function () {
|
|||
|
||||
generator.getXml();
|
||||
|
||||
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/home/</loc>');
|
||||
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/magic/</loc>');
|
||||
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/subscribe/</loc>');
|
||||
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/home/</loc>');
|
||||
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/magic/</loc>');
|
||||
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/subscribe/</loc>');
|
||||
|
||||
// <loc> should exist exactly one time
|
||||
generator.siteMapContent.match(/<loc>/g).length.should.eql(3);
|
||||
generator.siteMapContent.get(1).match(/<loc>/g).length.should.eql(3);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
Loading…
Reference in a new issue