mirror of
https://github.com/TryGhost/Ghost.git
synced 2025-01-20 22:42:53 -05:00
🐛 Added pagination to sitemap.xml to avoid max 50,000 entries limit
refs https://github.com/TryGhost/Team/issues/1044
refs https://github.com/TryGhost/Ghost/pull/13298

- This splits the sitemaps according to the limit set by Google: https://developers.google.com/search/docs/advanced/sitemaps/large-sitemaps

Co-authored-by: Kevin Ansfield (@kevinansfield)
This commit is contained in:
parent
9427886610
commit
01e833376b
5 changed files with 113 additions and 46 deletions
|
@ -17,12 +17,12 @@ class BaseSiteMapGenerator {
|
||||||
constructor() {
|
constructor() {
|
||||||
this.nodeLookup = {};
|
this.nodeLookup = {};
|
||||||
this.nodeTimeLookup = {};
|
this.nodeTimeLookup = {};
|
||||||
this.siteMapContent = null;
|
this.siteMapContent = new Map();
|
||||||
this.lastModified = 0;
|
this.lastModified = 0;
|
||||||
this.maxNodes = 50000;
|
this.maxPerPage = 50000;
|
||||||
}
|
}
|
||||||
|
|
||||||
generateXmlFromNodes() {
|
generateXmlFromNodes(page) {
|
||||||
// Get a mapping of node to timestamp
|
// Get a mapping of node to timestamp
|
||||||
let nodesToProcess = _.map(this.nodeLookup, (node, id) => {
|
let nodesToProcess = _.map(this.nodeLookup, (node, id) => {
|
||||||
return {
|
return {
|
||||||
|
@ -33,20 +33,23 @@ class BaseSiteMapGenerator {
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
// Limit to 50k nodes - this is a quick fix to prevent errors in google console
|
|
||||||
if (this.maxNodes) {
|
|
||||||
nodesToProcess = nodesToProcess.slice(0, this.maxNodes);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort nodes by timestamp
|
// Sort nodes by timestamp
|
||||||
nodesToProcess = _.sortBy(nodesToProcess, 'ts');
|
nodesToProcess = _.sortBy(nodesToProcess, 'ts');
|
||||||
|
|
||||||
|
// Get the page of nodes that was requested
|
||||||
|
nodesToProcess = nodesToProcess.slice((page - 1) * this.maxPerPage, page * this.maxPerPage);
|
||||||
|
|
||||||
|
// Do not generate empty sitemaps
|
||||||
|
if (nodesToProcess.length === 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
// Grab just the nodes
|
// Grab just the nodes
|
||||||
nodesToProcess = _.map(nodesToProcess, 'node');
|
const nodes = _.map(nodesToProcess, 'node');
|
||||||
|
|
||||||
const data = {
|
const data = {
|
||||||
// Concat the elements to the _attr declaration
|
// Concat the elements to the _attr declaration
|
||||||
urlset: [XMLNS_DECLS].concat(nodesToProcess)
|
urlset: [XMLNS_DECLS].concat(nodes)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Generate full xml
|
// Generate full xml
|
||||||
|
@ -67,7 +70,7 @@ class BaseSiteMapGenerator {
|
||||||
this.updateLastModified(datum);
|
this.updateLastModified(datum);
|
||||||
this.updateLookups(datum, node);
|
this.updateLookups(datum, node);
|
||||||
// force regeneration of xml
|
// force regeneration of xml
|
||||||
this.siteMapContent = null;
|
this.siteMapContent.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -75,7 +78,7 @@ class BaseSiteMapGenerator {
|
||||||
this.removeFromLookups(datum);
|
this.removeFromLookups(datum);
|
||||||
|
|
||||||
// force regeneration of xml
|
// force regeneration of xml
|
||||||
this.siteMapContent = null;
|
this.siteMapContent.clear();
|
||||||
this.lastModified = Date.now();
|
this.lastModified = Date.now();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -152,13 +155,13 @@ class BaseSiteMapGenerator {
|
||||||
return !!imageUrl;
|
return !!imageUrl;
|
||||||
}
|
}
|
||||||
|
|
||||||
getXml() {
|
getXml(page = 1) {
|
||||||
if (this.siteMapContent) {
|
if (this.siteMapContent.has(page)) {
|
||||||
return this.siteMapContent;
|
return this.siteMapContent.get(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
const content = this.generateXmlFromNodes();
|
const content = this.generateXmlFromNodes(page);
|
||||||
this.siteMapContent = content;
|
this.siteMapContent.set(page, content);
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -181,7 +184,7 @@ class BaseSiteMapGenerator {
|
||||||
reset() {
|
reset() {
|
||||||
this.nodeLookup = {};
|
this.nodeLookup = {};
|
||||||
this.nodeTimeLookup = {};
|
this.nodeTimeLookup = {};
|
||||||
this.siteMapContent = null;
|
this.siteMapContent.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,8 @@ const manager = new Manager();
|
||||||
// Responsible for handling requests for sitemap files
|
// Responsible for handling requests for sitemap files
|
||||||
module.exports = function handler(siteApp) {
|
module.exports = function handler(siteApp) {
|
||||||
const verifyResourceType = function verifyResourceType(req, res, next) {
|
const verifyResourceType = function verifyResourceType(req, res, next) {
|
||||||
if (!Object.prototype.hasOwnProperty.call(manager, req.params.resource)) {
|
const resourceWithoutPage = req.params.resource.replace(/-\d+$/, '');
|
||||||
|
if (!Object.prototype.hasOwnProperty.call(manager, resourceWithoutPage)) {
|
||||||
return res.sendStatus(404);
|
return res.sendStatus(404);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -22,14 +23,22 @@ module.exports = function handler(siteApp) {
|
||||||
});
|
});
|
||||||
|
|
||||||
siteApp.get('/sitemap-:resource.xml', verifyResourceType, function sitemapResourceXML(req, res) {
|
siteApp.get('/sitemap-:resource.xml', verifyResourceType, function sitemapResourceXML(req, res) {
|
||||||
const type = req.params.resource;
|
const type = req.params.resource.replace(/-\d+$/, '');
|
||||||
const page = 1;
|
const pageParam = (req.params.resource.match(/-(\d+)$/) || [null, null])[1];
|
||||||
|
const page = pageParam ? parseInt(pageParam, 10) : 1;
|
||||||
|
|
||||||
|
const content = manager.getSiteMapXml(type, page);
|
||||||
|
// Prevent x-1.xml as it is a duplicate of x.xml and empty sitemaps
|
||||||
|
// (except for the first page so that at least one sitemap exists per type)
|
||||||
|
if (pageParam === '1' || (!content && page !== 1)) {
|
||||||
|
return res.sendStatus(404);
|
||||||
|
}
|
||||||
|
|
||||||
res.set({
|
res.set({
|
||||||
'Cache-Control': 'public, max-age=' + config.get('caching:sitemap:maxAge'),
|
'Cache-Control': 'public, max-age=' + config.get('caching:sitemap:maxAge'),
|
||||||
'Content-Type': 'text/xml'
|
'Content-Type': 'text/xml'
|
||||||
});
|
});
|
||||||
|
|
||||||
res.send(manager.getSiteMapXml(type, page));
|
res.send(content);
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
|
@ -14,6 +14,7 @@ class SiteMapIndexGenerator {
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
options = options || {};
|
options = options || {};
|
||||||
this.types = options.types;
|
this.types = options.types;
|
||||||
|
this.maxPerPage = options.maxPerPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
getXml() {
|
getXml() {
|
||||||
|
@ -30,16 +31,25 @@ class SiteMapIndexGenerator {
|
||||||
|
|
||||||
generateSiteMapUrlElements() {
|
generateSiteMapUrlElements() {
|
||||||
return _.map(this.types, (resourceType) => {
|
return _.map(this.types, (resourceType) => {
|
||||||
const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + '.xml'}, true);
|
// `|| 1` = even if there are no items we still have an empty sitemap file
|
||||||
const lastModified = resourceType.lastModified;
|
const noOfPages = Math.ceil(Object.keys(resourceType.nodeLookup).length / this.maxPerPage) || 1;
|
||||||
|
const pages = [];
|
||||||
|
|
||||||
return {
|
for (let i = 0; i < noOfPages; i++) {
|
||||||
sitemap: [
|
const page = i === 0 ? '' : `-${i + 1}`;
|
||||||
{loc: url},
|
const url = urlUtils.urlFor({relativeUrl: '/sitemap-' + resourceType.name + page + '.xml'}, true);
|
||||||
{lastmod: moment(lastModified).toISOString()}
|
const lastModified = resourceType.lastModified;
|
||||||
]
|
|
||||||
};
|
pages.push({
|
||||||
});
|
sitemap: [
|
||||||
|
{loc: url},
|
||||||
|
{lastmod: moment(lastModified).toISOString()}
|
||||||
|
]
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
return pages;
|
||||||
|
}).flat();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -11,11 +11,13 @@ class SiteMapManager {
|
||||||
constructor(options) {
|
constructor(options) {
|
||||||
options = options || {};
|
options = options || {};
|
||||||
|
|
||||||
|
options.maxPerPage = options.maxPerPage || 50000;
|
||||||
|
|
||||||
this.pages = options.pages || this.createPagesGenerator(options);
|
this.pages = options.pages || this.createPagesGenerator(options);
|
||||||
this.posts = options.posts || this.createPostsGenerator(options);
|
this.posts = options.posts || this.createPostsGenerator(options);
|
||||||
this.users = this.authors = options.authors || this.createUsersGenerator(options);
|
this.users = this.authors = options.authors || this.createUsersGenerator(options);
|
||||||
this.tags = options.tags || this.createTagsGenerator(options);
|
this.tags = options.tags || this.createTagsGenerator(options);
|
||||||
this.index = options.index || this.createIndexGenerator();
|
this.index = options.index || this.createIndexGenerator(options);
|
||||||
|
|
||||||
events.on('router.created', (router) => {
|
events.on('router.created', (router) => {
|
||||||
if (router.name === 'StaticRoutesRouter') {
|
if (router.name === 'StaticRoutesRouter') {
|
||||||
|
@ -43,14 +45,15 @@ class SiteMapManager {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
createIndexGenerator() {
|
createIndexGenerator(options) {
|
||||||
return new IndexMapGenerator({
|
return new IndexMapGenerator({
|
||||||
types: {
|
types: {
|
||||||
pages: this.pages,
|
pages: this.pages,
|
||||||
posts: this.posts,
|
posts: this.posts,
|
||||||
authors: this.authors,
|
authors: this.authors,
|
||||||
tags: this.tags
|
tags: this.tags
|
||||||
}
|
},
|
||||||
|
maxPerPage: options.maxPerPage
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -74,8 +77,8 @@ class SiteMapManager {
|
||||||
return this.index.getXml();
|
return this.index.getXml();
|
||||||
}
|
}
|
||||||
|
|
||||||
getSiteMapXml(type) {
|
getSiteMapXml(type, page) {
|
||||||
return this[type].getXml();
|
return this[type].getXml(page);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -53,7 +53,7 @@ describe('Generators', function () {
|
||||||
});
|
});
|
||||||
|
|
||||||
it('max node setting results in the right number of nodes', function () {
|
it('max node setting results in the right number of nodes', function () {
|
||||||
generator = new PostGenerator({maxNodes: 5});
|
generator = new PostGenerator({maxPerPage: 5});
|
||||||
|
|
||||||
for (let i = 0; i < 10; i++) {
|
for (let i = 0; i < 10; i++) {
|
||||||
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
|
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
|
||||||
|
@ -70,12 +70,12 @@ describe('Generators', function () {
|
||||||
Object.keys(generator.nodeLookup).should.be.Array().with.lengthOf(10);
|
Object.keys(generator.nodeLookup).should.be.Array().with.lengthOf(10);
|
||||||
|
|
||||||
// But only 5 are output in the xml
|
// But only 5 are output in the xml
|
||||||
generator.siteMapContent.match(/<loc>/g).should.be.Array().with.lengthOf(5);
|
generator.siteMapContent.get(1).match(/<loc>/g).should.be.Array().with.lengthOf(5);
|
||||||
});
|
});
|
||||||
|
|
||||||
it('default is 50k', function () {
|
it('default is 50k', function () {
|
||||||
generator = new PostGenerator();
|
generator = new PostGenerator();
|
||||||
generator.maxNodes.should.eql(50000);
|
generator.maxPerPage.should.eql(50000);
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('IndexGenerator', function () {
|
describe('IndexGenerator', function () {
|
||||||
|
@ -86,7 +86,8 @@ describe('Generators', function () {
|
||||||
pages: new PageGenerator(),
|
pages: new PageGenerator(),
|
||||||
tags: new TagGenerator(),
|
tags: new TagGenerator(),
|
||||||
authors: new UserGenerator()
|
authors: new UserGenerator()
|
||||||
}
|
},
|
||||||
|
maxPerPage: 5
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -99,6 +100,21 @@ describe('Generators', function () {
|
||||||
xml.should.match(/sitemap-pages.xml/);
|
xml.should.match(/sitemap-pages.xml/);
|
||||||
xml.should.match(/sitemap-authors.xml/);
|
xml.should.match(/sitemap-authors.xml/);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('creates multiple pages when there are too many posts', function () {
|
||||||
|
for (let i = 0; i < 10; i++) {
|
||||||
|
generator.types.posts.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
|
||||||
|
created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200,
|
||||||
|
updated_at: null,
|
||||||
|
published_at: null,
|
||||||
|
slug: `episode-${i}`
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
const xml = generator.getXml();
|
||||||
|
|
||||||
|
xml.should.match(/sitemap-posts.xml/);
|
||||||
|
xml.should.match(/sitemap-posts-2.xml/);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -126,9 +142,9 @@ describe('Generators', function () {
|
||||||
|
|
||||||
it('get cached xml', function () {
|
it('get cached xml', function () {
|
||||||
sinon.spy(generator, 'generateXmlFromNodes');
|
sinon.spy(generator, 'generateXmlFromNodes');
|
||||||
generator.siteMapContent = 'something';
|
generator.siteMapContent.set(1, 'something');
|
||||||
generator.getXml().should.eql('something');
|
generator.getXml().should.eql('something');
|
||||||
generator.siteMapContent = null;
|
generator.siteMapContent.clear();
|
||||||
generator.generateXmlFromNodes.called.should.eql(false);
|
generator.generateXmlFromNodes.called.should.eql(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -184,6 +200,32 @@ describe('Generators', function () {
|
||||||
idxFirst.should.be.below(idxSecond);
|
idxFirst.should.be.below(idxSecond);
|
||||||
idxSecond.should.be.below(idxThird);
|
idxSecond.should.be.below(idxThird);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('creates multiple pages when there are too many posts', function () {
|
||||||
|
generator.maxPerPage = 5;
|
||||||
|
urlUtils.urlFor.withArgs('sitemap_xsl', true).returns('http://my-ghost-blog.com/sitemap.xsl');
|
||||||
|
for (let i = 0; i < 10; i++) {
|
||||||
|
generator.addUrl(`http://my-ghost-blog.com/episode-${i}/`, testUtils.DataGenerator.forKnex.createPost({
|
||||||
|
created_at: (Date.UTC(2014, 11, 22, 12) - 360000) + 200,
|
||||||
|
updated_at: null,
|
||||||
|
published_at: null,
|
||||||
|
slug: `episode-${i}`
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
const pages = [generator.getXml(), generator.getXml(2)];
|
||||||
|
|
||||||
|
for (let i = 0; i < 10; i++) {
|
||||||
|
const pageIndex = Math.floor(i / 5);
|
||||||
|
pages[pageIndex].should.containEql(`<loc>http://my-ghost-blog.com/episode-${i}/</loc>`);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
it('shouldn\'t break with out of bounds pages', function () {
|
||||||
|
should.not.exist(generator.getXml(-1));
|
||||||
|
should.not.exist(generator.getXml(99999));
|
||||||
|
should.not.exist(generator.getXml(0));
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('fn: removeUrl', function () {
|
describe('fn: removeUrl', function () {
|
||||||
|
@ -224,12 +266,12 @@ describe('Generators', function () {
|
||||||
|
|
||||||
generator.getXml();
|
generator.getXml();
|
||||||
|
|
||||||
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/home/</loc>');
|
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/home/</loc>');
|
||||||
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/magic/</loc>');
|
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/magic/</loc>');
|
||||||
generator.siteMapContent.should.containEql('<loc>http://my-ghost-blog.com/subscribe/</loc>');
|
generator.siteMapContent.get(1).should.containEql('<loc>http://my-ghost-blog.com/subscribe/</loc>');
|
||||||
|
|
||||||
// <loc> should exist exactly one time
|
// <loc> should exist exactly one time
|
||||||
generator.siteMapContent.match(/<loc>/g).length.should.eql(3);
|
generator.siteMapContent.get(1).match(/<loc>/g).length.should.eql(3);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
Loading…
Add table
Reference in a new issue