0
Fork 0
mirror of https://github.com/TryGhost/Ghost.git synced 2025-04-08 02:52:39 -05:00

Increased speed of importer

no issue

- change behaviour from updating user references after the actual import to update the user reference before the actual import
  - updating user references after the import is way less case intense
  - that was the initial decision for updating the references afterwards
  - but that does not play well with adding nested relations by identifier
- the refactoring is required for multiple authors
  - if we e.g. store invalid author id's, we won't be able to add a belongs-to-many relation for multiple authors
  - bookshelf-relations is generic and always tries to find a matching target before attching a model
  - invalid user references won't work anymore
- this change has a very good side affect
  - 17mb takes on master ~1,5seconds
    - on this branch it takes ~45seconds
  - also the memory usage is way lower and stabler
  - 40mb takes 1,6s (times out on master)
This commit is contained in:
kirrg001 2018-01-28 13:13:11 +01:00 committed by Katharina Irrgang
parent 12724df8e4
commit 5a4dd6b792
9 changed files with 267 additions and 113 deletions

View file

@ -1,11 +1,12 @@
'use strict';
const debug = require('ghost-ignition').debug('importer:base'),
_ = require('lodash'),
Promise = require('bluebird'),
ObjectId = require('bson-objectid'),
common = require('../../../../lib/common'),
sequence = require('../../../../lib/promise/sequence'),
models = require('../../../../models'),
_ = require('lodash'),
Promise = require('bluebird');
models = require('../../../../models');
class Base {
constructor(allDataFromFile, options) {
@ -29,9 +30,16 @@ class Base {
this.dataKeyToImport = options.dataKeyToImport;
this.dataToImport = _.cloneDeep(allDataFromFile[this.dataKeyToImport] || []);
this.importedDataToReturn = [];
this.importedData = [];
this.options.requiredImportedData = ['users'];
this.options.requiredExistingData = ['users'];
this.requiredFromFile = {};
this.requiredImportedData = {};
this.requiredExistingData = {};
if (!this.options.requiredFromFile) {
this.options.requiredFromFile = ['users'];
@ -76,9 +84,20 @@ class Base {
});
}
generateIdentifier() {
_.each(this.dataToImport, (obj) => {
obj.id = ObjectId.generate();
});
}
fetchExisting() {
return Promise.resolve();
}
beforeImport() {
this.stripProperties(['id']);
this.sanitizeValues();
this.generateIdentifier();
return Promise.resolve();
}
@ -146,6 +165,130 @@ class Base {
return Promise.reject(errorsToReject);
}
/**
* Data is now prepared. Last step is to replace identifiers.
*
* `dataToImport`: the objects to import (contain the new ID already)
* `requiredExistingData`: the importer allows you to ask for existing database objects
* `requiredFromFile`: the importer allows you to ask for data from the file
* `requiredImportedData`: the importer allows you to ask for already imported data
*/
replaceIdentifiers() {
const ownerUserId = _.find(this.requiredExistingData.users, (user) => {
if (user.roles[0].name === 'Owner') {
return true;
}
}).id;
let userReferenceProblems = {};
const handleObject = (obj, key) => {
if (!obj.hasOwnProperty(key)) {
return;
}
// CASE: you import null, fallback to owner
if (!obj[key]) {
if (!userReferenceProblems[obj.id]) {
userReferenceProblems[obj.id] = {obj: obj, keys: []};
}
userReferenceProblems[obj.id].keys.push(key);
obj[key] = ownerUserId;
return;
}
// CASE: first match the user reference with in the imported file
let userFromFile = _.find(this.requiredFromFile.users, {id: obj[key]});
if (!userFromFile) {
// CASE: if user does not exist in file, try to lookup the existing db users
let existingUser = _.find(this.requiredExistingData.users, {id: obj[key].toString()});
// CASE: fallback to owner
if (!existingUser) {
if (!userReferenceProblems[obj.id]) {
userReferenceProblems[obj.id] = {obj: obj, keys: []};
}
userReferenceProblems[obj.id].keys.push(key);
obj[key] = ownerUserId;
return;
} else {
// CASE: user exists in the database, ID is correct, skip
return;
}
}
// CASE: users table is the first data we insert. we have no access to the imported data yet
// Result: `this.requiredImportedData.users` will be empty.
// We already generate identifiers for each object in the importer layer. Accessible via `dataToImport`.
if (this.modelName === 'User' && !this.requiredImportedData.users.length) {
let userToImport = _.find(this.dataToImport, {slug: userFromFile.slug});
if (userToImport) {
obj[key] = userToImport.id;
return;
} else {
// CASE: unknown
return;
}
}
// CASE: user exists in the file, let's find his db id
// NOTE: lookup by email, because slug can change on insert
let importedUser = _.find(this.requiredImportedData.users, {email: userFromFile.email});
// CASE: found. let's assign the new ID
if (importedUser) {
obj[key] = importedUser.id;
return;
}
// CASE: user was not imported, let's figure out if the user exists in the database
let existingUser = _.find(this.requiredExistingData.users, {slug: userFromFile.slug});
if (!existingUser) {
// CASE: let's try by ID
existingUser = _.find(this.requiredExistingData.users, {id: userFromFile.id.toString()});
if (!existingUser) {
if (!userReferenceProblems[obj.id]) {
userReferenceProblems[obj.id] = {obj: obj, keys: []};
}
userReferenceProblems[obj.id].keys.push(key);
obj[key] = ownerUserId;
}
} else {
obj[key] = existingUser.id;
}
};
// Iterate over all possible user relations
_.each(this.dataToImport, (obj) => {
_.each([
'author_id',
'published_by',
'created_by',
'updated_by'
], (key) => {
return handleObject(obj, key);
});
});
_.each(userReferenceProblems, (entry) => {
this.problems.push({
message: 'Entry was imported, but we were not able to resolve the following user references: ' +
entry.keys.join(', ') + '. The user does not exist, fallback to owner user.',
help: this.modelName,
context: JSON.stringify(entry.obj)
});
});
}
doImport(options, importOptions) {
debug('doImport', this.modelName, this.dataToImport.length);
@ -163,6 +306,13 @@ class Base {
this.importedDataToReturn.push(importedModel.toJSON());
}
// for identifier lookup
this.importedData.push({
id: importedModel.id,
slug: importedModel.get('slug'),
email: importedModel.get('email')
});
return importedModel;
})
.catch((err) => {
@ -181,75 +331,6 @@ class Base {
*/
return sequence(ops);
}
/**
* Update all user reference fields e.g. published_by
*
* Background:
* - we never import the id field
* - almost each imported model has a reference to a user reference
* - we update all fields after the import (!)
*/
afterImport(options) {
debug('afterImport', this.modelName);
return models.User.getOwnerUser(options)
.then((ownerUser) => {
return Promise.each(this.dataToImport, (obj) => {
if (!obj.model) {
return;
}
return Promise.each(['author_id', 'published_by', 'created_by', 'updated_by'], (key) => {
// CASE: not all fields exist on each model, skip them if so
if (!obj[key]) {
return Promise.resolve();
}
let oldUser = _.find(this.requiredFromFile.users, {id: obj[key]});
if (!oldUser) {
this.problems.push({
message: 'Entry was imported, but we were not able to update user reference field: ' +
key + '. The user does not exist, fallback to owner user.',
help: this.modelName,
context: JSON.stringify(obj)
});
oldUser = {
email: ownerUser.get('email')
};
}
return models.User.findOne({
email: oldUser.email,
status: 'all'
}, options).then((userModel) => {
// CASE: user could not be imported e.g. multiple roles attached
if (!userModel) {
userModel = {
id: ownerUser.id
};
}
let dataToEdit = {};
dataToEdit[key] = userModel.id;
let context;
// CASE: updated_by is taken from the context object
if (key === 'updated_by') {
context = {context: {user: userModel.id}};
} else {
context = {};
}
return models[this.modelName].edit(dataToEdit, _.merge({}, options, {id: obj.model.id}, context));
});
});
});
});
}
}
module.exports = Base;

View file

@ -20,9 +20,9 @@ DataImporter = {
},
init: function init(importData) {
importers.users = new UsersImporter(importData.data);
importers.roles = new RolesImporter(importData.data);
importers.tags = new TagsImporter(importData.data);
importers.users = new UsersImporter(importData.data);
importers.subscribers = new SubscribersImporter(importData.data);
importers.posts = new PostsImporter(importData.data);
importers.settings = new SettingsImporter(importData.data);
@ -48,6 +48,7 @@ DataImporter = {
if (importOptions.importPersistUser) {
modelOptions.importPersistUser = importOptions.importPersistUser;
}
this.init(importData);
return models.Base.transaction(function (transacting) {
@ -55,7 +56,25 @@ DataImporter = {
_.each(importers, function (importer) {
ops.push(function doModelImport() {
return importer.beforeImport(modelOptions, importOptions)
return importer.fetchExisting(modelOptions, importOptions)
.then(function () {
return importer.beforeImport(modelOptions, importOptions);
})
.then(function () {
if (importer.options.requiredImportedData.length) {
_.each(importer.options.requiredImportedData, (key) => {
importer.requiredImportedData[key] = importers[key].importedData;
});
}
if (importer.options.requiredExistingData.length) {
_.each(importer.options.requiredExistingData, (key) => {
importer.requiredExistingData[key] = importers[key].existingData;
});
}
return importer.replaceIdentifiers(modelOptions, importOptions);
})
.then(function () {
return importer.doImport(modelOptions, importOptions)
.then(function (_results) {
@ -65,12 +84,6 @@ DataImporter = {
});
});
_.each(importers, function (importer) {
ops.push(function afterImport() {
return importer.afterImport(modelOptions);
});
});
sequence(ops)
.then(function () {
results.forEach(function (promise) {

View file

@ -66,6 +66,10 @@ class SettingsImporter extends BaseImporter {
return super.beforeImport();
}
generateIdentifier() {
return Promise.resolve();
}
doImport(options) {
debug('doImport', this.dataToImport.length);

View file

@ -29,7 +29,7 @@ class TagsImporter extends BaseImporter {
* Background:
* - the tag model is smart enough to regenerate unique fields
* - so if you import a tag name "test" and the same tag name exists, it would add "test-2"
* - that's we add a protection here to first find the tag
* - that's why we add a protection here to first find the tag
*
* @TODO: Add a flag to the base implementation e.g. `fetchBeforeAdd`
*/

View file

@ -2,7 +2,8 @@
const debug = require('ghost-ignition').debug('importer:users'),
_ = require('lodash'),
BaseImporter = require('./base');
BaseImporter = require('./base'),
models = require('../../../../models');
class UsersImporter extends BaseImporter {
constructor(allDataFromFile) {
@ -20,6 +21,13 @@ class UsersImporter extends BaseImporter {
};
}
fetchExisting(modelOptions) {
return models.User.findAll(_.merge({columns: ['id', 'slug', 'email'], withRelated: ['roles']}, modelOptions))
.then((existingData) => {
this.existingData = existingData.toJSON();
});
}
/**
* - by default all imported users are locked and get a random password
* - they have to follow the password forgotten flow

View file

@ -44,7 +44,7 @@ ghostBookshelf.plugin(plugins.collision);
// Manages nested updates (relationships)
ghostBookshelf.plugin('bookshelf-relations', {
allowedOptions: ['context'],
allowedOptions: ['context', 'importing'],
unsetRelations: true,
hooks: {
belongsToMany: {
@ -179,7 +179,9 @@ ghostBookshelf.Model = ghostBookshelf.Model.extend({
}
}
this.set('updated_by', this.contextUser(options));
if (!options.importing) {
this.set('updated_by', this.contextUser(options));
}
if (!newObj.get('created_at')) {
newObj.set('created_at', new Date());
@ -210,7 +212,9 @@ ghostBookshelf.Model = ghostBookshelf.Model.extend({
* - if no context
*/
onUpdating: function onUpdating(newObj, attr, options) {
this.set('updated_by', this.contextUser(options));
if (!options.importing) {
this.set('updated_by', this.contextUser(options));
}
if (options && options.context && !options.internal && !options.importing) {
if (newObj.hasDateChanged('created_at', {beforeWrite: true})) {

View file

@ -54,7 +54,7 @@ describe('Import', function () {
}).catch(done);
});
it('removes duplicate posts', function (done) {
it('removes duplicate posts, cares about invalid dates', function (done) {
var exportData;
testUtils.fixtures.loadExportFixture('export-003', {lts: true}).then(function (exported) {
@ -63,7 +63,26 @@ describe('Import', function () {
}).then(function (importResult) {
should.exist(importResult.data.posts);
importResult.data.posts.length.should.equal(1);
importResult.problems.length.should.eql(3);
importResult.problems.length.should.eql(4);
importResult.problems[0].message.should.eql('Entry was imported, but we were not able to ' +
'resolve the following user references: updated_by. The user does not exist, fallback to owner user.');
importResult.problems[0].help.should.eql('User');
importResult.problems[1].message.should.eql('Entry was not imported and ignored. Detected duplicated entry.');
importResult.problems[1].help.should.eql('User');
importResult.problems[2].message.should.eql('Date is in a wrong format and invalid. ' +
'It was replaced with the current timestamp.');
importResult.problems[2].help.should.eql('Post');
importResult.problems[3].message.should.eql('Theme not imported, please upload in Settings - Design');
importResult.problems[3].help.should.eql('Settings');
moment(importResult.data.posts[0].created_at).isValid().should.eql(true);
moment(importResult.data.posts[0].updated_at).format().should.eql('2013-10-18T23:58:44Z');
moment(importResult.data.posts[0].published_at).format().should.eql('2013-12-29T11:58:30Z');
moment(importResult.data.tags[0].updated_at).format().should.eql('2016-07-17T12:02:54Z');
done();
}).catch(done);
@ -91,6 +110,7 @@ describe('Import', function () {
testUtils.fixtures.loadExportFixture('export-003-duplicate-tags', {lts: true}).then(function (exported) {
exportData = exported;
return dataImporter.doImport(exportData, importOptions);
}).then(function (importResult) {
should.exist(importResult.data.tags);
@ -106,30 +126,40 @@ describe('Import', function () {
return postTag.tag_id !== 2;
});
importResult.problems.length.should.equal(3);
importResult.problems.length.should.equal(4);
importResult.problems[2].message.should.equal('Theme not imported, please upload in Settings - Design');
importResult.problems[0].message.should.eql('Entry was imported, but we were not able to ' +
'resolve the following user references: updated_by. The user does not exist, fallback to owner user.');
importResult.problems[0].help.should.eql('User');
importResult.problems[1].message.should.eql('Entry was not imported and ignored. Detected duplicated entry.');
importResult.problems[1].help.should.eql('User');
importResult.problems[2].message.should.eql('Entry was not imported and ignored. Detected duplicated entry.');
importResult.problems[2].help.should.eql('Tag');
importResult.problems[3].message.should.eql('Theme not imported, please upload in Settings - Design');
importResult.problems[3].help.should.eql('Settings');
done();
}).catch(done);
});
it('cares about invalid dates', function (done) {
it('removes broken tags from post (not in db, not in file)', function (done) {
var exportData;
testUtils.fixtures.loadExportFixture('export-003', {lts: true}).then(function (exported) {
testUtils.fixtures.loadExportFixture('export-003-broken-tags', {lts: true}).then(function (exported) {
exportData = exported;
return dataImporter.doImport(exportData, importOptions);
}).then(function (importResult) {
should.exist(importResult.data.posts);
importResult.data.posts.length.should.equal(1);
importResult.problems.length.should.eql(3);
moment(importResult.data.posts[0].created_at).isValid().should.eql(true);
moment(importResult.data.posts[0].updated_at).format().should.eql('2013-10-18T23:58:44Z');
moment(importResult.data.posts[0].published_at).format().should.eql('2013-12-29T11:58:30Z');
moment(importResult.data.tags[0].updated_at).format().should.eql('2016-07-17T12:02:54Z');
}).then(function () {
return Promise.all([
models.Post.findPage({withRelated: ['tags']})
]);
}).then(function (data) {
data[0].posts.length.should.eql(1);
data[0].posts[0].slug.should.eql('welcome-to-ghost-2');
data[0].posts[0].tags.length.should.eql(0);
done();
}).catch(done);
});
@ -392,7 +422,7 @@ describe('Import', function () {
}).then(function () {
done(new Error('Allowed import of duplicate data'));
}).catch(function (response) {
response.length.should.equal(4);
response.length.should.equal(3);
// NOTE: a duplicated tag.slug is a warning
response[0].errorType.should.equal('ValidationError');
@ -400,9 +430,8 @@ describe('Import', function () {
response[1].errorType.should.equal('ValidationError');
response[1].message.should.eql('Value in [posts.title] cannot be blank.');
response[2].errorType.should.equal('ValidationError');
response[2].message.should.eql('Value in [tags.name] cannot be blank.');
response[3].errorType.should.equal('ValidationError');
response[3].message.should.eql('Value in [settings.key] cannot be blank.');
response[2].message.should.eql('Value in [settings.key] cannot be blank.');
done();
}).catch(done);
});
@ -416,13 +445,17 @@ describe('Import', function () {
}).then(function (importedData) {
importedData.data.posts.length.should.eql(1);
importedData.problems.length.should.eql(3);
importedData.problems[0].message.should.eql('Entry was not imported and ignored. Detected duplicated entry.');
importedData.problems[0].help.should.eql('Tag');
importedData.problems.length.should.eql(4);
importedData.problems[0].message.should.eql('Entry was imported, but we were not able to ' +
'resolve the following user references: updated_by. The user does not exist, fallback to owner user.');
importedData.problems[0].help.should.eql('User');
importedData.problems[1].message.should.eql('Entry was not imported and ignored. Detected duplicated entry.');
importedData.problems[1].help.should.eql('Tag');
importedData.problems[2].message.should.eql('Entry was not imported and ignored. Detected duplicated entry.');
importedData.problems[2].help.should.eql('Post');
importedData.problems[2].help.should.eql('Tag');
importedData.problems[3].message.should.eql('Entry was not imported and ignored. Detected duplicated entry.');
importedData.problems[3].help.should.eql('Post');
done();
}).catch(done);
});
@ -437,9 +470,16 @@ describe('Import', function () {
// NOTE: we detect invalid author references as warnings, because ember can handle this
// The owner can simply update the author reference in the UI
importedData.problems.length.should.eql(3);
importedData.problems[0].message.should.eql('Entry was imported, but we were not able to resolve the ' +
'following user references: updated_by. The user does not exist, fallback to owner user.');
importedData.problems[0].help.should.eql('User');
importedData.problems[1].message.should.eql('Entry was not imported and ignored. Detected duplicated entry.');
importedData.problems[1].help.should.eql('User');
importedData.problems[2].message.should.eql('Entry was imported, but we were not able to ' +
'update user reference field: published_by. The user does not exist, fallback to owner user.');
importedData.problems[2].help.should.eql('Post');
'resolve the following user references: author_id, published_by. The user does not exist, fallback to owner user.');
// Grab the data from tables
return Promise.all([
@ -1530,12 +1570,16 @@ describe('Import (new test structure)', function () {
// Check feature image is correctly mapped for a post
firstPost.feature_image.should.eql('/content/images/2017/05/post-image.jpg');
// Check logo and cover images are correctly mapped for a user
users[1].cover_image.should.eql(exportData.data.users[0].cover);
users[1].profile_image.should.eql(exportData.data.users[0].image);
// Check feature image is correctly mapped for a tag
tags[0].feature_image.should.eql(exportData.data.tags[0].image);
// updated_by: 2 could not be resolved, fallback to owner
settings[5].updated_by.should.eql(users[0].id);
// Check logo image is correctly mapped for a blog
settings[5].key.should.eql('logo');
settings[5].value.should.eql('/content/images/2017/05/bloglogo.jpeg');

View file

@ -131,7 +131,7 @@
"created_at": 1388319501897,
"created_by": 1,
"updated_at": null,
"updated_by": null
"updated_by": 10
},
{
"id": 3,

View file

@ -1366,7 +1366,7 @@
"created_at": "2016-10-28T13:43:36.000Z",
"created_by": 1,
"updated_at": "2017-05-30T12:03:55.000Z",
"updated_by": 1
"updated_by": 2
},
{
"id": 10,