0
Fork 0
mirror of https://github.com/TryGhost/Ghost.git synced 2025-03-04 02:01:58 -05:00

🌐 Improved sodo-search for CJK (#21148)

no ref

According to the flexsearch documentation (https://github.com/nextapps-de/flexsearch?tab=readme-ov-file#cjk-word-break-chinese-japanese-korean), searching CJK text requires passing in a custom encode function to get better search results.

This enhancement for CJK will only take effect when the ghost site locale is set to one of `zh`, `zh-Hans`, `zh-Hant`, `ja`, `ko`.

Co-authored-by: Cathy Sarisky <42299862+cathysarisky@users.noreply.github.com>
This commit is contained in:
ココロ 2024-10-02 02:01:54 +08:00 committed by GitHub
parent 3bbe8c8c7a
commit 300eba49ca
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -15,7 +15,8 @@ export default class SearchIndex {
id: 'id',
index: ['title', 'excerpt'],
store: true
}
},
...this.#getEncodeOptions()
});
this.authorsIndex = new Flexsearch.Document({
tokenize: 'forward',
@ -23,7 +24,8 @@ export default class SearchIndex {
id: 'id',
index: ['name'],
store: true
}
},
...this.#getEncodeOptions()
});
this.tagsIndex = new Flexsearch.Document({
tokenize: 'forward',
@ -31,7 +33,8 @@ export default class SearchIndex {
id: 'id',
index: ['name'],
store: true
}
},
...this.#getEncodeOptions()
});
this.init = this.init.bind(this);
@ -133,4 +136,17 @@ export default class SearchIndex {
tags: this.#normalizeSearchResult(tags)
};
}
#getEncodeOptions() {
const regex = new RegExp(
`[\u{4E00}-\u{9FFF}\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{AC00}-\u{D7A3}\u{3400}-\u{4DBF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{30000}-\u{3134F}\u{31350}-\u{323AF}\u{2EBF0}-\u{2EE5F}\u{F900}-\u{FAFF}\u{2F800}-\u{2FA1F}]|[0-9A-Za-zа\u00C0-\u017F\u0400-\u04FF\u0600-\u06FF\u0980-\u09FF\u1E00-\u1EFF]+`,
'mug'
);
return {
encode: (str) => {
return ('' + str).toLowerCase().match(regex) ?? [];
}
};
}
}