From 300eba49ca73d301b1bdc53c4160fbeb81c19ff0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=82=B3=E3=82=B3=E3=83=AD?= <4946624+shincurry@users.noreply.github.com> Date: Wed, 2 Oct 2024 02:01:54 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=8C=90=20Improved=20sodo-search=20for=20C?= =?UTF-8?q?JK=20(#21148)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit no ref According to the FlexSearch documentation (https://github.com/nextapps-de/flexsearch?tab=readme-ov-file#cjk-word-break-chinese-japanese-korean), searching CJK text requires passing in a custom encode function for better search results. This enhancement for CJK will only take effect when the Ghost site locale is set to one of `zh`, `zh-Hans`, `zh-Hant`, `ja`, `ko`. Co-authored-by: Cathy Sarisky <42299862+cathysarisky@users.noreply.github.com> --- apps/sodo-search/src/search-index.js | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/apps/sodo-search/src/search-index.js b/apps/sodo-search/src/search-index.js index a70595e42d..89f9dcf363 100644 --- a/apps/sodo-search/src/search-index.js +++ b/apps/sodo-search/src/search-index.js @@ -15,7 +15,8 @@ export default class SearchIndex { id: 'id', index: ['title', 'excerpt'], store: true - } + }, + ...this.#getEncodeOptions() }); this.authorsIndex = new Flexsearch.Document({ tokenize: 'forward', @@ -23,7 +24,8 @@ export default class SearchIndex { id: 'id', index: ['name'], store: true - } + }, + ...this.#getEncodeOptions() }); this.tagsIndex = new Flexsearch.Document({ tokenize: 'forward', @@ -31,7 +33,8 @@ export default class SearchIndex { id: 'id', index: ['name'], store: true - } + }, + ...this.#getEncodeOptions() }); this.init = this.init.bind(this); @@ -133,4 +136,17 @@ export default class SearchIndex { tags: this.#normalizeSearchResult(tags) }; } + + #getEncodeOptions() { + const regex = new RegExp( + 
`[\u{4E00}-\u{9FFF}\u{3040}-\u{309F}\u{30A0}-\u{30FF}\u{AC00}-\u{D7A3}\u{3400}-\u{4DBF}\u{20000}-\u{2A6DF}\u{2A700}-\u{2B73F}\u{2B740}-\u{2B81F}\u{2B820}-\u{2CEAF}\u{2CEB0}-\u{2EBEF}\u{30000}-\u{3134F}\u{31350}-\u{323AF}\u{2EBF0}-\u{2EE5F}\u{F900}-\u{FAFF}\u{2F800}-\u{2FA1F}]|[0-9A-Za-zа-я\u00C0-\u017F\u0400-\u04FF\u0600-\u06FF\u0980-\u09FF\u1E00-\u1EFF]+`, + 'mug' + ); + + return { + encode: (str) => { + return ('' + str).toLowerCase().match(regex) ?? []; + } + }; + } }