1
0
mirror of https://github.com/nextapps-de/flexsearch.git synced 2025-10-01 15:46:40 +02:00

document resolver, reduce code duplication, refactoring

This commit is contained in:
Thomas Wilkerling
2025-03-19 14:33:22 +01:00
parent 0455fa4b74
commit d9f235ce3b
107 changed files with 11949 additions and 10740 deletions

View File

@@ -1,5 +1,5 @@
import { parse_option } from "./common.js";
import { merge_option } from "./common.js";
import normalize_polyfill from "./charset/normalize.js";
import { EncoderOptions } from "./type.js";
@@ -23,21 +23,26 @@ function fixedEncoder(string){
return [string]
}
Built-in Encoder (Workflow)
Built-in Encoder
----------------------------
The main workflow follows an increasing strategy,
starting from a simple .toLowerCase() to full RegExp
Pipeline:
1. apply this.normalize: charset normalization:
applied on the whole input string e.g. lowercase,
will also apply on: filter, matcher, stemmer, mapper
2. apply this.split: split input into terms (includes/excludes)
3. apply this.filter (pre-filter)
4. apply this.matcher (replace terms)
5. apply this.stemmer (replace term endings)
6. apply this.filter (post-filter)
7. apply this.mapper (replace chars)
8. apply this.replacer (custom regex)
everything you add later (filter, matcher, stemmer, mapper, etc.)
must already be normalized, because normalization is not applied to these entries automatically
2. apply this.prepare (custom preparation, string in - string out)
3. split numerics into triplets when not surrounded by a letter
4. apply this.split: split input into terms (includes/excludes)
5. apply this.filter (pre-filter)
6. apply this.stemmer (replace term endings)
7. apply this.filter (post-filter)
8. apply this.mapper (replace chars)
9. apply this.dedupe (letter deduplication)
10. apply this.finalize
10. apply this.matcher (replace terms)
11. apply this.replacer (custom regex)
12. apply this.finalize
*/
const whitespace = /[^\p{L}\p{N}]+/u,
@@ -62,6 +67,9 @@ const whitespace = /[^\p{L}\p{N}]+/u,
export default function Encoder() {
if (!this || this.constructor !== Encoder) {
// let args = Array.prototype.slice.call(arguments);
// args.unshift(Encoder);
// return new (Encoder.bind.apply(Encoder, args));
return new Encoder(...arguments);
}
@@ -79,8 +87,9 @@ Encoder.prototype.assign = function (options) {
* pre-processing string input
* @type {Function|boolean}
*/
this.normalize = /** @type {Function|boolean} */parse_option(options.normalize, /* tag? */ /* stringify */ /* stringify */ /* single param */ /* skip update: */ /* append: */ /* skip update: */ /* skip_update: */ /* skip deletion */!0 /*await rows.hasNext()*/ /*await rows.hasNext()*/
/*await rows.hasNext()*/, this.normalize);
this.normalize = /** @type {Function|boolean} */merge_option(options.normalize, /* tag? */ /* stringify */ /* stringify */ /* single param */ /* skip update: */ /* append: */ /* skip update: */
/* skip_update: */
/* skip deletion */!0 /*await rows.hasNext()*/ /*await rows.hasNext()*/ /*await rows.hasNext()*/, this.normalize);
// {
// letter: true,
@@ -93,68 +102,78 @@ Encoder.prototype.assign = function (options) {
// }
let include = options.include,
tmp = include || options.exclude || options.split;
tmp = include || options.exclude || options.split,
numeric;
if ("object" == typeof tmp) {
let numeric = !include,
regex = "";
if (tmp || "" === tmp) {
if ("object" == typeof tmp && tmp.constructor !== RegExp) {
let regex = "";
numeric = !include;
// split on whitespace by default
include || (regex += "\\p{Z}");
if (tmp.letter) {
regex += "\\p{L}";
}
if (tmp.number) {
regex += "\\p{N}";
numeric = !!include;
}
if (tmp.symbol) {
regex += "\\p{S}";
}
if (tmp.punctuation) {
regex += "\\p{P}";
}
if (tmp.control) {
regex += "\\p{C}";
}
if (tmp = tmp.char) {
regex += "object" == typeof tmp ? tmp.join("") : tmp;
}
// split on whitespace by default
options.include || (regex += "\\p{Z}");
if (tmp.letter) {
regex += "\\p{L}";
}
if (tmp.number) {
regex += "\\p{N}";
numeric = !!include;
}
if (tmp.symbol) {
regex += "\\p{S}";
}
if (tmp.punctuation) {
regex += "\\p{P}";
}
if (tmp.control) {
regex += "\\p{C}";
}
if (tmp = tmp.char) {
regex += "object" == typeof tmp ? tmp.join("") : tmp;
try {
// https://github.com/nextapps-de/flexsearch/issues/410
/**
* split string input into terms
* @type {string|RegExp|boolean|null}
*/
this.split = new RegExp("[" + (include ? "^" : "") + regex + "]+", "u");
} catch (e) {
console.error("Your split configuration:", tmp, "is not supported on this platform. It falls back to using simple whitespace splitter instead: /\s+/.");
// fallback to a simple whitespace splitter
this.split = /\s+/;
}
} else {
this.split = /** @type {string|RegExp|boolean} */tmp;
// determine numeric encoding
numeric = /* suggest */ /* append: */ /* enrich */!1 === tmp || 2 > "a1a".split(tmp).length;
}
try {
// https://github.com/nextapps-de/flexsearch/issues/410
/**
* split string input into terms
* @type {string|RegExp|boolean|null}
*/
this.split = new RegExp("[" + (include ? "^" : "") + regex + "]+", "u");
} catch (e) {
// fallback to a simple whitespace splitter
this.split = /\s+/;
}
this.numeric = numeric;
this.numeric = merge_option(options.numeric, numeric);
} else {
try {
// https://github.com/nextapps-de/flexsearch/issues/410
this.split = /** @type {string|RegExp|boolean} */parse_option(tmp, whitespace, this.split);
this.split = /** @type {string|RegExp|boolean} */merge_option(this.split, whitespace);
} catch (e) {
console.warn("This platform does not support unicode regex. It falls back to using simple whitespace splitter instead: /\s+/.");
// fallback to a simple whitespace splitter
this.split = /\s+/;
}
this.numeric = parse_option(this.numeric, !0);
this.numeric = merge_option(options.numeric, merge_option(this.numeric, !0));
}
/**
* post-processing terms
* @type {Function|null}
*/
this.prepare = /** @type {Function|null} */parse_option(options.prepare, null, this.prepare);
this.prepare = /** @type {Function|null} */merge_option(options.prepare, null, this.prepare);
/**
* final processing
* @type {Function|null}
*/
this.finalize = /** @type {Function|null} */parse_option(options.finalize, null, this.finalize);
this.finalize = /** @type {Function|null} */merge_option(options.finalize, null, this.finalize);
// assign the normalization fallback to the mapper
if (!normalize) {
@@ -164,24 +183,23 @@ Encoder.prototype.assign = function (options) {
// options
this.rtl = options.rtl || /* suggest */ /* append: */ /* enrich */!1;
this.dedupe = parse_option(options.dedupe, !1, this.dedupe);
this.filter = parse_option((tmp = options.filter) && new Set(tmp), null, this.filter);
this.matcher = parse_option((tmp = options.matcher) && new Map(tmp), null, this.matcher);
this.mapper = parse_option((tmp = options.mapper) && new Map(tmp), null, this.mapper);
this.stemmer = parse_option((tmp = options.stemmer) && new Map(tmp), null, this.stemmer);
this.replacer = parse_option(options.replacer, null, this.replacer);
this.minlength = parse_option(options.minlength, 1, this.minlength);
this.maxlength = parse_option(options.maxlength, 0, this.maxlength);
this.rtl = merge_option(options.rtl, !1, this.rtl);
this.dedupe = merge_option(options.dedupe, !1, this.dedupe);
this.filter = merge_option((tmp = options.filter) && new Set(tmp), null, this.filter);
this.matcher = merge_option((tmp = options.matcher) && new Map(tmp), null, this.matcher);
this.mapper = merge_option((tmp = options.mapper) && new Map(tmp), null, this.mapper);
this.stemmer = merge_option((tmp = options.stemmer) && new Map(tmp), null, this.stemmer);
this.replacer = merge_option(options.replacer, null, this.replacer);
this.minlength = merge_option(options.minlength, 1, this.minlength);
this.maxlength = merge_option(options.maxlength, 0, this.maxlength);
// minimum required tokenizer by this encoder
//this.tokenize = options["tokenize"] || "";
// auto-balanced cache
this.cache = tmp = parse_option(options.cache, !0, this.cache);
this.cache = tmp = merge_option(options.cache, !0, this.cache);
if (tmp) {
this.timer = null;
this.cache_size = "number" == typeof tmp ? tmp : 2e5;
this.timer = null;this.cache_size = "number" == typeof tmp ? tmp : 2e5;
this.cache_enc = new Map();
this.cache_term = new Map();
this.cache_enc_length = 128;
@@ -216,7 +234,7 @@ Encoder.prototype.assign = function (options) {
}
// if(SUPPORT_COMPRESSION){
// this.compression = parse_option(options.compress || options.compression, 0, this.compression);
// this.compression = merge_option(options.compress || options.compression, 0, this.compression);
// if(this.compression && !table){
// table = new Array(radix);
// for(let i = 0; i < radix; i++) table[i] = i + 33;
@@ -227,43 +245,33 @@ Encoder.prototype.assign = function (options) {
return this;
};
/**
 * Register a term replacement for the matcher stage.
 * Object patterns are forwarded to addReplacer, and patterns shorter than
 * two characters are forwarded to addMapper; anything else is stored in
 * the matcher map.
 * @param {string|RegExp} match
 * @param {string} replace
 * @return {Encoder}
 */
Encoder.prototype.addMatcher = function (match, replace) {
    // object (regex) patterns belong to the replacer
    if (typeof match === "object") {
        return this.addReplacer(match, replace);
    }
    // fewer than two characters: handled by the mapper
    if (match.length < 2) {
        return this.addMapper(match, replace);
    }
    if (!this.matcher) {
        this.matcher = new Map();
    }
    this.matcher.set(match, replace);
    // extend the "|"-separated alternation source
    const separator = this.matcher_str ? "|" : "";
    this.matcher_str += separator + match;
    // drop the compiled pattern so it gets rebuilt from matcher_str
    this.matcher_test = null;
    if (this.cache) {
        this.invalidate();
    }
    return this;
};
/**
 * Register a stemmer rule that replaces the term ending `match`
 * with `replace`.
 * @param {string} match
 * @param {string} replace
 * @return {Encoder}
 */
Encoder.prototype.addStemmer = function (match, replace) {
    this.stemmer || (this.stemmer = new Map());
    this.stemmer.set(match, replace);
    // extend the "|"-separated alternation source
    this.stemmer_str += (this.stemmer_str ? "|" : "") + match;
    // reset once: encode() lazily rebuilds the pattern from stemmer_str
    this.stemmer_test = null;
    // cached results were produced without this rule;
    // invalidate() empties both cache maps, clear() also resets the timer
    if (this.cache) {
        this.invalidate();
        clear(this);
    }
    return this;
};
/**
 * Add a term to the filter set; encode() skips terms contained in it.
 * @param {string} term
 * @return {Encoder}
 */
Encoder.prototype.addFilter = function (term) {
    this.filter || (this.filter = new Set());
    this.filter.add(term);
    // cached results were produced without this filter entry;
    // invalidate() empties both cache maps, clear() also resets the timer
    if (this.cache) {
        this.invalidate();
        clear(this);
    }
    return this;
};
/**
* Replace a single char
* @param {string} char_match
* @param {string} char_replace
* @return {Encoder}
* @suppress {invalidCasts}
*/
Encoder.prototype.addMapper = function (char_match, char_replace) {
// regex:
if ("object" == typeof char_match) {
return this.addReplacer(char_match, char_replace);
return this.addReplacer( /** @type {RegExp} */char_match, char_replace);
}
// not a char:
if (1 < char_match.length) {
@@ -271,24 +279,55 @@ Encoder.prototype.addMapper = function (char_match, char_replace) {
}
this.mapper || (this.mapper = new Map());
this.mapper.set(char_match, char_replace);
this.cache && this.invalidate();
this.cache && clear(this);
return this;
};
Encoder.prototype.addReplacer = function (match, replace) {
if ("string" == typeof match) match = new RegExp(match, "g");
/**
 * Register a string replacement for the matcher stage.
 * Object (RegExp) patterns are delegated to addReplacer. A pattern shorter
 * than two characters is delegated to addMapper, but only while dedupe is
 * enabled or a mapper already exists.
 * @param {string} match
 * @param {string} replace
 * @return {Encoder}
 * @suppress {invalidCasts}
 */
Encoder.prototype.addMatcher = function (match, replace) {
    if (typeof match === "object") {
        return this.addReplacer( /** @type {RegExp} */match, replace);
    }
    const is_short = match.length < 2;
    if (is_short && (this.dedupe || this.mapper)) {
        return this.addMapper(match, replace);
    }
    if (!this.matcher) {
        this.matcher = new Map();
    }
    this.matcher.set(match, replace);
    // extend the "|"-separated alternation source
    const separator = this.matcher_str ? "|" : "";
    this.matcher_str += separator + match;
    // drop the compiled pattern so encode() rebuilds it from matcher_str
    this.matcher_test = null;
    if (this.cache) {
        clear(this);
    }
    return this;
};
/**
 * Register a custom regular-expression replacement.
 * A string pattern is delegated to addMatcher instead.
 * @param {RegExp} regex
 * @param {string} replace
 * @return {Encoder}
 * @suppress {invalidCasts}
 */
Encoder.prototype.addReplacer = function (regex, replace) {
    if ("string" == typeof regex) {
        return this.addMatcher( /** @type {string} */regex, replace);
    }
    this.replacer || (this.replacer = []);
    // pattern and replacement are stored as consecutive entries, registered
    // exactly once; a missing replacement falls back to ""
    this.replacer.push(regex, replace || "");
    // cached results were produced without this replacement;
    // invalidate() empties both cache maps, clear() also resets the timer
    if (this.cache) {
        this.invalidate();
        clear(this);
    }
    return this;
};
/**
 * Empty the cache_enc and cache_term maps.
 */
Encoder.prototype.invalidate = function () {
    for (const store of [this.cache_enc, this.cache_term]) {
        store.clear();
    }
};
/**
* @param {!string} str
* @return {!Array<string>}
*/
Encoder.prototype.encode = function (str) {
//if(!str) return str;
@@ -304,7 +343,7 @@ Encoder.prototype.encode = function (str) {
}
}
// 1. apply charset normalization
// apply charset normalization
if (this.normalize) {
if ("function" == typeof this.normalize) {
str = this.normalize(str);
@@ -315,12 +354,12 @@ Encoder.prototype.encode = function (str) {
}
}
// 2. apply custom encoder (can replace split)
// apply custom encoder (can replace split)
if (this.prepare) {
str = this.prepare(str);
}
// 3. split numbers into triplets
// split numbers into triplets
if (this.numeric && 3 < str.length) {
str = str.replace(numeric_split_prev_char, "$1 $2").replace(numeric_split_next_char, "$1 $2").replace(numeric_split_length, "$1 ");
}
@@ -356,7 +395,7 @@ Encoder.prototype.encode = function (str) {
continue;
}
// 1. pre-filter before cache
// pre-filter before cache
if (this.filter && this.filter.has(word)) {
continue;
}
@@ -364,11 +403,8 @@ Encoder.prototype.encode = function (str) {
if (this.cache && word.length <= this.cache_term_length) {
if (this.timer) {
const tmp = this.cache_term.get(word);
//if(this.cache_term.has(word)){
if (tmp || "" === tmp) {
//word = this.cache_term.get(word);
tmp && final.push(tmp);
//word ? words[i] = word : words.splice(i--, 1);
continue;
}
} else {
@@ -376,9 +412,7 @@ Encoder.prototype.encode = function (str) {
}
}
let postfilter;
// 2. apply stemmer after matcher
// apply stemmer after matcher
if (this.stemmer && 2 < word.length) {
// for(const item of this.stemmer){
// const key = item[0];
@@ -397,18 +431,15 @@ Encoder.prototype.encode = function (str) {
// }
this.stemmer_test || (this.stemmer_test = new RegExp("(?!^)(" + this.stemmer_str + ")$"));
word = word.replace(this.stemmer_test, match => this.stemmer.get(match));
postfilter = 1;
// 4. post-filter after matcher and stemmer was applied
if (word.length < this.minlength || this.filter && this.filter.has(word)) {
word = "";
}
}
// 4. post-filter after matcher and stemmer was applied
if (word && postfilter && (word.length < this.minlength || this.filter && this.filter.has(word))) {
word = "";
}
// 5. apply mapper and collapsing
// apply mapper and collapsing
if (word && (this.mapper || this.dedupe && 1 < word.length)) {
//word = this.replace_dedupe(word);
//word = replace_deduped(word, this.mapper, true);
let final = "";
for (let i = 0, prev = "", char, tmp; i < word.length; i++) {
char = word.charAt(i);
@@ -420,11 +451,13 @@ Encoder.prototype.encode = function (str) {
word = final;
}
// 3. apply matcher
// from here the input string can shrink,
// minlength should not apply
// apply matcher
if (this.matcher && 1 < word.length) {
this.matcher_test || (this.matcher_test = new RegExp("(" + this.matcher_str + ")", "g"));
word = word.replace(this.matcher_test, match => this.matcher.get(match));
//postfilter = 1;
}
// apply custom regex
@@ -439,10 +472,6 @@ Encoder.prototype.encode = function (str) {
//word = word.replace(/(.)\1+/g, "$1");
//word = word.replace(/(?<=(.))\1+/g, "");
// if(word){
// words[i] = word;
// }
if (this.cache && base.length <= this.cache_term_length) {
this.cache_term.set(base, word);
if (this.cache_term.size > this.cache_size) {
@@ -451,20 +480,9 @@ Encoder.prototype.encode = function (str) {
}
}
//word || words.splice(i--, 1);
word && final.push(word);
}
//words = final;
// else if(this.filter){
// for(let i = 0, word; i < words.length; i++){
// if((word = words[i]) && !this.filter.has(word)){
// //filtered.push(word);
// words.splice(i--, 1);
// }
// }
// }
if (this.finalize) {
final = this.finalize(final) || final;
}
@@ -513,6 +531,9 @@ Encoder.prototype.encode = function (str) {
// return str;
// }
/**
* @param {Encoder} self
*/
function clear(self) {
self.timer = null;
self.cache_enc.clear();