document resolver, reduce code duplication, refactoring

2025-10-01 15:46:40 +02:00 · 2025-03-19 14:33:22 +01:00
parent 0455fa4b74
commit d9f235ce3b
107 changed files with 11949 additions and 10740 deletions
--- a/dist/module-debug/encoder.js
+++ b/dist/module-debug/encoder.js
@@ -1,5 +1,5 @@

-import { parse_option } from "./common.js";
+import { merge_option } from "./common.js";
 import normalize_polyfill from "./charset/normalize.js";
 import { EncoderOptions } from "./type.js";

@@ -23,21 +23,26 @@ function fixedEncoder(string){
    return [string]
 }

-Built-in Encoder (Workflow)
+Built-in Encoder
 ----------------------------
+The main workflow follows a strategy of increasing complexity,
+progressing from a simple .toLowerCase() up to full RegExp
 Pipeline:
    1. apply this.normalize: charset normalization:
       applied on the whole input string e.g. lowercase,
-       will also apply on: filter, matcher, stemmer, mapper
-    2. apply this.split: split input into terms (includes/excludes)
-    3. apply this.filter (pre-filter)
-    4. apply this.matcher (replace terms)
-    5. apply this.stemmer (replace term endings)
-    6. apply this.filter (post-filter)
-    7. apply this.mapper (replace chars)
-    8. apply this.replacer (custom regex)
+       everything you pass later to filter, matcher, stemmer, mapper, etc.
+       must already be normalized by you, because normalization is not applied to those definitions automatically
+    2. apply this.prepare (custom preparation, string in - string out)
+    3. split numerics into triplets when not surrounded by a letter
+    4. apply this.split: split input into terms (includes/excludes)
+    5. apply this.filter (pre-filter)
+    6. apply this.stemmer (replace term endings)
+    7. apply this.filter (post-filter)
+    8. apply this.mapper (replace chars)
    9. apply this.dedupe (letter deduplication)
-   10. apply this.finalize
+   10. apply this.matcher (replace terms)
+   11. apply this.replacer (custom regex)
+   12. apply this.finalize
 */

 const whitespace = /[^\p{L}\p{N}]+/u,
@@ -62,6 +67,9 @@ const whitespace = /[^\p{L}\p{N}]+/u,
 export default function Encoder() {

    if (!this || this.constructor !== Encoder) {
+        // let args = Array.prototype.slice.call(arguments);
+        // args.unshift(Encoder);
+        // return new (Encoder.bind.apply(Encoder, args));
        return new Encoder(...arguments);
    }

@@ -79,8 +87,9 @@ Encoder.prototype.assign = function (options) {
     * pre-processing string input
     * @type {Function|boolean}
     */
-    this.normalize = /** @type {Function|boolean} */parse_option(options.normalize, /* tag? */ /* stringify */ /* stringify */ /* single param */ /* skip update: */ /* append: */ /* skip update: */ /* skip_update: */ /* skip deletion */!0 /*await rows.hasNext()*/ /*await rows.hasNext()*/
-    /*await rows.hasNext()*/, this.normalize);
+    this.normalize = /** @type {Function|boolean} */merge_option(options.normalize, /* tag? */ /* stringify */ /* stringify */ /* single param */ /* skip update: */ /* append: */ /* skip update: */
+    /* skip_update: */
+    /* skip deletion */!0 /*await rows.hasNext()*/ /*await rows.hasNext()*/ /*await rows.hasNext()*/, this.normalize);

    // {
    //     letter: true,
@@ -93,68 +102,78 @@ Encoder.prototype.assign = function (options) {
    // }

    let include = options.include,
-        tmp = include || options.exclude || options.split;
+        tmp = include || options.exclude || options.split,
+        numeric;


-    if ("object" == typeof tmp) {
-        let numeric = !include,
-            regex = "";
+    if (tmp || "" === tmp) {
+        if ("object" == typeof tmp && tmp.constructor !== RegExp) {
+            let regex = "";
+            numeric = !include;
+            // split on whitespace by default
+            include || (regex += "\\p{Z}");
+            if (tmp.letter) {
+                regex += "\\p{L}";
+            }
+            if (tmp.number) {
+                regex += "\\p{N}";
+                numeric = !!include;
+            }
+            if (tmp.symbol) {
+                regex += "\\p{S}";
+            }
+            if (tmp.punctuation) {
+                regex += "\\p{P}";
+            }
+            if (tmp.control) {
+                regex += "\\p{C}";
+            }
+            if (tmp = tmp.char) {
+                regex += "object" == typeof tmp ? tmp.join("") : tmp;
+            }

-        // split on whitespace by default
-        options.include || (regex += "\\p{Z}");
-        if (tmp.letter) {
-            regex += "\\p{L}";
-        }
-        if (tmp.number) {
-            regex += "\\p{N}";
-            numeric = !!include;
-        }
-        if (tmp.symbol) {
-            regex += "\\p{S}";
-        }
-        if (tmp.punctuation) {
-            regex += "\\p{P}";
-        }
-        if (tmp.control) {
-            regex += "\\p{C}";
-        }
-        if (tmp = tmp.char) {
-            regex += "object" == typeof tmp ? tmp.join("") : tmp;
+            try {
+                // https://github.com/nextapps-de/flexsearch/issues/410
+                /**
+                 * split string input into terms
+                 * @type {string|RegExp|boolean|null}
+                 */
+                this.split = new RegExp("[" + (include ? "^" : "") + regex + "]+", "u");
+            } catch (e) {
+                console.error("Your split configuration:", tmp, "is not supported on this platform. It falls back to using simple whitespace splitter instead: /\s+/.");
+                // fallback to a simple whitespace splitter
+                this.split = /\s+/;
+            }
+        } else {
+            this.split = /** @type {string|RegExp|boolean} */tmp;
+            // determine numeric encoding
+            numeric = /* suggest */ /* append: */ /* enrich */!1 === tmp || 2 > "a1a".split(tmp).length;
        }

-        try {
-            // https://github.com/nextapps-de/flexsearch/issues/410
-            /**
-             * split string input into terms
-             * @type {string|RegExp|boolean|null}
-             */
-            this.split = new RegExp("[" + (include ? "^" : "") + regex + "]+", "u");
-        } catch (e) {
-            // fallback to a simple whitespace splitter
-            this.split = /\s+/;
-        }
-        this.numeric = numeric;
+        this.numeric = merge_option(options.numeric, numeric);
    } else {
        try {
            // https://github.com/nextapps-de/flexsearch/issues/410
-            this.split = /** @type {string|RegExp|boolean} */parse_option(tmp, whitespace, this.split);
+            this.split = /** @type {string|RegExp|boolean} */merge_option(this.split, whitespace);
        } catch (e) {
+            console.warn("This platform does not support unicode regex. It falls back to using simple whitespace splitter instead: /\s+/.");
            // fallback to a simple whitespace splitter
            this.split = /\s+/;
        }
-        this.numeric = parse_option(this.numeric, !0);
+
+        this.numeric = merge_option(options.numeric, merge_option(this.numeric, !0));
    }

    /**
     * post-processing terms
     * @type {Function|null}
     */
-    this.prepare = /** @type {Function|null} */parse_option(options.prepare, null, this.prepare);
+    this.prepare = /** @type {Function|null} */merge_option(options.prepare, null, this.prepare);
    /**
     * final processing
     * @type {Function|null}
     */
-    this.finalize = /** @type {Function|null} */parse_option(options.finalize, null, this.finalize);
+    this.finalize = /** @type {Function|null} */merge_option(options.finalize, null, this.finalize);

    // assign the normalization fallback to the mapper
    if (!normalize) {
@@ -164,24 +183,23 @@ Encoder.prototype.assign = function (options) {

    // options

-    this.rtl = options.rtl || /* suggest */ /* append: */ /* enrich */!1;
-    this.dedupe = parse_option(options.dedupe, !1, this.dedupe);
-    this.filter = parse_option((tmp = options.filter) && new Set(tmp), null, this.filter);
-    this.matcher = parse_option((tmp = options.matcher) && new Map(tmp), null, this.matcher);
-    this.mapper = parse_option((tmp = options.mapper) && new Map(tmp), null, this.mapper);
-    this.stemmer = parse_option((tmp = options.stemmer) && new Map(tmp), null, this.stemmer);
-    this.replacer = parse_option(options.replacer, null, this.replacer);
-    this.minlength = parse_option(options.minlength, 1, this.minlength);
-    this.maxlength = parse_option(options.maxlength, 0, this.maxlength);
+    this.rtl = merge_option(options.rtl, !1, this.rtl);
+    this.dedupe = merge_option(options.dedupe, !1, this.dedupe);
+    this.filter = merge_option((tmp = options.filter) && new Set(tmp), null, this.filter);
+    this.matcher = merge_option((tmp = options.matcher) && new Map(tmp), null, this.matcher);
+    this.mapper = merge_option((tmp = options.mapper) && new Map(tmp), null, this.mapper);
+    this.stemmer = merge_option((tmp = options.stemmer) && new Map(tmp), null, this.stemmer);
+    this.replacer = merge_option(options.replacer, null, this.replacer);
+    this.minlength = merge_option(options.minlength, 1, this.minlength);
+    this.maxlength = merge_option(options.maxlength, 0, this.maxlength);

    // minimum required tokenizer by this encoder
    //this.tokenize = options["tokenize"] || "";

    // auto-balanced cache
-    this.cache = tmp = parse_option(options.cache, !0, this.cache);
+    this.cache = tmp = merge_option(options.cache, !0, this.cache);
    if (tmp) {
-        this.timer = null;
-        this.cache_size = "number" == typeof tmp ? tmp : 2e5;
+        this.timer = null;this.cache_size = "number" == typeof tmp ? tmp : 2e5;
        this.cache_enc = new Map();
        this.cache_term = new Map();
        this.cache_enc_length = 128;
@@ -216,7 +234,7 @@ Encoder.prototype.assign = function (options) {
    }

    // if(SUPPORT_COMPRESSION){
-    //     this.compression = parse_option(options.compress || options.compression, 0, this.compression);
+    //     this.compression = merge_option(options.compress || options.compression, 0, this.compression);
    //     if(this.compression && !table){
    //         table = new Array(radix);
    //         for(let i = 0; i < radix; i++) table[i] = i + 33;
@@ -227,43 +245,33 @@ Encoder.prototype.assign = function (options) {
    return this;
 };

-Encoder.prototype.addMatcher = function (match, replace) {
-    // regex:
-    if ("object" == typeof match) {
-        return this.addReplacer(match, replace);
-    }
-    // a single char:
-    if (2 > match.length) {
-        return this.addMapper(match, replace);
-    }
-    this.matcher || (this.matcher = new Map());
-    this.matcher.set(match, replace);
-    this.matcher_str += (this.matcher_str ? "|" : "") + match;
-    this.matcher_test = null; //new RegExp("(" + this.matcher_str + ")");
-    this.cache && this.invalidate();
-    return this;
-};
-
 Encoder.prototype.addStemmer = function (match, replace) {
    this.stemmer || (this.stemmer = new Map());
    this.stemmer.set(match, replace);
    this.stemmer_str += (this.stemmer_str ? "|" : "") + match;
-    this.stemmer_test = null; //new RegExp("(" + this.stemmer_str + ")");
-    this.cache && this.invalidate();
+    this.stemmer_test = null;
+    this.cache && clear(this);
    return this;
 };

-Encoder.prototype.addFilter = function (str) {
+Encoder.prototype.addFilter = function (term) {
    this.filter || (this.filter = new Set());
-    this.filter.add(str);
-    this.cache && this.invalidate();
+    this.filter.add(term);
+    this.cache && clear(this);
    return this;
 };

+/**
+ * Replace a single char
+ * @param {string} char_match
+ * @param {string} char_replace
+ * @return {Encoder}
+ * @suppress {invalidCasts}
+ */
 Encoder.prototype.addMapper = function (char_match, char_replace) {
    // regex:
    if ("object" == typeof char_match) {
-        return this.addReplacer(char_match, char_replace);
+        return this.addReplacer( /**  @type {RegExp} */char_match, char_replace);
    }
    // not a char:
    if (1 < char_match.length) {
@@ -271,24 +279,55 @@ Encoder.prototype.addMapper = function (char_match, char_replace) {
    }
    this.mapper || (this.mapper = new Map());
    this.mapper.set(char_match, char_replace);
-    this.cache && this.invalidate();
+    this.cache && clear(this);
    return this;
 };

-Encoder.prototype.addReplacer = function (match, replace) {
-    if ("string" == typeof match) match = new RegExp(match, "g");
+/**
+ * Replace a string
+ * @param {string} match
+ * @param {string} replace
+ * @return {Encoder}
+ * @suppress {invalidCasts}
+ */
+Encoder.prototype.addMatcher = function (match, replace) {
+    // regex:
+    if ("object" == typeof match) {
+        return this.addReplacer( /**  @type {RegExp} */match, replace);
+    }
+    // a single char:
+    // only downgrade when dedupe is on or mapper already was filled
+    if (2 > match.length && (this.dedupe || this.mapper)) {
+        return this.addMapper(match, replace);
+    }
+    this.matcher || (this.matcher = new Map());
+    this.matcher.set(match, replace);
+    this.matcher_str += (this.matcher_str ? "|" : "") + match;
+    this.matcher_test = null;
+    this.cache && clear(this);
+    return this;
+};
+
+/**
+ * @param {RegExp} regex
+ * @param {string} replace
+ * @return {Encoder}
+ * @suppress {invalidCasts}
+ */
+Encoder.prototype.addReplacer = function (regex, replace) {
+    if ("string" == typeof regex) {
+        return this.addMatcher( /**  @type {string} */regex, replace);
+    }
    this.replacer || (this.replacer = []);
-    this.replacer.push(match, replace || "");
-    this.cache && this.invalidate();
+    this.replacer.push(regex, replace);
+    this.cache && clear(this);
    return this;
 };

-Encoder.prototype.invalidate = function () {
-    this.cache_enc.clear();
-    this.cache_term.clear();
-};
-
-
+/**
+ * @param {!string} str
+ * @return {!Array<string>}
+ */
 Encoder.prototype.encode = function (str) {

    //if(!str) return str;
@@ -304,7 +343,7 @@ Encoder.prototype.encode = function (str) {
        }
    }

-    // 1. apply charset normalization
+    // apply charset normalization
    if (this.normalize) {
        if ("function" == typeof this.normalize) {
            str = this.normalize(str);
@@ -315,12 +354,12 @@ Encoder.prototype.encode = function (str) {
        }
    }

-    // 2. apply custom encoder (can replace split)
+    // apply custom encoder (can replace split)
    if (this.prepare) {
        str = this.prepare(str);
    }

-    // 3. split numbers into triplets
+    // split numbers into triplets
    if (this.numeric && 3 < str.length) {
        str = str.replace(numeric_split_prev_char, "$1 $2").replace(numeric_split_next_char, "$1 $2").replace(numeric_split_length, "$1 ");
    }
@@ -356,7 +395,7 @@ Encoder.prototype.encode = function (str) {
            continue;
        }

-        // 1. pre-filter before cache
+        // pre-filter before cache
        if (this.filter && this.filter.has(word)) {
            continue;
        }
@@ -364,11 +403,8 @@ Encoder.prototype.encode = function (str) {
        if (this.cache && word.length <= this.cache_term_length) {
            if (this.timer) {
                const tmp = this.cache_term.get(word);
-                //if(this.cache_term.has(word)){
                if (tmp || "" === tmp) {
-                    //word = this.cache_term.get(word);
                    tmp && final.push(tmp);
-                    //word ? words[i] = word : words.splice(i--, 1);
                    continue;
                }
            } else {
@@ -376,9 +412,7 @@ Encoder.prototype.encode = function (str) {
            }
        }

-        let postfilter;
-
-        // 2. apply stemmer after matcher
+        // apply stemmer (the matcher runs later, after mapper and dedupe)
        if (this.stemmer && 2 < word.length) {
            // for(const item of this.stemmer){
            //     const key = item[0];
@@ -397,18 +431,15 @@ Encoder.prototype.encode = function (str) {
            // }
            this.stemmer_test || (this.stemmer_test = new RegExp("(?!^)(" + this.stemmer_str + ")$"));
            word = word.replace(this.stemmer_test, match => this.stemmer.get(match));
-            postfilter = 1;
+
+            // post-filter after the stemmer was applied
+            if (word.length < this.minlength || this.filter && this.filter.has(word)) {
+                word = "";
+            }
        }

-        // 4. post-filter after matcher and stemmer was applied
-        if (word && postfilter && (word.length < this.minlength || this.filter && this.filter.has(word))) {
-            word = "";
-        }
-
-        // 5. apply mapper and collapsing
+        // apply mapper and collapsing
        if (word && (this.mapper || this.dedupe && 1 < word.length)) {
-            //word = this.replace_dedupe(word);
-            //word = replace_deduped(word, this.mapper, true);
            let final = "";
            for (let i = 0, prev = "", char, tmp; i < word.length; i++) {
                char = word.charAt(i);
@@ -420,11 +451,13 @@ Encoder.prototype.encode = function (str) {
            word = final;
        }

-        // 3. apply matcher
+        // from here the input string can shrink,
+        // minlength should not apply
+
+        // apply matcher
        if (this.matcher && 1 < word.length) {
            this.matcher_test || (this.matcher_test = new RegExp("(" + this.matcher_str + ")", "g"));
            word = word.replace(this.matcher_test, match => this.matcher.get(match));
-            //postfilter = 1;
        }

        // apply custom regex
@@ -439,10 +472,6 @@ Encoder.prototype.encode = function (str) {
        //word = word.replace(/(.)\1+/g, "$1");
        //word = word.replace(/(?<=(.))\1+/g, "");

-        // if(word){
-        //     words[i] = word;
-        // }
-
        if (this.cache && base.length <= this.cache_term_length) {
            this.cache_term.set(base, word);
            if (this.cache_term.size > this.cache_size) {
@@ -451,20 +480,9 @@ Encoder.prototype.encode = function (str) {
            }
        }

-        //word || words.splice(i--, 1);
        word && final.push(word);
    }

-    //words = final;
-    // else if(this.filter){
-    //     for(let i = 0, word; i < words.length; i++){
-    //         if((word = words[i]) && !this.filter.has(word)){
-    //             //filtered.push(word);
-    //             words.splice(i--, 1);
-    //         }
-    //     }
-    // }
-
    if (this.finalize) {
        final = this.finalize(final) || final;
    }
@@ -513,6 +531,9 @@ Encoder.prototype.encode = function (str) {
 //     return str;
 // }

+/**
+ * @param {Encoder} self
+ */
 function clear(self) {
    self.timer = null;
    self.cache_enc.clear();