mirror of
https://github.com/nextapps-de/flexsearch.git
synced 2025-10-01 15:46:40 +02:00
document resolver, reduce code duplication, refactoring
This commit is contained in:
293
dist/module-debug/encoder.js
vendored
293
dist/module-debug/encoder.js
vendored
@@ -1,5 +1,5 @@
|
||||
|
||||
import { parse_option } from "./common.js";
|
||||
import { merge_option } from "./common.js";
|
||||
import normalize_polyfill from "./charset/normalize.js";
|
||||
import { EncoderOptions } from "./type.js";
|
||||
|
||||
@@ -23,21 +23,26 @@ function fixedEncoder(string){
|
||||
return [string]
|
||||
}
|
||||
|
||||
Built-in Encoder (Workflow)
|
||||
Built-in Encoder
|
||||
----------------------------
|
||||
The main workflow follows an increasing strategy,
|
||||
starting from a simple .toLowerCase() to full RegExp
|
||||
Pipeline:
|
||||
1. apply this.normalize: charset normalization:
|
||||
applied on the whole input string e.g. lowercase,
|
||||
will also apply on: filter, matcher, stemmer, mapper
|
||||
2. apply this.split: split input into terms (includes/excludes)
|
||||
3. apply this.filter (pre-filter)
|
||||
4. apply this.matcher (replace terms)
|
||||
5. apply this.stemmer (replace term endings)
|
||||
6. apply this.filter (post-filter)
|
||||
7. apply this.mapper (replace chars)
|
||||
8. apply this.replacer (custom regex)
|
||||
everything you put later into (filter, matcher, stemmer, mapper, etc.)
|
||||
has to be normalized by definition, because it won't apply to them automatically
|
||||
2. apply this.prepare (custom preparation, string in - string out)
|
||||
3. split numerics into triplets when not surrounded by a letter
|
||||
4. apply this.split: split input into terms (includes/excludes)
|
||||
5. apply this.filter (pre-filter)
|
||||
6. apply this.stemmer (replace term endings)
|
||||
7. apply this.filter (post-filter)
|
||||
8. apply this.mapper (replace chars)
|
||||
9. apply this.dedupe (letter deduplication)
|
||||
10. apply this.finalize
|
||||
10. apply this.matcher (replace terms)
|
||||
11. apply this.replacer (custom regex)
|
||||
12. apply this.finalize
|
||||
*/
|
||||
|
||||
const whitespace = /[^\p{L}\p{N}]+/u,
|
||||
@@ -62,6 +67,9 @@ const whitespace = /[^\p{L}\p{N}]+/u,
|
||||
export default function Encoder() {
|
||||
|
||||
if (!this || this.constructor !== Encoder) {
|
||||
// let args = Array.prototype.slice.call(arguments);
|
||||
// args.unshift(Encoder);
|
||||
// return new (Encoder.bind.apply(Encoder, args));
|
||||
return new Encoder(...arguments);
|
||||
}
|
||||
|
||||
@@ -79,8 +87,9 @@ Encoder.prototype.assign = function (options) {
|
||||
* pre-processing string input
|
||||
* @type {Function|boolean}
|
||||
*/
|
||||
this.normalize = /** @type {Function|boolean} */parse_option(options.normalize, /* tag? */ /* stringify */ /* stringify */ /* single param */ /* skip update: */ /* append: */ /* skip update: */ /* skip_update: */ /* skip deletion */!0 /*await rows.hasNext()*/ /*await rows.hasNext()*/
|
||||
/*await rows.hasNext()*/, this.normalize);
|
||||
this.normalize = /** @type {Function|boolean} */merge_option(options.normalize, /* tag? */ /* stringify */ /* stringify */ /* single param */ /* skip update: */ /* append: */ /* skip update: */
|
||||
/* skip_update: */
|
||||
/* skip deletion */!0 /*await rows.hasNext()*/ /*await rows.hasNext()*/ /*await rows.hasNext()*/, this.normalize);
|
||||
|
||||
// {
|
||||
// letter: true,
|
||||
@@ -93,68 +102,78 @@ Encoder.prototype.assign = function (options) {
|
||||
// }
|
||||
|
||||
let include = options.include,
|
||||
tmp = include || options.exclude || options.split;
|
||||
tmp = include || options.exclude || options.split,
|
||||
numeric;
|
||||
|
||||
|
||||
if ("object" == typeof tmp) {
|
||||
let numeric = !include,
|
||||
regex = "";
|
||||
if (tmp || "" === tmp) {
|
||||
if ("object" == typeof tmp && tmp.constructor !== RegExp) {
|
||||
let regex = "";
|
||||
numeric = !include;
|
||||
// split on whitespace by default
|
||||
include || (regex += "\\p{Z}");
|
||||
if (tmp.letter) {
|
||||
regex += "\\p{L}";
|
||||
}
|
||||
if (tmp.number) {
|
||||
regex += "\\p{N}";
|
||||
numeric = !!include;
|
||||
}
|
||||
if (tmp.symbol) {
|
||||
regex += "\\p{S}";
|
||||
}
|
||||
if (tmp.punctuation) {
|
||||
regex += "\\p{P}";
|
||||
}
|
||||
if (tmp.control) {
|
||||
regex += "\\p{C}";
|
||||
}
|
||||
if (tmp = tmp.char) {
|
||||
regex += "object" == typeof tmp ? tmp.join("") : tmp;
|
||||
}
|
||||
|
||||
// split on whitespace by default
|
||||
options.include || (regex += "\\p{Z}");
|
||||
if (tmp.letter) {
|
||||
regex += "\\p{L}";
|
||||
}
|
||||
if (tmp.number) {
|
||||
regex += "\\p{N}";
|
||||
numeric = !!include;
|
||||
}
|
||||
if (tmp.symbol) {
|
||||
regex += "\\p{S}";
|
||||
}
|
||||
if (tmp.punctuation) {
|
||||
regex += "\\p{P}";
|
||||
}
|
||||
if (tmp.control) {
|
||||
regex += "\\p{C}";
|
||||
}
|
||||
if (tmp = tmp.char) {
|
||||
regex += "object" == typeof tmp ? tmp.join("") : tmp;
|
||||
try {
|
||||
// https://github.com/nextapps-de/flexsearch/issues/410
|
||||
/**
|
||||
* split string input into terms
|
||||
* @type {string|RegExp|boolean|null}
|
||||
*/
|
||||
this.split = new RegExp("[" + (include ? "^" : "") + regex + "]+", "u");
|
||||
} catch (e) {
|
||||
console.error("Your split configuration:", tmp, "is not supported on this platform. It falls back to using simple whitespace splitter instead: /\s+/.");
|
||||
// fallback to a simple whitespace splitter
|
||||
this.split = /\s+/;
|
||||
}
|
||||
} else {
|
||||
this.split = /** @type {string|RegExp|boolean} */tmp;
|
||||
// determine numeric encoding
|
||||
numeric = /* suggest */ /* append: */ /* enrich */!1 === tmp || 2 > "a1a".split(tmp).length;
|
||||
}
|
||||
|
||||
try {
|
||||
// https://github.com/nextapps-de/flexsearch/issues/410
|
||||
/**
|
||||
* split string input into terms
|
||||
* @type {string|RegExp|boolean|null}
|
||||
*/
|
||||
this.split = new RegExp("[" + (include ? "^" : "") + regex + "]+", "u");
|
||||
} catch (e) {
|
||||
// fallback to a simple whitespace splitter
|
||||
this.split = /\s+/;
|
||||
}
|
||||
this.numeric = numeric;
|
||||
this.numeric = merge_option(options.numeric, numeric);
|
||||
} else {
|
||||
try {
|
||||
// https://github.com/nextapps-de/flexsearch/issues/410
|
||||
this.split = /** @type {string|RegExp|boolean} */parse_option(tmp, whitespace, this.split);
|
||||
this.split = /** @type {string|RegExp|boolean} */merge_option(this.split, whitespace);
|
||||
} catch (e) {
|
||||
console.warn("This platform does not support unicode regex. It falls back to using simple whitespace splitter instead: /\s+/.");
|
||||
// fallback to a simple whitespace splitter
|
||||
this.split = /\s+/;
|
||||
}
|
||||
this.numeric = parse_option(this.numeric, !0);
|
||||
|
||||
this.numeric = merge_option(options.numeric, merge_option(this.numeric, !0));
|
||||
}
|
||||
|
||||
/**
|
||||
* post-processing terms
|
||||
* @type {Function|null}
|
||||
*/
|
||||
this.prepare = /** @type {Function|null} */parse_option(options.prepare, null, this.prepare);
|
||||
this.prepare = /** @type {Function|null} */merge_option(options.prepare, null, this.prepare);
|
||||
/**
|
||||
* final processing
|
||||
* @type {Function|null}
|
||||
*/
|
||||
this.finalize = /** @type {Function|null} */parse_option(options.finalize, null, this.finalize);
|
||||
this.finalize = /** @type {Function|null} */merge_option(options.finalize, null, this.finalize);
|
||||
|
||||
// assign the normalization fallback to the mapper
|
||||
if (!normalize) {
|
||||
@@ -164,24 +183,23 @@ Encoder.prototype.assign = function (options) {
|
||||
|
||||
// options
|
||||
|
||||
this.rtl = options.rtl || /* suggest */ /* append: */ /* enrich */!1;
|
||||
this.dedupe = parse_option(options.dedupe, !1, this.dedupe);
|
||||
this.filter = parse_option((tmp = options.filter) && new Set(tmp), null, this.filter);
|
||||
this.matcher = parse_option((tmp = options.matcher) && new Map(tmp), null, this.matcher);
|
||||
this.mapper = parse_option((tmp = options.mapper) && new Map(tmp), null, this.mapper);
|
||||
this.stemmer = parse_option((tmp = options.stemmer) && new Map(tmp), null, this.stemmer);
|
||||
this.replacer = parse_option(options.replacer, null, this.replacer);
|
||||
this.minlength = parse_option(options.minlength, 1, this.minlength);
|
||||
this.maxlength = parse_option(options.maxlength, 0, this.maxlength);
|
||||
this.rtl = merge_option(options.rtl, !1, this.rtl);
|
||||
this.dedupe = merge_option(options.dedupe, !1, this.dedupe);
|
||||
this.filter = merge_option((tmp = options.filter) && new Set(tmp), null, this.filter);
|
||||
this.matcher = merge_option((tmp = options.matcher) && new Map(tmp), null, this.matcher);
|
||||
this.mapper = merge_option((tmp = options.mapper) && new Map(tmp), null, this.mapper);
|
||||
this.stemmer = merge_option((tmp = options.stemmer) && new Map(tmp), null, this.stemmer);
|
||||
this.replacer = merge_option(options.replacer, null, this.replacer);
|
||||
this.minlength = merge_option(options.minlength, 1, this.minlength);
|
||||
this.maxlength = merge_option(options.maxlength, 0, this.maxlength);
|
||||
|
||||
// minimum required tokenizer by this encoder
|
||||
//this.tokenize = options["tokenize"] || "";
|
||||
|
||||
// auto-balanced cache
|
||||
this.cache = tmp = parse_option(options.cache, !0, this.cache);
|
||||
this.cache = tmp = merge_option(options.cache, !0, this.cache);
|
||||
if (tmp) {
|
||||
this.timer = null;
|
||||
this.cache_size = "number" == typeof tmp ? tmp : 2e5;
|
||||
this.timer = null;this.cache_size = "number" == typeof tmp ? tmp : 2e5;
|
||||
this.cache_enc = new Map();
|
||||
this.cache_term = new Map();
|
||||
this.cache_enc_length = 128;
|
||||
@@ -216,7 +234,7 @@ Encoder.prototype.assign = function (options) {
|
||||
}
|
||||
|
||||
// if(SUPPORT_COMPRESSION){
|
||||
// this.compression = parse_option(options.compress || options.compression, 0, this.compression);
|
||||
// this.compression = merge_option(options.compress || options.compression, 0, this.compression);
|
||||
// if(this.compression && !table){
|
||||
// table = new Array(radix);
|
||||
// for(let i = 0; i < radix; i++) table[i] = i + 33;
|
||||
@@ -227,43 +245,33 @@ Encoder.prototype.assign = function (options) {
|
||||
return this;
|
||||
};
|
||||
|
||||
Encoder.prototype.addMatcher = function (match, replace) {
|
||||
// regex:
|
||||
if ("object" == typeof match) {
|
||||
return this.addReplacer(match, replace);
|
||||
}
|
||||
// a single char:
|
||||
if (2 > match.length) {
|
||||
return this.addMapper(match, replace);
|
||||
}
|
||||
this.matcher || (this.matcher = new Map());
|
||||
this.matcher.set(match, replace);
|
||||
this.matcher_str += (this.matcher_str ? "|" : "") + match;
|
||||
this.matcher_test = null; //new RegExp("(" + this.matcher_str + ")");
|
||||
this.cache && this.invalidate();
|
||||
return this;
|
||||
};
|
||||
|
||||
Encoder.prototype.addStemmer = function (match, replace) {
|
||||
this.stemmer || (this.stemmer = new Map());
|
||||
this.stemmer.set(match, replace);
|
||||
this.stemmer_str += (this.stemmer_str ? "|" : "") + match;
|
||||
this.stemmer_test = null; //new RegExp("(" + this.stemmer_str + ")");
|
||||
this.cache && this.invalidate();
|
||||
this.stemmer_test = null;
|
||||
this.cache && clear(this);
|
||||
return this;
|
||||
};
|
||||
|
||||
Encoder.prototype.addFilter = function (str) {
|
||||
Encoder.prototype.addFilter = function (term) {
|
||||
this.filter || (this.filter = new Set());
|
||||
this.filter.add(str);
|
||||
this.cache && this.invalidate();
|
||||
this.filter.add(term);
|
||||
this.cache && clear(this);
|
||||
return this;
|
||||
};
|
||||
|
||||
/**
|
||||
* Replace a single char
|
||||
* @param {string} char_match
|
||||
* @param {string} char_replace
|
||||
* @return {Encoder}
|
||||
* @suppress {invalidCasts}
|
||||
*/
|
||||
Encoder.prototype.addMapper = function (char_match, char_replace) {
|
||||
// regex:
|
||||
if ("object" == typeof char_match) {
|
||||
return this.addReplacer(char_match, char_replace);
|
||||
return this.addReplacer( /** @type {RegExp} */char_match, char_replace);
|
||||
}
|
||||
// not a char:
|
||||
if (1 < char_match.length) {
|
||||
@@ -271,24 +279,55 @@ Encoder.prototype.addMapper = function (char_match, char_replace) {
|
||||
}
|
||||
this.mapper || (this.mapper = new Map());
|
||||
this.mapper.set(char_match, char_replace);
|
||||
this.cache && this.invalidate();
|
||||
this.cache && clear(this);
|
||||
return this;
|
||||
};
|
||||
|
||||
Encoder.prototype.addReplacer = function (match, replace) {
|
||||
if ("string" == typeof match) match = new RegExp(match, "g");
|
||||
/**
|
||||
* Replace a string
|
||||
* @param {string} match
|
||||
* @param {string} replace
|
||||
* @return {Encoder}
|
||||
* @suppress {invalidCasts}
|
||||
*/
|
||||
Encoder.prototype.addMatcher = function (match, replace) {
|
||||
// regex:
|
||||
if ("object" == typeof match) {
|
||||
return this.addReplacer( /** @type {RegExp} */match, replace);
|
||||
}
|
||||
// a single char:
|
||||
// only downgrade when dedupe is on or mapper already was filled
|
||||
if (2 > match.length && (this.dedupe || this.mapper)) {
|
||||
return this.addMapper(match, replace);
|
||||
}
|
||||
this.matcher || (this.matcher = new Map());
|
||||
this.matcher.set(match, replace);
|
||||
this.matcher_str += (this.matcher_str ? "|" : "") + match;
|
||||
this.matcher_test = null;
|
||||
this.cache && clear(this);
|
||||
return this;
|
||||
};
|
||||
|
||||
/**
|
||||
* @param {RegExp} regex
|
||||
* @param {string} replace
|
||||
* @return {Encoder}
|
||||
* @suppress {invalidCasts}
|
||||
*/
|
||||
Encoder.prototype.addReplacer = function (regex, replace) {
|
||||
if ("string" == typeof regex) {
|
||||
return this.addMatcher( /** @type {string} */regex, replace);
|
||||
}
|
||||
this.replacer || (this.replacer = []);
|
||||
this.replacer.push(match, replace || "");
|
||||
this.cache && this.invalidate();
|
||||
this.replacer.push(regex, replace);
|
||||
this.cache && clear(this);
|
||||
return this;
|
||||
};
|
||||
|
||||
Encoder.prototype.invalidate = function () {
|
||||
this.cache_enc.clear();
|
||||
this.cache_term.clear();
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* @param {!string} str
|
||||
* @return {!Array<string>}
|
||||
*/
|
||||
Encoder.prototype.encode = function (str) {
|
||||
|
||||
//if(!str) return str;
|
||||
@@ -304,7 +343,7 @@ Encoder.prototype.encode = function (str) {
|
||||
}
|
||||
}
|
||||
|
||||
// 1. apply charset normalization
|
||||
// apply charset normalization
|
||||
if (this.normalize) {
|
||||
if ("function" == typeof this.normalize) {
|
||||
str = this.normalize(str);
|
||||
@@ -315,12 +354,12 @@ Encoder.prototype.encode = function (str) {
|
||||
}
|
||||
}
|
||||
|
||||
// 2. apply custom encoder (can replace split)
|
||||
// apply custom encoder (can replace split)
|
||||
if (this.prepare) {
|
||||
str = this.prepare(str);
|
||||
}
|
||||
|
||||
// 3. split numbers into triplets
|
||||
// split numbers into triplets
|
||||
if (this.numeric && 3 < str.length) {
|
||||
str = str.replace(numeric_split_prev_char, "$1 $2").replace(numeric_split_next_char, "$1 $2").replace(numeric_split_length, "$1 ");
|
||||
}
|
||||
@@ -356,7 +395,7 @@ Encoder.prototype.encode = function (str) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// 1. pre-filter before cache
|
||||
// pre-filter before cache
|
||||
if (this.filter && this.filter.has(word)) {
|
||||
continue;
|
||||
}
|
||||
@@ -364,11 +403,8 @@ Encoder.prototype.encode = function (str) {
|
||||
if (this.cache && word.length <= this.cache_term_length) {
|
||||
if (this.timer) {
|
||||
const tmp = this.cache_term.get(word);
|
||||
//if(this.cache_term.has(word)){
|
||||
if (tmp || "" === tmp) {
|
||||
//word = this.cache_term.get(word);
|
||||
tmp && final.push(tmp);
|
||||
//word ? words[i] = word : words.splice(i--, 1);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
@@ -376,9 +412,7 @@ Encoder.prototype.encode = function (str) {
|
||||
}
|
||||
}
|
||||
|
||||
let postfilter;
|
||||
|
||||
// 2. apply stemmer after matcher
|
||||
// apply stemmer after matcher
|
||||
if (this.stemmer && 2 < word.length) {
|
||||
// for(const item of this.stemmer){
|
||||
// const key = item[0];
|
||||
@@ -397,18 +431,15 @@ Encoder.prototype.encode = function (str) {
|
||||
// }
|
||||
this.stemmer_test || (this.stemmer_test = new RegExp("(?!^)(" + this.stemmer_str + ")$"));
|
||||
word = word.replace(this.stemmer_test, match => this.stemmer.get(match));
|
||||
postfilter = 1;
|
||||
|
||||
// 4. post-filter after matcher and stemmer was applied
|
||||
if (word.length < this.minlength || this.filter && this.filter.has(word)) {
|
||||
word = "";
|
||||
}
|
||||
}
|
||||
|
||||
// 4. post-filter after matcher and stemmer was applied
|
||||
if (word && postfilter && (word.length < this.minlength || this.filter && this.filter.has(word))) {
|
||||
word = "";
|
||||
}
|
||||
|
||||
// 5. apply mapper and collapsing
|
||||
// apply mapper and collapsing
|
||||
if (word && (this.mapper || this.dedupe && 1 < word.length)) {
|
||||
//word = this.replace_dedupe(word);
|
||||
//word = replace_deduped(word, this.mapper, true);
|
||||
let final = "";
|
||||
for (let i = 0, prev = "", char, tmp; i < word.length; i++) {
|
||||
char = word.charAt(i);
|
||||
@@ -420,11 +451,13 @@ Encoder.prototype.encode = function (str) {
|
||||
word = final;
|
||||
}
|
||||
|
||||
// 3. apply matcher
|
||||
// from here the input string can shrink,
|
||||
// minlength should not apply
|
||||
|
||||
// apply matcher
|
||||
if (this.matcher && 1 < word.length) {
|
||||
this.matcher_test || (this.matcher_test = new RegExp("(" + this.matcher_str + ")", "g"));
|
||||
word = word.replace(this.matcher_test, match => this.matcher.get(match));
|
||||
//postfilter = 1;
|
||||
}
|
||||
|
||||
// apply custom regex
|
||||
@@ -439,10 +472,6 @@ Encoder.prototype.encode = function (str) {
|
||||
//word = word.replace(/(.)\1+/g, "$1");
|
||||
//word = word.replace(/(?<=(.))\1+/g, "");
|
||||
|
||||
// if(word){
|
||||
// words[i] = word;
|
||||
// }
|
||||
|
||||
if (this.cache && base.length <= this.cache_term_length) {
|
||||
this.cache_term.set(base, word);
|
||||
if (this.cache_term.size > this.cache_size) {
|
||||
@@ -451,20 +480,9 @@ Encoder.prototype.encode = function (str) {
|
||||
}
|
||||
}
|
||||
|
||||
//word || words.splice(i--, 1);
|
||||
word && final.push(word);
|
||||
}
|
||||
|
||||
//words = final;
|
||||
// else if(this.filter){
|
||||
// for(let i = 0, word; i < words.length; i++){
|
||||
// if((word = words[i]) && !this.filter.has(word)){
|
||||
// //filtered.push(word);
|
||||
// words.splice(i--, 1);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
if (this.finalize) {
|
||||
final = this.finalize(final) || final;
|
||||
}
|
||||
@@ -513,6 +531,9 @@ Encoder.prototype.encode = function (str) {
|
||||
// return str;
|
||||
// }
|
||||
|
||||
/**
|
||||
* @param {Encoder} self
|
||||
*/
|
||||
function clear(self) {
|
||||
self.timer = null;
|
||||
self.cache_enc.clear();
|
||||
|
Reference in New Issue
Block a user