mirror of
https://github.com/nextapps-de/flexsearch.git
synced 2025-09-25 21:08:59 +02:00
457 lines
14 KiB
JavaScript
457 lines
14 KiB
JavaScript
|
|
import { create_object, merge_option } from "./common.js";
|
|
import normalize_polyfill from "./charset/polyfill.js";
|
|
import { EncoderOptions } from "./type.js";
|
|
|
|
/*
|
|
|
|
Custom Encoder
|
|
----------------
|
|
|
|
|
|
function englishEncoder(string){
|
|
return string.toLowerCase().split(/[^a-z]+/)
|
|
}
|
|
|
|
|
|
function chineseEncoder(string){
|
|
return string.replace(/\s+/, "").split("")
|
|
}
|
|
|
|
|
|
function fixedEncoder(string){
|
|
return [string]
|
|
}
|
|
|
|
Built-in Encoder
|
|
----------------------------
|
|
The main workflow follows an increasing strategy,
|
|
starting from a simple .toLowerCase() to full RegExp
|
|
Pipeline:
|
|
1. apply this.normalize (charset normalization)
|
|
applied on the whole input string e.g. lowercase,
|
|
everything you put later into (filter, matcher, stemmer, mapper, etc.)
|
|
has to be normalized by definition, because it won't apply to them automatically
|
|
2. apply this.prepare (custom function, string in - string out)
|
|
3 split numerics into triplets
|
|
4. split input into terms (by one of them: split/include/exclude)
|
|
5. pre-encoded term deduplication
|
|
6. apply this.filter (stop-words)
|
|
7. apply this.stemmer (replace term endings)
|
|
8. apply this.mapper (replace chars)
|
|
9. apply this.dedupe (letter deduplication)
|
|
10. apply this.matcher (replace terms)
|
|
11. apply this.replacer (custom regex)
|
|
12. post-encoded term deduplication
|
|
13. apply this.finalize (custom function, array in - array out)
|
|
*/
|
|
|
|
const whitespace = /[^\p{L}\p{N}]+/u,
|
|
numeric_split_length = /(\d{3})/g,
|
|
numeric_split_prev_char = /(\D)(\d{3})/g,
|
|
numeric_split_next_char = /(\d{3})(\D)/g,
|
|
normalize = /[\u0300-\u036f]/g;
|
|
|
|
|
|
/**
|
|
* @param {EncoderOptions=} options
|
|
* @constructor
|
|
*/
|
|
|
|
export default function Encoder(options = {}) {
|
|
|
|
if (!this || this.constructor !== Encoder) {
|
|
|
|
return new Encoder(...arguments);
|
|
}
|
|
|
|
if (arguments.length) {
|
|
for (let i = 0; i < arguments.length; i++) {
|
|
this.assign( /** @type {!EncoderOptions} */arguments[i]);
|
|
}
|
|
} else {
|
|
this.assign( /** @type {!EncoderOptions} */options);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @param {!EncoderOptions} options
|
|
*/
|
|
Encoder.prototype.assign = function (options) {
|
|
|
|
/**
|
|
* pre-processing string input
|
|
* @type {Function|boolean}
|
|
*/
|
|
this.normalize = /** @type {Function|boolean} */merge_option(options.normalize, !0, this.normalize);
|
|
|
|
let include = options.include,
|
|
tmp = include || options.exclude || options.split,
|
|
numeric;
|
|
|
|
|
|
if (tmp || "" === tmp) {
|
|
if ("object" == typeof tmp && tmp.constructor !== RegExp) {
|
|
let regex = "";
|
|
numeric = !include;
|
|
|
|
include || (regex += "\\p{Z}");
|
|
if (tmp.letter) {
|
|
regex += "\\p{L}";
|
|
}
|
|
if (tmp.number) {
|
|
regex += "\\p{N}";
|
|
numeric = !!include;
|
|
}
|
|
if (tmp.symbol) {
|
|
regex += "\\p{S}";
|
|
}
|
|
if (tmp.punctuation) {
|
|
regex += "\\p{P}";
|
|
}
|
|
if (tmp.control) {
|
|
regex += "\\p{C}";
|
|
}
|
|
if (tmp = tmp.char) {
|
|
regex += "object" == typeof tmp ? tmp.join("") : tmp;
|
|
}
|
|
|
|
try {
|
|
|
|
/**
|
|
* split string input into terms
|
|
* @type {string|RegExp|boolean|null}
|
|
*/
|
|
this.split = new RegExp("[" + (include ? "^" : "") + regex + "]+", "u");
|
|
} catch (e) {
|
|
console.error("Your split configuration:", tmp, "is not supported on this platform. It falls back to using simple whitespace splitter instead: /\s+/.");
|
|
|
|
|
|
this.split = /\s+/;
|
|
}
|
|
} else {
|
|
this.split = /** @type {string|RegExp|boolean} */tmp;
|
|
|
|
numeric = !1 === tmp || 2 > "a1a".split(tmp).length;
|
|
}
|
|
|
|
this.numeric = merge_option(options.numeric, numeric);
|
|
} else {
|
|
try {
|
|
|
|
this.split = /** @type {string|RegExp|boolean} */merge_option(this.split, whitespace);
|
|
} catch (e) {
|
|
console.warn("This platform does not support unicode regex. It falls back to using simple whitespace splitter instead: /\s+/.");
|
|
|
|
|
|
this.split = /\s+/;
|
|
}
|
|
|
|
this.numeric = merge_option(options.numeric, merge_option(this.numeric, !0));
|
|
}
|
|
|
|
/**
|
|
* post-processing terms
|
|
* @type {Function|null}
|
|
*/
|
|
this.prepare = /** @type {Function|null} */merge_option(options.prepare, null, this.prepare);
|
|
/**
|
|
* final processing
|
|
* @type {Function|null}
|
|
*/
|
|
this.finalize = /** @type {Function|null} */merge_option(options.finalize, null, this.finalize);
|
|
|
|
tmp = options.filter;
|
|
this.filter = "function" == typeof tmp ? tmp : merge_option(tmp && new Set(tmp), null, this.filter);
|
|
this.dedupe = merge_option(options.dedupe, !0, this.dedupe);
|
|
this.matcher = merge_option((tmp = options.matcher) && new Map(tmp), null, this.matcher);
|
|
this.mapper = merge_option((tmp = options.mapper) && new Map(tmp), null, this.mapper);
|
|
this.stemmer = merge_option((tmp = options.stemmer) && new Map(tmp), null, this.stemmer);
|
|
this.replacer = merge_option(options.replacer, null, this.replacer);
|
|
this.minlength = merge_option(options.minlength, 1, this.minlength);
|
|
this.maxlength = merge_option(options.maxlength, 1024, this.maxlength);
|
|
this.rtl = merge_option(options.rtl, !1, this.rtl);
|
|
|
|
this.cache = tmp = merge_option(options.cache, !0, this.cache);
|
|
if (tmp) {
|
|
this.timer = null;
|
|
this.cache_size = "number" == typeof tmp ? tmp : 2e5;
|
|
this.cache_enc = new Map();
|
|
this.cache_term = new Map();
|
|
this.cache_enc_length = 128;
|
|
this.cache_term_length = 128;
|
|
}
|
|
|
|
this.matcher_str = "";
|
|
this.matcher_test = null;
|
|
this.stemmer_str = "";
|
|
this.stemmer_test = null;
|
|
|
|
if (this.matcher) {
|
|
for (const key of this.matcher.keys()) {
|
|
this.matcher_str += (this.matcher_str ? "|" : "") + key;
|
|
}
|
|
}
|
|
if (this.stemmer) {
|
|
for (const key of this.stemmer.keys()) {
|
|
this.stemmer_str += (this.stemmer_str ? "|" : "") + key;
|
|
}
|
|
}
|
|
|
|
return this;
|
|
};
|
|
|
|
Encoder.prototype.addStemmer = function (match, replace) {
|
|
this.stemmer || (this.stemmer = new Map());
|
|
this.stemmer.set(match, replace);
|
|
this.stemmer_str += (this.stemmer_str ? "|" : "") + match;
|
|
this.stemmer_test = null;
|
|
this.cache && clear(this);
|
|
return this;
|
|
};
|
|
|
|
Encoder.prototype.addFilter = function (term) {
|
|
if ("function" == typeof term) {
|
|
|
|
this.filter = term;
|
|
} else {
|
|
this.filter || (this.filter = new Set());
|
|
this.filter.add(term);
|
|
}
|
|
this.cache && clear(this);
|
|
return this;
|
|
};
|
|
|
|
/**
|
|
* Replace a single char
|
|
* @param {string} char_match
|
|
* @param {string} char_replace
|
|
* @return {Encoder}
|
|
* @suppress {invalidCasts}
|
|
*/
|
|
Encoder.prototype.addMapper = function (char_match, char_replace) {
|
|
|
|
if ("object" == typeof char_match) {
|
|
return this.addReplacer( /** @type {RegExp} */char_match, char_replace);
|
|
}
|
|
|
|
if (1 < char_match.length) {
|
|
return this.addMatcher(char_match, char_replace);
|
|
}
|
|
this.mapper || (this.mapper = new Map());
|
|
this.mapper.set(char_match, char_replace);
|
|
this.cache && clear(this);
|
|
return this;
|
|
};
|
|
|
|
/**
|
|
* Replace a string
|
|
* @param {string} match
|
|
* @param {string} replace
|
|
* @return {Encoder}
|
|
* @suppress {invalidCasts}
|
|
*/
|
|
Encoder.prototype.addMatcher = function (match, replace) {
|
|
|
|
if ("object" == typeof match) {
|
|
return this.addReplacer( /** @type {RegExp} */match, replace);
|
|
}
|
|
|
|
if (2 > match.length && (this.dedupe || this.mapper)) {
|
|
return this.addMapper(match, replace);
|
|
}
|
|
this.matcher || (this.matcher = new Map());
|
|
this.matcher.set(match, replace);
|
|
this.matcher_str += (this.matcher_str ? "|" : "") + match;
|
|
this.matcher_test = null;
|
|
this.cache && clear(this);
|
|
return this;
|
|
};
|
|
|
|
/**
|
|
* @param {RegExp} regex
|
|
* @param {string} replace
|
|
* @return {Encoder}
|
|
* @suppress {invalidCasts}
|
|
*/
|
|
Encoder.prototype.addReplacer = function (regex, replace) {
|
|
if ("string" == typeof regex) {
|
|
return this.addMatcher( /** @type {string} */regex, replace);
|
|
}
|
|
this.replacer || (this.replacer = []);
|
|
this.replacer.push(regex, replace);
|
|
this.cache && clear(this);
|
|
return this;
|
|
};
|
|
|
|
/**
|
|
* @param {!string} str
|
|
* @param {boolean=} dedupe_terms Note: term deduplication will break the context chain
|
|
* @return {!Array<string>}
|
|
*/
|
|
Encoder.prototype.encode = function (str, dedupe_terms) {
|
|
|
|
if (this.cache && str.length <= this.cache_enc_length) {
|
|
if (this.timer) {
|
|
if (this.cache_enc.has(str)) {
|
|
return this.cache_enc.get(str);
|
|
}
|
|
} else {
|
|
this.timer = setTimeout(clear, 50, this);
|
|
}
|
|
}
|
|
|
|
if (this.normalize) {
|
|
if ("function" == typeof this.normalize) {
|
|
str = this.normalize(str);
|
|
} else if (normalize) {
|
|
str = str.normalize("NFKD").replace(normalize, "").toLowerCase();
|
|
} else {
|
|
str = str.toLowerCase();
|
|
}
|
|
}
|
|
|
|
if (this.prepare) {
|
|
str = this.prepare(str);
|
|
}
|
|
|
|
if (this.numeric && 3 < str.length) {
|
|
str = str.replace(numeric_split_prev_char, "$1 $2").replace(numeric_split_next_char, "$1 $2").replace(numeric_split_length, "$1 ");
|
|
}
|
|
|
|
const skip = !(this.dedupe || this.mapper || this.filter || this.matcher || this.stemmer || this.replacer);
|
|
let final = [],
|
|
dupes = create_object(),
|
|
last_term,
|
|
last_term_enc,
|
|
words = this.split || "" === this.split ? str.split( /** @type {string|RegExp} */this.split) : [str];
|
|
|
|
|
|
for (let i = 0, word, base; i < words.length; i++) {
|
|
|
|
if (!(word = base = words[i])) {
|
|
continue;
|
|
}
|
|
|
|
if (word.length < this.minlength || word.length > this.maxlength) {
|
|
continue;
|
|
}
|
|
|
|
if (dedupe_terms) {
|
|
if (dupes[word]) {
|
|
continue;
|
|
}
|
|
dupes[word] = 1;
|
|
} else {
|
|
if (last_term === word) {
|
|
continue;
|
|
}
|
|
last_term = word;
|
|
}
|
|
|
|
if (skip) {
|
|
final.push(word);
|
|
continue;
|
|
}
|
|
|
|
if (this.filter && ("function" == typeof this.filter ? !this.filter(word) : this.filter.has(word))) {
|
|
continue;
|
|
}
|
|
|
|
if (this.cache && word.length <= this.cache_term_length) {
|
|
if (this.timer) {
|
|
const tmp = this.cache_term.get(word);
|
|
if (tmp || "" === tmp) {
|
|
tmp && final.push(tmp);
|
|
continue;
|
|
}
|
|
} else {
|
|
this.timer = setTimeout(clear, 50, this);
|
|
}
|
|
}
|
|
|
|
if (this.stemmer) {
|
|
|
|
this.stemmer_test || (this.stemmer_test = new RegExp("(?!^)(" + this.stemmer_str + ")$"));
|
|
|
|
let old;
|
|
|
|
while (old !== word && 2 < word.length) {
|
|
old = word;
|
|
word = word.replace(this.stemmer_test, match => this.stemmer.get(match));
|
|
}
|
|
}
|
|
|
|
if (word && (this.mapper || this.dedupe && 1 < word.length)) {
|
|
let final = "";
|
|
for (let i = 0, prev = "", char, tmp; i < word.length; i++) {
|
|
char = word.charAt(i);
|
|
if (char !== prev || !this.dedupe) {
|
|
tmp = this.mapper && this.mapper.get(char);
|
|
if (!tmp && "" !== tmp) final += prev = char;else if ((tmp !== prev || !this.dedupe) && (prev = tmp)) final += tmp;
|
|
}
|
|
}
|
|
word = final;
|
|
}
|
|
|
|
if (this.matcher && 1 < word.length) {
|
|
this.matcher_test || (this.matcher_test = new RegExp("(" + this.matcher_str + ")", "g"));
|
|
word = word.replace(this.matcher_test, match => this.matcher.get(match));
|
|
}
|
|
|
|
if (word && this.replacer) {
|
|
for (let i = 0; word && i < this.replacer.length; i += 2) {
|
|
word = word.replace(this.replacer[i], this.replacer[i + 1]);
|
|
}
|
|
}
|
|
|
|
if (this.cache && base.length <= this.cache_term_length) {
|
|
this.cache_term.set(base, word);
|
|
if (this.cache_term.size > this.cache_size) {
|
|
this.cache_term.clear();
|
|
this.cache_term_length = 0 | this.cache_term_length / 1.1;
|
|
}
|
|
}
|
|
|
|
if (word) {
|
|
if (word !== base) {
|
|
if (dedupe_terms) {
|
|
if (dupes[word]) {
|
|
continue;
|
|
}
|
|
dupes[word] = 1;
|
|
} else {
|
|
if (last_term_enc === word) {
|
|
continue;
|
|
}
|
|
last_term_enc = word;
|
|
}
|
|
}
|
|
final.push(word);
|
|
}
|
|
}
|
|
|
|
if (this.finalize) {
|
|
final = this.finalize(final) || final;
|
|
}
|
|
|
|
if (this.cache && str.length <= this.cache_enc_length) {
|
|
this.cache_enc.set(str, final);
|
|
if (this.cache_enc.size > this.cache_size) {
|
|
this.cache_enc.clear();
|
|
this.cache_enc_length = 0 | this.cache_enc_length / 1.1;
|
|
}
|
|
}
|
|
|
|
return final;
|
|
};
|
|
|
|
export function fallback_encoder(str) {
|
|
return str.normalize("NFKD").replace(normalize, "").toLowerCase().trim().split(/\s+/);
|
|
}
|
|
|
|
/**
|
|
* @param {Encoder} self
|
|
*/
|
|
function clear(self) {
|
|
self.timer = null;
|
|
self.cache_enc.clear();
|
|
self.cache_term.clear();
|
|
} |