From 91aff88510478e6d32fa76defe20457889da5f48 Mon Sep 17 00:00:00 2001 From: Thomas Wilkerling Date: Sun, 6 Apr 2025 14:16:20 +0200 Subject: [PATCH] readme encoder options --- README.md | 6 +- doc/encoder.md | 197 +++++++++++++++++++++++++++++++++++++++++++++---- doc/options.md | 71 ------------------ doc/worker.md | 4 +- src/encoder.js | 5 +- 5 files changed, 191 insertions(+), 92 deletions(-) diff --git a/README.md b/README.md index 5db3164..56f025d 100644 --- a/README.md +++ b/README.md @@ -240,7 +240,7 @@ Extern Projects & Plugins: - [Document Options](doc/options.md) - [Worker Index Options](doc/worker.md#worker-index-options) - [Persistent Options](doc/options.md) - - [Encoder Options](doc/options.md) + - [Encoder Options](doc/encoder.md#encoder-options) - [Resolver Options](doc/options.md) - [Presets](#presets) - [Context Search](#context-search) @@ -1133,9 +1133,9 @@ index.remove(0).update(1, 'foo').add(2, 'foobar'); tokenize - "strict" or "exact"
+ "strict" / "exact"
"forward"
- "reverse" or "bidirectional
+ "reverse" / "bidirectional"
"full" diff --git a/doc/encoder.md b/doc/encoder.md index 7b53ab1..37ed804 100644 --- a/doc/encoder.md +++ b/doc/encoder.md @@ -252,91 +252,260 @@ const index = new Index({ }); ``` -### Property Overview +## Encoder Options - - - + + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + + - + + - + + - + + - + + - + + - + + + + + + + + +
PropertyDescriptionValuesOptionValuesDescriptionDefault
You can just choose one of those 3 options:
normalizeThe normalization stage will simplify the input content e.g. by replacing "é" to "e"include - true enable normalization (default) + Encoder Split Options + Choose the character classes to keep when splitting the content into terms{ letter: true, number: true }
exclude + Encoder Split Options + Choose the character classes to drop when splitting the content into termsfalse
split + false
+ RegExp
+ String
+ Encoder Split Options +
+ The expression used to split the content into terms + → include { letter: true, number: true }
Other options:
dedupe + Boolean + Deduplicate consecutive letters, e.g. "missing" to "mising"true
numeric + Boolean + By default, the extended numeric support (Triplets) inherits from the chosen Encoder Split Options. In some cases you might want to disable Triplets to get more exact results (fewer entries).true
minlength + Number + Set the minimum term length which should be added to the index. This limit does not apply to the `forward` tokenizer. You still get results when just typing "f" on a term "flexsearch" when e.g. `minlength: 4` was used.1
maxlength + Number + Set the maximum term length which should be added to the index. Terms longer than this limit will be dropped.1024
rtl + Boolean + Force Right-To-Left encoding (you should just apply this when the string content was not already encoded as RTL)false
normalize + true enable normalization (default)
false disable normalization
function(str) => str custom function
The normalization stage will apply basic charset normalization e.g. by replacing "é" to "e"true
prepareThe preparation stage is a custom function direct followed when normalization was done function(str) => str custom function The preparation stage is a custom function direct followed when normalization was donefalse
finalizeThe finalization stage is a custom function executed at the last task in the encoding pipeline (here it gets an array of tokens and need to return an array of tokens) function([str]) => [str] custom function The finalization stage is a custom function executed at the last task in the encoding pipeline (here it gets an array of tokens and need to return an array of tokens)false
filterStop-word filter is like a blacklist of words to be filtered out from indexing at all (e.g. "and", "to" or "be"). This is also very useful when using Context Search Set(["and", "to", "be"])
function(str) => bool custom function

encoder.addFilter("and")
Stop-word filter is like a blacklist of words to be filtered out from indexing at all (e.g. "and", "to" or "be"). This is also very useful when using Context Searchfalse
stemmerStemmer will normalize several linguistic mutations of the same word (e.g. "run" and "running", or "property" and "properties"). This is also very useful when using Context Search Map([["ing", ""], ["ies", "y"]])

encoder.addStemmer("ing", "")
Stemmer will normalize several linguistic mutations of the same word (e.g. "run" and "running", or "property" and "properties"). This is also very useful when using Context Searchfalse
mapperMapper will replace a single char (e.g. "é" into "e") Map([["é", "e"], ["ß", "ss"]])

encoder.addMapper("é", "e")
Mapper will replace a single char (e.g. "é" into "e")false
matcherMatcher will do same as Mapper but instead of single chars it will replace char sequences Map([["and", "&"], ["usd", "$"]])

encoder.addMatcher("and", "&")
Matcher will do same as Mapper but instead of single chars it will replace char sequencesfalse
replacerReplacer takes custom regular expressions and couldn't get optimized in the same way as Mapper or Matcher. You should take this as the last option when no other replacement can do the same. [/[^a-z0-9]/g, "", /([^aeo])h(.)/g, "$1$2"])

encoder.addReplacer(/[^a-z0-9]/g, "")
Replacer takes custom regular expressions and couldn't get optimized in the same way as Mapper or Matcher. You should take this as the last option when no other replacement can do the same.false
cache + Boolean + In some very rare situations (large consecutive content with high cardinality) it might be useful to disable the internal event-loop-cachetrue
> [!TIP] > The methods `.addMapper()`, `.addMatcher()` and `.addReplacer()` might be confusing. For this reason they will automatically resolve to the right one when just using the same method for every rule. You can simplify this e.g. by just use `.addReplacer()` for each of this 3 rules. +### Encoder Split Options + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
OptionValuesDescriptionDefault
letter + Boolean + Toggle inclusion of letters on/offtrue
number + Boolean + Toggle inclusion of numerics on/offtrue
symbol + Boolean + Toggle inclusion of symbols on/offfalse
punctuation + Boolean + + Toggle inclusion of punctuation on/off + false
control + Boolean + Toggle inclusion of control chars on/offfalse
char + String
+ Array[String] +
Toggle inclusion of specific chars on/offfalse
+ ## Custom Encoder Since it is very simple to create a custom Encoder, you are welcome to create your own. diff --git a/doc/options.md b/doc/options.md index 87f7242..c3224b7 100644 --- a/doc/options.md +++ b/doc/options.md @@ -38,77 +38,6 @@ -## Encoder Options - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
OptionValuesDescriptionDefault
split

- false
- RegExp
- String -
- The rule to split words when using non-custom tokenizer (built-ins e.g. "forward"). Use a string/char or use a regular expression (default: /\W+/).
-
/[\W_]+/
rtl
- Boolean - Enables Right-To-Left encoding.false
encode
- function(str) => [words] - The custom encoding function./lang/latin/default.js
stemmer


- false
- String
- Function -
Disable or pass in language shorthand flag (ISO-3166) or a custom object. -
filter


- false
- String
- Function -
Disable or pass in language shorthand flag (ISO-3166) or a custom array.
matcher


- false
- String
- Function -
Disable or pass in language shorthand flag (ISO-3166) or a custom array.
- ## Search Options diff --git a/doc/worker.md b/doc/worker.md index 100d8da..006e3ca 100644 --- a/doc/worker.md +++ b/doc/worker.md @@ -424,6 +424,6 @@ await Promise.all(files.map(async file => { ## CSP-friendly Worker (Browser) -When just using worker by passing the option `worker: true`, the worker will be created by code generation under the hood. This might have issues when using strict CSP settings. +When using worker via one of the bundled versions (e.g. `flexsearch.bundle.min.js`), the worker will be created by code generation under the hood. This might have issues when using strict CSP settings. -You can overcome this issue by passing the filepath to the worker file like `worker: "./worker.js"`. The original worker file is located at `src/worker/worker.js`. \ No newline at end of file +You can overcome this issue by using the non-bundled versions e.g. `dist/module/` or by passing the filepath to the worker file instead of `true` like `worker: "dist/module/worker/worker.js"`. \ No newline at end of file diff --git a/src/encoder.js b/src/encoder.js index 6338d31..1242470 100644 --- a/src/encoder.js +++ b/src/encoder.js @@ -217,7 +217,7 @@ Encoder.prototype.assign = function(options){ this.stemmer = merge_option((tmp = options.stemmer) && new Map(tmp), null, this.stemmer); this.replacer = merge_option(options.replacer, null, this.replacer); this.minlength = merge_option(options.minlength, 1, this.minlength); - this.maxlength = merge_option(options.maxlength, 0, this.maxlength); + this.maxlength = merge_option(options.maxlength, 1024, this.maxlength); this.rtl = merge_option(options.rtl, false, this.rtl); // auto-balanced cache @@ -427,7 +427,8 @@ Encoder.prototype.encode = function(str){ if(!(word = base = words[i])){ continue; } - if(word.length < this.minlength){ + if(word.length < this.minlength || + word.length > this.maxlength){ continue; } if(skip) {