From 91aff88510478e6d32fa76defe20457889da5f48 Mon Sep 17 00:00:00 2001
From: Thomas Wilkerling <thomas@nextapps.de>
Date: Sun, 6 Apr 2025 14:16:20 +0200
Subject: [PATCH] readme encoder options

---
 README.md      |   6 +-
 doc/encoder.md | 197 +++++++++++++++++++++++++++++++++++++++++++++----
 doc/options.md |  71 ------------------
 doc/worker.md  |   4 +-
 src/encoder.js |   5 +-
 5 files changed, 191 insertions(+), 92 deletions(-)
diff --git a/README.md b/README.md
index 5db3164..56f025d 100644
--- a/README.md
+++ b/README.md
@@ -240,7 +240,7 @@ Extern Projects & Plugins:
   - [Document Options](doc/options.md)
   - [Worker Index Options](doc/worker.md#worker-index-options)
   - [Persistent Options](doc/options.md)
-  - [Encoder Options](doc/options.md)
+  - [Encoder Options](doc/encoder.md#encoder-options)
   - [Resolver Options](doc/options.md)
 - [Presets](#presets)
 - [Context Search](#context-search)
@@ -1133,9 +1133,9 @@ index.remove(0).update(1, 'foo').add(2, 'foobar');
     <tr>
         <td>tokenize</td>
         <td>
-            "strict" or "exact"<br>
+            "strict" / "exact"<br>
             "forward"<br>
-            "reverse" or "bidirectional<br>
+            "reverse" / "bidirectional"<br>
             "full"
         </td>
         <td>
diff --git a/doc/encoder.md b/doc/encoder.md
index 7b53ab1..37ed804 100644
--- a/doc/encoder.md
+++ b/doc/encoder.md
@@ -252,91 +252,260 @@ const index = new Index({
 });
 ```
 
-### Property Overview
+## Encoder Options
 
 <table>
     <tr></tr>
     <tr>
-        <th align="left">Property</th>
-        <th width="50%" align="left">Description</th>
-        <th align="left">Values</th>
+        <td>Option</td>
+        <td>Values</td>
+        <td>Description</td>
+        <td>Default</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td colspan="4">You can choose only one of these 3 options:</td>
     </tr>
     <tr>
-        <td><code>normalize</code></td>
-        <td>The normalization stage will simplify the input content e.g. by replacing "é" to "e"</td>
+        <td><code>include</code></td>
         <td>
-            <code>true</code> enable normalization (default)
+            <a href="#encoder-split-options">Encoder Split Options</a>
+        </td>
+        <td>Whitelist of character classes that are kept when splitting the content into terms</td>
+        <td>{ letter: true, number: true }</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>exclude</code></td>
+        <td>
+            <a href="#encoder-split-options">Encoder Split Options</a>
+        </td>
+        <td>Blacklist of character classes that are removed when splitting the content into terms</td>
+        <td>false</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>split</code></td>
+        <td>
+            false<br>
+            RegExp<br>
+            String<br>
+            <a href="#encoder-split-options">Encoder Split Options</a>
+        </td>
+        <td>
+            The expression used to split the content into terms
+        </td>
+        <td>→ include { letter: true, number: true }</td>
+    </tr>
+    <tr>
+        <td colspan="4">Other options:</td>
+    </tr>
+    <tr>
+        <td><code>dedupe</code></td>
+        <td>
+            Boolean
+        </td>
+        <td>Deduplicate consecutive letters, e.g. "missing" to "mising"</td>
+        <td>true</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>numeric</code></td>
+        <td>
+            Boolean
+        </td>
+        <td>By default, the extended numeric support (Triplets) is inherited from the chosen <a href="#encoder-split-options">Encoder Split Options</a>. In some cases you might want to disable Triplets to get more exact results (fewer entries).</td>
+        <td>true</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>minlength</code></td>
+        <td>
+            Number
+        </td>
+        <td>Set the minimum length a term must have to be added to the index. This limit does not apply to the <code>forward</code> tokenizer: typing just "f" still matches the term "flexsearch" even when e.g. <code>minlength: 4</code> is used.</td>
+        <td>1</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>maxlength</code></td>
+        <td>
+            Number
+        </td>
+        <td>Set the maximum length a term may have to be added to the index. Longer terms will be dropped.</td>
+        <td>1024</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>rtl</code></td>
+        <td>
+            Boolean
+        </td>
+        <td>Force Right-To-Left encoding (you should only apply this when the string content is not already encoded as RTL)</td>
+        <td>false</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>normalize</code></td>
+        <td>
+            <code>true</code> enable normalization (default)<br>
             <code>false</code> disable normalization<br>
             <code>function(str) => str</code> custom function
         </td>
+        <td>The normalization stage applies basic charset normalization, e.g. by replacing "é" with "e"</td>
+        <td>true</td>
     </tr>
     <tr></tr>
     <tr>
         <td><code>prepare</code></td>
-        <td>The preparation stage is a custom function direct followed when normalization was done</td>
         <td>
             <code>function(str) => str</code> custom function
         </td>
+        <td>The preparation stage is a custom function executed directly after the normalization stage</td>
+        <td>false</td>
     </tr>
     <tr></tr>
     <tr>
         <td><code>finalize</code></td>
-        <td>The finalization stage is a custom function executed at the last task in the encoding pipeline (here it gets an array of tokens and need to return an array of tokens)</td>
         <td>
             <code>function([str]) => [str]</code> custom function
         </td>
+        <td>The finalization stage is a custom function executed as the last task in the encoding pipeline (it receives an array of tokens and needs to return an array of tokens)</td>
+        <td>false</td>
     </tr>
     <tr></tr>
     <tr>
         <td><code>filter</code></td>
-        <td>Stop-word filter is like a blacklist of words to be filtered out from indexing at all (e.g. "and", "to" or "be"). This is also very useful when using <a href="../README.md#context-search">Context Search</a></td>
         <td>
             <code>Set(["and", "to", "be"])</code><br>
             <code>function(str) => bool</code> custom function<h2></h2>
             <code>encoder.addFilter("and")</code>
         </td>
+        <td>Stop-word filter is like a blacklist of words to be filtered out from indexing at all (e.g. "and", "to" or "be"). This is also very useful when using <a href="../README.md#context-search">Context Search</a></td>
+        <td>false</td>
     </tr>
     <tr></tr>
     <tr>
         <td><code>stemmer</code></td>
-        <td>Stemmer will normalize several linguistic mutations of the same word (e.g. "run" and "running", or "property" and "properties"). This is also very useful when using <a href="../README.md#context-search">Context Search</a></td>
         <td>
             <code>Map([["ing", ""], ["ies", "y"]])</code><h2></h2>
             <code>encoder.addStemmer("ing", "")</code>
         </td>
+        <td>Stemmer will normalize several linguistic mutations of the same word (e.g. "run" and "running", or "property" and "properties"). This is also very useful when using <a href="../README.md#context-search">Context Search</a></td>
+        <td>false</td>
     </tr>
     <tr></tr>
     <tr>
         <td><code>mapper</code></td>
-        <td>Mapper will replace a single char (e.g. "é" into "e")</td>
         <td>
             <code>Map([["é", "e"], ["ß", "ss"]])</code><h2></h2>
             <code>encoder.addMapper("é", "e")</code>
         </td>
+        <td>Mapper will replace a single char (e.g. "é" into "e")</td>
+        <td>false</td>
     </tr>
     <tr></tr>
     <tr>
         <td><code>matcher</code></td>
-        <td>Matcher will do same as Mapper but instead of single chars it will replace char sequences</td>
         <td>
             <code>Map([["and", "&"], ["usd", "$"]])</code><h2></h2>
             <code>encoder.addMatcher("and", "&")</code>
         </td>
+        <td>Matcher will do same as Mapper but instead of single chars it will replace char sequences</td>
+        <td>false</td>
     </tr>
     <tr></tr>
     <tr>
         <td><code>replacer</code></td>
-        <td>Replacer takes custom regular expressions and couldn't get optimized in the same way as Mapper or Matcher. You should take this as the last option when no other replacement can do the same.</td>
         <td>
             <code>[/[^a-z0-9]/g, "", /([^aeo])h(.)/g, "$1$2"])</code><h2></h2>
             <code>encoder.addReplacer(/[^a-z0-9]/g, "")</code>
         </td>
+        <td>Replacer takes custom regular expressions and couldn't get optimized in the same way as Mapper or Matcher. You should take this as the last option when no other replacement can do the same.</td>
+        <td>false</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>cache</code></td>
+        <td>
+            Boolean
+        </td>
+        <td>In some very rare situations (large consecutive content with high cardinality) it might be useful to disable the internal event-loop-cache</td>
+        <td>true</td>
     </tr>
 </table>
 
 > [!TIP]
 > The methods `.addMapper()`, `.addMatcher()` and `.addReplacer()` might be confusing. For this reason they will automatically resolve to the right one when just using the same method for every rule. You can simplify this e.g. by just use `.addReplacer()` for each of this 3 rules.
 
+### Encoder Split Options
+
+<table>
+    <tr></tr>
+    <tr>
+        <td>Option</td>
+        <td>Values</td>
+        <td>Description</td>
+        <td>Default</td>
+    </tr>
+    <tr>
+        <td><code>letter</code></td>
+        <td>
+            Boolean
+        </td>
+        <td>Toggle inclusion of letters on/off</td>
+        <td>true</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>number</code></td>
+        <td>
+            Boolean
+        </td>
+        <td>Toggle inclusion of numerics on/off</td>
+        <td>true</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>symbol</code></td>
+        <td>
+            Boolean
+        </td>
+        <td>Toggle inclusion of symbols on/off</td>
+        <td>false</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>punctuation</code></td>
+        <td>
+            Boolean
+        </td>
+        <td>
+            Toggle inclusion of punctuation on/off
+        </td>
+        <td>false</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>control</code></td>
+        <td>
+            Boolean
+        </td>
+        <td>Toggle inclusion of control chars on/off</td>
+        <td>false</td>
+    </tr>
+    <tr></tr>
+    <tr>
+        <td><code>char</code></td>
+        <td>
+            String<br>
+            Array[String]
+        </td>
+        <td>Toggle inclusion of specific chars on/off</td>
+        <td>false</td>
+    </tr>
+</table>
+
 ## Custom Encoder
 
 Since it is very simple to create a custom Encoder, you are welcome to create your own.
diff --git a/doc/options.md b/doc/options.md
index 87f7242..c3224b7 100644
--- a/doc/options.md
+++ b/doc/options.md
@@ -38,77 +38,6 @@
     </tr>
 </table>
 
-## Encoder Options
-
-<table>
-    <tr><td colspan="4"></td></tr>
-    <tr>
-        <td>Option</td>
-        <td>Values</td>
-        <td>Description</td>
-        <td>Default</td>
-    </tr>
-    <tr>
-        <td>split<br><br></td>
-        <td>
-            false<br>
-            RegExp<br>
-            String
-        </td>
-        <td vertical-align="top">
-            The rule to split words when using non-custom tokenizer (<a href="#tokenizer">built-ins</a> e.g. "forward"). Use a string/char or use a regular expression (default: <code>/\W+/</code>).<br>
-        </td>
-        <td><code>/[\W_]+/</code></td>
-    </tr>
-    <tr></tr>
-    <tr>
-        <td>rtl<br></td>
-        <td>
-            Boolean
-        </td>
-        <td>Enables Right-To-Left encoding.</td>
-        <td>false</td>
-    </tr>
-    <tr></tr>
-    <tr>
-        <td>encode<br></td>
-        <td>
-            function(str) => [words]
-        </td>
-        <td>The custom encoding function.</td>
-        <td>/lang/latin/default.js</td>
-    </tr>
-    <tr>
-        <td>stemmer<br><br><br></td>
-        <td>
-            false<br>
-            String<br>
-            Function
-        </td>
-        <td>Disable or pass in language shorthand flag (ISO-3166) or a custom object.
-    </tr>
-    <tr></tr>
-    <tr>
-        <td>filter<br><br><br></td>
-        <td>
-            false<br>
-            String<br>
-            Function
-        </td>
-        <td>Disable or pass in language shorthand flag (ISO-3166) or a custom array.</td>
-    </tr>
-    <tr></tr>
-    <tr>
-        <td>matcher<br><br><br></td>
-        <td>
-            false<br>
-            String<br>
-            Function
-        </td>
-        <td>Disable or pass in language shorthand flag (ISO-3166) or a custom array.</td>
-    </tr>
-</table>
-
 ## Search Options
 
 <table>
diff --git a/doc/worker.md b/doc/worker.md
index 100d8da..006e3ca 100644
--- a/doc/worker.md
+++ b/doc/worker.md
@@ -424,6 +424,6 @@ await Promise.all(files.map(async file => {
 
 ## CSP-friendly Worker (Browser)
 
-When just using worker by passing the option `worker: true`, the worker will be created by code generation under the hood. This might have issues when using strict CSP settings.
+When using a worker via one of the bundled versions (e.g. `flexsearch.bundle.min.js`), the worker will be created by code generation under the hood. This might cause issues when using strict CSP settings.
 
-You can overcome this issue by passing the filepath to the worker file like `worker: "./worker.js"`. The original worker file is located at `src/worker/worker.js`.
\ No newline at end of file
+You can overcome this issue by using the non-bundled versions e.g. `dist/module/` or by passing the filepath to the worker file instead of `true` like `worker: "dist/module/worker/worker.js"`.
\ No newline at end of file
diff --git a/src/encoder.js b/src/encoder.js
index 6338d31..1242470 100644
--- a/src/encoder.js
+++ b/src/encoder.js
@@ -217,7 +217,7 @@ Encoder.prototype.assign = function(options){
     this.stemmer = merge_option((tmp = options.stemmer) && new Map(tmp), null, this.stemmer);
     this.replacer = merge_option(options.replacer, null, this.replacer);
     this.minlength = merge_option(options.minlength, 1, this.minlength);
-    this.maxlength = merge_option(options.maxlength, 0, this.maxlength);
+    this.maxlength = merge_option(options.maxlength, 1024, this.maxlength);
     this.rtl = merge_option(options.rtl, false, this.rtl);
 
     // auto-balanced cache
@@ -427,7 +427,8 @@ Encoder.prototype.encode = function(str){
         if(!(word = base = words[i])){
             continue;
         }
-        if(word.length < this.minlength){
+        if(word.length < this.minlength ||
+           word.length > this.maxlength){
             continue;
         }
         if(skip) {

Property	Description	Values	Option	Values	Description	Default
You can just choose one of those 3 options:
`normalize`	The normalization stage will simplify the input content e.g. by replacing "é" to "e"	`include`	- `true` enable normalization (default) + Encoder Split Options +	Deduplicate following letters, e.g. "missing" to "mising"	{ letter: true, number: true }
`exclude`	+ Encoder Split Options +	Deduplicate following letters, e.g. "missing" to "mising"	false
`split`	+ false + RegExp + String + Encoder Split Options +	+ The expression used to split the content into terms +	→ include { letter: true, number: true }
Other options:
`dedupe`	+ Boolean +	Deduplicate consecutive letters, e.g. "missing" to "mising"	true
`numeric`	+ Boolean +	By default, the extended numeric support (Triplets) inherits from chosen Encoder Split Options. You probably might want to disable Triplets to get a more exact result (fewer entries) in some cases.	true
`minlength`	+ Number +	Set the minimum term length which should be added to the index. This limit does not apply to the `forward` tokenizer. You still get results when just typing "f" on a term "flexsearch" when e.g. `minlength: 4` was used.	1
`maxlength`	+ Number +	Set the maximum term length which should be added to the index. Longer terms will be dropped.	1024
`rtl`	+ Boolean +	Force Right-To-Left encoding (you should just apply this when the string content was not already encoded as RTL)	false
`normalize`	+ `true` enable normalization (default) `false` disable normalization `function(str) => str` custom function	The normalization stage will apply basic charset normalization e.g. by replacing "é" to "e"	true
`prepare`	The preparation stage is a custom function direct followed when normalization was done	`function(str) => str` custom function	The preparation stage is a custom function direct followed when normalization was done	false
`finalize`	The finalization stage is a custom function executed at the last task in the encoding pipeline (here it gets an array of tokens and need to return an array of tokens)	`function([str]) => [str]` custom function	The finalization stage is a custom function executed at the last task in the encoding pipeline (here it gets an array of tokens and need to return an array of tokens)	false
`filter`	Stop-word filter is like a blacklist of words to be filtered out from indexing at all (e.g. "and", "to" or "be"). This is also very useful when using Context Search	`Set(["and", "to", "be"])` `function(str) => bool` custom function `encoder.addFilter("and")`	Stop-word filter is like a blacklist of words to be filtered out from indexing at all (e.g. "and", "to" or "be"). This is also very useful when using Context Search	false
`stemmer`	Stemmer will normalize several linguistic mutations of the same word (e.g. "run" and "running", or "property" and "properties"). This is also very useful when using Context Search	`Map([["ing", ""], ["ies", "y"]])` `encoder.addStemmer("ing", "")`	Stemmer will normalize several linguistic mutations of the same word (e.g. "run" and "running", or "property" and "properties"). This is also very useful when using Context Search	false
`mapper`	Mapper will replace a single char (e.g. "é" into "e")	`Map([["é", "e"], ["ß", "ss"]])` `encoder.addMapper("é", "e")`	Mapper will replace a single char (e.g. "é" into "e")	false
`matcher`	Matcher will do same as Mapper but instead of single chars it will replace char sequences	`Map([["and", "&"], ["usd", "$"]])` `encoder.addMatcher("and", "&")`	Matcher will do same as Mapper but instead of single chars it will replace char sequences	false
`replacer`	Replacer takes custom regular expressions and couldn't get optimized in the same way as Mapper or Matcher. You should take this as the last option when no other replacement can do the same.	`[/[^a-z0-9]/g, "", /([^aeo])h(.)/g, "$1$2"])` `encoder.addReplacer(/[^a-z0-9]/g, "")`	Replacer takes custom regular expressions and couldn't get optimized in the same way as Mapper or Matcher. You should take this as the last option when no other replacement can do the same.	false
`cache`	+ Boolean +	In some very rare situations (large consecutive content with high cardinality) it might be useful to disable the internal event-loop-cache	true