diff --git a/CHANGELOG.md b/CHANGELOG.md index 681c21a..789ab40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,34 @@ # Changelog +### v0.8.0 + +- Persistent index support for: `IndexedDB` (Browser), `Redis`, `SQLite`, `Postgres`, `MongoDB`, `Clickhouse` +- Enhanced language customization via the new `Encoder` class +- Result Highlighting +- Query performance achieves results up to 4.5 times faster compared to the previous generation v0.7.x while also improving the quality of results +- Enhanced support for larger indexes or larger result sets +- Improved offset and limit processing achieves up to 100 times faster traversal performance through large datasets +- Support for larger In-Memory index with extended key size (the default maximum keystore limit is: 2^24) +- Greatly enhanced performance of the whole text encoding pipeline +- Improved indexing of numeric content (Triplets) +- Intermediate result sets and `Resolver` +- Basic Resolver: `and`, `or`, `xor`, `not`, `limit`, `offset`, `boost`, `resolve` +- Improved charset collection +- New charset preset `soundex` which further reduces memory consumption while also increasing "fuzziness" +- Performance gain when polling tasks to the index by using "Event-Loop-Caches" +- Up to 100 times faster deletion/replacement when not using the additional "fastupdate" register +- Regex Pre-Compilation (transforms hundreds of regex rules into just a few) +- Extended support for multiple tags (DocumentIndex) +- Custom Fields ("Virtual Fields") +- Custom Filter +- Custom Score Function +- Added French language preset (stop-word filter, stemmer) +- Enhanced Worker Support +- Export / Import index in chunks +- Improved Build System + Bundler (Supported: CommonJS, ESM, Global Namespace), also the import of language packs is now supported for Node.js +- Full covering index.d.ts type definitions +- Fast-Boot Serialization optimized for Server-Side-Rendering (PHP, Python, Ruby, Rust, Java, Go, Node.js, ...) 
+ ### v0.7.0 - Bidirectional Context (the order of words can now vary, does not increase memory when using bidirectional context) diff --git a/README.md b/README.md index 51d09e7..37d16c7 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +<<<<<<< HEAD # FlexSearch v0.8 (Preview) ```bash @@ -1046,108 +1047,88 @@ const result = new Resolver({ .not({ query: "some query" }) .resolve(100); ``` +======= +FlexSearch v0.8: [Overview and Migration Guide](doc/0.8.0.md) +
+Feature | +flexsearch.bundle.js | +flexsearch.compact.js | +flexsearch.light.js | +
+ Presets + | +✓ | +✓ | +✓ | +
+ Async Search + | +✓ | +✓ | +- | +
+ Workers (Web + Node.js) + | +✓ | +- | +- | +
+ Contextual Indexes + | +✓ | +✓ | +✓ | +
+ Document Search + | +✓ | +✓ | +- | +
+ Document Store + | +✓ | +✓ | +- | +
+ Partial Matching + | +✓ | +✓ | +✓ | +
+ Relevance Scoring + | +✓ | +✓ | +✓ | +
+ Auto-Balanced Cache by Popularity/Last Queries + | +✓ | +- | +- | +
+ Tag Search + | +✓ | +- | +- | +
+ Suggestions + | +✓ | +✓ | +✓ | +
+ Phonetic Match (Fuzzy Search) + | +✓ | +✓ | +- | +
Encoder | +✓ | +✓ | +✓ | +
Export / Import Indexes | +✓ | +✓ | +- | +
Resolver | +✓ | +- | +- | +
Persistent Index (IndexedDB) | +✓ | +- | +- | +
File Size (gzip) | +14.0 kb | +9.0 kb | +4.4 kb | +
Rank | +Library | +Memory | +Query: Single | +Query: Multi | +Query: Large | +Query: Not Found | +|
1 | +FlexSearch | +4 | +60129740 | +26512159 | +17737877 | +66410956 | +|
2 | +JSii | +27 | +6564 | +158149 | +61290 | +534109 | +|
3 | +Wade | +424 | +20471 | +78780 | +16693 | +213754 | +|
4 | +JS Search | +193 | +8221 | +64034 | +10377 | +167605 | +|
5 | +Elasticlunr.js | +646 | +5412 | +7573 | +2865 | +13982 | +|
7 | +MiniSearch | +24348 | +4406 | +10945 | +72 | +17624 | +|
8 | +bm25 | +15719 | +1429 | +789 | +366 | +1823 | +|
9 | +Lunr.js | +2219 | +255 | +271 | +272 | +267 | +|
10 | +FuzzySearch | +157373 | +53 | +38 | +15 | +43 | +|
11 | +Fuse | +7641904 | +6 | +2 | +1 | +3 | +
Flag | +Option | Values | -Info | +Description | +Default | |
Feature Flags |
+ preset | +
+ "memory" + "performance" + "match" + "score" + "default" + |
+
+ The configuration profile as a shortcut or as a base for your custom settings. + |
+ "default" | +||
tokenize | +
+ "strict" + "forward" + "reverse" + "full" + |
+
+ The indexing mode (tokenizer). Choose one of the built-ins or pass a custom tokenizer function. + |
+ "strict" | +|||
cache | +
+ Boolean + Number + |
+ Enable/Disable and/or set capacity of cached entries. When passing a number as a limit, the cache automatically balances stored entries according to their popularity. Note: When just using "true", the cache has no limit and grows unbounded. |
+ false | +|||
resolution | ++ Number + | +Sets the scoring resolution (default: 9). | +9 | +|||
context | +
+ Boolean + Context Options + |
+ Enable/Disable contextual indexing. When passing "true" as value it will take the default values for the context. | +false | +|||
optimize | ++ Boolean + | +When enabled it uses a memory-optimized stack flow for the index. | +true | +|||
boost | ++ function(arr, str, int) => float + | +A custom boost function used when indexing contents to the index. The function has this signature: Function(words[], term, index) => Float . It has 3 parameters where you get an array of all words, the current term and the current index where the term is placed in the word array. You can apply your own calculation e.g. the occurrences of a term and return this factor (<1 means relevance is lowered, >1 means relevance is increased). Note: this feature currently works with the tokenizer "strict" only. |
+ null | |||
SUPPORT_WORKER | -true, false | ++ Language-specific Options and Encoding: + | +||||
charset |
+
+ Charset Payload + String (key) + |
+ + Provide a custom charset payload or pass one of the keys of built-in charsets. + | +"latin" | +|||
language |
+
+ Language Payload + String (key) + |
+ + Provide a custom language payload or pass in language shorthand flag (ISO-3166) of built-in languages. + | +null | +|||
encode |
+
+ false + "default" + "simple" + "balance" + "advanced" + "extra" + function(str) => [words] + |
+ The encoding type. Choose one of the built-ins or pass a custom encoding function. |
+ "default" | +|||
stemmer |
+
+ false + String + Function + |
+ | false | |||
SUPPORT_ENCODER | -true, false | +filter |
+
+ false + String + Function + |
+ | false | |
SUPPORT_CHARSET | -true, false | +matcher |
+
+ false + String + Function + |
+ | false | +|
+ Additional Options for Document Indexes: + | +||||||
worker |
+ + Boolean + | +Enable/Disable and set count of running worker threads. | +false | |||
SUPPORT_CACHE | -true, false | +document |
+ Document Descriptor | ++ Includes definitions for the document index and storage. + | ||
SUPPORT_ASYNC | -true, false | -Asynchronous Rendering (support Promises) | -||||
SUPPORT_STORE | -true, false | -- | ||||
SUPPORT_SUGGESTION | -true, false | -- | ||||
SUPPORT_SERIALIZE | -true, false | -- | ||||
SUPPORT_DOCUMENT | -true, false | -- | ||||
SUPPORT_TAGS | -true, false | -- | ||||
SUPPORT_PERSISTENT | -true, false | -- | ||||
SUPPORT_KEYSTORE | -true, false | -- | ||||
SUPPORT_COMPRESSION | -true, false | -- | ||||
SUPPORT_RESOLVER | -true, false | -- | ||||
Compiler Flags |
- ||||||
DEBUG | -true, false | -Output debug information to the console (default: false) | -||||
RELEASE |
- custom custom.module bundle bundle.module es5 light compact |
- - | ||||
POLYFILL | -true, false | -Include Polyfills (based on LANGUAGE_OUT) | -||||
PROFILER | -true, false | -Just used for automatic performance tests | -||||
LANGUAGE_OUT |
- ECMASCRIPT3 ECMASCRIPT5 ECMASCRIPT_2015 ECMASCRIPT_2016 ECMASCRIPT_2017 ECMASCRIPT_2018 ECMASCRIPT_2019 ECMASCRIPT_2020 ECMASCRIPT_2021 ECMASCRIPT_2022 ECMASCRIPT_NEXT STABLE |
- Target language | -
Option | +Values | +Description | +Default | +
resolution | ++ Number + | +Sets the scoring resolution for the context (default: 1). | +1 | +
depth |
+
+ false + Number + |
+ Enable/Disable contextual indexing and also set the contextual distance of relevance. Depth is the maximum distance, in words/tokens, at which a term is still considered relevant. | +1 | +
bidirectional | ++ Boolean + | +Sets bidirectional search result. If enabled and the source text contains "red hat", it will be found for queries "red hat" and "hat red". | +true | +
Option | +Values | +Description | +Default | +
id |
+ String | ++ | "id" | +
tag |
+ false String |
+ + | "tag" | +
index |
+ String Array<String> Array<Object> |
+ + | + |
store |
+ Boolean String Array<String> |
+ + | false | +
Option | +Values | +Description | +Default | +
split |
+
+ false + RegExp + String + |
+
+ The rule to split words when using a non-custom tokenizer (built-ins e.g. "forward"). Use a string/char or use a regular expression (default: /[\W_]+/ ).+ |
+ /[\W_]+/ |
+
rtl |
+ + Boolean + | +Enables Right-To-Left encoding. | +false | +
encode |
+ + function(str) => [words] + | +The custom encoding function. | +/lang/latin/default.js | +
Option | +Values | +Description | +
stemmer |
+
+ false + String + Function + |
+ Disable or pass in language shorthand flag (ISO-3166) or a custom object. + |
filter |
+
+ false + String + Function + |
+ Disable or pass in language shorthand flag (ISO-3166) or a custom array. | +
matcher |
+
+ false + String + Function + |
+ Disable or pass in language shorthand flag (ISO-3166) or a custom array. | +
Option | +Values | +Description | +Default | +
limit | +number | +Sets the limit of results. | +100 | +
offset | +number | +Apply offset (skip items). | +0 | +
suggest | +Boolean | +Enables suggestions in results. | +false | +
Option | +Values | +Description | +Default | +
index | +String Array<String> Array<Object> |
+ Sets the document fields which should be searched. When no field is set, all fields will be searched. Custom options per field are also supported. | ++ |
tag | +String Array<String> |
+ Sets the document fields which should be searched. When no field is set, all fields will be searched. Custom options per field are also supported. | +false | +
enrich | +Boolean | +Enrich IDs from the results with the corresponding documents. | +false | +
bool | +"and" "or" |
+ Sets the used logical operator when searching through multiple fields or tags. | +"or" | +
Option | +Description | +Example | +Memory Factor (n = length of word) | +
"strict" | +index whole words | +foobar |
+ * 1 | +
"forward" | +incrementally index words in forward direction | +fo obarfoob ar |
+ * n | +
"reverse" | +incrementally index words in both directions | +foobar fo obar |
+ * 2n - 1 | +
"full" | +index every possible combination | +fooba rf oob ar |
+ * n * (n - 1) | +
Option | +Description | +False-Positives | +Compression | +
false | +Turn off encoding | +no | +0% | +
"default" | +Case-insensitive encoding | +no | +0% | +
"simple" | +Case-insensitive encoding Charset normalizations |
+ no | +~ 3% | +
"balance" | +Case-insensitive encoding Charset normalizations Literal transformations |
+ no | +~ 30% | +
"advanced" | +Case-insensitive encoding Charset normalizations Literal transformations Phonetic normalizations |
+ no | +~ 40% | +
"extra" | +Case-insensitive encoding Charset normalizations Literal transformations Phonetic normalizations Soundex transformations |
+ yes | +~ 65% | +
function() | +Pass custom encoding via function(string):[words] | ++ | + |
+
+
Field | +Category | +Description | +
encode | +charset | +The encoder function. Has to return an array of separated words (or an empty string). | +
rtl | +charset | +A boolean property which indicates right-to-left encoding. | +
filter | +language | +Filters are also known as "stopwords"; they completely exclude words from being indexed. | +
stemmer | +language | +Stemmer removes word endings and is a kind of "partial normalization". A word ending is only matched when the word is longer than the matched partial. | +
matcher | +language | +Matcher replaces all occurrences of a given string regardless of its position and is also a kind of "partial normalization". | +
+
+
Encoder: | +LatinExact |
+ LatinDefault |
+ LatinSimple |
+ LatinBalance |
+ LatinAdvanced |
+ LatinExtra |
+ LatinSoundex |
+
---|---|---|---|---|---|---|---|
Index Size | +3.1 Mb | +1.9 Mb | +1.8 Mb | +1.7 Mb | +1.6 Mb | +1.1 Mb | +0.7 Mb | +
Struldbrugs | +✓ | +✓ | +✓ | +✓ | +✓ | +✓ | +✓ | +
struldbrugs | ++ | ✓ | +✓ | +✓ | +✓ | +✓ | +✓ | +
strũldbrųĝgs | ++ | + | ✓ | +✓ | +✓ | +✓ | +✓ | +
strultbrooks | ++ | + | + | ✓ | +✓ | +✓ | +✓ | +
shtruhldbrohkz | ++ | + | + | + | ✓ | +✓ | +✓ | +
zdroltbrykz | ++ | + | + | + | + | ✓ | +✓ | +
struhlbrogger | ++ | + | + | + | + | + | ✓ | +
Store | +Add | +Search 1 | +Search N | +Replace | +Remove | +Not Found | +Scaling | +
---|---|---|---|---|---|---|---|
+ | terms per sec | +terms per sec | +terms per sec | +terms per sec | +terms per sec | +terms per sec | ++ |
IndexedDB | +123,298 | +83,823 | +62,370 | +57,410 | +171,053 | +425,744 | +No | +
Redis | +1,566,091 | +201,534 | +859,463 | +117,013 | +129,595 | +875,526 | +Yes | +
Sqlite | +269,812 | +29,627 | +129,735 | +174,445 | +1,406,553 | +122,566 | +No | +
Postgres | +354,894 | +24,329 | +76,189 | +324,546 | +3,702,647 | +50,305 | +Yes | +
MongoDB | +515,938 | +19,684 | +81,558 | +243,353 | +485,192 | +67,751 | +Yes | +
Clickhouse | +1,436,992 | +11,507 | +22,196 | +931,026 | +3,276,847 | +16,644 | +Yes | +
Encoder: | +LatinExact |
+ LatinDefault |
+ LatinSimple |
+ LatinBalance |
+ LatinAdvanced |
+ LatinExtra |
+ LatinSoundex |
+
---|---|---|---|---|---|---|---|
Index Size | +3.1 Mb | +1.9 Mb | +1.8 Mb | +1.7 Mb | +1.6 Mb | +1.1 Mb | +0.7 Mb | +
Struldbrugs | +✓ | +✓ | +✓ | +✓ | +✓ | +✓ | +✓ | +
struldbrugs | ++ | ✓ | +✓ | +✓ | +✓ | +✓ | +✓ | +
strũldbrųĝgs | ++ | + | ✓ | +✓ | +✓ | +✓ | +✓ | +
strultbrooks | ++ | + | + | ✓ | +✓ | +✓ | +✓ | +
shtruhldbrohkz | ++ | + | + | + | ✓ | +✓ | +✓ | +
zdroltbrykz | ++ | + | + | + | + | ✓ | +✓ | +
struhlbrogger | ++ | + | + | + | + | + | ✓ | +
Flag | +Values | +Info | +
Feature Flags |
+ ||
SUPPORT_WORKER | +true, false | ++ |
SUPPORT_ENCODER | +true, false | ++ |
SUPPORT_CHARSET | +true, false | ++ |
SUPPORT_CACHE | +true, false | ++ |
SUPPORT_ASYNC | +true, false | +Asynchronous Rendering (support Promises) | +
SUPPORT_STORE | +true, false | ++ |
SUPPORT_SUGGESTION | +true, false | ++ |
SUPPORT_SERIALIZE | +true, false | ++ |
SUPPORT_DOCUMENT | +true, false | ++ |
SUPPORT_TAGS | +true, false | ++ |
SUPPORT_PERSISTENT | +true, false | ++ |
SUPPORT_KEYSTORE | +true, false | ++ |
SUPPORT_COMPRESSION | +true, false | ++ |
SUPPORT_RESOLVER | +true, false | ++ |
Compiler Flags |
+ ||
DEBUG | +true, false | +Output debug information to the console (default: false) | +
RELEASE |
+ custom custom.module bundle bundle.module es5 light compact |
+ + |
POLYFILL | +true, false | +Include Polyfills (based on LANGUAGE_OUT) | +
PROFILER | +true, false | +Just used for automatic performance tests | +
LANGUAGE_OUT |
+ ECMASCRIPT3 ECMASCRIPT5 ECMASCRIPT_2015 ECMASCRIPT_2016 ECMASCRIPT_2017 ECMASCRIPT_2018 ECMASCRIPT_2019 ECMASCRIPT_2020 ECMASCRIPT_2021 ECMASCRIPT_2022 ECMASCRIPT_NEXT STABLE |
+ Target language | +
Flag | +Values | +Info | +
Feature Flags |
+ ||
SUPPORT_WORKER | +true, false | ++ |
SUPPORT_ENCODER | +true, false | ++ |
SUPPORT_CHARSET | +true, false | ++ |
SUPPORT_CACHE | +true, false | ++ |
SUPPORT_ASYNC | +true, false | +Asynchronous Rendering (support Promises) | +
SUPPORT_STORE | +true, false | ++ |
SUPPORT_SUGGESTION | +true, false | ++ |
SUPPORT_SERIALIZE | +true, false | ++ |
SUPPORT_DOCUMENT | +true, false | ++ |
SUPPORT_TAGS | +true, false | ++ |
SUPPORT_PERSISTENT | +true, false | ++ |
SUPPORT_KEYSTORE | +true, false | ++ |
SUPPORT_COMPRESSION | +true, false | ++ |
SUPPORT_RESOLVER | +true, false | ++ |
Compiler Flags |
+ ||
DEBUG | +true, false | +Output debug information to the console (default: false) | +
RELEASE |
+ custom custom.module bundle bundle.module es5 light compact |
+ + |
POLYFILL | +true, false | +Include Polyfills (based on LANGUAGE_OUT) | +
PROFILER | +true, false | +Just used for automatic performance tests | +
LANGUAGE_OUT |
+ ECMASCRIPT3 ECMASCRIPT5 ECMASCRIPT_2015 ECMASCRIPT_2016 ECMASCRIPT_2017 ECMASCRIPT_2018 ECMASCRIPT_2019 ECMASCRIPT_2020 ECMASCRIPT_2021 ECMASCRIPT_2022 ECMASCRIPT_NEXT STABLE |
+ Target language | +
Encoder: | +LatinExact |
+ LatinDefault |
+ LatinSimple |
+ LatinBalance |
+ LatinAdvanced |
+ LatinExtra |
+ LatinSoundex |
+
---|---|---|---|---|---|---|---|
Index Size | +3.1 Mb | +1.9 Mb | +1.8 Mb | +1.7 Mb | +1.6 Mb | +1.1 Mb | +0.7 Mb | +
Struldbrugs | +✓ | +✓ | +✓ | +✓ | +✓ | +✓ | +✓ | +
struldbrugs | ++ | ✓ | +✓ | +✓ | +✓ | +✓ | +✓ | +
strũldbrųĝgs | ++ | + | ✓ | +✓ | +✓ | +✓ | +✓ | +
strultbrooks | ++ | + | + | ✓ | +✓ | +✓ | +✓ | +
shtruhldbrohkz | ++ | + | + | + | ✓ | +✓ | +✓ | +
zdroltbrykz | ++ | + | + | + | + | ✓ | +✓ | +
struhlbrogger | ++ | + | + | + | + | + | ✓ | +
Store | +Add | +Search 1 | +Search N | +Replace | +Remove | +Not Found | +Scaling | +
---|---|---|---|---|---|---|---|
+ | terms per sec | +terms per sec | +terms per sec | +terms per sec | +terms per sec | +terms per sec | ++ |
IndexedDB | +123,298 | +83,823 | +62,370 | +57,410 | +171,053 | +425,744 | +No | +
Redis | +1,566,091 | +201,534 | +859,463 | +117,013 | +129,595 | +875,526 | +Yes | +
Sqlite | +269,812 | +29,627 | +129,735 | +174,445 | +1,406,553 | +122,566 | +No | +
Postgres | +354,894 | +24,329 | +76,189 | +324,546 | +3,702,647 | +50,305 | +Yes | +
MongoDB | +515,938 | +19,684 | +81,558 | +243,353 | +485,192 | +67,751 | +Yes | +
Clickhouse | +1,436,992 | +11,507 | +22,196 | +931,026 | +3,276,847 | +16,644 | +Yes | +