From 2b1771fd6d2dc64b51fc2f2812557b24d70c3d16 Mon Sep 17 00:00:00 2001 From: Thomas Wilkerling Date: Thu, 27 Mar 2025 21:04:06 +0100 Subject: [PATCH] update readme part 1 of 2 --- README.md | 3251 +++++++--------------------------------- doc/async.md | 49 +- doc/cache.md | 17 + doc/document-search.md | 733 +++++++++ doc/encoder.md | 688 ++++++++- doc/export-import.md | 91 ++ doc/options.md | 404 +++++ doc/persistent.md | 41 + doc/worker.md | 146 ++ index.d.ts | 9 +- package-lock.json | 4 +- src/document.js | 11 +- 12 files changed, 2745 insertions(+), 2699 deletions(-) create mode 100644 doc/cache.md create mode 100644 doc/options.md diff --git a/README.md b/README.md index 55d64d9..b89a6c3 100644 --- a/README.md +++ b/README.md @@ -13,20 +13,36 @@ FlexSearch v0.8: [Overview and Migration Guide](doc/0.8.0.md) -Basic Start  •  -API Reference  •  +[Basic Start](#load-library)  •  +[API Reference](#api-overview)  •  Encoder  •  Document Search  •  Persistent Indexes  •  Using Worker  •  Tag Search  •  Resolver  •  -Customization  •  Changelog -## Support this Project + + +## Please Support this Project + +FlexSearch has been helping developers around the world build powerful, efficient search functionalities for years. Maintaining and improving the library requires significant time and resources. If you’ve found this project valuable and you're interested in supporting the project, please consider donating. Thanks a lot for your continued support! Donate using Open Collective Donate using Github Sponsors @@ -45,15 +61,11 @@ You can help me by making a personal donation to keep this project alive and als

- -
FlexSearch performs queries up to 1,000,000 times faster compared to other libraries, while also providing powerful search capabilities like multi-field search (document search), phonetic transformations, partial matching, tag-search or suggestions. Bigger workloads are scalable through workers, which perform any updates or queries on the index in parallel through dedicated balanced threads.
-The latest generation v0.8 introduce Persistent Indexes, well optimized for scaling of large datasets and running in parallel. All available features was natively ported right into the database engine of your choice.
+The latest generation v0.8 introduces [Persistent Indexes](doc/persistent.md), well optimized for scaling large datasets and running in parallel. All available features were natively ported right into the database engine of your choice.

FlexSearch was nominated by GitNation for the "Best Technology of the Year".

@@ -70,18 +82,211 @@ Supported Database:
- [MongoDB](doc/persistent-mongodb.md)
- [Clickhouse](doc/persistent-clickhouse.md)

Supported Charsets:
- Latin
- Chinese, Korean, Japanese (CJK)
- Hindi
- Arabic
- Cyrillic
- Greek and Coptic
- Hebrew

Common Code Examples:

- Node.js: [Module (ESM)](example/nodejs-esm)
- Node.js: [CommonJS](example/nodejs-commonjs)
- Browser: [Module (ESM)](example/browser-module)
- Browser: [Legacy Script](example/browser-legacy)

Demos:

- Auto-Complete

-Library Comparison:
+Benchmarks:

- Performance Benchmark
-- Scoring Benchmark
+- Matching Benchmark
Latest Benchmark Results

The benchmark was measured in terms per second; higher values are better (except for the test "Memory").
The memory value refers to the amount of memory which was additionally allocated during search.

Benchmarked libraries, in the order listed (result columns: Memory, Query: Single, Query: Multi, Query: Large, Query: Not Found): flexsearch, jsii, wade, js-search, minisearch, orama, elasticlunr, lunr, ufuzzy, bm25, fuzzysearch, fuse.

Run Comparison: Performance Benchmark "Gulliver's Travels"
Extern Projects & Plugins:

- React: https://github.com/angeloashmore/react-use-flexsearch
- Vue: https://github.com/Noction/vue-use-flexsearch
- Gatsby: https://www.gatsbyjs.org/packages/gatsby-plugin-flexsearch/

## Table of contents

> [!TIP]
> You just need to spend 5 minutes to improve your results significantly by understanding these 3 elementary things about FlexSearch: [Tokenizer](#tokenize), [Encoder](#encoder) and [Suggestions](#suggestion)

- [Load Library (Node.js, ESM, Legacy Browser)](#load-library)
  - [Non-Module Bundles (ES5 Legacy)](#non-module-bundles-es5-legacy)
  - [Module (ESM)](#module-esm)
  - [Node.js](#nodejs)
- [Basic Usage and Variants](#basic-usage-and-variants)
- [API Overview](#api-overview)
- [Options](doc/options.md)
  - [Index Options](doc/options.md)
  - [Document Options](doc/options.md)
  - [Worker Options](doc/options.md)
  - [Persistent Options](doc/options.md)
  - [Encoder Options](doc/options.md)
  - [Resolver Options](doc/options.md)
- [Context Search](#context-search)
- [Document Search (Multi-Field Search)](doc/document-search.md)
- [Multi-Tag Search](doc/document-search.md)
- [Phonetic Search (Fuzzy Search)](doc/fuzzy-search.md)
- [Tokenizer (Partial Search)](#tokenizer-partial-match)
- [Encoder](doc/encoder.md)
  - Universal Charset Collection
  - Latin Charset Encoder Presets
  - Language Specific Preset
  - [Custom Encoder](doc/encoder.md#custom-encoder)
- [Non-Blocking Runtime Balancer (Async)](doc/async.md)
- [Worker Indexes](doc/worker.md)
- [Resolver (Complex Queries)](doc/resolver.md)
  - Boolean Operations (and, or, xor, not)
  - Boost
  - Limit / Offset
  - Resolve
- [Export / Import Indexes](doc/export-import.md)
  - [Fast-Boot Serialization](doc/export-import.md#fast-boot-serialization-for-server-side-rendering-php-python-ruby-rust-java-go-nodejs-)
- [Persistent Indexes](doc/persistent.md)
  - [IndexedDB (Browser)](doc/persistent-indexeddb.md)
  - [Postgres](doc/persistent-postgres.md)
  - [Redis](doc/persistent-redis.md)
  - [MongoDB](doc/persistent-mongodb.md)
  - [SQLite](doc/persistent-sqlite.md)
  - [Clickhouse](doc/persistent-clickhouse.md)
- [Result Highlighting](doc/result-highlighting.md)
- [Common Code Examples (Browser, Node.js)](#common-code-examples)

## Load Library (Node.js, ESM, Legacy Browser)

#### Update item from an index
@@ -1490,891 +1238,10 @@ index.update(0, "Max Miller");
index.remove(0);
```

-#### Add custom tokenizer
-
-> A tokenizer split words/terms into components or partials.
-
-Define a private custom tokenizer during creation/initialization:
-```js
-var index = new FlexSearch({
-
-    tokenize: function(str){
-
-        return str.split(/\s-\//g);
-    }
-});
-```
-
-> The tokenizer function gets a string as a parameter and has to return an array of strings representing a word or term. In some languages every char is a term and also not separated via whitespaces.
-
-#### Add language-specific stemmer and/or filter
-
-> __Stemmer:__ several linguistic mutations of the same word (e.g. "run" and "running")
-
-> __Filter:__ a blacklist of words to be filtered out from indexing at all (e.g. 
"and", "to" or "be") - -Assign a private custom stemmer or filter during creation/initialization: -```js -var index = new FlexSearch({ - - stemmer: { - - // object {key: replacement} - "ational": "ate", - "tional": "tion", - "enci": "ence", - "ing": "" - }, - filter: [ - - // array blacklist - "in", - "into", - "is", - "isn't", - "it", - "it's" - ] -}); -``` - -Using a custom filter, e.g.: -```js -var index = new FlexSearch({ - - filter: function(value){ - - // just add values with length > 1 to the index - - return value.length > 1; - } -}); -``` - -Or assign stemmer/filters globally to a language: - -> Stemmer are passed as a object (key-value-pair), filter as an array. - -```js -FlexSearch.registerLanguage("us", { - - stemmer: { /* ... */ }, - filter: [ /* ... */ ] -}); -``` - -Or use some pre-defined stemmer or filter of your preferred languages: -```html - - - - - - -... -``` - -Now you can assign built-in stemmer during creation/initialization: -```js -var index_en = new FlexSearch.Index({ - language: "en" -}); - -var index_de = new FlexSearch.Index({ - language: "de" -}); -``` - -In Node.js all built-in language packs files are available: - -```js -const { Index } = require("flexsearch"); - -var index_en = new Index({ - language: "en" -}); -``` - - -### Right-To-Left Support - -> Set the tokenizer at least to "reverse" or "full" when using RTL. - -Just set the field "rtl" to _true_ and use a compatible tokenizer: - -```js -var index = new Index({ - encode: str => str.toLowerCase().split(/[^a-z]+/), - tokenize: "reverse", - rtl: true -}); -``` - - -### CJK Word Break (Chinese, Japanese, Korean) - -Set a custom tokenizer which fits your needs, e.g.: - -```js -var index = FlexSearch.create({ - encode: str => str.replace(/[\x00-\x7F]/g, "").split("") -}); -``` - -You can also pass a custom encoder function to apply some linguistic transformations. - -```js -index.add(0, "一个单词"); -``` - -```js -var results = index.search("单词"); -``` - -## Index Documents (Field-Search) +## Document Search (Field-Search) -### The Document Descriptor - -Assuming our document has a data structure like this: - -```json -{ - "id": 0, - "content": "some text" -} -``` - -Old syntax FlexSearch v0.6.3 (___not supported anymore!___): - -```js -const index = new Document({ - doc: { - id: "id", - field: ["content"] - } -}); -``` - -> The document descriptor has slightly changed, there is no `field` branch anymore, instead just apply one level higher, so `key` becomes a main member of options. - -For the new syntax the field "doc" was renamed to `document` and the field "field" was renamed to `index`: - -```js -const index = new Document({ - document: { - id: "id", - index: ["content"] - } -}); - -index.add({ - id: 0, - content: "some text" -}); -``` - -The field `id` describes where the ID or unique key lives inside your documents. The default key gets the value `id` by default when not passed, so you can shorten the example from above to: - -```js -const index = new Document({ - document: { - index: ["content"] - } -}); -``` - -The member `index` has a list of fields which you want to be indexed from your documents. When just selecting one field, then you can pass a string. 
When also using default key `id` then this shortens to just: - -```js -const index = new Document({ document: "content" }); -index.add({ id: 0, content: "some text" }); -``` - -Assuming you have several fields, you can add multiple fields to the index: - -```js -var docs = [{ - id: 0, - title: "Title A", - content: "Body A" -},{ - id: 1, - title: "Title B", - content: "Body B" -}]; -``` - -```js -const index = new Document({ - id: "id", - index: ["title", "content"] -}); -``` - -You can pass custom options for each field: - -```js -const index = new Document({ - id: "id", - index: [{ - field: "title", - tokenize: "forward", - optimize: true, - resolution: 9 - },{ - field: "content", - tokenize: "strict", - optimize: true, - resolution: 5, - minlength: 3, - context: { - depth: 1, - resolution: 3 - } - }] -}); -``` - -Field options gets inherited when also global options was passed, e.g.: - -```js -const index = new Document({ - tokenize: "strict", - optimize: true, - resolution: 9, - document: { - id: "id", - index:[{ - field: "title", - tokenize: "forward" - },{ - field: "content", - minlength: 3, - context: { - depth: 1, - resolution: 3 - } - }] - } -}); -``` - -Note: The context options from the field "content" also gets inherited by the corresponding field options, whereas this field options was inherited by the global option. - -### Nested Data Fields (Complex Objects) - -Assume the document array looks more complex (has nested branches etc.), e.g.: - -```json -{ - "record": { - "id": 0, - "title": "some title", - "content": { - "header": "some text", - "footer": "some text" - } - } -} -``` - -Then use the colon separated notation `root:child:child` to define hierarchy within the document descriptor: - -```js -const index = new Document({ - document: { - id: "record:id", - index: [ - "record:title", - "record:content:header", - "record:content:footer" - ] - } -}); -``` -> Just add fields you want to query against. Do not add fields to the index, you just need in the result (but did not query against). For this purpose you can store documents independently of its index (read below). - -When you want to query through a field you have to pass the exact key of the field you have defined in the `doc` as a field name (with colon syntax): - -```js -index.search(query, { - index: [ - "record:title", - "record:content:header", - "record:content:footer" - ] -}); -``` - -Same as: - -```js -index.search(query, [ - "record:title", - "record:content:header", - "record:content:footer" -]); -``` - -Using field-specific options: - -```js -index.search([{ - field: "record:title", - query: "some query", - limit: 100, - suggest: true -},{ - field: "record:title", - query: "some other query", - limit: 100, - suggest: true -}]); -``` - -You can perform a search through the same field with different queries. - -> When passing field-specific options you need to provide the full configuration for each field. They get not inherited like the document descriptor. - -### Complex Documents - -You need to follow 2 rules for your documents: - -1. The document cannot start with an Array at the root index. This will introduce sequential data and isn't supported yet. See below for a workaround for such data. - -```js -[ // <-- not allowed as document start! - { - "id": 0, - "title": "title" - } -] -``` - -2. The id can't be nested inside an array (also none of the parent fields can't be an array). This will introduce sequential data and isn't supported yet. See below for a workaround for such data. 
- -```js -{ - "records": [ // <-- not allowed when ID or tag lives inside! - { - "id": 0, - "title": "title" - } - ] -} -``` - -Here an example for a supported complex document: - -```json -{ - "meta": { - "tag": "cat", - "id": 0 - }, - "contents": [ - { - "body": { - "title": "some title", - "footer": "some text" - }, - "keywords": ["some", "key", "words"] - }, - { - "body": { - "title": "some title", - "footer": "some text" - }, - "keywords": ["some", "key", "words"] - } - ] -} -``` - -The corresponding document descriptor (when all fields should be indexed) looks like: - -```js -const index = new Document({ - document: { - id: "meta:id", - tag: "meta:tag", - index: [ - "contents[]:body:title", - "contents[]:body:footer", - "contents[]:keywords" - ] - } -}); -``` - -Again, when searching you have to use the same colon-separated-string from your field definition. - -```js -index.search(query, { - index: "contents[]:body:title" -}); -``` - -### Not Supported Documents (Sequential Data) - -This example breaks both rules from above: - -```js -[ // <-- not allowed as document start! - { - "tag": "cat", - "records": [ // <-- not allowed when ID or tag lives inside! - { - "id": 0, - "body": { - "title": "some title", - "footer": "some text" - }, - "keywords": ["some", "key", "words"] - }, - { - "id": 1, - "body": { - "title": "some title", - "footer": "some text" - }, - "keywords": ["some", "key", "words"] - } - ] - } -] -``` - -You need to apply some kind of structure normalization. - -A workaround to such a data structure looks like this: - -```js -const index = new Document({ - document: { - id: "record:id", - tag: "tag", - index: [ - "record:body:title", - "record:body:footer", - "record:body:keywords" - ] - } -}); - -function add(sequential_data){ - - for(let x = 0, data; x < sequential_data.length; x++){ - - data = sequential_data[x]; - - for(let y = 0, record; y < data.records.length; y++){ - - record = data.records[y]; - - index.add({ - id: record.id, - tag: data.tag, - record: record - }); - } - } -} - -// now just use add() helper method as usual: - -add([{ - // sequential structured data - // take the data example above -}]); -``` - -You can skip the first loop when your document data has just one index as the outer array. - -### Add/Update/Remove Documents to/from the Index - -Add a document to the index: - -```js -index.add({ - id: 0, - title: "Foo", - content: "Bar" - }); -``` - -Update index with a single object or an array of objects: - -```js -index.update({ - data:{ - id: 0, - title: "Foo", - body: { - content: "Bar" - } - } -}); -``` - -Remove a single object or an array of objects from the index: - -```js -index.remove(docs); -``` - -When the id is known, you can also simply remove by (faster): - -```js -index.remove(id); -``` - -### Join / Append Arrays - -On the complex example above, the field `keywords` is an array but here the markup did not have brackets like `keywords[]`. That will also detect the array but instead of appending each entry to a new context, the array will be joined into on large string and added to the index. - -The difference of both kinds of adding array contents is the relevance when searching. When adding each item of an array via `append()` to its own context by using the syntax `field[]`, then the relevance of the last entry concurrent with the first entry. When you left the brackets in the notation, it will join the array to one whitespace-separated string. 
Here the first entry has the highest relevance, whereas the last entry has the lowest relevance. - -So assuming the keyword from the example above are pre-sorted by relevance to its popularity, then you want to keep this order (information of relevance). For this purpose do not add brackets to the notation. Otherwise, it would take the entries in a new scoring context (the old order is getting lost). - -Also you can left bracket notation for better performance and smaller memory footprint. Use it when you did not need the granularity of relevance by the entries. - -### Field-Search - -Search through all fields: - -```js -index.search(query); -``` - -Search through a specific field: - -```js -index.search(query, { index: "title" }); -``` - -Search through a given set of fields: - -```js -index.search(query, { index: ["title", "content"] }); -``` - -Same as: - -```js -index.search(query, ["title", "content"]); -``` - -Pass custom modifiers and queries to each field: - -```js -index.search([{ - field: "content", - query: "some query", - limit: 100, - suggest: true -},{ - field: "content", - query: "some other query", - limit: 100, - suggest: true -}]); -``` - -You can perform a search through the same field with different queries. - -See all available field-search options. - -### The Result Set - -Schema of the result-set: - -> `fields[] => { field, result[] => { document }}` - -The first index is an array of fields the query was applied to. Each of this field has a record (object) with 2 properties "field" and "result". The "result" is also an array and includes the result for this specific field. The result could be an array of IDs or as enriched with stored document data. - -A non-enriched result set now looks like: - -```js -[{ - field: "title", - result: [0, 1, 2] -},{ - field: "content", - result: [3, 4, 5] -}] -``` - -An enriched result set now looks like: - -```js -[{ - field: "title", - result: [ - { id: 0, doc: { /* document */ }}, - { id: 1, doc: { /* document */ }}, - { id: 2, doc: { /* document */ }} - ] -},{ - field: "content", - result: [ - { id: 3, doc: { /* document */ }}, - { id: 4, doc: { /* document */ }}, - { id: 5, doc: { /* document */ }} - ] -}] -``` - -When using `pluck` instead of "field" you can explicitly select just one field and get back a flat representation: - -```js -index.search(query, { pluck: "title", enrich: true }); -``` - -```js -[ - { id: 0, doc: { /* document */ }}, - { id: 1, doc: { /* document */ }}, - { id: 2, doc: { /* document */ }} -] -``` - -This result set is a replacement of "boolean search". Instead of applying your bool logic to a nested object, you can apply your logic by yourself on top of the result-set dynamically. This opens hugely capabilities on how you process the results. Therefore, the results from the fields aren't squashed into one result anymore. That keeps some important information, like the name of the field as well as the relevance of each field results which didn't get mixed anymore. - -> A field search will apply a query with the boolean "or" logic by default. Each field has its own result to the given query. - -There is one situation where the `bool` property is being still supported. When you like to switch the default "or" logic from the field search into "and", e.g.: - -```js -index.search(query, { - index: ["title", "content"], - bool: "and" -}); -``` - -You will just get results which contains the query in both fields. That's it. 
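Since each field keeps its own result, further boolean logic can also be applied manually on top of the result set. A minimal sketch of emulating "and" across two fields (the helper `intersectFields` is hypothetical and not part of FlexSearch):

```js
// hypothetical helper: intersect the ID arrays of two fields
// from a non-enriched result set to emulate boolean "and"
function intersectFields(response, fieldA, fieldB){
    const a = response.find(res => res.field === fieldA);
    const b = response.find(res => res.field === fieldB);
    if(!a || !b) return [];
    const ids = new Set(a.result);
    return b.result.filter(id => ids.has(id));
}

const response = index.search("some query", ["title", "content"]);
const matches_in_both = intersectFields(response, "title", "content");
```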
- -### Tags - -Like the `key` for the ID just define the path to the tag: - -```js -const index = new Document({ - document: { - id: "id", - tag: "tag", - index: "content" - } -}); -``` - -```js -index.add({ - id: 0, - tag: "cat", - content: "Some content ..." -}); -``` - -Your data also can have multiple tags as an array: - -```js -index.add({ - id: 1, - tag: ["animal", "dog"], - content: "Some content ..." -}); -``` - -You can perform a tag-specific search by: - -```js -index.search(query, { - index: "content", - tag: "animal" -}); -``` - -This just gives you result which was tagged with the given tag. - -Use multiple tags when searching: - -```js -index.search(query, { - index: "content", - tag: ["cat", "dog"] -}); -``` - -This gives you result which are tagged with one of the given tag. - -> Multiple tags will apply as the boolean "or" by default. It just needs one of the tags to be existing. - -This is another situation where the `bool` property is still supported. When you like to switch the default "or" logic from the tag search into "and", e.g.: - -```js -index.search(query, { - index: "content", - tag: ["dog", "animal"], - bool: "and" -}); -``` - -You will just get results which contains both tags (in this example there is just one records which has the tag "dog" and "animal"). - -### Tag Search - -You can also fetch results from one or more tags when no query was passed: - -```js -index.search({ tag: ["cat", "dog"] }); -``` - -In this case the result-set looks like: - -```js -[{ - tag: "cat", - result: [ /* all cats */ ] -},{ - tag: "dog", - result: [ /* all dogs */ ] -}] -``` - -### Limit & Offset - -> By default, every query is limited to 100 entries. Unbounded queries leads into issues. You need to set the limit as an option to adjust the size. - -You can set the limit and the offset for each query: - -```js -index.search(query, { limit: 20, offset: 100 }); -``` - -> You cannot pre-count the size of the result-set. That's a limit by the design of FlexSearch. When you really need a count of all results you are able to page through, then just assign a high enough limit and get back all results and apply your paging offset manually (this works also on server-side). FlexSearch is fast enough that this isn't an issue. - -## Document Store - -Only a document index can have a store. You can use a document index instead of a flat index to get this functionality also when only storing ID-content-pairs. - -You can define independently which fields should be indexed and which fields should be stored. This way you can index fields which should not be included in the search result. - -> Do not use a store when: 1. an array of IDs as the result is good enough, or 2. you already have the contents/documents stored elsewhere (outside the index). - -> When the `store` attribute was set, you have to include all fields which should be stored explicitly (acts like a whitelist). - -> When the `store` attribute was not set, the original document is stored as a fallback. 
- -This will add the whole original content to the store: - -```js -const index = new Document({ - document: { - index: "content", - store: true - } -}); - -index.add({ id: 0, content: "some text" }); -``` - -### Access documents from internal store - -You can get indexed documents from the store: - -```js -var data = index.get(1); -``` - -You can update/change store contents directly without changing the index by: - -```js -index.set(1, data); -``` - -To update the store and also update the index then just use `index.update`, `index.add` or `index.append`. - -When you perform a query, weather it is a document index or a flat index, then you will always get back an array of IDs. - -Optionally you can enrich the query results automatically with stored contents by: - -```js -index.search(query, { enrich: true }); -``` - -Your results look now like: - -```js -[{ - id: 0, - doc: { /* content from store */ } -},{ - id: 1, - doc: { /* content from store */ } -}] -``` - -### Configure Storage (Recommended) - -This will add just specific fields from a document to the store (the ID isn't necessary to keep in store): - -```js -const index = new Document({ - document: { - index: "content", - store: ["author", "email"] - } -}); - -index.add(id, content); -``` - -You can configure independently what should being indexed and what should being stored. It is highly recommended to make use of this whenever you can. - -Here a useful example of configuring doc and store: - -```js -const index = new Document({ - document: { - index: "content", - store: ["author", "email"] - } -}); - -index.add({ - id: 0, - author: "Jon Doe", - email: "john@mail.com", - content: "Some content for the index ..." -}); -``` - -You can query through the contents and will get back the stored values instead: - -```js -index.search("some content", { enrich: true }); -``` - -Your results are now looking like: - -```js -[{ - field: "content", - result: [{ - id: 0, - doc: { - author: "Jon Doe", - email: "john@mail.com", - } - }] -}] -``` - -Both field "author" and "email" are not indexed. +[Read here](doc/document-search.md) ### Chaining @@ -2382,36 +1249,28 @@ Both field "author" and "email" are not indexed. Simply chain methods like: ```js -var index = FlexSearch.create() - .addMatcher({'â': 'a'}) - .add(0, 'foo') - .add(1, 'bar'); +var index = Index.create().addMatcher({'â': 'a'}).add(0, 'foo').add(1, 'bar'); ``` ```js index.remove(0).update(1, 'foo').add(2, 'foobar'); ``` - -## Contextual Search + +## Context Search -> __Note:__ This feature is disabled by default because of its extended memory usage. Read here get more information about and how to enable. - -FlexSearch introduce a new scoring mechanism called __Contextual Search__ which was invented by Thomas Wilkerling, the author of this library. A Contextual Search incredibly boost up queries to a complete new level but also requires some additional memory (depending on ___depth___). -The basic idea of this concept is to limit relevance by its context instead of calculating relevance through the whole distance of its corresponding document. -This way contextual search also improves the results of relevance-based queries on a large amount of text data. +The basic idea of this concept is to limit relevance by its context instead of calculating relevance through the whole distance of its corresponding document. 
The context acts like a bidirectional moving window of 2 pointers (terms), which initially can have a maximum distance of the value passed via the option setting `depth` and dynamically grows during search when the query did not match any results.
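As a worked sketch of this window (exact matching also depends on resolution and encoding, so treat this as illustrative only):

```js
// with depth 2, two terms share a context when they are
// at most 2 positions apart within the indexed content
const index = new FlexSearch.Index({
    context: { depth: 2, bidirectional: true }
});

index.add(1, "zero one two three four five");

// "one" and "three" lie 2 positions apart and fall into the
// context window, so this query can be resolved contextually
index.search("one three");
```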

-## Enable Contextual Scoring +### Enable Context-Search Create an index and use the default context: ```js var index = new FlexSearch({ - tokenize: "strict", context: true }); @@ -2420,7 +1279,6 @@ var index = new FlexSearch({ Create an index and apply custom options for the context: ```js var index = new FlexSearch({ - tokenize: "strict", context: { resolution: 5, @@ -2434,950 +1292,9 @@ var index = new FlexSearch({ > The contextual index requires additional amount of memory depending on depth. - -### Auto-Balanced Cache (By Popularity) +## Index Memory Allocation -You need to initialize the cache and its limit during the creation of the index: - -```js -const index = new Index({ cache: 100 }); -``` - -```js -const results = index.searchCache(query); -``` - -A common scenario for using a cache is an autocomplete or instant search when typing. - -> When passing a number as a limit the cache automatically balance stored entries related to their popularity. - -> When just using "true" the cache is unbounded and perform actually 2-3 times faster (because the balancer do not have to run). - - -## Worker Parallelism (Browser + Node.js) - -The new worker model from v0.7.0 is divided into "fields" from the document (1 worker = 1 field index). This way the worker becomes able to solve tasks (subtasks) completely. The downside of this paradigm is they might not have been perfect balanced in storing contents (fields may have different length of contents). On the other hand there is no indication that balancing the storage gives any advantage (they all require the same amount in total). - -When using a document index, then just apply the option "worker": -```js -const index = new Document({ - index: ["tag", "name", "title", "text"], - worker: true -}); - -index.add({ - id: 1, tag: "cat", name: "Tom", title: "some", text: "some" -}).add({ - id: 2, tag: "dog", name: "Ben", title: "title", text: "content" -}).add({ - id: 3, tag: "cat", name: "Max", title: "to", text: "to" -}).add({ - id: 4, tag: "dog", name: "Tim", title: "index", text: "index" -}); -``` - -``` -Worker 1: { 1: "cat", 2: "dog", 3: "cat", 4: "dog" } -Worker 2: { 1: "Tom", 2: "Ben", 3: "Max", 4: "Tim" } -Worker 3: { 1: "some", 2: "title", 3: "to", 4: "index" } -Worker 4: { 1: "some", 2: "content", 3: "to", 4: "index" } -``` - -When you perform a field search through all fields then this task is being balanced perfectly through all workers, which can solve their subtasks independently. - -### Worker Index - -Above we have seen that documents will create worker automatically for each field. You can also create a WorkerIndex directly (same like using `Index` instead of `Document`). - -Use as ES6 module: - -```js -import WorkerIndex from "./worker/index.js"; -const index = new WorkerIndex(options); -index.add(1, "some") - .add(2, "content") - .add(3, "to") - .add(4, "index"); -``` - -Or when bundled version was used instead: - -```js -var index = new FlexSearch.Worker(options); -index.add(1, "some") - .add(2, "content") - .add(3, "to") - .add(4, "index"); -``` - -Such a WorkerIndex works pretty much the same as a created instance of `Index`. - -> A WorkerIndex only support the `async` variant of all methods. That means when you call `index.search()` on a WorkerIndex this will perform also in async the same way as `index.searchAsync()` will do. 
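For illustration, a minimal sketch of this async-only behavior, using the bundled `FlexSearch.Worker` constructor shown above:

```js
const index = new FlexSearch.Worker({ tokenize: "forward" });

(async function(){
    // add() on a worker index is treated like addAsync()
    await index.add(1, "some content to index");
    // search() behaves like searchAsync() and returns a promise
    const results = await index.search("content");
    console.log(results);
})();
```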
- -### Worker Threads (Node.js) - -The worker model for Node.js is based on "worker threads" and works exactly the same way: - -```js -const { Document } = require("flexsearch"); - -const index = new Document({ - index: ["tag", "name", "title", "text"], - worker: true -}); -``` - -Or create a single worker instance for a non-document index: - -```js -const { Worker } = require("flexsearch"); -const index = new Worker({ options }); -``` - -### The Worker Async Model (Best Practices) - -A worker will always perform as async. On a query method call you always should handle the returned promise (e.g. use `await`) or pass a callback function as the last parameter. - -```js -const index = new Document({ - index: ["tag", "name", "title", "text"], - worker: true -}); -``` - -All requests and sub-tasks will run in parallel (prioritize "all tasks completed"): - -```js -index.searchAsync(query, callback); -index.searchAsync(query, callback); -index.searchAsync(query, callback); -``` - -Also (prioritize "all tasks completed"): - -```js -index.searchAsync(query).then(callback); -index.searchAsync(query).then(callback); -index.searchAsync(query).then(callback); -``` - -Or when you have just one callback when all requests are done, simply use `Promise.all()` which also prioritize "all tasks completed": - -```js -Promise.all([ - index.searchAsync(query), - index.searchAsync(query), - index.searchAsync(query) -]).then(callback); -``` - -Inside the callback of `Promise.all()` you will also get an array of results as the first parameter respectively for each query you put into. - -When using `await` you can prioritize the order (prioritize "first task completed") and solve requests one by one and just process the sub-tasks in parallel: - -```js -await index.searchAsync(query); -await index.searchAsync(query); -await index.searchAsync(query); -``` - -Same for `index.add()`, `index.append()`, `index.remove()` or `index.update()`. Here there is a special case which isn't disabled by the library, but you need to keep in mind when using Workers. - -When you call the "synced" version on a worker index: - -```js -index.add(doc); -index.add(doc); -index.add(doc); -// contents aren't indexed yet, -// they just queued on the message channel -``` - -Of course, you can do that but keep in mind that the main thread does not have an additional queue for distributed worker tasks. Running these in a long loop fires content massively to the message channel via `worker.postMessage()` internally. Luckily the browser and Node.js will handle such incoming tasks for you automatically (as long enough free RAM is available). When using the "synced" version on a worker index, the content isn't indexed one line below, because all calls are treated as async by default. - -> When adding/updating/removing large bulks of content to the index (or high frequency), it is recommended to use the async version along with `async/await` to keep a low memory footprint during long processes. - - - -## Export / Import (In-Memory) - -### Node.js - -> Persistent-Indexes and Worker-Indexes don't support Import/Export. 
- -Export an `Index` or `Document-Index` to the folder `/export/`: - -```js -import { promises as fs } from "fs"; - -await index.export(async function(key, data){ - await fs.writeFile("./export/" + key, data, "utf8"); -}); -``` - -Import from folder `/export/` into an `Index` or `Document-Index`: - -```js -const index = new Index({/* keep old config and place it here */}); - -const files = await fs.readdir("./export/"); -for(let i = 0; i < files.length; i++){ - const data = await fs.readFile("./export/" + files[i], "utf8"); - await index.import(files[i], data); -} -``` - -> You'll need to use the same configuration as you used before the export. Any changes on the configuration needs to be re-indexed. - -### Browser - -```js -index.export(function(key, data){ - - // you need to store both the key and the data! - // e.g. use the key for the filename and save your data - - localStorage.setItem(key, data); -}); -``` - -> The size of the export corresponds to the memory consumption of the library. To reduce export size you have to use a configuration which has less memory footprint (use the table at the bottom to get information about configs and its memory allocation). - -When your save routine runs asynchronously you have to use `async/await` or return a promise: - -```js -index.export(function(key, data){ - - return new Promise(function(resolve){ - - // do the saving as async - - resolve(); - }); -}); -``` - -Before you can import data, you need to create your index first. For document indexes provide the same document descriptor you used when export the data. This configuration isn't stored in the export. - -```js -const index = new Index({/* keep old config and place it here */}); -``` - -To import the data just pass a key and data: - -``` -const data = localStorage.getItem(key); -index.import(key, data); -``` - -You need to import every key! Otherwise, your index does not work. You need to store the keys from the export and use this keys for the import (the order of the keys can differ). - -> The feature "fastupdate" is automatically disabled on import. - -This is just for demonstration and is not recommended, because you might have other keys in your localStorage which aren't supported as an import: - -```js -var keys = Object.keys(localStorage); - -for(let i = 0, key, data; i < keys.length; i++){ - key = keys[i] - data = localStorage.getItem(key); - index.import(key, data); -} -``` - -## Encoder - -Search capabilities highly depends on language processing. The old workflow wasn't really practicable. The new Encoder class is a huge improvement and fully replaces the encoding part. Some FlexSearch options was moved to the new `Encoder` instance. - -New Encoding Pipeline: -1. charset normalization -2. custom preparation -3. split into terms (apply includes/excludes) -4. filter (pre-filter) -5. matcher (substitute terms) -6. stemmer (substitute term endings) -7. filter (post-filter) -8. replace chars (mapper) -9. custom regex (replacer) -10. letter deduplication -11. 
apply finalize - -### Example - -```js -const encoder = new Encoder({ - normalize: true, - dedupe: true, - cache: true, - include: { - letter: true, - number: true, - symbol: false, - punctuation: false, - control: false, - char: "@" - } -}); -``` - -You can use an `include` __instead__ of an `exclude` definition: - -```js -const encoder = new Encoder({ - exclude: { - letter: false, - number: false, - symbol: true, - punctuation: true, - control: true - } -}); -``` - -Instead of using `include` or `exclude` you can pass a regular expression to the field `split`: - -```js -const encoder = new Encoder({ - split: /\s+/ -}); -``` - -> The definitions `include` and `exclude` is a replacement for `split`. You can just define one of those 3. - -Adding custom functions to the encoder pipeline: - -```js -const encoder = new Encoder({ - normalize: function(str){ - return str.toLowerCase(); - }, - prepare: function(str){ - return str.replace(/&/g, " and "); - }, - finalize: function(arr){ - return arr.filter(term => term.length > 2); - } -}); -``` - -Assign encoder to an index: - -```js -const index = new Index({ - encoder: encoder -}); -``` - -Define language specific transformations: - -```js -const encoder = new Encoder({ - replacer: [ - /[´`’ʼ]/g, "'" - ], - filter: new Set([ - "and", - ]), - matcher: new Map([ - ["xvi", "16"] - ]), - stemmer: new Map([ - ["ly", ""] - ]), - mapper: new Map([ - ["é", "e"] - ]) -}); -``` - -Or use predefined language and extend it with custom options: - -```js -import EnglishBookPreset from "./lang/en.js"; -const encoder = new Encoder(EnglishBookPreset, { - filter: false -}); -``` - -Equivalent: - -```js -import EnglishBookPreset from "./lang/en.js"; -const encoder = new Encoder(EnglishBookPreset); -encoder.assign({ filter: false }); -``` - -Assign extensions to the encoder instance: - -```js -import LatinEncoderPreset from "./charset/latin/simple.js"; -import EnglishBookPreset from "./lang/en.js"; -// stack definitions to the encoder instance -const encoder = new Encoder() - .assign(LatinEncoderPreset) - .assign(EnglishBookPreset) - // override preset options ... - .assign({ minlength: 3 }); - // assign further presets ... -``` - -> When adding extension to the encoder every previously assigned configuration is still intact, very much like Mixins, also when assigning custom functions. - -Add custom transformations to an existing index: - -```js -import LatinEncoderPreset from "./charset/latin/default.js"; -const encoder = new Encoder(LatinEncoderPreset); -encoder.addReplacer(/[´`’ʼ]/g, "'"); -encoder.addFilter("and"); -encoder.addMatcher("xvi", "16"); -encoder.addStemmer("ly", ""); -encoder.addMapper("é", "e"); -``` - -Shortcut for just assigning one encoder configuration to an index: - -```js -import LatinEncoderPreset from "./charset/latin/default.js"; -const index = new Index({ - encoder: LatinEncoderPreset -}); -``` - -### Custom Encoder - -Since it is very simple to create a custom Encoder, you are welcome to create your own. -e.g. -```js -function customEncoder(content){ - const tokens = []; - // split content into terms/tokens - // apply your changes to each term/token - // you will need to return an Array of terms/tokens - // so just iterate through the input string and - // push tokens to the array - // ... - return tokens; -} - -const index = new Index({ - // set to strict when your tokenization was already done - tokenize: "strict", - encode: customEncoder -}); -``` - -If you get some good results please feel free to share your encoder. 
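As a starting point, here is a hedged sketch of one concrete custom encoder (the normalization rules are arbitrary examples, not a recommended preset):

```js
function customEncoder(content){
    return content
        .toLowerCase()
        // replace everything except latin letters and digits
        .replace(/[^a-z0-9]+/g, " ")
        // split into terms
        .split(" ")
        // drop empty and single-char tokens
        .filter(term => term.length > 1);
}

const index = new Index({
    // set to strict when your tokenization was already done
    tokenize: "strict",
    encode: customEncoder
});

index.add(0, "Some content, to be indexed!");
```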
- -## Languages - -Language-specific definitions are being divided into two groups: - -1. Charset - 1. ___encode___, type: `function(string):string[]` - 2. ___rtl___, type: `boolean` -2. Language - 1. ___matcher___, type: `{string: string}` - 2. ___stemmer___, type: `{string: string}` - 3. ___filter___, type: `string[]` - -The charset contains the encoding logic, the language contains stemmer, stopword filter and matchers. Multiple language definitions can use the same charset encoder. Also this separation let you manage different language definitions for special use cases (e.g. names, cities, dialects/slang, etc.). - -To fully describe a custom language __on the fly__ you need to pass: - -```js -const index = FlexSearch({ - // mandatory: - encode: (content) => [words], - // optionally: - rtl: false, - stemmer: {}, - matcher: {}, - filter: [] -}); -``` - -When passing no parameter it uses the `latin:default` schema by default. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Field | Category | Description |
|---|---|---|
| encode | charset | The encoder function. Has to return an array of separated words (or an empty string). |
| rtl | charset | A boolean property which indicates right-to-left encoding. |
| filter | language | Filters are also known as "stopwords"; they completely exclude words from being indexed. |
| stemmer | language | Stemmers remove word endings and are a kind of "partial normalization". A word ending just matches when the word length is bigger than the matched partial. |
| matcher | language | Matchers replace all occurrences of a given string regardless of its position and are also a kind of "partial normalization". |
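For illustration, the fields from the table combined into one minimal on-the-fly definition (all values are toy examples):

```js
const index = FlexSearch({
    // charset group
    encode: str => str.toLowerCase().split(/[^a-z]+/).filter(t => t.length),
    rtl: false,
    // language group
    filter: ["and", "or", "the"],
    stemmer: { "ing": "", "ly": "" },
    matcher: { "ph": "f" }
});
```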
- -### 1. Language Packs: ES6 Modules - -The most simple way to assign charset/language specific encoding via modules is: - -```js -import charset from "./dist/module/lang/latin/advanced.js"; -import lang from "./dist/module/lang/en.js"; - -const index = FlexSearch({ - charset: charset, - lang: lang -}); -``` - -Just import the __default export__ by each module and assign them accordingly. - -The full qualified example from above is: - -```js -import { encode, rtl } from "./dist/module/lang/latin/advanced.js"; -import { stemmer, filter, matcher } from "./dist/module/lang/en.js"; - -const index = FlexSearch({ - encode: encode, - rtl: rtl, - stemmer: stemmer, - matcher: matcher, - filter: filter -}); -``` - -The example above is the standard interface which is at least exported from each charset/language. - -You can also define the encoder directly and left all other options: - -```js -import simple from "./dist/module/lang/latin/simple.js"; - -const index = FlexSearch({ - encode: simple -}); -``` - -#### Available Latin Encoders - -1. default -2. simple -3. balance -4. advanced -5. extra - -You can assign a charset by passing the charset during initialization, e.g. `charset: "latin"` for the default charset encoder or `charset: "latin:soundex"` for a encoder variant. - -#### Dialect / Slang - -Language definitions (especially matchers) also could be used to normalize dialect and slang of a specific language. - -### 2. Language Packs: ES5 (Language Packs) - -You need to make the charset and/or language definitions available by: - -1. All charset definitions are included in the `flexsearch.bundle.js` build by default, but no language-specific definitions are included -2. You can load packages located in `/dist/lang/` (files refers to languages, folders are charsets) -3. You can make a custom build - -When loading language packs, make sure that the library was loaded before: - -```html - - - -``` - -When using the full "bundle" version the built-in latin encoders are already included and you just have to load the language file: - -```html - - -``` - -Because you loading packs as external packages (non-ES6-modules) you have to initialize them by shortcuts: - -```js -const index = FlexSearch({ - charset: "latin:soundex", - lang: "en" -}); -``` - -> Use the `charset:variant` notation to assign charset and its variants. When just passing the charset without a variant will automatically resolve as `charset:default`. - -You can also override existing definitions, e.g.: - -```js -const index = FlexSearch({ - charset: "latin", - lang: "en", - matcher: {} -}); -``` - -> Passed definitions will __not__ extend default definitions, they will replace them. - -When you like to extend a definition just create a new language file and put in all the logic. - -#### Encoder Variants - -It is pretty straight forward when using an encoder variant: - -```html - - - - -``` - -When using the full "bundle" version the built-in latin encoders are already included and you just have to load the language file: - -```html - - -``` - -```js -const index_advanced = FlexSearch({ - charset: "latin:advanced" -}); - -const index_extra = FlexSearch({ - charset: "latin:extra" -}); -``` - -### Partial Tokenizer - -In FlexSearch you can't provide your own partial tokenizer, because it is a direct dependency to the core unit. The built-in tokenizer of FlexSearch splits each word into fragments by different patterns: - -1. strict (supports contextual index) -2. forward -3. reverse (including forward) -4. 
full - -### Language Processing Pipeline - -This is the default pipeline provided by FlexSearch: - -
*(schema graphic: the default language processing pipeline)*
- -#### Custom Pipeline - -At first take a look into the default pipeline in `src/common.js`. It is very simple and straight forward. The pipeline will process as some sort of inversion of control, the final encoder implementation has to handle charset and also language specific transformations. This workaround has left over from many tests. - -Inject the default pipeline by e.g.: - -```js -this.pipeline( - - /* string: */ str.toLowerCase(), - /* normalize: */ false, - /* split: */ split, - /* collapse: */ false -); -``` - -Use the pipeline schema from above to understand the iteration and the difference of pre-encoding and post-encoding. Stemmer and matchers needs to be applied after charset normalization but before language transformations, filters also. - -Here is a good example of extending pipelines: `src/lang/latin/extra.js` → `src/lang/latin/advanced.js` → `src/lang/latin/simple.js`. - -### How to contribute? - -Search for your language in `src/lang/`, if it exists you can extend or provide variants (like dialect/slang). If the language doesn't exist create a new file and check if any of the existing charsets (e.g. latin) fits to your language. When no charset exist, you need to provide a charset as a base for the language. - -A new charset should provide at least: - -1. `encode` A function which normalize the charset of a passed text content (remove special chars, lingual transformations, etc.) and __returns an array of separated words__. Also stemmer, matcher or stopword filter needs to be applied here. When the language has no words make sure to provide something similar, e.g. each chinese sign could also be a "word". Don't return the whole text content without split. -3. `rtl` A boolean flag which indicates right-to-left encoding - -Basically the charset needs just to provide an encoder function along with an indicator for right-to-left encoding: - -```js -export function encode(str){ return [str] } -export const rtl = false; -``` - -## Fuzzy-Search - -Fuzzysearch describes a basic concept of how making queries more tolerant. FlexSearch provides several methods to achieve fuzziness: - -1. Use a tokenizer: `forward`, `reverse` or `full` -2. Don't forget to use any of the builtin encoder `simple` > `balance` > `advanced` > `extra` > `soundex` (sorted by fuzziness) -3. Use one of the language specific presets e.g. `/lang/en.js` for en-US specific content -4. Enable suggestions by passing the search option `suggest: true` - -Additionally, you can apply custom `Mapper`, `Replacer`, `Stemmer`, `Filter` or by assigning a custom `normalize(str)`, `prepare(str)` or `finalize(arr)` function to the Encoder. - -### Compare Fuzzy-Search Encoding - -Original term which was indexed: "Struldbrugs" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
| Encoder | Index Size |
|---|---|
| LatinExact | 3.1 Mb |
| LatinDefault | 1.9 Mb |
| LatinSimple | 1.8 Mb |
| LatinBalance | 1.7 Mb |
| LatinAdvanced | 1.6 Mb |
| LatinExtra | 1.1 Mb |
| LatinSoundex | 0.7 Mb |

Query terms which were tested against the indexed term, ordered by increasing fuzziness: "Struldbrugs", "struldbrugs", "strũldbrųĝgs", "strultbrooks", "shtruhldbrohkz", "zdroltbrykz", "struhlbrogger".
- -The index size was measured after indexing the book "Gulliver's Travels". - - - - -## Memory Allocation - -The book "Gulliver's Travels Swift Jonathan 1726" was fully indexed for the examples below. - -The most memory-optimized meaningful setting will allocate just 1.2 Mb for the whole book indexed! This is probably the most tiny memory footprint you will get from a search library. - -```js -import { encode } from "./lang/latin/extra.js"; - -index = new Index({ - encode: encode, - tokenize: "strict", - optimize: true, - resolution: 1, - minlength: 3, - fastupdate: false, - context: false -}); -``` - - -### Memory Consumption - -The book "Gulliver's Travels" (Swift Jonathan 1726) was completely indexed for this test: - - - -### Compare Impact of Memory Allocation +The book "Gulliver's Travels" (Swift Jonathan 1726) was indexed for this test. by default a lexical index is very small:
`depth: 0, bidirectional: 0, resolution: 3, minlength: 0` => 2.1 Mb @@ -3400,16 +1317,15 @@ using bidirectional will decrease memory allocation:
enabling the option "fastupdate" will increase memory allocation:
`depth: 2, bidirectional: 1, resolution: 9, minlength: 3` => 6.3 Mb

## Presets

-1. `memory` (primary optimize for memory)
-2. `performance` (primary optimize for performance)
-3. `match` (primary optimize for matching)
-4. `score` (primary optimize for scoring)
-5. `default` (the default balanced profile)
+1. `memory` primarily optimized for a small memory footprint
+2. `performance` primarily optimized for high performance
+3. `match` primarily optimized for matching capabilities
+4. `score` primarily optimized for scoring capabilities (order of results)
+5. `default` the default balanced profile

-These profiles are covering standard use cases. It is recommended to apply custom configuration instead of using profiles to get the best out for your situation. Every profile could be optimized further to its specific task, e.g. extreme performance optimized configuration or extreme memory and so on.
+These profiles cover standard use cases. It is recommended to apply a custom configuration instead of using profiles to get the best out of it. Every profile could be optimized further for its specific task, e.g. an extremely performance-optimized or memory-optimized configuration, and so on.

You can pass a preset during creation/initialization of the index.

@@ -3419,51 +1335,6 @@

It is recommended to use numeric id values as reference when adding content to the index. The byte length of passed ids influences the memory consumption significantly. If this is not possible you should consider using an index table and mapping the ids to indexes; this becomes especially important when using contextual indexes on a large amount of content.

---

Copyright 2018-2025 Thomas Wilkerling, Hosted by Nextapps GmbH
diff --git a/doc/async.md b/doc/async.md
index 9c0bf81..9358e5f 100644
--- a/doc/async.md
+++ b/doc/async.md
@@ -2,9 +2,10 @@

> The async processing model is automatically observed by a runtime balancer to prevent any blocking issues, even on page load.

-The most methods of each index type provides an async version, e.g.:
+These methods of each index type provide an async version:

- addAsync()
- ~~appendAsync()~~
- updateAsync()
- removeAsync()
- searchAsync()
@@ -13,6 +14,8 @@ All those async versions always return a `Promise`, although a callback can be p

When calling async methods of the index, a runtime balancer observes the current event loop and will yield to the next event loop automatically.

### Task Priority

You can control how early the process should move over to the next event loop by passing the option property `priority`:

```js
@@ -28,6 +31,8 @@ When you have some very smooth running animation you should use a priority of `2

On Node.js you can slightly increase this priority e.g. to `6`, because there is no UI involved. A priority value of `9` will cycle the event loop every ~250ms, which is the maximum recommended blocking time. You should not use a value higher than this.

### Polling Tasks

Do not forget to `await` on every async task you apply to the index:

```js
@@ -36,4 +41,44 @@ for(let i = 0; i < 99999999; i++){
 }
 ```

-You can perform queries to the index during any other async batch is running. \ No newline at end of file
+You can perform queries on the index while any other async batch is running.

### Examples

You can assign callbacks to each async function:

```js
index.addAsync(id, content, function(){
    console.log("Task Done");
});

index.searchAsync(query, function(result){
    console.log("Results: ", result);
});
```

Or do not pass a callback function and get back a `Promise` instead:

```js
index.addAsync(id, content).then(function(){
    console.log("Task Done");
});

index.searchAsync(query).then(function(result){
    console.log("Results: ", result);
});
```

Or use `async` and `await`:

```js
async function add(){
    await index.addAsync(id, content);
    console.log("Task Done");
}

async function search(){
    const results = await index.searchAsync(query);
    console.log("Results: ", results);
}
```
diff --git a/doc/cache.md b/doc/cache.md
new file mode 100644
index 0000000..e80150e
--- /dev/null
+++ b/doc/cache.md
@@ -0,0 +1,17 @@
## Auto-Balanced Cache (By Popularity)

You need to initialize the cache and its limit during the creation of the index:

```js
const index = new Index({ cache: 100 });
```

```js
const results = index.searchCache(query);
```

A common scenario for using a cache is an autocomplete or instant search when typing.

> When passing a number as a limit, the cache automatically balances stored entries according to their popularity.

> When just using "true" the cache is unbounded and actually performs 2-3 times faster (because the balancer does not have to run).
diff --git a/doc/document-search.md b/doc/document-search.md
index 5160a72..215bdd0 100644
--- a/doc/document-search.md
+++ b/doc/document-search.md
@@ -1,4 +1,737 @@

## Index Documents (Field-Search)

### The Document Descriptor

Assuming our document has a data structure like this:

```json
{
    "id": 0,
    "content": "some text"
}
```

> The document descriptor has slightly changed: there is no `field` branch anymore, instead just apply one level higher, so `key` becomes a main member of the options.
For the new syntax the field "doc" was renamed to `document` and the field "field" was renamed to `index`:

```js
const index = new Document({
    document: {
        id: "id",
        index: ["content"]
    }
});

index.add({
    id: 0,
    content: "some text"
});
```

The field `id` describes where the ID or unique key lives inside your documents. The key gets the value `id` by default when not passed, so you can shorten the example from above to:

```js
const index = new Document({
    document: {
        index: ["content"]
    }
});
```

The member `index` holds a list of fields of your documents which you want to be indexed. When selecting just one field, you can pass a string. When also using the default key `id`, this shortens to just:

```js
const index = new Document({ document: "content" });
index.add({ id: 0, content: "some text" });
```

Assuming you have several fields, you can add multiple fields to the index:

```js
var docs = [{
    id: 0,
    title: "Title A",
    content: "Body A"
},{
    id: 1,
    title: "Title B",
    content: "Body B"
}];
```

```js
const index = new Document({
    id: "id",
    index: ["title", "content"]
});
```

You can pass custom options for each field:

```js
const index = new Document({
    id: "id",
    index: [{
        field: "title",
        tokenize: "forward",
        optimize: true,
        resolution: 9
    },{
        field: "content",
        tokenize: "strict",
        optimize: true,
        resolution: 5,
        minlength: 3,
        context: {
            depth: 1,
            resolution: 3
        }
    }]
});
```

Field options get inherited when global options were passed as well, e.g.:

```js
const index = new Document({
    tokenize: "strict",
    optimize: true,
    resolution: 9,
    document: {
        id: "id",
        index:[{
            field: "title",
            tokenize: "forward"
        },{
            field: "content",
            minlength: 3,
            context: {
                depth: 1,
                resolution: 3
            }
        }]
    }
});
```

Note: The context options from the field "content" are inherited by the corresponding field options, which in turn were inherited from the global options.

### Nested Data Fields (Complex Objects)

Assume the document array looks more complex (has nested branches etc.), e.g.:

```json
{
    "record": {
        "id": 0,
        "title": "some title",
        "content": {
            "header": "some text",
            "footer": "some text"
        }
    }
}
```

Then use the colon separated notation `root:child:child` to define hierarchy within the document descriptor:

```js
const index = new Document({
    document: {
        id: "record:id",
        index: [
            "record:title",
            "record:content:header",
            "record:content:footer"
        ]
    }
});
```
> Just add fields you want to query against. Do not add fields to the index which you just need in the result (but do not query against). For this purpose you can store documents independently of the index (read below).

When you want to query through a field you have to pass the exact key of the field you have defined in the `doc` as a field name (with colon syntax):

```js
index.search(query, {
    index: [
        "record:title",
        "record:content:header",
        "record:content:footer"
    ]
});
```

Same as:

```js
index.search(query, [
    "record:title",
    "record:content:header",
    "record:content:footer"
]);
```

Using field-specific options:

```js
index.search([{
    field: "record:title",
    query: "some query",
    limit: 100,
    suggest: true
},{
    field: "record:title",
    query: "some other query",
    limit: 100,
    suggest: true
}]);
```

You can perform a search through the same field with different queries.
+ +> When passing field-specific options you need to provide the full configuration for each field. They get not inherited like the document descriptor. + +### Complex Documents + +You need to follow 2 rules for your documents: + +1. The document cannot start with an Array at the root index. This will introduce sequential data and isn't supported yet. See below for a workaround for such data. + +```js +[ // <-- not allowed as document start! + { + "id": 0, + "title": "title" + } +] +``` + +2. The id can't be nested inside an array (also none of the parent fields can't be an array). This will introduce sequential data and isn't supported yet. See below for a workaround for such data. + +```js +{ + "records": [ // <-- not allowed when ID or tag lives inside! + { + "id": 0, + "title": "title" + } + ] +} +``` + +Here an example for a supported complex document: + +```json +{ + "meta": { + "tag": "cat", + "id": 0 + }, + "contents": [ + { + "body": { + "title": "some title", + "footer": "some text" + }, + "keywords": ["some", "key", "words"] + }, + { + "body": { + "title": "some title", + "footer": "some text" + }, + "keywords": ["some", "key", "words"] + } + ] +} +``` + +The corresponding document descriptor (when all fields should be indexed) looks like: + +```js +const index = new Document({ + document: { + id: "meta:id", + tag: "meta:tag", + index: [ + "contents[]:body:title", + "contents[]:body:footer", + "contents[]:keywords" + ] + } +}); +``` + +Again, when searching you have to use the same colon-separated-string from your field definition. + +```js +index.search(query, { + index: "contents[]:body:title" +}); +``` + +### Not Supported Documents (Sequential Data) + +This example breaks both rules from above: + +```js +[ // <-- not allowed as document start! + { + "tag": "cat", + "records": [ // <-- not allowed when ID or tag lives inside! + { + "id": 0, + "body": { + "title": "some title", + "footer": "some text" + }, + "keywords": ["some", "key", "words"] + }, + { + "id": 1, + "body": { + "title": "some title", + "footer": "some text" + }, + "keywords": ["some", "key", "words"] + } + ] + } +] +``` + +You need to apply some kind of structure normalization. + +A workaround to such a data structure looks like this: + +```js +const index = new Document({ + document: { + id: "record:id", + tag: "tag", + index: [ + "record:body:title", + "record:body:footer", + "record:body:keywords" + ] + } +}); + +function add(sequential_data){ + + for(let x = 0, data; x < sequential_data.length; x++){ + + data = sequential_data[x]; + + for(let y = 0, record; y < data.records.length; y++){ + + record = data.records[y]; + + index.add({ + id: record.id, + tag: data.tag, + record: record + }); + } + } +} + +// now just use add() helper method as usual: + +add([{ + // sequential structured data + // take the data example above +}]); +``` + +You can skip the first loop when your document data has just one index as the outer array. 
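+
+For flat sequential data (a single array of records, each carrying its own tag), the same normalization shrinks to a single loop. A minimal sketch reusing the descriptor from above (`records` is a hypothetical input array):
+
+```js
+function addRecords(records){
+
+    for(let i = 0; i < records.length; i++){
+
+        // wrap each record to match the document descriptor
+        index.add({
+            id: records[i].id,
+            tag: records[i].tag,
+            record: records[i]
+        });
+    }
+}
+```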
+
+### Add/Update/Remove Documents to/from the Index
+
+Add a document to the index:
+
+```js
+index.add({
+    id: 0,
+    title: "Foo",
+    content: "Bar"
+});
+```
+
+Update the index with a single object or an array of objects:
+
+```js
+index.update({
+    data:{
+        id: 0,
+        title: "Foo",
+        body: {
+            content: "Bar"
+        }
+    }
+});
+```
+
+Remove a single object or an array of objects from the index:
+
+```js
+index.remove(docs);
+```
+
+When the id is known, you can also simply remove by id (faster):
+
+```js
+index.remove(id);
+```
+
+### Join / Append Arrays
+
+In the complex example above, the field `keywords` is an array, but the markup did not have brackets like `keywords[]`. That will also detect the array, but instead of appending each entry to a new context, the array will be joined into one large string and added to the index.
+
+The difference between both kinds of adding array contents is the relevance when searching. When adding each item of an array via `append()` to its own context by using the syntax `field[]`, the relevance of the last entry is the same as that of the first entry. When you leave the brackets out of the notation, the array is joined into one whitespace-separated string. Here the first entry has the highest relevance, whereas the last entry has the lowest relevance.
+
+So assuming the keywords from the example above are pre-sorted by relevance according to their popularity, you will want to keep this order (the relevance information). For this purpose do not add brackets to the notation. Otherwise, the entries would be taken into a new scoring context (and the old order is lost).
+
+You can also leave out the bracket notation for better performance and a smaller memory footprint. Use it when you do not need the granularity of relevance per entry.
+
+### Field-Search
+
+Search through all fields:
+
+```js
+index.search(query);
+```
+
+Search through a specific field:
+
+```js
+index.search(query, { index: "title" });
+```
+
+Search through a given set of fields:
+
+```js
+index.search(query, { index: ["title", "content"] });
+```
+
+Same as:
+
+```js
+index.search(query, ["title", "content"]);
+```
+
+Pass custom modifiers and queries to each field:
+
+```js
+index.search([{
+    field: "content",
+    query: "some query",
+    limit: 100,
+    suggest: true
+},{
+    field: "content",
+    query: "some other query",
+    limit: 100,
+    suggest: true
+}]);
+```
+
+You can perform a search through the same field with different queries.
+
+See all available field-search options.
+
+### The Result Set
+
+Schema of the result-set:
+
+> `fields[] => { field, result[] => { document }}`
+
+The first index is an array of fields the query was applied to. Each of these fields has a record (object) with the 2 properties "field" and "result". The "result" is also an array and includes the results for this specific field. The result could be an array of IDs or enriched with stored document data.
+ +A non-enriched result set now looks like: + +```js +[{ + field: "title", + result: [0, 1, 2] +},{ + field: "content", + result: [3, 4, 5] +}] +``` + +An enriched result set now looks like: + +```js +[{ + field: "title", + result: [ + { id: 0, doc: { /* document */ }}, + { id: 1, doc: { /* document */ }}, + { id: 2, doc: { /* document */ }} + ] +},{ + field: "content", + result: [ + { id: 3, doc: { /* document */ }}, + { id: 4, doc: { /* document */ }}, + { id: 5, doc: { /* document */ }} + ] +}] +``` + +When using `pluck` instead of "field" you can explicitly select just one field and get back a flat representation: + +```js +index.search(query, { pluck: "title", enrich: true }); +``` + +```js +[ + { id: 0, doc: { /* document */ }}, + { id: 1, doc: { /* document */ }}, + { id: 2, doc: { /* document */ }} +] +``` + +This result set is a replacement of "boolean search". Instead of applying your bool logic to a nested object, you can apply your logic by yourself on top of the result-set dynamically. This opens hugely capabilities on how you process the results. Therefore, the results from the fields aren't squashed into one result anymore. That keeps some important information, like the name of the field as well as the relevance of each field results which didn't get mixed anymore. + +> A field search will apply a query with the boolean "or" logic by default. Each field has its own result to the given query. + +There is one situation where the `bool` property is being still supported. When you like to switch the default "or" logic from the field search into "and", e.g.: + +```js +index.search(query, { + index: ["title", "content"], + bool: "and" +}); +``` + +You will just get results which contains the query in both fields. That's it. + +### Tags + +Like the `key` for the ID just define the path to the tag: + +```js +const index = new Document({ + document: { + id: "id", + tag: "tag", + index: "content" + } +}); +``` + +```js +index.add({ + id: 0, + tag: "cat", + content: "Some content ..." +}); +``` + +Your data also can have multiple tags as an array: + +```js +index.add({ + id: 1, + tag: ["animal", "dog"], + content: "Some content ..." +}); +``` + +You can perform a tag-specific search by: + +```js +index.search(query, { + index: "content", + tag: "animal" +}); +``` + +This just gives you result which was tagged with the given tag. + +Use multiple tags when searching: + +```js +index.search(query, { + index: "content", + tag: ["cat", "dog"] +}); +``` + +This gives you result which are tagged with one of the given tag. + +> Multiple tags will apply as the boolean "or" by default. It just needs one of the tags to be existing. + +This is another situation where the `bool` property is still supported. When you like to switch the default "or" logic from the tag search into "and", e.g.: + +```js +index.search(query, { + index: "content", + tag: ["dog", "animal"], + bool: "and" +}); +``` + +You will just get results which contains both tags (in this example there is just one records which has the tag "dog" and "animal"). + +### Tag Search + +You can also fetch results from one or more tags when no query was passed: + +```js +index.search({ tag: ["cat", "dog"] }); +``` + +In this case the result-set looks like: + +```js +[{ + tag: "cat", + result: [ /* all cats */ ] +},{ + tag: "dog", + result: [ /* all dogs */ ] +}] +``` + +### Limit & Offset + +> By default, every query is limited to 100 entries. Unbounded queries leads into issues. You need to set the limit as an option to adjust the size. 
+
+You can set the limit and the offset for each query:
+
+```js
+index.search(query, { limit: 20, offset: 100 });
+```
+
+> You cannot pre-count the size of the result-set. That's a limit by the design of FlexSearch. When you really need a count of all results you are able to page through, just assign a high enough limit, get back all results and apply your paging offset manually (this also works on server-side). FlexSearch is fast enough that this isn't an issue.
+
+## Document Store
+
+Only a document index can have a store. You can use a document index instead of a flat index to get this functionality also when only storing ID-content-pairs.
+
+You can define independently which fields should be indexed and which fields should be stored. This way you can index fields which should not be included in the search result.
+
+> Do not use a store when: 1. an array of IDs as the result is good enough, or 2. you already have the contents/documents stored elsewhere (outside the index).
+
+> When the `store` attribute was set, you have to explicitly include all fields which should be stored (it acts like a whitelist).
+
+> When the `store` attribute was not set, the original document is stored as a fallback.
+
+This will add the whole original content to the store:
+
+```js
+const index = new Document({
+    document: {
+        index: "content",
+        store: true
+    }
+});
+
+index.add({ id: 0, content: "some text" });
+```
+
+### Access documents from internal store
+
+You can get indexed documents from the store:
+
+```js
+var data = index.get(1);
+```
+
+You can update/change store contents directly without changing the index by:
+
+```js
+index.set(1, data);
+```
+
+To update the store and also update the index, just use `index.update`, `index.add` or `index.append`.
+
+When you perform a query, whether on a document index or a flat index, you will always get back an array of IDs.
+
+Optionally you can enrich the query results automatically with stored contents by:
+
+```js
+index.search(query, { enrich: true });
+```
+
+Your results now look like:
+
+```js
+[{
+    id: 0,
+    doc: { /* content from store */ }
+},{
+    id: 1,
+    doc: { /* content from store */ }
+}]
+```
+
+### Configure Storage (Recommended)
+
+This will add just specific fields from a document to the store (the ID isn't necessary to keep in the store):
+
+```js
+const index = new Document({
+    document: {
+        index: "content",
+        store: ["author", "email"]
+    }
+});
+
+index.add(id, content);
+```
+
+You can configure independently what should be indexed and what should be stored. It is highly recommended to make use of this whenever you can.
+
+Here is a useful example of configuring doc and store:
+
+```js
+const index = new Document({
+    document: {
+        index: "content",
+        store: ["author", "email"]
+    }
+});
+
+index.add({
+    id: 0,
+    author: "Jon Doe",
+    email: "john@mail.com",
+    content: "Some content for the index ..."
+});
+```
+
+You can query through the contents and will get back the stored values instead:
+
+```js
+index.search("some content", { enrich: true });
+```
+
+Your results now look like:
+
+```js
+[{
+    field: "content",
+    result: [{
+        id: 0,
+        doc: {
+            author: "Jon Doe",
+            email: "john@mail.com",
+        }
+    }]
+}]
+```
+
+Both fields "author" and "email" are not indexed.
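+
+Additionally, `index.set()` accepts the document itself as a single argument; the ID is then resolved from the `id` key defined in the document descriptor:
+
+```js
+// equivalent to index.set(0, { ... }) when "id" is the configured key
+index.set({
+    id: 0,
+    author: "Jon Doe",
+    email: "john@mail.com"
+});
+```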
+ + ## Merge Document Results By default, the result set of Field-Search has a structure grouped by field names: diff --git a/doc/encoder.md b/doc/encoder.md index 62311bc..a160f00 100644 --- a/doc/encoder.md +++ b/doc/encoder.md @@ -180,4 +180,690 @@ const index = new Index({ }); ``` -If you get some good results please feel free to share your encoder. \ No newline at end of file +If you get some good results please feel free to share your encoder. + + + +#### Add custom tokenizer + +> A tokenizer split words/terms into components or partials. + +Define a private custom tokenizer during creation/initialization: +```js +var index = new FlexSearch({ + + tokenize: function(str){ + + return str.split(/\s-\//g); + } +}); +``` + +> The tokenizer function gets a string as a parameter and has to return an array of strings representing a word or term. In some languages every char is a term and also not separated via whitespaces. + + +#### Add language-specific stemmer and/or filter + +> __Stemmer:__ several linguistic mutations of the same word (e.g. "run" and "running") + +> __Filter:__ a blacklist of words to be filtered out from indexing at all (e.g. "and", "to" or "be") + +Assign a private custom stemmer or filter during creation/initialization: +```js +var index = new FlexSearch({ + + stemmer: { + + // object {key: replacement} + "ational": "ate", + "tional": "tion", + "enci": "ence", + "ing": "" + }, + filter: [ + + // array blacklist + "in", + "into", + "is", + "isn't", + "it", + "it's" + ] +}); +``` + +Using a custom filter, e.g.: +```js +var index = new FlexSearch({ + + filter: function(value){ + + // just add values with length > 1 to the index + + return value.length > 1; + } +}); +``` + +Or assign stemmer/filters globally to a language: + +> Stemmer are passed as a object (key-value-pair), filter as an array. + +```js +FlexSearch.registerLanguage("us", { + + stemmer: { /* ... */ }, + filter: [ /* ... */ ] +}); +``` + +Or use some pre-defined stemmer or filter of your preferred languages: +```html + + + + + + +... +``` + +Now you can assign built-in stemmer during creation/initialization: +```js +var index_en = new FlexSearch.Index({ + language: "en" +}); + +var index_de = new FlexSearch.Index({ + language: "de" +}); +``` + +In Node.js all built-in language packs files are available: + +```js +const { Index } = require("flexsearch"); + +var index_en = new Index({ + language: "en" +}); +``` + + +### Right-To-Left Support + +> Set the tokenizer at least to "reverse" or "full" when using RTL. + +Just set the field "rtl" to _true_ and use a compatible tokenizer: + +```js +var index = new Index({ + encode: str => str.toLowerCase().split(/[^a-z]+/), + tokenize: "reverse", + rtl: true +}); +``` + + +### CJK Word Break (Chinese, Japanese, Korean) + +Set a custom tokenizer which fits your needs, e.g.: + +```js +var index = FlexSearch.create({ + encode: str => str.replace(/[\x00-\x7F]/g, "").split("") +}); +``` + +You can also pass a custom encoder function to apply some linguistic transformations. + +```js +index.add(0, "一个单词"); +``` + +```js +var results = index.search("单词"); +``` + + +## Fuzzy-Search + +Fuzzysearch describes a basic concept of how making queries more tolerant. FlexSearch provides several methods to achieve fuzziness: + +1. Use a tokenizer: `forward`, `reverse` or `full` +2. Don't forget to use any of the builtin encoder `simple` > `balance` > `advanced` > `extra` > `soundex` (sorted by fuzziness) +3. Use one of the language specific presets e.g. 
`/lang/en.js` for en-US specific content +4. Enable suggestions by passing the search option `suggest: true` + +Additionally, you can apply custom `Mapper`, `Replacer`, `Stemmer`, `Filter` or by assigning a custom `normalize(str)`, `prepare(str)` or `finalize(arr)` function to the Encoder. + +### Compare Fuzzy-Search Encoding + +Original term which was indexed: "Struldbrugs" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Encoder: | LatinExact | LatinDefault | LatinSimple | LatinBalance | LatinAdvanced | LatinExtra | LatinSoundex |
+|---|---|---|---|---|---|---|---|
+| Index Size | 3.1 Mb | 1.9 Mb | 1.8 Mb | 1.7 Mb | 1.6 Mb | 1.1 Mb | 0.7 Mb |
+
+Query terms tested against the indexed term, roughly ordered from exact to most fuzzy:
+
+- `Struldbrugs`
+- `struldbrugs`
+- `strũldbrųĝgs`
+- `strultbrooks`
+- `shtruhldbrohkz`
+- `zdroltbrykz`
+- `struhlbrogger`
+ +The index size was measured after indexing the book "Gulliver's Travels". + + +## Encoder + +Search capabilities highly depends on language processing. The old workflow wasn't really practicable. The new Encoder class is a huge improvement and fully replaces the encoding part. Some FlexSearch options was moved to the new `Encoder` instance. + +New Encoding Pipeline: +1. charset normalization +2. custom preparation +3. split into terms (apply includes/excludes) +4. filter (pre-filter) +5. matcher (substitute terms) +6. stemmer (substitute term endings) +7. filter (post-filter) +8. replace chars (mapper) +9. custom regex (replacer) +10. letter deduplication +11. apply finalize + +### Example + +```js +const encoder = new Encoder({ + normalize: true, + dedupe: true, + cache: true, + include: { + letter: true, + number: true, + symbol: false, + punctuation: false, + control: false, + char: "@" + } +}); +``` + +You can use an `include` __instead__ of an `exclude` definition: + +```js +const encoder = new Encoder({ + exclude: { + letter: false, + number: false, + symbol: true, + punctuation: true, + control: true + } +}); +``` + +Instead of using `include` or `exclude` you can pass a regular expression to the field `split`: + +```js +const encoder = new Encoder({ + split: /\s+/ +}); +``` + +> The definitions `include` and `exclude` is a replacement for `split`. You can just define one of those 3. + +Adding custom functions to the encoder pipeline: + +```js +const encoder = new Encoder({ + normalize: function(str){ + return str.toLowerCase(); + }, + prepare: function(str){ + return str.replace(/&/g, " and "); + }, + finalize: function(arr){ + return arr.filter(term => term.length > 2); + } +}); +``` + +Assign encoder to an index: + +```js +const index = new Index({ + encoder: encoder +}); +``` + +Define language specific transformations: + +```js +const encoder = new Encoder({ + replacer: [ + /[´`’ʼ]/g, "'" + ], + filter: new Set([ + "and", + ]), + matcher: new Map([ + ["xvi", "16"] + ]), + stemmer: new Map([ + ["ly", ""] + ]), + mapper: new Map([ + ["é", "e"] + ]) +}); +``` + +Or use predefined language and extend it with custom options: + +```js +import EnglishBookPreset from "./lang/en.js"; +const encoder = new Encoder(EnglishBookPreset, { + filter: false +}); +``` + +Equivalent: + +```js +import EnglishBookPreset from "./lang/en.js"; +const encoder = new Encoder(EnglishBookPreset); +encoder.assign({ filter: false }); +``` + +Assign extensions to the encoder instance: + +```js +import LatinEncoderPreset from "./charset/latin/simple.js"; +import EnglishBookPreset from "./lang/en.js"; +// stack definitions to the encoder instance +const encoder = new Encoder() + .assign(LatinEncoderPreset) + .assign(EnglishBookPreset) + // override preset options ... + .assign({ minlength: 3 }); + // assign further presets ... +``` + +> When adding extension to the encoder every previously assigned configuration is still intact, very much like Mixins, also when assigning custom functions. 
+ +Add custom transformations to an existing index: + +```js +import LatinEncoderPreset from "./charset/latin/default.js"; +const encoder = new Encoder(LatinEncoderPreset); +encoder.addReplacer(/[´`’ʼ]/g, "'"); +encoder.addFilter("and"); +encoder.addMatcher("xvi", "16"); +encoder.addStemmer("ly", ""); +encoder.addMapper("é", "e"); +``` + +Shortcut for just assigning one encoder configuration to an index: + +```js +import LatinEncoderPreset from "./charset/latin/default.js"; +const index = new Index({ + encoder: LatinEncoderPreset +}); +``` + +### Custom Encoder + +Since it is very simple to create a custom Encoder, you are welcome to create your own. +e.g. +```js +function customEncoder(content){ + const tokens = []; + // split content into terms/tokens + // apply your changes to each term/token + // you will need to return an Array of terms/tokens + // so just iterate through the input string and + // push tokens to the array + // ... + return tokens; +} + +const index = new Index({ + // set to strict when your tokenization was already done + tokenize: "strict", + encode: customEncoder +}); +``` + +If you get some good results please feel free to share your encoder. + +## Languages + +Language-specific definitions are being divided into two groups: + +1. Charset + 1. ___encode___, type: `function(string):string[]` + 2. ___rtl___, type: `boolean` +2. Language + 1. ___matcher___, type: `{string: string}` + 2. ___stemmer___, type: `{string: string}` + 3. ___filter___, type: `string[]` + +The charset contains the encoding logic, the language contains stemmer, stopword filter and matchers. Multiple language definitions can use the same charset encoder. Also this separation let you manage different language definitions for special use cases (e.g. names, cities, dialects/slang, etc.). + +To fully describe a custom language __on the fly__ you need to pass: + +```js +const index = FlexSearch({ + // mandatory: + encode: (content) => [words], + // optionally: + rtl: false, + stemmer: {}, + matcher: {}, + filter: [] +}); +``` + +When passing no parameter it uses the `latin:default` schema by default. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Field | Category | Description |
+|---|---|---|
+| encode | charset | The encoder function. Has to return an array of separated words (or an empty string). |
+| rtl | charset | A boolean property which indicates right-to-left encoding. |
+| filter | language | Filters are also known as "stopwords"; they completely exclude words from being indexed. |
+| stemmer | language | Stemmer removes word endings and is a kind of "partial normalization". A word ending is only matched when the word length is bigger than the matched partial. |
+| matcher | language | Matcher replaces all occurrences of a given string regardless of its position and is also a kind of "partial normalization". |
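+
+Putting these fields together, a complete language definition could be registered globally. A minimal sketch with placeholder rules (the rule values here are illustrative, not a real language pack):
+
+```js
+FlexSearch.registerLanguage("en", {
+    // stopwords to be excluded from indexing
+    filter: ["and", "or", "the"],
+    // replace word endings ("partial normalization")
+    stemmer: { "ational": "ate", "ly": "" },
+    // replace all occurrences regardless of position
+    matcher: { "xvi": "16" }
+});
+```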
+ +### 1. Language Packs: ES6 Modules + +The most simple way to assign charset/language specific encoding via modules is: + +```js +import charset from "./dist/module/lang/latin/advanced.js"; +import lang from "./dist/module/lang/en.js"; + +const index = FlexSearch({ + charset: charset, + lang: lang +}); +``` + +Just import the __default export__ by each module and assign them accordingly. + +The full qualified example from above is: + +```js +import { encode, rtl } from "./dist/module/lang/latin/advanced.js"; +import { stemmer, filter, matcher } from "./dist/module/lang/en.js"; + +const index = FlexSearch({ + encode: encode, + rtl: rtl, + stemmer: stemmer, + matcher: matcher, + filter: filter +}); +``` + +The example above is the standard interface which is at least exported from each charset/language. + +You can also define the encoder directly and left all other options: + +```js +import simple from "./dist/module/lang/latin/simple.js"; + +const index = FlexSearch({ + encode: simple +}); +``` + +#### Available Latin Encoders + +1. default +2. simple +3. balance +4. advanced +5. extra + +You can assign a charset by passing the charset during initialization, e.g. `charset: "latin"` for the default charset encoder or `charset: "latin:soundex"` for a encoder variant. + +#### Dialect / Slang + +Language definitions (especially matchers) also could be used to normalize dialect and slang of a specific language. + +### 2. Language Packs: ES5 (Language Packs) + +You need to make the charset and/or language definitions available by: + +1. All charset definitions are included in the `flexsearch.bundle.js` build by default, but no language-specific definitions are included +2. You can load packages located in `/dist/lang/` (files refers to languages, folders are charsets) +3. You can make a custom build + +When loading language packs, make sure that the library was loaded before: + +```html + + + +``` + +When using the full "bundle" version the built-in latin encoders are already included and you just have to load the language file: + +```html + + +``` + +Because you loading packs as external packages (non-ES6-modules) you have to initialize them by shortcuts: + +```js +const index = FlexSearch({ + charset: "latin:soundex", + lang: "en" +}); +``` + +> Use the `charset:variant` notation to assign charset and its variants. When just passing the charset without a variant will automatically resolve as `charset:default`. + +You can also override existing definitions, e.g.: + +```js +const index = FlexSearch({ + charset: "latin", + lang: "en", + matcher: {} +}); +``` + +> Passed definitions will __not__ extend default definitions, they will replace them. + +When you like to extend a definition just create a new language file and put in all the logic. + +#### Encoder Variants + +It is pretty straight forward when using an encoder variant: + +```html + + + + +``` + +When using the full "bundle" version the built-in latin encoders are already included and you just have to load the language file: + +```html + + +``` + +```js +const index_advanced = FlexSearch({ + charset: "latin:advanced" +}); + +const index_extra = FlexSearch({ + charset: "latin:extra" +}); +``` + + +### Language Processing Pipeline + +This is the default pipeline provided by FlexSearch: + +
+*(figure: the default language processing pipeline)*
+ +#### Custom Pipeline + +At first take a look into the default pipeline in `src/common.js`. It is very simple and straight forward. The pipeline will process as some sort of inversion of control, the final encoder implementation has to handle charset and also language specific transformations. This workaround has left over from many tests. + +Inject the default pipeline by e.g.: + +```js +this.pipeline( + + /* string: */ str.toLowerCase(), + /* normalize: */ false, + /* split: */ split, + /* collapse: */ false +); +``` + +Use the pipeline schema from above to understand the iteration and the difference of pre-encoding and post-encoding. Stemmer and matchers needs to be applied after charset normalization but before language transformations, filters also. + +Here is a good example of extending pipelines: `src/lang/latin/extra.js` → `src/lang/latin/advanced.js` → `src/lang/latin/simple.js`. + +### How to contribute? + +Search for your language in `src/lang/`, if it exists you can extend or provide variants (like dialect/slang). If the language doesn't exist create a new file and check if any of the existing charsets (e.g. latin) fits to your language. When no charset exist, you need to provide a charset as a base for the language. + +A new charset should provide at least: + +1. `encode` A function which normalize the charset of a passed text content (remove special chars, lingual transformations, etc.) and __returns an array of separated words__. Also stemmer, matcher or stopword filter needs to be applied here. When the language has no words make sure to provide something similar, e.g. each chinese sign could also be a "word". Don't return the whole text content without split. +3. `rtl` A boolean flag which indicates right-to-left encoding + +Basically the charset needs just to provide an encoder function along with an indicator for right-to-left encoding: + +```js +export function encode(str){ return [str] } +export const rtl = false; +``` diff --git a/doc/export-import.md b/doc/export-import.md index 5f124ba..da24337 100644 --- a/doc/export-import.md +++ b/doc/export-import.md @@ -26,6 +26,7 @@ for(let i = 0; i < files.length; i++){ > You'll need to use the same configuration as you used before the export. Any changes on the configuration needs to be re-indexed. + ## Fast-Boot Serialization for Server-Side-Rendering (PHP, Python, Ruby, Rust, Java, Go, Node.js, ...) > This is an experimental feature with limited support which probably might drop in future release. You're welcome to give some feedback. @@ -97,3 +98,93 @@ This function is callable like the above example: const index = new Index(); inject(index); ``` + + + + +## Export / Import (In-Memory) + +### Node.js + +> Persistent-Indexes and Worker-Indexes don't support Import/Export. + +Export an `Index` or `Document-Index` to the folder `/export/`: + +```js +import { promises as fs } from "fs"; + +await index.export(async function(key, data){ + await fs.writeFile("./export/" + key, data, "utf8"); +}); +``` + +Import from folder `/export/` into an `Index` or `Document-Index`: + +```js +const index = new Index({/* keep old config and place it here */}); + +const files = await fs.readdir("./export/"); +for(let i = 0; i < files.length; i++){ + const data = await fs.readFile("./export/" + files[i], "utf8"); + await index.import(files[i], data); +} +``` + +> You'll need to use the same configuration as you used before the export. Any changes on the configuration needs to be re-indexed. 
+ +### Browser + +```js +index.export(function(key, data){ + + // you need to store both the key and the data! + // e.g. use the key for the filename and save your data + + localStorage.setItem(key, data); +}); +``` + +> The size of the export corresponds to the memory consumption of the library. To reduce export size you have to use a configuration which has less memory footprint (use the table at the bottom to get information about configs and its memory allocation). + +When your save routine runs asynchronously you have to use `async/await` or return a promise: + +```js +index.export(function(key, data){ + + return new Promise(function(resolve){ + + // do the saving as async + + resolve(); + }); +}); +``` + +Before you can import data, you need to create your index first. For document indexes provide the same document descriptor you used when export the data. This configuration isn't stored in the export. + +```js +const index = new Index({/* keep old config and place it here */}); +``` + +To import the data just pass a key and data: + +``` +const data = localStorage.getItem(key); +index.import(key, data); +``` + +You need to import every key! Otherwise, your index does not work. You need to store the keys from the export and use this keys for the import (the order of the keys can differ). + +> The feature "fastupdate" is automatically disabled on import. + +This is just for demonstration and is not recommended, because you might have other keys in your localStorage which aren't supported as an import: + +```js +var keys = Object.keys(localStorage); + +for(let i = 0, key, data; i < keys.length; i++){ + key = keys[i] + data = localStorage.getItem(key); + index.import(key, data); +} +``` diff --git a/doc/options.md b/doc/options.md new file mode 100644 index 0000000..11e2f48 --- /dev/null +++ b/doc/options.md @@ -0,0 +1,404 @@ +## Index Options + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Option | Values | Description | Default |
+|---|---|---|---|
+| preset | "memory"<br>"performance"<br>"match"<br>"score"<br>"default" | The configuration profile as a shortcut or as a base for your custom settings. | "default" |
+| tokenize | "strict", "exact"<br>"forward"<br>"reverse", "bidirectional"<br>"full" | The indexing mode (tokenizer). Choose one of the built-ins or pass a custom tokenizer function. | "strict" |
+| cache | Boolean<br>Number | Enable/disable caching and/or set the capacity of cached entries. When passing a number as a limit, the cache automatically balances stored entries according to their popularity. Note: when just using "true" the cache has no limits and grows unbounded. | false |
+| resolution | Number | Sets the scoring resolution. | 9 |
+| context | Boolean<br>Context Options | Enable/disable contextual indexing. When passing "true" as value it will take the default values for the context. | false |
+| optimize | Boolean | When enabled it uses a memory-optimized stack flow for the index. | true |
+| boost | function(arr, str, int) => float | A custom boost function used when indexing contents to the index. The function has this signature: `Function(words[], term, index) => Float`. It has 3 parameters where you get an array of all words, the current term and the current index where the term is placed in the word array. You can apply your own calculation, e.g. the occurrences of a term, and return this factor (<1 means relevance is lowered, >1 means relevance is increased). Note: this feature is currently limited to the tokenizer "strict". | null |
+
+**Language-specific Options and Encoding:**
+
+| Option | Values | Description | Default |
+|---|---|---|---|
+| charset | Charset Payload<br>String (key) | Provide a custom charset payload or pass one of the keys of built-in charsets. | "latin" |
+| language | Language Payload<br>String (key) | Provide a custom language payload or pass a language shorthand flag (ISO-3166) of a built-in language. | null |
+| encode | false<br>"default"<br>"simple"<br>"balance"<br>"advanced"<br>"extra"<br>function(str) => [words] | The encoding type. Choose one of the built-ins or pass a custom encoding function. | "default" |
+| stemmer | false<br>String<br>Function | | false |
+| filter | false<br>String<br>Function | | false |
+| matcher | false<br>String<br>Function | | false |
+
+**Additional Options for Document Indexes:**
+
+| Option | Values | Description | Default |
+|---|---|---|---|
+| worker | Boolean | Enable/disable and set the count of running worker threads. | false |
+| document | Document Descriptor | Includes definitions for the document index and storage. | |
+
+## Context Options
+
+| Option | Values | Description | Default |
+|---|---|---|---|
+| resolution | Number | Sets the scoring resolution for the context. | 1 |
+| depth | false<br>Number | Enable/disable contextual indexing and also set the contextual distance of relevance. Depth is the maximum number of words/tokens away a term may be to be considered as relevant. | 1 |
+| bidirectional | Boolean | Sets bidirectional search results. If enabled and the source text contains "red hat", it will be found for the queries "red hat" and "hat red". | true |
+
+## Document Options
+
+| Option | Values | Description | Default |
+|---|---|---|---|
+| id | String | | "id" |
+| tag | false<br>String | | "tag" |
+| index | String<br>`Array<String>`<br>`Array<Object>` | | |
+| store | Boolean<br>String<br>`Array<String>` | | false |
+
+## Encoder Options
+
+| Option | Values | Description | Default |
+|---|---|---|---|
+| split | false<br>RegExp<br>String | The rule to split words when using a non-custom tokenizer (built-ins, e.g. "forward"). Use a string/char or a regular expression. | /[\W_]+/ |
+| rtl | Boolean | Enables right-to-left encoding. | false |
+| encode | function(str) => [words] | The custom encoding function. | /lang/latin/default.js |
+| stemmer | false<br>String<br>Function | Disable or pass in a language shorthand flag (ISO-3166) or a custom object. | |
+| filter | false<br>String<br>Function | Disable or pass in a language shorthand flag (ISO-3166) or a custom array. | |
+| matcher | false<br>String<br>Function | Disable or pass in a language shorthand flag (ISO-3166) or a custom array. | |
+
+## Search Options
+
+| Option | Values | Description | Default |
+|---|---|---|---|
+| limit | number | Sets the limit of results. | 100 |
+| offset | number | Apply offset (skip items). | 0 |
+| suggest | Boolean | Enables suggestions in results. | false |
+
+## Document Search Options
+
+| Option | Values | Description | Default |
+|---|---|---|---|
+| index | String<br>`Array<String>`<br>`Array<Object>` | Sets the document fields which should be searched. When no field is set, all fields will be searched. Custom options per field are also supported. | |
+| tag | String<br>`Array<String>` | Filters the results by one or more tags. | false |
+| enrich | Boolean | Enrich IDs from the results with the corresponding documents. | false |
+| bool | "and"<br>"or" | Sets the logical operator used when searching through multiple fields or tags. | "or" |
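+
+A combined sketch using several of the options above (field and tag names are taken from the document-search examples elsewhere in this document):
+
+```js
+index.search("some query", {
+    index: ["title", "content"], // fields to search
+    tag: ["cat", "dog"],         // filter by tags ("or" by default)
+    bool: "or",
+    limit: 20,
+    offset: 0,
+    suggest: true,
+    enrich: true                 // resolve IDs to stored documents
+});
+```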
diff --git a/doc/persistent.md b/doc/persistent.md index 57edc65..f016066 100644 --- a/doc/persistent.md +++ b/doc/persistent.md @@ -235,3 +235,44 @@ __Search 1:__ Single term query
 __Search N:__ Multi term query (Context-Search)
 
 The benchmark was executed against a single client.
+
+## Delete Store + Migration
+
+Currently there exists no migration tool. You will probably need some kind of migration on future updates or when you need to re-create the index on the database.
+
+> [!CAUTION]
+> Please use the methods `index.destroy()` and `index.clear()` carefully. These methods will delete contents (truncate, drop) from the database according to the `name` passed on initialization.
+
+Just clear all contents (truncate equivalent) from a store which is connected to an index:
+
+```js
+// always define a unique name when assigning a storage
+const db = new Database("my-store", config);
+await index.mount(db);
+// truncate all contents
+await index.clear();
+```
+
+Drop all tables (and their schema):
+
+```js
+// always define a unique name when assigning a storage
+const db = new Database("my-store", config);
+await index.mount(db);
+// drop all associated tables
+await index.destroy();
+```
+
+A full migration cycle could be combined as:
+
+```js
+// always define a unique name when assigning a storage
+const db = new Database("my-store", config);
+await index.mount(db);
+// drop all associated tables
+await index.destroy();
+// when destroyed you'll need to mount again
+// to run table creation
+await index.mount(db);
+// access index ...
+```
diff --git a/doc/worker.md b/doc/worker.md
index e001ad2..e9845af 100644
--- a/doc/worker.md
+++ b/doc/worker.md
@@ -1,3 +1,149 @@
+
+
+## Worker Parallelism (Browser + Node.js)
+
+The new worker model from v0.7.0 is divided into "fields" from the document (1 worker = 1 field index). This way a worker is able to solve tasks (subtasks) completely. The downside of this paradigm is that workers might not be perfectly balanced in storing contents (fields may have different lengths of content). On the other hand there is no indication that balancing the storage gives any advantage (all of them require the same amount in total).
+
+When using a document index, just apply the option "worker":
+```js
+const index = new Document({
+    index: ["tag", "name", "title", "text"],
+    worker: true
+});
+
+index.add({
+    id: 1, tag: "cat", name: "Tom", title: "some", text: "some"
+}).add({
+    id: 2, tag: "dog", name: "Ben", title: "title", text: "content"
+}).add({
+    id: 3, tag: "cat", name: "Max", title: "to", text: "to"
+}).add({
+    id: 4, tag: "dog", name: "Tim", title: "index", text: "index"
+});
+```
+
+```
+Worker 1: { 1: "cat", 2: "dog", 3: "cat", 4: "dog" }
+Worker 2: { 1: "Tom", 2: "Ben", 3: "Max", 4: "Tim" }
+Worker 3: { 1: "some", 2: "title", 3: "to", 4: "index" }
+Worker 4: { 1: "some", 2: "content", 3: "to", 4: "index" }
+```
+
+When you perform a field search through all fields, this task is balanced perfectly across all workers, which can solve their subtasks independently.
+
+### Worker Index
+
+Above we have seen that a document index creates a worker automatically for each field. You can also create a WorkerIndex directly (same as using `Index` instead of `Document`).
+
+Use as ES6 module:
+
+```js
+import WorkerIndex from "./worker/index.js";
+const index = new WorkerIndex(options);
+index.add(1, "some")
+     .add(2, "content")
+     .add(3, "to")
+     .add(4, "index");
+```
+
+Or when the bundled version is used instead:
+
+```js
+var index = new FlexSearch.Worker(options);
+index.add(1, "some")
+     .add(2, "content")
+     .add(3, "to")
+     .add(4, "index");
+```
+
+Such a WorkerIndex works pretty much the same as a created instance of `Index`.
+ +> A WorkerIndex only support the `async` variant of all methods. That means when you call `index.search()` on a WorkerIndex this will perform also in async the same way as `index.searchAsync()` will do. + +### Worker Threads (Node.js) + +The worker model for Node.js is based on "worker threads" and works exactly the same way: + +```js +const { Document } = require("flexsearch"); + +const index = new Document({ + index: ["tag", "name", "title", "text"], + worker: true +}); +``` + +Or create a single worker instance for a non-document index: + +```js +const { Worker } = require("flexsearch"); +const index = new Worker({ options }); +``` + +### The Worker Async Model (Best Practices) + +A worker will always perform as async. On a query method call you always should handle the returned promise (e.g. use `await`) or pass a callback function as the last parameter. + +```js +const index = new Document({ + index: ["tag", "name", "title", "text"], + worker: true +}); +``` + +All requests and sub-tasks will run in parallel (prioritize "all tasks completed"): + +```js +index.searchAsync(query, callback); +index.searchAsync(query, callback); +index.searchAsync(query, callback); +``` + +Also (prioritize "all tasks completed"): + +```js +index.searchAsync(query).then(callback); +index.searchAsync(query).then(callback); +index.searchAsync(query).then(callback); +``` + +Or when you have just one callback when all requests are done, simply use `Promise.all()` which also prioritize "all tasks completed": + +```js +Promise.all([ + index.searchAsync(query), + index.searchAsync(query), + index.searchAsync(query) +]).then(callback); +``` + +Inside the callback of `Promise.all()` you will also get an array of results as the first parameter respectively for each query you put into. + +When using `await` you can prioritize the order (prioritize "first task completed") and solve requests one by one and just process the sub-tasks in parallel: + +```js +await index.searchAsync(query); +await index.searchAsync(query); +await index.searchAsync(query); +``` + +Same for `index.add()`, `index.append()`, `index.remove()` or `index.update()`. Here there is a special case which isn't disabled by the library, but you need to keep in mind when using Workers. + +When you call the "synced" version on a worker index: + +```js +index.add(doc); +index.add(doc); +index.add(doc); +// contents aren't indexed yet, +// they just queued on the message channel +``` + +Of course, you can do that but keep in mind that the main thread does not have an additional queue for distributed worker tasks. Running these in a long loop fires content massively to the message channel via `worker.postMessage()` internally. Luckily the browser and Node.js will handle such incoming tasks for you automatically (as long enough free RAM is available). When using the "synced" version on a worker index, the content isn't indexed one line below, because all calls are treated as async by default. + +> When adding/updating/removing large bulks of content to the index (or high frequency), it is recommended to use the async version along with `async/await` to keep a low memory footprint during long processes. 
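+
+Following that advice, a minimal bulk-indexing sketch (the `docs` array is a hypothetical input):
+
+```js
+async function addBulk(docs){
+    for(let i = 0; i < docs.length; i++){
+        // wait for each task so the message channel is not flooded
+        await index.addAsync(docs[i]);
+    }
+}
+```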
+ + ## Extern Worker Configuration When using Worker by __also__ assign custom functions to the options e.g.: diff --git a/index.d.ts b/index.d.ts index 808e088..a416b3a 100644 --- a/index.d.ts +++ b/index.d.ts @@ -439,6 +439,9 @@ declare module "flexsearch" { contain(id: Id): boolean | Promise; clear(): void | Promise; cleanup(): void | Promise; + get(id: Id): Promise | DocumentData | null; + set(id: Id, document: DocumentData): this; + set(document: DocumentData): this; // Export and Import export(handler: ExportHandler): void; @@ -561,11 +564,15 @@ declare module "flexsearch" { export class Resolver { constructor(options?: ResolverOptions | IntermediateSearchResults); + result: IntermediateSearchResults; + and(options: ResolverOptions): this; + or(options: ResolverOptions): this; + xor(options: ResolverOptions): this; + not(options: ResolverOptions): this; limit(limit: number): this; offset(offset: number): this; boost(boost: number): this; resolve(options?: DefaultResolve): SearchResults; - result: IntermediateSearchResults } class StorageInterface { diff --git a/package-lock.json b/package-lock.json index 9fcb704..f61cb45 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "flexsearch", - "version": "0.8.142", + "version": "0.8.143", "lockfileVersion": 2, "requires": true, "packages": { "": { "name": "flexsearch", - "version": "0.8.142", + "version": "0.8.143", "funding": [ { "type": "github", diff --git a/src/document.js b/src/document.js index 40d48a5..e3f535c 100644 --- a/src/document.js +++ b/src/document.js @@ -508,20 +508,25 @@ if(SUPPORT_STORE){ if(SUPPORT_PERSISTENT && this.db){ return this.index.get(this.field[0]).db.enrich(id).then(function(result){ - return result[0] && result[0]["doc"]; + return (result[0] && result[0]["doc"]) || null; }); } - return this.store.get(id); + return this.store.get(id) || null; }; /** - * @param {number|string} id + * @param {number|string|Object} id * @param {Object} data * @return {Document} */ Document.prototype.set = function(id, data){ + if(is_object(id)){ + data = id; + id = parse_simple(data, this.key); + } + this.store.set(id, data); return this; };