1
0
mirror of https://github.com/nextapps-de/flexsearch.git synced 2025-09-25 21:08:59 +02:00
Files
flexsearch/dist/module-debug/index/add.js

249 lines
9.1 KiB
JavaScript

import { create_object } from "../common.js";
import Index, { autoCommit } from "../index.js";
import default_compress from "../compress.js";
import { KeystoreArray } from "../keystore.js";
// TODO:
// string + number as text
// boolean, null, undefined as ?
/**
 * Indexes `content` under `id`.
 *
 * The content is passed through `this.encoder.encode()`, which yields the list
 * of terms. Each term is then split into tokens according to `this.tokenize`
 * and every token is handed to `this.push_index()`:
 *
 *  - "full":          every substring of the term (terms longer than 2 chars,
 *                     shorter terms fall through to the next mode)
 *  - "bidirectional",
 *    "reverse":       every suffix of the term, followed by everything that
 *                     "forward" does
 *  - "forward":       every prefix of the term (terms longer than 1 char)
 *  - anything else:   the whole term ("strict"); only this mode also writes
 *                     context pairs when `this.depth` is set
 *
 * When the ID is already registered and neither `_append` nor `_skip_update`
 * is set, the call is delegated to `this.update(id, content)`.
 *
 * @param {!number|string} id
 * @param {!string} content
 * @param {boolean=} _append forwarded to `push_index` as its `append` flag
 * @param {boolean=} _skip_update skip the "already registered" check
 * @returns {Index} this instance (or the result of `this.update()` when the
 *   call was delegated)
 */
Index.prototype.add = function (id, content, _append, _skip_update) {
    if (content && (id || 0 === id)) {
        // todo check skip_update
        if (!_skip_update && !_append) {
            if (this.reg.has(id)) {
                return this.update(id, content);
            }
        }
        const depth = this.depth;
        // do not force a string as input
        // https://github.com/nextapps-de/flexsearch/issues/432
        content = this.encoder.encode(content, !depth);
        const word_length = content.length;
        if (word_length) {
            // check context dupes to skip all contextual redundancy along a document
            const dupes_ctx = create_object(),
                dupes = create_object(),
                resolution = this.resolution;
            for (let i = 0; i < word_length; i++) {
                // "i" is the iteration step; with rtl the terms are visited back to front
                let term = content[this.rtl ? word_length - 1 - i : i],
                    term_length = term.length;
                // todo check context search
                // this check also wasn't applied on search, so it's useless here
                // skip dupes will break the context chain
                if (term_length && (depth || !dupes[term])) {
                    let score = this.score ? this.score(content, term, i, null, 0) : get_score(resolution, word_length, i),
                        token = "";
                    switch (this.tokenize) {
                        case "full":
                            if (2 < term_length) {
                                for (let x = 0, _x; x < term_length; x++) {
                                    for (let y = term_length; y > x; y--) {
                                        token = term.substring(x, y);
                                        _x = this.rtl ? term_length - 1 - x : x;
                                        const partial_score = this.score ? this.score(content, term, i, token, _x) : get_score(resolution, word_length, i, term_length, _x);
                                        this.push_index(dupes, token, partial_score, id, _append);
                                    }
                                }
                                break;
                            }
                        // fallthrough to next case when term length < 3
                        case "bidirectional":
                        case "reverse":
                            // skip last round (this token exist already in "forward")
                            if (1 < term_length) {
                                for (let x = term_length - 1; 0 < x; x--) {
                                    token = term[this.rtl ? term_length - 1 - x : x] + token;
                                    const partial_score = this.score ? this.score(content, term, i, token, x) : get_score(resolution, word_length, i, term_length, x);
                                    this.push_index(dupes, token, partial_score, id, _append);
                                }
                                token = "";
                            }
                        // fallthrough to next case to apply forward also
                        case "forward":
                            if (1 < term_length) {
                                for (let x = 0; x < term_length; x++) {
                                    token += term[this.rtl ? term_length - 1 - x : x];
                                    this.push_index(dupes, token, score, id, _append);
                                }
                                break;
                            }
                        // fallthrough to next case when token has a length of 1
                        default:
                            // "strict":
                            this.push_index(dupes, term, score, id, _append);
                            // context is just supported by tokenizer "strict"
                            if (depth) {
                                if (1 < word_length && i < word_length - 1) {
                                    // check inner dupes to skip repeating words in the current context
                                    const dupes_inner = create_object(),
                                        resolution = this.resolution_ctx,
                                        keyword = term,
                                        // "i" counts iteration steps, so "word_length - i" terms
                                        // (keyword included) are left in reading direction, no
                                        // matter whether the content is walked ltr or rtl
                                        size = Math.min(depth + 1, word_length - i);
                                    dupes_inner[keyword] = 1;
                                    for (let x = 1; x < size; x++) {
                                        term = content[this.rtl ? word_length - 1 - i - x : i + x];
                                        if (term && !dupes_inner[term]) {
                                            dupes_inner[term] = 1;
                                            const context_score = this.score ? this.score(content, keyword, i, term, x - 1) : get_score(resolution + (word_length / 2 > resolution ? 0 : 1), word_length, i, size - 1, x - 1),
                                                // with bidirectional context the pair is stored in a
                                                // canonical order, so (a, b) and (b, a) share a slot
                                                swap = this.bidirectional && term > keyword;
                                            this.push_index(dupes_ctx, swap ? keyword : term, context_score, id, _append, swap ? term : keyword);
                                        }
                                    }
                                }
                            }
                    }
                }
            }
            // with fastupdate the register is filled by push_index instead
            this.fastupdate || this.reg.add(id);
        } else {
            content = "";
        }
    }
    if (this.db) {
        // when the term has no valid content (e.g. empty),
        // then it was not added to the ID registry for removal
        content || this.commit_task.push({ del: id });
        this.commit_auto && autoCommit(this);
    }
    return this;
};
/**
 * Stores `id` for one token, either in the term map (`this.map`) or, when a
 * `keyword` is passed, in the context map (`this.ctx`) below that keyword.
 *
 * `dupes` tracks what was already written during the current `add()` call:
 * for plain terms it maps term -> 1, for context entries it maps
 * term -> { keyword: 1 }. Entries which are already tracked are skipped.
 *
 * @private
 * @param dupes per-call duplicate tracker (gets mutated)
 * @param term token to index
 * @param score slot within the term's score array
 * @param id document ID
 * @param {boolean=} append when set, the ID is not pushed again if the slot
 *   already contains it
 * @param {string=} keyword context keyword; switches to the context map
 */
Index.prototype.push_index = function (dupes, term, score, id, append, keyword) {
    const seen = dupes[term];
    // already handled: a known term, or a known term/keyword pair
    if (seen && (!keyword || seen[keyword])) {
        return;
    }

    let store;
    if (keyword) {
        const seen_keywords = seen || (dupes[term] = create_object());
        seen_keywords[keyword] = 1;
        const ctx_key = this.compress ? default_compress(keyword) : keyword;
        const ctx_root = this.ctx;
        store = ctx_root.get(ctx_key);
        if (!store) {
            store = new Map();
            ctx_root.set(ctx_key, store);
        }
    } else {
        dupes[term] = 1;
        store = this.map;
    }

    const term_key = this.compress ? default_compress(term) : term;
    let slots = store.get(term_key);
    if (!slots) {
        slots = [];
        store.set(term_key, slots);
    }

    // the ID array will be upgraded dynamically
    let ids = slots[score] || (slots[score] = []);
    if (append && ids.includes(id)) {
        return;
    }

    // auto-upgrade to keystore array if max size exceeded
    if (2147483647 === ids.length) {
        const keystore = new KeystoreArray(ids);
        if (this.fastupdate) {
            // re-point every register reference from the old array to the keystore
            for (const refs of this.reg.values()) {
                if (refs.includes(ids)) {
                    refs[refs.indexOf(ids)] = keystore;
                }
            }
        }
        slots[score] = ids = keystore;
    }

    ids.push(id);

    // add a reference to the register for fast updates
    if (this.fastupdate) {
        const refs = this.reg.get(id);
        if (refs) {
            refs.push(ids);
        } else {
            this.reg.set(id, [ids]);
        }
    }
};
/**
 * Maps the position of a token to a score slot.
 *
 * @param {number} resolution number of available score slots
 * @param {number} length number of terms in the content
 * @param {number} i iteration index of the current term
 * @param {number=} term_length length of the current term (partial tokens)
 * @param {number=} x offset of the partial token within the term
 * @returns {number} the score slot, 0 being the best
 */
function get_score(resolution, length, i, term_length, x) {
    // the first resolution slot is reserved for the best match, when a query
    // matches the first word(s); a resolution of "1" has that slot only
    if (!i || !(1 < resolution)) {
        return 0;
    }

    const span = length + (term_length || 0);
    const position = i + (x || 0);

    // do not stretch the resolution more than the term length: this saves
    // memory and improves scoring between short and long documents
    if (span <= resolution) {
        return position;
    }

    // stretch the position over the whole range of resolution, shifted by
    // one (slot 0 is reserved) and with the floating point cut off
    return ((resolution - 1) / span * position + 1) | 0;
}