// Mirror of https://github.com/nextapps-de/flexsearch.git
// Synced 2025-09-25 21:08:59 +02:00
// 249 lines, 9.1 KiB, JavaScript
|
|
import { create_object } from "../common.js";
|
|
import Index, { autoCommit } from "../index.js";
|
|
import default_compress from "../compress.js";
|
|
import { KeystoreArray } from "../keystore.js";
|
|
|
|
// TODO: define how non-string content is handled:
//   - string + number: treat as text
//   - boolean, null, undefined: still undecided (?)
|
|
|
|
|
|
/**
 * Adds a document to the index.
 *
 * The content is passed through `this.encoder.encode()` and every resulting
 * term is written to the index via `push_index()`, according to
 * `this.tokenize`:
 *  - "full": every substring of the term (terms shorter than 3 characters
 *    fall through to the "reverse" handling)
 *  - "bidirectional" / "reverse": tokens built by prepending one character at
 *    a time, followed by the "forward" tokens
 *  - "forward": tokens built by appending one character at a time (a term of
 *    length 1 falls through to the default handling)
 *  - default ("strict"): the whole term, plus context pairs when `this.depth`
 *    is set
 *
 * When the ID is already registered and neither `_append` nor `_skip_update`
 * is set, the call is delegated to `this.update(id, content)`.
 *
 * @param {!number|string} id
 * @param {!string} content
 * @param {boolean=} _append When set, the update redirect is skipped and the
 *   flag is forwarded to `push_index()`.
 * @param {boolean=} _skip_update When set, the "already registered" check is skipped.
 * @returns This index, or whatever `this.update(id, content)` returns when the
 *   call was redirected.
 */

Index.prototype.add = function (id, content, _append, _skip_update) {

    // 0 is a valid ID, every other falsy ID is rejected
    if (content && (id || 0 === id)) {

        // todo check skip_update

        if (!_skip_update && !_append) {
            if (this.reg.has(id)) {
                return this.update(id, content);
            }
        }

        const depth = this.depth;
        // do not force a string as input
        // https://github.com/nextapps-de/flexsearch/issues/432
        content = this.encoder.encode(content, !depth);
        const word_length = content.length;

        if (word_length) {

            // check context dupes to skip all contextual redundancy along a document

            const dupes_ctx = create_object(),
                  dupes = create_object(),
                  resolution = this.resolution;

            for (let i = 0; i < word_length; i++) {

                // with rtl the terms are consumed from the end of the list
                let term = content[this.rtl ? word_length - 1 - i : i],
                    term_length = term.length;

                // todo check context search
                // this check also wasn't applied on search, so it's useless here
                // skip dupes will break the context chain
                if (term_length && (depth || !dupes[term])) {

                    // a custom score function takes precedence over the built-in scoring
                    let score = this.score ? this.score(content, term, i, null, 0) : get_score(resolution, word_length, i),
                        token = "";

                    switch (this.tokenize) {

                        case "full":
                            if (2 < term_length) {
                                for (let x = 0, _x; x < term_length; x++) {
                                    for (let y = term_length; y > x; y--) {
                                        token = term.substring(x, y);
                                        _x = this.rtl ? term_length - 1 - x : x;
                                        const partial_score = this.score ? this.score(content, term, i, token, _x) : get_score(resolution, word_length, i, term_length, _x);
                                        this.push_index(dupes, token, partial_score, id, _append);
                                    }
                                }
                                break;
                            }
                        // fallthrough to next case when term length < 3
                        case "bidirectional":
                        case "reverse":
                            // skip last round (this token exist already in "forward")
                            if (1 < term_length) {
                                for (let x = term_length - 1; 0 < x; x--) {
                                    token = term[this.rtl ? term_length - 1 - x : x] + token;
                                    const partial_score = this.score ? this.score(content, term, i, token, x) : get_score(resolution, word_length, i, term_length, x);
                                    this.push_index(dupes, token, partial_score, id, _append);
                                }
                                // reset so that "forward" starts from an empty token
                                token = "";
                            }

                        // fallthrough to next case to apply forward also
                        case "forward":
                            if (1 < term_length) {
                                for (let x = 0; x < term_length; x++) {
                                    token += term[this.rtl ? term_length - 1 - x : x];
                                    this.push_index(dupes, token, score, id, _append);
                                }
                                break;
                            }

                        // fallthrough to next case when token has a length of 1
                        default:
                            // "strict":
                            this.push_index(dupes, term, score, id, _append);
                            // context is just supported by tokenizer "strict"
                            if (depth) {

                                if (1 < word_length && i < word_length - 1) {

                                    // check inner dupes to skip repeating words in the current context
                                    // NOTE(review): with rtl the bound is `i + 1` while the terms still
                                    // ahead number `word_length - i`; out-of-range lookups yield
                                    // undefined and are skipped by the truthiness check below.
                                    // Confirm that this bound is intended.
                                    const dupes_inner = create_object(),
                                          resolution = this.resolution_ctx,
                                          keyword = term,
                                          size = Math.min(depth + 1, this.rtl ? i + 1 : word_length - i);

                                    dupes_inner[keyword] = 1;

                                    for (let x = 1; x < size; x++) {

                                        term = content[this.rtl ? word_length - 1 - i - x : i + x];

                                        if (term && !dupes_inner[term]) {

                                            dupes_inner[term] = 1;
                                            const context_score = this.score ? this.score(content, keyword, i, term, x - 1) : get_score(resolution + (word_length / 2 > resolution ? 0 : 1), word_length, i, size - 1, x - 1),
                                                  // in bidirectional mode the pair is stored in a fixed
                                                  // (string comparison) order regardless of occurrence
                                                  swap = this.bidirectional && term > keyword;

                                            this.push_index(dupes_ctx, swap ? keyword : term, context_score, id, _append, swap ? term : keyword);
                                        }
                                    }
                                }
                            }
                    }
                }
            }

            // with fastupdate the registry entries are maintained by push_index()
            this.fastupdate || this.reg.add(id);
        } else {
            // the encoder produced no terms: treat the content as empty
            content = "";
        }
    }

    if (this.db) {
        // when the term has no valid content (e.g. empty),
        // then it was not added to the ID registry for removal
        content || this.commit_task.push({ del: id });
        this.commit_auto && autoCommit(this);
    }

    return this;
};
|
|
|
|
/**
 * Writes a single term (or a term/keyword context pair) for one document ID
 * into the index.
 *
 * Without `keyword` the entry goes to `this.map`, otherwise to the nested map
 * stored in `this.ctx` under `keyword`. Each term entry is an array indexed by
 * score, and every score slot holds the list of IDs.
 *
 * @private
 * @param dupes Registry of entries already written during the current
 *   `add()` call: `dupes[term]` for plain terms, `dupes[term][keyword]` for
 *   context pairs. Entries found here are skipped.
 * @param term The term (or partial token) to store.
 * @param score The score slot the ID is stored in.
 * @param id The document ID.
 * @param {boolean=} append When set, the ID is only pushed when the slot does
 *   not contain it yet.
 * @param {string=} keyword The context keyword; switches the target to `this.ctx`.
 */

Index.prototype.push_index = function (dupes, term, score, id, append, keyword) {

    let arr = keyword ? this.ctx : this.map,
        tmp;

    // proceed only when this term (or this term/keyword pair) was not written yet
    if (!dupes[term] || keyword && !(tmp = dupes[term])[keyword]) {

        if (keyword) {

            // mark the pair as seen, creating the per-term registry on demand
            dupes = tmp || (dupes[term] = create_object());
            dupes[keyword] = 1;

            if (this.compress) {
                keyword = default_compress(keyword);
            }

            // descend into (or create) the map belonging to this keyword
            tmp = arr.get(keyword);
            tmp ? arr = tmp : arr.set(keyword, arr = new Map());
        } else {

            dupes[term] = 1;
        }

        if (this.compress) {
            term = default_compress(term);
        }

        // after this step both `arr` and `tmp` refer to the per-term score array
        tmp = arr.get(term);
        tmp ? arr = tmp : arr.set(term, arr = tmp = []);
        // the ID array will be upgraded dynamically
        arr = arr[score] || (arr[score] = []);

        if (!append || !arr.includes(id)) {

            // auto-upgrade to keystore array if max size exceeded
            if (2147483647 === arr.length /*|| !(arr instanceof KeystoreArray)*/) {
                const keystore = new KeystoreArray(arr);
                if (this.fastupdate) {
                    // swap every registry reference to the old array for the new keystore
                    for (let value of this.reg.values()) {
                        if (value.includes(arr)) {
                            value[value.indexOf(arr)] = keystore;
                        }
                    }
                }
                tmp[score] = arr = keystore;
            }

            arr.push(id);

            // add a reference to the register for fast updates
            if (this.fastupdate) {
                const tmp = this.reg.get(id);
                tmp ? tmp.push(arr) : this.reg.set(id, [arr]);
            }
        }
    }
};
|
|
|
|
/**
 * Maps the position of a term (and optionally the position of a partial token
 * inside that term) to a score slot. Slot 0 is the best score.
 *
 * @param {number} resolution The number of available score slots.
 * @param {number} length The number of terms in the document.
 * @param {number} i The position of the term within the document.
 * @param {number=} term_length The length of the term (partial tokens only).
 * @param {number=} x The offset of the partial token within the term.
 * @returns {number} 0 when `i` is 0 (or otherwise falsy) or `resolution` is
 *   not greater than 1; `i + x` when `length + term_length` fits into
 *   `resolution`; otherwise the position scaled to the resolution, shifted by
 *   one and truncated to an integer.
 */

function get_score(resolution, length, i, term_length, x) {

    // the first resolution slot is reserved for the best match,
    // when a query matches the first word(s).
    // the resolution "1" needs to be handled additionally.
    if (!i || !(1 < resolution)) {
        return 0;
    }

    const total_length = length + (term_length || 0);
    const position = i + (x || 0);

    // do not stretch the resolution more than the term length will
    // improve performance and memory, also it improves scoring in
    // most cases between a short document and a long document
    if (total_length <= resolution) {
        return position;
    }

    // also to stretch score to the whole range of resolution, the
    // calculation is shift by one and cut the floating point.
    return 0 | (resolution - 1) / total_length * position + 1;
}