1
0
mirror of https://github.com/nextapps-de/flexsearch.git synced 2025-08-23 22:24:31 +02:00
Files
flexsearch/test/encoder.js
Thomas Wilkerling 15018e1b26 update github action
2025-05-23 19:38:26 +02:00

583 lines
18 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Expose the global object as "self" so browser-targeted dist bundles can load in Node.
global.self = global;
// The last CLI argument selects which dist build to test; when it is mocha's
// "--exit" flag (or absent) we fall back to testing the raw source bundle.
const env = process.argv[process.argv.length - 1] === "--exit" ? "" : process.argv[process.argv.length - 1];
import { expect } from "chai";
// Dynamically import either the selected dist build or the source bundle.
let FlexSearch = await import(env ? "../dist/" + env + ".js" : "../src/bundle.js");
// Unwrap module interop shapes: ESM default export, then a UMD-style namespace property.
if(FlexSearch.default) FlexSearch = FlexSearch.default;
if(FlexSearch.FlexSearch) FlexSearch = FlexSearch.FlexSearch;
const { Index, Document, Worker, Charset: _Charset, Encoder, Resolver } = FlexSearch;
// Build-variant flags derived from the selected dist target name.
const build_light = env && env.includes("light");
const build_compact = env && env.includes("compact");
const build_esm = !env || env.startsWith("module");
// Some builds do not export Charset; fall back to importing it from source.
const Charset = _Charset || (await import("../src/charset.js")).default;
// Verifies default normalization and the two ways of wiring a custom encoder.
describe("Encoder", () => {

    it("Should have been properly normalized", () => {
        const idx = new Index();
        // Normalization is enabled out of the box.
        expect(idx.encoder.normalize).to.equal(true);
        idx.add(1, "La Bamba");
        idx.add(2, "La Bohème");
        // Accent-insensitive query matches the normalized entry.
        expect(idx.search("la boheme")).to.eql([2]);
        // With suggestions enabled the weaker partial match is appended.
        expect(idx.search("la boheme", { suggest: true })).to.eql([2, 1]);
    });

    it("Should have been properly added a custom encoder", () => {
        const encode = str => str.toLowerCase().split(/\s+/);
        const idx = new Index({ encoder: encode });
        // A bare function passed as "encoder" becomes the encode routine.
        expect(idx.encoder.encode).to.eql(encode);
    });

    it("Should have been properly added a custom encode (alternative)", () => {
        const encode = str => str.toLowerCase().split(/\s+/);
        // The shorthand "encode" option is equivalent to { encoder: fn }.
        const idx = new Index({ encode });
        expect(idx.encoder.encode).to.eql(encode);
    });
});
// Exercises the bundled charset presets plus a fully custom encode function.
describe("Encoder: Charset", () => {

    it("Should have been encoded properly: Default", () => {
        const expected = ["bjorn", "philip", "mayer"];
        let idx = new Index({ encoder: Charset.Default });
        expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(expected);
        // Omitting the encoder option must fall back to the same preset.
        idx = new Index();
        expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(expected);
    });

    // The light build ships without these charset presets.
    if (!build_light) {

        it("Should have been encoded properly: Exact", () => {
            const idx = new Index({ encoder: Charset.Exact });
            // Exact keeps casing and diacritics untouched.
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                ["Björn", "Phillipp", "Mayer"]
            );
        });

        it("Should have been encoded properly: Normalize", () => {
            const idx = new Index({ encoder: Charset.Normalize });
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                idx.encoder.encode("bjorn/philip mayer")
            );
        });

        it("Should have been encoded properly: LatinBalance", () => {
            const idx = new Index({ encoder: Charset.LatinBalance });
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                idx.encoder.encode("bjorn philip mair")
            );
        });

        it("Should have been encoded properly: LatinAdvanced", () => {
            const idx = new Index({ encoder: Charset.LatinAdvanced });
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                idx.encoder.encode("bjoern filip mair")
            );
        });

        it("Should have been encoded properly: LatinExtra", () => {
            const idx = new Index({ encoder: Charset.LatinExtra });
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                idx.encoder.encode("bjorm filib mayr")
            );
        });

        it("Should have been encoded properly: LatinSoundex", () => {
            const idx = new Index({ encoder: Charset.LatinSoundex });
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                idx.encoder.encode("bjoernsen philippo mayr")
            );
        });
    }

    it("Should have been encoded properly: Custom Encoder", () => {
        // A plain function is accepted in place of an Encoder instance and
        // its raw return value is passed through unchanged.
        const test_encoder = str => "-[" + str.toUpperCase() + "]-";
        const idx = new Index({ encoder: test_encoder });
        expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql("-[BJÖRN-PHILLIPP MAYER]-");
    });
});
// Tokenization of CJK (and Hangul) input via the dedicated CJK charset preset.
describe("Encoder: CJK Charset", function(){
    it("Should have been tokenized properly", function(){
        // Use "new" consistently with the rest of the suite (the factory-call
        // form also works, but mixing the two styles is confusing).
        const index = new Index({ encoder: Charset.CJK });
        index.add(0, "서울시가 잠이 든 시간에 아무 말, 미뤄, 미뤄");
        expect(index.search("든")).to.include(0);
        expect(index.search("시간에")).to.include(0);
        index.add(1, "一个单词");
        expect(index.search("一个")).to.include(1);
        expect(index.search("单词")).to.include(1);
        // Character order inside the query does not prevent a match.
        expect(index.search("词单")).to.include(1);
    });
    // Renamed: this title previously duplicated the one above, which makes
    // reporter output and --grep test filtering ambiguous.
    it("Should have been tokenized properly (suggestion)", function(){
        const index = new Index({ encoder: Charset.CJK });
        index.add(1, "多大的;多少平");
        // A longer query still suggests the partially matching entry.
        const result = index.search('是多少平的', { suggest: true });
        expect(result).to.include(1);
    });
});
// Cyrillic input must tokenize with the default encoder.
describe("Encoder: Cyrillic Charset", function(){
    it("Should have been tokenized properly", function(){
        // Use "new" for consistency with the other suites in this file.
        const index = new Index({ tokenize: "forward" });
        index.add(0, "Фообар");
        expect(index.search("Фообар")).to.include(0);
        // Forward tokenization also matches the word prefix.
        expect(index.search("Фоо")).to.include(0);
    });
});
// Arabic (RTL script) input must tokenize with forward and reverse tokenizers.
describe("Encoder: Arabic Charset", function(){
    it("Should have been tokenized properly", function(){
        // Use "new" for consistency with the other suites in this file.
        let index = new Index({ tokenize: "forward" });
        index.add(0, "لكن لا بد أن أوضح لك أن كل");
        expect(index.search("بد أن")).to.include(0);
        // Forward tokenization matches a word prefix.
        expect(index.search("أو")).to.include(0);
        // Reverse tokenization additionally matches a word suffix.
        index = new Index({ tokenize: "reverse" });
        index.add(0, "لكن لا بد أن أوضح لك أن كل");
        expect(index.search("ضح")).to.include(0);
    });
});
// Greek input: polytonic/accented text must match its unaccented query form.
describe("Encoder: Greek Charset", function(){
    it("Should have been tokenized properly", function(){
        // Use "new" for consistency with the other suites in this file.
        const index = new Index({ tokenize: "forward" });
        index.add(0, "Μήγαρις ἔχω ἄλλο στὸ νοῦ μου πάρεξ ἐλευθερία καὶ γλώσσα");
        expect(index.search("Μηγαρις εχω αλλο στο νου μου παρε ελευθ και γλωσσα")).to.include(0);
    });
});
// Right-to-left scoring: with rtl enabled, relevance is anchored at the word end.
describe("Encoder: Right-to-Left", () => {

    it("Should have been scored properly", () => {
        // Forward tokenizer, reading from the right.
        let idx = new Index({
            tokenize: "forward",
            rtl: true
        });
        idx.add(0, "54321 4 3 2 0");
        idx.add(1, "0 2 3 4 54321");
        idx.add(2, "0 2 3 4 12345");
        expect(idx.search("5")).to.eql([2]);
        expect(idx.search("1")).to.eql([1, 0]);
        // Reverse tokenizer matches partials from both ends.
        idx = new Index({
            tokenize: "reverse",
            rtl: true
        });
        idx.add(0, "54321 4 3 2 1 0");
        idx.add(1, "0 1 2 3 4 54321");
        idx.add(2, "0 1 2 3 4 12345");
        expect(idx.search("5")).to.eql([2, 1, 0]);
    });
});
// Stopword filtering through the Encoder: static word lists, incremental
// addFilter(), assign() extension, predicate functions, the finalize hook,
// and the minlength option.
describe("Filter", function(){
    it("Should have been filtered properly", function(){
        // Filter supplied as a plain word list at construction time.
        let encoder = new Encoder({
            filter: ["in", "the"]
        });
        let index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.add(0, "Today in the morning.");
        // Filtered words are ignored in queries but do not break matching.
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        // A query made up solely of filtered words yields nothing.
        expect(index.search("in the")).to.have.length(0);
        // Same encoder instance reused with contextual indexing enabled.
        index = new Index({
            tokenize: "strict",
            encoder: encoder,
            context: true
        });
        index.add(0, "Today in the morning.");
        expect(index.search("today morning")).to.include(0);
        // Filters can also be registered one at a time after construction.
        encoder = new Encoder();
        encoder.addFilter("in");
        index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.encoder.addFilter("the");
        index.add(0, "Today in the morning.");
        expect(index.search("in the")).to.have.length(0);
        // extend
        // assign() merges additional filter terms into the existing encoder.
        index.encoder.assign({
            filter: ["morning"]
        });
        index.add(0, "Today in the morning.");
        expect(index.search("in the")).to.have.length(0);
        expect(index.search("morning")).to.have.length(0);
        expect(index.search("Today")).to.eql([0]);
    });
    it("Should have been filtered properly (custom function)", function(){
        // A predicate returning false drops the token (here: length <= 3).
        const encoder = new Encoder({
            filter: function(word){
                return word.length > 3;
            }
        });
        const index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.add(0, "Today in the morning.");
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        expect(index.search("in the")).to.have.length(0);
        // Replacing the predicate via assign() takes effect on re-add.
        encoder.assign({
            filter: function(word){
                return word.length > 3 &&
                word !== "today";
            }
        });
        index.add(0, "Today in the morning.");
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        expect(index.search("in the")).to.have.length(0);
        // "today" is now filtered out when queried on its own.
        expect(index.search("today")).to.have.length(0);
    });
    it("Should have been filtered properly (finalize)", function(){
        // finalize post-processes the whole encoded token array.
        const encoder = new Encoder({
            finalize: function(word){
                return word.filter(t => t.length > 3);
            }
        });
        const index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.add(0, "Today in the morning.");
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        expect(index.search("in the")).to.have.length(0);
        // extend
        encoder.assign({
            finalize: function(word){
                return word.filter(t => t.length > 5);
            }
        });
        // The stricter finalize now applies to queries against the existing index.
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        expect(index.search("in the")).to.have.length(0);
        expect(index.search("today")).to.have.length(0);
    });
    it("Should have been filtered properly (minlength)", function(){
        // minlength drops every token shorter than the given length.
        const encoder = new Encoder({
            minlength: 4
        });
        const index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.add(0, "Today in the morning.");
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        expect(index.search("in the")).to.have.length(0);
    });
});
// Suffix stemming via a Map of suffix -> replacement pairs.
// NOTE(review): ~65 lines of commented-out legacy tests (old object-literal
// stemmer / FlexSearch.registerLanguage API) were removed here; they referenced
// an API surface not used anywhere else in this suite. Recover them from VCS
// history if that API is ever reinstated.
describe("Stemmer", function(){
    it("Should have been stemmed properly", function(){
        const encoder = new Encoder({
            stemmer: new Map([
                ["ization", "ize"],
                ["tional", "tion"]
            ])
        });
        const index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.add(0, "Just a multinational colonization.");
        expect(index.search("Just a multinational colonization.")).to.include(0);
        expect(index.search("multinational colonization")).to.include(0);
        // Queries are stemmed with the same rules, so stems match directly.
        expect(index.search("multination colonize")).to.include(0);
        // extend: assign() merges additional stemmer rules into the encoder.
        encoder.assign({
            stemmer: new Map([
                ["licate", "e"]
            ])
        });
        index.add(0, "Just a duplicate multinational colonization.");
        expect(index.search("Just a multinational colonization.")).to.include(0);
        expect(index.search("multinational colonization")).to.include(0);
        expect(index.search("multination colonize")).to.include(0);
        // "duplicate" stems to "dupe" via the newly added rule.
        expect(index.search("dupe")).to.include(0);
    });
});
// Mapper: a single-character substitution table applied during encoding.
describe("Mapper", () => {

    it("Should have been applied custom Mapper properly", () => {
        const idx = new Index({
            tokenize: "forward",
            encoder: new Encoder({
                numeric: false,
                dedupe: false,
                mapper: new Map([
                    ["1", "a"],
                    ["2", "b"],
                    ["3", "c"],
                    ["4", "d"],
                    ["5", "d"],
                    ["6", "d"],
                    ["7", "e"],
                    ["8", "f"]
                ])
            })
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("abcd")).to.eql([0]);
        // "4", "5" and "6" all collapse onto "d" (dedupe is off, so all kept).
        expect(idx.encoder.encode("12345678")).to.eql(["abcdddef"]);
        // extend: assign() overrides the listed mappings, keeping the others.
        idx.encoder.assign({
            mapper: new Map([
                ["1", "x"],
                ["2", "y"],
                ["3", "z"],
                ["7", "x"],
                ["8", "y"]
            ])
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("xyzd")).to.eql([0]);
        expect(idx.encoder.encode("12345678")).to.eql(["xyzdddxy"]);
    });
});
// Matcher: multi-character sequences are substituted during encoding.
describe("Matcher", () => {

    it("Should have been applied custom Matcher properly", () => {
        const idx = new Index({
            tokenize: "forward",
            encoder: new Encoder({
                numeric: false,
                dedupe: false,
                matcher: new Map([
                    ["1", "a"],
                    ["2", "b"],
                    ["3", "c"],
                    ["456", "d"],
                    ["7", "e"],
                    ["8", "f"]
                ])
            })
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("abcd")).to.eql([0]);
        // The three-character run "456" collapses into a single "d".
        expect(idx.encoder.encode("12345678")).to.eql(["abcdef"]);
        // extend: assign() replaces the listed entries, keeping the others.
        idx.encoder.assign({
            matcher: new Map([
                ["1", "x"],
                ["456", "ddd"],
                ["8", "y"]
            ])
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("xbcd")).to.eql([0]);
        expect(idx.encoder.encode("12345678")).to.eql(["xbcdddey"]);
    });
});
// Replacer: a flat [pattern, replacement, ...] list; patterns may be
// plain strings or regular expressions.
describe("Replacer", () => {

    it("Should have been applied custom Replacer properly", () => {
        const idx = new Index({
            tokenize: "forward",
            encoder: new Encoder({
                numeric: false,
                dedupe: false,
                replacer: [
                    "1", "a",
                    "2", "b",
                    "3", "c",
                    /[456]/g, "d",
                    "7", "e",
                    "8", "f"
                ]
            })
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("abcd")).to.eql([0]);
        // The regex maps each of 4/5/6 onto its own "d".
        expect(idx.encoder.encode("12345678")).to.eql(["abcdddef"]);
        // extend: assign() adds replacements applied after the originals,
        // turning a/b/c/e/f back into digits while the regex rule remains.
        idx.encoder.assign({
            replacer: [
                "a", "1",
                "b", "2",
                "c", "3",
                "e", "7",
                "f", "8"
            ]
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("123d")).to.eql([0]);
        expect(idx.search("abcd")).to.eql([0]);
        expect(idx.encoder.encode("12345678")).to.eql(["123ddd78"]);
    });
});