1
0
mirror of https://github.com/nextapps-de/flexsearch.git synced 2025-08-23 22:24:31 +02:00
Files
flexsearch/test/encoder.js
Thomas Wilkerling 15018e1b26 update github action
2025-05-23 19:38:26 +02:00

583 lines
18 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Expose the global object as "self" so browser-targeted dist bundles can load in Node.
global.self = global;
// The last CLI argument selects which dist build to test; when it is mocha's
// "--exit" flag (or absent) we fall back to testing the raw source bundle.
const env = process.argv[process.argv.length - 1] === "--exit" ? "" : process.argv[process.argv.length - 1];
import { expect } from "chai";
// Dynamically import either the selected dist build or the source bundle.
let FlexSearch = await import(env ? "../dist/" + env + ".js" : "../src/bundle.js");
// Unwrap module interop shapes: ESM default export, then a UMD-style namespace property.
if(FlexSearch.default) FlexSearch = FlexSearch.default;
if(FlexSearch.FlexSearch) FlexSearch = FlexSearch.FlexSearch;
const { Index, Document, Worker, Charset: _Charset, Encoder, Resolver } = FlexSearch;
// Build-variant flags derived from the selected dist target name.
const build_light = env && env.includes("light");
const build_compact = env && env.includes("compact");
const build_esm = !env || env.startsWith("module");
// Some builds do not export Charset; fall back to importing it from source.
const Charset = _Charset || (await import("../src/charset.js")).default;
// Verifies default normalization and the two ways of wiring a custom encoder.
describe("Encoder", () => {

    it("Should have been properly normalized", () => {
        const idx = new Index();
        // Normalization is enabled out of the box.
        expect(idx.encoder.normalize).to.equal(true);
        idx.add(1, "La Bamba");
        idx.add(2, "La Bohème");
        // Accent-insensitive query matches the normalized entry.
        expect(idx.search("la boheme")).to.eql([2]);
        // With suggestions enabled the weaker partial match is appended.
        expect(idx.search("la boheme", { suggest: true })).to.eql([2, 1]);
    });

    it("Should have been properly added a custom encoder", () => {
        const encode = str => str.toLowerCase().split(/\s+/);
        const idx = new Index({ encoder: encode });
        // A bare function passed as "encoder" becomes the encode routine.
        expect(idx.encoder.encode).to.eql(encode);
    });

    it("Should have been properly added a custom encode (alternative)", () => {
        const encode = str => str.toLowerCase().split(/\s+/);
        // The shorthand "encode" option is equivalent to { encoder: fn }.
        const idx = new Index({ encode });
        expect(idx.encoder.encode).to.eql(encode);
    });
});
// Exercises the bundled charset presets plus a fully custom encode function.
describe("Encoder: Charset", () => {

    it("Should have been encoded properly: Default", () => {
        const expected = ["bjorn", "philip", "mayer"];
        let idx = new Index({ encoder: Charset.Default });
        expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(expected);
        // Omitting the encoder option must fall back to the same preset.
        idx = new Index();
        expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(expected);
    });

    // The light build ships without these charset presets.
    if (!build_light) {

        it("Should have been encoded properly: Exact", () => {
            const idx = new Index({ encoder: Charset.Exact });
            // Exact keeps casing and diacritics untouched.
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                ["Björn", "Phillipp", "Mayer"]
            );
        });

        it("Should have been encoded properly: Normalize", () => {
            const idx = new Index({ encoder: Charset.Normalize });
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                idx.encoder.encode("bjorn/philip mayer")
            );
        });

        it("Should have been encoded properly: LatinBalance", () => {
            const idx = new Index({ encoder: Charset.LatinBalance });
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                idx.encoder.encode("bjorn philip mair")
            );
        });

        it("Should have been encoded properly: LatinAdvanced", () => {
            const idx = new Index({ encoder: Charset.LatinAdvanced });
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                idx.encoder.encode("bjoern filip mair")
            );
        });

        it("Should have been encoded properly: LatinExtra", () => {
            const idx = new Index({ encoder: Charset.LatinExtra });
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                idx.encoder.encode("bjorm filib mayr")
            );
        });

        it("Should have been encoded properly: LatinSoundex", () => {
            const idx = new Index({ encoder: Charset.LatinSoundex });
            expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql(
                idx.encoder.encode("bjoernsen philippo mayr")
            );
        });
    }

    it("Should have been encoded properly: Custom Encoder", () => {
        // A plain function is accepted in place of an Encoder instance and
        // its raw return value is passed through unchanged.
        const test_encoder = str => "-[" + str.toUpperCase() + "]-";
        const idx = new Index({ encoder: test_encoder });
        expect(idx.encoder.encode("Björn-Phillipp Mayer")).to.eql("-[BJÖRN-PHILLIPP MAYER]-");
    });
});
// Tokenization of CJK (and Hangul) input via the dedicated CJK charset preset.
describe("Encoder: CJK Charset", function(){
    it("Should have been tokenized properly", function(){
        // Use "new" consistently with the rest of the suite (the factory-call
        // form also works, but mixing the two styles is confusing).
        const index = new Index({ encoder: Charset.CJK });
        index.add(0, "서울시가 잠이 든 시간에 아무 말, 미뤄, 미뤄");
        expect(index.search("든")).to.include(0);
        expect(index.search("시간에")).to.include(0);
        index.add(1, "一个单词");
        expect(index.search("一个")).to.include(1);
        expect(index.search("单词")).to.include(1);
        // Character order inside the query does not prevent a match.
        expect(index.search("词单")).to.include(1);
    });
    // Renamed: this title previously duplicated the one above, which makes
    // reporter output and --grep test filtering ambiguous.
    it("Should have been tokenized properly (suggestion)", function(){
        const index = new Index({ encoder: Charset.CJK });
        index.add(1, "多大的;多少平");
        // A longer query still suggests the partially matching entry.
        const result = index.search('是多少平的', { suggest: true });
        expect(result).to.include(1);
    });
});
// Cyrillic input must tokenize with the default encoder.
describe("Encoder: Cyrillic Charset", function(){
    it("Should have been tokenized properly", function(){
        // Use "new" for consistency with the other suites in this file.
        const index = new Index({ tokenize: "forward" });
        index.add(0, "Фообар");
        expect(index.search("Фообар")).to.include(0);
        // Forward tokenization also matches the word prefix.
        expect(index.search("Фоо")).to.include(0);
    });
});
// Arabic (RTL script) input must tokenize with forward and reverse tokenizers.
describe("Encoder: Arabic Charset", function(){
    it("Should have been tokenized properly", function(){
        // Use "new" for consistency with the other suites in this file.
        let index = new Index({ tokenize: "forward" });
        index.add(0, "لكن لا بد أن أوضح لك أن كل");
        expect(index.search("بد أن")).to.include(0);
        // Forward tokenization matches a word prefix.
        expect(index.search("أو")).to.include(0);
        // Reverse tokenization additionally matches a word suffix.
        index = new Index({ tokenize: "reverse" });
        index.add(0, "لكن لا بد أن أوضح لك أن كل");
        expect(index.search("ضح")).to.include(0);
    });
});
// Greek input: polytonic/accented text must match its unaccented query form.
describe("Encoder: Greek Charset", function(){
    it("Should have been tokenized properly", function(){
        // Use "new" for consistency with the other suites in this file.
        const index = new Index({ tokenize: "forward" });
        index.add(0, "Μήγαρις ἔχω ἄλλο στὸ νοῦ μου πάρεξ ἐλευθερία καὶ γλώσσα");
        expect(index.search("Μηγαρις εχω αλλο στο νου μου παρε ελευθ και γλωσσα")).to.include(0);
    });
});
// Right-to-left scoring: with rtl enabled, relevance is anchored at the word end.
describe("Encoder: Right-to-Left", () => {

    it("Should have been scored properly", () => {
        // Forward tokenizer, reading from the right.
        let idx = new Index({
            tokenize: "forward",
            rtl: true
        });
        idx.add(0, "54321 4 3 2 0");
        idx.add(1, "0 2 3 4 54321");
        idx.add(2, "0 2 3 4 12345");
        expect(idx.search("5")).to.eql([2]);
        expect(idx.search("1")).to.eql([1, 0]);
        // Reverse tokenizer matches partials from both ends.
        idx = new Index({
            tokenize: "reverse",
            rtl: true
        });
        idx.add(0, "54321 4 3 2 1 0");
        idx.add(1, "0 1 2 3 4 54321");
        idx.add(2, "0 1 2 3 4 12345");
        expect(idx.search("5")).to.eql([2, 1, 0]);
    });
});
// Stopword filtering through the Encoder: static word lists, incremental
// addFilter(), assign() extension, predicate functions, the finalize hook,
// and the minlength option.
describe("Filter", function(){
    it("Should have been filtered properly", function(){
        // Filter supplied as a plain word list at construction time.
        let encoder = new Encoder({
            filter: ["in", "the"]
        });
        let index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.add(0, "Today in the morning.");
        // Filtered words are ignored in queries but do not break matching.
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        // A query made up solely of filtered words yields nothing.
        expect(index.search("in the")).to.have.length(0);
        // Same encoder instance reused with contextual indexing enabled.
        index = new Index({
            tokenize: "strict",
            encoder: encoder,
            context: true
        });
        index.add(0, "Today in the morning.");
        expect(index.search("today morning")).to.include(0);
        // Filters can also be registered one at a time after construction.
        encoder = new Encoder();
        encoder.addFilter("in");
        index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.encoder.addFilter("the");
        index.add(0, "Today in the morning.");
        expect(index.search("in the")).to.have.length(0);
        // extend
        // assign() merges additional filter terms into the existing encoder.
        index.encoder.assign({
            filter: ["morning"]
        });
        index.add(0, "Today in the morning.");
        expect(index.search("in the")).to.have.length(0);
        expect(index.search("morning")).to.have.length(0);
        expect(index.search("Today")).to.eql([0]);
    });
    it("Should have been filtered properly (custom function)", function(){
        // A predicate returning false drops the token (here: length <= 3).
        const encoder = new Encoder({
            filter: function(word){
                return word.length > 3;
            }
        });
        const index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.add(0, "Today in the morning.");
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        expect(index.search("in the")).to.have.length(0);
        // Replacing the predicate via assign() takes effect on re-add.
        encoder.assign({
            filter: function(word){
                return word.length > 3 &&
                word !== "today";
            }
        });
        index.add(0, "Today in the morning.");
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        expect(index.search("in the")).to.have.length(0);
        // "today" is now filtered out when queried on its own.
        expect(index.search("today")).to.have.length(0);
    });
    it("Should have been filtered properly (finalize)", function(){
        // finalize post-processes the whole encoded token array.
        const encoder = new Encoder({
            finalize: function(word){
                return word.filter(t => t.length > 3);
            }
        });
        const index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.add(0, "Today in the morning.");
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        expect(index.search("in the")).to.have.length(0);
        // extend
        encoder.assign({
            finalize: function(word){
                return word.filter(t => t.length > 5);
            }
        });
        // The stricter finalize now applies to queries against the existing index.
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        expect(index.search("in the")).to.have.length(0);
        expect(index.search("today")).to.have.length(0);
    });
    it("Should have been filtered properly (minlength)", function(){
        // minlength drops every token shorter than the given length.
        const encoder = new Encoder({
            minlength: 4
        });
        const index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.add(0, "Today in the morning.");
        expect(index.search("today in the morning.")).to.include(0);
        expect(index.search("today morning")).to.include(0);
        expect(index.search("in the")).to.have.length(0);
    });
});
// Suffix stemming via a Map of suffix -> replacement pairs.
// NOTE(review): ~65 lines of commented-out legacy tests (old object-literal
// stemmer / FlexSearch.registerLanguage API) were removed here; they referenced
// an API surface not used anywhere else in this suite. Recover them from VCS
// history if that API is ever reinstated.
describe("Stemmer", function(){
    it("Should have been stemmed properly", function(){
        const encoder = new Encoder({
            stemmer: new Map([
                ["ization", "ize"],
                ["tional", "tion"]
            ])
        });
        const index = new Index({
            tokenize: "strict",
            encoder: encoder
        });
        index.add(0, "Just a multinational colonization.");
        expect(index.search("Just a multinational colonization.")).to.include(0);
        expect(index.search("multinational colonization")).to.include(0);
        // Queries are stemmed with the same rules, so stems match directly.
        expect(index.search("multination colonize")).to.include(0);
        // extend: assign() merges additional stemmer rules into the encoder.
        encoder.assign({
            stemmer: new Map([
                ["licate", "e"]
            ])
        });
        index.add(0, "Just a duplicate multinational colonization.");
        expect(index.search("Just a multinational colonization.")).to.include(0);
        expect(index.search("multinational colonization")).to.include(0);
        expect(index.search("multination colonize")).to.include(0);
        // "duplicate" stems to "dupe" via the newly added rule.
        expect(index.search("dupe")).to.include(0);
    });
});
// Mapper: a single-character substitution table applied during encoding.
describe("Mapper", () => {

    it("Should have been applied custom Mapper properly", () => {
        const idx = new Index({
            tokenize: "forward",
            encoder: new Encoder({
                numeric: false,
                dedupe: false,
                mapper: new Map([
                    ["1", "a"],
                    ["2", "b"],
                    ["3", "c"],
                    ["4", "d"],
                    ["5", "d"],
                    ["6", "d"],
                    ["7", "e"],
                    ["8", "f"]
                ])
            })
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("abcd")).to.eql([0]);
        // "4", "5" and "6" all collapse onto "d" (dedupe is off, so all kept).
        expect(idx.encoder.encode("12345678")).to.eql(["abcdddef"]);
        // extend: assign() overrides the listed mappings, keeping the others.
        idx.encoder.assign({
            mapper: new Map([
                ["1", "x"],
                ["2", "y"],
                ["3", "z"],
                ["7", "x"],
                ["8", "y"]
            ])
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("xyzd")).to.eql([0]);
        expect(idx.encoder.encode("12345678")).to.eql(["xyzdddxy"]);
    });
});
// Matcher: multi-character sequences are substituted during encoding.
describe("Matcher", () => {

    it("Should have been applied custom Matcher properly", () => {
        const idx = new Index({
            tokenize: "forward",
            encoder: new Encoder({
                numeric: false,
                dedupe: false,
                matcher: new Map([
                    ["1", "a"],
                    ["2", "b"],
                    ["3", "c"],
                    ["456", "d"],
                    ["7", "e"],
                    ["8", "f"]
                ])
            })
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("abcd")).to.eql([0]);
        // The three-character run "456" collapses into a single "d".
        expect(idx.encoder.encode("12345678")).to.eql(["abcdef"]);
        // extend: assign() replaces the listed entries, keeping the others.
        idx.encoder.assign({
            matcher: new Map([
                ["1", "x"],
                ["456", "ddd"],
                ["8", "y"]
            ])
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("xbcd")).to.eql([0]);
        expect(idx.encoder.encode("12345678")).to.eql(["xbcdddey"]);
    });
});
// Replacer: a flat [pattern, replacement, ...] list; patterns may be
// plain strings or regular expressions.
describe("Replacer", () => {

    it("Should have been applied custom Replacer properly", () => {
        const idx = new Index({
            tokenize: "forward",
            encoder: new Encoder({
                numeric: false,
                dedupe: false,
                replacer: [
                    "1", "a",
                    "2", "b",
                    "3", "c",
                    /[456]/g, "d",
                    "7", "e",
                    "8", "f"
                ]
            })
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("abcd")).to.eql([0]);
        // The regex maps each of 4/5/6 onto its own "d".
        expect(idx.encoder.encode("12345678")).to.eql(["abcdddef"]);
        // extend: assign() adds replacements applied after the originals,
        // turning a/b/c/e/f back into digits while the regex rule remains.
        idx.encoder.assign({
            replacer: [
                "a", "1",
                "b", "2",
                "c", "3",
                "e", "7",
                "f", "8"
            ]
        });
        idx.add(0, "12345678");
        expect(idx.search("12345678")).to.eql([0]);
        expect(idx.search("123d")).to.eql([0]);
        expect(idx.search("abcd")).to.eql([0]);
        expect(idx.encoder.encode("12345678")).to.eql(["123ddd78"]);
    });
});