import { parse_simple } from "../common.js";
import Index from "../index.js";
import { EnrichedDocumentSearchResults, EnrichedSearchResults, HighlightOptions } from "../type.js";
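// Usage sketch (assumption, not taken from this file): this helper is called
// internally when a search runs with the "highlight" option enabled, roughly:
//
//   index.search(query, { enrich: true, highlight: "<b>$1</b>" });
//   // or with an options object:
//   index.search(query, { enrich: true, highlight: { template: "<b>$1</b>", boundary: 150 } });
//
// The option shape is inferred from the config reads below and may differ across versions.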
/**
* @param {string} query
* @param {EnrichedDocumentSearchResults|EnrichedSearchResults} result
* @param {Map<string, Index>} index
* @param {string} pluck
* @param {HighlightOptions|string} config
* @return {EnrichedDocumentSearchResults|EnrichedSearchResults}
*/
export function highlight_fields(query, result, index, pluck, config) {
// The biggest issue is dealing with custom encoders; for this reason
// a combined regular expression can't be applied as a template
let template, markup_open, markup_close;
if ("string" == typeof config) {
template = config;
config = "";
} else {
template = config.template;
}
if (!template) {
throw new Error('No template pattern was specified by the search option "highlight"');
}
markup_open = template.indexOf("$1");
if (-1 === markup_open) {
throw new Error('Invalid highlight template. The replacement pattern "$1" was not found in template: ' + template);
}
markup_close = template.substring(markup_open + 2);
markup_open = template.substring(0, markup_open);
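// e.g. the template "<b>$1</b>" yields markup_open = "<b>" and markup_close = "</b>"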
let boundary = config && config.boundary,
clip = !config || !1 !== config.clip,
merge = config && config.merge && markup_close && markup_open && new RegExp(markup_close + " " + markup_open, "g"),
ellipsis = config && config.ellipsis,
ellipsis_markup_length = 0,
ellipsis_markup;
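// resolve the ellipsis option: a plain string, false (disabled), or an object
// { template, pattern } whose pattern gets wrapped into the template's "$1"
// placeholder; defaults to "..."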
if ("object" == typeof ellipsis) {
ellipsis_markup = ellipsis.template;
ellipsis_markup_length = ellipsis_markup.length - 2;
ellipsis = ellipsis.pattern;
}
if ("string" != typeof ellipsis) {
ellipsis = !1 === ellipsis ? "" : "...";
}
if (ellipsis_markup_length) {
ellipsis = ellipsis_markup.replace("$1", ellipsis);
}
let ellipsis_length = ellipsis.length - ellipsis_markup_length,
boundary_before,
boundary_after;
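// resolve the boundary option: either a total character limit (number) or an
// object { before, after, total } limiting the context around each match;
// 0 is normalized to -1 so it still counts as "set" in the checks below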
if ("object" == typeof boundary) {
boundary_before = boundary.before;
if (0 === boundary_before) boundary_before = -1;
boundary_after = boundary.after;
if (0 === boundary_after) boundary_after = -1;
boundary = boundary.total || 9e5;
}
// cache shared encoders across fields
let encoder = new Map(),
query_enc;
// todo remove this loop and pass in the field data directly
// todo support field-specific configuration
// for every field
for (let i = 0, enc, idx, path; i < result.length; i++) {
/** @type {EnrichedSearchResults} */
let res;
if (pluck) {
//res = result[0].result;
res = result;
path = pluck;
} else {
const tmp = result[i];
path = tmp.field;
// skip when not a field entry (e.g. tags)
if (!path) continue;
res = tmp.result;
}
idx = index.get(path);
enc = idx.encoder;
query_enc = encoder.get(enc);
// re-encode the query when the encoder differs, otherwise reuse the cached encoding from the shared encoder map
if ("string" != typeof query_enc) {
query_enc = enc.encode(query);
encoder.set(enc, query_enc);
}
// for every doc in results
for (let j = 0; j < res.length; j++) {
const doc = res[j].doc;
if (!doc) continue;
const content = parse_simple(doc, path);
if (!content) continue;
// just split on whitespace and keep the original string (the encoder's split can't be applied here)
const doc_org = content.trim().split(/\s+/);
if (!doc_org.length) continue;
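// per-document state: "str" collects the highlighted output, "str_arr" keeps
// every term as { match } or { text } (boundary mode only), "pos_matches"
// holds the str_arr indexes of matched terms, and the first/last match
// positions plus the accumulated match length drive the boundary alignment below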
let str = "",
str_arr = [],
pos_matches = [],
pos_first_match = -1,
pos_last_match = -1,
length_matches_all = 0;
// loop terms of encoded doc content
for (let k = 0; k < doc_org.length; k++) {
let doc_org_cur = doc_org[k],
doc_enc_cur = enc.encode(doc_org_cur);
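// when the encoder splits one original term into several tokens, re-join them
// with a space so indexOf() below can still locate partial matches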
doc_enc_cur = 1 < doc_enc_cur.length ? doc_enc_cur.join(" ") : doc_enc_cur[0];
let found;
if (doc_enc_cur && doc_org_cur) {
let doc_org_cur_len = doc_org_cur.length,
doc_org_diff = (enc.split ? doc_org_cur.replace(enc.split, "") : doc_org_cur).length - doc_enc_cur.length,
match = "",
match_length = 0;
// loop terms of encoded query content and determine the longest match
for (let l = 0, query_enc_cur; l < query_enc.length; l++) {
query_enc_cur = query_enc[l];
if (!query_enc_cur) continue;
let query_enc_cur_len = query_enc_cur.length;
// add length from shrinking phonetic transformations (todo: add tests)
query_enc_cur_len += doc_org_diff;
// skip the query token when its match length can't exceed the longest match found so far
if (match_length && query_enc_cur_len <= match_length) {
continue;
}
const position = doc_enc_cur.indexOf(query_enc_cur);
if (-1 < position) {
match =
// prefix
(position ? doc_org_cur.substring(0, position) : "") +
// match
markup_open + doc_org_cur.substring(position, position + query_enc_cur_len) + markup_close + (
// suffix
position + query_enc_cur_len < doc_org_cur_len ? doc_org_cur.substring(position + query_enc_cur_len) : "");
match_length = query_enc_cur_len;
found = !0;
}
//console.log(doc_org_cur, doc_enc_cur, query_enc_cur, position, match)
}
// apply the longest match
if (match) {
if (boundary) {
// the outer boundary is used to check whether all matches fit within the total boundary
// if so, a simpler alignment can be applied
if (0 > pos_first_match) {
pos_first_match = str.length + (str ? 1 : 0);
}
pos_last_match = str.length + (str ? 1 : 0) + match.length;
// the overall length of all matches is used to check whether the matches exceed the total boundary
// if so, further processing can stop early
length_matches_all += doc_org_cur_len;
// the match positions are used to pick items for the final result more quickly
pos_matches.push(str_arr.length);
// collect every term as match or text
str_arr.push({ match });
}
str += (str ? " " : "") + match;
}
}
if (!found) {
const text = doc_org[k];
str += (str ? " " : "") + text;
// collect every term as match or text
boundary && str_arr.push({ text });
} else if (boundary) {
if (length_matches_all >= boundary) {
// matches have reached the total boundary
break;
}
}
}
// the markup length does not count against the total boundary (each match adds template.length - 2 extra characters of markup)
let markup_length = pos_matches.length * (template.length - 2);
// apply boundaries and align highlights
if (boundary_before || boundary_after || boundary && str.length - markup_length > boundary) {
// also reduce ellipsis length from boundary
let boundary_length = boundary + markup_length - 2 * ellipsis_length,
length = pos_last_match - pos_first_match,
start,
end;
if (0 < boundary_before) {
length += boundary_before;
}
if (0 < boundary_after) {
length += boundary_after;
}
// 1. all matches are within the overall boundary (apply simple alignment)
if (length <= boundary_length) {
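// center the remaining space around the matched span, unless explicit
// before/after boundaries were configured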
start = boundary_before ? pos_first_match - (0 < boundary_before ? boundary_before : 0) : pos_first_match - (0 | (boundary_length - length) / 2);
end = boundary_after ? pos_last_match + (0 < boundary_after ? boundary_after : 0) : start + boundary_length;
// do not clip terms
if (!clip) {
if (0 < start) {
if (" " === str.charAt(start)) {} else if (" " !== str.charAt(start - 1)) {
start = str.indexOf(" ", start);
0 > start && (start = 0);
}
}
if (end < str.length) {
if (" " === str.charAt(end - 1)) {} else if (" " !== str.charAt(end)) {
end = str.lastIndexOf(" ", end);
end < pos_last_match ? end = pos_last_match : ++end;
}
}
}
str = (start ? ellipsis : "") + str.substring(start, end) + (end < str.length ? ellipsis : "");
}
// 2. matches need to be split up with surrounding terms to fit into the boundary
else {
const final = [],
check = {},
seamless = {},
finished = {},
before = {},
after = {};
let final_length = 0,
shift_left = 0,
shift_right = 0,
loop_left = 1,
loop_right = 1;
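// expand every match slot with its surrounding terms, alternating one step to
// the right and one step to the left per iteration, until the total boundary
// is filled or both directions are exhausted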
while (!0) {
let loop;
for (let k = 0, pos; k < pos_matches.length; k++) {
pos = pos_matches[k];
// 1. add matches to the result
if (!shift_right) {
str = str_arr[pos].match;
// initialize custom boundaries for each slot
if (boundary_before) {
before[k] = boundary_before;
}
if (boundary_after) {
after[k] = boundary_after;
}
// count whitespaces between each term
if (k) {
final_length++;
}
let close;
// close left side when first term was matched
if (!pos) {
// it can be set before content was added,
// because the first term match is always added
seamless[k] = 1;
finished[k] = 1;
}
// initial ellipsis
else if (!k && ellipsis_length) {
final_length += ellipsis_length;
}
// close right side when last term was matched
if (pos >= doc_org.length - 1) {
close = 1;
}
// close right side when next term was a match
else if (pos < str_arr.length - 1 && str_arr[pos + 1].match) {
close = 1;
} else if (ellipsis_length) {
final_length += ellipsis_length;
}
// reduce template length on matches
final_length -= template.length - 2;
// at least add one match
if (!k || final_length + str.length <= boundary) {
final[k] = str;
} else {
seamless[k] = 0;
loop = loop_left = loop_right = 0;
break;
}
// update state when term was added
if (close) {
seamless[k + 1] = 1;
finished[k + 1] = 1;
}
}
// 2. add surrounding text to the result
else {
// alternate direction term by term
// 2.1. extend to right first (index: k + 1)
if (shift_left != shift_right) {
if (finished[k + 1]) continue;
pos += shift_right;
// overlap with other slot
if (check[pos]) {
final_length -= ellipsis_length;
seamless[k + 1] = 1;
finished[k + 1] = 1;
continue;
}
// end reached
if (pos >= str_arr.length - 1) {
if (pos >= str_arr.length) {
finished[k + 1] = 1;
if (pos >= doc_org.length) {
seamless[k + 1] = 1;
}
continue;
}
final_length -= ellipsis_length;
}
str = str_arr[pos].text;
let current_after = boundary_after && after[k];
if (current_after) {
if (0 < current_after) {
if (str.length > current_after) {
finished[k + 1] = 1;
if (clip) {
str = str.substring(0, current_after);
} else {
continue;
}
}
current_after -= str.length;
if (!current_after) current_after = -1;
after[k] = current_after;
} else {
finished[k + 1] = 1;
continue;
}
}
// count whitespaces between each term
if (final_length + str.length + 1 <= boundary) {
str = " " + str;
final[k] += str;
} else if (clip) {
const diff = boundary - final_length - 1;
if (0 < diff) {
str = " " + str.substring(0, diff);
final[k] += str;
}
finished[k + 1] = 1;
} else {
finished[k + 1] = 1;
continue;
}
}
// 2.2. extend to left (index: k)
else {
if (finished[k]) continue;
pos -= shift_left;
// overlap with other slot
if (check[pos]) {
final_length -= ellipsis_length;
finished[k] = 1;
seamless[k] = 1;
continue;
}
// start reached
if (0 >= pos) {
if (0 > pos) {
finished[k] = 1;
seamless[k] = 1;
continue;
}
final_length -= ellipsis_length;
}
str = str_arr[pos].text;
let current_before = boundary_before && before[k];
if (current_before) {
if (0 < current_before) {
if (str.length > current_before) {
finished[k] = 1;
if (clip) {
str = str.substring(str.length - current_before);
} else {
continue;
}
}
current_before -= str.length;
if (!current_before) current_before = -1;
before[k] = current_before;
} else {
finished[k] = 1;
continue;
}
}
// count whitespaces between each term
if (final_length + str.length + 1 <= boundary) {
str += " ";
final[k] = str + final[k];
} else if (clip) {
const diff = str.length + 1 - (boundary - final_length);
if (0 <= diff && diff < str.length) {
str = str.substring(diff) + " ";
final[k] = str + final[k];
}
finished[k] = 1;
} else {
finished[k] = 1;
continue;
}
}
}
// update state when term was added
final_length += str.length;
check[pos] = 1;
loop = 1;
}
if (loop) {
// alternate shift direction
shift_left == shift_right ? shift_right++ : shift_left++;
} else {
// check finish state
shift_left == shift_right ? loop_left = 0 : loop_right = 0;
// break process when both directions are done
if (!loop_left && !loop_right) {
break;
}
// continue with opposite direction
if (loop_left) {
shift_left++;
shift_right = shift_left;
} else {
shift_right++;
}
}
}
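// assemble the final excerpt: join the slots, using a space when two slots
// are seamless (adjacent in the original text) and the ellipsis otherwise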
str = "";
for (let k = 0, tmp; k < final.length; k++) {
tmp = (k && seamless[k] ? " " : (k && !ellipsis ? " " : "") + ellipsis) + final[k];
str += tmp;
}
if (ellipsis && !seamless[final.length]) {
str += ellipsis;
}
//console.log(query, seamless, final)
}
}
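// optionally merge adjacent highlighted terms by stripping the closing/opening
// markup in between, e.g. "<b>foo</b> <b>bar</b>" becomes "<b>foo bar</b>"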
if (merge) {
str = str.replace( /** @type {RegExp} */merge, " ");
}
res[j].highlight = str;
}
if (pluck) {
break;
}
}
return result;
}
// /**
// * @param {string} query
// * @param {EnrichedDocumentSearchResults|EnrichedSearchResults} result
// * @param {Map<string, Index>} index
// * @param {string} pluck
// * @param {HighlightOptions|string} config
// * @return {EnrichedDocumentSearchResults|EnrichedSearchResults}
// */
// export function highlight_fields(query, result, index, pluck, config){
//
// // The biggest issue is dealing with custom encoders, for this reason
// // a combined regular expression can't apply as a template
//
// let template, markup_open, markup_close;
//
// if(typeof config === "string"){
// template = config;
// config = "";
// }
// else{
// template = config.template;
// }
//
// if(DEBUG){
// if(!template){
// throw new Error('No template pattern was specified by the search option "highlight"');
// }
// }
//
// markup_open = template.indexOf("$1");
//
// if(DEBUG){
// if(markup_open === -1){
// throw new Error('Invalid highlight template. The replacement pattern "$1" was not found in template: ' + template);
// }
// }
//
// markup_close = template.substring(markup_open + 2);
// markup_open = template.substring(0, markup_open);
//
// let boundary = config && config.boundary;
// let clip = !config || config.clip !== false;
// let merge = config && config.merge && markup_close && markup_open && new RegExp(markup_close + " " + markup_open, "g");
// let ellipsis = config && config.ellipsis;
// if(typeof ellipsis !== "string"){
// ellipsis = "...";
// }
//
// let boundary_before, boundary_after;
//
// if(typeof boundary === "object"){
// boundary_before = boundary.before;
// if(boundary_before === 0) boundary_before = -1;
// boundary_after = boundary.after;
// if(boundary_after === 0) boundary_after = -1;
// boundary = boundary.total || 9e5;
// }
//
// // cache shared encoders across fields
// let encoder = new Map();
// let query_enc;
// let tokenize;
//
// // todo remove this loop and pass in the field data directly
// // todo support field-specific configuration
//
// // for every field
// for(let i = 0, enc, idx, path; i < result.length; i++){
//
// /** @type {EnrichedSearchResults} */
// let res;
//
// if(pluck){
// res = result;
// path = pluck;
// }
// else{
// const tmp = result[i];
// path = tmp.field;
// // skip when not a field entry (e.g. tags)
// if(!path) continue;
// res = tmp.result;
// }
//
// idx = index.get(path);
// enc = idx.encoder;
// tokenize = idx.tokenize;
// query_enc = encoder.get(enc);
//
// // re-encode query when encoder has changed or take cache from shared encoders
// if(typeof query_enc !== "string"){
// query_enc = enc.encode(query);
// encoder.set(enc, query_enc);
// }
//
// // for every doc in results
// for(let j = 0; j < res.length; j++){
//
// const doc = res[j]["doc"];
// if(!doc) continue;
// const content = parse_simple(doc, path);
// if(!content) continue;
// const doc_org = content.trim().split(/\s+/);
// if(!doc_org.length) continue;
//
// let str = "";
// let pos_matches = [];
// let length_matches_all = 0;
//
// // loop terms of encoded doc content
// for(let k = 0; k < doc_org.length; k++){
// let doc_org_cur = doc_org[k];
// let doc_org_cur_len = doc_org_cur.length;
// let doc_enc_cur = enc.encode(doc_org_cur);
// doc_enc_cur = doc_enc_cur.length > 1
// ? doc_enc_cur.join(" ")
// : doc_enc_cur[0];
//
// let found;
//
// if(doc_enc_cur && doc_org_cur){
//
// let match = "";
// let match_length = 0;
//
// // loop terms of encoded query content and determine the longest match
// for(let l = 0; l < query_enc.length; l++){
// let query_enc_cur = query_enc[l];
// if(!query_enc_cur) continue;
// let query_enc_cur_len = query_enc_cur.length;
// // add length from shrinking phonetic transformations (todo: add tests)
// query_enc_cur_len += doc_org_cur.length - doc_enc_cur.length;
// // skip query token when match length can't exceed previously highest found match
// if(match_length && query_enc_cur_len <= match_length){
// continue;
// }
// const position = doc_enc_cur.indexOf(query_enc_cur);
// //console.log(doc_org_cur, doc_enc_cur, query_enc_cur, position)
// if(position > -1){
// match =
// // prefix
// (position ? doc_org_cur.substring(0, position) : "") +
// // match
// markup_open + doc_org_cur.substring(position, position + query_enc_cur_len) + markup_close +
// // suffix
// (position + query_enc_cur_len < doc_org_cur_len ? doc_org_cur.substring(position + query_enc_cur_len) : "");
// match_length = query_enc_cur_len;
// found = true;
// }
// }
//
// // apply the longest match
// if(match){
// if(boundary){
// if(!pos_matches.length && k) length_matches_all += ellipsis.length;
// // the overall length of all matches is used to check if matches exceeds the total boundary
// // if so, it can early stop further processing
// length_matches_all += match.length;//doc_org_cur_len + (str ? 1 : 0) + (k < doc_org.length - 1 ? ellipsis.length : 0);
// // the match positions are used to pick items for the final result more quickly
// pos_matches.push([
// str.length + (str ? 1 : 0),
// str.length + (str ? 1 : 0) + match.length,
// k
// ]);
// }
// str += (str ? " " : "") + match;
// }
// }
//
// if(!found){
// const text = doc_org[k];
// str += (str ? " " : "") + text;
// }
// else if(boundary){
// if(length_matches_all >= boundary){
// // matches has reached total boundary
// break;
// }
// }
// }
//
// // the markup length does not apply to the total boundary
// let markup_length = pos_matches.length * (template.length - 2);
//
// // apply boundaries and align highlights
// if(boundary_before || boundary_after || (boundary && (str.length - markup_length) > boundary)){
//
// let final = "";
// let surrounded_length = (((((boundary + markup_length) - length_matches_all) / pos_matches.length) - ellipsis.length) / 2);
// //if(surrounded_length < 0) surrounded_length = 0;
//
// let before = boundary_before || (
// surrounded_length > 0
// ? Math.floor(surrounded_length +
// (boundary_after
// ? surrounded_length - boundary_after
// : 0))
// : 0
// );
// let after = boundary_after || (
// surrounded_length > 0
// ? Math.ceil(surrounded_length +
// (boundary_before
// ? surrounded_length - boundary_before
// : 0))
// : 0
// );
//
// //console.log(surrounded_length, before, after)
//
// for(let k = 0, cur, prev, next; k < pos_matches.length; k++){
//
// prev = cur;
// cur = next || pos_matches[k];
// next = pos_matches[k + 1];
//
// let start = cur[0] - before;
// let end = cur[1] + after;
// let closed_left;
// let closed_right;
//
// // if(k){
// // closed_left = 1;
// // }
//
// // apply right limit
// if(next && (end >= next[0] - before)){
// end = cur[1] + (next[0] - cur[1]) / 2 | 0;
// start -= ellipsis.length + 1;
// closed_right = 1;
// }
// // apply left limit
// if(prev && (start <= prev[1] + after)){
// start = cur[0] - (cur[0] - prev[1]) / 2 | 0;
// end += ellipsis.length + 1;
// closed_left = 1;
//
// // repeat right limit
// if(next && (end >= next[0] - before)){
// end = cur[1] + (next[0] - cur[1]) / 2 | 0;
// closed_right = 1;
// }
// }
//
// //console.log(start, end, prev, cur, next);
//
// // do not clip terms
// if(!clip){
// if(start){
// if(str.charAt(start) === " "){
// //start++;
// }
// else if(str.charAt(start - 1) !== " "){
// start = str.indexOf(" ", start);
// start < 0
// ? start = cur[0]
// : start;//++;
// }
// }
// if(end < str.length){
// if(str.charAt(end - 1) === " "){
// //end--;
// }
// else if(str.charAt(end) !== " "){
// end = str.lastIndexOf(" ", end);
// end < cur[1]
// ? end = cur[1]
// : end++;
// }
// }
// }
//
// final +=
// /*(final ? " " : "") +*/
// (!closed_left && start > 0 ? ellipsis : "") +
// str.substring(start, end) +
// (!closed_right && cur[2] < doc_org.length - 1 ? ellipsis : "");
//
// //console.log(final)
// }
//
// str = final;
// }
//
// if(merge){
// str = str.replace(merge, " ");
// }
//
// res[j]["highlight"] = str;
// }
//
// if(pluck){
// break;
// }
// }
//
// return result;
// }