import { parse_simple } from "../common.js";
import Index from "../index.js";
import { EnrichedDocumentSearchResults, EnrichedSearchResults, HighlightOptions } from "../type.js";

/**
 * Adds a `highlight` string to every enriched result entry by wrapping the
 * parts of the document content which match the encoded query terms into the
 * given template markup. The passed `result` is mutated in place and returned.
 *
 * Supported `config` shapes (as read by this function):
 *  - a template string containing the placeholder "$1", e.g. "<b>$1</b>"
 *  - an object with:
 *      - `template`  {string}  markup containing "$1" (required)
 *      - `boundary`  {number|{before, after, total}}  length limits of the output
 *      - `clip`      {boolean} `false` keeps whole terms at the cut edges
 *      - `merge`     {boolean} merges adjacent highlights separated by one space
 *      - `ellipsis`  {string|false|{template, pattern}}  marker for cut content
 *                    (defaults to "...", `false` disables it)
 *
 * @param {string} query
 * @param {EnrichedDocumentSearchResults|EnrichedSearchResults} result
 *   When `pluck` is set this is a flat list of entries having a `doc`,
 *   otherwise a list of `{ field, result }` entries.
 * @param {Map} index Maps a field path to an index exposing `encoder`.
 * @param {string} pluck Field path to use when `result` is a flat list.
 * @param {HighlightOptions|string} config
 * @return {EnrichedDocumentSearchResults|EnrichedSearchResults}
 * @throws {Error} When no template was passed or it does not contain "$1".
 */
export function highlight_fields(query, result, index, pluck, config) {

    // The biggest issue is dealing with custom encoders, for this reason
    // a combined regular expression can't apply as a template

    let template;

    if (typeof config === "string") {
        template = config;
        // an empty string keeps every "config && config.xyz" lookup below falsy
        config = "";
    } else {
        // tolerate a missing config, so the descriptive error below is thrown
        template = config ? config.template : undefined;
    }

    if (!template) {
        throw new Error('No template pattern was specified by the search option "highlight"');
    }

    const placeholder_pos = template.indexOf("$1");

    if (placeholder_pos === -1) {
        throw new Error('Invalid highlight template. The replacement pattern "$1" was not found in template: ' + template);
    }

    const markup_open = template.substring(0, placeholder_pos);
    const markup_close = template.substring(placeholder_pos + 2);

    // the markup is arbitrary user text and has to be matched literally
    const escape_regexp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

    let boundary = config && config.boundary;
    const clip = !config || config.clip !== false;
    const merge = config && config.merge && markup_close && markup_open &&
        new RegExp(escape_regexp(markup_close) + " " + escape_regexp(markup_open), "g");

    let ellipsis = config && config.ellipsis;
    let ellipsis_markup;
    let ellipsis_markup_length = 0;

    // "typeof null" is also "object", so check for truthiness first
    if (ellipsis && typeof ellipsis === "object") {
        ellipsis_markup = ellipsis.template;
        if (typeof ellipsis_markup === "string") {
            // the markup length excludes the 2 chars of the placeholder "$1"
            ellipsis_markup_length = ellipsis_markup.length - 2;
        }
        ellipsis = ellipsis.pattern;
    }

    if (typeof ellipsis !== "string") {
        ellipsis = ellipsis === false ? "" : "...";
    }

    if (ellipsis_markup_length) {
        const ellipsis_pattern = ellipsis;
        // a replacer function inserts the pattern verbatim ("$&", "$$", ... stay untouched)
        ellipsis = ellipsis_markup.replace("$1", () => ellipsis_pattern);
    }

    // visible length of the ellipsis (without its markup)
    const ellipsis_length = ellipsis.length - ellipsis_markup_length;

    let boundary_before, boundary_after;

    if (boundary && typeof boundary === "object") {
        // an explicit 0 is stored as -1 to distinguish "no surrounding text"
        // from "not specified" in the truthy checks below
        boundary_before = boundary.before;
        if (boundary_before === 0) boundary_before = -1;
        boundary_after = boundary.after;
        if (boundary_after === 0) boundary_after = -1;
        boundary = boundary.total || 9e5;
    }

    // cache shared encoders across fields
    const encoder = new Map();

    // todo remove this loop and pass in the field data directly
    // todo support field-specific configuration

    // for every field
    for (let i = 0; i < result.length; i++) {

        /** @type {EnrichedSearchResults} */
        let res;
        let path;

        if (pluck) {
            res = result;
            path = pluck;
        } else {
            const tmp = result[i];
            path = tmp.field;
            // skip when not a field entry (e.g. tags)
            if (!path) continue;
            res = tmp.result;
        }

        const idx = index.get(path);
        const enc = idx.encoder;

        // re-encode query when encoder has changed or take cache from shared encoders
        // (the encoded query is an array of terms, a missing cache entry is undefined)
        let query_enc = encoder.get(enc);

        if (!query_enc) {
            query_enc = enc.encode(query);
            encoder.set(enc, query_enc);
        }

        // for every doc in results
        for (let j = 0; j < res.length; j++) {

            const doc = res[j].doc;
            if (!doc) continue;
            const content = parse_simple(doc, path);
            if (!content) continue;
            // just split on whitespace and keep original string (encoder split can't apply)
            const doc_org = content.trim().split(/\s+/);
            if (!doc_org.length) continue;

            let str = "";
            // every term as { match } or { text } (collected only when a boundary applies)
            const str_arr = [];
            // indexes of str_arr which hold a match
            const pos_matches = [];
            let pos_first_match = -1;
            let pos_last_match = -1;
            let length_matches_all = 0;

            // loop terms of encoded doc content
            for (let k = 0; k < doc_org.length; k++) {

                const doc_org_cur = doc_org[k];
                let doc_enc_cur = enc.encode(doc_org_cur);

                doc_enc_cur = doc_enc_cur.length > 1
                    ? doc_enc_cur.join(" ")
                    : doc_enc_cur[0];

                let found = false;

                if (doc_enc_cur && doc_org_cur) {

                    const doc_org_cur_len = doc_org_cur.length;
                    const doc_org_diff = (enc.split
                        ? doc_org_cur.replace(enc.split, "")
                        : doc_org_cur
                    ).length - doc_enc_cur.length;

                    let match = "";
                    let match_length = 0;

                    // loop terms of encoded query content and determine the longest match
                    for (let l = 0; l < query_enc.length; l++) {

                        const query_enc_cur = query_enc[l];
                        if (!query_enc_cur) continue;

                        // add length from shrinking phonetic transformations (todo: add tests)
                        const query_enc_cur_len = query_enc_cur.length + doc_org_diff;

                        // skip query token when match length can't exceed previously highest found match
                        if (match_length && query_enc_cur_len <= match_length) {
                            continue;
                        }

                        const position = doc_enc_cur.indexOf(query_enc_cur);

                        if (position > -1) {
                            match =
                                // prefix
                                (position ? doc_org_cur.substring(0, position) : "") +
                                // match
                                markup_open + doc_org_cur.substring(position, position + query_enc_cur_len) + markup_close +
                                // suffix
                                (position + query_enc_cur_len < doc_org_cur_len ? doc_org_cur.substring(position + query_enc_cur_len) : "");
                            match_length = query_enc_cur_len;
                            found = true;
                        }
                    }

                    // apply the longest match
                    if (match) {
                        if (boundary) {
                            // the outer boundary is used to check if all matches are within the total boundary
                            // if so, it can apply a simpler alignment
                            if (pos_first_match < 0) {
                                pos_first_match = str.length + (str ? 1 : 0);
                            }
                            pos_last_match = str.length + (str ? 1 : 0) + match.length;
                            // the overall length of all matches is used to check if matches exceeds the total boundary
                            // if so, it can early stop further processing
                            length_matches_all += doc_org_cur_len;
                            // the match positions are used to pick items for the final result more quickly
                            pos_matches.push(str_arr.length);
                            // collect every term as match or text
                            str_arr.push({ match });
                        }
                        str += (str ? " " : "") + match;
                    }
                }

                if (!found) {
                    const text = doc_org[k];
                    str += (str ? " " : "") + text;
                    // collect every term as match or text
                    if (boundary) {
                        str_arr.push({ text });
                    }
                } else if (boundary && length_matches_all >= boundary) {
                    // matches has reached total boundary
                    break;
                }
            }

            // the markup length does not apply to the total boundary
            const markup_length = pos_matches.length * (template.length - 2);

            // apply boundaries and align highlights
            if (boundary_before || boundary_after || (boundary && str.length - markup_length > boundary)) {

                // also reduce ellipsis length from boundary
                const boundary_length = boundary + markup_length - 2 * ellipsis_length;
                let length = pos_last_match - pos_first_match;

                if (boundary_before > 0) {
                    length += boundary_before;
                }
                if (boundary_after > 0) {
                    length += boundary_after;
                }

                // 1. all matches are within the overall boundary (apply simple alignment)
                if (length <= boundary_length) {

                    let start = boundary_before
                        ? pos_first_match - (boundary_before > 0 ? boundary_before : 0)
                        : pos_first_match - (0 | (boundary_length - length) / 2);
                    let end = boundary_after
                        ? pos_last_match + (boundary_after > 0 ? boundary_after : 0)
                        : start + boundary_length;

                    // do not clip terms
                    if (!clip) {
                        if (start > 0) {
                            // move forward to the next term when the cut is inside of a term
                            if (str.charAt(start) !== " " && str.charAt(start - 1) !== " ") {
                                start = str.indexOf(" ", start);
                                if (start < 0) start = 0;
                            }
                        }
                        if (end < str.length) {
                            // move back to the previous term when the cut is inside of a term
                            if (str.charAt(end - 1) !== " " && str.charAt(end) !== " ") {
                                end = str.lastIndexOf(" ", end);
                                if (end < pos_last_match) {
                                    end = pos_last_match;
                                } else {
                                    ++end;
                                }
                            }
                        }
                    }

                    // "start" can become negative, substring() then begins at 0 and
                    // nothing was cut off on the left, so no leading ellipsis applies
                    str = (start > 0 ? ellipsis : "") +
                        str.substring(start, end) +
                        (end < str.length ? ellipsis : "");
                }
                // 2. matches needs to be split by surrounded terms to fit into the boundary
                else {

                    // one slot per match:
                    //   final[k]     the text of the slot
                    //   check[pos]   term positions which are already part of a slot
                    //   seamless[k]  slot k joins its left neighbor by a space instead of an ellipsis
                    //   finished[k]  left side of slot k is done (k + 1: right side of slot k)
                    //   before/after remaining custom boundary of each slot
                    const final = [];
                    const check = {};
                    const seamless = {};
                    const finished = {};
                    const before = {};
                    const after = {};

                    let final_length = 0;
                    let shift_left = 0;
                    let shift_right = 0;
                    let loop_left = 1;
                    let loop_right = 1;

                    while (true) {

                        let loop;

                        for (let k = 0, pos; k < pos_matches.length; k++) {

                            pos = pos_matches[k];

                            // 1. add matches to the result
                            if (!shift_right) {

                                str = str_arr[pos].match;

                                // initialize custom boundaries for each slot
                                if (boundary_before) {
                                    before[k] = boundary_before;
                                }
                                if (boundary_after) {
                                    after[k] = boundary_after;
                                }

                                // count whitespaces between each term
                                if (k) {
                                    final_length++;
                                }

                                let close;

                                // close left side when first term was matched
                                if (!pos) {
                                    // it can be set before content was added,
                                    // because the first term match is always added
                                    seamless[k] = 1;
                                    finished[k] = 1;
                                }
                                // initial ellipsis
                                else if (!k && ellipsis_length) {
                                    final_length += ellipsis_length;
                                }

                                // close right side when last term was matched
                                if (pos >= doc_org.length - 1) {
                                    close = 1;
                                }
                                // close right side when next term was a match
                                else if (pos < str_arr.length - 1 && str_arr[pos + 1].match) {
                                    close = 1;
                                } else if (ellipsis_length) {
                                    final_length += ellipsis_length;
                                }

                                // reduce template length on matches
                                final_length -= template.length - 2;

                                // at least add one match
                                if (!k || final_length + str.length <= boundary) {
                                    final[k] = str;
                                } else {
                                    seamless[k] = 0;
                                    loop = loop_left = loop_right = 0;
                                    break;
                                }

                                // update state when term was added
                                if (close) {
                                    seamless[k + 1] = 1;
                                    finished[k + 1] = 1;
                                }
                            }
                            // 2. add surrounded text to the result
                            else {

                                // alternate direction term by term
                                // 2.1. extend to right first (index: k + 1)
                                if (shift_left !== shift_right) {

                                    if (finished[k + 1]) continue;
                                    pos += shift_right;

                                    // overlap with other slot
                                    if (check[pos]) {
                                        final_length -= ellipsis_length;
                                        seamless[k + 1] = 1;
                                        finished[k + 1] = 1;
                                        continue;
                                    }

                                    // end reached
                                    if (pos >= str_arr.length - 1) {
                                        if (pos >= str_arr.length) {
                                            finished[k + 1] = 1;
                                            if (pos >= doc_org.length) {
                                                seamless[k + 1] = 1;
                                            }
                                            continue;
                                        }
                                        final_length -= ellipsis_length;
                                    }

                                    str = str_arr[pos].text;

                                    let current_after = boundary_after && after[k];

                                    if (current_after) {
                                        if (current_after > 0) {
                                            if (str.length > current_after) {
                                                finished[k + 1] = 1;
                                                if (clip) {
                                                    str = str.substring(0, current_after);
                                                } else {
                                                    continue;
                                                }
                                            }
                                            current_after -= str.length;
                                            // -1 marks a fully consumed boundary
                                            if (!current_after) current_after = -1;
                                            after[k] = current_after;
                                        } else {
                                            finished[k + 1] = 1;
                                            continue;
                                        }
                                    }

                                    // count whitespaces between each term
                                    if (final_length + str.length + 1 <= boundary) {
                                        str = " " + str;
                                        final[k] += str;
                                    } else if (clip) {
                                        const diff = boundary - final_length - 1;
                                        if (diff > 0) {
                                            str = " " + str.substring(0, diff);
                                            final[k] += str;
                                        }
                                        finished[k + 1] = 1;
                                    } else {
                                        finished[k + 1] = 1;
                                        continue;
                                    }
                                }
                                // 2.2. extend to left (index: k)
                                else {

                                    if (finished[k]) continue;
                                    pos -= shift_left;

                                    // overlap with other slot
                                    if (check[pos]) {
                                        final_length -= ellipsis_length;
                                        finished[k] = 1;
                                        seamless[k] = 1;
                                        continue;
                                    }

                                    // start reached
                                    if (pos <= 0) {
                                        if (pos < 0) {
                                            finished[k] = 1;
                                            seamless[k] = 1;
                                            continue;
                                        }
                                        final_length -= ellipsis_length;
                                    }

                                    str = str_arr[pos].text;

                                    let current_before = boundary_before && before[k];

                                    if (current_before) {
                                        if (current_before > 0) {
                                            if (str.length > current_before) {
                                                finished[k] = 1;
                                                if (clip) {
                                                    str = str.substring(str.length - current_before);
                                                } else {
                                                    continue;
                                                }
                                            }
                                            current_before -= str.length;
                                            // -1 marks a fully consumed boundary
                                            if (!current_before) current_before = -1;
                                            before[k] = current_before;
                                        } else {
                                            finished[k] = 1;
                                            continue;
                                        }
                                    }

                                    // count whitespaces between each term
                                    if (final_length + str.length + 1 <= boundary) {
                                        str += " ";
                                        final[k] = str + final[k];
                                    } else if (clip) {
                                        const diff = str.length + 1 - (boundary - final_length);
                                        if (diff >= 0 && diff < str.length) {
                                            str = str.substring(diff) + " ";
                                            final[k] = str + final[k];
                                        }
                                        finished[k] = 1;
                                    } else {
                                        finished[k] = 1;
                                        continue;
                                    }
                                }
                            }

                            // update state when term was added
                            final_length += str.length;
                            check[pos] = 1;
                            loop = 1;
                        }

                        if (loop) {
                            // alternate shift direction
                            if (shift_left === shift_right) {
                                shift_right++;
                            } else {
                                shift_left++;
                            }
                        } else {
                            // check finish state
                            if (shift_left === shift_right) {
                                loop_left = 0;
                            } else {
                                loop_right = 0;
                            }
                            // break process when both directions are done
                            if (!loop_left && !loop_right) {
                                break;
                            }
                            // continue with opposite direction
                            if (loop_left) {
                                shift_left++;
                                shift_right = shift_left;
                            } else {
                                shift_right++;
                            }
                        }
                    }

                    // join the slots, either seamless by a space or separated by the ellipsis
                    str = "";

                    for (let k = 0; k < final.length; k++) {
                        const separator = k && seamless[k]
                            ? " "
                            : (k && !ellipsis ? " " : "") + ellipsis;
                        str += separator + final[k];
                    }

                    if (ellipsis && !seamless[final.length]) {
                        str += ellipsis;
                    }
                }
            }

            if (merge) {
                str = str.replace(/** @type {RegExp} */ (merge), " ");
            }

            res[j].highlight = str;
        }

        // a plucked result is a flat list which was fully processed by the first pass
        if (pluck) {
            break;
        }
    }

    return result;
}

// /**
//  * @param {string} query
//  * @param {EnrichedDocumentSearchResults|EnrichedSearchResults} result
//  * @param {Map} index
//  * @param {string} pluck
//  * @param {HighlightOptions|string} config
//  * @return {EnrichedDocumentSearchResults|EnrichedSearchResults}
//  */
// export function highlight_fields(query, result, index, pluck, config){
//
//     // The biggest issue is dealing with custom encoders, for this reason
//     // a combined regular expression can't apply as a template
//
//     let template, markup_open, markup_close;
//
//     if(typeof config === "string"){
//         template = config;
//         config = "";
//     }
//     else{
//         template = config.template;
//     }
//
//     if(DEBUG){
//         if(!template){
//             throw new Error('No template pattern was specified by the search option "highlight"');
//         }
//     }
//
//     markup_open = template.indexOf("$1");
//
//     if(DEBUG){
//         if(markup_open === -1){
//             throw new Error('Invalid highlight template. 
The replacement pattern "$1" was not found in template: ' + template); // } // } // // markup_close = template.substring(markup_open + 2); // markup_open = template.substring(0, markup_open); // // let boundary = config && config.boundary; // let clip = !config || config.clip !== false; // let merge = config && config.merge && markup_close && markup_open && new RegExp(markup_close + " " + markup_open, "g"); // let ellipsis = config && config.ellipsis; // if(typeof ellipsis !== "string"){ // ellipsis = "..."; // } // // let boundary_before, boundary_after; // // if(typeof boundary === "object"){ // boundary_before = boundary.before; // if(boundary_before === 0) boundary_before = -1; // boundary_after = boundary.after; // if(boundary_after === 0) boundary_after = -1; // boundary = boundary.total || 9e5; // } // // // cache shared encoders across fields // let encoder = new Map(); // let query_enc; // let tokenize; // // // todo remove this loop and pass in the field data directly // // todo support field-specific configuration // // // for every field // for(let i = 0, enc, idx, path; i < result.length; i++){ // // /** @type {EnrichedSearchResults} */ // let res; // // if(pluck){ // res = result; // path = pluck; // } // else{ // const tmp = result[i]; // path = tmp.field; // // skip when not a field entry (e.g. 
tags) // if(!path) continue; // res = tmp.result; // } // // idx = index.get(path); // enc = idx.encoder; // tokenize = idx.tokenize; // query_enc = encoder.get(enc); // // // re-encode query when encoder has changed or take cache from shared encoders // if(typeof query_enc !== "string"){ // query_enc = enc.encode(query); // encoder.set(enc, query_enc); // } // // // for every doc in results // for(let j = 0; j < res.length; j++){ // // const doc = res[j]["doc"]; // if(!doc) continue; // const content = parse_simple(doc, path); // if(!content) continue; // const doc_org = content.trim().split(/\s+/); // if(!doc_org.length) continue; // // let str = ""; // let pos_matches = []; // let length_matches_all = 0; // // // loop terms of encoded doc content // for(let k = 0; k < doc_org.length; k++){ // let doc_org_cur = doc_org[k]; // let doc_org_cur_len = doc_org_cur.length; // let doc_enc_cur = enc.encode(doc_org_cur); // doc_enc_cur = doc_enc_cur.length > 1 // ? doc_enc_cur.join(" ") // : doc_enc_cur[0]; // // let found; // // if(doc_enc_cur && doc_org_cur){ // // let match = ""; // let match_length = 0; // // // loop terms of encoded query content and determine the longest match // for(let l = 0; l < query_enc.length; l++){ // let query_enc_cur = query_enc[l]; // if(!query_enc_cur) continue; // let query_enc_cur_len = query_enc_cur.length; // // add length from shrinking phonetic transformations (todo: add tests) // query_enc_cur_len += doc_org_cur.length - doc_enc_cur.length; // // skip query token when match length can't exceed previously highest found match // if(match_length && query_enc_cur_len <= match_length){ // continue; // } // const position = doc_enc_cur.indexOf(query_enc_cur); // //console.log(doc_org_cur, doc_enc_cur, query_enc_cur, position) // if(position > -1){ // match = // // prefix // (position ? 
doc_org_cur.substring(0, position) : "") + // // match // markup_open + doc_org_cur.substring(position, position + query_enc_cur_len) + markup_close + // // suffix // (position + query_enc_cur_len < doc_org_cur_len ? doc_org_cur.substring(position + query_enc_cur_len) : ""); // match_length = query_enc_cur_len; // found = true; // } // } // // // apply the longest match // if(match){ // if(boundary){ // if(!pos_matches.length && k) length_matches_all += ellipsis.length; // // the overall length of all matches is used to check if matches exceeds the total boundary // // if so, it can early stop further processing // length_matches_all += match.length;//doc_org_cur_len + (str ? 1 : 0) + (k < doc_org.length - 1 ? ellipsis.length : 0); // // the match positions are used to pick items for the final result more quickly // pos_matches.push([ // str.length + (str ? 1 : 0), // str.length + (str ? 1 : 0) + match.length, // k // ]); // } // str += (str ? " " : "") + match; // } // } // // if(!found){ // const text = doc_org[k]; // str += (str ? " " : "") + text; // } // else if(boundary){ // if(length_matches_all >= boundary){ // // matches has reached total boundary // break; // } // } // } // // // the markup length does not apply to the total boundary // let markup_length = pos_matches.length * (template.length - 2); // // // apply boundaries and align highlights // if(boundary_before || boundary_after || (boundary && (str.length - markup_length) > boundary)){ // // let final = ""; // let surrounded_length = (((((boundary + markup_length) - length_matches_all) / pos_matches.length) - ellipsis.length) / 2); // //if(surrounded_length < 0) surrounded_length = 0; // // let before = boundary_before || ( // surrounded_length > 0 // ? Math.floor(surrounded_length + // (boundary_after // ? surrounded_length - boundary_after // : 0)) // : 0 // ); // let after = boundary_after || ( // surrounded_length > 0 // ? Math.ceil(surrounded_length + // (boundary_before // ? 
surrounded_length - boundary_before // : 0)) // : 0 // ); // // //console.log(surrounded_length, before, after) // // for(let k = 0, cur, prev, next; k < pos_matches.length; k++){ // // prev = cur; // cur = next || pos_matches[k]; // next = pos_matches[k + 1]; // // let start = cur[0] - before; // let end = cur[1] + after; // let closed_left; // let closed_right; // // // if(k){ // // closed_left = 1; // // } // // // apply right limit // if(next && (end >= next[0] - before)){ // end = cur[1] + (next[0] - cur[1]) / 2 | 0; // start -= ellipsis.length + 1; // closed_right = 1; // } // // apply left limit // if(prev && (start <= prev[1] + after)){ // start = cur[0] - (cur[0] - prev[1]) / 2 | 0; // end += ellipsis.length + 1; // closed_left = 1; // // // repeat right limit // if(next && (end >= next[0] - before)){ // end = cur[1] + (next[0] - cur[1]) / 2 | 0; // closed_right = 1; // } // } // // //console.log(start, end, prev, cur, next); // // // do not clip terms // if(!clip){ // if(start){ // if(str.charAt(start) === " "){ // //start++; // } // else if(str.charAt(start - 1) !== " "){ // start = str.indexOf(" ", start); // start < 0 // ? start = cur[0] // : start;//++; // } // } // if(end < str.length){ // if(str.charAt(end - 1) === " "){ // //end--; // } // else if(str.charAt(end) !== " "){ // end = str.lastIndexOf(" ", end); // end < cur[1] // ? end = cur[1] // : end++; // } // } // } // // final += // /*(final ? " " : "") +*/ // (!closed_left && start > 0 ? ellipsis : "") + // str.substring(start, end) + // (!closed_right && cur[2] < doc_org.length - 1 ? ellipsis : ""); // // //console.log(final) // } // // str = final; // } // // if(merge){ // str = str.replace(merge, " "); // } // // res[j]["highlight"] = str; // } // // if(pluck){ // break; // } // } // // return result; // }