Fix unicode offset (#3044)

* Merges pr-2680
* Handle end/beginning of node/text: we have to return 1 in order not to break jumping to next/previous node
* Update emojis example to insert emojis as plain text with shift+click.
* Fix eslint issues
2025-08-29 18:09:49 +02:00 · 2019-10-06 15:07:53 +02:00
parent d71ea08fdf
commit 8dce7538eb
4 changed files with 222 additions and 19 deletions
--- a/examples/emojis/index.js
+++ b/examples/emojis/index.js
@@ -37,7 +37,8 @@ const EMOJIS = [
  '👻',
  '🍔',
  '🍑',
-  '🔑',
+  '👩‍❤️‍👩',
+  '👨‍👩‍👦',
 ]

 /**
@@ -158,11 +159,16 @@ class Emojis extends React.Component {

  onClickEmoji = (e, code) => {
    e.preventDefault()
+    const { editor } = this

-    this.editor
-      .insertInline({ type: 'emoji', data: { code } })
-      .moveToStartOfNextText()
-      .focus()
+    if (e.shiftKey) {
+      editor.insertText(code)
+    } else {
+      editor.insertInline({ type: 'emoji', data: { code } })
+      editor.moveToStartOfNextText()
+    }
+
+    editor.focus()
  }
 }

--- a/examples/emojis/value.json
+++ b/examples/emojis/value.json
@@ -43,6 +43,17 @@
            "text": "This example shows emojis in action."
          }
        ]
+      },
+      {
+        "object": "block",
+        "type": "paragraph",
+        "nodes": [
+          {
+            "object": "text",
+            "text":
+              "Click on an emoji at the top to insert it as an inline node, shift-click to insert it as plain text."
+          }
+        ]
      }
    ]
  }
--- a/packages/slate/src/commands/on-selection.js
+++ b/packages/slate/src/commands/on-selection.js
@@ -131,8 +131,18 @@ Commands.moveAnchorToStartOfText = editor => {
  editor.command(pointEdgeObject, 'anchor', 'start', 'text')
 }

-Commands.moveBackward = (editor, ...args) => {
-  editor.moveAnchorBackward(...args).moveFocusBackward(...args)
+Commands.moveBackward = (editor, chars = 1) => {
+  if (chars === 0) return
+
+  const { value } = editor
+  const { document, selection } = value
+  const { start } = selection
+  const startBlock = document.getClosestBlock(start.key)
+  const o = startBlock.getOffset(start.key)
+  const offset = o + start.offset
+  const { text } = startBlock
+  const charsOffset = TextUtils.getCharOffsetBackward(text, offset, chars)
+  editor.moveAnchorBackward(charsOffset).moveFocusBackward(charsOffset)
 }

 Commands.moveWordBackward = (editor, ...args) => {
@@ -355,8 +365,18 @@ Commands.moveFocusToStartOfText = editor => {
  editor.command(pointEdgeObject, 'focus', 'start', 'text')
 }

-Commands.moveForward = (editor, ...args) => {
-  editor.moveAnchorForward(...args).moveFocusForward(...args)
+Commands.moveForward = (editor, chars = 1) => {
+  if (chars === 0) return
+
+  const { value } = editor
+  const { document, selection } = value
+  const { start } = selection
+  const startBlock = document.getClosestBlock(start.path)
+  const o = startBlock.getOffset(start.key)
+  const offset = o + start.offset
+  const { text } = startBlock
+  const charsOffset = TextUtils.getCharOffsetForward(text, offset, chars)
+  editor.moveAnchorForward(charsOffset).moveFocusForward(charsOffset)
 }

 Commands.moveWordForward = (editor, ...args) => {
--- a/packages/slate/src/utils/text-utils.js
+++ b/packages/slate/src/utils/text-utils.js
@@ -45,6 +45,63 @@ function isSurrogate(code) {
  return SURROGATE_START <= code && code <= SURROGATE_END
 }

+/**
+ * Does `code` form a modifier together with the code unit that follows it?
+ *
+ * https://emojipedia.org/modifiers/
+ *
+ * @param {Number} code
+ * @param {String} text
+ * @param {Number} offset
+ * @return {Boolean}
+ */
+
+function isModifier(code, text, offset) {
+  if (code === 0xd83c) {
+    const next = text.charCodeAt(offset + 1)
+    return next <= 0xdfff && next >= 0xdffb
+  }
+  return false
+}
+
+/**
+ * Is `code` a Variation Selector.
+ *
+ * https://codepoints.net/variation_selectors
+ *
+ * @param {Number} code
+ * @return {Boolean}
+ */
+
+function isVariationSelector(code) {
+  return code <= 0xfe0f && code >= 0xfe00
+}
+
+/**
+ * Is `code` one of the BMP codes used in emoji sequences.
+ *
+ * https://emojipedia.org/emoji-zwj-sequences/
+ *
+ * @param {Number} code
+ * @return {Boolean}
+ */
+
+function isBMPEmoji(code) {
+  // This requires a tiny bit of maintenance (better ideas?).
+  // Fortunately it is only needed when a new Unicode Standard
+  // is released. Fails gracefully if upkeep lags behind, the
+  // same way Slate previously behaved with all emojis.
+  return (
+    code === 0x2764 || // heart (❤)
+    code === 0x2642 || // male (♂)
+    code === 0x2640 || // female (♀)
+    code === 0x2620 || // skull (☠)
+    code === 0x2695 || // medical (⚕)
+    code === 0x2708 || // plane (✈️)
+    code === 0x25ef // large circle (◯)
+  )
+}
+
 /**
 * Is a character a word character? Needs the `remaining` characters too.
 *
@@ -81,42 +138,151 @@ function getCharLength(char) {
 }

 /**
- * Get the offset to the end of the first character in `text`.
+ * Get the offset to the end of the character(s) in `text`.
+ * This function is emoji aware and handles them correctly.
 *
 * @param {String} text
+ * @param {Number} chars
+ * @param {Boolean} forward
 * @return {Number}
 */

-function getCharOffset(text) {
-  const char = text.charAt(0)
-  return getCharLength(char)
+function getCharOffset(text, chars, forward) {
+  let offset = 0
+
+  // Handle end/beginning of node: we have to return 1 in order not to
+  // break the cursor's jumping to the next/previous node. We need to return
+  // early because otherwise ''.charCodeAt(0) returns NaN, so the default
+  // handling of 'latin characters' at the end of the while loop would
+  // never be reached and we would return 0 as the offset.
+  if (text === '') return 1
+
+  // Calculate offset sum of each character
+  for (let i = 0; i < chars; i++) {
+    // `prev` types (better ideas?):
+    // - SURR: surrogate pair
+    // - MOD: modifier (technically also surrogate pair)
+    // - ZWJ: zero width joiner
+    // - VAR: variation selector
+    // - BMP: sequenceable character from Basic Multilingual Plane
+    let prev = null
+    let charCode = text.charCodeAt(offset)
+
+    while (charCode) {
+      if (isSurrogate(charCode)) {
+        const modifier = isModifier(charCode, text, offset)
+
+        // Early returns are the heart of this loop where
+        // we decide if previous and current codepoints
+        // should form a single character (in other words:
+        // how many of them should selection jump over).
+        if (forward) {
+          if (
+            (!modifier && prev && prev !== 'ZWJ') ||
+            (modifier && prev && prev !== 'SURR')
+          ) {
+            break
+          }
+        } else if (prev === 'SURR' || prev === 'BMP') {
+          break
+        }
+
+        offset += 2
+        prev = modifier ? 'MOD' : 'SURR'
+        charCode = text.charCodeAt(offset)
+        // It's okay to `continue` without checking
+        // because if `charCode` is NaN (which is
+        // the case when out of `text` range), next
+        // `while` loop won't execute and we're done.
+        continue
+      }
+
+      // If zero width joiner
+      if (charCode === 0x200d) {
+        offset += 1
+        prev = 'ZWJ'
+        charCode = text.charCodeAt(offset)
+        continue
+      }
+
+      if (isBMPEmoji(charCode)) {
+        if (
+          (forward && prev === 'VAR') ||
+          (prev && prev !== 'ZWJ' && prev !== 'VAR')
+        ) {
+          break
+        }
+
+        offset += 1
+        prev = 'BMP'
+        charCode = text.charCodeAt(offset)
+        continue
+      }
+
+      if (isVariationSelector(charCode)) {
+        if (!forward && prev && prev !== 'ZWJ') {
+          break
+        }
+
+        offset += 1
+        prev = 'VAR'
+        charCode = text.charCodeAt(offset)
+        continue
+      }
+
+      // A modifier "fuses" with whatever character precedes it (even
+      // whitespace), so we need to look ahead if the loop gets here.
+      if (forward) {
+        const nextCharCode = text.charCodeAt(offset + 1)
+
+        if (isModifier(nextCharCode, text, offset + 1)) {
+          offset += 3
+          prev = 'MOD'
+          charCode = text.charCodeAt(offset)
+          continue
+        }
+      } else if (prev === 'MOD') {
+        offset += 1
+        break
+      }
+
+      // If the while loop ever gets here, we're
+      // done (e.g. Latin characters, length 1).
+      if (prev === null) offset += 1
+      break
+    }
+  }
+
+  return offset
 }

 /**
- * Get the offset to the end of the character before an `offset` in `text`.
+ * Get the offset to the end of character(s) before an `offset` in `text`.
 *
 * @param {String} text
 * @param {Number} offset
+ * @param {Number} chars
 * @return {Number}
 */

-function getCharOffsetBackward(text, offset) {
+function getCharOffsetBackward(text, offset, chars = 1) {
  text = text.slice(0, offset)
  text = reverse(text)
-  return getCharOffset(text)
+  return getCharOffset(text, chars)
 }

 /**
- * Get the offset to the end of the character after an `offset` in `text`.
+ * Get the offset to the end of character(s) after an `offset` in `text`.
 *
 * @param {String} text
 * @param {Number} offset
+ * @param {Number} chars
 * @return {Number}
 */

-function getCharOffsetForward(text, offset) {
+function getCharOffsetForward(text, offset, chars = 1) {
  text = text.slice(offset)
-  return getCharOffset(text)
+  return getCharOffset(text, chars, true)
 }

 /**