mirror of
https://github.com/ianstormtaylor/slate.git
synced 2025-08-11 17:53:59 +02:00
Fix issue with unicode 1.1 smileys (#4565)
This commit is contained in:
5
.changeset/bright-seahorses-share.md
Normal file
5
.changeset/bright-seahorses-share.md
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
'slate': minor
|
||||||
|
---
|
||||||
|
|
||||||
|
Fix issue with unicode 1.1 smileys followed by a variation selector.
|
@@ -10,151 +10,137 @@ const CHAMELEON = /['\u2018\u2019]/
|
|||||||
* Get the distance to the end of the first character in a string of text.
|
* Get the distance to the end of the first character in a string of text.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
enum CodepointType {
|
||||||
|
// ZWJ sequences consist of multiple emojis separated by ZWJ character. They
|
||||||
|
// are used to combine multiple emojis into one emoji.
|
||||||
|
// https://en.wikipedia.org/wiki/Zero-width_joiner
|
||||||
|
ZeroWidthJoiner,
|
||||||
|
// Keycap sequences consist of a digit, an asterisk or a number sign followed by
|
||||||
|
// the Combining Enclosing Keycap character. They are used to create emoji
|
||||||
|
// with a keycap appearance. https://emojipedia.org/emoji-keycap-sequence
|
||||||
|
Keycap,
|
||||||
|
// Modifiers are used in ZWJ sequences to apply a skin tone to an emoji.
|
||||||
|
// https://en.wikipedia.org/wiki/Emoticons_(Unicode_block)#Emoji_modifiers
|
||||||
|
Modifier,
|
||||||
|
// Variation selectors are used to specify if a character should be displayed
|
||||||
|
// as text or as an emoji.
|
||||||
|
VariationSelector,
|
||||||
|
// Flag sequences consist of a pair of regional indicators.
|
||||||
|
// https://en.wikipedia.org/wiki/Regional_indicator_symbol
|
||||||
|
RegionalIndicator,
|
||||||
|
// Tag sequences consist of a Black Flag emoji followed by a series of Tag
|
||||||
|
// codepoints, then the Cancel Tag codepoint.
|
||||||
|
// https://en.wikipedia.org/wiki/Tags_(Unicode_block)
|
||||||
|
Tag,
|
||||||
|
Character,
|
||||||
|
}
|
||||||
|
|
||||||
export const getCharacterDistance = (str: string, isRTL = false): number => {
|
export const getCharacterDistance = (str: string, isRTL = false): number => {
|
||||||
|
const codepoints = isRTL ? codepointsIteratorRTL(str) : str
|
||||||
const isLTR = !isRTL
|
const isLTR = !isRTL
|
||||||
|
|
||||||
let dist = 0
|
const [isKeycapStart, isKeycapEnd, isTagStart, isTagEnd] = isLTR
|
||||||
// prev types:
|
? [isKeycap, isCombiningEnclosingKeycap, isBlackFlag, isCancelTag]
|
||||||
// NSEQ: non sequenceable codepoint.
|
: [isCombiningEnclosingKeycap, isKeycap, isCancelTag, isBlackFlag]
|
||||||
// MOD: modifier
|
|
||||||
// ZWJ: zero width joiner
|
|
||||||
// VAR: variation selector
|
|
||||||
// BMP: sequenceable codepoint from basic multilingual plane
|
|
||||||
// RI: regional indicator
|
|
||||||
// KC: keycap
|
|
||||||
// TAG: tag
|
|
||||||
let prev:
|
|
||||||
| 'NSEQ'
|
|
||||||
| 'MOD'
|
|
||||||
| 'ZWJ'
|
|
||||||
| 'VAR'
|
|
||||||
| 'BMP'
|
|
||||||
| 'RI'
|
|
||||||
| 'KC'
|
|
||||||
| 'TAG'
|
|
||||||
| null = null
|
|
||||||
|
|
||||||
const codepoints = isLTR ? str : codepointsIteratorRTL(str)
|
let distance = 0
|
||||||
|
let previousType: CodepointType | null = null
|
||||||
|
|
||||||
for (const codepoint of codepoints) {
|
for (const codepoint of codepoints) {
|
||||||
const code = codepoint.codePointAt(0)
|
const code = codepoint.codePointAt(0)
|
||||||
if (!code) break
|
if (!code) break
|
||||||
|
|
||||||
// Check if codepoint is part of a sequence.
|
if (
|
||||||
if (isZWJ(code)) {
|
isLTR &&
|
||||||
dist += codepoint.length
|
previousType === CodepointType.VariationSelector &&
|
||||||
prev = 'ZWJ'
|
!isZWJ(code) &&
|
||||||
|
!isKeycapEnd(code)
|
||||||
continue
|
) {
|
||||||
}
|
|
||||||
|
|
||||||
const [isKeycapStart, isKeycapEnd] = isLTR
|
|
||||||
? [isKeycap, isCombiningEnclosingKeycap]
|
|
||||||
: [isCombiningEnclosingKeycap, isKeycap]
|
|
||||||
if (isKeycapStart(code)) {
|
|
||||||
if (prev === 'KC') {
|
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
dist += codepoint.length
|
if (isZWJ(code)) {
|
||||||
prev = 'KC'
|
distance += codepoint.length
|
||||||
|
previousType = CodepointType.ZeroWidthJoiner
|
||||||
|
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isKeycapStart(code)) {
|
||||||
|
if (previousType === CodepointType.Keycap) break
|
||||||
|
|
||||||
|
distance += codepoint.length
|
||||||
|
previousType = CodepointType.Keycap
|
||||||
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if (isKeycapEnd(code)) {
|
if (isKeycapEnd(code)) {
|
||||||
dist += codepoint.length
|
distance += codepoint.length
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isVariationSelector(code)) {
|
if (isVariationSelector(code)) {
|
||||||
dist += codepoint.length
|
if (isRTL && previousType === CodepointType.Character) break
|
||||||
|
|
||||||
if (isLTR && prev === 'BMP') {
|
distance += codepoint.length
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
prev = 'VAR'
|
previousType = CodepointType.VariationSelector
|
||||||
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isBMPEmoji(code)) {
|
|
||||||
if (isLTR && prev && prev !== 'ZWJ' && prev !== 'VAR') {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
dist += codepoint.length
|
|
||||||
|
|
||||||
if (isRTL && prev === 'VAR') {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
prev = 'BMP'
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if (isModifier(code)) {
|
if (isModifier(code)) {
|
||||||
dist += codepoint.length
|
distance += codepoint.length
|
||||||
prev = 'MOD'
|
previousType = CodepointType.Modifier
|
||||||
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
const [isTagStart, isTagEnd] = isLTR
|
|
||||||
? [isBlackFlag, isCancelTag]
|
|
||||||
: [isCancelTag, isBlackFlag]
|
|
||||||
if (isTagStart(code)) {
|
if (isTagStart(code)) {
|
||||||
if (prev === 'TAG') break
|
if (previousType === CodepointType.Tag) break
|
||||||
|
|
||||||
dist += codepoint.length
|
distance += codepoint.length
|
||||||
prev = 'TAG'
|
previousType = CodepointType.Tag
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
if (previousType === CodepointType.Tag) {
|
||||||
if (isTagEnd(code)) {
|
if (isTagEnd(code)) {
|
||||||
dist += codepoint.length
|
distance += codepoint.length
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if (prev === 'TAG' && isTag(code)) {
|
if (isTag(code)) {
|
||||||
dist += codepoint.length
|
distance += codepoint.length
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
if (isRegionalIndicator(code)) {
|
if (isRegionalIndicator(code)) {
|
||||||
dist += codepoint.length
|
if (previousType && previousType !== CodepointType.RegionalIndicator) {
|
||||||
|
|
||||||
if (prev === 'RI') {
|
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
prev = 'RI'
|
distance += codepoint.length
|
||||||
|
|
||||||
|
if (previousType === CodepointType.RegionalIndicator) break
|
||||||
|
|
||||||
|
previousType = CodepointType.RegionalIndicator
|
||||||
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!isBMP(code)) {
|
// If previous and current codepoints are regular characters, it means we are
|
||||||
// If previous code point is not sequenceable, it means we are not in a
|
// not in a sequence.
|
||||||
// sequence.
|
if (previousType === CodepointType.Character) break
|
||||||
if (prev === 'NSEQ') {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
dist += codepoint.length
|
distance += codepoint.length
|
||||||
prev = 'NSEQ'
|
previousType = CodepointType.Character
|
||||||
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Modifier 'groups up' with what ever character is before that (even whitespace), need to
|
return distance || 1
|
||||||
// look ahead.
|
|
||||||
if (isLTR && prev === 'MOD') {
|
|
||||||
dist += codepoint.length
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
// If while loop ever gets here, we're done (e.g latin chars).
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
return dist || 1
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -282,42 +268,6 @@ const isCombiningEnclosingKeycap = (code: number): boolean => {
|
|||||||
return code === 0x20e3
|
return code === 0x20e3
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Is `code` one of the BMP codes used in emoji sequences.
|
|
||||||
*
|
|
||||||
* https://emojipedia.org/emoji-zwj-sequences/
|
|
||||||
*/
|
|
||||||
|
|
||||||
const isBMPEmoji = (code: number): boolean => {
|
|
||||||
// This requires a tiny bit of maintenance, better ideas?
|
|
||||||
// Fortunately it only happens if new Unicode Standard
|
|
||||||
// is released. Fails gracefully if upkeep lags behind,
|
|
||||||
// same way Slate previously behaved with all emojis.
|
|
||||||
return (
|
|
||||||
code === 0x2764 || // heart (❤)
|
|
||||||
code === 0x2642 || // male (♂)
|
|
||||||
code === 0x2640 || // female (♀)
|
|
||||||
code === 0x2620 || // skull (☠)
|
|
||||||
code === 0x2695 || // medical (⚕)
|
|
||||||
code === 0x2708 || // plane (✈️)
|
|
||||||
code === 0x25ef || // large circle (◯)
|
|
||||||
code === 0x2b06 || // up arrow (⬆)
|
|
||||||
code === 0x2197 || // up-right arrow (↗)
|
|
||||||
code === 0x27a1 || // right arrow (➡)
|
|
||||||
code === 0x2198 || // down-right arrow (↘)
|
|
||||||
code === 0x2b07 || // down arrow (⬇)
|
|
||||||
code === 0x2199 || // down-left arrow (↙)
|
|
||||||
code === 0x2b05 || // left arrow (⬅)
|
|
||||||
code === 0x2196 || // up-left arrow (↖)
|
|
||||||
code === 0x2195 || // up-down arrow (↕)
|
|
||||||
code === 0x2194 || // left-right arrow (↔)
|
|
||||||
code === 0x21a9 || // right arrow curving left (↩)
|
|
||||||
code === 0x21aa || // left arrow curving right (↪)
|
|
||||||
code === 0x2934 || // right arrow curving up (⤴)
|
|
||||||
code === 0x2935 // right arrow curving down (⤵)
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Is `code` a Regional Indicator.
|
* Is `code` a Regional Indicator.
|
||||||
*
|
*
|
||||||
@@ -328,16 +278,6 @@ const isRegionalIndicator = (code: number): boolean => {
|
|||||||
return code >= 0x1f1e6 && code <= 0x1f1ff
|
return code >= 0x1f1e6 && code <= 0x1f1ff
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Is `code` from basic multilingual plane.
|
|
||||||
*
|
|
||||||
* https://codepoints.net/basic_multilingual_plane
|
|
||||||
*/
|
|
||||||
|
|
||||||
const isBMP = (code: number): boolean => {
|
|
||||||
return code <= 0xffff
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Is `code` a Zero Width Joiner.
|
* Is `code` a Zero Width Joiner.
|
||||||
*
|
*
|
||||||
|
@@ -10,13 +10,26 @@ const codepoints = [
|
|||||||
['0', 1],
|
['0', 1],
|
||||||
[' ', 1],
|
[' ', 1],
|
||||||
['🙂', 2],
|
['🙂', 2],
|
||||||
|
['☺️', 2],
|
||||||
|
['☺️', 2],
|
||||||
['⬅️', 2],
|
['⬅️', 2],
|
||||||
['🏴', 2],
|
['🏴', 2],
|
||||||
|
['☺️a', 2, 1],
|
||||||
|
['🏁🇨🇳', 2, 4],
|
||||||
|
['🎌🇩🇪', 2, 4],
|
||||||
|
['🚩🇺🇸', 2, 4],
|
||||||
|
['🇨🇳🎌', 4, 2],
|
||||||
|
['🏴🏳️', 2, 3],
|
||||||
|
['🇷🇺🚩', 4, 2],
|
||||||
] as const
|
] as const
|
||||||
|
|
||||||
const zwjSequences = [
|
const zwjSequences = [
|
||||||
['👁🗨', 5],
|
['👁🗨', 5],
|
||||||
['👨👩👧👧', 11],
|
['👨👩👧👧', 11],
|
||||||
|
['👩❤️👨', 8],
|
||||||
|
['🙋🏽♂️', 7],
|
||||||
|
['🙋♂️', 5],
|
||||||
|
['🕵️♀️', 6],
|
||||||
['👨🏿🦳', 7],
|
['👨🏿🦳', 7],
|
||||||
] as const
|
] as const
|
||||||
|
|
||||||
@@ -60,7 +73,9 @@ dirs.forEach(dir => {
|
|||||||
const isRTL = dir === 'rtl'
|
const isRTL = dir === 'rtl'
|
||||||
|
|
||||||
describe(`getCharacterDistance - ${dir}`, () => {
|
describe(`getCharacterDistance - ${dir}`, () => {
|
||||||
codepoints.forEach(([str, dist]) => {
|
codepoints.forEach(([str, ltrDist, rtlDist]) => {
|
||||||
|
const dist = isRTL && rtlDist != null ? rtlDist : ltrDist
|
||||||
|
|
||||||
it(str, () => {
|
it(str, () => {
|
||||||
assert.strictEqual(getCharacterDistance(str + str, isRTL), dist)
|
assert.strictEqual(getCharacterDistance(str + str, isRTL), dist)
|
||||||
})
|
})
|
||||||
|
Reference in New Issue
Block a user