1
0
mirror of https://github.com/ianstormtaylor/slate.git synced 2025-01-18 05:59:13 +01:00

Fix issue with unicode 1.1 smileys (#4565)

This commit is contained in:
Jimmy Oliger 2021-11-04 20:36:41 +05:30 committed by GitHub
parent e0f41514a1
commit 5818aca503
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 99 additions and 139 deletions

View File

@ -0,0 +1,5 @@
---
'slate': minor
---
Fix issue with unicode 1.1 smileys followed by a variation selector.

View File

@ -10,151 +10,137 @@ const CHAMELEON = /['\u2018\u2019]/
* Get the distance to the end of the first character in a string of text.
*/
enum CodepointType {
// ZWJ sequences consist of multiple emojis separated by ZWJ character. They
// are used to combine multiple emojis into one emoji.
// https://en.wikipedia.org/wiki/Zero-width_joiner
ZeroWidthJoiner,
// Kecap sequences consit of a digit, an asterisk or a number sign followed by
// the Combining Enclosing Keycap character. They are used to create emoji
// with a keycap appearance. https://emojipedia.org/emoji-keycap-sequence
Keycap,
// Modifiers are used in ZWJ sequences to apply a skin tone to an emoji.
// https://en.wikipedia.org/wiki/Emoticons_(Unicode_block)#Emoji_modifiers
Modifier,
// Variation selectors are used to specify if a character should be displayed
// as text or as an emoji.
VariationSelector,
// Flag sequences consist of a pair of regional indicators.
// https://en.wikipedia.org/wiki/Regional_indicator_symbol
RegionalIndicator,
// Tag sequences consist of a Black Flag emoji followed by a series of Tag
// codepoints, then the Cancel Tag codepoint.
// https://en.wikipedia.org/wiki/Tags_(Unicode_block)
Tag,
Character,
}
export const getCharacterDistance = (str: string, isRTL = false): number => {
const codepoints = isRTL ? codepointsIteratorRTL(str) : str
const isLTR = !isRTL
let dist = 0
// prev types:
// NSEQ: non sequenceable codepoint.
// MOD: modifier
// ZWJ: zero width joiner
// VAR: variation selector
// BMP: sequenceable codepoint from basic multilingual plane
// RI: regional indicator
// KC: keycap
// TAG: tag
let prev:
| 'NSEQ'
| 'MOD'
| 'ZWJ'
| 'VAR'
| 'BMP'
| 'RI'
| 'KC'
| 'TAG'
| null = null
const [isKeycapStart, isKeycapEnd, isTagStart, isTagEnd] = isLTR
? [isKeycap, isCombiningEnclosingKeycap, isBlackFlag, isCancelTag]
: [isCombiningEnclosingKeycap, isKeycap, isCancelTag, isBlackFlag]
const codepoints = isLTR ? str : codepointsIteratorRTL(str)
let distance = 0
let previousType: CodepointType | null = null
for (const codepoint of codepoints) {
const code = codepoint.codePointAt(0)
if (!code) break
// Check if codepoint is part of a sequence.
if (
isLTR &&
previousType === CodepointType.VariationSelector &&
!isZWJ(code) &&
!isKeycapEnd(code)
) {
break
}
if (isZWJ(code)) {
dist += codepoint.length
prev = 'ZWJ'
distance += codepoint.length
previousType = CodepointType.ZeroWidthJoiner
continue
}
const [isKeycapStart, isKeycapEnd] = isLTR
? [isKeycap, isCombiningEnclosingKeycap]
: [isCombiningEnclosingKeycap, isKeycap]
if (isKeycapStart(code)) {
if (prev === 'KC') {
break
}
if (previousType === CodepointType.Keycap) break
distance += codepoint.length
previousType = CodepointType.Keycap
dist += codepoint.length
prev = 'KC'
continue
}
if (isKeycapEnd(code)) {
dist += codepoint.length
distance += codepoint.length
break
}
if (isVariationSelector(code)) {
dist += codepoint.length
if (isRTL && previousType === CodepointType.Character) break
if (isLTR && prev === 'BMP') {
break
}
distance += codepoint.length
prev = 'VAR'
previousType = CodepointType.VariationSelector
continue
}
if (isBMPEmoji(code)) {
if (isLTR && prev && prev !== 'ZWJ' && prev !== 'VAR') {
break
}
dist += codepoint.length
if (isRTL && prev === 'VAR') {
break
}
prev = 'BMP'
continue
}
if (isModifier(code)) {
dist += codepoint.length
prev = 'MOD'
distance += codepoint.length
previousType = CodepointType.Modifier
continue
}
const [isTagStart, isTagEnd] = isLTR
? [isBlackFlag, isCancelTag]
: [isCancelTag, isBlackFlag]
if (isTagStart(code)) {
if (prev === 'TAG') break
if (previousType === CodepointType.Tag) break
dist += codepoint.length
prev = 'TAG'
distance += codepoint.length
previousType = CodepointType.Tag
continue
}
if (isTagEnd(code)) {
dist += codepoint.length
if (previousType === CodepointType.Tag) {
if (isTagEnd(code)) {
distance += codepoint.length
break
}
if (isTag(code)) {
distance += codepoint.length
continue
}
break
}
if (prev === 'TAG' && isTag(code)) {
dist += codepoint.length
continue
}
if (isRegionalIndicator(code)) {
dist += codepoint.length
if (prev === 'RI') {
if (previousType && previousType !== CodepointType.RegionalIndicator) {
break
}
prev = 'RI'
distance += codepoint.length
if (previousType === CodepointType.RegionalIndicator) break
previousType = CodepointType.RegionalIndicator
continue
}
if (!isBMP(code)) {
// If previous code point is not sequenceable, it means we are not in a
// sequence.
if (prev === 'NSEQ') {
break
}
// If previous and curent codepoints are regular characters. it means we are
// not in a sequence.
if (previousType === CodepointType.Character) break
dist += codepoint.length
prev = 'NSEQ'
distance += codepoint.length
previousType = CodepointType.Character
continue
}
// Modifier 'groups up' with what ever character is before that (even whitespace), need to
// look ahead.
if (isLTR && prev === 'MOD') {
dist += codepoint.length
break
}
// If while loop ever gets here, we're done (e.g latin chars).
break
continue
}
return dist || 1
return distance || 1
}
/**
@ -282,42 +268,6 @@ const isCombiningEnclosingKeycap = (code: number): boolean => {
return code === 0x20e3
}
/**
* Is `code` one of the BMP codes used in emoji sequences.
*
* https://emojipedia.org/emoji-zwj-sequences/
*/
const isBMPEmoji = (code: number): boolean => {
// This requires tiny bit of maintanance, better ideas?
// Fortunately it only happens if new Unicode Standard
// is released. Fails gracefully if upkeep lags behind,
// same way Slate previously behaved with all emojis.
return (
code === 0x2764 || // heart (❤)
code === 0x2642 || // male (♂)
code === 0x2640 || // female (♀)
code === 0x2620 || // scull (☠)
code === 0x2695 || // medical (⚕)
code === 0x2708 || // plane (✈️)
code === 0x25ef || // large circle (◯)
code === 0x2b06 || // up arrow (⬆)
code === 0x2197 || // up-right arrow (↗)
code === 0x27a1 || // right arrow (➡)
code === 0x2198 || // down-right arrow (↘)
code === 0x2b07 || // down arrow (⬇)
code === 0x2199 || // down-left arrow (↙)
code === 0x2b05 || // left arrow (⬅)
code === 0x2196 || // up-left arrow (↖)
code === 0x2195 || // up-down arrow (↕)
code === 0x2194 || // left-right arrow (↔)
code === 0x21a9 || // right arrow curving left (↩)
code === 0x21aa || // left arrow curving right (↪)
code === 0x2934 || // right arrow curving up (⤴)
code === 0x2935 // right arrow curving down (⤵)
)
}
/**
* Is `code` a Regional Indicator.
*
@ -328,16 +278,6 @@ const isRegionalIndicator = (code: number): boolean => {
return code >= 0x1f1e6 && code <= 0x1f1ff
}
/**
* Is `code` from basic multilingual plane.
*
* https://codepoints.net/basic_multilingual_plane
*/
const isBMP = (code: number): boolean => {
return code <= 0xffff
}
/**
* Is `code` a Zero Width Joiner.
*

View File

@ -10,13 +10,26 @@ const codepoints = [
['0', 1],
[' ', 1],
['🙂', 2],
['☺️', 2],
['☺️', 2],
['⬅️', 2],
['🏴', 2],
['☺a', 2, 1],
['🏁🇨🇳', 2, 4],
['🎌🇩🇪', 2, 4],
['🚩🇺🇸', 2, 4],
['🇨🇳🎌', 4, 2],
['🏴🏳️', 2, 3],
['🇷🇺🚩', 4, 2],
] as const
const zwjSequences = [
['👁‍🗨', 5],
['👨‍👩‍👧‍👧', 11],
['👩‍❤️‍👨', 8],
['🙋🏽‍♂️', 7],
['🙋‍♂️', 5],
['🕵️‍♀️', 6],
['👨🏿‍🦳', 7],
] as const
@ -60,7 +73,9 @@ dirs.forEach(dir => {
const isRTL = dir === 'rtl'
describe(`getCharacterDistance - ${dir}`, () => {
codepoints.forEach(([str, dist]) => {
codepoints.forEach(([str, ltrDist, rtlDist]) => {
const dist = isRTL && rtlDist != null ? rtlDist : ltrDist
it(str, () => {
assert.strictEqual(getCharacterDistance(str + str, isRTL), dist)
})