diff --git a/wire/config.php b/wire/config.php index 575d49e9..7c43b624 100644 --- a/wire/config.php +++ b/wire/config.php @@ -912,6 +912,17 @@ $config->dbSocket = ''; */ $config->dbQueryLogMax = 500; +/** + * Remove 4-byte characters (like emoji) when dbCharset is not utf8mb4? + * + * When charset is not “utf8mb4” and this value is true, 4-byte UTF-8 characters are stripped + * out of inserted values when possible. Note that this can add some overhead to INSERTs. + * + * @var bool + * + */ +$config->dbStripMB4 = false; + /*** 8. MODULES *********************************************************************************/ diff --git a/wire/core/Config.php b/wire/core/Config.php index 9d4bc0c2..d2ad7127 100644 --- a/wire/core/Config.php +++ b/wire/core/Config.php @@ -104,7 +104,7 @@ * @property array $dbSqlModes Set or adjust SQL mode per MySQL version, where array keys are MySQL version and values are SQL mode command(s). #pw-group-database * @property int $dbQueryLogMax Maximum number of queries WireDatabasePDO will log in memory, when debug mode is enabled (default=1000). #pw-group-database * @property string $dbInitCommand Database init command, for PDO::MYSQL_ATTR_INIT_COMMAND. Note placeholder {charset} gets replaced with $config->dbCharset. #pw-group-database - * $property array $dbSqlModes Set, add or remove SQL mode based on MySQL version. See default in /wire/config.php for details. #pw-group-database + * @property bool $dbStripMB4 When dbCharset is not utf8mb4 and this is true, we will attempt to remove 4-byte characters (like emoji) from inserts when possible. Note that this adds some overhead. #pw-group-database * * @property array $pageList Settings specific to Page lists. #pw-group-modules * @property array $pageEdit Settings specific to Page editors. 
#pw-group-modules diff --git a/wire/core/PagesEditor.php b/wire/core/PagesEditor.php index 504586fc..79789e4b 100644 --- a/wire/core/PagesEditor.php +++ b/wire/core/PagesEditor.php @@ -38,6 +38,11 @@ class PagesEditor extends Wire { public function __construct(Pages $pages) { $this->pages = $pages; + + $config = $pages->wire('config'); + if($config->dbStripMB4 && strtolower($config->dbCharset) != 'utf8mb4') { + $this->addHookAfter('Fieldtype::sleepValue', $this, 'hookFieldtypeSleepValueStripMB4'); + } } public function isCloning() { @@ -1384,4 +1389,16 @@ class PagesEditor extends Wire { return count($sorts); } + + /** + * Hook after Fieldtype::sleepValue to remove MB4 characters when present and applicable + * + * This hook is only used if $config->dbStripMB4 is true and $config->dbCharset is not “utf8mb4”. + * + * @param HookEvent $event + * + */ + protected function hookFieldtypeSleepValueStripMB4(HookEvent $event) { + $event->return = $this->wire('sanitizer')->removeMB4($event->return); + } } diff --git a/wire/core/Sanitizer.php b/wire/core/Sanitizer.php index c3590c12..b7da2518 100644 --- a/wire/core/Sanitizer.php +++ b/wire/core/Sanitizer.php @@ -918,6 +918,7 @@ class Sanitizer extends Wire { * - `maxLength` (int): maximum characters allowed, or 0=no max (default=255). * - `maxBytes` (int): maximum bytes allowed (default=0, which implies maxLength*4). * - `stripTags` (bool): strip markup tags? (default=true). + * - `stripMB4` (bool): strip emoji and other 4-byte UTF-8? (default=false). * - `allowableTags` (string): markup tags that are allowed, if stripTags is true (use same format as for PHP's `strip_tags()` function. * - `multiLine` (bool): allow multiple lines? if false, then $newlineReplacement below is applicable (default=false). * - `newlineReplacement` (string): character to replace newlines with, OR specify boolean TRUE to remove extra lines (default=" "). 
@@ -933,6 +934,7 @@ class Sanitizer extends Wire { 'maxLength' => 255, // maximum characters allowed, or 0=no max 'maxBytes' => 0, // maximum bytes allowed (0 = default, which is maxLength*4) 'stripTags' => true, // strip markup tags + 'stripMB4' => false, // strip Emoji and 4-byte characters? 'allowableTags' => '', // tags that are allowed, if stripTags is true (use same format as for PHP's strip_tags function) 'multiLine' => false, // allow multiple lines? if false, then $newlineReplacement below is applicable 'newlineReplacement' => ' ', // character to replace newlines with, OR specify boolean TRUE to remove extra lines @@ -962,7 +964,9 @@ class Sanitizer extends Wire { if($options['stripTags']) $value = strip_tags($value, $options['allowableTags']); - if($options['inCharset'] != $options['outCharset']) $value = iconv($options['inCharset'], $options['outCharset'], $value); + if($options['inCharset'] != $options['outCharset']) $value = iconv($options['inCharset'], $options['outCharset'], $value); + + if($options['stripMB4']) $value = $this->removeMB4($value); if($options['maxLength']) { if(empty($options['maxBytes'])) $options['maxBytes'] = $options['maxLength'] * 4; @@ -1010,6 +1014,7 @@ class Sanitizer extends Wire { * - `maxLength` (int): maximum characters allowed, or 0=no max (default=16384 or 16kb). * - `maxBytes` (int): maximum bytes allowed (default=0, which implies maxLength*3 or 48kb). * - `stripTags` (bool): strip markup tags? (default=true). + * - `stripMB4` (bool): strip emoji and other 4-byte UTF-8? (default=false). * - `allowableTags` (string): markup tags that are allowed, if stripTags is true (use same format as for PHP's `strip_tags()` function. * - `allowCRLF` (bool): allow CR+LF newlines (i.e. "\r\n")? (default=false, which means "\r\n" is replaced with "\n"). * - `inCharset` (string): input character set (default="UTF-8"). 
@@ -1799,6 +1804,41 @@ class Sanitizer extends Wire { return str_replace(array("\r\n", "\r", "\n"), $replacement, $str); } + /** + * Removes 4-byte UTF-8 characters (like emoji) that produce errors with MySQL regular “UTF8” encoding + * + * Returns the same value type that it is given. If given something other than a string or array, it just + * returns it without modification. + * + * @param string|array $value String or array containing strings + * @return string|array|mixed + * + */ + public function removeMB4($value) { + if(empty($value)) return $value; + if(is_array($value)) { + // process array recursively, looking for strings to convert + foreach($value as $key => $val) { + if(empty($val)) continue; + if(is_string($val) || is_array($val)) $value[$key] = $this->removeMB4($val); + } + } else if(is_string($value)) { + if(strlen($value) > 3 && preg_match('/[\xF0-\xF4]/', $value)) { + // string contains a lead byte (0xF0-0xF4) that can begin a 4-byte character + $regex = + '!(?:' . + '\xF0[\x90-\xBF][\x80-\xBF]{2}' . + '|[\xF1-\xF3][\x80-\xBF]{3}' . + '|\xF4[\x80-\x8F][\x80-\xBF]{2}' . + ')!s'; + $value = preg_replace($regex, '', $value); + } + } else { + // not a string or an array, leave as-is + } + return $value; + } + /** * Sanitize value to string * diff --git a/wire/core/WireDatabasePDO.php b/wire/core/WireDatabasePDO.php index b34ef302..4597618f 100644 --- a/wire/core/WireDatabasePDO.php +++ b/wire/core/WireDatabasePDO.php @@ -63,6 +63,14 @@ class WireDatabasePDO extends Wire implements WireDatabase { */ protected $init = false; + /** + * Strip 4-byte characters in “quote” and “escapeStr” methods? 
(only when dbCharset is not utf8mb4) + * + * @var bool + * + */ + protected $stripMB4 = false; + /** * PDO connection settings * @@ -167,6 +175,7 @@ class WireDatabasePDO extends Wire implements WireDatabase { if($this->init || !$this->isWired()) return; $this->init = true; $config = $this->wire('config'); + $this->stripMB4 = $config->dbStripMB4 && strtolower($config->dbCharset) != 'utf8mb4'; $this->queryLogMax = (int) $config->dbQueryLogMax; $sqlModes = $config->dbSqlModes; if(is_array($sqlModes)) { @@ -631,7 +640,7 @@ class WireDatabasePDO extends Wire implements WireDatabase { * */ public function escapeStr($str) { - return substr($this->pdo()->quote($str), 1, -1); + return substr($this->quote($str), 1, -1); } /** @@ -660,6 +669,9 @@ class WireDatabasePDO extends Wire implements WireDatabase { * */ public function quote($str) { + if($this->stripMB4 && is_string($str) && !empty($str)) { + $str = $this->wire('sanitizer')->removeMB4($str); + } return $this->pdo()->quote($str); }