1
0
mirror of https://github.com/processwire/processwire.git synced 2025-08-09 00:06:55 +02:00

Fix issue processwire/processwire-issues#192 where inserted emoji could cause text to be truncated on systems using dbEngine "utf8" (as opposed to "utf8mb4"). Because the emoji/MB4 detection and replacement has some overhead, it's not enabled by default. To enable, set $config->dbStripMB4=true; in your /site/config.php file.

This commit is contained in:
Ryan Cramer
2017-03-09 09:11:30 -05:00
parent 25bfb8a5a6
commit e87dcd5985
5 changed files with 83 additions and 3 deletions

View File

@@ -912,6 +912,17 @@ $config->dbSocket = '';
*/ */
$config->dbQueryLogMax = 500; $config->dbQueryLogMax = 500;
/**
 * Remove 4-byte characters (like emoji) when the database charset is not utf8mb4?
 *
 * When charset is not "utf8mb4" and this value is true, 4-byte UTF-8 characters are stripped
 * out of inserted values when possible. Note that this can add some overhead to INSERTs.
 * Disabled (false) by default.
 *
 * @var bool
 *
 */
$config->dbStripMB4 = false;
/*** 8. MODULES *********************************************************************************/ /*** 8. MODULES *********************************************************************************/

View File

@@ -104,7 +104,7 @@
* @property array $dbSqlModes Set or adjust SQL mode per MySQL version, where array keys are MySQL version and values are SQL mode command(s). #pw-group-database * @property array $dbSqlModes Set or adjust SQL mode per MySQL version, where array keys are MySQL version and values are SQL mode command(s). #pw-group-database
* @property int $dbQueryLogMax Maximum number of queries WireDatabasePDO will log in memory, when debug mode is enabled (default=1000). #pw-group-database * @property int $dbQueryLogMax Maximum number of queries WireDatabasePDO will log in memory, when debug mode is enabled (default=1000). #pw-group-database
* @property string $dbInitCommand Database init command, for PDO::MYSQL_ATTR_INIT_COMMAND. Note placeholder {charset} gets replaced with $config->dbCharset. #pw-group-database * @property string $dbInitCommand Database init command, for PDO::MYSQL_ATTR_INIT_COMMAND. Note placeholder {charset} gets replaced with $config->dbCharset. #pw-group-database
* $property array $dbSqlModes Set, add or remove SQL mode based on MySQL version. See default in /wire/config.php for details. #pw-group-database * @property bool $dbStripMB4 When dbEngine is not utf8mb4 and this is true, we will attempt to remove 4-byte characters (like emoji) from inserts when possible. Note that this adds some overhead. #pw-group-database
* *
* @property array $pageList Settings specific to Page lists. #pw-group-modules * @property array $pageList Settings specific to Page lists. #pw-group-modules
* @property array $pageEdit Settings specific to Page editors. #pw-group-modules * @property array $pageEdit Settings specific to Page editors. #pw-group-modules

View File

@@ -38,6 +38,11 @@ class PagesEditor extends Wire {
public function __construct(Pages $pages) {
	$this->pages = $pages;

	// Stripping 4-byte characters is only needed when the database charset cannot
	// store them. The charset is held in $config->dbCharset; $config->dbEngine is a
	// different setting, so comparing it against 'utf8mb4' would never match and the
	// hook would be attached even on utf8mb4 installations.
	$config = $pages->wire('config');
	if($config->dbStripMB4 && strtolower((string) $config->dbCharset) !== 'utf8mb4') {
		$this->addHookAfter('Fieldtype::sleepValue', $this, 'hookFieldtypeSleepValueStripMB4');
	}
}
public function isCloning() { public function isCloning() {
@@ -1384,4 +1389,16 @@ class PagesEditor extends Wire {
return count($sorts); return count($sorts);
} }
/**
 * Strip 4-byte UTF-8 characters from the value produced by Fieldtype::sleepValue
 *
 * Runs as an "after" hook: it takes the hooked method's return value, passes it through
 * the sanitizer's removeMB4() method and stores the result back as the return value.
 *
 * The constructor attaches this hook only when $config->dbStripMB4 is enabled and the
 * database is not already configured for utf8mb4.
 *
 * @param HookEvent $event Event whose return value is filtered in place
 *
 */
protected function hookFieldtypeSleepValueStripMB4(HookEvent $event) {
	$sanitizer = $this->wire('sanitizer');
	$event->return = $sanitizer->removeMB4($event->return);
}
} }

View File

@@ -918,6 +918,7 @@ class Sanitizer extends Wire {
* - `maxLength` (int): maximum characters allowed, or 0=no max (default=255). * - `maxLength` (int): maximum characters allowed, or 0=no max (default=255).
* - `maxBytes` (int): maximum bytes allowed (default=0, which implies maxLength*4). * - `maxBytes` (int): maximum bytes allowed (default=0, which implies maxLength*4).
* - `stripTags` (bool): strip markup tags? (default=true). * - `stripTags` (bool): strip markup tags? (default=true).
* - `stripMB4` (bool): strip emoji and other 4-byte UTF-8? (default=false).
* - `allowableTags` (string): markup tags that are allowed, if stripTags is true (use same format as for PHP's `strip_tags()` function. * - `allowableTags` (string): markup tags that are allowed, if stripTags is true (use same format as for PHP's `strip_tags()` function.
* - `multiLine` (bool): allow multiple lines? if false, then $newlineReplacement below is applicable (default=false). * - `multiLine` (bool): allow multiple lines? if false, then $newlineReplacement below is applicable (default=false).
* - `newlineReplacement` (string): character to replace newlines with, OR specify boolean TRUE to remove extra lines (default=" "). * - `newlineReplacement` (string): character to replace newlines with, OR specify boolean TRUE to remove extra lines (default=" ").
@@ -933,6 +934,7 @@ class Sanitizer extends Wire {
'maxLength' => 255, // maximum characters allowed, or 0=no max 'maxLength' => 255, // maximum characters allowed, or 0=no max
'maxBytes' => 0, // maximum bytes allowed (0 = default, which is maxLength*4) 'maxBytes' => 0, // maximum bytes allowed (0 = default, which is maxLength*4)
'stripTags' => true, // strip markup tags 'stripTags' => true, // strip markup tags
'stripMB4' => false, // strip Emoji and 4-byte characters?
'allowableTags' => '', // tags that are allowed, if stripTags is true (use same format as for PHP's strip_tags function) 'allowableTags' => '', // tags that are allowed, if stripTags is true (use same format as for PHP's strip_tags function)
'multiLine' => false, // allow multiple lines? if false, then $newlineReplacement below is applicable 'multiLine' => false, // allow multiple lines? if false, then $newlineReplacement below is applicable
'newlineReplacement' => ' ', // character to replace newlines with, OR specify boolean TRUE to remove extra lines 'newlineReplacement' => ' ', // character to replace newlines with, OR specify boolean TRUE to remove extra lines
@@ -964,6 +966,8 @@ class Sanitizer extends Wire {
if($options['inCharset'] != $options['outCharset']) $value = iconv($options['inCharset'], $options['outCharset'], $value); if($options['inCharset'] != $options['outCharset']) $value = iconv($options['inCharset'], $options['outCharset'], $value);
if($options['stripMB4']) $value = $this->removeMB4($value);
if($options['maxLength']) { if($options['maxLength']) {
if(empty($options['maxBytes'])) $options['maxBytes'] = $options['maxLength'] * 4; if(empty($options['maxBytes'])) $options['maxBytes'] = $options['maxLength'] * 4;
if($this->multibyteSupport) { if($this->multibyteSupport) {
@@ -1010,6 +1014,7 @@ class Sanitizer extends Wire {
* - `maxLength` (int): maximum characters allowed, or 0=no max (default=16384 or 16kb). * - `maxLength` (int): maximum characters allowed, or 0=no max (default=16384 or 16kb).
* - `maxBytes` (int): maximum bytes allowed (default=0, which implies maxLength*3 or 48kb). * - `maxBytes` (int): maximum bytes allowed (default=0, which implies maxLength*3 or 48kb).
* - `stripTags` (bool): strip markup tags? (default=true). * - `stripTags` (bool): strip markup tags? (default=true).
* - `stripMB4` (bool): strip emoji and other 4-byte UTF-8? (default=false).
* - `allowableTags` (string): markup tags that are allowed, if stripTags is true (use same format as for PHP's `strip_tags()` function. * - `allowableTags` (string): markup tags that are allowed, if stripTags is true (use same format as for PHP's `strip_tags()` function.
* - `allowCRLF` (bool): allow CR+LF newlines (i.e. "\r\n")? (default=false, which means "\r\n" is replaced with "\n"). * - `allowCRLF` (bool): allow CR+LF newlines (i.e. "\r\n")? (default=false, which means "\r\n" is replaced with "\n").
* - `inCharset` (string): input character set (default="UTF-8"). * - `inCharset` (string): input character set (default="UTF-8").
@@ -1799,6 +1804,41 @@ class Sanitizer extends Wire {
return str_replace(array("\r\n", "\r", "\n"), $replacement, $str); return str_replace(array("\r\n", "\r", "\n"), $replacement, $str);
} }
/**
 * Removes 4-byte UTF-8 characters (like emoji) that produce an error with MySQL's regular "utf8" encoding
 *
 * Returns the same value type that it is given. Arrays are processed recursively (values only,
 * keys are left untouched). If given something other than a string or array, it is returned
 * without modification. Empty values are returned as-is.
 *
 * @param string|array $value String or array containing strings
 * @return string|array|mixed
 *
 */
function removeMB4($value) {
	if(empty($value)) return $value;

	if(is_array($value)) {
		// process array recursively, looking for strings to convert
		foreach($value as $key => $val) {
			if(empty($val)) continue;
			if(is_string($val) || is_array($val)) $value[$key] = $this->removeMB4($val);
		}
		return $value;
	}

	// not a string or an array, leave as-is
	if(!is_string($value)) return $value;

	// A 4-byte sequence needs at least 4 bytes and always begins with a lead byte in
	// the range 0xF0-0xF4. Scanning for that byte directly avoids building a per-byte
	// array of the whole string just to find its maximum.
	if(strlen($value) < 4 || !preg_match('/[\xF0-\xF4]/', $value)) return $value;

	// byte-wise match (no "u" modifier) of every well-formed 4-byte UTF-8 sequence
	$regex =
		'!(?:' .
		'\xF0[\x90-\xBF][\x80-\xBF]{2}' .
		'|[\xF1-\xF3][\x80-\xBF]{3}' .
		'|\xF4[\x80-\x8F][\x80-\xBF]{2}' .
		')!s';

	$stripped = preg_replace($regex, '', $value);

	// preg_replace() returns null on a PCRE failure; keep the original text in that
	// case rather than silently replacing the value with null
	return $stripped === null ? $value : $stripped;
}
/** /**
* Sanitize value to string * Sanitize value to string
* *

View File

@@ -63,6 +63,14 @@ class WireDatabasePDO extends Wire implements WireDatabase {
*/ */
protected $init = false; protected $init = false;
/**
* Strip 4-byte characters in “quote” and “escapeStr” methods? (only when dbEngine is not utf8mb4)
*
* @var bool
*
*/
protected $stripMB4 = false;
/** /**
* PDO connection settings * PDO connection settings
* *
@@ -167,6 +175,7 @@ class WireDatabasePDO extends Wire implements WireDatabase {
if($this->init || !$this->isWired()) return; if($this->init || !$this->isWired()) return;
$this->init = true; $this->init = true;
$config = $this->wire('config'); $config = $this->wire('config');
$this->stripMB4 = $config->dbStripMB4 && strtolower($config->dbEngine) != 'utf8mb4';
$this->queryLogMax = (int) $config->dbQueryLogMax; $this->queryLogMax = (int) $config->dbQueryLogMax;
$sqlModes = $config->dbSqlModes; $sqlModes = $config->dbSqlModes;
if(is_array($sqlModes)) { if(is_array($sqlModes)) {
@@ -631,7 +640,7 @@ class WireDatabasePDO extends Wire implements WireDatabase {
* *
*/ */
public function escapeStr($str) {
	// quote() wraps the escaped string in quote characters; drop the first and last character
	$quoted = $this->quote($str);
	return substr($quoted, 1, -1);
}
/** /**
@@ -660,6 +669,9 @@ class WireDatabasePDO extends Wire implements WireDatabase {
* *
*/ */
public function quote($str) {
	// only non-empty strings are filtered, and only when MB4 stripping is active
	$needsStrip = $this->stripMB4 && is_string($str) && !empty($str);
	if($needsStrip) $str = $this->wire('sanitizer')->removeMB4($str);
	return $this->pdo()->quote($str);
}