From a33cd12f1af535fd0eff42a97a497b48dac62691 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Fri, 18 Aug 2006 17:49:33 +0000 Subject: [PATCH] Fixed broken multibyte numeric entity conversion in Lexer::substituteNonSpecialEntities() git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@299 48356398-32a2-884e-a903-53898d9a118a --- NEWS | 1 + library/HTMLPurifier/Lexer.php | 62 ++++++++++++++++++++++++++++++-- tests/HTMLPurifier/LexerTest.php | 49 +++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 179f4103..39955416 100644 --- a/NEWS +++ b/NEWS @@ -2,6 +2,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||| 1.0.0rc1, released 2006-??-?? +- Fixed broken numeric entity conversion 1.0.0beta, released 2006-08-16 - First public release, most functionality implemented. Notable omissions are: diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index ae012ed1..cb4f0fcc 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -172,21 +172,77 @@ class HTMLPurifier_Lexer * * @warning Though this is public in order to let the callback happen, * calling it directly is not recommended. + * @note Based on Feyd's function at + * , + * which is in public domain. + * @note While we're going to do code point parsing anyway, a good + * optimization would be to refuse to translate code points that + * are non-SGML characters. However, this could lead to duplication. * @param $matches PCRE matches array, with 0 the entire match, and * either index 1, 2 or 3 set with a hex value, dec value, * or string (respectively). * @returns Replacement string. * @todo Implement string translations */ + + // +----------+----------+----------+----------+ + // | 33222222 | 22221111 | 111111 | | + // | 10987654 | 32109876 | 54321098 | 76543210 | bit + // +----------+----------+----------+----------+ + // | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F + // | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF + // | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF + // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF + // +----------+----------+----------+----------+ + // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF) + // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes + // +----------+----------+----------+----------+ + function nonSpecialEntityCallback($matches) { // replaces all but big five $entity = $matches[0]; $is_num = (@$matches[0][1] === '#'); if ($is_num) { $is_hex = (@$entity[2] === 'x'); - $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; - if (isset($this->_special_dec2str[$int])) return $entity; - return chr($int); + $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; + + // abort for special characters + if (isset($this->_special_dec2str[$code])) return $entity; + + if($code > 1114111 or $code < 0 or + ($code >= 55296 and $code <= 57343) ) { + // bits are set outside the "valid" range as defined + // by UNICODE 4.1.0 + return ''; + } + + $x = $y = $z = $w = 0; + if ($code < 128) { + // regular ASCII character + $x = $code; + } else { + // set up bits for UTF-8 + $x = ($code & 63) | 128; + if ($code < 2048) { + $y = (($code & 2047) >> 6) | 192; + } else { + $y = (($code & 4032) >> 6) | 128; + if($code < 65536) { + $z = (($code >> 12) & 15) | 224; + } else { + $z = (($code >> 12) & 63) | 128; + $w = (($code >> 18) & 7) | 240; + } + } + } + // set up the actual character + $ret = ''; + if($w) $ret .= chr($w); + if($z) $ret .= chr($z); + if($y) $ret .= chr($y); + $ret .= chr($x); + + return $ret; } else { if (isset($this->_special_ent2dec[$matches[3]])) return $entity; if (!$this->_entity_lookup) { diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 97e44af4..8f65f88a 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -38,6 +38,55 @@ class HTMLPurifier_LexerTest extends UnitTestCase $this->Lexer->substituteNonSpecialEntities('θ') ); $this->assertIdentical('"', $this->Lexer->substituteNonSpecialEntities('"') ); + + // numeric tests, adapted from Feyd + $args = array(); + $args[] = array(1114112,false ); + $args[] = array(1114111,'F48FBFBF'); // 0x0010FFFF + $args[] = array(1048576,'F4808080'); // 0x00100000 + $args[] = array(1048575,'F3BFBFBF'); // 0x000FFFFF + $args[] = array(262144, 'F1808080'); // 0x00040000 + $args[] = array(262143, 'F0BFBFBF'); // 0x0003FFFF + $args[] = array(65536, 'F0908080'); // 0x00010000 + $args[] = array(65535, 'EFBFBF' ); // 0x0000FFFF + $args[] = array(57344, 'EE8080' ); // 0x0000E000 + $args[] = array(57343, false ); // 0x0000DFFF these are ill-formed + $args[] = array(56040, false ); // 0x0000DAE8 these are ill-formed + $args[] = array(55296, false ); // 0x0000D800 these are ill-formed + $args[] = array(55295, 'ED9FBF' ); // 0x0000D7FF + $args[] = array(53248, 'ED8080' ); // 0x0000D000 + $args[] = array(53247, 'ECBFBF' ); // 0x0000CFFF + $args[] = array(4096, 'E18080' ); // 0x00001000 + $args[] = array(4095, 'E0BFBF' ); // 0x00000FFF + $args[] = array(2048, 'E0A080' ); // 0x00000800 + $args[] = array(2047, 'DFBF' ); // 0x000007FF + $args[] = array(128, 'C280' ); // 0x00000080 invalid SGML char + $args[] = array(127, '7F' ); // 0x0000007F invalid SGML char + $args[] = array(0, '00' ); // 0x00000000 invalid SGML char + + $args[] = array(20108, 'E4BA8C' ); // 0x00004E8C + $args[] = array(77, '4D' ); // 0x0000004D + $args[] = array(66306, 'F0908C82'); // 0x00010302 + $args[] = array(1072, 'D0B0' ); // 0x00000430 + + foreach ($args as $arg) { + $string = '&#' . $arg[0] . ';' . // decimal + '&#x' . dechex($arg[0]) . ';'; // hex + $expect = ''; + if ($arg[1] !== false) { + $chars = str_split($arg[1], 2); + foreach ($chars as $char) { + $expect .= chr(hexdec($char)); + } + $expect .= $expect; // double it + } + $this->assertIdentical( + $this->Lexer->substituteNonSpecialEntities($string), + $expect, + $arg[0] . ': %s' + ); + } + } function assertExtractBody($text, $extract = true) {