diff --git a/docs/examples/demo.php b/docs/examples/demo.php
index b0a80d52..07630078 100644
--- a/docs/examples/demo.php
+++ b/docs/examples/demo.php
@@ -62,7 +62,7 @@ if (isset($_GET['profile']) || isset($_GET['XDEBUG_PROFILE'])) {
if (isset($html)) {
echo htmlspecialchars(
- HTMLPurifier_Lexer::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
+ HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
}
?>
diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php
index c6611add..7596b0ed 100644
--- a/library/HTMLPurifier.php
+++ b/library/HTMLPurifier.php
@@ -28,48 +28,6 @@ require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/Strategy/Core.php';
-/*
-
-// Darn you fellas still using ISO-8859-1! It would be so easy for me
-// to just drop the characters that can't be expressed this way, but I'm
-// a stickler for code quality, so I won't do that to you. You'll have
-// to wait for this functionality to be implemented later.
-
-HTMLPurifier_ConfigDef::define(
- 'Core', 'Encoding', 'utf-8', 'istring',
- 'Set this to the encoding your webpages are served as. This defines '.
- 'the encoding the HTMLPurifier will convert to and from before passing '.
- 'the text back to you. Note that although we offer full HTML document '.
- 'parsing functionality, we ignore meta tags in such documents, because '.
- 'most modern browsers have already re-encoded the file in the correct '.
- 'encoding (though it did not change the meta tag). '.
- 'Since browsers do not alter file uploads, '.
- 'HTML from a file will fail fantastically if its real encoding is does '.
- 'match the encoding passed here (which is often the case).'
-);
-
-if ( !function_exists('iconv') ) {
-
- // these are the only encodings we offer native PHP support for.
- // if iconv is enabled, iconv's encoding support dictates what we can
- // use.
-
- HTMLPurifier_ConfigDef::defineAllowedValues(
- 'Core', 'Encoding', array(
- 'utf-8',
- 'iso-8859-1'
- )
- );
- HTMLPurifier_ConfigDef::defineValueAliases(
- 'Core', 'Encoding', array(
- 'iso8859-1' => 'iso-8859-1'
- )
- );
-
-}
-
-*/
-
/**
* Main library execution class.
*
diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php
new file mode 100644
index 00000000..ed02b4ef
--- /dev/null
+++ b/library/HTMLPurifier/Encoder.php
@@ -0,0 +1,366 @@
+ '"',
+ 38 => '&',
+ 39 => "'",
+ 60 => '<',
+ 62 => '>'
+ );
+
+ /**
+ * Stripped entity names to decimal conversion table for special entities.
+ * @protected
+ */
+ var $_special_ent2dec =
+ array(
+ 'quot' => 34,
+ 'amp' => 38,
+ 'lt' => 60,
+ 'gt' => 62
+ );
+
+    /**
+     * Replaces every entity except the special ones with its parsed
+     * equivalent. Doing this for each parsed character run would be
+     * wasteful, so it is run before everything else.
+     *
+     * @protected
+     * @param $string String to have non-special entities parsed.
+     * @returns Parsed string.
+     */
+    function substituteNonSpecialEntities($string) {
+        // it will try to detect missing semicolons, but don't rely on it
+        $pattern  = $this->_substituteEntitiesRegex;
+        $callback = array($this, 'nonSpecialEntityCallback');
+        return preg_replace_callback($pattern, $callback, $string);
+    }
+
+    /**
+     * Callback function for substituteNonSpecialEntities() that does the work.
+     *
+     * Numeric entities are encoded as UTF-8 bytes, named entities are looked
+     * up in the HTMLPurifier_EntityLookup table (loaded on first use). The
+     * special entities are handed back untouched in either form.
+     *
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @note Based on Feyd's function, which is in public domain.
+     * @note While we're going to do code point parsing anyway, a good
+     *       optimization would be to refuse to translate code points that
+     *       are non-SGML characters. However, this could lead to duplication.
+     * @param $matches PCRE matches array, with 0 the entire match, and
+     *                 either index 1, 2 or 3 set with a hex value, dec value,
+     *                 or string (respectively).
+     * @returns Replacement string.
+     */
+
+    // +----------+----------+----------+----------+
+    // | 33222222 | 22221111 | 111111   |          |
+    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
+    // +----------+----------+----------+----------+
+    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
+    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
+    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
+    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
+    // +----------+----------+----------+----------+
+    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
+    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
+    // +----------+----------+----------+----------+
+
+    function nonSpecialEntityCallback($matches) {
+        // replaces all but big five
+        $entity = $matches[0];
+
+        if (@$entity[1] !== '#') {
+            // named entity: leave the special ones alone, look up the rest
+            $name = $matches[3];
+            if (isset($this->_special_ent2dec[$name])) return $entity;
+            if (!$this->_entity_lookup) {
+                require_once 'HTMLPurifier/EntityLookup.php';
+                $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+            }
+            return isset($this->_entity_lookup->table[$name]) ?
+                $this->_entity_lookup->table[$name] :
+                $entity;
+        }
+
+        // numeric entity: hex digits live in group 1, decimal in group 2
+        if (@$entity[2] === 'x') {
+            $code = hexdec($matches[1]);
+        } else {
+            $code = (int) $matches[2];
+        }
+
+        // abort for special characters
+        if (isset($this->_special_dec2str[$code])) return $entity;
+
+        // bits are set outside the "valid" range as defined
+        // by UNICODE 4.1.0 (this includes the surrogate block)
+        $is_surrogate = ($code >= 55296 && $code <= 57343);
+        if ($code < 0 || $code > 1114111 || $is_surrogate) return '';
+
+        // regular ASCII character
+        if ($code < 128) return chr($code);
+
+        // every multi-byte form ends with the low six bits
+        $tail = chr(($code & 63) | 128);
+        if ($code < 2048) {
+            // two bytes: 110yyyyy 10xxxxxx
+            return chr((($code & 2047) >> 6) | 192) . $tail;
+        }
+
+        $tail = chr((($code & 4032) >> 6) | 128) . $tail;
+        if ($code < 65536) {
+            // three bytes: 1110zzzz 10yyyyyy 10xxxxxx
+            return chr((($code >> 12) & 15) | 224) . $tail;
+        }
+
+        // four bytes: 11110www 10wwzzzz 10yyyyyy 10xxxxxx
+        return chr((($code >> 18) & 7) | 240) .
+               chr((($code >> 12) & 63) | 128) .
+               $tail;
+    }
+
+    /**
+     * Cleans a UTF-8 string for well-formedness and SGML validity
+     *
+     * It will parse according to UTF-8 and return a valid UTF8 string, with
+     * non-SGML codepoints excluded. Malformed sequences and the byte order
+     * mark (U+FEFF) are dropped as well.
+     *
+     * @warning This function can find a lot of use, so we may be moving
+     *          it to a dedicated class.
+     *
+     * @note Just for reference, the non-SGML code points are 0 to 31 and
+     *       127 to 159, inclusive. However, we allow code points 9, 10
+     *       and 13, which are the tab, line feed and carriage return
+     *       respectively. 128 and above the code points map to multibyte
+     *       UTF-8 representations.
+     *
+     * @note The functionality provided by the original function could be
+     *       implemented with iconv using 'UTF-8//IGNORE', mbstring, or
+     *       even the PCRE modifier 'u', but these do not allow us to strip
+     *       control characters or disallowed code points, and the latter
+     *       does not allow invalid UTF8 characters to be ignored.
+     *
+     * @note Decomposing the string into Unicode code points is necessary
+     *       because SGML disallows the use of specific code points, not
+     *       necessarily bytes. A naive implementation that simply strtr
+     *       disallowed code points as bytes will break other Unicode
+     *       characters in which using such bytes is valid.
+     *
+     * @note Code adapted from utf8ToUnicode by Henri Sivonen
+     *       (hsivonen@iki.fi), used under the
+     *       LGPL license. Notes on what changed are inside.
+     *
+     * @param $str String to clean, processed byte by byte.
+     * @returns String containing only the characters that were kept.
+     */
+    function cleanUTF8($str) {
+        $mState = 0; // cached expected number of octets after the current octet
+                     // until the beginning of the next UTF8 character sequence
+        $mUcs4  = 0; // cached Unicode character
+        $mBytes = 1; // cached expected number of octets in the current sequence
+
+        // original code involved an $out that was an array of Unicode
+        // codepoints. Instead of having to convert back into UTF-8, we've
+        // decided to directly append valid UTF-8 characters onto a string
+        // $out once they're done. $char accumulates raw bytes, while $mUcs4
+        // turns into the Unicode code point, so there's some redundancy.
+
+        $out = '';
+        $char = '';
+
+        $len = strlen($str);
+        for($i = 0; $i < $len; $i++) {
+            // use square-bracket offsets only: the $str{$i} form is
+            // deprecated as of PHP 7.4 and is a parse error in PHP 8
+            $in = ord($str[$i]);
+            $char .= $str[$i]; // append byte to char
+            if (0 == $mState) {
+                // When mState is zero we expect either a US-ASCII character
+                // or a multi-octet sequence.
+                if (0 == (0x80 & ($in))) {
+                    // US-ASCII, pass straight through.
+                    if (($in <= 31 || $in == 127) &&
+                        !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
+                    ) {
+                        // control characters, remove
+                    } else {
+                        $out .= $char;
+                    }
+                    // reset
+                    $char = '';
+                    $mBytes = 1;
+                } elseif (0xC0 == (0xE0 & ($in))) {
+                    // First octet of 2 octet sequence
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x1F) << 6;
+                    $mState = 1;
+                    $mBytes = 2;
+                } elseif (0xE0 == (0xF0 & ($in))) {
+                    // First octet of 3 octet sequence
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x0F) << 12;
+                    $mState = 2;
+                    $mBytes = 3;
+                } elseif (0xF0 == (0xF8 & ($in))) {
+                    // First octet of 4 octet sequence
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x07) << 18;
+                    $mState = 3;
+                    $mBytes = 4;
+                } elseif (0xF8 == (0xFC & ($in))) {
+                    // First octet of 5 octet sequence.
+                    //
+                    // This is illegal because the encoded codepoint must be
+                    // either:
+                    // (a) not the shortest form or
+                    // (b) outside the Unicode range of 0-0x10FFFF.
+                    // Rather than trying to resynchronize, we will carry on
+                    // until the end of the sequence and let the later error
+                    // handling code catch it.
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 0x03) << 24;
+                    $mState = 4;
+                    $mBytes = 5;
+                } elseif (0xFC == (0xFE & ($in))) {
+                    // First octet of 6 octet sequence, see comments for 5
+                    // octet sequence.
+                    $mUcs4 = ($in);
+                    $mUcs4 = ($mUcs4 & 1) << 30;
+                    $mState = 5;
+                    $mBytes = 6;
+                } else {
+                    // Current octet is neither in the US-ASCII range nor a
+                    // legal first octet of a multi-octet sequence.
+                    $mState = 0;
+                    $mUcs4  = 0;
+                    $mBytes = 1;
+                    $char = '';
+                }
+            } else {
+                // When mState is non-zero, we expect a continuation of the
+                // multi-octet sequence
+                if (0x80 == (0xC0 & ($in))) {
+                    // Legal continuation.
+                    $shift = ($mState - 1) * 6;
+                    $tmp = $in;
+                    $tmp = ($tmp & 0x0000003F) << $shift;
+                    $mUcs4 |= $tmp;
+
+                    if (0 == --$mState) {
+                        // End of the multi-octet sequence. mUcs4 now contains
+                        // the final Unicode codepoint to be output
+
+                        // Check for illegal sequences and codepoints.
+
+                        // From Unicode 3.1, non-shortest form is illegal
+                        if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
+                            ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
+                            ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
+                            (4 < $mBytes) ||
+                            // From Unicode 3.2, surrogate characters = illegal
+                            (($mUcs4 & 0xFFFFF800) == 0xD800) ||
+                            // Codepoints outside the Unicode range are illegal
+                            ($mUcs4 > 0x10FFFF)
+                        ) {
+                            // illegal sequence, drop it
+                        } elseif (0xFEFF != $mUcs4 && // omit BOM
+                            !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
+                        ) {
+                            $out .= $char;
+                        }
+                        // initialize UTF8 cache (reset)
+                        $mState = 0;
+                        $mUcs4  = 0;
+                        $mBytes = 1;
+                        $char = '';
+                    }
+                } else {
+                    // ((0xC0 & (*in) != 0x80) && (mState != 0))
+                    // Incomplete multi-octet sequence.
+                    // used to result in complete fail, but we'll reset;
+                    // the offending octet is discarded along with the
+                    // partial sequence
+                    $mState = 0;
+                    $mUcs4  = 0;
+                    $mBytes = 1;
+                    $char = '';
+                }
+            }
+        }
+        return $out;
+    }
+
+    /**
+     * Substitutes only special entities with their parsed equivalents.
+     *
+     * @notice We try to avoid calling this function because otherwise, it
+     *         would have to be called a lot (for every parsed section).
+     *
+     * @protected
+     * @param $string String to have special entities parsed.
+     * @returns Parsed string.
+     */
+    function substituteSpecialEntities($string) {
+        // Bind the callback to this instance: specialEntityCallback() reads
+        // $this->_special_dec2str and $this->_special_ent2dec, so it must
+        // not be invoked statically through the class name.
+        return preg_replace_callback(
+            $this->_substituteEntitiesRegex,
+            array($this, 'specialEntityCallback'),
+            $string);
+    }
+
+    /**
+     * Callback function for substituteSpecialEntities() that does the work.
+     *
+     * This callback has same syntax as nonSpecialEntityCallback(). Special
+     * entities, numeric or named, are replaced by the character they stand
+     * for; anything else is returned unchanged.
+     *
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches PCRE-style matches array, with 0 the entire match, and
+     *                 either index 1, 2 or 3 set with a hex value, dec value,
+     *                 or string (respectively).
+     * @returns Replacement string.
+     */
+    function specialEntityCallback($matches) {
+        $entity = $matches[0];
+        $is_num = (@$matches[0][1] === '#');
+        if ($is_num) {
+            $is_hex = (@$entity[2] === 'x');
+            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
+        } else {
+            // _special_ent2dec maps a name to its code point, not to the
+            // character, so go through _special_dec2str afterwards
+            if (!isset($this->_special_ent2dec[$matches[3]])) return $entity;
+            $int = $this->_special_ent2dec[$matches[3]];
+        }
+        return isset($this->_special_dec2str[$int]) ?
+            $this->_special_dec2str[$int] :
+            $entity;
+    }
+
+}
+
+?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
index 1dd984b6..031e8e3d 100644
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -1,6 +1,7 @@
_encoder = new HTMLPurifier_Encoder();
+ }
+
+ var $_encoder;
+
/**
* Lexes an HTML string into tokens.
*
@@ -101,168 +108,6 @@ class HTMLPurifier_Lexer
return $lexer;
}
- /**
- * Decimal to parsed string conversion table for special entities.
- * @protected
- */
- var $_special_dec2str =
- array(
- 34 => '"',
- 38 => '&',
- 39 => "'",
- 60 => '<',
- 62 => '>'
- );
-
- /**
- * Stripped entity names to decimal conversion table for special entities.
- * @protected
- */
- var $_special_ent2dec =
- array(
- 'quot' => 34,
- 'amp' => 38,
- 'lt' => 60,
- 'gt' => 62
- );
-
- /**
- * Most common entity to raw value conversion table for special entities.
- * @protected
- */
- var $_special_entity2str =
- array(
- '"' => '"',
- '&' => '&',
- '<' => '<',
- '>' => '>',
- ''' => "'",
- ''' => "'",
- ''' => "'"
- );
-
- /**
- * Callback regex string for parsing entities.
- * @protected
- */
- var $_substituteEntitiesRegex =
-'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';
-// 1. hex 2. dec 3. string
-
- /**
- * Substitutes non-special entities with their parsed equivalents. Since
- * running this whenever you have parsed character is t3h 5uck, we run
- * it before everything else.
- *
- * @protected
- * @param $string String to have non-special entities parsed.
- * @returns Parsed string.
- */
- function substituteNonSpecialEntities($string) {
- // it will try to detect missing semicolons, but don't rely on it
- return preg_replace_callback(
- $this->_substituteEntitiesRegex,
- array($this, 'nonSpecialEntityCallback'),
- $string
- );
- }
-
- /**
- * Callback function for substituteNonSpecialEntities() that does the work.
- *
- * @warning Though this is public in order to let the callback happen,
- * calling it directly is not recommended.
- * @note Based on Feyd's function at
- * ,
- * which is in public domain.
- * @note While we're going to do code point parsing anyway, a good
- * optimization would be to refuse to translate code points that
- * are non-SGML characters. However, this could lead to duplication.
- * @param $matches PCRE matches array, with 0 the entire match, and
- * either index 1, 2 or 3 set with a hex value, dec value,
- * or string (respectively).
- * @returns Replacement string.
- * @todo Implement string translations
- */
-
- // +----------+----------+----------+----------+
- // | 33222222 | 22221111 | 111111 | |
- // | 10987654 | 32109876 | 54321098 | 76543210 | bit
- // +----------+----------+----------+----------+
- // | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
- // | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
- // | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
- // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
- // +----------+----------+----------+----------+
- // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
- // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
- // +----------+----------+----------+----------+
-
- function nonSpecialEntityCallback($matches) {
- // replaces all but big five
- $entity = $matches[0];
- $is_num = (@$matches[0][1] === '#');
- if ($is_num) {
- $is_hex = (@$entity[2] === 'x');
- $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
-
- // abort for special characters
- if (isset($this->_special_dec2str[$code])) return $entity;
-
- if($code > 1114111 or $code < 0 or
- ($code >= 55296 and $code <= 57343) ) {
- // bits are set outside the "valid" range as defined
- // by UNICODE 4.1.0
- return '';
- }
-
- $x = $y = $z = $w = 0;
- if ($code < 128) {
- // regular ASCII character
- $x = $code;
- } else {
- // set up bits for UTF-8
- $x = ($code & 63) | 128;
- if ($code < 2048) {
- $y = (($code & 2047) >> 6) | 192;
- } else {
- $y = (($code & 4032) >> 6) | 128;
- if($code < 65536) {
- $z = (($code >> 12) & 15) | 224;
- } else {
- $z = (($code >> 12) & 63) | 128;
- $w = (($code >> 18) & 7) | 240;
- }
- }
- }
- // set up the actual character
- $ret = '';
- if($w) $ret .= chr($w);
- if($z) $ret .= chr($z);
- if($y) $ret .= chr($y);
- $ret .= chr($x);
-
- return $ret;
- } else {
- if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
- if (!$this->_entity_lookup) {
- require_once 'HTMLPurifier/EntityLookup.php';
- $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
- }
- if (isset($this->_entity_lookup->table[$matches[3]])) {
- return $this->_entity_lookup->table[$matches[3]];
- } else {
- return $entity;
- }
- }
- }
-
- /**
- * Contains a copy of the EntityLookup table.
- * @protected
- */
- var $_entity_lookup;
-
/**
* Translates CDATA sections into regular sections (through escaping).
*
@@ -305,170 +150,6 @@ class HTMLPurifier_Lexer
}
}
- /**
- * Cleans a UTF-8 string for well-formedness and SGML validity
- *
- * It will parse according to UTF-8 and return a valid UTF8 string, with
- * non-SGML codepoints excluded.
- *
- * @warning This function can find a lot of use, so we may be moving
- * it to a dedicated class.
- *
- * @note Just for reference, the non-SGML code points are 0 to 31 and
- * 127 to 159, inclusive. However, we allow code points 9, 10
- * and 13, which are the tab, line feed and carriage return
- * respectively. 128 and above the code points map to multibyte
- * UTF-8 representations.
- *
- * @note The functionality provided by the original function could be
- * implemented with iconv using 'UTF-8//IGNORE', mbstring, or
- * even the PCRE modifier 'u', these do not allow us to strip
- * control characters or disallowed code points, and the latter
- * does not allow invalid UTF8 characters to be ignored.
- *
- * @note Decomposing the string into Unicode code points is necessary
- * because SGML disallows the use of specific code points, not
- * necessarily bytes. A naive implementation that simply strtr
- * disallowed code points as bytes will break other Unicode
- * characters in which using such bytes is valid.
- *
- * @note Code adapted from utf8ToUnicode by Henri Sivonen and
- * hsivonen@iki.fi at under the
- * LGPL license. Notes on what changed are inside.
- */
- function cleanUTF8($str) {
- $mState = 0; // cached expected number of octets after the current octet
- // until the beginning of the next UTF8 character sequence
- $mUcs4 = 0; // cached Unicode character
- $mBytes = 1; // cached expected number of octets in the current sequence
-
- // original code involved an $out that was an array of Unicode
- // codepoints. Instead of having to convert back into UTF-8, we've
- // decided to directly append valid UTF-8 characters onto a string
- // $out once they're done. $char accumulates raw bytes, while $mUcs4
- // turns into the Unicode code point, so there's some redundancy.
-
- $out = '';
- $char = '';
-
- $len = strlen($str);
- for($i = 0; $i < $len; $i++) {
- $in = ord($str{$i});
- $char .= $str[$i]; // append byte to char
- if (0 == $mState) {
- // When mState is zero we expect either a US-ASCII character
- // or a multi-octet sequence.
- if (0 == (0x80 & ($in))) {
- // US-ASCII, pass straight through.
- if (($in <= 31 || $in == 127) &&
- !($in == 9 || $in == 13 || $in == 10) // save \r\t\n
- ) {
- // control characters, remove
- } else {
- $out .= $char;
- }
- // reset
- $char = '';
- $mBytes = 1;
- } elseif (0xC0 == (0xE0 & ($in))) {
- // First octet of 2 octet sequence
- $mUcs4 = ($in);
- $mUcs4 = ($mUcs4 & 0x1F) << 6;
- $mState = 1;
- $mBytes = 2;
- } elseif (0xE0 == (0xF0 & ($in))) {
- // First octet of 3 octet sequence
- $mUcs4 = ($in);
- $mUcs4 = ($mUcs4 & 0x0F) << 12;
- $mState = 2;
- $mBytes = 3;
- } elseif (0xF0 == (0xF8 & ($in))) {
- // First octet of 4 octet sequence
- $mUcs4 = ($in);
- $mUcs4 = ($mUcs4 & 0x07) << 18;
- $mState = 3;
- $mBytes = 4;
- } elseif (0xF8 == (0xFC & ($in))) {
- // First octet of 5 octet sequence.
- //
- // This is illegal because the encoded codepoint must be
- // either:
- // (a) not the shortest form or
- // (b) outside the Unicode range of 0-0x10FFFF.
- // Rather than trying to resynchronize, we will carry on
- // until the end of the sequence and let the later error
- // handling code catch it.
- $mUcs4 = ($in);
- $mUcs4 = ($mUcs4 & 0x03) << 24;
- $mState = 4;
- $mBytes = 5;
- } elseif (0xFC == (0xFE & ($in))) {
- // First octet of 6 octet sequence, see comments for 5
- // octet sequence.
- $mUcs4 = ($in);
- $mUcs4 = ($mUcs4 & 1) << 30;
- $mState = 5;
- $mBytes = 6;
- } else {
- // Current octet is neither in the US-ASCII range nor a
- // legal first octet of a multi-octet sequence.
- $mState = 0;
- $mUcs4 = 0;
- $mBytes = 1;
- $char = '';
- }
- } else {
- // When mState is non-zero, we expect a continuation of the
- // multi-octet sequence
- if (0x80 == (0xC0 & ($in))) {
- // Legal continuation.
- $shift = ($mState - 1) * 6;
- $tmp = $in;
- $tmp = ($tmp & 0x0000003F) << $shift;
- $mUcs4 |= $tmp;
-
- if (0 == --$mState) {
- // End of the multi-octet sequence. mUcs4 now contains
- // the final Unicode codepoint to be output
-
- // Check for illegal sequences and codepoints.
-
- // From Unicode 3.1, non-shortest form is illegal
- if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
- ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
- ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
- (4 < $mBytes) ||
- // From Unicode 3.2, surrogate characters = illegal
- (($mUcs4 & 0xFFFFF800) == 0xD800) ||
- // Codepoints outside the Unicode range are illegal
- ($mUcs4 > 0x10FFFF)
- ) {
-
- } elseif (0xFEFF != $mUcs4 && // omit BOM
- !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
- ) {
- $out .= $char;
- }
- // initialize UTF8 cache (reset)
- $mState = 0;
- $mUcs4 = 0;
- $mBytes = 1;
- $char = '';
- }
- } else {
- // ((0xC0 & (*in) != 0x80) && (mState != 0))
- // Incomplete multi-octet sequence.
- // used to result in complete fail, but we'll reset
- $mState = 0;
- $mUcs4 = 0;
- $mBytes = 1;
- $char ='';
- }
- }
- }
- return $out;
- }
-
}
?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index 230e694e..e408fa84 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -30,6 +30,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
public function __construct() {
// setup the factory
+ parent::HTMLPurifier_Lexer();
$this->factory = new HTMLPurifier_TokenFactory();
}
@@ -50,10 +51,10 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
// substitute non-special entities. While DOM is perfectly capable
// of doing this, we need to get at the UTF-8 characters in
// cleanUTF8
- $string = $this->substituteNonSpecialEntities($string);
+ $string = $this->_encoder->substituteNonSpecialEntities($string);
// clean it into well-formed UTF-8 string
- $string = $this->cleanUTF8($string);
+ $string = $this->_encoder->cleanUTF8($string);
// preprocess string, essential for UTF-8
$string =
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index aa1250df..6951c491 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -20,6 +20,21 @@ require_once 'HTMLPurifier/Lexer.php';
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{
+    /**
+     * Most common entity to raw value conversion table for special entities.
+     * Keys are the entity text as it appears in markup, values are the
+     * characters they stand for; the apostrophe is listed in three numeric
+     * spellings.
+     * @protected
+     */
+    var $_special_entity2str =
+            array(
+                    '&quot;' => '"',
+                    '&amp;'  => '&',
+                    '&lt;'   => '<',
+                    '&gt;'   => '>',
+                    '&#39;'  => "'",
+                    '&#039;' => "'",
+                    '&#x27;' => "'"
+            );
+
/**
* Parses special entities into the proper characters.
*
@@ -51,7 +66,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if ($num_amp_2 <= $num_esc_amp) return $string;
// hmm... now we have some uncommon entities. Use the callback.
- $string = $this->substituteSpecialEntities($string);
+ $string = $this->_encoder->substituteSpecialEntities($string);
return $string;
}
@@ -61,51 +76,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
*/
var $_whitespace = "\x20\x09\x0D\x0A";
- /**
- * Substitutes only special entities with their parsed equivalents.
- *
- * @notice We try to avoid calling this function because otherwise, it
- * would have to be called a lot (for every parsed section).
- *
- * @protected
- * @param $string String to have non-special entities parsed.
- * @returns Parsed string.
- */
- function substituteSpecialEntities($string) {
- return preg_replace_callback(
- $this->_substituteEntitiesRegex,
- array('HTMLPurifier_Lexer_DirectLex', 'specialEntityCallback'),
- $string);
- }
-
- /**
- * Callback function for substituteSpecialEntities() that does the work.
- *
- * This callback has same syntax as nonSpecialEntityCallback().
- *
- * @warning Though this is public in order to let the callback happen,
- * calling it directly is not recommended.
- * @param $matches PCRE-style matches array, with 0 the entire match, and
- * either index 1, 2 or 3 set with a hex value, dec value,
- * or string (respectively).
- * @returns Replacement string.
- */
- function specialEntityCallback($matches) {
- $entity = $matches[0];
- $is_num = (@$matches[0][1] === '#');
- if ($is_num) {
- $is_hex = (@$entity[2] === 'x');
- $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
- return isset($this->_special_dec2str[$int]) ?
- $this->_special_dec2str[$int] :
- $entity;
- } else {
- return isset($this->_special_ent2dec[$matches[3]]) ?
- $this->_special_ent2dec[$matches[3]] :
- $entity;
- }
- }
-
function tokenizeHTML($string, $config = null) {
if (!$config) $config = HTMLPurifier_Config::createDefault();
@@ -126,10 +96,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$string = $this->escapeCDATA($string);
// expand entities THAT AREN'T THE BIG FIVE
- $string = $this->substituteNonSpecialEntities($string);
+ $string = $this->_encoder->substituteNonSpecialEntities($string);
// clean it into wellformed UTF-8 string
- $string = $this->cleanUTF8($string);
+ $string = $this->_encoder->cleanUTF8($string);
// infinite loop protection
// has to be pretty big, since html docs can be big
diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php
index a0bdfe66..c042d2f9 100644
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -35,8 +35,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
if ($config->get('Core', 'AcceptFullDocuments')) {
$string = $this->extractBody($string);
}
- $string = $this->substituteNonSpecialEntities($string);
- $string = $this->cleanUTF8($string);
+ $string = $this->_encoder->substituteNonSpecialEntities($string);
+ $string = $this->_encoder->cleanUTF8($string);
$parser=& new XML_HTMLSax3();
$parser->set_object($this);
$parser->set_element_handler('openHandler','closeHandler');
diff --git a/smoketests/common.php b/smoketests/common.php
index a6a8c146..e01d7500 100644
--- a/smoketests/common.php
+++ b/smoketests/common.php
@@ -6,7 +6,7 @@ set_include_path('../library' . PATH_SEPARATOR . get_include_path());
require_once 'HTMLPurifier.php';
function escapeHTML($string) {
- $string = HTMLPurifier_Lexer::cleanUTF8($string);
+ $string = HTMLPurifier_Encoder::cleanUTF8($string);
$string = htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
return $string;
}
diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php
new file mode 100644
index 00000000..7b8d7998
--- /dev/null
+++ b/tests/HTMLPurifier/EncoderTest.php
@@ -0,0 +1,96 @@
+Encoder = new HTMLPurifier_Encoder();
+ $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+ }
+
+    /**
+     * Asserts that cleanUTF8() turns $string into $expect. When $expect is
+     * omitted, the string is expected to come back unchanged.
+     */
+    function assertCleanUTF8($string, $expect = null) {
+        $wanted = ($expect === null) ? $string : $expect;
+        $actual = $this->Encoder->cleanUTF8($string);
+        $this->assertIdentical($actual, $wanted);
+    }
+
+    /**
+     * Runs cleanUTF8() over a table of inputs. Each case is array(input) or
+     * array(input, expected); a missing expectation means the input must
+     * survive unchanged.
+     */
+    function test_cleanUTF8() {
+        $cases = array(
+            array('Normal string.'),
+            array("Test\tAllowed\nControl\rCharacters"),
+            array("null byte: \0", 'null byte: '),
+            array("\1\2\3\4\5\6\7", ''),
+            array("\x7F", ''),         // one byte invalid SGML char
+            array("\xC2\x80", ''),     // two byte invalid SGML
+            array("\xF3\xBF\xBF\xBF"), // valid four byte
+            array("\xDF\xFF", '')      // malformed UTF8
+        );
+        foreach ($cases as $case) {
+            $expect = (count($case) > 1) ? $case[1] : null;
+            $this->assertCleanUTF8($case[0], $expect);
+        }
+    }
+
+    /**
+     * Checks named and numeric entity substitution. Every numeric case is
+     * fed in both its decimal and its hexadecimal spelling, so the expected
+     * byte sequence appears twice in the result; false marks code points
+     * that must be dropped.
+     */
+    function test_substituteNonSpecialEntities() {
+        $char_theta = $this->_entity_lookup->table['theta'];
+        $this->assertIdentical($char_theta,
+            $this->Encoder->substituteNonSpecialEntities('&theta;') );
+        $this->assertIdentical('"',
+            $this->Encoder->substituteNonSpecialEntities('"') );
+
+        // numeric tests, adapted from Feyd
+        $args = array();
+        $args[] = array(1114112, false     );
+        $args[] = array(1114111, 'F48FBFBF'); // 0x0010FFFF
+        $args[] = array(1048576, 'F4808080'); // 0x00100000
+        $args[] = array(1048575, 'F3BFBFBF'); // 0x000FFFFF
+        $args[] = array(262144,  'F1808080'); // 0x00040000
+        $args[] = array(262143,  'F0BFBFBF'); // 0x0003FFFF
+        $args[] = array(65536,   'F0908080'); // 0x00010000
+        $args[] = array(65535,   'EFBFBF'  ); // 0x0000FFFF
+        $args[] = array(57344,   'EE8080'  ); // 0x0000E000
+        $args[] = array(57343,   false     ); // 0x0000DFFF  these are ill-formed
+        $args[] = array(56040,   false     ); // 0x0000DAE8  these are ill-formed
+        $args[] = array(55296,   false     ); // 0x0000D800  these are ill-formed
+        $args[] = array(55295,   'ED9FBF'  ); // 0x0000D7FF
+        $args[] = array(53248,   'ED8080'  ); // 0x0000D000
+        $args[] = array(53247,   'ECBFBF'  ); // 0x0000CFFF
+        $args[] = array(4096,    'E18080'  ); // 0x00001000
+        $args[] = array(4095,    'E0BFBF'  ); // 0x00000FFF
+        $args[] = array(2048,    'E0A080'  ); // 0x00000800
+        $args[] = array(2047,    'DFBF'    ); // 0x000007FF
+        $args[] = array(128,     'C280'    ); // 0x00000080  invalid SGML char
+        $args[] = array(127,     '7F'      ); // 0x0000007F  invalid SGML char
+        $args[] = array(0,       '00'      ); // 0x00000000  invalid SGML char
+
+        $args[] = array(20108,   'E4BA8C'  ); // 0x00004E8C
+        $args[] = array(77,      '4D'      ); // 0x0000004D
+        $args[] = array(66306,   'F0908C82'); // 0x00010302
+        $args[] = array(1072,    'D0B0'    ); // 0x00000430
+
+        foreach ($args as $arg) {
+            $string = '&#' . $arg[0] . ';' . // decimal
+                      '&#x' . dechex($arg[0]) . ';'; // hex
+            $expect = '';
+            if ($arg[1] !== false) {
+                // turn the hex dump into raw bytes
+                $chars = str_split($arg[1], 2);
+                foreach ($chars as $char) {
+                    $expect .= chr(hexdec($char));
+                }
+                $expect .= $expect; // double it
+            }
+            $this->assertIdentical(
+                $this->Encoder->substituteNonSpecialEntities($string),
+                $expect,
+                $arg[0] . ': %s'
+            );
+        }
+
+    }
+
+    /**
+     * Feeds specialEntityCallback() a decimal match for the apostrophe
+     * entity and expects the literal apostrophe back.
+     */
+    function test_specialEntityCallback() {
+
+        $this->assertIdentical("'",$this->Encoder->specialEntityCallback(
+            array('&#39;', null, '39', null) ));
+    }
+
+}
+
+?>
\ No newline at end of file
diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php
index d1e6f088..2ad14476 100644
--- a/tests/HTMLPurifier/Lexer/DirectLexTest.php
+++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php
@@ -11,13 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
$this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
}
- function test_specialEntityCallback() {
- $HP =& $this->DirectLex;
-
- $this->assertIdentical("'",$HP->specialEntityCallback(
- array(''', null, '39', null) ));
- }
-
function test_parseData() {
$HP =& $this->DirectLex;
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 09eec396..25fff13c 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -32,79 +32,6 @@ class HTMLPurifier_LexerTest extends UnitTestCase
}
- function assertCleanUTF8($string, $expect = null) {
- if ($expect === null) $expect = $string;
- $this->assertIdentical($this->Lexer->cleanUTF8($string), $expect);
- }
-
- function test_cleanUTF8() {
- $this->assertCleanUTF8('Normal string.');
- $this->assertCleanUTF8("Test\tAllowed\nControl\rCharacters");
- $this->assertCleanUTF8("null byte: \0", 'null byte: ');
- $this->assertCleanUTF8("\1\2\3\4\5\6\7", '');
- $this->assertCleanUTF8("\x7F", ''); // one byte invalid SGML char
- $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
- $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
- $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
- }
-
- function test_substituteNonSpecialEntities() {
- $char_theta = $this->_entity_lookup->table['theta'];
- $this->assertIdentical($char_theta,
- $this->Lexer->substituteNonSpecialEntities('θ') );
- $this->assertIdentical('"',
- $this->Lexer->substituteNonSpecialEntities('"') );
-
- // numeric tests, adapted from Feyd
- $args = array();
- $args[] = array(1114112,false );
- $args[] = array(1114111,'F48FBFBF'); // 0x0010FFFF
- $args[] = array(1048576,'F4808080'); // 0x00100000
- $args[] = array(1048575,'F3BFBFBF'); // 0x000FFFFF
- $args[] = array(262144, 'F1808080'); // 0x00040000
- $args[] = array(262143, 'F0BFBFBF'); // 0x0003FFFF
- $args[] = array(65536, 'F0908080'); // 0x00010000
- $args[] = array(65535, 'EFBFBF' ); // 0x0000FFFF
- $args[] = array(57344, 'EE8080' ); // 0x0000E000
- $args[] = array(57343, false ); // 0x0000DFFF these are ill-formed
- $args[] = array(56040, false ); // 0x0000DAE8 these are ill-formed
- $args[] = array(55296, false ); // 0x0000D800 these are ill-formed
- $args[] = array(55295, 'ED9FBF' ); // 0x0000D7FF
- $args[] = array(53248, 'ED8080' ); // 0x0000D000
- $args[] = array(53247, 'ECBFBF' ); // 0x0000CFFF
- $args[] = array(4096, 'E18080' ); // 0x00001000
- $args[] = array(4095, 'E0BFBF' ); // 0x00000FFF
- $args[] = array(2048, 'E0A080' ); // 0x00000800
- $args[] = array(2047, 'DFBF' ); // 0x000007FF
- $args[] = array(128, 'C280' ); // 0x00000080 invalid SGML char
- $args[] = array(127, '7F' ); // 0x0000007F invalid SGML char
- $args[] = array(0, '00' ); // 0x00000000 invalid SGML char
-
- $args[] = array(20108, 'E4BA8C' ); // 0x00004E8C
- $args[] = array(77, '4D' ); // 0x0000004D
- $args[] = array(66306, 'F0908C82'); // 0x00010302
- $args[] = array(1072, 'D0B0' ); // 0x00000430
-
- foreach ($args as $arg) {
- $string = '' . $arg[0] . ';' . // decimal
- '' . dechex($arg[0]) . ';'; // hex
- $expect = '';
- if ($arg[1] !== false) {
- $chars = str_split($arg[1], 2);
- foreach ($chars as $char) {
- $expect .= chr(hexdec($char));
- }
- $expect .= $expect; // double it
- }
- $this->assertIdentical(
- $this->Lexer->substituteNonSpecialEntities($string),
- $expect,
- $arg[0] . ': %s'
- );
- }
-
- }
-
function assertExtractBody($text, $extract = true) {
$result = $this->Lexer->extractBody($text);
if ($extract === true) $extract = $text;
diff --git a/tests/index.php b/tests/index.php
index 39ef5a6d..de57e739 100644
--- a/tests/index.php
+++ b/tests/index.php
@@ -86,6 +86,7 @@ $test_files[] = 'AttrTransform/BdoDirTest.php';
$test_files[] = 'AttrTransform/ImgRequiredTest.php';
$test_files[] = 'URISchemeRegistryTest.php';
$test_files[] = 'URISchemeTest.php';
+$test_files[] = 'EncoderTest.php';
if (version_compare(PHP_VERSION, '5', '>=')) {
$test_files[] = 'TokenFactoryTest.php';