1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-07-30 19:00:10 +02:00

Malformed UTF-8 and non-SGML character detection and cleaning implemented

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@303 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2006-08-19 17:53:59 +00:00
parent 53808ee34a
commit 973cc43b64
11 changed files with 131 additions and 58 deletions

View File

@@ -2,35 +2,51 @@
// pretty-printing with indentation would be pretty cool
require_once 'HTMLPurifier/Lexer.php';
HTMLPurifier_ConfigDef::define(
'Core', 'CleanUTF8DuringGeneration', false,
'When true, HTMLPurifier_Generator will also check all strings it '.
'escapes for UTF-8 well-formedness as a defense in depth measure. '.
'This could cause a considerable performance impact, and is not '.
'strictly necessary due to the fact that the Lexers should have '.
'ensured that all the UTF-8 strings were well-formed. Note that '.
'the configuration value is only read at the beginning of '.
'generateFromTokens.'
);
class HTMLPurifier_Generator
{
var $clean_utf8 = false;
// only unit tests may omit configuration: internals MUST pass config
function generateFromTokens($tokens, $config = null) {
$html = '';
if (!$config) $config = HTMLPurifier_Config::createDefault();
$this->clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration');
if (!$tokens) return '';
foreach ($tokens as $token) {
$html .= $this->generateFromToken($token, $config);
$html .= $this->generateFromToken($token);
}
return $html;
}
function generateFromToken($token, $config) {
function generateFromToken($token) {
if (!isset($token->type)) return '';
if ($token->type == 'start') {
$attr = $this->generateAttributes($token->attributes, $config);
$attr = $this->generateAttributes($token->attributes);
return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
} elseif ($token->type == 'end') {
return '</' . $token->name . '>';
} elseif ($token->type == 'empty') {
$attr = $this->generateAttributes($token->attributes, $config);
$attr = $this->generateAttributes($token->attributes);
return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
} elseif ($token->type == 'text') {
return htmlspecialchars($token->data, ENT_COMPAT, 'UTF-8');
return $this->escape($token->data);
} else {
return '';
@@ -38,14 +54,19 @@ class HTMLPurifier_Generator
}
}
function generateAttributes($assoc_array_of_attributes, $config) {
function generateAttributes($assoc_array_of_attributes) {
$html = '';
foreach ($assoc_array_of_attributes as $key => $value) {
$html .= $key.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8').'" ';
$html .= $key.'="'.$this->escape($value).'" ';
}
return rtrim($html);
}
function escape($string) {
if ($this->clean_utf8) $string = HTMLPurifier_Lexer::cleanUTF8($string);
return htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
}
}
?>

View File

@@ -307,16 +307,19 @@ class HTMLPurifier_Lexer
}
/**
* Currently converts UTF8 into an array of Unicode codepoints. (changing)
* Cleans a UTF-8 string for well-formedness and SGML validity
*
* We're going to convert this into a multi-purpose UTF-8 well-formedness
* checker as well as handler for the control characters that are illegal
* in SGML documents. But *after* we draw up some unit-tests. This means
* that the function, in the end, will not return an array of codepoints
* but a valid UTF8 string, with non-SGML codepoints excluded.
* It will parse according to UTF-8 and return a valid UTF8 string, with
* non-SGML codepoints excluded.
*
* @warning This function can find a lot of use, so we may be moving
* it to a dedicated class.
*
* @note Just for reference, the non-SGML code points are 0 to 31 and
* 127 to 159, inclusive.
* 127 to 159, inclusive. However, we allow code points 9, 10
* and 13, which are the tab, line feed and carriage return
* respectively. 128 and above the code points map to multibyte
* UTF-8 representations.
*
* @note The functionality provided by the original function could be
* implemented with iconv using 'UTF-8//IGNORE', mbstring, or
@@ -332,7 +335,7 @@ class HTMLPurifier_Lexer
*
* @note Code adapted from utf8ToUnicode by Henri Sivonen and
* hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
* LGPL license.
* LGPL license. Notes on what changed are inside.
*/
function cleanUTF8($str) {
$mState = 0; // cached expected number of octets after the current octet
@@ -340,17 +343,33 @@ class HTMLPurifier_Lexer
$mUcs4 = 0; // cached Unicode character
$mBytes = 1; // cached expected number of octets in the current sequence
$out = array();
// original code involved an $out that was an array of Unicode
// codepoints. Instead of having to convert back into UTF-8, we've
// decided to directly append valid UTF-8 characters onto a string
// $out once they're done. $char accumulates raw bytes, while $mUcs4
// turns into the Unicode code point, so there's some redundancy.
$out = '';
$char = '';
$len = strlen($str);
for($i = 0; $i < $len; $i++) {
$in = ord($str{$i});
$char .= $str[$i]; // append byte to char
if (0 == $mState) {
// When mState is zero we expect either a US-ASCII character
// or a multi-octet sequence.
if (0 == (0x80 & ($in))) {
// US-ASCII, pass straight through.
$out[] = $in;
if (($in <= 31 || $in == 127) &&
!($in == 9 || $in == 13 || $in == 10) // save \r\t\n
) {
// control characters, remove
} else {
$out .= $char;
}
// reset
$char = '';
$mBytes = 1;
} elseif (0xC0 == (0xE0 & ($in))) {
// First octet of 2 octet sequence
@@ -394,7 +413,10 @@ class HTMLPurifier_Lexer
} else {
// Current octet is neither in the US-ASCII range nor a
// legal first octet of a multi-octet sequence.
return false;
$mState = 0;
$mUcs4 = 0;
$mBytes = 1;
$char = '';
}
} else {
// When mState is non-zero, we expect a continuation of the
@@ -422,23 +444,26 @@ class HTMLPurifier_Lexer
// Codepoints outside the Unicode range are illegal
($mUcs4 > 0x10FFFF)
) {
return false;
} elseif (0xFEFF != $mUcs4 && // omit BOM
!($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
) {
$out .= $char;
}
if (0xFEFF != $mUcs4) {
// BOM is legal but we don't want to output it
$out[] = $mUcs4;
}
//initialize UTF8 cache
// initialize UTF8 cache (reset)
$mState = 0;
$mUcs4 = 0;
$mBytes = 1;
$char = '';
}
} else {
// ((0xC0 & (*in) != 0x80) && (mState != 0))
//
// Incomplete multi-octet sequence.
//
return false;
// used to result in complete fail, but we'll reset
$mState = 0;
$mUcs4 = 0;
$mBytes = 1;
$char ='';
}
}
}

View File

@@ -47,6 +47,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
// mode won't get 'em.
$string = $this->escapeCDATA($string);
// substitute non-special entities. While DOM is perfectly capable
// of doing this, we need to get at the UTF-8 characters in
// cleanUTF8
$string = $this->substituteNonSpecialEntities($string);
// clean it into well-formed UTF-8 string
$string = $this->cleanUTF8($string);
if (!$is_full) {
// preprocess string, essential for UTF-8
$string =

View File

@@ -128,6 +128,9 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// expand entities THAT AREN'T THE BIG FIVE
$string = $this->substituteNonSpecialEntities($string);
// clean it into wellformed UTF-8 string
$string = $this->cleanUTF8($string);
// infinite loop protection
// has to be pretty big, since html docs can be big
// we're allow two hundred thousand tags... more than enough?

View File

@@ -29,20 +29,21 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
*/
var $tokens = array();
function tokenizeHTML($html, $config = null) {
function tokenizeHTML($string, $config = null) {
if (!$config) $config = HTMLPurifier_Config::createDefault();
$html = $this->escapeCDATA($html);
$string = $this->escapeCDATA($string);
if ($config->get('Core', 'AcceptFullDocuments')) {
$html = $this->extractBody($html);
$string = $this->extractBody($string);
}
$html = $this->substituteNonSpecialEntities($html);
$string = $this->substituteNonSpecialEntities($string);
$string = $this->cleanUTF8($string);
$parser=& new XML_HTMLSax3();
$parser->set_object($this);
$parser->set_element_handler('openHandler','closeHandler');
$parser->set_data_handler('dataHandler');
$parser->set_escape_handler('escapeHandler');
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
$parser->parse($html);
$parser->parse($string);
$tokens = $this->tokens;
$this->tokens = array();
return $tokens;