diff --git a/library/HTMLPurifier/Generator.php b/library/HTMLPurifier/Generator.php index deecc084..19295dbc 100644 --- a/library/HTMLPurifier/Generator.php +++ b/library/HTMLPurifier/Generator.php @@ -13,6 +13,7 @@ class HTMLPurifier_Generator } function generateFromToken($token) { + if (!isset($token->type)) return ''; if ($token->type == 'start') { $attr = $this->generateAttributes($token->attributes); return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>'; @@ -25,7 +26,7 @@ class HTMLPurifier_Generator return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />'; } elseif ($token->type == 'text') { - return htmlentities($token->data, ENT_COMPAT, 'UTF-8'); + return htmlspecialchars($token->data, ENT_COMPAT, 'UTF-8'); } else { return ''; @@ -36,7 +37,7 @@ class HTMLPurifier_Generator function generateAttributes($assoc_array_of_attributes) { $html = ''; foreach ($assoc_array_of_attributes as $key => $value) { - $html .= $key.'="'.htmlentities($value, ENT_COMPAT, 'UTF-8').'" '; + $html .= $key.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8').'" '; } return rtrim($html); } diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index ee341ff8..086d7fb9 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -5,18 +5,20 @@ require_once 'HTMLPurifier/Token.php'; /** * Forgivingly lexes HTML (SGML-style) markup into tokens. * - * The lexer parses a string of SGML-style markup and converts them into + * A lexer parses a string of SGML-style markup and converts them into * corresponding tokens. It doesn't check for well-formedness, although its * internal mechanism may make this automatic (such as the case of * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose * from. * - * The lexer is HTML-oriented: it might work with XML, but it's not + * A lexer is HTML-oriented: it might work with XML, but it's not * recommended, as we adhere to a subset of the specification for optimization * reasons. 
 * * This class should not be directly instantiated, but you may use create() to - * retrieve a default copy of the lexer. + * retrieve a default copy of the lexer. Being a supertype, this class + * does not actually define any implementation, but offers commonly used + * convenience functions for subclasses. * * @note The unit tests will instantiate this class for testing purposes, as * many of the utility functions require a class to be instantiated. diff --git a/tests/HTMLPurifier/GeneratorTest.php b/tests/HTMLPurifier/GeneratorTest.php index d93decf0..5806868b 100644 --- a/tests/HTMLPurifier/GeneratorTest.php +++ b/tests/HTMLPurifier/GeneratorTest.php @@ -1,15 +1,18 @@ UnitTestCase(); $this->gen = new HTMLPurifier_Generator(); + $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); } function test_generateFromToken() { @@ -39,6 +42,15 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase $inputs[5] = new HTMLPurifier_Token_Empty('br'); $expect[5] = '<br />';
 + // test fault tolerance + $inputs[6] = null; + $expect[6] = ''; + + // don't convert non-special characters + $theta_char = $this->_entity_lookup->table['theta']; + $inputs[7] = new HTMLPurifier_Token_Text($theta_char); + $expect[7] = $theta_char; + foreach ($inputs as $i => $input) { $result = $this->gen->generateFromToken($input); $this->assertEqual($result, $expect[$i]); @@ -64,6 +76,11 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase $inputs[3] = array('src' => 'picture.jpg', 'alt' => 'Short & interesting'); $expect[3] = 'src="picture.jpg" alt="Short &amp; interesting"'; + // don't escape nonspecial characters + $theta_char = $this->_entity_lookup->table['theta']; + $inputs[4] = array('title' => 'Theta is ' . $theta_char); + $expect[4] = 'title="Theta is ' . $theta_char . '"'; + foreach ($inputs as $i => $input) { $result = $this->gen->generateAttributes($input); $this->assertEqual($result, $expect[$i]);