From bcc2b09ac7d4eed5e34219ec8bbad8f7799c516e Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 23 Jul 2006 18:56:00 +0000 Subject: [PATCH] Finish documenting PEARSax3, touch up the other docs. Nuke the original lexer.txt document. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@102 48356398-32a2-884e-a903-53898d9a118a --- docs/lexer.txt | 41 ------------------------ library/HTMLPurifier/Lexer.php | 2 +- library/HTMLPurifier/Lexer/DOMLex.php | 36 ++++++++++++++++++++- library/HTMLPurifier/Lexer/DirectLex.php | 33 ++++++++++++++++--- library/HTMLPurifier/Lexer/PEARSax3.php | 40 ++++++++++++++++++++--- 5 files changed, 101 insertions(+), 51 deletions(-) delete mode 100644 docs/lexer.txt diff --git a/docs/lexer.txt b/docs/lexer.txt deleted file mode 100644 index 31b55ba7..00000000 --- a/docs/lexer.txt +++ /dev/null @@ -1,41 +0,0 @@ - -Lexer - -The lexer parses a string of SGML-style markup and converts them into -corresponding tokens. It doesn't check for well-formedness, although it's -internal mechanism may make this automatic (such as the case of DOMLex). - -We have several implementations of the Lexer: - -DirectLex [4,5] - our in-house implementation - DirectLex has absolutely no dependencies, making it a reasonably good - default for PHP4. Written with efficiency in mind, it is up to two - times faster than the PEAR parser. It will support UTF-8 completely - eventually. - -PEARSax3 [4,5] - uses the PEAR package XML_HTMLSax3 to parse - PEAR, not suprisingly, also has a SAX parser for HTML. I don't know - very much about implementation, but it's fairly well written. However, that - abstraction comes at a price: performance. You need to have it installed, - and if the API changes, it might break our adapter. Not sure whether or not - it's UTF-8 aware, but it has some entity parsing trouble. - -DOMLex [5] - uses the PHP5 core extension DOM to parse - In PHP 5, the DOM XML extension was revamped into DOM and added to the core. 
- It gives us a forgiving HTML parser, which we use to transform the HTML - into a DOM, and then into the tokens. It is blazingly fast, and is the - default choice for PHP 5. However, entity resolution may be troublesome, - though its UTF-8 is excellent. Also, any empty elements will have empty - tokens associated with them, even if this is prohibited. - -We use tokens because creating a DOM representation would: - -1. Require more processing power to create, -2. Require recursion to iterate, -3. Must be compatible with PHP 5's DOM, -4. Has the entire document structure (html and body not needed), and -5. Has unknown readability improvement. - -What the last item means is that the functions for manipulating tokens are -already fairly compact, and when well-commented, more abstraction may not -be needed. \ No newline at end of file diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index 2a8f37a5..34c489ed 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -6,7 +6,7 @@ require_once 'HTMLPurifier/Token.php'; * Forgivingly lexes HTML (SGML-style) markup into tokens. * * The lexer parses a string of SGML-style markup and converts them into - * corresponding tokens. It doesn't check for well-formedness, although it's + * corresponding tokens. It doesn't check for well-formedness, although its * internal mechanism may make this automatic (such as the case of * HTMLPurifier_Lexer_DOMLex). There are several implementations to choose * from. diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 8b72aa24..d9f41e61 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -2,7 +2,25 @@ require_once 'HTMLPurifier/Lexer.php'; -// PHP5 only! +/** + * Parser that uses PHP 5's DOM extension (part of the core). + * + * In PHP 5, the DOM XML extension was revamped into DOM and added to the core. 
+ * It gives us a forgiving HTML parser, which we use to transform the HTML + * into a DOM, and then into the tokens. It is blazingly fast (for large + * documents, it performs twenty times faster than + * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5. + * + * @notice + * Any empty elements will have empty tokens associated with them, even if + * this is prohibited by the spec. This cannot be fixed until the spec + * comes into play. + * + * @todo Determine DOM's entity parsing behavior, point to local entity files + * if necessary. + * @todo Make div access less fragile, and refrain from preprocessing when + * HTML tag and friends are already present. + */ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer { @@ -19,6 +37,16 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer ); } + /** + * Recursive function that tokenizes a node, putting it into an accumulator. + * + * @param $node DOMNode to be tokenized. + * @param $tokens Array-list of already tokenized tokens. + * @param $collect Says whether or not start and close are collected, set to + * false at first recursion because it's the implicit DIV + * tag you're dealing with. + * @returns Tokens of node appended to previously passed tokens. + */ protected function tokenizeDOM($node, $tokens = array(), $collect = false) { // recursive goodness! @@ -63,6 +91,12 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer } + /** + * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array. + * + * @param $attribute_list DOMNamedNodeMap of DOMAttr objects. + * @returns Associative array of attributes. 
+ */ protected function transformAttrToAssoc($attribute_list) { $attribute_array = array(); // undocumented behavior diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 6d135e92..91706370 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -16,6 +16,7 @@ require_once 'HTMLPurifier/Lexer.php'; * @todo Add support for CDATA sections. * @todo Determine correct behavior in outputting comment data. (preserve dashes?) * @todo Optimize main function tokenizeHTML(). + * @todo Less than sign (<) being prohibited (even as entity) in attr-values? */ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer { @@ -108,6 +109,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer /** * Substitutes non-special entities with their parsed equivalents. + * + * @protected + * @param $string String to have non-special entities parsed. + * @returns Parsed string. */ function substituteNonSpecialEntities($string) { // it will try to detect missing semicolons, but don't rely on it @@ -119,6 +124,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer /** * Callback function for substituteNonSpecialEntities() that does the work. + * + * @warning Though this is public in order to let the callback happen, + * calling it directly is not recommended. + * @param $matches PCRE-style matches array, with 0 the entire match, and + * either index 1, 2 or 3 set with a hex value, dec value, + * or string (respectively). + * @returns Replacement string. + * @todo Implement string translations */ function nonSpecialEntityCallback($matches) { // replaces all but big five @@ -132,14 +145,19 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } else { if (isset($this->_special_ent2dec[$matches[3]])) return $entity; // translate $matches[3] + return ''; } } /** * Substitutes only special entities with their parsed equivalents. 
* - * We try to avoid calling this function because otherwise, it would have - * to be called a lot (for every parsed section). + * @notice We try to avoid calling this function because otherwise, it + * would have to be called a lot (for every parsed section). + * + * @protected + * @param $string String to have special entities parsed. + * @returns Parsed string. */ function substituteSpecialEntities($string) { return preg_replace_callback( @@ -151,7 +169,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer /** * Callback function for substituteSpecialEntities() that does the work. * - * This callback is very similar to nonSpecialEntityCallback(). + * This callback has the same syntax as nonSpecialEntityCallback(). + * + * @warning Though this is public in order to let the callback happen, + * calling it directly is not recommended. + * @param $matches PCRE-style matches array, with 0 the entire match, and + * either index 1, 2 or 3 set with a hex value, dec value, + * or string (respectively). + * @returns Replacement string. */ function specialEntityCallback($matches) { $entity = $matches[0]; @@ -327,7 +352,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer * Takes the inside of an HTML tag and makes an assoc array of attributes. * * @param $string Inside of tag excluding name. - * @return Assoc array of attributes. + * @returns Assoc array of attributes. */ function parseAttributeString($string) { $string = (string) $string; // quick typecast diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php index c13dc377..89681f2e 100644 --- a/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/library/HTMLPurifier/Lexer/PEARSax3.php @@ -3,14 +3,32 @@ require_once 'XML/HTMLSax3.php'; // PEAR require_once 'HTMLPurifier/Lexer.php'; -// uses the PEAR class XML_HTMLSax3 to parse XML +/** + * Lexer that uses the PEAR package XML_HTMLSax3 to parse HTML. + * + * PEAR, not surprisingly, also has a SAX parser for HTML. 
I don't know + * very much about implementation, but it's fairly well written. However, that + * abstraction comes at a price: performance. You need to have it installed, + * and if the API changes, it might break our adapter. Not sure whether or not + * it's UTF-8 aware, but it has some entity parsing trouble. + * + * Quite personally, I don't recommend using the PEAR class, and the defaults + * don't use it. The unit tests do perform the tests on the SAX parser too, but + * whatever it does for poorly formed HTML is up to it. + * + * @todo Generalize so that XML_HTMLSax is also supported. + */ + class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer { - var $tokens; + /** + * Internal accumulator array for SAX parsers. + * @protected + */ + var $tokens = array(); function tokenizeHTML($html) { - $this->tokens = array(); $parser=& new XML_HTMLSax3(); $parser->set_object($this); $parser->set_element_handler('openHandler','closeHandler'); @@ -18,9 +36,14 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer $parser->set_escape_handler('escapeHandler'); $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1); $parser->parse($html); - return $this->tokens; + $tokens = $this->tokens; + $this->tokens = array(); + return $tokens; } + /** + * Open tag event handler, interface is defined by PEAR package. + */ function openHandler(&$parser, $name, $attrs, $closed) { if ($closed) { $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs); @@ -30,6 +53,9 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer return true; } + /** + * Close tag event handler, interface is defined by PEAR package. + */ function closeHandler(&$parser, $name) { // HTMLSax3 seems to always send empty tags an extra close tag // check and ignore if you see it: @@ -41,11 +67,17 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer return true; } + /** + * Data event handler, interface is defined by PEAR package. 
+ */ function dataHandler(&$parser, $data) { $this->tokens[] = new HTMLPurifier_Token_Text($data); return true; } + /** + * Escaped text handler, interface is defined by PEAR package. + */ function escapeHandler(&$parser, $data) { if (strpos($data, '-') === 0) { $this->tokens[] = new HTMLPurifier_Token_Comment($data);