Finish documenting PEARSax3, touch up the other docs. Nuke the original lexer.txt document.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@102 48356398-32a2-884e-a903-53898d9a118a
2025-10-14 21:54:24 +02:00 · 2006-07-23 18:56:00 +00:00
parent 48cf55eae4
commit bcc2b09ac7
5 changed files with 101 additions and 51 deletions
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -6,7 +6,7 @@ require_once 'HTMLPurifier/Token.php';
 * Forgivingly lexes HTML (SGML-style) markup into tokens.
 * 
 * The lexer parses a string of SGML-style markup and converts them into
- * corresponding tokens.  It doesn't check for well-formedness, although it's
+ * corresponding tokens.  It doesn't check for well-formedness, although its
 * internal mechanism may make this automatic (such as the case of
 * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
 * from.
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -2,7 +2,25 @@

 require_once 'HTMLPurifier/Lexer.php';

-// PHP5 only!
+/**
+ * Parser that uses PHP 5's DOM extension (part of the core).
+ * 
+ * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
+ * It gives us a forgiving HTML parser, which we use to transform the HTML
+ * into a DOM, and then into the tokens.  It is blazingly fast (for large
+ * documents, it performs twenty times faster than
+ * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
+ * 
+ * @notice
+ * Any empty elements will have empty tokens associated with them, even if
+ * this is prohibited by the spec. This cannot be fixed until the spec
+ * comes into play.
+ * 
+ * @todo Determine DOM's entity parsing behavior, point to local entity files
+ *       if necessary.
+ * @todo Make div access less fragile, and refrain from preprocessing when
+ *       HTML tag and friends are already present.
+ */

 class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
 {
@@ -19,6 +37,16 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
            );
    }
    
+    /**
+     * Recursive function that tokenizes a node, putting it into an accumulator.
+     * 
+     * @param $node     DOMNode to be tokenized.
+     * @param $tokens   Array-list of already tokenized tokens.
+     * @param $collect  Says whether or not start and close are collected, set to
+     *                  false at first recursion because it's the implicit DIV
+     *                  tag you're dealing with.
+     * @returns Tokens of node appended to previously passed tokens.
+     */
    protected function tokenizeDOM($node, $tokens = array(), $collect = false) {
        // recursive goodness!
        
@@ -63,6 +91,12 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        
    }
    
+    /**
+     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
+     * 
+     * @param $attribute_list DOMNamedNodeMap of DOMAttr objects.
+     * @returns Associative array of attributes.
+     */
    protected function transformAttrToAssoc($attribute_list) {
        $attribute_array = array();
        // undocumented behavior
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -16,6 +16,7 @@ require_once 'HTMLPurifier/Lexer.php';
 * @todo Add support for CDATA sections.
 * @todo Determine correct behavior in outputting comment data. (preserve dashes?)
 * @todo Optimize main function tokenizeHTML().
+ * @todo Less than sign (<) being prohibited (even as entity) in attr-values?
 */
 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
 {
@@ -108,6 +109,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
    
    /**
     * Substitutes non-special entities with their parsed equivalents.
+     * 
+     * @protected
+     * @param $string String to have non-special entities parsed.
+     * @returns Parsed string.
     */
    function substituteNonSpecialEntities($string) {
        // it will try to detect missing semicolons, but don't rely on it
@@ -119,6 +124,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
    
    /**
     * Callback function for substituteNonSpecialEntities() that does the work.
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE-style matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
+     * @todo Implement string translations
     */
    function nonSpecialEntityCallback($matches) {
        // replaces all but big five
@@ -132,14 +145,19 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        } else {
            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
            // translate $matches[3]
+            return '';
        }
    }
    
    /**
     * Substitutes only special entities with their parsed equivalents.
     * 
-     * We try to avoid calling this function because otherwise, it would have
-     * to be called a lot (for every parsed section).
+     * @notice We try to avoid calling this function because otherwise, it
+     * would have to be called a lot (for every parsed section).
+     * 
+     * @protected
+     * @param $string String to have special entities parsed.
+     * @returns Parsed string.
     */
    function substituteSpecialEntities($string) {
        return preg_replace_callback(
@@ -151,7 +169,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
    /**
     * Callback function for substituteSpecialEntities() that does the work.
     * 
-     * This callback is very similar to nonSpecialEntityCallback().
+     * This callback has the same syntax as nonSpecialEntityCallback().
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE-style matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
     */
    function specialEntityCallback($matches) {
        $entity = $matches[0];
@@ -327,7 +352,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
     * Takes the inside of an HTML tag and makes an assoc array of attributes.
     * 
     * @param $string Inside of tag excluding name.
-     * @return Assoc array of attributes.
+     * @returns Assoc array of attributes.
     */
    function parseAttributeString($string) {
        $string = (string) $string; // quick typecast
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -3,14 +3,32 @@
 require_once 'XML/HTMLSax3.php'; // PEAR
 require_once 'HTMLPurifier/Lexer.php';

-// uses the PEAR class XML_HTMLSax3 to parse XML
+/**
+ * Lexer that uses the PEAR package XML_HTMLSax3 to parse HTML.
+ * 
+ * PEAR, not surprisingly, also has a SAX parser for HTML.  I don't know
+ * very much about implementation, but it's fairly well written.  However, that
+ * abstraction comes at a price: performance. You need to have it installed,
+ * and if the API changes, it might break our adapter. Not sure whether or not
+ * it's UTF-8 aware, but it has some entity parsing trouble.
+ * 
+ * Quite personally, I don't recommend using the PEAR class, and the defaults
+ * don't use it. The unit tests do perform the tests on the SAX parser too, but
+ * whatever it does for poorly formed HTML is up to it.
+ * 
+ * @todo Generalize so that XML_HTMLSax is also supported.
+ */
+
 class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
 {
    
-    var $tokens;
+    /**
+     * Internal accumulator array for SAX parsers.
+     * @protected
+     */
+    var $tokens = array();
    
    function tokenizeHTML($html) {
-        $this->tokens = array();
        $parser=& new XML_HTMLSax3();
        $parser->set_object($this);
        $parser->set_element_handler('openHandler','closeHandler');
@@ -18,9 +36,14 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        $parser->set_escape_handler('escapeHandler');
        $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
        $parser->parse($html);
-        return $this->tokens;
+        $tokens = $this->tokens;
+        $this->tokens = array();
+        return $tokens;
    }
    
+    /**
+     * Open tag event handler, interface is defined by PEAR package.
+     */
    function openHandler(&$parser, $name, $attrs, $closed) {
        if ($closed) {
            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
@@ -30,6 +53,9 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        return true;
    }
    
+    /**
+     * Close tag event handler, interface is defined by PEAR package.
+     */
    function closeHandler(&$parser, $name) {
        // HTMLSax3 seems to always send empty tags an extra close tag
        // check and ignore if you see it:
@@ -41,11 +67,17 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        return true;
    }
    
+    /**
+     * Data event handler, interface is defined by PEAR package.
+     */
    function dataHandler(&$parser, $data) {
        $this->tokens[] = new HTMLPurifier_Token_Text($data);
        return true;
    }
    
+    /**
+     * Escaped text handler, interface is defined by PEAR package.
+     */
    function escapeHandler(&$parser, $data) {
        if (strpos($data, '-') === 0) {
            $this->tokens[] = new HTMLPurifier_Token_Comment($data);