From bcc2b09ac7d4eed5e34219ec8bbad8f7799c516e Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Sun, 23 Jul 2006 18:56:00 +0000
Subject: [PATCH] Finish documenting PEARSax3, touch up the other docs. Nuke
 the original lexer.txt document.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@102 48356398-32a2-884e-a903-53898d9a118a
---
 docs/lexer.txt                           | 41 ------------------------
 library/HTMLPurifier/Lexer.php           |  2 +-
 library/HTMLPurifier/Lexer/DOMLex.php    | 36 ++++++++++++++++++++-
 library/HTMLPurifier/Lexer/DirectLex.php | 33 ++++++++++++++++---
 library/HTMLPurifier/Lexer/PEARSax3.php  | 40 ++++++++++++++++++++---
 5 files changed, 101 insertions(+), 51 deletions(-)
 delete mode 100644 docs/lexer.txt

diff --git a/docs/lexer.txt b/docs/lexer.txt
deleted file mode 100644
index 31b55ba7..00000000
--- a/docs/lexer.txt
+++ /dev/null
@@ -1,41 +0,0 @@
-
-Lexer
-
-The lexer parses a string of SGML-style markup and converts them into
-corresponding tokens. It doesn't check for well-formedness, although it's
-internal mechanism may make this automatic (such as the case of DOMLex).
-
-We have several implementations of the Lexer:
-
-DirectLex [4,5] - our in-house implementation
-    DirectLex has absolutely no dependencies, making it a reasonably good
-    default for PHP4.  Written with efficiency in mind, it is up to two
-    times faster than the PEAR parser.  It will support UTF-8 completely
-    eventually.
-
-PEARSax3 [4,5] - uses the PEAR package XML_HTMLSax3 to parse
-    PEAR, not suprisingly, also has a SAX parser for HTML.  I don't know
-    very much about implementation, but it's fairly well written.  However, that
-    abstraction comes at a price: performance. You need to have it installed,
-    and if the API changes, it might break our adapter. Not sure whether or not
-    it's UTF-8 aware, but it has some entity parsing trouble.
-
-DOMLex [5] - uses the PHP5 core extension DOM to parse
-    In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
-    It gives us a forgiving HTML parser, which we use to transform the HTML
-    into a DOM, and then into the tokens.  It is blazingly fast, and is the
-    default choice for PHP 5.  However, entity resolution may be troublesome,
-    though its UTF-8 is excellent.  Also, any empty elements will have empty
-    tokens associated with them, even if this is prohibited.
-
-We use tokens because creating a DOM representation would:
-
-1. Require more processing power to create,
-2. Require recursion to iterate,
-3. Must be compatible with PHP 5's DOM,
-4. Has the entire document structure (html and body not needed), and
-5. Has unknown readability improvement.
-
-What the last item means is that the functions for manipulating tokens are
-already fairly compact, and when well-commented, more abstraction may not
-be needed.
\ No newline at end of file
diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
index 2a8f37a5..34c489ed 100644
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -6,7 +6,7 @@ require_once 'HTMLPurifier/Token.php';
  * Forgivingly lexes HTML (SGML-style) markup into tokens.
  * 
  * The lexer parses a string of SGML-style markup and converts them into
- * corresponding tokens.  It doesn't check for well-formedness, although it's
+ * corresponding tokens.  It doesn't check for well-formedness, although its
  * internal mechanism may make this automatic (such as the case of
  * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
  * from.
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index 8b72aa24..d9f41e61 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -2,7 +2,25 @@
 
 require_once 'HTMLPurifier/Lexer.php';
 
-// PHP5 only!
+/**
+ * Parser that uses PHP 5's DOM extension (part of the core).
+ * 
+ * In PHP 5, the DOM XML extension was revamped into DOM and added to the core.
+ * It gives us a forgiving HTML parser, which we use to transform the HTML
+ * into a DOM, and then into the tokens.  It is blazingly fast (for large
+ * documents, it performs twenty times faster than
+ * HTMLPurifier_Lexer_DirectLex), and is the default choice for PHP 5.
+ * 
+ * @notice
+ * Any empty elements will have empty tokens associated with them, even if
+ * this is prohibited by the spec. This cannot be fixed until the spec
+ * comes into play.
+ * 
+ * @todo Determine DOM's entity parsing behavior, point to local entity files
+ *       if necessary.
+ * @todo Make div access less fragile, and refrain from preprocessing when
+ *       HTML tag and friends are already present.
+ */
 
 class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
 {
@@ -19,6 +37,16 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
             );
     }
     
+    /**
+     * Recursive function that tokenizes a node, putting it into an accumulator.
+     * 
+     * @param $node     DOMNode to be tokenized.
+     * @param $tokens   Array-list of already tokenized tokens.
+     * @param $collect  Says whether start and close tags are collected, set to
+     *                  false at first recursion because it's the implicit DIV
+     *                  tag you're dealing with.
+     * @returns Tokens of node appended to previously passed tokens.
+     */
     protected function tokenizeDOM($node, $tokens = array(), $collect = false) {
         // recursive goodness!
         
@@ -63,6 +91,12 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
         
     }
     
+    /**
+     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
+     * 
+     * @param $attribute_list DOMNamedNodeMap of DOMAttr objects.
+     * @returns Associative array of attributes.
+     */
     protected function transformAttrToAssoc($attribute_list) {
         $attribute_array = array();
         // undocumented behavior
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index 6d135e92..91706370 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -16,6 +16,7 @@ require_once 'HTMLPurifier/Lexer.php';
  * @todo Add support for CDATA sections.
  * @todo Determine correct behavior in outputting comment data. (preserve dashes?)
  * @todo Optimize main function tokenizeHTML().
+ * @todo Less than sign (<) being prohibited (even as entity) in attr-values?
  */
 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
 {
@@ -108,6 +109,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
     
     /**
      * Substitutes non-special entities with their parsed equivalents.
+     * 
+     * @protected
+     * @param $string String to have non-special entities parsed.
+     * @returns Parsed string.
      */
     function substituteNonSpecialEntities($string) {
         // it will try to detect missing semicolons, but don't rely on it
@@ -119,6 +124,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
     
     /**
      * Callback function for substituteNonSpecialEntities() that does the work.
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE-style matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
+     * @todo Implement string translations
      */
     function nonSpecialEntityCallback($matches) {
         // replaces all but big five
@@ -132,14 +145,19 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
         } else {
             if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
             // translate $matches[3]
+            return '';
         }
     }
     
     /**
      * Substitutes only special entities with their parsed equivalents.
      * 
-     * We try to avoid calling this function because otherwise, it would have
-     * to be called a lot (for every parsed section).
+     * @notice We try to avoid calling this function because otherwise, it
+     * would have to be called a lot (for every parsed section).
+     * 
+     * @protected
+     * @param $string String to have special entities parsed.
+     * @returns Parsed string.
      */
     function substituteSpecialEntities($string) {
         return preg_replace_callback(
@@ -151,7 +169,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
     /**
      * Callback function for substituteSpecialEntities() that does the work.
      * 
-     * This callback is very similar to nonSpecialEntityCallback().
+     * This callback has the same syntax as nonSpecialEntityCallback().
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE-style matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
      */
     function specialEntityCallback($matches) {
         $entity = $matches[0];
@@ -327,7 +352,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
      * Takes the inside of an HTML tag and makes an assoc array of attributes.
      * 
      * @param $string Inside of tag excluding name.
-     * @return Assoc array of attributes.
+     * @returns Assoc array of attributes.
      */
     function parseAttributeString($string) {
         $string = (string) $string; // quick typecast
diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php
index c13dc377..89681f2e 100644
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -3,14 +3,32 @@
 require_once 'XML/HTMLSax3.php'; // PEAR
 require_once 'HTMLPurifier/Lexer.php';
 
-// uses the PEAR class XML_HTMLSax3 to parse XML
+/**
+ * Lexer that uses the PEAR package XML_HTMLSax3 to parse HTML.
+ * 
+ * PEAR, not surprisingly, also has a SAX parser for HTML.  I don't know
+ * very much about implementation, but it's fairly well written.  However, that
+ * abstraction comes at a price: performance. You need to have it installed,
+ * and if the API changes, it might break our adapter. Not sure whether or not
+ * it's UTF-8 aware, but it has some entity parsing trouble.
+ * 
+ * Quite personally, I don't recommend using the PEAR class, and the defaults
+ * don't use it. The unit tests do perform the tests on the SAX parser too, but
+ * whatever it does for poorly formed HTML is up to it.
+ * 
+ * @todo Generalize so that XML_HTMLSax is also supported.
+ */
+
 class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
 {
     
-    var $tokens;
+    /**
+     * Internal accumulator array for SAX parsers.
+     * @protected
+     */
+    var $tokens = array();
     
     function tokenizeHTML($html) {
-        $this->tokens = array();
         $parser=& new XML_HTMLSax3();
         $parser->set_object($this);
         $parser->set_element_handler('openHandler','closeHandler');
@@ -18,9 +36,14 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
         $parser->set_escape_handler('escapeHandler');
         $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
         $parser->parse($html);
-        return $this->tokens;
+        $tokens = $this->tokens;
+        $this->tokens = array();
+        return $tokens;
     }
     
+    /**
+     * Open tag event handler, interface is defined by PEAR package.
+     */
     function openHandler(&$parser, $name, $attrs, $closed) {
         if ($closed) {
             $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
@@ -30,6 +53,9 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
         return true;
     }
     
+    /**
+     * Close tag event handler, interface is defined by PEAR package.
+     */
     function closeHandler(&$parser, $name) {
         // HTMLSax3 seems to always send empty tags an extra close tag
         // check and ignore if you see it:
@@ -41,11 +67,17 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
         return true;
     }
     
+    /**
+     * Data event handler, interface is defined by PEAR package.
+     */
     function dataHandler(&$parser, $data) {
         $this->tokens[] = new HTMLPurifier_Token_Text($data);
         return true;
     }
     
+    /**
+     * Escaped text handler, interface is defined by PEAR package.
+     */
     function escapeHandler(&$parser, $data) {
         if (strpos($data, '-') === 0) {
             $this->tokens[] = new HTMLPurifier_Token_Comment($data);