diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
index 031e8e3d..0da29766 100644
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -137,6 +137,31 @@ class HTMLPurifier_Lexer
         return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
     }
     
+    /**
+     * Prepares raw HTML for tokenization by running it through a fixed
+     * sequence of cleanup passes; the order of the passes is significant.
+     * @param $html HTML to normalize
+     * @param $config Config object, Core.AcceptFullDocuments is read from it
+     * @return Output of the final cleanUTF8() pass
+     */
+    function normalize($html, $config) {
+        
+        // when full documents are accepted, reduce the input with extractBody()
+        $accept_documents = $config->get('Core', 'AcceptFullDocuments');
+        if ($accept_documents) $html = $this->extractBody($html);
+        
+        // CDATA gets escaped first, then every entity outside of the big
+        // five is expanded
+        $escaped  = $this->escapeCDATA($html);
+        $expanded = $this->_encoder->substituteNonSpecialEntities($escaped);
+        
+        // cleaning into a wellformed UTF-8 string for an SGML context comes
+        // last, since the entities expanded above sometimes represent
+        // non-SGML characters
+        return $this->_encoder->cleanUTF8($expanded);
+        
+    }
+    
     /**
      * Takes a string of HTML (fragment or document) and returns the content
      */
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index e408fa84..202b4c8a 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -37,24 +37,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     public function tokenizeHTML($string, $config = null) {
         if (!$config) $config = HTMLPurifier_Config::createDefault();
         
-        if ($config->get('Core', 'AcceptFullDocuments')) {
-            $string = $this->extractBody($string);
-        }
-        
-        $doc = new DOMDocument();
-        $doc->encoding = 'UTF-8'; // technically does nothing, but whatever
-        
-        // replace and escape the CDATA sections, since parsing under HTML
-        // mode won't get 'em.
-        $string = $this->escapeCDATA($string);
-        
-        // substitute non-special entities. While DOM is perfectly capable
-        // of doing this, we need to get at the UTF-8 characters in
-        // cleanUTF8
-        $string = $this->_encoder->substituteNonSpecialEntities($string);
-        
-        // clean it into well-formed UTF-8 string
-        $string = $this->_encoder->cleanUTF8($string);
+        $string = $this->normalize($string, $config);
         
         // preprocess string, essential for UTF-8
         $string =
@@ -66,6 +49,8 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
                 ' charset=utf-8" />'.
             '</head><body><div>'.$string.'</div></body></html>';
         
+        $doc = new DOMDocument();
+        $doc->encoding = 'UTF-8'; // technically does nothing, but whatever
         @$doc->loadHTML($string); // mute all errors, handle it transparently
         
         $tokens = array();
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index 6951c491..7e749581 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -76,31 +76,16 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
      */
     var $_whitespace = "\x20\x09\x0D\x0A";
     
-    function tokenizeHTML($string, $config = null) {
+    function tokenizeHTML($html, $config = null) {
         
         if (!$config) $config = HTMLPurifier_Config::createDefault();
         
-        // some quick checking (if empty, return empty)
-        $string = @ (string) $string;
-        if ($string == '') return array();
-        
-        if ($config->get('Core', 'AcceptFullDocuments')) {
-            $string = $this->extractBody($string);
-        }
+        $html = $this->normalize($html, $config);
         
         $cursor = 0; // our location in the text
         $inside_tag = false; // whether or not we're parsing the inside of a tag
         $array = array(); // result array
         
-        // escape CDATA
-        $string = $this->escapeCDATA($string);
-        
-        // expand entities THAT AREN'T THE BIG FIVE
-        $string = $this->_encoder->substituteNonSpecialEntities($string);
-        
-        // clean it into wellformed UTF-8 string
-        $string = $this->_encoder->cleanUTF8($string);
-        
         // infinite loop protection
         // has to be pretty big, since html docs can be big
         // we're allow two hundred thousand tags... more than enough?
@@ -111,8 +96,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
             // infinite loop protection
             if (++$loops > 200000) return array();
             
-            $position_next_lt = strpos($string, '<', $cursor);
-            $position_next_gt = strpos($string, '>', $cursor);
+            $position_next_lt = strpos($html, '<', $cursor);
+            $position_next_gt = strpos($html, '>', $cursor);
             
             // triggers on "<b>asdf</b>" but not "asdf <b></b>"
             if ($position_next_lt === $cursor) {
@@ -126,7 +111,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                     HTMLPurifier_Token_Text(
                         $this->parseData(
                             substr(
-                                $string, $cursor, $position_next_lt - $cursor
+                                $html, $cursor, $position_next_lt - $cursor
                             )
                         )
                     );
@@ -136,13 +121,13 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
             } elseif (!$inside_tag) {
                 // We are not inside tag but there are no more tags
                 // If we're already at the end, break
-                if ($cursor === strlen($string)) break;
+                if ($cursor === strlen($html)) break;
                 // Create Text of rest of string
                 $array[] = new
                     HTMLPurifier_Token_Text(
                         $this->parseData(
                             substr(
-                                $string, $cursor
+                                $html, $cursor
                             )
                         )
                     );
@@ -151,7 +136,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                 // We are in tag and it is well formed
                 // Grab the internals of the tag
                 $strlen_segment = $position_next_gt - $cursor;
-                $segment = substr($string, $cursor, $strlen_segment);
+                $segment = substr($html, $cursor, $strlen_segment);
                 
                 // Check if it's a comment
                 if (
@@ -232,7 +217,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                     HTMLPurifier_Token_Text(
                         '<' .
                         $this->parseData(
-                            substr($string, $cursor)
+                            substr($html, $cursor)
                         )
                     );
                 break;
diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php
index c042d2f9..d2d90a12 100644
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -30,23 +30,23 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
     var $tokens = array();
     
     function tokenizeHTML($string, $config = null) {
+        // each call starts from an empty token list
+        $this->tokens = array();
+        
         if (!$config) $config = HTMLPurifier_Config::createDefault();
-        $string = $this->escapeCDATA($string);
-        if ($config->get('Core', 'AcceptFullDocuments')) {
-            $string = $this->extractBody($string);
-        }
-        $string = $this->_encoder->substituteNonSpecialEntities($string);
-        $string = $this->_encoder->cleanUTF8($string);
+        $string = $this->normalize($string, $config);
+        // this object is registered as the target of the parser's callbacks
         $parser=& new XML_HTMLSax3();
         $parser->set_object($this);
         $parser->set_element_handler('openHandler','closeHandler');
         $parser->set_data_handler('dataHandler');
         $parser->set_escape_handler('escapeHandler');
         $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
+        // presumably the handlers collect into $this->tokens - TODO confirm
         $parser->parse($string);
-        $tokens = $this->tokens;
-        $this->tokens = array();
-        return $tokens;
+        
+        return $this->tokens;
+        
     }
     
     /**