Release 2.1.0, merged in 1255 to HEAD.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/strict@1368 48356398-32a2-884e-a903-53898d9a118a
2025-10-23 01:26:19 +02:00 · 2007-08-05 02:02:46 +00:00
parent 503e76081b
commit 80c60bb9b5
141 changed files with 4250 additions and 1155 deletions
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -42,6 +42,16 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        
        $html = $this->normalize($html, $config, $context);
        
+        // attempt to armor stray angled brackets that cannot possibly
+        // form tags and thus are probably being used as emoticons
+        if ($config->get('Core', 'AggressivelyFixLt')) {
+            $char = '[^a-z!\/]';
+            $comment = "/<!--(.*?)(-->|\z)/is";
+            $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackArmorCommentEntities'), $html);
+            $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
+            $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackUndoCommentSubst'), $html); // fix comments
+        }
+        
        // preprocess html, essential for UTF-8
        $html =
            '<!DOCTYPE html '.
@@ -151,5 +161,21 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     */
    public function muteErrorHandler($errno, $errstr) {}
    
+    /**
+     * preg_replace_callback handler: rebuilds a matched comment, turning
+     * '&amp;' and '&lt;' in its body ($matches[1]) back into '&' and '<'
+     */
+    static public function callbackUndoCommentSubst($matches) {
+        return implode('', array('<!--', strtr($matches[1], array('&lt;' => '<', '&amp;' => '&')), $matches[2]));
+    }
+    
+    /**
+     * preg_replace_callback handler: rebuilds a matched comment with every
+     * '&' in its body ($matches[1]) written as '&amp;', so that a later
+     * callbackUndoCommentSubst pass restores existing entities untouched
+     */
+    static public function callbackArmorCommentEntities($matches) {
+        return implode('', array('<!--', strtr($matches[1], array('&' => '&amp;')), $matches[2]));
+    }
+    
 }

--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -150,6 +150,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // We are in tag and it is well formed
                // Grab the internals of the tag
                $strlen_segment = $position_next_gt - $cursor;
+                
+                if ($strlen_segment < 1) {
+                    // there's nothing to process!
+                    $token = new HTMLPurifier_Token_Text('<');
+                    $cursor++;
+                    continue;
+                }
+                
                $segment = substr($html, $cursor, $strlen_segment);
                
                // Check if it's a comment
@@ -204,7 +212,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // Check leading character is alnum, if not, we may
                // have accidently grabbed an emoticon. Translate into
                // text and go our merry way
-                if (!ctype_alnum($segment[0])) {
+                if (!ctype_alpha($segment[0])) {
+                    // XML:  $segment[0] !== '_' && $segment[0] !== ':'
                    if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
                    $token = new
                        HTMLPurifier_Token_Text(
@@ -371,6 +380,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                    $value = $quoted_value;
                }
            }
+            if ($value === false) $value = '';
            return array($key => $value);
        }
        
@@ -385,7 +395,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        
        // infinite loop protection
        $loops = 0;
-        
        while(true) {
            
            // infinite loop protection
@@ -399,7 +408,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            }
            
            $cursor += ($value = strspn($string, $this->_whitespace, $cursor));
-            
            // grab the key
            
            $key_begin = $cursor; //we're currently at the start of the key
@@ -435,6 +443,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                $cursor++;
                $cursor += strspn($string, $this->_whitespace, $cursor);
                
+                if ($cursor === false) {
+                    $array[$key] = '';
+                    break;
+                }
+                
                // we might be in front of a quote right now
                
                $char = @$string[$cursor];
@@ -452,7 +465,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                    $value_end = $cursor;
                }
                
+                // we reached a premature end
+                if ($cursor === false) {
+                    $cursor = $size;
+                    $value_end = $cursor;
+                }
+                
                $value = substr($string, $value_begin, $value_end - $value_begin);
+                if ($value === false) $value = '';
                $array[$key] = $this->parseData($value);
                $cursor++;