[1.7.0] Implement line number counting in DirectLex, in preparation for error reporting

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1155 48356398-32a2-884e-a903-53898d9a118a
2025-10-14 21:54:24 +02:00 · 2007-06-18 02:01:01 +00:00
parent 70bcccf54c
commit 4bf15de536
6 changed files with 168 additions and 11 deletions
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -2,6 +2,20 @@

 require_once 'HTMLPurifier/Lexer.php';

+HTMLPurifier_ConfigSchema::define(
+    'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
+<p>
+  Specifies the number of tokens the DirectLex line number tracking
+  implementations should process before attempting to resynchronize the
+  current line count by manually counting all previous new-lines. When
+  at 0, this functionality is disabled. Lower values will decrease
+  performance, and this is only strictly necessary if the counting
+  algorithm is buggy (in which case you should report it as a bug).
+  This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
+  not being used. This directive has been available since 1.7.0.
+</p>
+');
+
 /**
 * Our in-house implementation of a parser.
 * 
@@ -32,9 +46,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        $inside_tag = false; // whether or not we're parsing the inside of a tag
        $array = array(); // result array
        
+        $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
+        $current_line = 1;
+        $nl = PHP_EOL;
+        // how often to manually recalculate. This will ALWAYS be right,
+        // but it's pretty wasteful. Set to 0 to turn off
+        $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval'); 
+        
        // infinite loop protection
        // has to be pretty big, since html docs can be big
        // we allow two hundred thousand tags... more than enough?
+        // NOTE: this is also used for synchronization, so watch out
        $loops = 0;
        
        while(true) {
@@ -42,10 +64,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            // infinite loop protection
            if (++$loops > 200000) return array();
            
+            // recalculate lines
+            if (
+                $maintain_line_numbers && // line number tracking is on
+                $synchronize_interval &&  // synchronization is on
+                $cursor > 0 &&            // cursor is further than zero
+                $loops % $synchronize_interval === 0 // time to synchronize!
+            ) {
+                $current_line = 1 + substr_count($html, $nl, 0, $cursor);
+            }
+            
            $position_next_lt = strpos($html, '<', $cursor);
            $position_next_gt = strpos($html, '>', $cursor);
            
            // triggers on "<b>asdf</b>" but not "asdf <b></b>"
+            // special case to set up context
            if ($position_next_lt === $cursor) {
                $inside_tag = true;
                $cursor++;
@@ -53,7 +86,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            
            if (!$inside_tag && $position_next_lt !== false) {
                // We are not inside tag and there still is another tag to parse
-                $array[] = new
+                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
@@ -61,6 +94,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                            )
                        )
                    );
+                if ($maintain_line_numbers) {
+                    $token->line = $current_line;
+                    $current_line += substr_count($html, $nl, $cursor, $position_next_lt - $cursor);
+                }
+                $array[] = $token;
                $cursor  = $position_next_lt + 1;
                $inside_tag = true;
                continue;
@@ -69,7 +107,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // If we're already at the end, break
                if ($cursor === strlen($html)) break;
                // Create Text of rest of string
-                $array[] = new
+                $token = new
                    HTMLPurifier_Token_Text(
                        $this->parseData(
                            substr(
@@ -77,6 +115,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                            )
                        )
                    );
+                if ($maintain_line_numbers) $token->line = $current_line;
+                $array[] = $token;
                break;
            } elseif ($inside_tag && $position_next_gt !== false) {
                // We are in tag and it is well formed
@@ -89,12 +129,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                    substr($segment, 0, 3) == '!--' &&
                    substr($segment, $strlen_segment-2, 2) == '--'
                ) {
-                    $array[] = new
+                    $token = new
                        HTMLPurifier_Token_Comment(
                            substr(
                                $segment, 3, $strlen_segment - 5
                            )
                        );
+                    if ($maintain_line_numbers) {
+                        $token->line = $current_line;
+                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
+                    }
+                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
@@ -104,7 +149,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                $is_end_tag = (strpos($segment,'/') === 0);
                if ($is_end_tag) {
                    $type = substr($segment, 1);
-                    $array[] = new HTMLPurifier_Token_End($type);
+                    $token = new HTMLPurifier_Token_End($type);
+                    if ($maintain_line_numbers) {
+                        $token->line = $current_line;
+                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
+                    }
+                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
@@ -114,7 +164,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // have accidentally grabbed an emoticon. Translate into
                // text and go our merry way
                if (!ctype_alnum($segment[0])) {
-                    $array[] = new
+                    $token = new
                        HTMLPurifier_Token_Text(
                            '<' .
                            $this->parseData(
@@ -122,6 +172,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                            ) . 
                            '>'
                        );
+                    if ($maintain_line_numbers) {
+                        $token->line = $current_line;
+                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
+                    }
+                    $array[] = $token;
                    $cursor = $position_next_gt + 1;
                    $inside_tag = false;
                    continue;
@@ -142,10 +197,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                
                if ($position_first_space >= $strlen_segment) {
                    if ($is_self_closing) {
-                        $array[] = new HTMLPurifier_Token_Empty($segment);
+                        $token = new HTMLPurifier_Token_Empty($segment);
                    } else {
-                        $array[] = new HTMLPurifier_Token_Start($segment);
+                        $token = new HTMLPurifier_Token_Start($segment);
                    }
+                    if ($maintain_line_numbers) {
+                        $token->line = $current_line;
+                        $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
+                    }
+                    $array[] = $token;
                    $inside_tag = false;
                    $cursor = $position_next_gt + 1;
                    continue;
@@ -169,21 +229,29 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                }
                
                if ($is_self_closing) {
-                    $array[] = new HTMLPurifier_Token_Empty($type, $attr);
+                    $token = new HTMLPurifier_Token_Empty($type, $attr);
                } else {
-                    $array[] = new HTMLPurifier_Token_Start($type, $attr);
+                    $token = new HTMLPurifier_Token_Start($type, $attr);
                }
+                if ($maintain_line_numbers) {
+                    $token->line = $current_line;
+                    $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
+                }
+                $array[] = $token;
                $cursor = $position_next_gt + 1;
                $inside_tag = false;
                continue;
            } else {
-                $array[] = new
+                $token = new
                    HTMLPurifier_Token_Text(
                        '<' .
                        $this->parseData(
                            substr($html, $cursor)
                        )
                    );
+                if ($maintain_line_numbers) $token->line = $current_line;
+                // no cursor scroll? Hmm...
+                $array[] = $token;
                break;
            }
            break;