diff --git a/NEWS b/NEWS index 3464dcb8..def75d62 100644 --- a/NEWS +++ b/NEWS @@ -53,6 +53,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier . Lexer is now pre-emptively included, with a conditional include for the PHP5 only version. . HTMLDefinition and CSSDefinition have a common parent class: Definition. +. DirectLex can now track line-numbers 1.6.1, released 2007-05-05 ! Support for more deprecated attributes via transformations: diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index f3af0487..e2c0fccf 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -47,9 +47,24 @@ HTMLPurifier_ConfigSchema::define( to use it. +
+ This directive has been available since 1.7.0. +
' ); +HTMLPurifier_ConfigSchema::define( + 'Core', 'MaintainLineNumbers', false, 'bool', ' ++ If true, HTML Purifier will add line number information to all tokens. + This is useful when error reporting is turned on, but can result in + significant performance degradation and should not be used when + unnecessary. This directive must be used with the DirectLex lexer, + as the DOMLex lexer does not (yet) support this functionality. This directive + has been available since 1.7.0. +
+'); + /** * Forgivingly lexes HTML (SGML-style) markup into tokens. * @@ -135,7 +150,14 @@ class HTMLPurifier_Lexer } if (is_null($lexer)) { do { - // auto-detectection algorithm + // auto-detection algorithm + + // once PHP DOM implements native line numbers, or we + // hack out something using XSLT, remove this stipulation + if ($config->get('Core', 'MaintainLineNumbers')) { + $lexer = 'DirectLex'; + break; + } if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5 class_exists('DOMDocument')) { // check for DOM support diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 57d116a4..fa9d541a 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -2,6 +2,20 @@ require_once 'HTMLPurifier/Lexer.php'; +HTMLPurifier_ConfigSchema::define( + 'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', ' ++ Specifies the number of tokens the DirectLex line number tracking + implementations should process before attempting to resynchronize the + current line count by manually counting all previous new-lines. When + at 0, this functionality is disabled. Lower values will decrease + performance, and this is only strictly necessary if the counting + algorithm is buggy (in which case you should report it as a bug). + This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is + not being used. This directive has been available since 1.7.0.
+'); + /** * Our in-house implementation of a parser. * @@ -32,9 +46,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $inside_tag = false; // whether or not we're parsing the inside of a tag $array = array(); // result array + $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers'); + $current_line = 1; + $nl = PHP_EOL; + // how often to manually recalculate. This will ALWAYS be right, + // but it's pretty wasteful. Set to 0 to turn off + $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval'); + // infinite loop protection // has to be pretty big, since html docs can be big // we're allow two hundred thousand tags... more than enough? + // NOTE: this is also used for synchronization, so watch out $loops = 0; while(true) { @@ -42,10 +64,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // infinite loop protection if (++$loops > 200000) return array(); + // recalculate lines + if ( + $maintain_line_numbers && // line number tracking is on + $synchronize_interval && // synchronization is on + $cursor > 0 && // cursor is further than zero + $loops % $synchronize_interval === 0 // time to synchronize! 
+ ) { + $current_line = 1 + substr_count($html, $nl, 0, $cursor); + } + $position_next_lt = strpos($html, '<', $cursor); $position_next_gt = strpos($html, '>', $cursor); // triggers on "asdf" but not "asdf " + // special case to set up context if ($position_next_lt === $cursor) { $inside_tag = true; $cursor++; @@ -53,7 +86,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if (!$inside_tag && $position_next_lt !== false) { // We are not inside tag and there still is another tag to parse - $array[] = new + $token = new HTMLPurifier_Token_Text( $this->parseData( substr( @@ -61,6 +94,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) ) ); + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += substr_count($html, $nl, $cursor, $position_next_lt - $cursor); + } + $array[] = $token; $cursor = $position_next_lt + 1; $inside_tag = true; continue; @@ -69,7 +107,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // If we're already at the end, break if ($cursor === strlen($html)) break; // Create Text of rest of string - $array[] = new + $token = new HTMLPurifier_Token_Text( $this->parseData( substr( @@ -77,6 +115,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) ) ); + if ($maintain_line_numbers) $token->line = $current_line; + $array[] = $token; break; } elseif ($inside_tag && $position_next_gt !== false) { // We are in tag and it is well formed @@ -89,12 +129,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer substr($segment, 0, 3) == '!--' && substr($segment, $strlen_segment-2, 2) == '--' ) { - $array[] = new + $token = new HTMLPurifier_Token_Comment( substr( $segment, 3, $strlen_segment - 5 ) ); + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor); + } + $array[] = $token; $inside_tag = false; $cursor = $position_next_gt + 1; continue; @@ -104,7 +149,12 @@ class 
HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer $is_end_tag = (strpos($segment,'/') === 0); if ($is_end_tag) { $type = substr($segment, 1); - $array[] = new HTMLPurifier_Token_End($type); + $token = new HTMLPurifier_Token_End($type); + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor); + } + $array[] = $token; $inside_tag = false; $cursor = $position_next_gt + 1; continue; @@ -114,7 +164,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // have accidently grabbed an emoticon. Translate into // text and go our merry way if (!ctype_alnum($segment[0])) { - $array[] = new + $token = new HTMLPurifier_Token_Text( '<' . $this->parseData( @@ -122,6 +172,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer ) . '>' ); + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor); + } + $array[] = $token; $cursor = $position_next_gt + 1; $inside_tag = false; continue; @@ -142,10 +197,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer if ($position_first_space >= $strlen_segment) { if ($is_self_closing) { - $array[] = new HTMLPurifier_Token_Empty($segment); + $token = new HTMLPurifier_Token_Empty($segment); } else { - $array[] = new HTMLPurifier_Token_Start($segment); + $token = new HTMLPurifier_Token_Start($segment); } + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor); + } + $array[] = $token; $inside_tag = false; $cursor = $position_next_gt + 1; continue; @@ -169,21 +229,29 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } if ($is_self_closing) { - $array[] = new HTMLPurifier_Token_Empty($type, $attr); + $token = new HTMLPurifier_Token_Empty($type, $attr); } else { - $array[] = new HTMLPurifier_Token_Start($type, $attr); + $token = 
new HTMLPurifier_Token_Start($type, $attr); } + if ($maintain_line_numbers) { + $token->line = $current_line; + $current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor); + } + $array[] = $token; $cursor = $position_next_gt + 1; $inside_tag = false; continue; } else { - $array[] = new + $token = new HTMLPurifier_Token_Text( '<' . $this->parseData( substr($html, $cursor) ) ); + if ($maintain_line_numbers) $token->line = $current_line; + // no cursor scroll? Hmm... + $array[] = $token; break; } break; diff --git a/library/HTMLPurifier/Token.php b/library/HTMLPurifier/Token.php index 555e76f1..82b2a88c 100644 --- a/library/HTMLPurifier/Token.php +++ b/library/HTMLPurifier/Token.php @@ -11,6 +11,7 @@ */ class HTMLPurifier_Token { var $type; /**< Type of node to bypass is_a(). @public */ + var $line; /**< Line number node was on in source document. Null if unknown. @public */ /** * Copies the tag into a new one (clone substitute). diff --git a/tests/HTMLPurifier/Lexer/DirectLexTest.php b/tests/HTMLPurifier/Lexer/DirectLexTest.php index 19ec0ad0..37c516f3 100644 --- a/tests/HTMLPurifier/Lexer/DirectLexTest.php +++ b/tests/HTMLPurifier/Lexer/DirectLexTest.php @@ -64,6 +64,65 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase } + function testLineNumbers() { + + $html = 'Line 1 + Line 2 + Still Line 2