1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-05 13:47:24 +02:00

[1.7.0] Implement line number counting in DirectLex, in preparation for error reporting

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1155 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2007-06-18 02:01:01 +00:00
parent 70bcccf54c
commit 4bf15de536
6 changed files with 168 additions and 11 deletions

View File

@@ -2,6 +2,20 @@
require_once 'HTMLPurifier/Lexer.php';
HTMLPurifier_ConfigSchema::define(
'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
<p>
Specifies the number of tokens the DirectLex line number tracking
implementations should process before attempting to resyncronize the
current line count by manually counting all previous new-lines. When
at 0, this functionality is disabled. Lower values will decrease
performance, and this is only strictly necessary if the counting
algorithm is buggy (in which case you should report it as a bug).
This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
not being used. This directive has been available since 1.7.0.
</p>
');
/**
* Our in-house implementation of a parser.
*
@@ -32,9 +46,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$inside_tag = false; // whether or not we're parsing the inside of a tag
$array = array(); // result array
$maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
$current_line = 1;
$nl = PHP_EOL;
// how often to manually recalculate. This will ALWAYS be right,
// but it's pretty wasteful. Set to 0 to turn off
$synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
// infinite loop protection
// has to be pretty big, since html docs can be big
// we're allow two hundred thousand tags... more than enough?
// NOTE: this is also used for synchronization, so watch out
$loops = 0;
while(true) {
@@ -42,10 +64,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// infinite loop protection
if (++$loops > 200000) return array();
// recalculate lines
if (
$maintain_line_numbers && // line number tracking is on
$synchronize_interval && // synchronization is on
$cursor > 0 && // cursor is further than zero
$loops % $synchronize_interval === 0 // time to synchronize!
) {
$current_line = 1 + substr_count($html, $nl, 0, $cursor);
}
$position_next_lt = strpos($html, '<', $cursor);
$position_next_gt = strpos($html, '>', $cursor);
// triggers on "<b>asdf</b>" but not "asdf <b></b>"
// special case to set up context
if ($position_next_lt === $cursor) {
$inside_tag = true;
$cursor++;
@@ -53,7 +86,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if (!$inside_tag && $position_next_lt !== false) {
// We are not inside tag and there still is another tag to parse
$array[] = new
$token = new
HTMLPurifier_Token_Text(
$this->parseData(
substr(
@@ -61,6 +94,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
)
)
);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_lt - $cursor);
}
$array[] = $token;
$cursor = $position_next_lt + 1;
$inside_tag = true;
continue;
@@ -69,7 +107,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// If we're already at the end, break
if ($cursor === strlen($html)) break;
// Create Text of rest of string
$array[] = new
$token = new
HTMLPurifier_Token_Text(
$this->parseData(
substr(
@@ -77,6 +115,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
)
)
);
if ($maintain_line_numbers) $token->line = $current_line;
$array[] = $token;
break;
} elseif ($inside_tag && $position_next_gt !== false) {
// We are in tag and it is well formed
@@ -89,12 +129,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
substr($segment, 0, 3) == '!--' &&
substr($segment, $strlen_segment-2, 2) == '--'
) {
$array[] = new
$token = new
HTMLPurifier_Token_Comment(
substr(
$segment, 3, $strlen_segment - 5
)
);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$inside_tag = false;
$cursor = $position_next_gt + 1;
continue;
@@ -104,7 +149,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$is_end_tag = (strpos($segment,'/') === 0);
if ($is_end_tag) {
$type = substr($segment, 1);
$array[] = new HTMLPurifier_Token_End($type);
$token = new HTMLPurifier_Token_End($type);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$inside_tag = false;
$cursor = $position_next_gt + 1;
continue;
@@ -114,7 +164,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// have accidently grabbed an emoticon. Translate into
// text and go our merry way
if (!ctype_alnum($segment[0])) {
$array[] = new
$token = new
HTMLPurifier_Token_Text(
'<' .
$this->parseData(
@@ -122,6 +172,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
) .
'>'
);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$cursor = $position_next_gt + 1;
$inside_tag = false;
continue;
@@ -142,10 +197,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if ($position_first_space >= $strlen_segment) {
if ($is_self_closing) {
$array[] = new HTMLPurifier_Token_Empty($segment);
$token = new HTMLPurifier_Token_Empty($segment);
} else {
$array[] = new HTMLPurifier_Token_Start($segment);
$token = new HTMLPurifier_Token_Start($segment);
}
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$inside_tag = false;
$cursor = $position_next_gt + 1;
continue;
@@ -169,21 +229,29 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
}
if ($is_self_closing) {
$array[] = new HTMLPurifier_Token_Empty($type, $attr);
$token = new HTMLPurifier_Token_Empty($type, $attr);
} else {
$array[] = new HTMLPurifier_Token_Start($type, $attr);
$token = new HTMLPurifier_Token_Start($type, $attr);
}
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += substr_count($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$cursor = $position_next_gt + 1;
$inside_tag = false;
continue;
} else {
$array[] = new
$token = new
HTMLPurifier_Token_Text(
'<' .
$this->parseData(
substr($html, $cursor)
)
);
if ($maintain_line_numbers) $token->line = $current_line;
// no cursor scroll? Hmm...
$array[] = $token;
break;
}
break;