1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-09 23:57:03 +02:00

Release 2.0.0, merged in 1026 to HEAD.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/strict@1179 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2007-06-21 00:36:12 +00:00
parent c35eb3e95f
commit 0101311193
172 changed files with 7713 additions and 2520 deletions

View File

@@ -2,6 +2,20 @@
require_once 'HTMLPurifier/Lexer.php';
HTMLPurifier_ConfigSchema::define(
'Core', 'DirectLexLineNumberSyncInterval', 0, 'int', '
<p>
Specifies the number of tokens the DirectLex line number tracking
implementations should process before attempting to resyncronize the
current line count by manually counting all previous new-lines. When
at 0, this functionality is disabled. Lower values will decrease
performance, and this is only strictly necessary if the counting
algorithm is buggy (in which case you should report it as a bug).
This has no effect when %Core.MaintainLineNumbers is disabled or DirectLex is
not being used. This directive has been available since 2.0.0.
</p>
');
/**
* Our in-house implementation of a parser.
*
@@ -32,9 +46,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$inside_tag = false; // whether or not we're parsing the inside of a tag
$array = array(); // result array
$maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
$current_line = 1;
$nl = PHP_EOL;
// how often to manually recalculate. This will ALWAYS be right,
// but it's pretty wasteful. Set to 0 to turn off
$synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
// infinite loop protection
// has to be pretty big, since html docs can be big
// we're allow two hundred thousand tags... more than enough?
// NOTE: this is also used for synchronization, so watch out
$loops = 0;
while(true) {
@@ -42,10 +64,21 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// infinite loop protection
if (++$loops > 200000) return array();
// recalculate lines
if (
$maintain_line_numbers && // line number tracking is on
$synchronize_interval && // synchronization is on
$cursor > 0 && // cursor is further than zero
$loops % $synchronize_interval === 0 // time to synchronize!
) {
$current_line = 1 + $this->substrCount($html, $nl, 0, $cursor);
}
$position_next_lt = strpos($html, '<', $cursor);
$position_next_gt = strpos($html, '>', $cursor);
// triggers on "<b>asdf</b>" but not "asdf <b></b>"
// special case to set up context
if ($position_next_lt === $cursor) {
$inside_tag = true;
$cursor++;
@@ -53,7 +86,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if (!$inside_tag && $position_next_lt !== false) {
// We are not inside tag and there still is another tag to parse
$array[] = new
$token = new
HTMLPurifier_Token_Text(
$this->parseData(
substr(
@@ -61,6 +94,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
)
)
);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_lt - $cursor);
}
$array[] = $token;
$cursor = $position_next_lt + 1;
$inside_tag = true;
continue;
@@ -69,7 +107,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// If we're already at the end, break
if ($cursor === strlen($html)) break;
// Create Text of rest of string
$array[] = new
$token = new
HTMLPurifier_Token_Text(
$this->parseData(
substr(
@@ -77,6 +115,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
)
)
);
if ($maintain_line_numbers) $token->line = $current_line;
$array[] = $token;
break;
} elseif ($inside_tag && $position_next_gt !== false) {
// We are in tag and it is well formed
@@ -89,12 +129,17 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
substr($segment, 0, 3) == '!--' &&
substr($segment, $strlen_segment-2, 2) == '--'
) {
$array[] = new
$token = new
HTMLPurifier_Token_Comment(
substr(
$segment, 3, $strlen_segment - 5
)
);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$inside_tag = false;
$cursor = $position_next_gt + 1;
continue;
@@ -104,7 +149,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$is_end_tag = (strpos($segment,'/') === 0);
if ($is_end_tag) {
$type = substr($segment, 1);
$array[] = new HTMLPurifier_Token_End($type);
$token = new HTMLPurifier_Token_End($type);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$inside_tag = false;
$cursor = $position_next_gt + 1;
continue;
@@ -114,7 +164,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// have accidently grabbed an emoticon. Translate into
// text and go our merry way
if (!ctype_alnum($segment[0])) {
$array[] = new
$token = new
HTMLPurifier_Token_Text(
'<' .
$this->parseData(
@@ -122,6 +172,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
) .
'>'
);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$cursor = $position_next_gt + 1;
$inside_tag = false;
continue;
@@ -142,10 +197,15 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if ($position_first_space >= $strlen_segment) {
if ($is_self_closing) {
$array[] = new HTMLPurifier_Token_Empty($segment);
$token = new HTMLPurifier_Token_Empty($segment);
} else {
$array[] = new HTMLPurifier_Token_Start($segment);
$token = new HTMLPurifier_Token_Start($segment);
}
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$inside_tag = false;
$cursor = $position_next_gt + 1;
continue;
@@ -169,21 +229,29 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
}
if ($is_self_closing) {
$array[] = new HTMLPurifier_Token_Empty($type, $attr);
$token = new HTMLPurifier_Token_Empty($type, $attr);
} else {
$array[] = new HTMLPurifier_Token_Start($type, $attr);
$token = new HTMLPurifier_Token_Start($type, $attr);
}
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
}
$array[] = $token;
$cursor = $position_next_gt + 1;
$inside_tag = false;
continue;
} else {
$array[] = new
$token = new
HTMLPurifier_Token_Text(
'<' .
$this->parseData(
substr($html, $cursor)
)
);
if ($maintain_line_numbers) $token->line = $current_line;
// no cursor scroll? Hmm...
$array[] = $token;
break;
}
break;
@@ -191,6 +259,22 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
return $array;
}
/**
* PHP 4 compatible substr_count that implements offset and length
*/
function substrCount($haystack, $needle, $offset, $length) {
static $oldVersion;
if ($oldVersion === null) {
$oldVersion = version_compare(PHP_VERSION, '5.1', '<');
}
if ($oldVersion) {
$haystack = substr($haystack, $offset, $length);
return substr_count($haystack, $needle);
} else {
return substr_count($haystack, $needle, $offset, $length);
}
}
/**
* Takes the inside of an HTML tag and makes an assoc array of attributes.
*