1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-07-31 19:30:21 +02:00

Remove a huge swath of duplicated function calls by factoring them into a normalize() function. Also make DirectLex's variable names consistent with the rest of the classes.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@340 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2006-08-29 20:05:26 +00:00
parent 1de3088276
commit 89376a11e3
4 changed files with 46 additions and 51 deletions

View File

@@ -76,31 +76,16 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
*/
var $_whitespace = "\x20\x09\x0D\x0A";
function tokenizeHTML($string, $config = null) {
function tokenizeHTML($html, $config = null) {
if (!$config) $config = HTMLPurifier_Config::createDefault();
// some quick checking (if empty, return empty)
$string = @ (string) $string;
if ($string == '') return array();
if ($config->get('Core', 'AcceptFullDocuments')) {
$string = $this->extractBody($string);
}
$html = $this->normalize($html, $config);
$cursor = 0; // our location in the text
$inside_tag = false; // whether or not we're parsing the inside of a tag
$array = array(); // result array
// escape CDATA
$string = $this->escapeCDATA($string);
// expand entities THAT AREN'T THE BIG FIVE
$string = $this->_encoder->substituteNonSpecialEntities($string);
// clean it into wellformed UTF-8 string
$string = $this->_encoder->cleanUTF8($string);
// infinite loop protection
// has to be pretty big, since html docs can be big
// we allow two hundred thousand tags... more than enough?
@@ -111,8 +96,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// infinite loop protection
if (++$loops > 200000) return array();
$position_next_lt = strpos($string, '<', $cursor);
$position_next_gt = strpos($string, '>', $cursor);
$position_next_lt = strpos($html, '<', $cursor);
$position_next_gt = strpos($html, '>', $cursor);
// triggers on "<b>asdf</b>" but not "asdf <b></b>"
if ($position_next_lt === $cursor) {
@@ -126,7 +111,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
HTMLPurifier_Token_Text(
$this->parseData(
substr(
$string, $cursor, $position_next_lt - $cursor
$html, $cursor, $position_next_lt - $cursor
)
)
);
@@ -136,13 +121,13 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
} elseif (!$inside_tag) {
// We are not inside tag but there are no more tags
// If we're already at the end, break
if ($cursor === strlen($string)) break;
if ($cursor === strlen($html)) break;
// Create Text of rest of string
$array[] = new
HTMLPurifier_Token_Text(
$this->parseData(
substr(
$string, $cursor
$html, $cursor
)
)
);
@@ -151,7 +136,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// We are in tag and it is well formed
// Grab the internals of the tag
$strlen_segment = $position_next_gt - $cursor;
$segment = substr($string, $cursor, $strlen_segment);
$segment = substr($html, $cursor, $strlen_segment);
// Check if it's a comment
if (
@@ -232,7 +217,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
HTMLPurifier_Token_Text(
'<' .
$this->parseData(
substr($string, $cursor)
substr($html, $cursor)
)
);
break;