1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-07-31 19:30:21 +02:00

[2.0.1] Improve special case handling for <script>

- DirectLex now honors comments with greater than or less than signs in them
- Comments are transformed into script elements, ending comments are scrapped
- Buggy generator code rewritten to be more error-proof
- AttrValidator checks if token has attributes before processing
- Remove invalid documentation from Scripting
- "Commenting" of script elements switched to the more advanced version

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1189 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2007-06-21 14:44:26 +00:00
parent e55551ecdd
commit bf0d659c47
13 changed files with 179 additions and 32 deletions

View File

@@ -8,6 +8,8 @@ class HTMLPurifier_AttrValidator
$definition = $config->getHTMLDefinition();
if ($token->type !== 'start' && $token->type !== 'empty') return $token;
// create alias to global definition array, see also $defs
// DEFINITION CALL
$d_defs = $definition->info_global_attr;

View File

@@ -4,7 +4,7 @@ HTMLPurifier_ConfigSchema::define(
'Output', 'CommentScriptContents', true, 'bool',
'Determines whether or not HTML Purifier should attempt to fix up '.
'the contents of script tags for legacy browsers with comments. This '.
'directive was available since 1.7.'
'directive was available since 2.0.0.'
);
HTMLPurifier_ConfigSchema::defineAlias('Core', 'CommentScriptContents', 'Output', 'CommentScriptContents');
@@ -76,13 +76,17 @@ class HTMLPurifier_Generator
if (!$tokens) return '';
for ($i = 0, $size = count($tokens); $i < $size; $i++) {
if ($this->_scriptFix && $tokens[$i]->name === 'script') {
if ($this->_scriptFix && $tokens[$i]->name === 'script'
&& $i + 2 < $size && $tokens[$i+2]->type == 'end') {
// script special case
// the contents of the script block must be ONE token
// for this to work
$html .= $this->generateFromToken($tokens[$i++]);
$html .= $this->generateScriptFromToken($tokens[$i++]);
while ($tokens[$i]->name != 'script') {
$html .= $this->generateScriptFromToken($tokens[$i++]);
}
// We're not going to do this: it wouldn't be valid anyway
//while ($tokens[$i]->name != 'script') {
// $html .= $this->generateScriptFromToken($tokens[$i++]);
//}
}
$html .= $this->generateFromToken($tokens[$i]);
}
@@ -148,10 +152,12 @@ class HTMLPurifier_Generator
* --> somewhere inside the script contents.
*/
function generateScriptFromToken($token) {
if (!$token->type == 'text') return $this->generateFromToken($token);
return '<!--' . PHP_EOL . $token->data . PHP_EOL . '// -->';
if ($token->type != 'text') return $this->generateFromToken($token);
// return '<!--' . PHP_EOL . trim($token->data) . PHP_EOL . '// -->';
// more advanced version:
// return '<!--//--><![CDATA[//><!--' . PHP_EOL . $token->data . PHP_EOL . '//--><!]]>';
// thanks <http://lachy.id.au/log/2005/05/script-comments>
$data = preg_replace('#//\s*$#', '', $token->data);
return '<!--//--><![CDATA[//><!--' . PHP_EOL . trim($data) . PHP_EOL . '//--><!]]>';
}
/**

View File

@@ -5,14 +5,6 @@
WARNING: THIS MODULE IS EXTREMELY DANGEROUS AS IT ENABLES INLINE SCRIPTING
INSIDE HTML PURIFIER DOCUMENTS. USE ONLY WITH TRUSTED USER INPUT!!!
Usage:
require_once 'HTMLPurifier/HTMLModule/Scripting.php';
$def =& $config->getHTMLDefinition(true); // get the raw version
$def->manager->addModule('Scripting');
This must come before any other calls to getHTMLDefinition()
*/
/**
@@ -63,6 +55,7 @@ class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
);
$this->info['script']->content_model = '#PCDATA';
$this->info['script']->content_model_type = 'optional';
$this->info['script']->attr_transform_pre['type'] =
$this->info['script']->attr_transform_post['type'] =
new HTMLPurifier_AttrTransform_ScriptRequired();
}

View File

@@ -259,7 +259,19 @@ class HTMLPurifier_Lexer
*/
function escapeCDATA($string) {
return preg_replace_callback(
'/<!\[CDATA\[(.+?)\]\]>/',
'/<!\[CDATA\[(.+?)\]\]>/s',
array('HTMLPurifier_Lexer', 'CDATACallback'),
$string
);
}
/**
* Special CDATA case that is especiall convoluted for <script>
*/
function escapeCommentedCDATA($string) {
// <!--//--><![CDATA[//><!--
return preg_replace_callback(
'#<!--//--><!\[CDATA\[//><!--(.+?)//--><!\]\]>#s',
array('HTMLPurifier_Lexer', 'CDATACallback'),
$string
);
@@ -291,6 +303,11 @@ class HTMLPurifier_Lexer
$html = $this->extractBody($html);
}
if ($config->get('HTML', 'Trusted')) {
// escape convoluted CDATA
$html = $this->escapeCommentedCDATA($html);
}
// escape CDATA
$html = $this->escapeCDATA($html);

View File

@@ -83,10 +83,13 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
// those should have been preprocessed
if ($node->nodeType === XML_TEXT_NODE ||
$node->nodeType === XML_CDATA_SECTION_NODE) {
if ($node->nodeType === XML_TEXT_NODE) {
$tokens[] = $this->factory->createText($node->data);
return;
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
// undo DOM's special treatment of <script> tags
$tokens[] = $this->factory->createText($this->parseData($node->data));
return;
} elseif ($node->nodeType === XML_COMMENT_NODE) {
$tokens[] = $this->factory->createComment($node->data);
return;

View File

@@ -126,22 +126,34 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// Check if it's a comment
if (
substr($segment, 0, 3) == '!--' &&
substr($segment, $strlen_segment-2, 2) == '--'
substr($segment, 0, 3) == '!--'
) {
// re-determine segment length, looking for -->
$position_comment_end = strpos($html, '-->', $cursor);
if ($position_comment_end === false) {
// uh oh, we have a comment that extends to
// infinity. Can't be helped: set comment
// end position to end of string
$position_comment_end = strlen($html);
$end = true;
} else {
$end = false;
}
$strlen_segment = $position_comment_end - $cursor;
$segment = substr($html, $cursor, $strlen_segment);
$token = new
HTMLPurifier_Token_Comment(
substr(
$segment, 3, $strlen_segment - 5
$segment, 3, $strlen_segment - 3
)
);
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += $this->substrCount($html, $nl, $cursor, $position_next_gt - $cursor);
$current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
}
$array[] = $token;
$cursor = $end ? $position_comment_end : $position_comment_end + 3;
$inside_tag = false;
$cursor = $position_next_gt + 1;
continue;
}

View File

@@ -17,9 +17,11 @@ HTMLPurifier_ConfigSchema::define(
HTMLPurifier_ConfigSchema::define(
'Core', 'RemoveScriptContents', true, 'bool', '
This directive enables HTML Purifier to remove not only script tags
but all of their contents. This directive has been available since 2.0.0,
revert to pre-2.0.0 behavior by setting to false.
<p>
This directive enables HTML Purifier to remove not only script tags
but all of their contents. This directive has been available since 2.0.0,
revert to pre-2.0.0 behavior by setting to false.
</p>
'
);
@@ -48,6 +50,9 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
// removes tokens until it reaches a closing tag with its value
$remove_until = false;
// converts comments into text tokens when this is equal to a tag name
$textify_comments = false;
foreach($tokens as $token) {
if ($remove_until) {
if (empty($token->is_tag) || $token->name !== $remove_until) {
@@ -88,6 +93,13 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
$token->armor['ValidateAttributes'] = true;
}
// CAN BE GENERICIZED
if ($token->name == 'script' && $token->type == 'start') {
$textify_comments = $token->name;
} elseif ($token->name === $textify_comments && $token->type == 'end') {
$textify_comments = false;
}
} elseif ($escape_invalid_tags) {
// invalid tag, generate HTML and insert in
$token = new HTMLPurifier_Token_Text(
@@ -108,8 +120,14 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
continue;
}
} elseif ($token->type == 'comment') {
// strip comments
continue;
// textify comments in script tags when they are allowed
if ($textify_comments !== false) {
$data = $token->data;
$token = new HTMLPurifier_Token_Text($data);
} else {
// strip comments
continue;
}
} elseif ($token->type == 'text') {
} else {
continue;