1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-08 07:06:46 +02:00

Release 2.0.0, merged in 1026 to HEAD.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/strict@1179 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2007-06-21 00:36:12 +00:00
parent c35eb3e95f
commit 0101311193
172 changed files with 7713 additions and 2520 deletions

View File

@@ -1,59 +1,65 @@
<?php
require_once 'HTMLPurifier/Lexer.php';
HTMLPurifier_ConfigSchema::define(
'Core', 'CleanUTF8DuringGeneration', false, 'bool',
'When true, HTMLPurifier_Generator will also check all strings it '.
'escapes for UTF-8 well-formedness as a defense in depth measure. '.
'This could cause a considerable performance impact, and is not '.
'strictly necessary due to the fact that the Lexers should have '.
'ensured that all the UTF-8 strings were well-formed. Note that '.
'the configuration value is only read at the beginning of '.
'generateFromTokens.'
);
HTMLPurifier_ConfigSchema::define(
'Core', 'XHTML', true, 'bool',
'Determines whether or not output is XHTML or not. When disabled, HTML '.
'Purifier goes into HTML 4.01 removes XHTML-specific markup constructs, '.
'such as boolean attribute expansion and trailing slashes in empty tags. '.
'This directive was available since 1.1.'
'Output', 'CommentScriptContents', true, 'bool',
'Determines whether or not HTML Purifier should attempt to fix up '.
'the contents of script tags for legacy browsers with comments. This '.
'directive was available since 1.7.'
);
HTMLPurifier_ConfigSchema::defineAlias('Core', 'CommentScriptContents', 'Output', 'CommentScriptContents');
// extension constraints could be factored into ConfigSchema
HTMLPurifier_ConfigSchema::define(
'Core', 'TidyFormat', false, 'bool',
'<p>Determines whether or not to run Tidy on the final output for pretty '.
'formatting reasons, such as indentation and wrap.</p><p>This can greatly '.
'improve readability for editors who are hand-editing the HTML, but is '.
'by no means necessary as HTML Purifier has already fixed all major '.
'errors the HTML may have had. Tidy is a non-default extension, and this directive '.
'will silently fail if Tidy is not available.</p><p>If you are looking to make '.
'the overall look of your page\'s source better, I recommend running Tidy '.
'on the entire page rather than just user-content (after all, the '.
'indentation relative to the containing blocks will be incorrect).</p><p>This '.
'directive was available since 1.1.1.</p>'
'Output', 'TidyFormat', false, 'bool', <<<HTML
<p>
Determines whether or not to run Tidy on the final output for pretty
formatting reasons, such as indentation and wrap.
</p>
<p>
This can greatly improve readability for editors who are hand-editing
the HTML, but is by no means necessary as HTML Purifier has already
fixed all major errors the HTML may have had. Tidy is a non-default
extension, and this directive will silently fail if Tidy is not
available.
</p>
<p>
If you are looking to make the overall look of your page's source
better, I recommend running Tidy on the entire page rather than just
user-content (after all, the indentation relative to the containing
blocks will be incorrect).
</p>
<p>
This directive was available since 1.1.1.
</p>
HTML
);
HTMLPurifier_ConfigSchema::defineAlias('Core', 'TidyFormat', 'Output', 'TidyFormat');
/**
* Generates HTML from tokens.
* @todo Create a configuration-wide instance that all objects retrieve
*/
class HTMLPurifier_Generator
{
/**
* Bool cache of %Core.CleanUTF8DuringGeneration
* @private
*/
var $_clean_utf8 = false;
/**
* Bool cache of %Core.XHTML
* Bool cache of %HTML.XHTML
* @private
*/
var $_xhtml = true;
/**
* Bool cache of %Output.CommentScriptContents
* @private
*/
var $_scriptFix = false;
/**
* Cache of HTMLDefinition
* @private
*/
var $_def;
/**
* Generates HTML from an array of tokens.
* @param $tokens Array of HTMLPurifier_Token
@@ -63,13 +69,24 @@ class HTMLPurifier_Generator
function generateFromTokens($tokens, $config, &$context) {
$html = '';
if (!$config) $config = HTMLPurifier_Config::createDefault();
$this->_clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration');
$this->_xhtml = $config->get('Core', 'XHTML');
$this->_scriptFix = $config->get('Output', 'CommentScriptContents');
$this->_def = $config->getHTMLDefinition();
$this->_xhtml = $this->_def->doctype->xml;
if (!$tokens) return '';
foreach ($tokens as $token) {
$html .= $this->generateFromToken($token);
for ($i = 0, $size = count($tokens); $i < $size; $i++) {
if ($this->_scriptFix && $tokens[$i]->name === 'script') {
// script special case
$html .= $this->generateFromToken($tokens[$i++]);
$html .= $this->generateScriptFromToken($tokens[$i++]);
while ($tokens[$i]->name != 'script') {
$html .= $this->generateScriptFromToken($tokens[$i++]);
}
}
$html .= $this->generateFromToken($tokens[$i]);
}
if ($config->get('Core', 'TidyFormat') && extension_loaded('tidy')) {
if ($config->get('Output', 'TidyFormat') && extension_loaded('tidy')) {
$tidy_options = array(
'indent'=> true,
@@ -104,14 +121,14 @@ class HTMLPurifier_Generator
function generateFromToken($token) {
if (!isset($token->type)) return '';
if ($token->type == 'start') {
$attr = $this->generateAttributes($token->attr);
$attr = $this->generateAttributes($token->attr, $token->name);
return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
} elseif ($token->type == 'end') {
return '</' . $token->name . '>';
} elseif ($token->type == 'empty') {
$attr = $this->generateAttributes($token->attr);
$attr = $this->generateAttributes($token->attr, $token->name);
return '<' . $token->name . ($attr ? ' ' : '') . $attr .
( $this->_xhtml ? ' /': '' )
. '>';
@@ -125,18 +142,33 @@ class HTMLPurifier_Generator
}
}
/**
* Special case processor for the contents of script tags
* @warning This runs into problems if there's already a literal
* --> somewhere inside the script contents.
*/
function generateScriptFromToken($token) {
if (!$token->type == 'text') return $this->generateFromToken($token);
return '<!--' . PHP_EOL . $token->data . PHP_EOL . '// -->';
// more advanced version:
// return '<!--//--><![CDATA[//><!--' . PHP_EOL . $token->data . PHP_EOL . '//--><!]]>';
}
/**
* Generates attribute declarations from attribute array.
* @param $assoc_array_of_attributes Attribute array
* @return Generate HTML fragment for insertion.
*/
function generateAttributes($assoc_array_of_attributes) {
function generateAttributes($assoc_array_of_attributes, $element) {
$html = '';
foreach ($assoc_array_of_attributes as $key => $value) {
if (!$this->_xhtml) {
// remove namespaced attributes
if (strpos($key, ':') !== false) continue;
// also needed: check for attribute minimization
if (!empty($this->_def->info[$element]->attr[$key]->minimized)) {
$html .= $key . ' ';
continue;
}
}
$html .= $key.'="'.$this->escape($value).'" ';
}
@@ -149,7 +181,6 @@ class HTMLPurifier_Generator
* @return String escaped data.
*/
function escape($string) {
if ($this->_clean_utf8) $string = HTMLPurifier_Lexer::cleanUTF8($string);
return htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
}