1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-06 14:16:32 +02:00

Merged 463:474 for 1.1.2 release.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/1.1@475 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2006-09-30 19:10:07 +00:00
parent 6ef8abd04f
commit 8104145580
24 changed files with 554 additions and 405 deletions

View File

@@ -48,7 +48,16 @@ class HTMLPurifier_AttrDef
*
* @note This method is not entirely standards compliant, as trim() removes
* more types of whitespace than specified in the spec. In practice,
* this is rarely a problem.
* this is rarely a problem, as those extra characters usually have
* already been removed by HTMLPurifier_Encoder.
*
* @warning This processing is inconsistent with XML's whitespace handling
* as specified by section 3.3.3 and referenced XHTML 1.0 section
* 4.7. Compliant processing requires all line breaks normalized
* to "\n", so the fix is not as simple as fixing it in this
* function. Trim and whitespace collapsing are supposed to only
* occur in NMTOKENs. However, note that we are NOT necessarily
* parsing XML, thus, this behavior may still be correct.
*
* @public
*/

View File

@@ -56,6 +56,8 @@ class HTMLPurifier_ChildDef
*
* @warning Currently this class is an all or nothing proposition, that is,
* it will only give a bool return value.
* @note This class is currently not used by any code, although it is unit
* tested.
*/
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
{

View File

@@ -26,12 +26,12 @@ class HTMLPurifier_Config
var $def;
/**
* Instance of HTMLPurifier_HTMLDefinition
* Cached instance of HTMLPurifier_HTMLDefinition
*/
var $html_definition;
/**
* Instance of HTMLPurifier_CSSDefinition
* Cached instance of HTMLPurifier_CSSDefinition
*/
var $css_definition;

View File

@@ -60,6 +60,60 @@ class HTMLPurifier_Lexer
$this->_entity_parser = new HTMLPurifier_EntityParser();
}
/**
* Most common entity to raw value conversion table for special entities.
* @protected
*/
var $_special_entity2str =
array(
'"' => '"',
'&' => '&',
'&lt;' => '<',
'&gt;' => '>',
'&#39;' => "'",
'&#039;' => "'",
'&#x27;' => "'"
);
/**
* Parses special entities into the proper characters.
*
* This string will translate escaped versions of the special characters
* into the correct ones.
*
* @warning
* You should be able to treat the output of this function as
* completely parsed, but that's only because all other entities should
* have been handled previously in substituteNonSpecialEntities()
*
* @param $string String character data to be parsed.
* @returns Parsed character data.
*/
function parseData($string) {
// following functions require at least one character
if ($string === '') return '';
// subtracts amps that cannot possibly be escaped
$num_amp = substr_count($string, '&') - substr_count($string, '& ') -
($string[strlen($string)-1] === '&' ? 1 : 0);
if (!$num_amp) return $string; // abort if no entities
$num_esc_amp = substr_count($string, '&amp;');
$string = strtr($string, $this->_special_entity2str);
// code duplication for sake of optimization, see above
$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
($string[strlen($string)-1] === '&' ? 1 : 0);
if ($num_amp_2 <= $num_esc_amp) return $string;
// hmm... now we have some uncommon entities. Use the callback.
$string = $this->_entity_parser->substituteSpecialEntities($string);
return $string;
}
var $_encoder;
/**

View File

@@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php';
* completely eventually.
*
* @todo Reread XML spec and document differences.
* @todo Add support for CDATA sections.
* @todo Determine correct behavior in outputting comment data. (preserve dashes?)
* @todo Optimize main function tokenizeHTML().
* @todo Less than sign (<) being prohibited (even as entity) in attr-values?
*
* @todo Determine correct behavior in transforming comment data. (preserve dashes?)
*/
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{
/**
* Most common entity to raw value conversion table for special entities.
* @protected
*/
var $_special_entity2str =
array(
'&quot;' => '"',
'&amp;' => '&',
'&lt;' => '<',
'&gt;' => '>',
'&#39;' => "'",
'&#039;' => "'",
'&#x27;' => "'"
);
/**
* Parses special entities into the proper characters.
*
* This string will translate escaped versions of the special characters
* into the correct ones.
*
* @warning
* You should be able to treat the output of this function as
* completely parsed, but that's only because all other entities should
* have been handled previously in substituteNonSpecialEntities()
*
* @param $string String character data to be parsed.
* @returns Parsed character data.
*/
function parseData($string) {
// subtracts amps that cannot possibly be escaped
$num_amp = substr_count($string, '&') - substr_count($string, '& ') -
($string[strlen($string)-1] === '&' ? 1 : 0);
if (!$num_amp) return $string; // abort if no entities
$num_esc_amp = substr_count($string, '&amp;');
$string = strtr($string, $this->_special_entity2str);
// code duplication for sake of optimization, see above
$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
($string[strlen($string)-1] === '&' ? 1 : 0);
if ($num_amp_2 <= $num_esc_amp) return $string;
// hmm... now we have some uncommon entities. Use the callback.
$string = $this->_entity_parser->substituteSpecialEntities($string);
return $string;
}
/**
* Whitespace characters for str(c)spn.
* @protected

View File

@@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php';
* whatever it does for poorly formed HTML is up to it.
*
* @todo Generalize so that XML_HTMLSax is also supported.
*
* @warning Entity-resolution inside attributes is broken.
*/
class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
@@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
$parser->set_element_handler('openHandler','closeHandler');
$parser->set_data_handler('dataHandler');
$parser->set_escape_handler('escapeHandler');
// doesn't seem to work correctly for attributes
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
$parser->parse($string);
@@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
* Open tag event handler, interface is defined by PEAR package.
*/
function openHandler(&$parser, $name, $attrs, $closed) {
// entities are not resolved in attrs
foreach ($attrs as $key => $attr) {
$attrs[$key] = $this->parseData($attr);
}
if ($closed) {
$this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
} else {

View File

@@ -4,7 +4,6 @@ require_once 'HTMLPurifier/URIScheme.php';
/**
* Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
* @todo Typecode check on path
*/
class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
@@ -16,7 +15,27 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
list($userinfo, $host, $port, $path, $query) =
parent::validateComponents(
$userinfo, $host, $port, $path, $query, $config );
// typecode check needed on path
$semicolon_pos = strrpos($path, ';'); // reverse
if ($semicolon_pos !== false) {
// typecode check
$type = substr($path, $semicolon_pos + 1); // no semicolon
$path = substr($path, 0, $semicolon_pos);
$type_ret = '';
if (strpos($type, '=') !== false) {
// figure out whether or not the declaration is correct
list($key, $typecode) = explode('=', $type, 2);
if ($key !== 'type') {
// invalid key, tack it back on encoded
$path .= '%3B' . $type;
} elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') {
$type_ret = ";type=$typecode";
}
} else {
$path .= '%3B' . $type;
}
$path = str_replace(';', '%3B', $path);
$path .= $type_ret;
}
return array($userinfo, $host, $port, $path, null);
}