mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-06 14:16:32 +02:00
Merged 463:474 for 1.1.2 release.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/1.1@475 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
@@ -48,7 +48,16 @@ class HTMLPurifier_AttrDef
|
||||
*
|
||||
* @note This method is not entirely standards compliant, as trim() removes
|
||||
* more types of whitespace than specified in the spec. In practice,
|
||||
* this is rarely a problem.
|
||||
* this is rarely a problem, as those extra characters usually have
|
||||
* already been removed by HTMLPurifier_Encoder.
|
||||
*
|
||||
* @warning This processing is inconsistent with XML's whitespace handling
|
||||
* as specified by section 3.3.3 and referenced XHTML 1.0 section
|
||||
* 4.7. Compliant processing requires all line breaks normalized
|
||||
* to "\n", so the fix is not as simple as fixing it in this
|
||||
* function. Trim and whitespace collapsing are supposed to only
|
||||
* occur in NMTOKENs. However, note that we are NOT necessarily
|
||||
* parsing XML, thus, this behavior may still be correct.
|
||||
*
|
||||
* @public
|
||||
*/
|
||||
|
@@ -56,6 +56,8 @@ class HTMLPurifier_ChildDef
|
||||
*
|
||||
* @warning Currently this class is an all or nothing proposition, that is,
|
||||
* it will only give a bool return value.
|
||||
* @note This class is currently not used by any code, although it is unit
|
||||
* tested.
|
||||
*/
|
||||
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
|
||||
{
|
||||
|
@@ -26,12 +26,12 @@ class HTMLPurifier_Config
|
||||
var $def;
|
||||
|
||||
/**
|
||||
* Instance of HTMLPurifier_HTMLDefinition
|
||||
* Cached instance of HTMLPurifier_HTMLDefinition
|
||||
*/
|
||||
var $html_definition;
|
||||
|
||||
/**
|
||||
* Instance of HTMLPurifier_CSSDefinition
|
||||
* Cached instance of HTMLPurifier_CSSDefinition
|
||||
*/
|
||||
var $css_definition;
|
||||
|
||||
|
@@ -60,6 +60,60 @@ class HTMLPurifier_Lexer
|
||||
$this->_entity_parser = new HTMLPurifier_EntityParser();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Most common entity to raw value conversion table for special entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_special_entity2str =
|
||||
array(
|
||||
'"' => '"',
|
||||
'&' => '&',
|
||||
'<' => '<',
|
||||
'>' => '>',
|
||||
''' => "'",
|
||||
''' => "'",
|
||||
''' => "'"
|
||||
);
|
||||
|
||||
/**
|
||||
* Parses special entities into the proper characters.
|
||||
*
|
||||
* This string will translate escaped versions of the special characters
|
||||
* into the correct ones.
|
||||
*
|
||||
* @warning
|
||||
* You should be able to treat the output of this function as
|
||||
* completely parsed, but that's only because all other entities should
|
||||
* have been handled previously in substituteNonSpecialEntities()
|
||||
*
|
||||
* @param $string String character data to be parsed.
|
||||
* @returns Parsed character data.
|
||||
*/
|
||||
function parseData($string) {
|
||||
|
||||
// following functions require at least one character
|
||||
if ($string === '') return '';
|
||||
|
||||
// subtracts amps that cannot possibly be escaped
|
||||
$num_amp = substr_count($string, '&') - substr_count($string, '& ') -
|
||||
($string[strlen($string)-1] === '&' ? 1 : 0);
|
||||
|
||||
if (!$num_amp) return $string; // abort if no entities
|
||||
$num_esc_amp = substr_count($string, '&');
|
||||
$string = strtr($string, $this->_special_entity2str);
|
||||
|
||||
// code duplication for sake of optimization, see above
|
||||
$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
|
||||
($string[strlen($string)-1] === '&' ? 1 : 0);
|
||||
|
||||
if ($num_amp_2 <= $num_esc_amp) return $string;
|
||||
|
||||
// hmm... now we have some uncommon entities. Use the callback.
|
||||
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
||||
return $string;
|
||||
}
|
||||
|
||||
var $_encoder;
|
||||
|
||||
/**
|
||||
|
@@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php';
|
||||
* completely eventually.
|
||||
*
|
||||
* @todo Reread XML spec and document differences.
|
||||
* @todo Add support for CDATA sections.
|
||||
* @todo Determine correct behavior in outputting comment data. (preserve dashes?)
|
||||
* @todo Optimize main function tokenizeHTML().
|
||||
* @todo Less than sign (<) being prohibited (even as entity) in attr-values?
|
||||
*
|
||||
* @todo Determine correct behavior in transforming comment data. (preserve dashes?)
|
||||
*/
|
||||
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
{
|
||||
|
||||
/**
|
||||
* Most common entity to raw value conversion table for special entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_special_entity2str =
|
||||
array(
|
||||
'"' => '"',
|
||||
'&' => '&',
|
||||
'<' => '<',
|
||||
'>' => '>',
|
||||
''' => "'",
|
||||
''' => "'",
|
||||
''' => "'"
|
||||
);
|
||||
|
||||
/**
|
||||
* Parses special entities into the proper characters.
|
||||
*
|
||||
* This string will translate escaped versions of the special characters
|
||||
* into the correct ones.
|
||||
*
|
||||
* @warning
|
||||
* You should be able to treat the output of this function as
|
||||
* completely parsed, but that's only because all other entities should
|
||||
* have been handled previously in substituteNonSpecialEntities()
|
||||
*
|
||||
* @param $string String character data to be parsed.
|
||||
* @returns Parsed character data.
|
||||
*/
|
||||
function parseData($string) {
|
||||
|
||||
// subtracts amps that cannot possibly be escaped
|
||||
$num_amp = substr_count($string, '&') - substr_count($string, '& ') -
|
||||
($string[strlen($string)-1] === '&' ? 1 : 0);
|
||||
|
||||
if (!$num_amp) return $string; // abort if no entities
|
||||
$num_esc_amp = substr_count($string, '&');
|
||||
$string = strtr($string, $this->_special_entity2str);
|
||||
|
||||
// code duplication for sake of optimization, see above
|
||||
$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
|
||||
($string[strlen($string)-1] === '&' ? 1 : 0);
|
||||
|
||||
if ($num_amp_2 <= $num_esc_amp) return $string;
|
||||
|
||||
// hmm... now we have some uncommon entities. Use the callback.
|
||||
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
||||
return $string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whitespace characters for str(c)spn.
|
||||
* @protected
|
||||
|
@@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php';
|
||||
* whatever it does for poorly formed HTML is up to it.
|
||||
*
|
||||
* @todo Generalize so that XML_HTMLSax is also supported.
|
||||
*
|
||||
* @warning Entity-resolution inside attributes is broken.
|
||||
*/
|
||||
|
||||
class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
||||
@@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
||||
$parser->set_element_handler('openHandler','closeHandler');
|
||||
$parser->set_data_handler('dataHandler');
|
||||
$parser->set_escape_handler('escapeHandler');
|
||||
|
||||
// doesn't seem to work correctly for attributes
|
||||
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
|
||||
|
||||
$parser->parse($string);
|
||||
@@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
||||
* Open tag event handler, interface is defined by PEAR package.
|
||||
*/
|
||||
function openHandler(&$parser, $name, $attrs, $closed) {
|
||||
// entities are not resolved in attrs
|
||||
foreach ($attrs as $key => $attr) {
|
||||
$attrs[$key] = $this->parseData($attr);
|
||||
}
|
||||
if ($closed) {
|
||||
$this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
|
||||
} else {
|
||||
|
@@ -4,7 +4,6 @@ require_once 'HTMLPurifier/URIScheme.php';
|
||||
|
||||
/**
|
||||
* Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
|
||||
* @todo Typecode check on path
|
||||
*/
|
||||
class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
|
||||
|
||||
@@ -16,7 +15,27 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
|
||||
list($userinfo, $host, $port, $path, $query) =
|
||||
parent::validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config );
|
||||
// typecode check needed on path
|
||||
$semicolon_pos = strrpos($path, ';'); // reverse
|
||||
if ($semicolon_pos !== false) {
|
||||
// typecode check
|
||||
$type = substr($path, $semicolon_pos + 1); // no semicolon
|
||||
$path = substr($path, 0, $semicolon_pos);
|
||||
$type_ret = '';
|
||||
if (strpos($type, '=') !== false) {
|
||||
// figure out whether or not the declaration is correct
|
||||
list($key, $typecode) = explode('=', $type, 2);
|
||||
if ($key !== 'type') {
|
||||
// invalid key, tack it back on encoded
|
||||
$path .= '%3B' . $type;
|
||||
} elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') {
|
||||
$type_ret = ";type=$typecode";
|
||||
}
|
||||
} else {
|
||||
$path .= '%3B' . $type;
|
||||
}
|
||||
$path = str_replace(';', '%3B', $path);
|
||||
$path .= $type_ret;
|
||||
}
|
||||
return array($userinfo, $host, $port, $path, null);
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user