Merged 463:474 for 1.1.2 release.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/1.1@475 48356398-32a2-884e-a903-53898d9a118a
2025-10-28 18:54:10 +01:00 · 2006-09-30 19:10:07 +00:00
parent 6ef8abd04f
commit 8104145580
24 changed files with 554 additions and 405 deletions
--- a/library/HTMLPurifier/AttrDef.php
+++ b/library/HTMLPurifier/AttrDef.php
@@ -48,7 +48,16 @@ class HTMLPurifier_AttrDef
     * 
     * @note This method is not entirely standards compliant, as trim() removes
     *       more types of whitespace than specified in the spec. In practice,
-     *       this is rarely a problem.
+     *       this is rarely a problem, as those extra characters usually have
+     *       already been removed by HTMLPurifier_Encoder.
+     * 
+     * @warning This processing is inconsistent with XML's whitespace handling
+     *          as specified by XML 1.0 section 3.3.3 and referenced by XHTML 1.0 section
+     *          4.7.  Compliant processing requires all line breaks normalized
+     *          to "\n", so the fix is not as simple as fixing it in this
+     *          function.  Trim and whitespace collapsing are supposed to only
+     *          occur in NMTOKENs.  However, note that we are NOT necessarily
+     *          parsing XML, thus, this behavior may still be correct.
     * 
     * @public
     */
--- a/library/HTMLPurifier/ChildDef.php
+++ b/library/HTMLPurifier/ChildDef.php
@@ -56,6 +56,8 @@ class HTMLPurifier_ChildDef
 * 
 * @warning Currently this class is an all or nothing proposition, that is,
 *          it will only give a bool return value.
+ * @note This class is currently not used by any code, although it is unit
+ *       tested.
 */
 class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
 {
--- a/library/HTMLPurifier/Config.php
+++ b/library/HTMLPurifier/Config.php
@@ -26,12 +26,12 @@ class HTMLPurifier_Config
    var $def;
    
    /**
-     * Instance of HTMLPurifier_HTMLDefinition
+     * Cached instance of HTMLPurifier_HTMLDefinition
     */
    var $html_definition;
    
    /**
-     * Instance of HTMLPurifier_CSSDefinition
+     * Cached instance of HTMLPurifier_CSSDefinition
     */
    var $css_definition;
    
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -60,6 +60,60 @@ class HTMLPurifier_Lexer
        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }
    
+    
+    /**
+     * Lookup table (for strtr) from common special-entity spellings to raw characters.
+     * @protected
+     */
+    var $_special_entity2str =
+            array(
+                    '&quot;' => '"',
+                    '&amp;'  => '&',
+                    '&lt;'   => '<',
+                    '&gt;'   => '>',
+                    '&#39;'  => "'",
+                    '&#039;' => "'",
+                    '&#x27;' => "'"
+            );
+    
+    /**
+     * Parses special entities into the proper characters.
+     * 
+     * This method translates the escaped forms of the special characters
+     * listed in $_special_entity2str into the characters they stand for.
+     * 
+     * @warning
+     * You should be able to treat the output of this function as
+     * completely parsed, but that's only because all other entities should
+     * have been handled previously in substituteNonSpecialEntities().
+     * 
+     * @param $string String character data to be parsed.
+     * @returns Parsed character data (an empty string is returned as-is).
+     */
+    function parseData($string) {
+        
+        // the last-character lookups below need at least one character
+        if ($string === '') return '';
+        
+        // count '&'s, minus those that cannot start an entity ("& " or a final '&')
+        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
+            ($string[strlen($string)-1] === '&' ? 1 : 0);
+        
+        if (!$num_amp) return $string; // abort if no candidate entities exist
+        $num_esc_amp = substr_count($string, '&amp;');
+        $string = strtr($string, $this->_special_entity2str);
+        
+        // same count again after the strtr() pass (duplicated for speed, see above)
+        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
+            ($string[strlen($string)-1] === '&' ? 1 : 0);
+        
+        if ($num_amp_2 <= $num_esc_amp) return $string; // no more '&'s than &amp;s seen: done
+        
+        // some less common entities remain; hand them to the entity parser.
+        $string = $this->_entity_parser->substituteSpecialEntities($string);
+        return $string;
+    }
+    
    var $_encoder;
    
    /**
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php';
 * completely eventually.
 * 
 * @todo Reread XML spec and document differences.
- * @todo Add support for CDATA sections.
- * @todo Determine correct behavior in outputting comment data. (preserve dashes?)
- * @todo Optimize main function tokenizeHTML().
- * @todo Less than sign (<) being prohibited (even as entity) in attr-values?
+ * 
+ * @todo Determine correct behavior in transforming comment data. (preserve dashes?)
 */
 class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
 {
    
-    /**
-     * Most common entity to raw value conversion table for special entities.
-     * @protected
-     */
-    var $_special_entity2str =
-            array(
-                    '&quot;' => '"',
-                    '&amp;'  => '&',
-                    '&lt;'   => '<',
-                    '&gt;'   => '>',
-                    '&#39;'  => "'",
-                    '&#039;' => "'",
-                    '&#x27;' => "'"
-            );
-    
-    /**
-     * Parses special entities into the proper characters.
-     * 
-     * This string will translate escaped versions of the special characters
-     * into the correct ones.
-     * 
-     * @warning
-     * You should be able to treat the output of this function as
-     * completely parsed, but that's only because all other entities should
-     * have been handled previously in substituteNonSpecialEntities()
-     * 
-     * @param $string String character data to be parsed.
-     * @returns Parsed character data.
-     */
-    function parseData($string) {
-        
-        // subtracts amps that cannot possibly be escaped
-        $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
-            ($string[strlen($string)-1] === '&' ? 1 : 0);
-        
-        if (!$num_amp) return $string; // abort if no entities
-        $num_esc_amp = substr_count($string, '&amp;');
-        $string = strtr($string, $this->_special_entity2str);
-        
-        // code duplication for sake of optimization, see above
-        $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
-            ($string[strlen($string)-1] === '&' ? 1 : 0);
-        
-        if ($num_amp_2 <= $num_esc_amp) return $string;
-        
-        // hmm... now we have some uncommon entities. Use the callback.
-        $string = $this->_entity_parser->substituteSpecialEntities($string);
-        return $string;
-    }
-    
    /**
     * Whitespace characters for str(c)spn.
     * @protected
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php';
 * whatever it does for poorly formed HTML is up to it.
 * 
 * @todo Generalize so that XML_HTMLSax is also supported.
+ * 
+ * @warning Entity-resolution inside attributes is broken.
 */

 class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
@@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        $parser->set_element_handler('openHandler','closeHandler');
        $parser->set_data_handler('dataHandler');
        $parser->set_escape_handler('escapeHandler');
+        
+        // doesn't seem to work correctly for attributes
        $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
        
        $parser->parse($string);
@@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
     * Open tag event handler, interface is defined by PEAR package.
     */
    function openHandler(&$parser, $name, $attrs, $closed) {
+        // entities are not resolved in attrs
+        foreach ($attrs as $key => $attr) {
+            $attrs[$key] = $this->parseData($attr);
+        }
        if ($closed) {
            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
        } else {
--- a/library/HTMLPurifier/URIScheme/ftp.php
+++ b/library/HTMLPurifier/URIScheme/ftp.php
@@ -4,7 +4,6 @@ require_once 'HTMLPurifier/URIScheme.php';

 /**
 * Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
- * @todo Typecode check on path
 */
 class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
    
@@ -16,7 +15,27 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
        list($userinfo, $host, $port, $path, $query) = 
            parent::validateComponents(
                $userinfo, $host, $port, $path, $query, $config );
-        // typecode check needed on path
+        $semicolon_pos = strrpos($path, ';'); // reverse
+        if ($semicolon_pos !== false) {
+            // typecode check
+            $type = substr($path, $semicolon_pos + 1); // no semicolon
+            $path = substr($path, 0, $semicolon_pos);
+            $type_ret = '';
+            if (strpos($type, '=') !== false) {
+                // figure out whether or not the declaration is correct
+                list($key, $typecode) = explode('=', $type, 2);
+                if ($key !== 'type') {
+                    // invalid key, tack it back on encoded
+                    $path .= '%3B' . $type;
+                } elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') {
+                    $type_ret = ";type=$typecode";
+                }
+            } else {
+                $path .= '%3B' . $type;
+            }
+            $path = str_replace(';', '%3B', $path);
+            $path .= $type_ret;
+        }
        return array($userinfo, $host, $port, $path, null);
    }