From 0df1013fe36ee31ef731e788e870a9d10a64901c Mon Sep 17 00:00:00 2001 From: Cameron Date: Fri, 1 Mar 2013 18:17:03 -0800 Subject: [PATCH] Corrected <pre> and <code>
 handling. Added utf-8 support.
 Added attribute white-list per tag.

---
 e107_handlers/e_parse_class.php | 136 +++++++++++++++++++++-----------
 1 file changed, 92 insertions(+), 44 deletions(-)

diff --git a/e107_handlers/e_parse_class.php b/e107_handlers/e_parse_class.php
index 05ac4152d..7c8d4c9ce 100644
--- a/e107_handlers/e_parse_class.php
+++ b/e107_handlers/e_parse_class.php
@@ -29,7 +29,7 @@ define('E_UTF8_PACK', e_HANDLER.'utf8/');
 
 define("E_NL", chr(2));
 
-class e_parse
+class e_parse extends e_parser
 {
 	/**
 	 * Determine how to handle utf-8.
@@ -240,6 +240,7 @@ class e_parse
 	public function __construct()
 	{
 		// initialise the type of UTF-8 processing methods depending on PHP version and mb string extension
+		$this->init();
 		$this->initCharset();
 
 		// Preprocess the supermods to be useful default arrays with all values
@@ -2342,14 +2343,19 @@ class e_parse
  * Start Fresh and Build on it over time to become eventual replacement to e_parse. 
  * Cameron's DOM-based parser. 
  */
-class e_parser extends e_parse
+class e_parser
 {
-    private $domObj             = null;
+    public $domObj             = null;
     private $removedList        = array();
     private $nodesToDelete      = array();
     private $nodesToConvert     = array();
     private $pathList           = array();
-    private $allowedAttributes  = array('id','href','src','style','class', 'alt', 'title'); // allow posting of data-* ?
+    private $allowedAttributes  = array(
+                                    'default'   => array('id', 'style', 'class'),
+                                    'img'       => array('id', 'src', 'style', 'class', 'alt', 'title', 'width', 'height'),
+                                    'a'         => array('id', 'href', 'style', 'class', 'title'),
+                                  ); 
+    private $badAttrValues       = array("javascript[\s]*?:","alert\(","vbscript[\s]*?:","data:text/html", "mhtml[\s]*?:", "data:[\s]*?image");
     private $allowedTags        = array('html', 'body','div','a','img','table','tr', 'td', 'th', 'tbody', 'thead', 'colgroup', 'b', 
                                         'i', 'pre','code', 'strong', 'u', 'em','ul','li','img','h1','h2','h3','h4','h5','h6','p',
                                         'div','pre','section','article', 'blockquote','hgroup','aside','figure','span', 'video', 'br',
@@ -2358,7 +2364,7 @@ class e_parser extends e_parse
         
     public function __construct()
     {
-       $this->domObj = new DOMDocument();
+       $this->init();
     
          /*
         $meths = get_class_methods('DomDocument');
@@ -2366,6 +2372,15 @@ class e_parser extends e_parse
         print_a($meths);
         */        
     }  
+
+    /**
+     * Create the DOMDocument held in $this->domObj; called from the constructors of both e_parse and e_parser.
+     */
+    function init()
+    {
+        $this->domObj = new DOMDocument();    
+        
+    }
     
     /**
      * Set Allowed Tags. 
@@ -2403,18 +2418,18 @@ class e_parser extends e_parse
  
         echo "

Standard v2 Parser

"; echo "

\$tp->dataFilter()

"; - // echo $this->dataFilter($html); // Remove Comment for a real mess! - $sql->db_Mark_Time('Start Parser Test'); + // echo $tp->dataFilter($html); // Remove Comment for a real mess! + $sql->db_Mark_Time('------ Start Parser Test -------'); print_a($this->dataFilter($html)); $sql->db_Mark_Time('tp->dataFilter'); echo "

\$tp->toHtml()

"; - // echo $this->dataFilter($html); // Remove Comment for a real mess! + // echo $tp->dataFilter($html); // Remove Comment for a real mess! print_a($this->tohtml($html)); $sql->db_Mark_Time('tp->toHtml'); echo "

\$tp->toDB()

"; - // echo $this->dataFilter($html); // Remove Comment for a real mess! + // echo $tp->dataFilter($html); // Remove Comment for a real mess! print_a($this->toDB($html)); $sql->db_Mark_Time('tp->toDB'); @@ -2424,7 +2439,7 @@ class e_parser extends e_parse $cleaned = $this->cleanHtml($html); print_a($cleaned); $sql->db_Mark_Time('new Parser'); - + // $sql->db_Mark_Time('------ End Parser Test -------'); echo "

Processed & Rendered

"; echo $cleaned; @@ -2434,6 +2449,7 @@ class e_parser extends e_parse echo "

Removed Tags and Attributes

"; print_a($this->removedList); + // print_a($p); } @@ -2446,12 +2462,15 @@ class e_parser extends e_parse public function cleanHtml($html='') { if(!vartrue($html)){ return; } - - $html = 'html 5 test'.$html.''; // Set it up for processing. + + // $html = mb_convert_encoding($html, 'UTF-8'); + + $html = ''.$html.''; // Set it up for processing. $doc = $this->domObj; - - $doc->loadHTML($html); - $doc->resolveExternals = true; + + $doc->loadHTML($html); + $doc->encoding = 'UTF-8'; //FIXME + // $doc->resolveExternals = true; $tmp = $doc->getElementsByTagName('*'); @@ -2462,41 +2481,45 @@ class e_parser extends e_parse // $tag = strval(basename($path)); $tag = preg_replace('/([a-z0-9\[\]\/]*)?\/([\w]*)(\[(\d)*\])?$/i', "$2", $path); - $allowed = in_array($tag, $this->allowedTags); - if(!$allowed) + if(!in_array($tag, $this->allowedTags)) { - if(strpos($path,'/code/') !== false || strpos($path,'/pre/') !== false) // treat as html. + + if(strpos($path,'/code/') !== false || strpos($path,'/pre/') !== false) // treat as html. { $this->pathList[] = $path; - $this->nodesToConvert[] = $node->parentNode; // $node; + $this->nodesToConvert[] = $node->parentNode; // $node; continue; } $this->removedList['tags'][] = $tag; $this->nodesToDelete[] = $node; - continue; + continue; } - + foreach ($node->attributes as $attr) { $name = $attr->nodeName; - $value = $attr->nodeValue; // Check value against whitelist. + $value = $attr->nodeValue; + + $allow = varset($this->allowedAttributes[$tag], $this->allowedAttributes['default']); - if(!in_array($name, $this->allowedAttributes) ) + if(!in_array($name, $allow)) + { + $node->removeAttribute($name); + $this->removedList['attributes'][] = $name. " from <".$tag.">"; + continue; + } + + if(invalidAttributeVal( $value)) // Check value against blacklist. 
{ $node->removeAttribute($name); - $this->removedList['attributes'][] = $tag.'['.$name.']'; - } - else - { - if($this->inValidAttributeVal($value)) - { - $node->removeAttribute($name); - $node->setAttribute($name, '#---sanitized---#'); - $this->removedList['sanitized'][] = $tag.'['.$name.']'; - } - } + $node->setAttribute($name, '#---sanitized---#'); + $node->removeAttribute($name); + $node->setAttribute($name, '#---sanitized---#'); + $this->removedList['sanitized'][] = $tag.'['.$name.']'; + } } + } // Remove some stuff. @@ -2506,18 +2529,37 @@ class e_parser extends e_parse } // Convert and
 Tags to Htmlentities. 
-        foreach($this->nodesToConvert as $node) //TODO Work on code processing and highlighting. 
+        foreach($this->nodesToConvert as $node) //TODO Work on code processing and highlighting . 
         {
             $value = $node->C14N();
+
             $value = str_replace("
","",$value);
-            $node->nodeValue = htmlentities($value);
+            
+            if($node->nodeName == 'pre')
+            {
+                $value = substr($value,5);
+                $end = strrpos($value,"
"); + $value = substr($value,0,$end); + } + + if($node->nodeName == 'code') + { + $value = substr($value,6); + $end = strrpos($value,"
"); + $value = substr($value,0,$end); + } + + $value = htmlentities(htmlentities($value)); // Needed + $node->nodeValue = $value; } $cleaned = $doc->saveHTML(); - $cleaned = str_replace(array('','','','',''),'',$cleaned); // filter out tags. - + $cleaned = str_replace(array('','','','','','',''),'',$cleaned); // filter out tags. + + $cleaned = html_entity_decode($cleaned, ENT_QUOTES, 'UTF-8'); + return $cleaned; } @@ -2529,12 +2571,9 @@ class e_parser extends e_parse */ function invalidAttributeVal($val) { - // FIXME default (strict) match and filters for certain attributes (e.g. src, href, etc) - $invalid = array("javascript:","alert(","vbscript:","data:text/html", "mhtml:", "data:image"); - - foreach($invalid as $v) + foreach($this->badAttrValues as $v) // global list because a bad value is bad regardless of the attribute it's in. ;-) { - if(stripos($val,$v)!==false) //TODO More reliable check. + if(preg_match('/'.$v.'/i',$v)!==false) { return true; } @@ -2553,6 +2592,13 @@ class e_parser extends e_parse { $html = << +日本語
+简体中文
+Test +Test regex +
Hi there
@@ -2622,8 +2668,10 @@ Some example text






...



-Test +
Some text goes here
+ + EOF; return $html;