From 8c4432b4b2238ade25ef744d24230bd14cc8df0b Mon Sep 17 00:00:00 2001
From: SteveD <steved@e107.org>
Date: Sat, 5 Jan 2013 09:42:34 +0000
Subject: [PATCH] Keep HTML abuse filter in line with 1.x. Remove isutf8 flag -
 no longer relevant.

---
 e107_handlers/e_parse_class.php | 53 +++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 19 deletions(-)
diff --git a/e107_handlers/e_parse_class.php b/e107_handlers/e_parse_class.php
index 75d4ceeef..c8a3b457d 100644
--- a/e107_handlers/e_parse_class.php
+++ b/e107_handlers/e_parse_class.php
@@ -31,13 +31,6 @@ define("E_NL", chr(2));
 
 class e_parse
 {
-	/**
-	 * Flag for global use indicates whether utf-8 character set
-	 *
-	 * @var boolean
-	 */
-	protected $isutf8 = FALSE;
-
 	/**
 	 * Determine how to handle utf-8.
 	 *    0 = 'do nothing'
@@ -275,7 +268,6 @@ class e_parse
 // CHARSET is utf-8
 //		if(strtolower(CHARSET) == 'utf-8')
 //		{
-			$this->isutf8 = TRUE;
 			if(version_compare(PHP_VERSION, '6.0.0') < 1)
 			{
 				// Need to do something here
@@ -530,7 +522,8 @@ class e_parse
 
 
 	/**
-	 *	Check for HTML closing tag for input elements, without corresponding opening tag
+	 *	Check for unmatched 'dangerous' HTML tags
+	 *		(these can destroy page layout where users are able to post HTML)
 	 *
 	 *	@param string $data
 	 *	@param string $tagList - if empty, uses default list of input tags. Otherwise a CSV list of tags to check (any type)
@@ -548,17 +541,41 @@ class e_parse
 		{
 			$checkTags = explode(',', $tagList);
 		}
-		$data = strtolower(preg_replace('#\[code.*?\[\/code\]#i', '', $data));		// Ignore code blocks. All lower case simplifies subsequent processing
-		foreach ($checkTags as $tag)
+		$tagArray = array_flip($checkTags);
+		foreach ($tagArray as &$v) { $v = 0; };		// Data fields become zero; keys are tag names.
+		$data = strtolower(preg_replace('#\[code\].*?\[\/code\]#i', '', $data));            // Ignore code blocks. All lower case simplifies the rest
+		$matches = array();
+		if (!preg_match_all('#<(\/|)([^<>]*?[^\/])>#', $data, $matches, PREG_SET_ORDER))
 		{
-			$aCount = substr_count($data,  '<'.$tag);			// Count opening tags
-			$bCount = substr_count($data,  '</'.$tag);			// Count closing tags
-			if ($aCount != $bCount)
-			{
-				return TRUE;		// Potentially abusive HTML found - tags don't balance
+			//echo "No tags found<br />";
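+			return TRUE;				// No tags found; so all OK. NOTE(review): every other return here uses TRUE for 'abuse found' and FALSE for OK - this looks inverted; confirm against callers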
+		}
+		//print_a($matches);
+		foreach ($matches as $m)
+		{
+			// $m[0] is the complete tag; $m[1] is '/' or empty; $m[2] is the tag and any attributes
+			list ($tag) = explode(' ', $m[2], 2);
+			if (!isset($tagArray[$tag])) continue;			// Not a tag of interest
+			if ($m[1] == '/')
+			{	// Closing tag
+				if ($tagArray[$tag] == 0) 
+				{
+					//echo "Close before open: {$tag}<br />";
+					return TRUE;		// Closing tag before we've had an opening tag
+				}
+				$tagArray[$tag]--;		// Obviously had at least one opening tag
+			}
+			else
+			{	// Opening tag
+				$tagArray[$tag]++;
 			}
 		}
-		return FALSE;		// Nothing detected
+		//print_a($tagArray);
+		foreach ($tagArray as $t)
+		{
+			if ($t > 0) return TRUE;		// More opening tags than closing tags
+		}
+		return FALSE;						// OK now
 	}
 
 
@@ -1512,8 +1529,6 @@ class e_parse
 							}
 							else
 							{
-								// CHARSET is utf-8 - e_parse_class.php too
-								//$email_text = ($this->isutf8) ? "\\1\\2©\\3" : "\\1\\2&copy;\\3";
 								$email_text = '$1$2©$3';
 
 //								$sub_blk = preg_replace("#(^|[\s])([\w]+?://(?:[\w-%]+?)(?:\.[\w-%]+?)+.*?)(?=$|[\s()[\]<]|\.\s|\.$|,\s|,$)#is", "\\1<a href=\"\\2\" rel=\"external\">\\2</a>", $sub_blk);