Malformed UTF-8 and non-SGML character detection and cleaning implemented

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@303 48356398-32a2-884e-a903-53898d9a118a
2025-08-09 15:47:25 +02:00 · 2006-08-19 17:53:59 +00:00
parent 53808ee34a
commit 973cc43b64
11 changed files with 131 additions and 58 deletions
--- a/smoketests/common.php
+++ b/smoketests/common.php
@@ -0,0 +1,14 @@
+<?php
+
+header('Content-type: text/html; charset=UTF-8');
+
+set_include_path('../library' . PATH_SEPARATOR . get_include_path());
+require_once 'HTMLPurifier.php';
+
+function escapeHTML($string) {
+    // Run the lexer's UTF-8 cleaner first so htmlspecialchars() sees cleaned input.
+    $cleaned = HTMLPurifier_Lexer::cleanUTF8($string);
+    return htmlspecialchars($cleaned, ENT_COMPAT, 'UTF-8');
+}
+
+?>
--- a/smoketests/utf8.php
+++ b/smoketests/utf8.php
@@ -1,7 +1,6 @@
 <?php

-// there must not be a byte order mark
-header('Content-type: text/html; charset=UTF-8');
+require_once 'common.php';

 ?><!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
@@ -15,9 +14,6 @@ header('Content-type: text/html; charset=UTF-8');
 <h1>HTMLPurifier UTF-8 Smoketest</h1>
 <?php

-set_include_path('../library' . PATH_SEPARATOR . get_include_path());
-require_once 'HTMLPurifier.php';
-
 $purifier = new HTMLPurifier();
 $string = '
 <ul>
--- a/smoketests/variableWidthAttack.php
+++ b/smoketests/variableWidthAttack.php
@@ -1,6 +1,6 @@
 <?php

-header('Content-type: text/html; charset=UTF-8');
+require_once 'common.php';

 ?><!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
@@ -19,16 +19,8 @@ in Internet Explorer, if it works at all.</p>
 <h2>Test</h2>
 <?php

-set_include_path('../library' . PATH_SEPARATOR . get_include_path());
-require_once 'HTMLPurifier.php';
 $purifier = new HTMLPurifier();

-function escape($string) {
-    $string = htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
-    $string = iconv('UTF-8', 'UTF-8//IGNORE', $string);
-    return $string;
-}
-
 ?>
 <table>
 <thead><tr><th>ASCII</th><th width="30%">Raw</th><th>Output</th><th>Render</th></tr></thead>
@@ -44,8 +36,8 @@ for ($i = 0; $i < 256; $i++) {
 ?>
 <tr>
    <td><?php echo $i; ?></td>
-    <td style="font-size:8pt;"><?php echo escape($html); ?></td>
-    <td style="font-size:8pt;"><?php echo escape($pure_html); ?></td>
+    <td style="font-size:8pt;"><?php echo escapeHTML($html); ?></td>
+    <td style="font-size:8pt;"><?php echo escapeHTML($pure_html); ?></td>
    <td><?php echo $pure_html; ?></td>
 </tr>
 <?php } ?>
@@ -54,9 +46,8 @@ for ($i = 0; $i < 256; $i++) {

 <h2>Analysis</h2>

-<p>This test currently passes the XSS aspect but fails the validation aspect
-due to generalized encoding issues.  An augmented UTF-8 smoketest is 
-pending, until then, consider this a pass.</p>
+<p>By making sure that UTF-8 is well formed and non-SGML codepoints are
+removed, as well as escaping quotes outside of tags, this is a non-threat.</p>

 </body>
 </html>
--- a/smoketests/xssAttacks.php
+++ b/smoketests/xssAttacks.php
@@ -1,6 +1,6 @@
 <?php

-header('Content-type: text/html; charset=UTF-8');
+require_once('common.php');

 ?><!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
@@ -23,9 +23,6 @@ relevant.</p>

 if (version_compare(PHP_VERSION, '5', '<')) exit('<p>Requires PHP 5.</p>');

-set_include_path('../library' . PATH_SEPARATOR . get_include_path());
-require_once 'HTMLPurifier.php';
-
 $xml = simplexml_load_file('xssAttacks.xml');
 $purifier = new HTMLPurifier();

@@ -43,10 +40,10 @@ foreach ($xml->attack as $attack) {
    if ($attack->name == 'US-ASCII encoding') $code = urldecode($code);
 ?>
    <tr>
-        <td><?php echo htmlspecialchars($attack->name); ?></td>
-        <td><textarea readonly="readonly" cols="20" rows="2"><?php echo htmlspecialchars($code); ?></textarea></td>
+        <td><?php echo escapeHTML($attack->name); ?></td>
+        <td><textarea readonly="readonly" cols="20" rows="2"><?php echo escapeHTML($code); ?></textarea></td>
        <?php $pure_html = $purifier->purify($code); ?>
-        <td><textarea readonly="readonly" cols="20" rows="2"><?php echo htmlspecialchars($pure_html); ?></textarea></td>
+        <td><textarea readonly="readonly" cols="20" rows="2"><?php echo escapeHTML($pure_html); ?></textarea></td>
        <td><?php echo $pure_html ?></td>
    </tr>
 <?php