1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-09 15:47:25 +02:00

Malformed UTF-8 and non-SGML character detection and cleaning implemented

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@303 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2006-08-19 17:53:59 +00:00
parent 53808ee34a
commit 973cc43b64
11 changed files with 131 additions and 58 deletions

14
smoketests/common.php Normal file
View File

@@ -0,0 +1,14 @@
<?php
// Shared bootstrap for the smoketest pages.
// Declare the response as UTF-8 HTML; header() has to run before any output is sent.
header('Content-type: text/html; charset=UTF-8');
// Search ../library first so the require below can resolve HTMLPurifier.php.
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
require_once 'HTMLPurifier.php';
/**
 * Prepares a string for display as literal text in a smoketest page.
 *
 * The input is passed through HTMLPurifier_Lexer::cleanUTF8() and the result
 * is then escaped with htmlspecialchars() using ENT_COMPAT and UTF-8.
 *
 * NOTE(review): cleanUTF8() is defined outside this file; presumably it
 * removes malformed UTF-8 and non-SGML code points -- confirm against the
 * lexer before relying on that.
 *
 * @param string $string Raw text to display.
 * @return string The cleaned, HTML-escaped text.
 */
function escapeHTML($string) {
    $cleaned = HTMLPurifier_Lexer::cleanUTF8($string);
    return htmlspecialchars($cleaned, ENT_COMPAT, 'UTF-8');
}
?>

View File

@@ -1,7 +1,6 @@
<?php
// there must not be a byte order mark
header('Content-type: text/html; charset=UTF-8');
require_once 'common.php';
?><!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
@@ -15,9 +14,6 @@ header('Content-type: text/html; charset=UTF-8');
<h1>HTMLPurifier UTF-8 Smoketest</h1>
<?php
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
require_once 'HTMLPurifier.php';
$purifier = new HTMLPurifier();
$string = '
<ul>

View File

@@ -1,6 +1,6 @@
<?php
header('Content-type: text/html; charset=UTF-8');
require_once 'common.php';
?><!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
@@ -19,16 +19,8 @@ in Internet Explorer, if it works at all.</p>
<h2>Test</h2>
<?php
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
require_once 'HTMLPurifier.php';
$purifier = new HTMLPurifier();
/**
 * Escapes a string for display as literal text in an HTML page.
 *
 * Invalid UTF-8 byte sequences are dropped with iconv() before the string is
 * escaped, so malformed input still yields its readable remainder instead of
 * being discarded wholesale by htmlspecialchars().
 *
 * @param string $string Raw text, possibly containing malformed UTF-8.
 * @return string HTML-escaped text (double quotes escaped, single quotes not).
 */
function escape($string) {
    // Clean first: htmlspecialchars() returns an empty string when the input
    // is not valid for the declared charset, so cleaning afterwards is too late.
    $clean = iconv('UTF-8', 'UTF-8//IGNORE', $string);
    if ($clean === false) {
        // iconv could not perform the conversion; fall back to the raw input
        // and let htmlspecialchars() do its own validation.
        $clean = $string;
    }
    return htmlspecialchars($clean, ENT_COMPAT, 'UTF-8');
}
?>
<table>
<thead><tr><th>ASCII</th><th width="30%">Raw</th><th>Output</th><th>Render</th></tr></thead>
@@ -44,8 +36,8 @@ for ($i = 0; $i < 256; $i++) {
?>
<tr>
<td><?php echo $i; ?></td>
<td style="font-size:8pt;"><?php echo escape($html); ?></td>
<td style="font-size:8pt;"><?php echo escape($pure_html); ?></td>
<td style="font-size:8pt;"><?php echo escapeHTML($html); ?></td>
<td style="font-size:8pt;"><?php echo escapeHTML($pure_html); ?></td>
<td><?php echo $pure_html; ?></td>
</tr>
<?php } ?>
@@ -54,9 +46,8 @@ for ($i = 0; $i < 256; $i++) {
<h2>Analysis</h2>
<p>This test currently passes the XSS aspect but fails the validation aspect
due to generalized encoding issues. An augmented UTF-8 smoketest is
pending; until then, consider this a pass.</p>
<p>Because we make sure that UTF-8 is well formed, remove non-SGML codepoints,
and escape quotes outside of tags, this is a non-threat.</p>
</body>
</html>

View File

@@ -1,6 +1,6 @@
<?php
header('Content-type: text/html; charset=UTF-8');
require_once('common.php');
?><!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
@@ -23,9 +23,6 @@ relevant.</p>
if (version_compare(PHP_VERSION, '5', '<')) exit('<p>Requires PHP 5.</p>');
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
require_once 'HTMLPurifier.php';
$xml = simplexml_load_file('xssAttacks.xml');
$purifier = new HTMLPurifier();
@@ -43,10 +40,10 @@ foreach ($xml->attack as $attack) {
if ($attack->name == 'US-ASCII encoding') $code = urldecode($code);
?>
<tr>
<td><?php echo htmlspecialchars($attack->name); ?></td>
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo htmlspecialchars($code); ?></textarea></td>
<td><?php echo escapeHTML($attack->name); ?></td>
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo escapeHTML($code); ?></textarea></td>
<?php $pure_html = $purifier->purify($code); ?>
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo htmlspecialchars($pure_html); ?></textarea></td>
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo escapeHTML($pure_html); ?></textarea></td>
<td><?php echo $pure_html ?></td>
</tr>
<?php