1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-07-12 10:16:18 +02:00

[2.1.5] [MFH] Fix Shift_JIS encoding wonkiness with yen symbols and whatnot, as well as other patches

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1791 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2008-06-11 18:49:56 +00:00
parent 369a69d533
commit 450fc6649d
9 changed files with 274 additions and 132 deletions

View File

@ -54,18 +54,15 @@ class HTMLPurifier_AttrDef
* *
* @warning This processing is inconsistent with XML's whitespace handling * @warning This processing is inconsistent with XML's whitespace handling
* as specified by section 3.3.3 and referenced XHTML 1.0 section * as specified by section 3.3.3 and referenced XHTML 1.0 section
* 4.7. Compliant processing requires all line breaks normalized * 4.7. However, note that we are NOT necessarily
* to "\n", so the fix is not as simple as fixing it in this * parsing XML, thus, this behavior may still be correct. We
* function. Trim and whitespace collapsing are supposed to only * assume that newlines have been normalized.
* occur in NMTOKENs. However, note that we are NOT necessarily
* parsing XML, thus, this behavior may still be correct.
* *
* @public * @public
*/ */
function parseCDATA($string) { function parseCDATA($string) {
$string = trim($string); $string = trim($string);
$string = str_replace("\n", '', $string); $string = str_replace(array("\n", "\t", "\r"), ' ', $string);
$string = str_replace(array("\r", "\t"), ' ', $string);
return $string; return $string;
} }

View File

@ -19,10 +19,10 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
'cursive' => true 'cursive' => true
); );
$string = $this->parseCDATA($string);
// assume that no font names contain commas in them // assume that no font names contain commas in them
$fonts = explode(',', $string); $fonts = explode(',', $string);
$final = ''; $final = '';
$non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
foreach($fonts as $font) { foreach($fonts as $font) {
$font = trim($font); $font = trim($font);
if ($font === '') continue; if ($font === '') continue;
@ -38,11 +38,33 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
$quote = $font[0]; $quote = $font[0];
if ($font[$length - 1] !== $quote) continue; if ($font[$length - 1] !== $quote) continue;
$font = substr($font, 1, $length - 2); $font = substr($font, 1, $length - 2);
// double-backslash processing is buggy. Namely, it doesn't allow
// fonts that contain an adjacent quote, backslash, or comma $new_font = '';
$font = str_replace("\\$quote", $quote, $font); // de-escape quote for ($i = 0, $c = strlen($font); $i < $c; $i++) {
$font = str_replace("\\\n", '', $font); // de-escape newlines if ($font[$i] === '\\') {
$font = str_replace("\\\\", "\\", $font); // de-escape double backslashes $i++;
if ($i >= $c) {
$new_font .= '\\';
break;
}
if (ctype_xdigit($font[$i])) {
$code = $font[$i];
for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
if (!ctype_xdigit($font[$i])) break;
$code .= $font[$i];
}
$char = HTMLPurifier_Encoder::unichr(hexdec($code));
if (isset($non_sgml[$char])) continue;
$new_font .= $char;
if ($i < $c && trim($font[$i]) !== '') $i--;
continue;
}
if ($font[$i] === "\n") continue;
}
$new_font .= $font[$i];
}
$font = $new_font;
} }
// $font is a pure representation of the font name // $font is a pure representation of the font name

View File

@ -67,6 +67,25 @@ class HTMLPurifier_Encoder
*/ */
function muteErrorHandler() {} function muteErrorHandler() {}
/**
 * Returns a lookup of UTF-8 character byte sequences that are non-SGML.
 *
 * Keys are the byte sequences of the control characters 0-31 (minus tab,
 * line feed and carriage return) and of code points 127-159; every value
 * is the empty string, so the table can be handed straight to strtr() to
 * delete those characters. The table is built on the first call and kept
 * in a static variable for subsequent calls.
 */
function getNonSgmlCharacters() {
    static $nonSgmlCharacters;
    if (!empty($nonSgmlCharacters)) {
        // already built by an earlier call
        return $nonSgmlCharacters;
    }
    // single-byte control characters; \t (9), \n (10) and \r (13) are kept
    for ($code = 0; $code <= 31; $code++) {
        if ($code != 9 && $code != 10 && $code != 13) {
            $nonSgmlCharacters[chr($code)] = '';
        }
    }
    // DEL through 159; unichr() supplies the UTF-8 byte sequence
    $code = 127;
    while ($code <= 159) {
        $nonSgmlCharacters[HTMLPurifier_Encoder::unichr($code)] = '';
        $code++;
    }
    return $nonSgmlCharacters;
}
/** /**
* Cleans a UTF-8 string for well-formedness and SGML validity * Cleans a UTF-8 string for well-formedness and SGML validity
* *
@ -95,18 +114,7 @@ class HTMLPurifier_Encoder
*/ */
function cleanUTF8($str, $force_php = false) { function cleanUTF8($str, $force_php = false) {
static $non_sgml_chars = array(); $non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
if (empty($non_sgml_chars)) {
for ($i = 0; $i <= 31; $i++) {
// non-SGML ASCII chars
// save \r, \t and \n
if ($i == 9 || $i == 13 || $i == 10) continue;
$non_sgml_chars[chr($i)] = '';
}
for ($i = 127; $i <= 159; $i++) {
$non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
}
}
static $iconv = null; static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv'); if ($iconv === null) $iconv = function_exists('iconv');
@ -115,7 +123,7 @@ class HTMLPurifier_Encoder
// This is an optimization: if the string is already valid UTF-8, no // This is an optimization: if the string is already valid UTF-8, no
// need to do iconv/php stuff. 99% of the time, this will be the case. // need to do iconv/php stuff. 99% of the time, this will be the case.
if (preg_match('/^.{1}/us', $str)) { if (preg_match('/^.{1}/us', $str)) {
return strtr($str, $non_sgml_chars); return strtr($str, $non_sgml);
} }
if ($iconv && !$force_php) { if ($iconv && !$force_php) {
@ -123,7 +131,7 @@ class HTMLPurifier_Encoder
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
$str = iconv('UTF-8', 'UTF-8//IGNORE', $str); $str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
restore_error_handler(); restore_error_handler();
return strtr($str, $non_sgml_chars); return strtr($str, $non_sgml);
} }
$mState = 0; // cached expected number of octets after the current octet $mState = 0; // cached expected number of octets after the current octet
@ -327,14 +335,23 @@ class HTMLPurifier_Encoder
* @static * @static
*/ */
function convertToUTF8($str, $config, &$context) { function convertToUTF8($str, $config, &$context) {
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
$encoding = $config->get('Core', 'Encoding'); $encoding = $config->get('Core', 'Encoding');
if ($encoding === 'utf-8') return $str; if ($encoding === 'utf-8') return $str;
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if ($iconv && !$config->get('Test', 'ForceNoIconv')) { if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
return @iconv($encoding, 'utf-8//IGNORE', $str); $str = iconv($encoding, 'utf-8//IGNORE', $str);
// If the string is bjorked by Shift_JIS or a similar encoding
// that doesn't support all of ASCII, convert the naughty
// characters to their true byte-wise ASCII/UTF-8 equivalents.
$str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
restore_error_handler();
return $str;
} elseif ($encoding === 'iso-8859-1') { } elseif ($encoding === 'iso-8859-1') {
return @utf8_encode($str); $str = utf8_encode($str);
restore_error_handler();
return $str;
} }
trigger_error('Encoding not supported', E_USER_ERROR); trigger_error('Encoding not supported', E_USER_ERROR);
} }
@ -346,17 +363,31 @@ class HTMLPurifier_Encoder
* characters being omitted. * characters being omitted.
*/ */
function convertFromUTF8($str, $config, &$context) { function convertFromUTF8($str, $config, &$context) {
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
$encoding = $config->get('Core', 'Encoding'); $encoding = $config->get('Core', 'Encoding');
if ($encoding === 'utf-8') return $str; if ($encoding === 'utf-8') return $str;
if ($config->get('Core', 'EscapeNonASCIICharacters')) { static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str); $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
} }
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if ($iconv && !$config->get('Test', 'ForceNoIconv')) { if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
return @iconv('utf-8', $encoding . '//IGNORE', $str); // Undo our previous fix in convertToUTF8, otherwise iconv will barf
$ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
if (!$escape && !empty($ascii_fix)) {
$clear_fix = array();
foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
$str = strtr($str, $clear_fix);
}
$str = strtr($str, array_flip($ascii_fix));
// Normal stuff
$str = iconv('utf-8', $encoding . '//IGNORE', $str);
restore_error_handler();
return $str;
} elseif ($encoding === 'iso-8859-1') { } elseif ($encoding === 'iso-8859-1') {
return @utf8_decode($str); $str = utf8_decode($str);
restore_error_handler();
return $str;
} }
trigger_error('Encoding not supported', E_USER_ERROR); trigger_error('Encoding not supported', E_USER_ERROR);
} }
@ -409,6 +440,47 @@ class HTMLPurifier_Encoder
return $result; return $result;
} }
/**
 * This expensive function tests whether or not a given character
 * encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
 * fail this test, and require special processing. Variable width
 * encodings shouldn't ever fail.
 *
 * A few encodings are answered from precompiled tables, and the result
 * of a live probe is cached per encoding name; both shortcuts are
 * skipped when $bypass is true.
 *
 * @param string $encoding Encoding name to test, as per iconv format
 * @param bool $bypass Whether or not to bypass the precompiled arrays
 *             (and the cache), forcing a live probe through iconv.
 * @return Array of UTF-8 characters to their corresponding ASCII,
 *         which can be used to "undo" any overzealous iconv action.
 *         The array is empty when nothing needs undoing. Returns
 *         false when iconv cannot convert to $encoding at all.
 */
function testEncodingSupportsASCII($encoding, $bypass = false) {
    static $encodings = array();
    if (!$bypass) {
        if (isset($encodings[$encoding])) return $encodings[$encoding];
        $lenc = strtolower($encoding);
        switch ($lenc) {
            case 'shift_jis':
                return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
            case 'johab':
                return array("\xE2\x82\xA9" => '\\');
        }
        if (strpos($lenc, 'iso-8859-') === 0) return array();
    }
    $ret = array();
    set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
    if (iconv('UTF-8', $encoding, 'a') === false) {
        // Unknown/unsupported encoding: put the caller's error handler
        // back before bailing out, otherwise errors stay muted.
        restore_error_handler();
        return false;
    }
    for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
        $c = chr($i);
        if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
            // Reverse engineer: what's the UTF-8 equiv of this byte
            // sequence? This assumes that there's no variable width
            // encoding that doesn't support ASCII.
            $utf8 = iconv($encoding, 'UTF-8//IGNORE', $c);
            // A failed or empty reverse conversion would become the
            // array key 0 or '', which is useless (and harmful) as a
            // strtr() replacement pair, so leave it out.
            if ($utf8 === false || $utf8 === '') continue;
            $ret[$utf8] = $c;
        }
    }
    restore_error_handler();
    $encodings[$encoding] = $ret;
    return $ret;
}
} }

View File

@ -21,7 +21,20 @@ class HTMLPurifier_AttrDef_CSS_FontFamilyTest extends HTMLPurifier_AttrDefHarnes
$this->assertDef($d = "'\xE5\xAE\x8B\xE4\xBD\x93'"); $this->assertDef($d = "'\xE5\xAE\x8B\xE4\xBD\x93'");
$this->assertDef("\xE5\xAE\x8B\xE4\xBD\x93", $d); $this->assertDef("\xE5\xAE\x8B\xE4\xBD\x93", $d);
$this->assertDef("'\\','f'", "'\\\\', f"); $this->assertDef("'\\','f'", "'\\\\', f");
$this->assertDef("'\\01'", "''");
$this->assertDef("'\\20'", "' '");
$this->assertDef("\\0020", "'\\\\0020'");
$this->assertDef("'\\000045'", "E");
$this->assertDef("','", false);
$this->assertDef("',' foobar','", "' foobar'");
$this->assertDef("'\\27'", "'\''");
$this->assertDef('"\\22"', "'\"'");
$this->assertDef('"\\""', "'\"'");
$this->assertDef('"\'"', "'\\''");
$this->assertDef("'\\000045a'", "Ea");
$this->assertDef("'\\00045 a'", "Ea");
$this->assertDef("'\\00045 a'", "'E a'");
$this->assertDef("'\\\nf'", "f");
} }
} }

View File

@ -11,7 +11,7 @@ class HTMLPurifier_AttrDef_TextTest extends HTMLPurifier_AttrDefHarness
$this->def = new HTMLPurifier_AttrDef_Text(); $this->def = new HTMLPurifier_AttrDef_Text();
$this->assertDef('This is spiffy text!'); $this->assertDef('This is spiffy text!');
$this->assertDef(" Casual\tCDATA parse\ncheck. ", 'Casual CDATA parsecheck.'); $this->assertDef(" Casual\tCDATA parse\ncheck. ", 'Casual CDATA parse check.');
} }

View File

@ -12,8 +12,7 @@ class HTMLPurifier_AttrDefTest extends HTMLPurifier_Harness
$this->assertIdentical('', $def->parseCDATA('')); $this->assertIdentical('', $def->parseCDATA(''));
$this->assertIdentical('', $def->parseCDATA("\t\n\r \t\t")); $this->assertIdentical('', $def->parseCDATA("\t\n\r \t\t"));
$this->assertIdentical('foo', $def->parseCDATA("\t\n\r foo\t\t")); $this->assertIdentical('foo', $def->parseCDATA("\t\n\r foo\t\t"));
$this->assertIdentical('ignorelinefeeds', $def->parseCDATA("ignore\nline\nfeeds")); $this->assertIdentical('translate to space', $def->parseCDATA("translate\nto\tspace"));
$this->assertIdentical('translate to space', $def->parseCDATA("translate\rto\tspace"));
} }

View File

@ -9,6 +9,7 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
function setUp() { function setUp() {
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
parent::setUp();
} }
function assertCleanUTF8($string, $expect = null) { function assertCleanUTF8($string, $expect = null) {
@ -28,91 +29,86 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8 $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
} }
function test_convertToUTF8() { function test_convertToUTF8_noConvert() {
$config = HTMLPurifier_Config::createDefault();
$context = new HTMLPurifier_Context();
// UTF-8 means that we don't touch it // UTF-8 means that we don't touch it
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context), HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
"\xF6" // this is invalid "\xF6" // this is invalid
); );
$this->assertNoErrors(); }
$config = HTMLPurifier_Config::create(array( function test_convertToUTF8_iso8859_1() {
'Core.Encoding' => 'ISO-8859-1' $this->config->set('Core', 'Encoding', 'ISO-8859-1');
));
// Now it gets converted
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context), HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
"\xC3\xB6" "\xC3\xB6"
); );
}
$config = HTMLPurifier_Config::create(array(
'Core.Encoding' => 'ISO-8859-1', function test_convertToUTF8_withoutIconv() {
'Test.ForceNoIconv' => true $this->config->set('Core', 'Encoding', 'ISO-8859-1');
)); $this->config->set('Test', 'ForceNoIconv', true);
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertToUTF8("\xF6", $config, $context), HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context),
"\xC3\xB6" "\xC3\xB6"
); );
} }
function test_convertFromUTF8() { function getZhongWen() {
$config = HTMLPurifier_Config::createDefault(); return "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
$context = new HTMLPurifier_Context(); }
// zhong-wen function test_convertFromUTF8_utf8() {
$chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
// UTF-8 means that we don't touch it // UTF-8 means that we don't touch it
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context), HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
"\xC3\xB6" "\xC3\xB6"
); );
}
$config = HTMLPurifier_Config::create(array(
'Core.Encoding' => 'ISO-8859-1' function test_convertFromUTF8_iso8859_1() {
)); $this->config->set('Core', 'Encoding', 'ISO-8859-1');
// Now it gets converted
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context), HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
"\xF6" "\xF6"
); );
}
if (function_exists('iconv')) {
// iconv has it's own way function test_convertFromUTF8_iconvNoChars() {
$this->assertIdentical( if (!function_exists('iconv')) return;
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context), $this->config->set('Core', 'Encoding', 'ISO-8859-1');
" (Chinese)" $this->assertIdentical(
); HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
} " (Chinese)"
);
}
function test_convertFromUTF8_phpNormal() {
// Plain PHP implementation has slightly different behavior // Plain PHP implementation has slightly different behavior
$config = HTMLPurifier_Config::create(array( $this->config->set('Core', 'Encoding', 'ISO-8859-1');
'Core.Encoding' => 'ISO-8859-1', $this->config->set('Test', 'ForceNoIconv', true);
'Test.ForceNoIconv' => true
));
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context), HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context),
"\xF6" "\xF6"
); );
}
function test_convertFromUTF8_phpNoChars() {
$this->config->set('Core', 'Encoding', 'ISO-8859-1');
$this->config->set('Test', 'ForceNoIconv', true);
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context), HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
"?? (Chinese)" "?? (Chinese)"
); );
}
function test_convertFromUTF8_withProtection() {
// Preserve the characters! // Preserve the characters!
$config = HTMLPurifier_Config::create(array( $this->config->set('Core', 'Encoding', 'ISO-8859-1');
'Core.Encoding' => 'ISO-8859-1', $this->config->set('Core', 'EscapeNonASCIICharacters', true);
'Core.EscapeNonASCIICharacters' => true
));
$this->assertIdentical( $this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context), HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context),
"&#20013;&#25991; (Chinese)" "&#20013;&#25991; (Chinese)"
); );
@ -139,5 +135,39 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
} }
/**
 * Checks that testEncodingSupportsASCII() yields $ret for $enc, both via
 * the default path (precompiled tables / cache) and via a forced probe.
 * When the initial forced probe returns false the check is skipped.
 */
function assertASCIISupportCheck($enc, $ret) {
    // Forced probe first: false means there is nothing to compare against
    if (HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true) === false) {
        return;
    }
    $viaDefault = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc);
    $this->assertIdentical($viaDefault, $ret);
    $viaProbe = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
    $this->assertIdentical($viaProbe, $ret);
}
/**
 * Runs the ASCII-support check over a table of encoding names and the
 * lookup each one is expected to produce.
 */
function test_testEncodingSupportsASCII() {
    $expected = array(
        'Shift_JIS'  => array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~'),
        'JOHAB'      => array("\xE2\x82\xA9" => '\\'),
        'ISO-8859-1' => array(),
        'dontexist'  => array(), // canary
    );
    foreach ($expected as $encoding => $lookup) {
        $this->assertASCIISupportCheck($encoding, $lookup);
    }
}
/**
 * With Core.Encoding set to Shift_JIS, a backslash and a tilde must pass
 * through both conversion directions unchanged. Skipped without iconv.
 */
function testShiftJIS() {
    if (!function_exists('iconv')) {
        return;
    }
    $this->config->set('Core', 'Encoding', 'Shift_JIS');
    // This actually looks like a Yen, but we're going to treat it differently
    $fromUtf8 = HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context);
    $this->assertIdentical($fromUtf8, '\\~');
    $toUtf8 = HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context);
    $this->assertIdentical($toUtf8, '\\~');
}
} }

View File

@ -12,13 +12,24 @@ class HTMLPurifier_Harness extends UnitTestCase
parent::UnitTestCase(); parent::UnitTestCase();
} }
var $config, $context; var $config, $context, $purifier;
/** /**
* Generates easily accessible default config/context * Generates easily accessible default config/context, as well as
* a convenience purifier for integration testing.
*/ */
function setUp() { function setUp() {
list($this->config, $this->context) = $this->createCommon(); list($this->config, $this->context) = $this->createCommon();
$this->purifier = new HTMLPurifier();
}
/**
* Asserts a purification. Good for integration testing.
*/
function assertPurification($input, $expect = null) {
if ($expect === null) $expect = $input;
$result = $this->purifier->purify($input, $this->config);
$this->assertIdentical($expect, $result);
} }
/** /**

View File

@ -2,30 +2,15 @@
require_once 'HTMLPurifier.php'; require_once 'HTMLPurifier.php';
// integration test
class HTMLPurifierTest extends HTMLPurifier_Harness class HTMLPurifierTest extends HTMLPurifier_Harness
{ {
var $purifier;
function setUp() {
$this->purifier = new HTMLPurifier();
}
function assertPurification($input, $expect = null, $config = array()) {
if ($expect === null) $expect = $input;
$result = $this->purifier->purify($input, $config);
$this->assertIdentical($expect, $result);
}
function testNull() { function testNull() {
$this->assertPurification("Null byte\0", "Null byte"); $this->assertPurification("Null byte\0", "Null byte");
} }
function testStrict() { function testStrict() {
$config = HTMLPurifier_Config::createDefault(); $this->config->set('HTML', 'Strict', true);
$config->set('HTML', 'Strict', true);
$this->purifier = new HTMLPurifier( $config ); // verbose syntax
$this->assertPurification( $this->assertPurification(
'<u>Illegal underline</u>', '<u>Illegal underline</u>',
@ -41,10 +26,8 @@ class HTMLPurifierTest extends HTMLPurifier_Harness
function testDifferentAllowedElements() { function testDifferentAllowedElements() {
$this->purifier = new HTMLPurifier(array( $this->config->set('HTML', 'AllowedElements', array('b', 'i', 'p', 'a'));
'HTML.AllowedElements' => array('b', 'i', 'p', 'a'), $this->config->set('HTML', 'AllowedAttributes', array('a.href', '*.id'));
'HTML.AllowedAttributes' => array('a.href', '*.id')
));
$this->assertPurification( $this->assertPurification(
'<p>Par.</p><p>Para<a href="http://google.com/">gr</a>aph</p>Text<b>Bol<i>d</i></b>' '<p>Par.</p><p>Para<a href="http://google.com/">gr</a>aph</p>Text<b>Bol<i>d</i></b>'
@ -59,7 +42,7 @@ class HTMLPurifierTest extends HTMLPurifier_Harness
function testDisableURI() { function testDisableURI() {
$this->purifier = new HTMLPurifier( array('Attr.DisableURI' => true) ); $this->config->set('URI', 'Disable', true);
$this->assertPurification( $this->assertPurification(
'<img src="foobar"/>', '<img src="foobar"/>',
@ -70,8 +53,6 @@ class HTMLPurifierTest extends HTMLPurifier_Harness
function test_purifyArray() { function test_purifyArray() {
$this->purifier = new HTMLPurifier();
$this->assertIdentical( $this->assertIdentical(
$this->purifier->purifyArray( $this->purifier->purifyArray(
array('Good', '<b>Sketchy', 'foo' => '<script>bad</script>') array('Good', '<b>Sketchy', 'foo' => '<script>bad</script>')
@ -83,23 +64,24 @@ class HTMLPurifierTest extends HTMLPurifier_Harness
} }
function testEnableAttrID() { function testAttrIDDisabledByDefault() {
$this->purifier = new HTMLPurifier();
$this->assertPurification( $this->assertPurification(
'<span id="moon">foobar</span>', '<span id="moon">foobar</span>',
'<span>foobar</span>' '<span>foobar</span>'
); );
$this->purifier = new HTMLPurifier(array('HTML.EnableAttrID' => true)); }
function testEnableAttrID() {
$this->config->set('Attr', 'EnableID', true);
$this->assertPurification('<span id="moon">foobar</span>'); $this->assertPurification('<span id="moon">foobar</span>');
$this->assertPurification('<img id="folly" src="folly.png" alt="Omigosh!" />'); $this->assertPurification('<img id="folly" src="folly.png" alt="Omigosh!" />');
} }
function testScript() { function testScript() {
$this->purifier = new HTMLPurifier(array('HTML.Trusted' => true)); $this->config->set('HTML', 'Trusted', true);
$ideal = '<script type="text/javascript"><!--//--><![CDATA[//><!-- $ideal = '<script type="text/javascript"><!--//--><![CDATA[//><!--
alert("<This is compatible with XHTML>"); alert("<This is compatible with XHTML>");
//--><!]]></script>'; //--><!]]></script>';
@ -140,13 +122,29 @@ alert("<This is compatible with XHTML>");
} }
function testMakeAbsolute() { function testMakeAbsolute() {
$this->config->set('URI', 'Base', 'http://example.com/bar/baz.php');
$this->config->set('URI', 'MakeAbsolute', true);
$this->assertPurification( $this->assertPurification(
'<a href="foo.txt">Foobar</a>', '<a href="foo.txt">Foobar</a>',
'<a href="http://example.com/bar/foo.txt">Foobar</a>', '<a href="http://example.com/bar/foo.txt">Foobar</a>'
array( );
'URI.Base' => 'http://example.com/bar/baz.php', }
'URI.MakeAbsolute' => true
) function test_shiftJis() {
if (!function_exists('iconv')) return;
$this->config->set('Core', 'Encoding', 'Shift_JIS');
$this->config->set('Core', 'EscapeNonASCIICharacters', true);
$this->assertPurification(
"<b style=\"font-family:'&#165;';\">111</b>"
);
}
function test_shiftJisWorstCase() {
if (!function_exists('iconv')) return;
$this->config->set('Core', 'Encoding', 'Shift_JIS');
$this->assertPurification( // Notice how Yen disappears
"<b style=\"font-family:'&#165;';\">111</b>",
"<b style=\"font-family:'';\">111</b>"
); );
} }