From 0dd866cc15ca8123ead19e2bf565a8f532463e22 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang"
Date: Fri, 19 Jan 2007 03:54:55 +0000
Subject: [PATCH] [1.4.0] - Added %Core.EscapeNonASCIICharacters to workaround
%Core.Encoding misbehavior - Add "All Tests" to test runner title and reorder
subfile names - Specific file is now called with ?f= - Link to UTF-8 docs,
even though they're not done - 1000th unit test passed! W00t! (that's a third
as many as SimpleTest has for itself.)
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a
---
NEWS | 2 +
docs/index.html | 3 ++
library/HTMLPurifier/Encoder.php | 71 +++++++++++++++++++++++++++++-
tests/HTMLPurifier/EncoderTest.php | 49 ++++++++++++++++++++-
tests/index.php | 11 +++--
5 files changed, 128 insertions(+), 8 deletions(-)
diff --git a/NEWS b/NEWS
index 4a494326..cac5c31c 100644
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
to allow these, and background-position IS NOT implemented yet.
! Configuration documentation looks nicer
! Added smoketest 'all.php', which loads all other smoketests via frames
+! Added %Core.EscapeNonASCIICharacters to workaround loss of Unicode
+ characters while %Core.Encoding is set to a non-UTF-8 encoding.
. Implemented AttrDef_CSSURI for url(http://google.com) style declarations
1.3.3, unknown release date, likely to be dropped
diff --git a/docs/index.html b/docs/index.html
index e5d9d662..ea498147 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -31,6 +31,9 @@ information for casual developers using HTML Purifier.
Speeding up HTML Purifier
Explains how to speed up HTML Purifier through caching or inbound filtering.
+UTF-8
+Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.
+
Development
diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php
index b818e199..1a22b452 100644
--- a/library/HTMLPurifier/Encoder.php
+++ b/library/HTMLPurifier/Encoder.php
@@ -6,15 +6,29 @@ HTMLPurifier_ConfigSchema::define(
'Core', 'Encoding', 'utf-8', 'istring',
'If for some reason you are unable to convert all webpages to UTF-8, '.
'you can use this directive as a stop-gap compatibility change to '.
- 'let HTMLPurifier deal with non UTF-8 input. This technique has '.
+ 'let HTML Purifier deal with non UTF-8 input. This technique has '.
'notable deficiencies: absolutely no characters outside of the selected '.
'character encoding will be preserved, not even the ones that have '.
'been ampersand escaped (this is due to a UTF-8 specific feature '.
'that automatically resolves all entities), making it pretty useless '.
- 'for anything except the most I18N-blind applications. This directive '.
+ 'for anything except the most I18N-blind applications, although '.
+ '%Core.EscapeNonASCIICharacters offers fixes this trouble with '.
+ 'another tradeoff. This directive '.
'only accepts ISO-8859-1 if iconv is not enabled.'
);
+// Registers the boolean directive %Core.EscapeNonASCIICharacters (default: false).
+// convertFromUTF8() consults it only when %Core.Encoding is not 'utf-8'.
+HTMLPurifier_ConfigSchema::define(
+ 'Core', 'EscapeNonASCIICharacters', false, 'bool',
+ 'This directive overcomes a deficiency in %Core.Encoding by blindly '.
+ 'converting all non-ASCII characters into decimal numeric entities before '.
+ 'converting it to its native encoding. This means that even '.
+ 'characters that can be expressed in the non-UTF-8 encoding will '.
+ 'be entity-ized, which can be a real downer for encodings like Big5. '.
+ 'It also assumes that the ASCII repetoire is available, although '.
+ 'this is the case for almost all encodings. Anyway, use UTF-8! This '.
+ 'directive has been available since 1.4.0.'
+);
+
if ( !function_exists('iconv') ) {
// only encodings with native PHP support
HTMLPurifier_ConfigSchema::defineAllowedValues(
@@ -310,6 +324,7 @@ class HTMLPurifier_Encoder
} elseif ($encoding === 'iso-8859-1') {
return @utf8_encode($str);
}
+ trigger_error('Encoding not supported', E_USER_ERROR);
}
/**
@@ -323,11 +338,63 @@ class HTMLPurifier_Encoder
if ($iconv === null) $iconv = function_exists('iconv');
$encoding = $config->get('Core', 'Encoding');
if ($encoding === 'utf-8') return $str;
+ if ($config->get('Core', 'EscapeNonASCIICharacters')) {
+ $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
+ }
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
return @iconv('utf-8', $encoding . '//IGNORE', $str);
} elseif ($encoding === 'iso-8859-1') {
return @utf8_decode($str);
}
+ trigger_error('Encoding not supported', E_USER_ERROR);
+ }
+
+    /**
+     * Lossless (character-wise) conversion of UTF-8 text to ASCII: every
+     * non-ASCII character is replaced by its decimal numeric character
+     * reference (e.g. U+00DE becomes "&#222;"); ASCII bytes pass through.
+     * @static
+     * @param $str UTF-8 string to be converted to ASCII
+     * @returns ASCII encoded string with non-ASCII character entity-ized
+     * @warning Adapted from MediaWiki, claiming fair use: this is a common
+     *          algorithm. If you disagree with this license fudgery,
+     *          implement it yourself.
+     * @note Uses decimal numeric entities since they are best supported.
+     * @note This is a DUMB function: it has no concept of keeping
+     *       character entities that the projected character encoding
+     *       can allow. We could possibly implement a smart version
+     *       but that would require it to also know which Unicode
+     *       codepoints the charset supported (not an easy task).
+     * @note Similar to cleanUTF8(), but it assumes that $str is
+     *       well-formed UTF-8; malformed input is not validated here.
+     */
+    function convertToASCIIDumbLossless($str) {
+        $bytesleft = 0; // continuation bytes still expected for the current character
+        $result = '';
+        $working = 0;   // codepoint being assembled from a multi-byte sequence
+        $len = strlen($str);
+        for( $i = 0; $i < $len; $i++ ) {
+            $bytevalue = ord( $str[$i] );
+            if( $bytevalue <= 0x7F ) { //0xxx xxxx
+                // plain ASCII: copy through untouched
+                $result .= chr( $bytevalue );
+                $bytesleft = 0;
+            } elseif( $bytevalue <= 0xBF ) { //10xx xxxx
+                // continuation byte: append its six payload bits
+                $working = $working << 6;
+                $working += ($bytevalue & 0x3F);
+                $bytesleft--;
+                if( $bytesleft <= 0 ) {
+                    // sequence complete: emit a decimal numeric entity
+                    $result .= "&#" . $working . ";";
+                }
+            } elseif( $bytevalue <= 0xDF ) { //110x xxxx
+                // lead byte of a two-byte sequence
+                $working = $bytevalue & 0x1F;
+                $bytesleft = 1;
+            } elseif( $bytevalue <= 0xEF ) { //1110 xxxx
+                // lead byte of a three-byte sequence
+                $working = $bytevalue & 0x0F;
+                $bytesleft = 2;
+            } else { //1111 0xxx
+                // lead byte of a four-byte sequence
+                $working = $bytevalue & 0x07;
+                $bytesleft = 3;
+            }
+        }
+        return $result;
     }
diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php
index b8437fb2..ef14b139 100644
--- a/tests/HTMLPurifier/EncoderTest.php
+++ b/tests/HTMLPurifier/EncoderTest.php
@@ -5,7 +5,7 @@ require_once 'HTMLPurifier/Encoder.php';
class HTMLPurifier_EncoderTest extends UnitTestCase
{
- var $Encoder;
+ var $_entity_lookup;
function setUp() {
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
@@ -60,6 +60,9 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
$config = HTMLPurifier_Config::createDefault();
$context = new HTMLPurifier_Context();
+ // zhong-wen
+ $chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";
+
// UTF-8 means that we don't touch it
$this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
@@ -74,13 +77,55 @@ class HTMLPurifier_EncoderTest extends UnitTestCase
"\xF6"
);
- $config->set('Test', 'ForceNoIconv', true);
+ if (function_exists('iconv')) {
+ // iconv has its own way
+ $this->assertIdentical(
+ HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
+ " (Chinese)"
+ );
+ }
+ // Plain PHP implementation has slightly different behavior
+ $config->set('Test', 'ForceNoIconv', true);
$this->assertIdentical(
HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context),
"\xF6"
);
+ $this->assertIdentical(
+ HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
+ "?? (Chinese)"
+ );
+
+ // Preserve the characters!
+
+ $config->set('Core', 'EscapeNonASCIICharacters', true);
+ $this->assertIdentical(
+ HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context),
+ "&#20013;&#25991; (Chinese)"
+ );
+
+ }
+
+    /**
+     * Checks that convertToASCIIDumbLossless() leaves ASCII untouched and
+     * turns multi-byte UTF-8 sequences into decimal numeric entities.
+     */
+    function test_convertToASCIIDumbLossless() {
+        
+        // Uppercase thorn letter (U+00DE, decimal 222), two-byte sequence
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"),
+            "&#222;orn"
+        );
+        
+        // pure ASCII input must come back unchanged
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"),
+            "an"
+        );
+        
+        // test up to four bytes (U+E0020, decimal 917536)
+        $this->assertIdentical(
+            HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"),
+            "&#917536;"
+        );
+        
     }
}
diff --git a/tests/index.php b/tests/index.php
index 3b8b87b1..4511ac82 100644
--- a/tests/index.php
+++ b/tests/index.php
@@ -1,5 +1,8 @@
addTestClass(htmlpurifier_path2class($path));
} else {
- $test = new GroupTest('HTML Purifier');
+ $test = new GroupTest('All Tests - HTML Purifier');
foreach ($test_files as $test_file) {
$path = 'HTMLPurifier/' . $test_file;