diff --git a/NEWS b/NEWS index 4a494326..cac5c31c 100644 --- a/NEWS +++ b/NEWS @@ -16,6 +16,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier to allow these, and background-position IS NOT implemented yet. ! Configuration documentation looks nicer ! Added smoketest 'all.php', which loads all other smoketests via frames +! Added %Core.EscapeNonASCIICharacters to work around loss of Unicode + characters while %Core.Encoding is set to a non-UTF-8 encoding. . Implemented AttrDef_CSSURI for url(http://google.com) style declarations 1.3.3, unknown release date, likely to be dropped diff --git a/docs/index.html b/docs/index.html index e5d9d662..ea498147 100644 --- a/docs/index.html +++ b/docs/index.html @@ -31,6 +31,9 @@ information for casual developers using HTML Purifier.

Speeding up HTML Purifier
Explains how to speed up HTML Purifier through caching or inbound filtering.
+
UTF-8
+
Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.
+

Development

diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php index b818e199..1a22b452 100644 --- a/library/HTMLPurifier/Encoder.php +++ b/library/HTMLPurifier/Encoder.php @@ -6,15 +6,29 @@ HTMLPurifier_ConfigSchema::define( 'Core', 'Encoding', 'utf-8', 'istring', 'If for some reason you are unable to convert all webpages to UTF-8, '. 'you can use this directive as a stop-gap compatibility change to '. - 'let HTMLPurifier deal with non UTF-8 input. This technique has '. + 'let HTML Purifier deal with non UTF-8 input. This technique has '. 'notable deficiencies: absolutely no characters outside of the selected '. 'character encoding will be preserved, not even the ones that have '. 'been ampersand escaped (this is due to a UTF-8 specific feature '. 'that automatically resolves all entities), making it pretty useless '. - 'for anything except the most I18N-blind applications. This directive '. + 'for anything except the most I18N-blind applications, although '. + '%Core.EscapeNonASCIICharacters fixes this trouble with '. + 'another tradeoff. This directive '. 'only accepts ISO-8859-1 if iconv is not enabled.' ); +HTMLPurifier_ConfigSchema::define( + 'Core', 'EscapeNonASCIICharacters', false, 'bool', + 'This directive overcomes a deficiency in %Core.Encoding by blindly '. + 'converting all non-ASCII characters into decimal numeric entities before '. + 'converting it to its native encoding. This means that even '. + 'characters that can be expressed in the non-UTF-8 encoding will '. + 'be entity-ized, which can be a real downer for encodings like Big5. '. + 'It also assumes that the ASCII repertoire is available, although '. + 'this is the case for almost all encodings. Anyway, use UTF-8! This '. + 'directive has been available since 1.4.0.' 
+); + if ( !function_exists('iconv') ) { // only encodings with native PHP support HTMLPurifier_ConfigSchema::defineAllowedValues( @@ -310,6 +324,7 @@ class HTMLPurifier_Encoder } elseif ($encoding === 'iso-8859-1') { return @utf8_encode($str); } + trigger_error('Encoding not supported', E_USER_ERROR); } /** @@ -323,11 +338,63 @@ class HTMLPurifier_Encoder if ($iconv === null) $iconv = function_exists('iconv'); $encoding = $config->get('Core', 'Encoding'); if ($encoding === 'utf-8') return $str; + if ($config->get('Core', 'EscapeNonASCIICharacters')) { + $str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str); + } if ($iconv && !$config->get('Test', 'ForceNoIconv')) { return @iconv('utf-8', $encoding . '//IGNORE', $str); } elseif ($encoding === 'iso-8859-1') { return @utf8_decode($str); } + trigger_error('Encoding not supported', E_USER_ERROR); + } + + /** + * Lossless (character-wise) conversion of HTML to ASCII + * @static + * @param $str UTF-8 string to be converted to ASCII + * @returns ASCII encoded string with non-ASCII character entity-ized + * @warning Adapted from MediaWiki, claiming fair use: this is a common + * algorithm. If you disagree with this license fudgery, + * implement it yourself. + * @note Uses decimal numeric entities since they are best supported. + * @note This is a DUMB function: it has no concept of keeping + * character entities that the projected character encoding + * can allow. We could possibly implement a smart version + * but that would require it to also know which Unicode + * codepoints the charset supported (not an easy task). 
+ * @note Overlaps somewhat with cleanUTF8(), but it assumes that $str is + * well-formed UTF-8 + */ + function convertToASCIIDumbLossless($str) { + $bytesleft = 0; + $result = ''; + $working = 0; + $len = strlen($str); + for( $i = 0; $i < $len; $i++ ) { + $bytevalue = ord( $str[$i] ); + if( $bytevalue <= 0x7F ) { //0xxx xxxx + $result .= chr( $bytevalue ); + $bytesleft = 0; + } elseif( $bytevalue <= 0xBF ) { //10xx xxxx + $working = $working << 6; + $working += ($bytevalue & 0x3F); + $bytesleft--; + if( $bytesleft <= 0 ) { + $result .= "&#" . $working . ";"; + } + } elseif( $bytevalue <= 0xDF ) { //110x xxxx + $working = $bytevalue & 0x1F; + $bytesleft = 1; + } elseif( $bytevalue <= 0xEF ) { //1110 xxxx + $working = $bytevalue & 0x0F; + $bytesleft = 2; + } else { //1111 0xxx + $working = $bytevalue & 0x07; + $bytesleft = 3; + } + } + return $result; } diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php index b8437fb2..ef14b139 100644 --- a/tests/HTMLPurifier/EncoderTest.php +++ b/tests/HTMLPurifier/EncoderTest.php @@ -5,7 +5,7 @@ require_once 'HTMLPurifier/Encoder.php'; class HTMLPurifier_EncoderTest extends UnitTestCase { - var $Encoder; + var $_entity_lookup; function setUp() { $this->_entity_lookup = HTMLPurifier_EntityLookup::instance(); @@ -60,6 +60,9 @@ class HTMLPurifier_EncoderTest extends UnitTestCase $config = HTMLPurifier_Config::createDefault(); $context = new HTMLPurifier_Context(); + // zhong-wen + $chinese = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)"; + // UTF-8 means that we don't touch it $this->assertIdentical( HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context), @@ -74,13 +77,55 @@ class HTMLPurifier_EncoderTest extends UnitTestCase "\xF6" ); - $config->set('Test', 'ForceNoIconv', true); + if (function_exists('iconv')) { + // iconv has its own way + $this->assertIdentical( + HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context), + " (Chinese)" + ); + } + // Plain PHP implementation has slightly 
different behavior + $config->set('Test', 'ForceNoIconv', true); $this->assertIdentical( HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $config, $context), "\xF6" ); + $this->assertIdentical( + HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context), + "?? (Chinese)" + ); + + // Preserve the characters! + + $config->set('Core', 'EscapeNonASCIICharacters', true); + $this->assertIdentical( + HTMLPurifier_Encoder::convertFromUTF8($chinese, $config, $context), + "&#20013;&#25991; (Chinese)" + ); + + } + + function test_convertToASCIIDumbLossless() { + + // Uppercase thorn letter + $this->assertIdentical( + HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xC3\x9Eorn"), + "&#222;orn" + ); + + $this->assertIdentical( + HTMLPurifier_Encoder::convertToASCIIDumbLossless("an"), + "an" + ); + + // test up to four bytes + $this->assertIdentical( + HTMLPurifier_Encoder::convertToASCIIDumbLossless("\xF3\xA0\x80\xA0"), + "&#917536;" + ); + } } diff --git a/tests/index.php b/tests/index.php index 3b8b87b1..4511ac82 100644 --- a/tests/index.php +++ b/tests/index.php @@ -1,5 +1,8 @@ addTestClass(htmlpurifier_path2class($path)); } else { - $test = new GroupTest('HTML Purifier'); + $test = new GroupTest('All Tests - HTML Purifier'); foreach ($test_files as $test_file) { $path = 'HTMLPurifier/' . $test_file;