mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-02 12:21:09 +02:00
[2.1.5] [MFH] Fix Shift_JIS bug.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1793 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
7
NEWS
7
NEWS
@@ -24,6 +24,13 @@ ERRATA
|
|||||||
- Disable percent height/width attributes for img
|
- Disable percent height/width attributes for img
|
||||||
- Fix stray backslashes in font-family; CSS Unicode character escapes are
|
- Fix stray backslashes in font-family; CSS Unicode character escapes are
|
||||||
now properly resolved (although *only* in font-family).
|
now properly resolved (although *only* in font-family).
|
||||||
|
- Improve parseCDATA algorithm to take into account newline normalization
|
||||||
|
- Account for browser confusion between Yen character and backslash in
|
||||||
|
Shift_JIS encoding. This fix generalizes to any other encoding which is not
|
||||||
|
a strict superset of printable ASCII.
|
||||||
|
- Improved adherence to Unicode by checking for non-character codepoints.
|
||||||
|
Thanks Geoffrey Sneddon for reporting. This may result in degraded
|
||||||
|
performance for extremely large inputs.
|
||||||
. Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient
|
. Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient
|
||||||
handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses
|
handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses
|
||||||
this class.
|
this class.
|
||||||
|
@@ -22,7 +22,6 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
|||||||
// assume that no font names contain commas in them
|
// assume that no font names contain commas in them
|
||||||
$fonts = explode(',', $string);
|
$fonts = explode(',', $string);
|
||||||
$final = '';
|
$final = '';
|
||||||
$non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
|
|
||||||
foreach($fonts as $font) {
|
foreach($fonts as $font) {
|
||||||
$font = trim($font);
|
$font = trim($font);
|
||||||
if ($font === '') continue;
|
if ($font === '') continue;
|
||||||
@@ -53,8 +52,11 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
|||||||
if (!ctype_xdigit($font[$i])) break;
|
if (!ctype_xdigit($font[$i])) break;
|
||||||
$code .= $font[$i];
|
$code .= $font[$i];
|
||||||
}
|
}
|
||||||
|
// We have to be extremely careful when adding
|
||||||
|
// new characters, to make sure we're not breaking
|
||||||
|
// the encoding.
|
||||||
$char = HTMLPurifier_Encoder::unichr(hexdec($code));
|
$char = HTMLPurifier_Encoder::unichr(hexdec($code));
|
||||||
if (isset($non_sgml[$char])) continue;
|
if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
|
||||||
$new_font .= $char;
|
$new_font .= $char;
|
||||||
if ($i < $c && trim($font[$i]) !== '') $i--;
|
if ($i < $c && trim($font[$i]) !== '') $i--;
|
||||||
continue;
|
continue;
|
||||||
|
@@ -68,24 +68,6 @@ class HTMLPurifier_Encoder
|
|||||||
function muteErrorHandler() {}
|
function muteErrorHandler() {}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a lookup of UTF-8 character byte sequences that are non-SGML.
|
|
||||||
*/
|
|
||||||
function getNonSgmlCharacters() {
|
|
||||||
static $nonSgmlCharacters;
|
|
||||||
if (empty($nonSgmlCharacters)) {
|
|
||||||
for ($i = 0; $i <= 31; $i++) {
|
|
||||||
// non-SGML ASCII chars
|
|
||||||
// save \r, \t and \n
|
|
||||||
if ($i == 9 || $i == 13 || $i == 10) continue;
|
|
||||||
$nonSgmlCharacters[chr($i)] = '';
|
|
||||||
}
|
|
||||||
for ($i = 127; $i <= 159; $i++) {
|
|
||||||
$nonSgmlCharacters[HTMLPurifier_Encoder::unichr($i)] = '';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return $nonSgmlCharacters;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Cleans a UTF-8 string for well-formedness and SGML validity
|
* Cleans a UTF-8 string for well-formedness and SGML validity
|
||||||
*
|
*
|
||||||
@@ -114,24 +96,13 @@ class HTMLPurifier_Encoder
|
|||||||
*/
|
*/
|
||||||
function cleanUTF8($str, $force_php = false) {
|
function cleanUTF8($str, $force_php = false) {
|
||||||
|
|
||||||
$non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
|
|
||||||
|
|
||||||
static $iconv = null;
|
|
||||||
if ($iconv === null) $iconv = function_exists('iconv');
|
|
||||||
|
|
||||||
// UTF-8 validity is checked since PHP 4.3.5
|
// UTF-8 validity is checked since PHP 4.3.5
|
||||||
// This is an optimization: if the string is already valid UTF-8, no
|
// This is an optimization: if the string is already valid UTF-8, no
|
||||||
// need to do iconv/php stuff. 99% of the time, this will be the case.
|
// need to do PHP stuff. 99% of the time, this will be the case.
|
||||||
if (preg_match('/^.{1}/us', $str)) {
|
// The regexp matches the XML char production, as well as excluding
|
||||||
return strtr($str, $non_sgml);
|
// non-SGML codepoints U+007F to U+009F
|
||||||
}
|
if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
|
||||||
|
return $str;
|
||||||
if ($iconv && !$force_php) {
|
|
||||||
// do the shortcut way
|
|
||||||
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
|
|
||||||
$str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
|
|
||||||
restore_error_handler();
|
|
||||||
return strtr($str, $non_sgml);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
$mState = 0; // cached expected number of octets after the current octet
|
$mState = 0; // cached expected number of octets after the current octet
|
||||||
@@ -242,7 +213,17 @@ class HTMLPurifier_Encoder
|
|||||||
) {
|
) {
|
||||||
|
|
||||||
} elseif (0xFEFF != $mUcs4 && // omit BOM
|
} elseif (0xFEFF != $mUcs4 && // omit BOM
|
||||||
!($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
|
// check for valid Char unicode codepoints
|
||||||
|
(
|
||||||
|
0x9 == $mUcs4 ||
|
||||||
|
0xA == $mUcs4 ||
|
||||||
|
0xD == $mUcs4 ||
|
||||||
|
(0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
|
||||||
|
// 7F-9F is not strictly prohibited by XML,
|
||||||
|
// but it is non-SGML, and thus we don't allow it
|
||||||
|
(0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
|
||||||
|
(0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
|
||||||
|
)
|
||||||
) {
|
) {
|
||||||
$out .= $char;
|
$out .= $char;
|
||||||
}
|
}
|
||||||
|
@@ -27,6 +27,8 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
|
|||||||
$this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
|
$this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
|
||||||
$this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
|
$this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
|
||||||
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
|
$this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
|
||||||
|
// invalid codepoints
|
||||||
|
$this->assertCleanUTF8("\xED\xB0\x80", '');
|
||||||
}
|
}
|
||||||
|
|
||||||
function test_convertToUTF8_noConvert() {
|
function test_convertToUTF8_noConvert() {
|
||||||
|
Reference in New Issue
Block a user