Merge branch 'w13_MDL-32109_m23_fastpurify' of git://github.com/skodak/moodle

This commit is contained in:
Aparup Banerjee 2012-03-27 10:15:18 +08:00
commit 5eee9f7ade
3 changed files with 167 additions and 5 deletions

View File

@ -193,6 +193,63 @@ class htmlpurifier_test extends UnitTestCase {
$text = 'x<form></form>x';
$this->assertIdentical('xx', purify_html($text));
}
/**
* Test internal function used for clean_text() speedup.
*/
function test_is_purify_html_necessary() {
    // Shortcut cases: is_purify_html_necessary() must report false (HTMLPurifier
    // can be skipped), and purify_html() must hand the very same text back,
    // which shows that skipping the purifier does not change the output.

    // Empty string.
    $text = "";
    $this->assertFalse(is_purify_html_necessary($text));
    $this->assertIdentical($text, purify_html($text));

    // Plain integer.
    $text = "666";
    $this->assertFalse(is_purify_html_necessary($text));
    $this->assertIdentical($text, purify_html($text));

    // Plain text with a newline and quotes, but no HTML special characters.
    $text = "abc\ndef \" ' ";
    $this->assertFalse(is_purify_html_necessary($text));
    $this->assertIdentical($text, purify_html($text));

    // Attribute-free paragraphs mixed with plain text.
    $text = "abc\n<p>def</p>efg<p>hij</p>";
    $this->assertFalse(is_purify_html_necessary($text));
    $this->assertIdentical($text, purify_html($text));

    // Properly paired p, em and strong tags together with self-closed br tags.
    $text = "<br />abc\n<p>def<em>efg</em><strong>hi<br />j</strong></p>";
    $this->assertFalse(is_purify_html_necessary($text));
    $this->assertIdentical($text, purify_html($text));

    // Failure cases: anything below must be sent through HTMLPurifier.

    // Entities and bare special characters.
    $this->assertTrue(is_purify_html_necessary("&nbsp;"));
    $this->assertTrue(is_purify_html_necessary("Gin & Tonic"));
    $this->assertTrue(is_purify_html_necessary("Gin > Tonic"));
    $this->assertTrue(is_purify_html_necessary("Gin < Tonic"));

    // Tags other than p, em, strong and <br />.
    $this->assertTrue(is_purify_html_necessary("<div>abc</div>"));
    $this->assertTrue(is_purify_html_necessary("<span>abc</span>"));
    $this->assertTrue(is_purify_html_necessary("<br>abc"));

    // A tag carrying an attribute.
    $this->assertTrue(is_purify_html_necessary("<p class='xxx'>abc</p>"));

    // Wrongly nested and unclosed tags.
    $this->assertTrue(is_purify_html_necessary("<p>abc<em></p></em>"));
    $this->assertTrue(is_purify_html_necessary("<p>abc"));
}
}

View File

@ -189,5 +189,63 @@ class core_htmlpurifier_testcase extends basic_testcase {
$text = 'x<form></form>x';
$this->assertSame('xx', purify_html($text));
}
/**
* Test internal function used for clean_text() speedup.
* @return void
*/
function test_is_purify_html_necessary() {
    // Part 1 - inputs for which the purifier may be skipped. For each of them
    // purify_html() is also expected to return the input untouched.

    $html = "";
    $this->assertFalse(is_purify_html_necessary($html));
    $this->assertSame($html, purify_html($html));

    $html = "666";
    $this->assertFalse(is_purify_html_necessary($html));
    $this->assertSame($html, purify_html($html));

    $html = "abc\ndef \" ' ";
    $this->assertFalse(is_purify_html_necessary($html));
    $this->assertSame($html, purify_html($html));

    $html = "abc\n<p>def</p>efg<p>hij</p>";
    $this->assertFalse(is_purify_html_necessary($html));
    $this->assertSame($html, purify_html($html));

    $html = "<br />abc\n<p>def<em>efg</em><strong>hi<br />j</strong></p>";
    $this->assertFalse(is_purify_html_necessary($html));
    $this->assertSame($html, purify_html($html));

    // Part 2 - inputs that must always be handed over to HTMLPurifier.

    // Entities, ampersands and stray angle brackets.
    $this->assertTrue(is_purify_html_necessary("&nbsp;"));
    $this->assertTrue(is_purify_html_necessary("Gin & Tonic"));
    $this->assertTrue(is_purify_html_necessary("Gin > Tonic"));
    $this->assertTrue(is_purify_html_necessary("Gin < Tonic"));

    // Tags that are not on the short list of p, em, strong and <br />.
    $this->assertTrue(is_purify_html_necessary("<div>abc</div>"));
    $this->assertTrue(is_purify_html_necessary("<span>abc</span>"));
    $this->assertTrue(is_purify_html_necessary("<br>abc"));

    // Attributes, broken nesting and a missing closing tag.
    $this->assertTrue(is_purify_html_necessary("<p class='xxx'>abc</p>"));
    $this->assertTrue(is_purify_html_necessary("<p>abc<em></p></em>"));
    $this->assertTrue(is_purify_html_necessary("<p>abc"));
}
}

View File

@ -1450,9 +1450,7 @@ function trusttext_active() {
* @return string The cleaned up text
*/
function clean_text($text, $format = FORMAT_HTML, $options = array()) {
if (empty($text) or is_numeric($text)) {
return (string)$text;
}
$text = (string)$text;
if ($format != FORMAT_HTML and $format != FORMAT_HTML) {
// TODO: we need to standardise cleanup of text when loading it into editor first
@ -1463,7 +1461,9 @@ function clean_text($text, $format = FORMAT_HTML, $options = array()) {
return $text;
}
$text = purify_html($text, $options);
if (is_purify_html_necessary($text)) {
$text = purify_html($text, $options);
}
// Originally we tried to neutralise some script events here, it was a wrong approach because
// it was trivial to work around that (for example using style based XSS exploits).
@ -1473,6 +1473,53 @@ function clean_text($text, $format = FORMAT_HTML, $options = array()) {
return $text;
}
/**
 * Decide whether a text has to be passed through HTMLPurifier.
 *
 * The text is reported as not needing purification when it is empty, when it
 * is the canonical string form of an integer, when escaping it changes
 * nothing, or when escaping it and then restoring only paired p, em and
 * strong tags plus "<br />" reproduces the original text exactly.
 *
 * @private
 * @param string $text
 * @return bool false means html is safe and valid, true means use HTMLPurifier
 */
function is_purify_html_necessary($text) {
    // Nothing to clean in an empty string or in a plain integer.
    if ($text === '' || $text === (string)((int)$text)) {
        return false;
    }

    // Any ampersand means entities would need normalising.
    if (strpos($text, '&') !== false) {
        return true;
    }

    // A "<" followed by anything that cannot start p, em, strong, br or a
    // closing tag means some other tag (or a stray bracket) is present.
    if (preg_match('|<[^pesb/]|', $text)) {
        return true;
    }

    $escaped = htmlspecialchars($text, ENT_NOQUOTES, 'UTF-8', true);
    if ($escaped === $text) {
        // Escaping changed nothing, so there are no special chars at all.
        return false;
    }

    // Undo the escaping for the few tags considered safe, one kind at a time,
    // stopping as soon as the original text has been reproduced.
    $restorable = array(
        '|&lt;p&gt;(.*?)&lt;/p&gt;|m'                 => '<p>$1</p>',
        '|&lt;em&gt;([^<>]+?)&lt;/em&gt;|m'           => '<em>$1</em>',
        '|&lt;strong&gt;([^<>]+?)&lt;/strong&gt;|m'   => '<strong>$1</strong>',
    );
    foreach ($restorable as $pattern => $replacement) {
        $escaped = preg_replace($pattern, $replacement, $escaped);
        if ($escaped === $text) {
            return false;
        }
    }

    // Last chance: self-closed line breaks. Whatever still differs after this
    // step is left for HTMLPurifier to sort out.
    $escaped = str_replace('&lt;br /&gt;', '<br />', $escaped);

    return $escaped !== $text;
}
/**
* KSES replacement cleaning function - uses HTML Purifier.
*