diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php
index e8fbf1a2..ae012ed1 100644
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -239,9 +239,15 @@ class HTMLPurifier_Lexer
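/**
- * Takes a string of HTML (fragment or document) and returns the content
+ * Takes a string of HTML (fragment or document) and returns the content
+ * of its body element, or the string unchanged when no body element is
+ * matched. When $return_bool is set, the match result is returned instead.
*/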
- function extractBody($html) {
- if (strpos($html, ']*>(.+?)!is', $html, $matches);
+ if ($return_bool) return $result;
+ if ($result) {
+ return $matches[1];
+ } else {
+ return $html;
+ }
}
}
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index 3018423b..0df13ae5 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -28,25 +28,31 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
public function tokenizeHTML($string, $config = null) {
if (!$config) $config = HTMLPurifier_Config::createDefault();
+ // Treat the input as a fragment unless full-document support is enabled
+ // AND a body element is actually matched. Initialising here keeps
+ // $is_full defined when the Core AcceptFullDocuments directive is off.
+ $is_full = false;
+ if ($config->get('Core', 'AcceptFullDocuments')) {
+ // second argument true: only ask whether a body element was matched
+ $is_full = $this->extractBody($string, true);
+ }
+
$doc = new DOMDocument();
- $doc->encoding = 'UTF-8'; // technically does nothing, but comprehensive
+ $doc->encoding = 'UTF-8'; // technically does nothing, but whatever
// replace and escape the CDATA sections, since parsing under HTML
// mode won't get 'em.
$string = $this->escapeCDATA($string);
+ if (!$is_full) {
// preprocess string, essential for UTF-8
$string =
- '
'.
- ''.
- ''.$string.'
';
+ ''.
+ ''.
+ ''.$string.'';
+ }
@$doc->loadHTML($string); // mute all errors, handle it transparently
+
+ // Find html and body by tag name instead of by child position: a
+ // caller-supplied full document need not put html at childNodes index 1.
return $this->tokenizeDOM(
- $doc->childNodes->item(1)-> // html
- childNodes->item(1)-> // body
- childNodes->item(0) // div
+ $doc->getElementsByTagName('html')->item(0)-> // html
+ getElementsByTagName('body')->item(0) // body
);
}
diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php
index 29634b69..535b3866 100644
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -114,6 +114,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$string = @ (string) $string;
if ($string == '') return array();
+ if ($config->get('Core', 'AcceptFullDocuments')) {
+ $string = $this->extractBody($string);
+ }
+
$cursor = 0; // our location in the text
$inside_tag = false; // whether or not we're parsing the inside of a tag
$array = array(); // result array
diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php
index da3843b0..02b3a484 100644
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -32,6 +32,9 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
function tokenizeHTML($html, $config = null) {
if (!$config) $config = HTMLPurifier_Config::createDefault();
$html = $this->escapeCDATA($html);
+ if ($config->get('Core', 'AcceptFullDocuments')) {
+ $html = $this->extractBody($html);
+ }
$html = $this->substituteNonSpecialEntities($html);
$parser=& new XML_HTMLSax3();
$parser->set_object($this);
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 9f71a838..b42b5edb 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -40,7 +40,44 @@ class HTMLPurifier_LexerTest extends UnitTestCase
$this->Lexer->substituteNonSpecialEntities('"') );
}
+ /**
+ * Asserts that extractBody() turns $text into $extract. Passing true
+ * (the default) for $extract means the input must come back unchanged.
+ */
+ function assertExtractBody($text, $extract = true) {
+ $actual = $this->Lexer->extractBody($text);
+ $expected = ($extract === true) ? $text : $extract;
+ $this->assertIdentical($expected, $actual);
+ }
+
function test_extractBody() { // drives extractBody() through assertExtractBody()
+ $this->assertExtractBody('Bold'); // one-argument form: input must come back unchanged
+ $this->assertExtractBody('Bold', 'Bold'); // NOTE(review): input and expected literals look identical here; markup may have been stripped from this view - confirm against the repository
+ $this->assertExtractBody( // multi-line input literal, followed by the expected extraction
+'
+
+
+
+ xyz
+
+
+
+
+',
+ '
+
+ '); // NOTE(review): expected value is whitespace-only in this view - confirm against the repository
+ $this->assertExtractBody('Bold', 'Bold'); // explicit expected value (two-argument form)
+ $this->assertExtractBody('asdf'); // not closed, don't accept
}