From b1b3377b9cd8f5857ecf0d365f00849d13d8cb0f Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Thu, 23 Nov 2006 03:23:35 +0000 Subject: [PATCH] [1.3.0] Huge upgrade, (X)HTML Strict now supported + Transparently handles inline elements in block context (blockquote) ! Added GET method to demo for easier validation, added 50kb max input size ! New directive %HTML.BlockWrapper, for block-ifying inline elements ! New directive %HTML.Parent, allows you to only allow inline content - Added missing type to ChildDef_Chameleon . ChildDef_Required guards against empty tags . Lookup table HTMLDefinition->info_flow_elements added . Added peace-of-mind variable initialization to Strategy_FixNesting git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@560 48356398-32a2-884e-a903-53898d9a118a --- NEWS | 9 ++ docs/examples/demo.php | 88 +++++++++++++++---- docs/ref-loose-vs-strict.txt | 9 +- library/HTMLPurifier/ChildDef/Chameleon.php | 2 + library/HTMLPurifier/ChildDef/Required.php | 5 +- .../ChildDef/StrictBlockquote.php | 70 +++++++++++++++ library/HTMLPurifier/HTMLDefinition.php | 70 +++++++++++++-- library/HTMLPurifier/Strategy/FixNesting.php | 1 + .../ChildDef/StrictBlockquoteTest.php | 50 +++++++++++ .../HTMLPurifier/Strategy/FixNestingTest.php | 14 +++ tests/HTMLPurifier/Test.php | 2 +- tests/index.php | 1 + 12 files changed, 289 insertions(+), 32 deletions(-) create mode 100644 library/HTMLPurifier/ChildDef/StrictBlockquote.php create mode 100644 tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php diff --git a/NEWS b/NEWS index 8e05466a..9a180fc2 100644 --- a/NEWS +++ b/NEWS @@ -11,6 +11,15 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier 1.3.0, unknown release date (major feature release) +! (X)HTML Strict now supported + + Transparently handles inline elements in block context (blockquote) +! Added GET method to demo for easier validation, added 50kb max input size +! New directive %HTML.BlockWrapper, for block-ifying inline elements +! New directive %HTML.Parent, allows you to only allow inline content +- Added missing type to ChildDef_Chameleon +. ChildDef_Required guards against empty tags +. Lookup table HTMLDefinition->info_flow_elements added +. Added peace-of-mind variable initialization to Strategy_FixNesting 1.2.1, unknown release date (bugfix/minor feature release, may be dropped if 1.2.0 is stable) diff --git a/docs/examples/demo.php b/docs/examples/demo.php index 35a47986..d5b3a5b1 100644 --- a/docs/examples/demo.php +++ b/docs/examples/demo.php @@ -1,11 +1,30 @@ +'; + +function getFormMethod() { + return (isset($_REQUEST['post'])) ? 'post' : 'get'; +} + +if (empty($_REQUEST['strict'])) { +?> - + + + + HTMLPurifier Live Demo @@ -14,15 +33,21 @@ header('Content-type:text/html;charset=UTF-8');

HTMLPurifier Live Demo

50000) { + ?> +

Request exceeds maximum allowed text size of 50kb.

+ set('Core', 'TidyFormat', !empty($_POST['tidy'])); + $config->set('Core', 'TidyFormat', !empty($_REQUEST['tidy'])); + $config->set('HTML', 'Strict', !empty($_REQUEST['strict'])); $purifier = new HTMLPurifier($config); $pure_html = $purifier->purify($html); @@ -43,7 +68,17 @@ echo htmlspecialchars($pure_html, ENT_COMPAT, 'UTF-8'); ?> +

If you would like to validate the code with +W3C's +validator, copy and paste the entire demo page's source.

+ @@ -54,12 +89,13 @@ will filter it.

} ?> -
+
- HTML + HTML Purifier Input () + +

Warning: GET request method can only hold + approximately 2000 characters. If you need to test anything + larger than that, try the POST form.

+
Nicely format output with Tidy? />
+ name="tidy" /> +
XHTML 1.0 Strict output? />
+
Serve as application/xhtml+xml? (not for IE) />
-

Return to HTMLPurifier's home page.

+

Return to HTMLPurifier's home page. +Try the form in GET and POST request +flavors (GET is easy to validate, but POST allows larger inputs).

+ +

+ Valid XHTML 1.0 Transitional +

+ \ No newline at end of file diff --git a/docs/ref-loose-vs-strict.txt b/docs/ref-loose-vs-strict.txt index 110bda37..39d51aa2 100644 --- a/docs/ref-loose-vs-strict.txt +++ b/docs/ref-loose-vs-strict.txt @@ -7,11 +7,11 @@ to HTML Purifier, though, so let's take a look: == Major incompatibilities == -BLOCKQUOTE changes from 'flow' to 'block' +[done] BLOCKQUOTE changes from 'flow' to 'block' behavior: inline inner contents should not be nuked, paragraph as necessary -U, S, STRIKE cut +[partially-done] U, S, STRIKE cut behavior: replace with appropriate inline span + CSS -ADDRESS from potpourri to Inline (removes p tags) +[partially-done] ADDRESS from potpourri to Inline (removes p tags) (lower importance) behavior: p tags silently dropped or replaced with something (
) == Things we can loosen up == @@ -38,5 +38,6 @@ A tag's attribute 'target' (for selecting frames) cut OL/LI tag's attribute 'start' (for renumbering lists) cut behavior: no substitute, just delete Attribute 'name' deprecated in favor of 'id' - behavior: create proper AttrTransform + behavior: not allowed in first place, but create proper AttrTransform PRE tag allows SUB/SUP? (strict dtd comment vs syntax, loose disallows) + behavior: disallow as usual diff --git a/library/HTMLPurifier/ChildDef/Chameleon.php b/library/HTMLPurifier/ChildDef/Chameleon.php index 22724646..deaf84d9 100644 --- a/library/HTMLPurifier/ChildDef/Chameleon.php +++ b/library/HTMLPurifier/ChildDef/Chameleon.php @@ -23,6 +23,8 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef */ var $block; + var $type = 'chameleon'; + /** * @param $inline List of elements to allow when inline. * @param $block List of elements to allow when block. diff --git a/library/HTMLPurifier/ChildDef/Required.php b/library/HTMLPurifier/ChildDef/Required.php index 0d80fbf8..0bf0ee45 100644 --- a/library/HTMLPurifier/ChildDef/Required.php +++ b/library/HTMLPurifier/ChildDef/Required.php @@ -20,7 +20,10 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef $elements = explode('|', $elements); } $elements = array_flip($elements); - foreach ($elements as $i => $x) $elements[$i] = true; + foreach ($elements as $i => $x) { + $elements[$i] = true; + if (empty($i)) unset($elements[$i]); + } $this->elements = $elements; $this->gen = new HTMLPurifier_Generator(); } diff --git a/library/HTMLPurifier/ChildDef/StrictBlockquote.php b/library/HTMLPurifier/ChildDef/StrictBlockquote.php new file mode 100644 index 00000000..980acac3 --- /dev/null +++ b/library/HTMLPurifier/ChildDef/StrictBlockquote.php @@ -0,0 +1,70 @@ +getHTMLDefinition(); + if (!$this->init) { + // allow all inline elements + $this->elements = $def->info_flow_elements; + $this->elements['#PCDATA'] = true; + $this->init = true; + } + + $result = parent::validateChildren($tokens_of_children, $config, $context); + if ($result === false) return array(); + if ($result === true) $result = $tokens_of_children; + + $block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper); + $block_wrap_end = new HTMLPurifier_Token_End( $def->info_block_wrapper); + $is_inline = false; + $depth = 0; + $ret = array(); + + // assuming that there are no comment tokens + foreach ($result as $i => $token) { + $token = $result[$i]; + // ifs are nested for readability + if (!$is_inline) { + if (!$depth) { + if (($token->type == 'text') || + ($def->info[$token->name]->type == 'inline')) { + $is_inline = true; + $ret[] = $block_wrap_start; + } + } + } else { + if (!$depth) { + // starting tokens have been inline text / empty + if ($token->type == 'start' || $token->type == 'empty') { + if ($def->info[$token->name]->type == 'block') { + // ended + $ret[] = $block_wrap_end; + $is_inline = false; + } + } + } + } + $ret[] = $token; + if ($token->type == 'start') $depth++; + if ($token->type == 'end') $depth--; + } + if ($is_inline) $ret[] = $block_wrap_end; + return $ret; + } +} + +?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLDefinition.php b/library/HTMLPurifier/HTMLDefinition.php index 28776349..ae585a86 100644 --- a/library/HTMLPurifier/HTMLDefinition.php +++ b/library/HTMLPurifier/HTMLDefinition.php @@ -23,6 +23,7 @@ require_once 'HTMLPurifier/ChildDef.php'; require_once 'HTMLPurifier/ChildDef/Required.php'; require_once 'HTMLPurifier/ChildDef/Optional.php'; require_once 'HTMLPurifier/ChildDef/Table.php'; + require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php'; require_once 'HTMLPurifier/Generator.php'; require_once 'HTMLPurifier/Token.php'; require_once 'HTMLPurifier/TagTransform.php'; @@ -45,6 +46,23 @@ HTMLPurifier_ConfigSchema::define( 'Determines whether or not to use Transitional (loose) or Strict rulesets.' ); +HTMLPurifier_ConfigSchema::define( + 'HTML', 'BlockWrapper', 'p', 'string', + 'String name of element to wrap inline elements that are inside a block '. + 'context. This only occurs in the children of blockquote in strict mode. '. + 'Example: by default value, <blockquote>Foo</blockquote> '. + 'would become <blockquote><p>Foo</p></blockquote>. The '. + '<p> tags can be replaced '. + 'with whatever you desire, as long as it is a block level element.' +); + +HTMLPurifier_ConfigSchema::define( + 'HTML', 'Parent', 'div', 'string', + 'String name of element that HTML fragment passed to library will be '. + 'inserted in. An interesting variation would be using span as the '. + 'parent element, meaning that only inline tags would be allowed.' +); + /** * Defines the purified HTML type with large amounts of objects. * @@ -79,11 +97,17 @@ class HTMLPurifier_HTMLDefinition /** * String name of parent element HTML will be going into. - * @todo Allow this to be overloaded by user config * @public */ var $info_parent = 'div'; + /** + * String name of element used to wrap inline elements in block context + * @note This is rarely used except for BLOCKQUOTEs in strict mode + * @public + */ + var $info_block_wrapper = 'p'; + /** * Associative array of deprecated tag name to HTMLPurifier_TagTransform * @public @@ -102,6 +126,11 @@ class HTMLPurifier_HTMLDefinition */ var $info_attr_transform_post = array(); + /** + * Lookup table of flow elements + */ + var $info_flow_elements = array(); + /** * Initializes the definition, the meat of the class. */ @@ -164,11 +193,9 @@ class HTMLPurifier_HTMLDefinition $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'. ' | cite | abbr | acronym'; $e_phrase = "$e_phrase_basic | $e_phrase_extra"; - $e_inline_forms = ''; // humor the dtd $e_misc_inline = 'ins | del'; $e_misc = "$e_misc_inline"; - $e_inline = "a | $e_special | $e_fontstyle | $e_phrase". - " | $e_inline_forms"; + $e_inline = "a | $e_special | $e_fontstyle | $e_phrase"; // pseudo-property we created for convenience, see later on $e__inline = "#PCDATA | $e_inline | $e_misc_inline"; // note the casing @@ -181,11 +208,10 @@ class HTMLPurifier_HTMLDefinition $e__flow = "#PCDATA | $e_block | $e_inline | $e_misc"; $e_Flow = new HTMLPurifier_ChildDef_Optional($e__flow); $e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA". - " | $e_special | $e_fontstyle | $e_phrase | $e_inline_forms". - " | $e_misc_inline"); + " | $e_special | $e_fontstyle | $e_phrase | $e_misc_inline"); $e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a". " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic". - " | $e_inline_forms | $e_misc_inline"); + " | $e_misc_inline"); $e_form_content = new HTMLPurifier_ChildDef_Optional('');//unused $e_form_button_content = new HTMLPurifier_ChildDef_Optional('');//unused @@ -198,7 +224,7 @@ class HTMLPurifier_HTMLDefinition $this->info['div']->child = $e_Flow; if ($config->get('HTML', 'Strict')) { - $this->info['blockquote']->child = $e_Block; + $this->info['blockquote']->child = new HTMLPurifier_ChildDef_StrictBlockquote(); } else { $this->info['blockquote']->child = $e_Flow; } @@ -276,7 +302,7 @@ class HTMLPurifier_HTMLDefinition // reuses $e_Inline and $e_Block foreach ($e_Inline->elements as $name => $bool) { - if ($name == '#PCDATA' || $name == '') continue; + if ($name == '#PCDATA') continue; $this->info[$name]->type = 'inline'; } @@ -284,6 +310,10 @@ class HTMLPurifier_HTMLDefinition $this->info[$name]->type = 'block'; } + foreach ($e_Flow->elements as $name => $bool) { + $this->info_flow_elements[$name] = true; + } + ////////////////////////////////////////////////////////////////////// // info[]->excludes : defines elements that aren't allowed in here @@ -447,6 +477,28 @@ class HTMLPurifier_HTMLDefinition } } + ////////////////////////////////////////////////////////////////////// + // info_block_wrapper : wraps inline elements in block context + + $block_wrapper = $config->get('HTML', 'BlockWrapper'); + if (isset($e_Block->elements[$block_wrapper])) { + $this->info_block_wrapper = $block_wrapper; + } else { + trigger_error('Cannot use non-block element as block wrapper.', + E_USER_ERROR); + } + + ////////////////////////////////////////////////////////////////////// + // info_parent : parent element of the HTML fragment + + $parent = $config->get('HTML', 'Parent'); + if (isset($this->info[$parent])) { + $this->info_parent = $parent; + } else { + trigger_error('Cannot use unrecognized element as parent.', + E_USER_ERROR); + } + } function setAttrForTableElements($attr, $def) { diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php index ca6f1a33..3357937e 100644 --- a/library/HTMLPurifier/Strategy/FixNesting.php +++ b/library/HTMLPurifier/Strategy/FixNesting.php @@ -141,6 +141,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy if ($excluded) { // there is an exclusion, remove the entire node $result = false; + $excludes = array(); // not used, but good to initialize anyway } else { // DEFINITION CALL $def = $definition->info[$tokens[$i]->name]; diff --git a/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php b/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php new file mode 100644 index 00000000..a3e2cb14 --- /dev/null +++ b/tests/HTMLPurifier/ChildDef/StrictBlockquoteTest.php @@ -0,0 +1,50 @@ +obj = new HTMLPurifier_ChildDef_StrictBlockquote(); + + $this->assertResult(''); + $this->assertResult('

Valid

'); + $this->assertResult('
Still valid
'); + $this->assertResult('Needs wrap', '

Needs wrap

'); + $this->assertResult( + 'Wrap'. '

Do not wrap

', + '

Wrap

Do not wrap

' + ); + $this->assertResult( + '

Do not

'.'Wrap', + '

Do not

Wrap

' + ); + $this->assertResult( + '
  • Not allowed
  • Paragraph.

    Hmm.

    ', + '

    Not allowedParagraph.

    Hmm.

    ' + ); + $this->assertResult( + $var = 'He said
    perhaps
    we should nuke them.', + "

    $var

    " + ); + $this->assertResult( + 'BarPeopleConniving.'. '

    Fools!

    ', + '

    Bar'. 'PeopleConniving.

    Fools!

    ' + ); + $this->assertResult('Needs wrap', '
    Needs wrap
    ', + array('HTML.BlockWrapper' => 'div')); + + $this->assertResult('Needs wrap', '

    Needs wrap

    ', + array('HTML.BlockWrapper' => 'dav')); + $this->assertError('Cannot use non-block element as block wrapper.'); + $this->assertNoErrors(); + + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/Strategy/FixNestingTest.php b/tests/HTMLPurifier/Strategy/FixNestingTest.php index ff88cc09..a395cf07 100644 --- a/tests/HTMLPurifier/Strategy/FixNestingTest.php +++ b/tests/HTMLPurifier/Strategy/FixNestingTest.php @@ -83,6 +83,20 @@ class HTMLPurifier_Strategy_FixNestingTest extends HTMLPurifier_StrategyHarness '' ); + // test inline parent + $this->assertResult( + 'Bold', true, array('HTML.Parent' => 'span') + ); + $this->assertResult( + '
    Reject
    ', 'Reject', array('HTML.Parent' => 'span') + ); + + $this->assertResult( + '
    Accept
    ', true, array('HTML.Parent' => 'script') + ); + $this->assertError('Cannot use unrecognized element as parent.'); + $this->assertNoErrors(); + } } diff --git a/tests/HTMLPurifier/Test.php b/tests/HTMLPurifier/Test.php index bbb8fada..aaa8341c 100644 --- a/tests/HTMLPurifier/Test.php +++ b/tests/HTMLPurifier/Test.php @@ -31,7 +31,7 @@ class HTMLPurifier_Test extends UnitTestCase $this->assertPurification( '
    Illegal contents
    ', - '
    ' + '

    Illegal contents

    ' ); } diff --git a/tests/index.php b/tests/index.php index 36b58433..92c845fe 100644 --- a/tests/index.php +++ b/tests/index.php @@ -49,6 +49,7 @@ $test_files[] = 'ChildDef/OptionalTest.php'; $test_files[] = 'ChildDef/ChameleonTest.php'; $test_files[] = 'ChildDef/CustomTest.php'; $test_files[] = 'ChildDef/TableTest.php'; +$test_files[] = 'ChildDef/StrictBlockquoteTest.php'; $test_files[] = 'GeneratorTest.php'; $test_files[] = 'EntityLookupTest.php'; $test_files[] = 'Strategy/RemoveForeignElementsTest.php';