From 4b3436968d60fc4dc301f3fe1ac888fbe6d59bce Mon Sep 17 00:00:00 2001 From: bernhard-reiter Date: Wed, 27 Nov 2024 14:33:46 +0000 Subject: [PATCH] HTML API: Allow more contexts in `create_fragment`. This changeset modifies `WP_HTML_Processor::create_fragment( $html, $context )` to use a full processor and `create_fragment_at_current_node` instead of the other way around. This makes more sense and makes the main factory methods more clear, where the state required for fragments is set up in `create_fragment_at_current_node` instead of in both `create_fragment` and `create_fragment_at_current_node`. This allows for more HTML contexts to be provided to the basic `create_fragment` where the provided context HTML is appended to `<!DOCTYPE html>`, a full processor is created, the last tag opener is found, and a fragment parser is created at that node via `create_fragment_at_current_node`. The HTML5lib tests are updated accordingly to use this new method to create fragments. Props jonsurrell, dmsnell, bernhard-reiter. Fixes #62584. git-svn-id: https://develop.svn.wordpress.org/trunk@59467 602fd350-edb4-49c9-b593-d223f7449a82 --- .../html-api/class-wp-html-processor.php | 126 +++++++++---- .../tests/html-api/wpHtmlProcessor.php | 77 -------- .../wpHtmlProcessorFragmentParsing.php | 178 ++++++++++++++++++ .../html-api/wpHtmlProcessorHtml5lib.php | 98 +++++----- 4 files changed, 307 insertions(+), 172 deletions(-) create mode 100644 tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 48f7d7fe8c..1be795c5c7 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -279,51 +279,62 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * form is provided because a context element may have attributes that * impact the parse, such as with a SCRIPT tag and its `type` attribute. 
* - * ## Current HTML Support + * Example: * - * - The only supported context is ``, which is the default value. - * - The only supported document encoding is `UTF-8`, which is the default value. + * // Usually, snippets of HTML ought to be processed in the default `` context: + * $processor = WP_HTML_Processor::create_fragment( '

Hi

' ); + * + * // Some fragments should be processed in the correct context like this SVG: + * $processor = WP_HTML_Processor::create_fragment( '', '' ); + * + * // This fragment with TD tags should be processed in a TR context: + * $processor = WP_HTML_Processor::create_fragment( + * '123', + * '' + * ); + * + * In order to create a fragment processor at the correct location, the + * provided fragment will be processed as part of a full HTML document. + * The processor will search for the last opener tag in the document and + * create a fragment processor at that location. The document will be + * forced into "no-quirks" mode by including the HTML5 doctype. + * + * For advanced usage and precise control over the context element, use + * `WP_HTML_Processor::create_full_processor()` and + * `WP_HTML_Processor::create_fragment_at_current_node()`. + * + * UTF-8 is the only allowed encoding. If working with a document that + * isn't UTF-8, first convert the document to UTF-8, then pass in the + * converted HTML. * * @since 6.4.0 * @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances. + * @since 6.8.0 Can create fragments with any context element. * * @param string $html Input HTML fragment to process. - * @param string $context Context element for the fragment, must be default of ``. + * @param string $context Context element for the fragment. Defaults to ``. * @param string $encoding Text encoding of the document; must be default of 'UTF-8'. * @return static|null The created processor if successful, otherwise null. 
*/ public static function create_fragment( $html, $context = '', $encoding = 'UTF-8' ) { - if ( '' !== $context || 'UTF-8' !== $encoding ) { + $context_processor = static::create_full_parser( "{$context}", $encoding ); + if ( null === $context_processor ) { return null; } - $processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); - $processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY; - $processor->state->encoding = $encoding; - $processor->state->encoding_confidence = 'certain'; + while ( $context_processor->next_tag() ) { + $context_processor->set_bookmark( 'final_node' ); + } - // @todo Create "fake" bookmarks for non-existent but implied nodes. - $processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); - $processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); + if ( + ! $context_processor->has_bookmark( 'final_node' ) || + ! $context_processor->seek( 'final_node' ) + ) { + _doing_it_wrong( __METHOD__, __( 'No valid context element was detected.' ), '6.8.0' ); + return null; + } - $root_node = new WP_HTML_Token( - 'root-node', - 'HTML', - false - ); - - $processor->state->stack_of_open_elements->push( $root_node ); - - $context_node = new WP_HTML_Token( - 'context-node', - 'BODY', - false - ); - - $processor->context_node = $context_node; - $processor->breadcrumbs = array( 'HTML', $context_node->node_name ); - - return $processor; + return $context_processor->create_fragment_at_current_node( $html ); } /** @@ -333,9 +344,9 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * entire HTML document from start to finish. Consider a fragment parser with * a context node of ``. * - * Since UTF-8 is the only currently-accepted charset, if working with a - * document that isn't UTF-8, it's important to convert the document before - * creating the processor: pass in the converted HTML. + * UTF-8 is the only allowed encoding. 
If working with a document that + * isn't UTF-8, first convert the document to UTF-8, then pass in the + * converted HTML. * * @param string $html Input HTML document to process. * @param string|null $known_definite_encoding Optional. If provided, specifies the charset used @@ -459,35 +470,72 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor { * * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm * + * @since 6.8.0 + * * @param string $html Input HTML fragment to process. * @return static|null The created processor if successful, otherwise null. */ public function create_fragment_at_current_node( string $html ) { if ( $this->get_token_type() !== '#tag' || $this->is_tag_closer() ) { + _doing_it_wrong( + __METHOD__, + __( 'The context element must be a start tag.' ), + '6.8.0' + ); return null; } + $tag_name = $this->current_element->token->node_name; $namespace = $this->current_element->token->namespace; + if ( 'html' === $namespace && self::is_void( $tag_name ) ) { + _doing_it_wrong( + __METHOD__, + sprintf( + // translators: %s: A tag name like INPUT or BR. + __( 'The context element cannot be a void element, found "%s".' ), + $tag_name + ), + '6.8.0' + ); + return null; + } + /* * Prevent creating fragments at nodes that require a special tokenizer state. * This is unsupported by the HTML Processor. */ if ( 'html' === $namespace && - in_array( $this->current_element->token->node_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true ) + in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true ) ) { + _doing_it_wrong( + __METHOD__, + sprintf( + // translators: %s: A tag name like IFRAME or TEXTAREA. + __( 'The context element "%s" is not supported.' 
), + $tag_name + ), + '6.8.0' + ); return null; } - $fragment_processor = static::create_fragment( $html ); - if ( null === $fragment_processor ) { - return null; - } + $fragment_processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE ); $fragment_processor->compat_mode = $this->compat_mode; - $fragment_processor->context_node = clone $this->state->current_token; + // @todo Create "fake" bookmarks for non-existent but implied nodes. + $fragment_processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 ); + $root_node = new WP_HTML_Token( + 'root-node', + 'HTML', + false + ); + $fragment_processor->state->stack_of_open_elements->push( $root_node ); + + $fragment_processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 ); + $fragment_processor->context_node = clone $this->current_element->token; $fragment_processor->context_node->bookmark_name = 'context-node'; $fragment_processor->context_node->on_destroy = null; diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessor.php b/tests/phpunit/tests/html-api/wpHtmlProcessor.php index f80260cbc1..1ca60e691f 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessor.php @@ -1043,83 +1043,6 @@ class Tests_HtmlApi_WpHtmlProcessor extends WP_UnitTestCase { $this->assertEquals( $expected_token_counts, $processor->token_seen_count, 'Snapshot: ' . var_export( $processor->token_seen_count, true ) ); } - /** - * @ticket 62357 - */ - public function test_create_fragment_at_current_node_in_foreign_content() { - $processor = WP_HTML_Processor::create_full_parser( '' ); - $this->assertTrue( $processor->next_tag( 'SVG' ) ); - - $fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte
" ); - - $this->assertSame( 'svg', $fragment->get_namespace() ); - $this->assertTrue( $fragment->next_token() ); - - /* - * In HTML parsing, a nul byte would be ignored. - * In SVG it should be replaced with a replacement character. - */ - $this->assertSame( '#text', $fragment->get_token_type() ); - $this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text() ); - - $this->assertTrue( $fragment->next_tag( 'RECT' ) ); - $this->assertSame( 'svg', $fragment->get_namespace() ); - - $this->assertTrue( $fragment->next_tag( 'CIRCLE' ) ); - $this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs() ); - $this->assertTrue( $fragment->next_tag( 'foreignObject' ) ); - $this->assertSame( 'svg', $fragment->get_namespace() ); - } - - /** - * @ticket 62357 - */ - public function test_create_fragment_at_current_node_in_foreign_content_integration_point() { - $processor = WP_HTML_Processor::create_full_parser( '' ); - $this->assertTrue( $processor->next_tag( 'foreignObject' ) ); - - $fragment = $processor->create_fragment_at_current_node( "\0not-preceded-by-nul-byte" ); - - // Nothing has been processed, the html namespace should be used for parsing as an integration point. - $this->assertSame( 'html', $fragment->get_namespace() ); - - // HTML parsing transforms IMAGE into IMG. - $this->assertTrue( $fragment->next_tag( 'IMG' ) ); - - $this->assertTrue( $fragment->next_token() ); - - // In HTML parsing, the nul byte is ignored and the text is reached. - $this->assertSame( '#text', $fragment->get_token_type() ); - $this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() ); - - /* - * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace. - * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close. 
- */ - $this->assertTrue( $fragment->next_tag( 'RECT' ) ); - $this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() ); - $this->assertSame( 'html', $fragment->get_namespace() ); - $this->assertTrue( $fragment->has_self_closing_flag() ); - $this->assertTrue( $fragment->expects_closer() ); - } - - /** - * @ticket 62357 - */ - public function test_prevent_fragment_creation_on_closers() { - $processor = WP_HTML_Processor::create_full_parser( '

' ); - $processor->next_tag( 'P' ); - $processor->next_tag( - array( - 'tag_name' => 'P', - 'tag_closers' => 'visit', - ) - ); - $this->assertSame( 'P', $processor->get_tag() ); - $this->assertTrue( $processor->is_tag_closer() ); - $this->assertNull( $processor->create_fragment_at_current_node( 'fragment HTML' ) ); - } - /** * Ensure that lowercased tag_name query matches tags case-insensitively. * diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php b/tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php new file mode 100644 index 0000000000..4913fa07eb --- /dev/null +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorFragmentParsing.php @@ -0,0 +1,178 @@ +' ); + $this->assertTrue( $processor->next_tag( 'SVG' ) ); + + $fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte
" ); + + $this->assertSame( 'svg', $fragment->get_namespace() ); + $this->assertTrue( $fragment->next_token() ); + + /* + * In HTML parsing, a nul byte would be ignored. + * In SVG it should be replaced with a replacement character. + */ + $this->assertSame( '#text', $fragment->get_token_type() ); + $this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text() ); + + $this->assertTrue( $fragment->next_tag( 'RECT' ) ); + $this->assertSame( 'svg', $fragment->get_namespace() ); + + $this->assertTrue( $fragment->next_tag( 'CIRCLE' ) ); + $this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs() ); + $this->assertTrue( $fragment->next_tag( 'foreignObject' ) ); + $this->assertSame( 'svg', $fragment->get_namespace() ); + } + + /** + * @ticket 62357 + */ + public function test_create_fragment_at_current_node_in_foreign_content_integration_point() { + $processor = WP_HTML_Processor::create_full_parser( '' ); + $this->assertTrue( $processor->next_tag( 'foreignObject' ) ); + + $fragment = $processor->create_fragment_at_current_node( "\0not-preceded-by-nul-byte" ); + + // Nothing has been processed, the html namespace should be used for parsing as an integration point. + $this->assertSame( 'html', $fragment->get_namespace() ); + + // HTML parsing transforms IMAGE into IMG. + $this->assertTrue( $fragment->next_tag( 'IMG' ) ); + + $this->assertTrue( $fragment->next_token() ); + + // In HTML parsing, the nul byte is ignored and the text is reached. + $this->assertSame( '#text', $fragment->get_token_type() ); + $this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() ); + + /* + * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace. + * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close. 
+ */ + $this->assertTrue( $fragment->next_tag( 'RECT' ) ); + $this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() ); + $this->assertSame( 'html', $fragment->get_namespace() ); + $this->assertTrue( $fragment->has_self_closing_flag() ); + $this->assertTrue( $fragment->expects_closer() ); + } + + /** + * @expectedIncorrectUsage WP_HTML_Processor::create_fragment_at_current_node + * @ticket 62357 + */ + public function test_prevent_fragment_creation_on_closers() { + $processor = WP_HTML_Processor::create_full_parser( '

' ); + $processor->next_tag( 'P' ); + $processor->next_tag( + array( + 'tag_name' => 'P', + 'tag_closers' => 'visit', + ) + ); + $this->assertSame( 'P', $processor->get_tag() ); + $this->assertTrue( $processor->is_tag_closer() ); + $this->assertNull( $processor->create_fragment_at_current_node( 'fragment HTML' ) ); + } + + /** + * Verifies that the fragment parser doesn't allow invalid context nodes. + * + * This includes void elements and self-contained elements because they can + * contain no inner HTML. Operations on self-contained elements should occur + * through methods such as {@see WP_HTML_Tag_Processor::set_modifiable_text}. + * + * @ticket 62584 + * + * @dataProvider data_invalid_fragment_contexts + * + * @param string $context Invalid context node for fragment parser. + */ + public function test_rejects_invalid_fragment_contexts( string $context, string $doing_it_wrong_method_name ) { + $this->setExpectedIncorrectUsage( "WP_HTML_Processor::{$doing_it_wrong_method_name}" ); + $this->assertNull( + WP_HTML_Processor::create_fragment( 'just a test', $context ), + "Should not have been able to create a fragment parser with context node {$context}" + ); + } + + /** + * Data provider. + * + * @return array[] + */ + public static function data_invalid_fragment_contexts() { + return array( + /* + * Invalid contexts. + */ + /* + * The text node is confused with a virtual body open tag. + * This should fail to set a bookmark in `create_fragment` + * but currently does not, it slips through and fails in + * `create_fragment_at_current_node`. + */ + 'Invalid text' => array( 'just some text', 'create_fragment_at_current_node' ), + 'Invalid comment' => array( '', 'create_fragment' ), + 'Invalid closing' => array( '', 'create_fragment' ), + 'Invalid DOCTYPE' => array( '', 'create_fragment' ), + /* + * PLAINTEXT should appear in the unsupported elements, but at the + * moment it's completely unsupported by the processor so + * the context element cannot be found. 
+ */ + 'Unsupported PLAINTEXT' => array( '', 'create_fragment' ), + + /* + * Invalid contexts. + */ + 'AREA' => array( '<area>', 'create_fragment_at_current_node' ), + 'BASE' => array( '<base>', 'create_fragment_at_current_node' ), + 'BASEFONT' => array( '<basefont>', 'create_fragment_at_current_node' ), + 'BGSOUND' => array( '<bgsound>', 'create_fragment_at_current_node' ), + 'BR' => array( '<br>', 'create_fragment_at_current_node' ), + 'COL' => array( '<table><colgroup><col>', 'create_fragment_at_current_node' ), + 'EMBED' => array( '<embed>', 'create_fragment_at_current_node' ), + 'FRAME' => array( '<frameset><frame>', 'create_fragment_at_current_node' ), + 'HR' => array( '<hr>', 'create_fragment_at_current_node' ), + 'IMG' => array( '<img>', 'create_fragment_at_current_node' ), + 'INPUT' => array( '<input>', 'create_fragment_at_current_node' ), + 'KEYGEN' => array( '<keygen>', 'create_fragment_at_current_node' ), + 'LINK' => array( '<link>', 'create_fragment_at_current_node' ), + 'META' => array( '<meta>', 'create_fragment_at_current_node' ), + 'PARAM' => array( '<param>', 'create_fragment_at_current_node' ), + 'SOURCE' => array( '<source>', 'create_fragment_at_current_node' ), + 'TRACK' => array( '<track>', 'create_fragment_at_current_node' ), + 'WBR' => array( '<wbr>', 'create_fragment_at_current_node' ), + + /* + * Unsupported elements. Include a tag closer to ensure the element can be found + * and does not pause the parser at an incomplete token. 
+ */ + 'IFRAME' => array( '<iframe></iframe>', 'create_fragment_at_current_node' ), + 'NOEMBED' => array( '<noembed></noembed>', 'create_fragment_at_current_node' ), + 'NOFRAMES' => array( '<noframes></noframes>', 'create_fragment_at_current_node' ), + 'SCRIPT' => array( '<script></script>', 'create_fragment_at_current_node' ), + 'SCRIPT with type' => array( '<script type="javascript"></script>', 'create_fragment_at_current_node' ), + 'STYLE' => array( '<style></style>', 'create_fragment_at_current_node' ), + 'TEXTAREA' => array( '<textarea></textarea>', 'create_fragment_at_current_node' ), + 'TITLE' => array( '<title></title>', 'create_fragment_at_current_node' ), + 'XMP' => array( '<xmp></xmp>', 'create_fragment_at_current_node' ), + ); + } +} diff --git a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php index 7abe63a859..5e0c3b77f8 100644 --- a/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php +++ b/tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php @@ -153,69 +153,55 @@ class Tests_HtmlApi_Html5lib extends WP_UnitTestCase { * @return string|null Tree structure of parsed HTML, if supported, else null. */ private static function build_tree_representation( ?string $fragment_context, string $html ) { - $processor = null; if ( $fragment_context ) { - if ( 'body' === $fragment_context ) { - $processor = WP_HTML_Processor::create_fragment( $html ); - } else { - - /* - * If the string of characters starts with "svg ", the context - * element is in the SVG namespace and the substring after - * "svg " is the local name. If the string of characters starts - * with "math ", the context element is in the MathML namespace - * and the substring after "math " is the local name. - * Otherwise, the context element is in the HTML namespace and - * the string is the local name. 
- */ - if ( str_starts_with( $fragment_context, 'svg ' ) ) { - $tag_name = substr( $fragment_context, 4 ); - if ( 'svg' === $tag_name ) { - $parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><svg>' ); - } else { - $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><svg><{$tag_name}>" ); - } - $parent_processor->next_tag( $tag_name ); - } elseif ( str_starts_with( $fragment_context, 'math ' ) ) { - $tag_name = substr( $fragment_context, 5 ); - if ( 'math' === $tag_name ) { - $parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><math>' ); - } else { - $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><math><{$tag_name}>" ); - } - $parent_processor->next_tag( $tag_name ); + /* + * If the string of characters starts with "svg ", the context + * element is in the SVG namespace and the substring after + * "svg " is the local name. If the string of characters starts + * with "math ", the context element is in the MathML namespace + * and the substring after "math " is the local name. + * Otherwise, the context element is in the HTML namespace and + * the string is the local name. 
+ */ + if ( str_starts_with( $fragment_context, 'svg ' ) ) { + $tag_name = substr( $fragment_context, 4 ); + if ( 'svg' === $tag_name ) { + $fragment_context_html = '<svg>'; } else { - if ( in_array( - $fragment_context, - array( - 'caption', - 'col', - 'colgroup', - 'tbody', - 'td', - 'tfoot', - 'th', - 'thead', - 'tr', - ), - true - ) ) { - $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><table><{$fragment_context}>" ); - $parent_processor->next_tag(); - } else { - $parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><{$fragment_context}>" ); - } - $parent_processor->next_tag( $fragment_context ); + $fragment_context_html = "<svg><{$tag_name}>"; } - if ( null !== $parent_processor->get_unsupported_exception() ) { - throw $parent_processor->get_unsupported_exception(); + } elseif ( str_starts_with( $fragment_context, 'math ' ) ) { + $tag_name = substr( $fragment_context, 5 ); + if ( 'math' === $tag_name ) { + $fragment_context_html = '<math>'; + } else { + $fragment_context_html = "<math><{$tag_name}>"; } - if ( null !== $parent_processor->get_last_error() ) { - throw new Exception( $parent_processor->get_last_error() ); + } else { + // Tags that only appear in tables need a special case. + if ( in_array( + $fragment_context, + array( + 'caption', + 'col', + 'colgroup', + 'tbody', + 'td', + 'tfoot', + 'th', + 'thead', + 'tr', + ), + true + ) ) { + $fragment_context_html = "<table><{$fragment_context}>"; + } else { + $fragment_context_html = "<{$fragment_context}>"; } - $processor = $parent_processor->create_fragment_at_current_node( $html ); } + $processor = WP_HTML_Processor::create_fragment( $html, $fragment_context_html ); + if ( null === $processor ) { throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() ); }