HTML API: Respect document compat mode when handling CSS class names.

The HTML API has been behaving as if CSS class name selectors matched class names in an ASCII case-insensitive manner. This is only true if the document in question is set to quirks mode. Unfortunately most documents processed will be set to no-quirks mode, meaning that some CSS behaviors have been matching incorrectly when provided with case variants of class names.

In this patch, the CSS methods have been audited and updated to adhere to the rules governing ASCII case sensitivity when matching classes. This includes `add_class()`, `remove_class()`, `has_class()`, and `class_list()`. Now, it is assumed that a document is in no-quirks mode unless a full HTML parser infers quirks mode, and these methods will treat class names in a byte-for-byte manner. Otherwise, when a document is in quirks mode, the methods will compare the provided class names against existing class names for the tag in an ASCII case insensitive way, while `class_list()` will return a lower-cased version of the existing class names.

The lower-casing in `class_list()` is performed for consistency, since it's possible that multiple case variants of the same comparable class name exist on a tag in the input HTML.

Developed in https://github.com/WordPress/wordpress-develop/pull/7169
Discussed in https://core.trac.wordpress.org/ticket/61531

Props dmsnell, jonsurrell.
See #61531.


git-svn-id: https://develop.svn.wordpress.org/trunk@58985 602fd350-edb4-49c9-b593-d223f7449a82
This commit is contained in:
Dennis Snell 2024-09-04 04:32:37 +00:00
parent e13b7cb6db
commit fb40fe915e
4 changed files with 301 additions and 78 deletions

View File

@ -299,31 +299,6 @@ class WP_HTML_Processor_State {
*/
const INSERTION_MODE_AFTER_AFTER_FRAMESET = 'insertion-mode-after-after-frameset';
/**
* No-quirks mode document compatability mode.
*
* > In no-quirks mode, the behavior is (hopefully) the desired behavior
* > described by the modern HTML and CSS specifications.
*
* @since 6.7.0
*
* @var string
*/
const NO_QUIRKS_MODE = 'no-quirks-mode';
/**
* Quirks mode document compatability mode.
*
* > In quirks mode, layout emulates behavior in Navigator 4 and Internet
* > Explorer 5. This is essential in order to support websites that were
* > built before the widespread adoption of web standards.
*
* @since 6.7.0
*
* @var string
*/
const QUIRKS_MODE = 'quirks-mode';
/**
* The stack of template insertion modes.
*
@ -381,30 +356,6 @@ class WP_HTML_Processor_State {
*/
public $insertion_mode = self::INSERTION_MODE_INITIAL;
/**
* Indicates if the document is in quirks mode or no-quirks mode.
*
* Impact on HTML parsing:
*
* - In `NO_QUIRKS_MODE` CSS class and ID selectors match in a byte-for-byte
* manner, otherwise for backwards compatability, class selectors are to
* match in an ASCII case-insensitive manner.
*
* - When not in `QUIRKS_MODE`, a TABLE start tag implicitly closes an open P tag
* if one is in scope and open, otherwise the TABLE becomes a child of the P.
*
* `QUIRKS_MODE` impacts many styling-related aspects of an HTML document, but
* none of the other changes modifies how the HTML is parsed or selected.
*
* @see self::QUIRKS_MODE
* @see self::NO_QUIRKS_MODE
*
* @since 6.7.0
*
* @var string
*/
public $document_mode = self::NO_QUIRKS_MODE;
/**
* Context node initializing fragment parser, if created as a fragment parser.
*

View File

@ -1080,7 +1080,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
case 'html':
$doctype = $this->get_doctype_info();
if ( null !== $doctype && 'quirks' === $doctype->indicated_compatability_mode ) {
$this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE;
$this->compat_mode = WP_HTML_Tag_Processor::QUIRKS_MODE;
}
/*
@ -1095,7 +1095,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* > Anything else
*/
initial_anything_else:
$this->state->document_mode = WP_HTML_Processor_State::QUIRKS_MODE;
$this->compat_mode = WP_HTML_Tag_Processor::QUIRKS_MODE;
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML;
return $this->step( self::REPROCESS_CURRENT_NODE );
}
@ -2448,7 +2448,7 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
* > has a p element in button scope, then close a p element.
*/
if (
WP_HTML_Processor_State::QUIRKS_MODE !== $this->state->document_mode &&
WP_HTML_Tag_Processor::QUIRKS_MODE !== $this->compat_mode &&
$this->state->stack_of_open_elements->has_p_in_button_scope()
) {
$this->close_a_p_element();
@ -4938,6 +4938,10 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @todo When reconstructing active formatting elements with attributes, find a way
* to indicate if the virtually-reconstructed formatting elements contain the
* wanted class name.
*
* @param string $wanted_class Look for this CSS class name; matched ASCII case-insensitively only in quirks mode, otherwise byte-for-byte.
* @return bool|null Whether the matched tag contains the given class name, or null if not matched.
*/

View File

@ -511,6 +511,32 @@ class WP_HTML_Tag_Processor {
*/
protected $parser_state = self::STATE_READY;
/**
* Indicates if the document is in quirks mode or no-quirks mode.
*
* Impact on HTML parsing:
*
* - In `NO_QUIRKS_MODE` (also known as "standard mode"):
* - CSS class and ID selectors match byte-for-byte (case-sensitively).
* - A TABLE start tag `<table>` implicitly closes any open `P` element.
*
* - In `QUIRKS_MODE`:
* - CSS class and ID selectors match in an ASCII case-insensitive manner.
* - A TABLE start tag `<table>` opens a `TABLE` element as a child of a `P`
* element if one is open.
*
* Quirks and no-quirks mode are thus mostly about styling, but have an impact when
* tables are found inside paragraph elements.
*
* @see self::QUIRKS_MODE
* @see self::NO_QUIRKS_MODE
*
* @since 6.7.0
*
* @var string
*/
protected $compat_mode = self::NO_QUIRKS_MODE;
/**
* Indicates whether the parser is inside foreign content,
* e.g. inside an SVG or MathML element.
@ -1155,6 +1181,8 @@ class WP_HTML_Tag_Processor {
$seen = array();
$is_quirks = self::QUIRKS_MODE === $this->compat_mode;
$at = 0;
while ( $at < strlen( $class ) ) {
// Skip past any initial boundary characters.
@ -1169,13 +1197,11 @@ class WP_HTML_Tag_Processor {
return;
}
/*
* CSS class names are case-insensitive in the ASCII range.
*
* @see https://www.w3.org/TR/CSS2/syndata.html#x1
*/
$name = str_replace( "\x00", "\u{FFFD}", strtolower( substr( $class, $at, $length ) ) );
$at += $length;
$name = str_replace( "\x00", "\u{FFFD}", substr( $class, $at, $length ) );
if ( $is_quirks ) {
$name = strtolower( $name );
}
$at += $length;
/*
* It's expected that the number of class names for a given tag is relatively small.
@ -1205,10 +1231,14 @@ class WP_HTML_Tag_Processor {
return null;
}
$wanted_class = strtolower( $wanted_class );
$case_insensitive = self::QUIRKS_MODE === $this->compat_mode;
$wanted_length = strlen( $wanted_class );
foreach ( $this->class_list() as $class_name ) {
if ( $class_name === $wanted_class ) {
if (
strlen( $class_name ) === $wanted_length &&
0 === substr_compare( $class_name, $wanted_class, 0, strlen( $wanted_class ), $case_insensitive )
) {
return true;
}
}
@ -2296,6 +2326,23 @@ class WP_HTML_Tag_Processor {
*/
$modified = false;
$seen = array();
$to_remove = array();
$is_quirks = self::QUIRKS_MODE === $this->compat_mode;
if ( $is_quirks ) {
foreach ( $this->classname_updates as $updated_name => $action ) {
if ( self::REMOVE_CLASS === $action ) {
$to_remove[] = strtolower( $updated_name );
}
}
} else {
foreach ( $this->classname_updates as $updated_name => $action ) {
if ( self::REMOVE_CLASS === $action ) {
$to_remove[] = $updated_name;
}
}
}
// Remove unwanted classes by only copying the new ones.
$existing_class_length = strlen( $existing_class );
while ( $at < $existing_class_length ) {
@ -2311,25 +2358,23 @@ class WP_HTML_Tag_Processor {
break;
}
$name = substr( $existing_class, $at, $name_length );
$at += $name_length;
$name = substr( $existing_class, $at, $name_length );
$comparable_class_name = $is_quirks ? strtolower( $name ) : $name;
$at += $name_length;
// If this class is marked for removal, start processing the next one.
$remove_class = (
isset( $this->classname_updates[ $name ] ) &&
self::REMOVE_CLASS === $this->classname_updates[ $name ]
);
// If a class has already been seen then skip it; it should not be added twice.
if ( ! $remove_class ) {
$this->classname_updates[ $name ] = self::SKIP_CLASS;
}
if ( $remove_class ) {
// If this class is marked for removal, remove it and move on to the next one.
if ( in_array( $comparable_class_name, $to_remove, true ) ) {
$modified = true;
continue;
}
// If a class has already been seen then skip it; it should not be added twice.
if ( in_array( $comparable_class_name, $seen, true ) ) {
continue;
}
$seen[] = $comparable_class_name;
/*
* Otherwise, append it to the new "class" attribute value.
*
@ -2350,7 +2395,8 @@ class WP_HTML_Tag_Processor {
// Add new classes by appending those which haven't already been seen.
foreach ( $this->classname_updates as $name => $operation ) {
if ( self::ADD_CLASS === $operation ) {
$comparable_name = $is_quirks ? strtolower( $name ) : $name;
if ( self::ADD_CLASS === $operation && ! in_array( $comparable_name, $seen, true ) ) {
$modified = true;
$class .= strlen( $class ) > 0 ? ' ' : '';
@ -3932,8 +3978,29 @@ class WP_HTML_Tag_Processor {
return false;
}
$this->classname_updates[ $class_name ] = self::ADD_CLASS;
if ( self::QUIRKS_MODE !== $this->compat_mode ) {
$this->classname_updates[ $class_name ] = self::ADD_CLASS;
return true;
}
/*
* Because class names are matched ASCII-case-insensitively in quirks mode,
* this needs to see if a case variant of the given class name is already
* enqueued and update that existing entry, if so. This picks the casing of
* the first-provided class name for all lexical variations.
*/
$class_name_length = strlen( $class_name );
foreach ( $this->classname_updates as $updated_name => $action ) {
if (
strlen( $updated_name ) === $class_name_length &&
0 === substr_compare( $updated_name, $class_name, 0, $class_name_length, true )
) {
$this->classname_updates[ $updated_name ] = self::ADD_CLASS;
return true;
}
}
$this->classname_updates[ $class_name ] = self::ADD_CLASS;
return true;
}
@ -3953,10 +4020,29 @@ class WP_HTML_Tag_Processor {
return false;
}
if ( null !== $this->tag_name_starts_at ) {
if ( self::QUIRKS_MODE !== $this->compat_mode ) {
$this->classname_updates[ $class_name ] = self::REMOVE_CLASS;
return true;
}
/*
* Because class names are matched ASCII-case-insensitively in quirks mode,
* this needs to see if a case variant of the given class name is already
* enqueued and update that existing entry, if so. This picks the casing of
* the first-provided class name for all lexical variations.
*/
$class_name_length = strlen( $class_name );
foreach ( $this->classname_updates as $updated_name => $action ) {
if (
strlen( $updated_name ) === $class_name_length &&
0 === substr_compare( $updated_name, $class_name, 0, $class_name_length, true )
) {
$this->classname_updates[ $updated_name ] = self::REMOVE_CLASS;
return true;
}
}
$this->classname_updates[ $class_name ] = self::REMOVE_CLASS;
return true;
}
@ -4350,6 +4436,37 @@ class WP_HTML_Tag_Processor {
*/
const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
/**
* No-quirks mode document compatibility mode.
*
* > In no-quirks mode, the behavior is (hopefully) the desired behavior
* > described by the modern HTML and CSS specifications.
*
* @see self::$compat_mode
* @see https://developer.mozilla.org/en-US/docs/Web/HTML/Quirks_Mode_and_Standards_Mode
*
* @since 6.7.0
*
* @var string
*/
const NO_QUIRKS_MODE = 'no-quirks-mode';
/**
* Quirks mode document compatibility mode.
*
* > In quirks mode, layout emulates behavior in Navigator 4 and Internet
* > Explorer 5. This is essential in order to support websites that were
* > built before the widespread adoption of web standards.
*
* @see self::$compat_mode
* @see https://developer.mozilla.org/en-US/docs/Web/HTML/Quirks_Mode_and_Standards_Mode
*
* @since 6.7.0
*
* @var string
*/
const QUIRKS_MODE = 'quirks-mode';
/**
* Indicates that a span of text may contain any combination of significant
* kinds of characters: NULL bytes, whitespace, and others.

View File

@ -519,4 +519,155 @@ class Tests_HtmlApi_WpHtmlProcessor extends WP_UnitTestCase {
$processor = WP_HTML_Processor::create_fragment( '<svg><script />' );
$this->assertTrue( $processor->next_tag( 'script' ) );
}
/**
* Ensures that the tag processor is case sensitive when removing CSS classes in no-quirks mode.
*
* @ticket 61531
*
* @covers ::remove_class
*/
public function test_remove_class_no_quirks_mode() {
	$html   = '<!DOCTYPE html><span class="UPPER">';
	$parser = WP_HTML_Processor::create_full_parser( $html );
	$parser->next_tag( 'SPAN' );

	// A case variant of the class name must not match, so the markup stays as it was.
	$parser->remove_class( 'upper' );
	$after_variant = $parser->get_updated_html();
	$this->assertSame( $html, $after_variant );

	// The byte-for-byte identical class name is removed.
	$parser->remove_class( 'UPPER' );
	$after_exact = $parser->get_updated_html();
	$this->assertSame( '<!DOCTYPE html><span >', $after_exact );
}
/**
* Ensures that the tag processor is case sensitive when adding CSS classes in no-quirks mode.
*
* @ticket 61531
*
* @covers ::add_class
*/
public function test_add_class_no_quirks_mode() {
	$html   = '<!DOCTYPE html><span class="UPPER">';
	$parser = WP_HTML_Processor::create_full_parser( $html );
	$parser->next_tag( 'SPAN' );

	// Adding a class name that is already present verbatim changes nothing.
	$parser->add_class( 'UPPER' );
	$after_exact = $parser->get_updated_html();
	$this->assertSame( $html, $after_exact );

	// A case variant counts as a different class name and is appended.
	$parser->add_class( 'upper' );
	$after_variant = $parser->get_updated_html();
	$this->assertSame( '<!DOCTYPE html><span class="UPPER upper">', $after_variant );
}
/**
* Ensures that the tag processor is case sensitive when checking for CSS classes in no-quirks mode.
*
* @ticket 61531
*
* @covers ::has_class
*/
public function test_has_class_no_quirks_mode() {
	$html   = '<!DOCTYPE html><span class="UPPER">';
	$parser = WP_HTML_Processor::create_full_parser( $html );
	$parser->next_tag( 'SPAN' );

	// A case variant of the class name is not a match.
	$found_variant = $parser->has_class( 'upper' );
	$this->assertFalse( $found_variant );

	// Only the exact class name matches.
	$found_exact = $parser->has_class( 'UPPER' );
	$this->assertTrue( $found_exact );
}
/**
* Ensures that the tag processor lists unique CSS class names in no-quirks mode.
*
* @ticket 61531
*
* @covers ::class_list
*/
public function test_class_list_no_quirks_mode() {
	/*
	 * Code points used in the class attribute:
	 *
	 *  - U+00C9 LATIN CAPITAL LETTER E WITH ACUTE
	 *  - U+0045 LATIN CAPITAL LETTER E
	 *  - U+0301 COMBINING ACUTE ACCENT
	 *
	 * Exact repeats ("A", U+00C9) should be listed once, while case variants
	 * and the decomposed form of É should each be listed separately because
	 * class names are compared byte-for-byte.
	 */
	$html = "<!DOCTYPE html><span class='A A a B b \u{C9} \u{45}\u{0301} \u{C9} é'>";

	$parser = WP_HTML_Processor::create_full_parser( $html );
	$parser->next_tag( 'SPAN' );

	$expected = array( 'A', 'a', 'B', 'b', 'É', "E\u{0301}", 'é' );
	$actual   = iterator_to_array( $parser->class_list() );

	$this->assertSame( $expected, $actual );
}
/**
* Ensures that the tag processor is case insensitive when removing CSS classes in quirks mode.
*
* @ticket 61531
*
* @covers ::remove_class
*/
public function test_remove_class_quirks_mode() {
	// The input has no DOCTYPE, which this test relies on to put the parser into quirks mode.
	$html   = '<span class="uPPER">';
	$parser = WP_HTML_Processor::create_full_parser( $html );
	$parser->next_tag( 'SPAN' );

	// A differently-cased name should still remove the existing class.
	$parser->remove_class( 'upPer' );
	$result = $parser->get_updated_html();

	$this->assertSame( '<span >', $result );
}
/**
* Ensures that the tag processor is case insensitive when adding CSS classes in quirks mode.
*
* @ticket 61531
*
* @covers ::add_class
*/
public function test_add_class_quirks_mode() {
	// The input has no DOCTYPE, which this test relies on to put the parser into quirks mode.
	$html   = '<span class="UPPER">';
	$parser = WP_HTML_Processor::create_full_parser( $html );
	$parser->next_tag( 'SPAN' );

	// A case variant of an existing class is treated as already present.
	$parser->add_class( 'upper' );
	$after_variant = $parser->get_updated_html();
	$this->assertSame( $html, $after_variant );

	// A genuinely new class name is appended with the casing it was given.
	$parser->add_class( 'ANOTHER-UPPER' );
	$after_new = $parser->get_updated_html();
	$this->assertSame( '<span class="UPPER ANOTHER-UPPER">', $after_new );
}
/**
* Ensures that the tag processor is case insensitive when checking for CSS classes in quirks mode.
*
* @ticket 61531
*
* @covers ::has_class
*/
public function test_has_class_quirks_mode() {
	// The input has no DOCTYPE, which this test relies on to put the parser into quirks mode.
	$html   = '<span class="UPPER">';
	$parser = WP_HTML_Processor::create_full_parser( $html );
	$parser->next_tag( 'SPAN' );

	// A case variant of the class name matches.
	$found_variant = $parser->has_class( 'upper' );
	$this->assertTrue( $found_variant );

	// The exact class name matches as well.
	$found_exact = $parser->has_class( 'UPPER' );
	$this->assertTrue( $found_exact );
}
/**
* Ensures that the tag processor lists unique CSS class names in quirks mode.
*
* @ticket 61531
*
* @covers ::class_list
*/
public function test_class_list_quirks_mode() {
	/*
	 * Code points used in the class attribute:
	 *
	 *  - U+00C9 LATIN CAPITAL LETTER E WITH ACUTE
	 *  - U+0045 LATIN CAPITAL LETTER E
	 *  - U+0065 LATIN SMALL LETTER E
	 *  - U+0301 COMBINING ACUTE ACCENT
	 *
	 * The expected list shows ASCII letters reported in lower case and
	 * collapsed into one entry per name ("A"/"a", "B"/"b", and the two
	 * decomposed forms of E with an acute accent), while the precomposed
	 * É and é remain separate entries, distinct from each other and from
	 * the decomposed form.
	 */
	$html = "<span class='A A a B b \u{C9} \u{45}\u{301} \u{C9} é \u{65}\u{301}'>";

	$parser = WP_HTML_Processor::create_full_parser( $html );
	$parser->next_tag( 'SPAN' );

	$expected = array( 'a', 'b', 'É', "e\u{301}", 'é' );
	$actual   = iterator_to_array( $parser->class_list() );

	$this->assertSame( $expected, $actual );
}
}