diff --git a/lib/experimental/html/class-wp-html-attribute-token.php b/lib/experimental/html/class-wp-html-attribute-token.php new file mode 100644 index 00000000000000..32adfe02e0fb8c --- /dev/null +++ b/lib/experimental/html/class-wp-html-attribute-token.php @@ -0,0 +1,89 @@ +name = $name; + $this->value_starts_at = $value_start; + $this->value_length = $value_length; + $this->start = $start; + $this->end = $end; + $this->is_true = $is_true; + } +} diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php new file mode 100644 index 00000000000000..be6179c963571f --- /dev/null +++ b/lib/experimental/html/class-wp-html-tag-processor.php @@ -0,0 +1,1265 @@ + "c" not " c" + * @TODO: Skip over `/` in attributes area, split attribute names by `/` + * @TODO: Decode HTML references/entities in class names when matching. + * E.g. match having class `1<"2` needs to recognize `class="1<"2"`. + * @TODO: Decode character references in `get_attribute()` + * @TODO: Properly escape attribute value in `set_attribute()` + * + * @package WordPress + * @subpackage HTML + * @since 6.1.0 + */ + +/** + * Processes an input HTML document by applying a specified set + * of patches to that input. Tokenizes HTML but does not fully + * parse the input document. + * + * @since 6.1.0 + */ +class WP_HTML_Tag_Processor { + + /** + * The HTML document to parse. + * + * @since 6.1.0 + * @var string + */ + private $html; + + /** + * The last query passed to next_tag(). + * + * @since 6.1.0 + * @var array|null + */ + private $last_query; + + /** + * The tag name this processor currently scans for. + * + * @since 6.1.0 + * @var string|null + */ + private $sought_tag_name; + + /** + * The CSS class name this processor currently scans for. + * + * @since 6.1.0 + * @var string|null + */ + private $sought_class_name; + + /** + * The match offset this processor currently scans for. 
+ * + * @since 6.1.0 + * @var int|null + */ + private $sought_match_offset; + + /** + * The updated HTML document. + * + * @since 6.1.0 + * @var string + */ + private $updated_html = ''; + + /** + * How many bytes from the original HTML document were already read. + * + * @since 6.1.0 + * @var int + */ + private $parsed_bytes = 0; + + /** + * How many bytes from the original HTML document were already treated + * with the requested replacements. + * + * @since 6.1.0 + * @var int + */ + private $updated_bytes = 0; + + /** + * Byte offset in the input document where the name of the currently matched tag starts. + * + * @since 6.1.0 + * @var int|null + */ + private $tag_name_starts_at; + + /** + * Byte offset after the name of current tag. + * Example: + *
+ * // supposing the parser is working through this content + * // and stops after recognizing the `id` attribute + * //
+ * // ^ parsing will continue from this point + * $this->attributes = array( + * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ) + * ); + * + * // when picking up parsing again, or when asking to find the + * // `class` attribute we will continue and add to this array + * $this->attributes = array( + * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ), + * 'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 ) + * ); + * + * // Note that only the `class` attribute value is stored in the index. + * // That's because it is the only value used by this class at the moment. + * + * + * @since 6.1.0 + * @var WP_HTML_Attribute_Token[] + */ + private $attributes = array(); + + /** + * Which class names to add or remove from a tag. + * + * These are tracked separately from attribute updates because they are + * semantically distinct, whereas this interface exists for the common + * case of adding and removing class names while other attributes are + * generally modified as with DOM `setAttribute` calls. + * + * When modifying an HTML document these will eventually be collapsed + * into a single lexical update to replace the `class` attribute. + * + * Example: + * + * // Add the `wp-block-group` class, remove the `wp-group` class. + * $classname_updates = array( + * // Indexed by a comparable class name. + * 'wp-block-group' => WP_HTML_Tag_Processor::ADD_CLASS, + * 'wp-group' => WP_HTML_Tag_Processor::REMOVE_CLASS, + * ); + * + * + * @since 6.1.0 + * @var bool[] + */ + private $classname_updates = array(); + + const ADD_CLASS = true; + const REMOVE_CLASS = false; + const SKIP_CLASS = null; + + /** + * Lexical replacements to apply to input HTML document. 
+ * + * HTML modifications collapse into lexical replacements in order to + * provide an efficient mechanism to update documents lazily and in + * order to support a variety of semantic modifications without + * building a complicated parsing machinery. That is, it's up to + * the calling class to generate the lexical modification from the + * semantic change requested. + * + * Example: + * + * // Replace an attribute stored with a new value, indices + * // sourced from the lazily-parsed HTML recognizer. + * $start = $attributes['src']->start; + * $end = $attributes['src']->end; + * $modifications[] = new WP_HTML_Text_Replacement( $start, $end, get_the_post_thumbnail_url() ); + * + * // Correspondingly, something like this + * // will appear in the replacements array. + * $replacements = array( + * new WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' ) + * ); + * + * + * @since 6.1.0 + * @var WP_HTML_Text_Replacement[] + */ + private $attribute_updates = array(); + + /** + * Constructor. + * + * @since 6.1.0 + * + * @param string $html HTML to process. + */ + public function __construct( $html ) { + $this->html = $html; + } + + /** + * Finds the next tag matching the $query. + * + * @since 6.1.0 + * + * @param array|string|null $query { + * Optional. Which tag name to find, having which class, etc. Default is to find any tag. + * + * @type string|null $tag_name Which tag to find, or `null` for "any tag." + * @type int|null $match_offset Find the Nth tag matching all search criteria. + * 1 for "first" tag, 3 for "third," etc. + * Defaults to first tag. + * @type string|null $class_name Tag must contain this whole class name to match. + * } + * @return boolean Whether a tag was matched. + */ + public function next_tag( $query = null ) { + $this->parse_query( $query ); + $already_found = 0; + + do { + /* + * Unfortunately we can't try to search for only the tag name we want because that might + * lead us to skip over other tags and lose track of our place. 
So we need to search for + * _every_ tag and then check after we find one if it's the one we are looking for. + */ + if ( false === $this->parse_next_tag() ) { + $this->parsed_bytes = strlen( $this->html ); + + return false; + } + + $this->parse_tag_opener_attributes(); + + if ( $this->matches() ) { + $already_found++; + } + + // Avoid copying the tag name string when possible. + $t = $this->html[ $this->tag_name_starts_at ]; + if ( 's' === $t || 'S' === $t || 't' === $t || 'T' === $t ) { + $tag_name = $this->get_tag(); + + if ( 'script' === $tag_name ) { + $this->skip_script_data(); + } elseif ( 'textarea' === $tag_name || 'title' === $tag_name ) { + $this->skip_rcdata( $tag_name ); + } + } + } while ( $already_found < $this->sought_match_offset ); + + return true; + } + + /** + * Skips the contents of the title and textarea tags until an appropriate + * tag closer is found. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state + * @param string $tag_name – the lowercase tag name which will close the RCDATA region. + * @since 6.1.0 + */ + private function skip_rcdata( $tag_name ) { + $html = $this->html; + $doc_length = strlen( $html ); + $tag_length = strlen( $tag_name ); + + $at = $this->parsed_bytes; + + while ( true ) { + $at = strpos( $this->html, ' $doc_length ) { + $this->parsed_bytes = $doc_length; + return; + } + + $at += 2; + + /* + * We have to find a case-insensitive match to the tag name. + * Note also that since tag names are limited to US-ASCII + * characters we can ignore any kind of Unicode normalizing + * forms when comparing. If we get a non-ASCII character it + * will never be a match. 
+ */ + for ( $i = 0; $i < $tag_length; $i++ ) { + $tag_char = $tag_name[ $i ]; + $html_char = $html[ $at + $i ]; + + if ( $html_char !== $tag_char && strtolower( $html_char ) !== $tag_char ) { + $at += $i; + continue 2; + } + } + + $at += $tag_length; + $this->parsed_bytes = $at; + + /* + * Ensure we terminate the tag name, otherwise we might, + * for example, accidentally match the sequence + * "</titled>" for "</title>". + */ + $c = $html[ $at ]; + if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) { + continue; + } + + $this->skip_tag_closer_attributes(); + $at = $this->parsed_bytes; + + if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { + $this->parsed_bytes++; + return; + } + } + } + + /** + * Skips the contents of
', + ); + + $examples['Simple uppercase script tag'] = array( + '
', + ); + + $examples['Script with a comment opener inside should end at the next script tag closer (dash dash escaped state)'] = array( + '
-->', + ); + + $examples['Script with a comment opener and a script tag opener inside should end two script tag closer later (double escaped state)'] = array( + '
-->', + ); + + $examples['Double escaped script with a tricky opener'] = array( + '">
', + ); + + $examples['Double escaped script with a tricky closer'] = array( + '">
', + ); + + $examples['Double escaped, then escaped, then double escaped'] = array( + '
', + ); + + $examples['Script with a commented a script tag opener inside should at the next tag closer (dash dash escaped state)'] = array( + '
-->', + ); + + $examples['Script closer with another script tag in closer attributes'] = array( + '
', + ); + + $examples['Script closer with attributes'] = array( + '
', + ); + + $examples['Script opener with title closer inside'] = array( + '
', + ); + + $examples['Complex script with many parsing states'] = array( + '-->
-->', + ); + return $examples; + } + + /** + * @ticket 56299 + * + * @covers next_tag + * + * @dataProvider data_rcdata_state + */ + public function test_next_tag_ignores_the_contents_of_a_rcdata_tag( $rcdata_then_div, $rcdata_tag ) { + $p = new WP_HTML_Tag_Processor( $rcdata_then_div ); + $p->next_tag(); + $this->assertSame( $rcdata_tag, $p->get_tag(), "The first found tag was not '$rcdata_tag'" ); + $p->next_tag(); + $this->assertSame( 'div', $p->get_tag(), "The second found tag was not 'div'" ); + } + + /** + * Data provider for test_next_tag_ignores_the_contents_of_a_rcdata_tag(). + * + * @return array { + * @type array { + * @type string $rcdata_then_div The HTML snippet containing RCDATA and div tags. + * @type string $rcdata_tag The RCDATA tag. + * } + * } + */ + public function data_rcdata_state() { + $examples = array(); + $examples['Simple textarea'] = array( + '
', + 'textarea', + ); + + $examples['Simple title'] = array( + '<span class="d-none d-md-inline">Back to notifications</title</span>
', + 'title', + ); + + $examples['Comment opener inside a textarea tag should be ignored'] = array( + '
-->', + 'textarea', + ); + + $examples['Textarea closer with another textarea tag in closer attributes'] = array( + '
', + 'textarea', + ); + + $examples['Textarea closer with attributes'] = array( + '
', + 'textarea', + ); + + $examples['Textarea opener with title closer inside'] = array( + '
', + 'textarea', + ); + return $examples; + } + + /** + * @ticket 56299 + * + * @covers next_tag + * @covers set_attribute + * @covers __toString + */ + public function test_can_query_and_update_wrongly_nested_tags() { + $p = new WP_HTML_Tag_Processor( + '123

456789

' + ); + $p->next_tag( 'span' ); + $p->set_attribute( 'class', 'span-class' ); + $p->next_tag( 'p' ); + $p->set_attribute( 'class', 'p-class' ); + $this->assertSame( + '123

456789

', + (string) $p + ); + } + + /** + * @ticket 56299 + * + * @covers next_tag + * @covers remove_attribute + * @covers __toString + */ + public function test_removing_attributes_works_even_in_malformed_html() { + $p = new WP_HTML_Tag_Processor( self::HTML_MALFORMED ); + $p->next_tag( 'span' ); + $p->remove_attribute( 'Notifications<' ); + $this->assertSame( + '
Back to notifications
', + (string) $p + ); + } + + /** + * @ticket 56299 + * + * @covers next_tag + * @covers set_attribute + * @covers __toString + */ + public function test_updating_attributes_works_even_in_malformed_html_1() { + $p = new WP_HTML_Tag_Processor( self::HTML_MALFORMED ); + $p->next_tag( 'span' ); + $p->set_attribute( 'id', 'first' ); + $p->next_tag( 'span' ); + $p->set_attribute( 'id', 'second' ); + $this->assertSame( + '
Back to notifications
', + (string) $p + ); + } + + /** + * @ticket 56299 + * + * @covers next_tag + * @covers set_attribute + * @covers add_class + * @covers __toString + * + * @dataProvider data_malformed_tag + */ + public function test_updating_attributes_works_even_in_malformed_html_2( $html_input, $html_expected ) { + $p = new WP_HTML_Tag_Processor( $html_input ); + $p->next_tag(); + $p->set_attribute( 'foo', 'bar' ); + $p->add_class( 'firstTag' ); + $p->next_tag(); + $p->add_class( 'secondTag' ); + $this->assertSame( + $html_expected, + (string) $p + ); + } + + /** + * Data provider for test_updating_attributes_works_even_in_malformed_html_2(). + * + * @return array { + * @type array { + * @type string $html_input The input HTML snippet. + * @type string $html_expected The expected HTML snippet after processing. + * } + * } + */ + public function data_malformed_tag() { + $null_byte = chr( 0 ); + $examples = array(); + $examples['Invalid entity inside attribute value'] = array( + 'test', + 'test', + ); + + $examples['HTML tag opening inside attribute value'] = array( + '
This <is> a <strong is="true">thing.
test', + '
This <is> a <strong is="true">thing.
test', + ); + + $examples['HTML tag brackets in attribute values and data markup'] = array( + '
This <is> a <strong is="true">thing.
test', + '
This <is> a <strong is="true">thing.
test', + ); + + $examples['Single and double quotes in attribute value'] = array( + '

test', + '

test', + ); + + $examples['Unquoted attribute values'] = array( + '


test', + '
test', + ); + + $examples['Double-quotes escaped in double-quote attribute value'] = array( + '
test', + '
test', + ); + + $examples['Unquoted attribute value'] = array( + '
test', + '
test', + ); + + $examples['Unquoted attribute value with tag-like value'] = array( + '
>test', + '
>test', + ); + + $examples['Unquoted attribute value with tag-like value followed by tag-like data'] = array( + '
>test', + '
>test', + ); + + $examples['1'] = array( + '
test', + '
test', + ); + + $examples['2'] = array( + '
test', + '
test', + ); + + $examples['4'] = array( + '
test', + '
test', + ); + + $examples['5'] = array( + '
code>test', + '
code>test', + ); + + $examples['6'] = array( + '
test', + '
test', + ); + + $examples['7'] = array( + '
test', + '
test', + ); + + $examples['8'] = array( + '
id="test">test', + '
id="test">test', + ); + + $examples['9'] = array( + '
test', + '
test', + ); + + $examples['10'] = array( + 'test', + 'test', + ); + + $examples['11'] = array( + 'The applicative operator <* works well in Haskell; is what?test', + 'The applicative operator <* works well in Haskell; is what?test', + ); + + $examples['12'] = array( + '<3 is a heart but is a tag.test', + '<3 is a heart but is a tag.test', + ); + + $examples['13'] = array( + 'test', + 'test', + ); + + $examples['14'] = array( + 'test', + 'test', + ); + + $examples['15'] = array( + ' a HTML Tag]]>test', + ' a HTML Tag]]>test', + ); + + $examples['16'] = array( + '
test', + '
test', + ); + + $examples['17'] = array( + '
test', + '
test', + ); + + $examples['18'] = array( + '
test', + '
test', + ); + + $examples['19'] = array( + '
test', + '
test', + ); + + $examples['20'] = array( + '
test', + '
test', + ); + + $examples['21'] = array( + '
test', + '
test', + ); + + $examples['22'] = array( + '
test', + '
test', + ); + + $examples['23'] = array( + '
test', + '
test', + ); + + $examples['24'] = array( + '
test', + '
test', + ); + + $examples['25'] = array( + '
test', + '
test', + ); + + $examples['Multiple unclosed tags treated as a single tag'] = array( + '
+test', + '
+test', + ); + + $examples['27'] = array( + '
test', + '
test', + ); + + $examples['28'] = array( + '
test', + '
test', + ); + + return $examples; + } +}