From c6c5dbba30cdf90310e4f6767e6c31f760c68b70 Mon Sep 17 00:00:00 2001 From: Adam Zielinski Date: Fri, 23 Sep 2022 16:36:52 +1000 Subject: [PATCH] WP_HTML_Tag_Processor: Inject dynamic data to block HTML markup in PHP (#42485) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce WP_HTML_Tag_Processor for reliably modifying HTML attributes. Dynamic blocks often need to inject a CSS class name or set in the rendered block HTML markup but lack the means to do so. WP_HTML_Tag_Processor solves this problem. It scans through an HTML document to find specific tags, then transforms those tags by adding, removing, or updating the values of the HTML attributes within that tag (opener). Importantly, it does not fully parse HTML or _recurse_ into the HTML structure. Instead WP_HTML_Tag_Processor scans linearly through a document and only parses the HTML tag openers. Example: ``` $p = new WP_HTML_Tag_Processor('
'); $p->next_tag('img')->set_attribute('src', '/wp-content/logo.png'); echo $p; //
``` For more details and context, see the original GitHub Pull Request at https://github.com/WordPress/gutenberg/pull/42485 and the overview issue at https://github.com/WordPress/gutenberg/issues/44410. Co-authored-by: Adam Zieliński Co-authored-by: Dennis Snell Co-authored-by: Grzegorz Ziółkowski Co-authored-by: Sören Wrede Co-authored-by: Colin Stewart <79332690+costdev@users.noreply.github.com> --- .../html/class-wp-html-attribute-token.php | 89 ++ .../html/class-wp-html-tag-processor.php | 1265 +++++++++++++++++ .../html/class-wp-html-text-replacement.php | 59 + lib/experimental/html/index.php | 11 + lib/load.php | 3 + phpunit/html/wp-html-tag-processor-test.php | 1187 ++++++++++++++++ 6 files changed, 2614 insertions(+) create mode 100644 lib/experimental/html/class-wp-html-attribute-token.php create mode 100644 lib/experimental/html/class-wp-html-tag-processor.php create mode 100644 lib/experimental/html/class-wp-html-text-replacement.php create mode 100644 lib/experimental/html/index.php create mode 100644 phpunit/html/wp-html-tag-processor-test.php diff --git a/lib/experimental/html/class-wp-html-attribute-token.php b/lib/experimental/html/class-wp-html-attribute-token.php new file mode 100644 index 00000000000000..32adfe02e0fb8c --- /dev/null +++ b/lib/experimental/html/class-wp-html-attribute-token.php @@ -0,0 +1,89 @@ +name = $name; + $this->value_starts_at = $value_start; + $this->value_length = $value_length; + $this->start = $start; + $this->end = $end; + $this->is_true = $is_true; + } +} diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php new file mode 100644 index 00000000000000..be6179c963571f --- /dev/null +++ b/lib/experimental/html/class-wp-html-tag-processor.php @@ -0,0 +1,1265 @@ + "c" not " c" + * @TODO: Skip over `/` in attributes area, split attribute names by `/` + * @TODO: Decode HTML references/entities in class names when matching. + * E.g. match having class `1<"2` needs to recognize `class="1<"2"`. + * @TODO: Decode character references in `get_attribute()` + * @TODO: Properly escape attribute value in `set_attribute()` + * + * @package WordPress + * @subpackage HTML + * @since 6.1.0 + */ + +/** + * Processes an input HTML document by applying a specified set + * of patches to that input. Tokenizes HTML but does not fully + * parse the input document. + * + * @since 6.1.0 + */ +class WP_HTML_Tag_Processor { + + /** + * The HTML document to parse. + * + * @since 6.1.0 + * @var string + */ + private $html; + + /** + * The last query passed to next_tag(). + * + * @since 6.1.0 + * @var array|null + */ + private $last_query; + + /** + * The tag name this processor currently scans for. + * + * @since 6.1.0 + * @var string|null + */ + private $sought_tag_name; + + /** + * The CSS class name this processor currently scans for. + * + * @since 6.1.0 + * @var string|null + */ + private $sought_class_name; + + /** + * The match offset this processor currently scans for. + * + * @since 6.1.0 + * @var int|null + */ + private $sought_match_offset; + + /** + * The updated HTML document. + * + * @since 6.1.0 + * @var string + */ + private $updated_html = ''; + + /** + * How many bytes from the original HTML document were already read. + * + * @since 6.1.0 + * @var int + */ + private $parsed_bytes = 0; + + /** + * How many bytes from the original HTML document were already treated + * with the requested replacements. + * + * @since 6.1.0 + * @var int + */ + private $updated_bytes = 0; + + /** + * The name of the currently matched tag. + * + * @since 6.1.0 + * @var integer|null + */ + private $tag_name_starts_at; + + /** + * Byte offset after the name of current tag. + * Example: + *
+ * // supposing the parser is working through this content + * // and stops after recognizing the `id` attribute + * //
+ * // ^ parsing will continue from this point + * $this->attributes = array( + * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ) + * ); + * + * // when picking up parsing again, or when asking to find the + * // `class` attribute we will continue and add to this array + * $this->attributes = array( + * 'id' => new WP_HTML_Attribute_Match( 'id', null, 6, 17 ), + * 'class' => new WP_HTML_Attribute_Match( 'class', 'outline', 18, 32 ) + * ); + * + * // Note that only the `class` attribute value is stored in the index. + * // That's because it is the only value used by this class at the moment. + * + * + * @since 6.1.0 + * @var WP_HTML_Attribute_Token[] + */ + private $attributes = array(); + + /** + * Which class names to add or remove from a tag. + * + * These are tracked separately from attribute updates because they are + * semantically distinct, whereas this interface exists for the common + * case of adding and removing class names while other attributes are + * generally modified as with DOM `setAttribute` calls. + * + * When modifying an HTML document these will eventually be collapsed + * into a single lexical update to replace the `class` attribute. + * + * Example: + * + * // Add the `WP-block-group` class, remove the `WP-group` class. + * $class_changes = array( + * // Indexed by a comparable class name + * 'wp-block-group' => new WP_Class_Name_Operation( 'WP-block-group', WP_Class_Name_Operation::ADD ), + * 'wp-group' => new WP_Class_Name_Operation( 'WP-group', WP_Class_Name_Operation::REMOVE ) + * ); + * + * + * @since 6.1.0 + * @var bool[] + */ + private $classname_updates = array(); + + const ADD_CLASS = true; + const REMOVE_CLASS = false; + const SKIP_CLASS = null; + + /** + * Lexical replacements to apply to input HTML document. + * + * HTML modifications collapse into lexical replacements in order to + * provide an efficient mechanism to update documents lazily and in + * order to support a variety of semantic modifications without + * building a complicated parsing machinery. That is, it's up to + * the calling class to generate the lexical modification from the + * semantic change requested. + * + * Example: + * + * // Replace an attribute stored with a new value, indices + * // sourced from the lazily-parsed HTML recognizer. + * $start = $attributes['src']->start; + * $end = $attributes['src']->end; + * $modifications[] = new WP_HTML_Text_Replacement( $start, $end, get_the_post_thumbnail_url() ); + * + * // Correspondingly, something like this + * // will appear in the replacements array. + * $replacements = array( + * WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' ) + * ); + * + * + * @since 6.1.0 + * @var WP_HTML_Text_Replacement[] + */ + private $attribute_updates = array(); + + /** + * Constructor. + * + * @since 6.1.0 + * + * @param string $html HTML to process. + */ + public function __construct( $html ) { + $this->html = $html; + } + + /** + * Finds the next tag matching the $query. + * + * @since 6.1.0 + * + * @param array|string $query { + * Which tag name to find, having which class, etc. + * + * @type string|null $tag_name Which tag to find, or `null` for "any tag." + * @type int|null $match_offset Find the Nth tag matching all search criteria. + * 0 for "first" tag, 2 for "third," etc. + * Defaults to first tag. + * @type string|null $class_name Tag must contain this whole class name to match. + * } + * @return boolean Whether a tag was matched. + */ + public function next_tag( $query = null ) { + $this->parse_query( $query ); + $already_found = 0; + + do { + /* + * Unfortunately we can't try to search for only the tag name we want because that might + * lead us to skip over other tags and lose track of our place. So we need to search for + * _every_ tag and then check after we find one if it's the one we are looking for. + */ + if ( false === $this->parse_next_tag() ) { + $this->parsed_bytes = strlen( $this->html ); + + return false; + } + + $this->parse_tag_opener_attributes(); + + if ( $this->matches() ) { + $already_found++; + } + + // Avoid copying the tag name string when possible. + $t = $this->html[ $this->tag_name_starts_at ]; + if ( 's' === $t || 'S' === $t || 't' === $t || 'T' === $t ) { + $tag_name = $this->get_tag(); + + if ( 'script' === $tag_name ) { + $this->skip_script_data(); + } elseif ( 'textarea' === $tag_name || 'title' === $tag_name ) { + $this->skip_rcdata( $tag_name ); + } + } + } while ( $already_found < $this->sought_match_offset ); + + return true; + } + + /** + * Skips the contents of the title and textarea tags until an appropriate + * tag closer is found. + * + * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state + * @param string $tag_name – the lowercase tag name which will close the RCDATA region. + * @since 6.1.0 + */ + private function skip_rcdata( $tag_name ) { + $html = $this->html; + $doc_length = strlen( $html ); + $tag_length = strlen( $tag_name ); + + $at = $this->parsed_bytes; + + while ( true ) { + $at = strpos( $this->html, ' $doc_length ) { + $this->parsed_bytes = $doc_length; + return; + } + + $at += 2; + + /* + * We have to find a case-insensitive match to the tag name. + * Note also that since tag names are limited to US-ASCII + * characters we can ignore any kind of Unicode normalizing + * forms when comparing. If we get a non-ASCII character it + * will never be a match. + */ + for ( $i = 0; $i < $tag_length; $i++ ) { + $tag_char = $tag_name[ $i ]; + $html_char = $html[ $at + $i ]; + + if ( $html_char !== $tag_char && strtolower( $html_char ) !== $tag_char ) { + $at += $i; + continue 2; + } + } + + $at += $tag_length; + $this->parsed_bytes = $at; + + /* + * Ensure we terminate the tag name, otherwise we might, + * for example, accidentally match the sequence + * "" for "". + */ + $c = $html[ $at ]; + if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) { + continue; + } + + $this->skip_tag_closer_attributes(); + $at = $this->parsed_bytes; + + if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) { + $this->parsed_bytes++; + return; + } + } + } + + /** + * Skips the contents of
', + ); + + $examples['Simple uppercase script tag'] = array( + '
', + ); + + $examples['Script with a comment opener inside should end at the next script tag closer (dash dash escaped state)'] = array( + '
-->', + ); + + $examples['Script with a comment opener and a script tag opener inside should end two script tag closer later (double escaped state)'] = array( + '
-->', + ); + + $examples['Double escaped script with a tricky opener'] = array( + '">
', + ); + + $examples['Double escaped script with a tricky closer'] = array( + '">
', + ); + + $examples['Double escaped, then escaped, then double escaped'] = array( + '
', + ); + + $examples['Script with a commented a script tag opener inside should at the next tag closer (dash dash escaped state)'] = array( + '
-->', + ); + + $examples['Script closer with another script tag in closer attributes'] = array( + '
', + ); + + $examples['Script closer with attributes'] = array( + '
', + ); + + $examples['Script opener with title closer inside'] = array( + '
', + ); + + $examples['Complex script with many parsing states'] = array( + '-->
-->', + ); + return $examples; + } + + /** + * @ticket 56299 + * + * @covers next_tag + * + * @dataProvider data_rcdata_state + */ + public function test_next_tag_ignores_the_contents_of_a_rcdata_tag( $rcdata_then_div, $rcdata_tag ) { + $p = new WP_HTML_Tag_Processor( $rcdata_then_div ); + $p->next_tag(); + $this->assertSame( $rcdata_tag, $p->get_tag(), "The first found tag was not '$rcdata_tag'" ); + $p->next_tag(); + $this->assertSame( 'div', $p->get_tag(), "The second found tag was not 'div'" ); + } + + /** + * Data provider for test_ignores_contents_of_a_rcdata_tag(). + * + * @return array { + * @type array { + * @type string $rcdata_then_div The HTML snippet containing RCDATA and div tags. + * @type string $rcdata_tag The RCDATA tag. + * } + * } + */ + public function data_rcdata_state() { + $examples = array(); + $examples['Simple textarea'] = array( + '
', + 'textarea', + ); + + $examples['Simple title'] = array( + '<span class="d-none d-md-inline">Back to notifications</title</span>
', + 'title', + ); + + $examples['Comment opener inside a textarea tag should be ignored'] = array( + '
-->', + 'textarea', + ); + + $examples['Textarea closer with another textarea tag in closer attributes'] = array( + '
', + 'textarea', + ); + + $examples['Textarea closer with attributes'] = array( + '
', + 'textarea', + ); + + $examples['Textarea opener with title closer inside'] = array( + '
', + 'textarea', + ); + return $examples; + } + + /** + * @ticket 56299 + * + * @covers next_tag + * @covers set_attribute + * @covers __toString + */ + public function test_can_query_and_update_wrongly_nested_tags() { + $p = new WP_HTML_Tag_Processor( + '123

456789

' + ); + $p->next_tag( 'span' ); + $p->set_attribute( 'class', 'span-class' ); + $p->next_tag( 'p' ); + $p->set_attribute( 'class', 'p-class' ); + $this->assertSame( + '123

456789

', + (string) $p + ); + } + + /** + * @ticket 56299 + * + * @covers next_tag + * @covers remove_attribute + * @covers __toString + */ + public function test_removing_attributes_works_even_in_malformed_html() { + $p = new WP_HTML_Tag_Processor( self::HTML_MALFORMED ); + $p->next_tag( 'span' ); + $p->remove_attribute( 'Notifications<' ); + $this->assertSame( + '
Back to notifications
', + (string) $p + ); + } + + /** + * @ticket 56299 + * + * @covers next_Tag + * @covers set_attribute + * @covers __toString + */ + public function test_updating_attributes_works_even_in_malformed_html_1() { + $p = new WP_HTML_Tag_Processor( self::HTML_MALFORMED ); + $p->next_tag( 'span' ); + $p->set_attribute( 'id', 'first' ); + $p->next_tag( 'span' ); + $p->set_attribute( 'id', 'second' ); + $this->assertSame( + '
Back to notifications
', + (string) $p + ); + } + + /** + * @ticket 56299 + * + * @covers next_tag + * @covers set_attribute + * @covers add_class + * @covers __toString + * + * @dataProvider data_malformed_tag + */ + public function test_updating_attributes_works_even_in_malformed_html_2( $html_input, $html_expected ) { + $p = new WP_HTML_Tag_Processor( $html_input ); + $p->next_tag(); + $p->set_attribute( 'foo', 'bar' ); + $p->add_class( 'firstTag' ); + $p->next_tag(); + $p->add_class( 'secondTag' ); + $this->assertSame( + $html_expected, + (string) $p + ); + } + + /** + * Data provider for test_updates_when_malformed_tag(). + * + * @return array { + * @type array { + * @type string $html_input The input HTML snippet. + * @type string $html_expected The expected HTML snippet after processing. + * } + * } + */ + public function data_malformed_tag() { + $null_byte = chr( 0 ); + $examples = array(); + $examples['Invalid entity inside attribute value'] = array( + 'test', + 'test', + ); + + $examples['HTML tag opening inside attribute value'] = array( + '
This <is> a <strong is="true">thing.
test', + '
This <is> a <strong is="true">thing.
test', + ); + + $examples['HTML tag brackets in attribute values and data markup'] = array( + '
This <is> a <strong is="true">thing.
test', + '
This <is> a <strong is="true">thing.
test', + ); + + $examples['Single and double quotes in attribute value'] = array( + '

test', + '

test', + ); + + $examples['Unquoted attribute values'] = array( + '


test', + '
test', + ); + + $examples['Double-quotes escaped in double-quote attribute value'] = array( + '
test', + '
test', + ); + + $examples['Unquoted attribute value'] = array( + '
test', + '
test', + ); + + $examples['Unquoted attribute value with tag-like value'] = array( + '
>test', + '
>test', + ); + + $examples['Unquoted attribute value with tag-like value followed by tag-like data'] = array( + '
>test', + '
>test', + ); + + $examples['1'] = array( + '
test', + '
test', + ); + + $examples['2'] = array( + '
test', + '
test', + ); + + $examples['4'] = array( + '
test', + '
test', + ); + + $examples['5'] = array( + '
code>test', + '
code>test', + ); + + $examples['6'] = array( + '
test', + '
test', + ); + + $examples['7'] = array( + '
test', + '
test', + ); + + $examples['8'] = array( + '
id="test">test', + '
id="test">test', + ); + + $examples['9'] = array( + '
test', + '
test', + ); + + $examples['10'] = array( + 'test', + 'test', + ); + + $examples['11'] = array( + 'The applicative operator <* works well in Haskell; is what?test', + 'The applicative operator <* works well in Haskell; is what?test', + ); + + $examples['12'] = array( + '<3 is a heart but is a tag.test', + '<3 is a heart but is a tag.test', + ); + + $examples['13'] = array( + 'test', + 'test', + ); + + $examples['14'] = array( + 'test', + 'test', + ); + + $examples['15'] = array( + ' a HTML Tag]]>test', + ' a HTML Tag]]>test', + ); + + $examples['16'] = array( + '
test', + '
test', + ); + + $examples['17'] = array( + '
test', + '
test', + ); + + $examples['18'] = array( + '
test', + '
test', + ); + + $examples['19'] = array( + '
test', + '
test', + ); + + $examples['20'] = array( + '
test', + '
test', + ); + + $examples['21'] = array( + '
test', + '
test', + ); + + $examples['22'] = array( + '
test', + '
test', + ); + + $examples['23'] = array( + '
test', + '
test', + ); + + $examples['24'] = array( + '
test', + '
test', + ); + + $examples['25'] = array( + '
test', + '
test', + ); + + $examples['Multiple unclosed tags treated as a single tag'] = array( + '
+test', + '
+test', + ); + + $examples['27'] = array( + '
test', + '
test', + ); + + $examples['28'] = array( + '
test', + '
test', + ); + + return $examples; + } +}