From c6c5dbba30cdf90310e4f6767e6c31f760c68b70 Mon Sep 17 00:00:00 2001
From: Adam Zielinski
Date: Fri, 23 Sep 2022 16:36:52 +1000
Subject: [PATCH] WP_HTML_Tag_Processor: Inject dynamic data to block HTML
markup in PHP (#42485)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Introduce WP_HTML_Tag_Processor for reliably modifying HTML attributes.
Dynamic blocks often need to inject a CSS class name or set an attribute (for example the `src` of an `<img>`) in the rendered block HTML markup but lack the means to do so. WP_HTML_Tag_Processor solves this problem.
It scans through an HTML document to find specific tags, then transforms those tags by adding, removing, or updating the values of the HTML attributes within that tag (opener).
Importantly, it does not fully parse HTML or _recurse_ into the HTML structure. Instead WP_HTML_Tag_Processor scans linearly through a document and only parses the HTML tag openers.
Example:
```
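$p = new WP_HTML_Tag_Processor( '<div id="first"><img /></div>' );
if ( $p->next_tag( 'img' ) ) {
	$p->set_attribute( 'src', '/wp-content/logo.png' );
}
echo $p;
// <div id="first"><img src="/wp-content/logo.png" /></div>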
```
For more details and context, see the original GitHub Pull Request at https://github.com/WordPress/gutenberg/pull/42485 and the overview issue at https://github.com/WordPress/gutenberg/issues/44410.
Co-authored-by: Adam Zieliński
Co-authored-by: Dennis Snell
Co-authored-by: Grzegorz Ziółkowski
Co-authored-by: Sören Wrede
Co-authored-by: Colin Stewart <79332690+costdev@users.noreply.github.com>
---
.../html/class-wp-html-attribute-token.php | 89 ++
.../html/class-wp-html-tag-processor.php | 1265 +++++++++++++++++
.../html/class-wp-html-text-replacement.php | 59 +
lib/experimental/html/index.php | 11 +
lib/load.php | 3 +
phpunit/html/wp-html-tag-processor-test.php | 1187 ++++++++++++++++
6 files changed, 2614 insertions(+)
create mode 100644 lib/experimental/html/class-wp-html-attribute-token.php
create mode 100644 lib/experimental/html/class-wp-html-tag-processor.php
create mode 100644 lib/experimental/html/class-wp-html-text-replacement.php
create mode 100644 lib/experimental/html/index.php
create mode 100644 phpunit/html/wp-html-tag-processor-test.php
diff --git a/lib/experimental/html/class-wp-html-attribute-token.php b/lib/experimental/html/class-wp-html-attribute-token.php
new file mode 100644
index 00000000000000..32adfe02e0fb8c
--- /dev/null
+++ b/lib/experimental/html/class-wp-html-attribute-token.php
@@ -0,0 +1,89 @@
+name = $name;
+ $this->value_starts_at = $value_start;
+ $this->value_length = $value_length;
+ $this->start = $start;
+ $this->end = $end;
+ $this->is_true = $is_true;
+ }
+}
diff --git a/lib/experimental/html/class-wp-html-tag-processor.php b/lib/experimental/html/class-wp-html-tag-processor.php
new file mode 100644
index 00000000000000..be6179c963571f
--- /dev/null
+++ b/lib/experimental/html/class-wp-html-tag-processor.php
@@ -0,0 +1,1265 @@
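+<?php
+/**
+ * Scans through an HTML document to find specific tags, then
+ * transforms those tags by adding, removing, or updating the
+ * values of the HTML attributes within that tag (opener).
+ *
+ * @TODO: Prune the whitespace when removing classes/attributes: e.g. "a b c" -> "c" not " c"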
+ * @TODO: Skip over `/` in attributes area, split attribute names by `/`
+ * @TODO: Decode HTML references/entities in class names when matching.
+ * E.g. match having class `1<"2` needs to recognize `class="1&lt;&quot;2"`.
+ * @TODO: Decode character references in `get_attribute()`
+ * @TODO: Properly escape attribute value in `set_attribute()`
+ *
+ * @package WordPress
+ * @subpackage HTML
+ * @since 6.1.0
+ */
+
+/**
+ * Processes an input HTML document by applying a specified set
+ * of patches to that input. Tokenizes HTML but does not fully
+ * parse the input document.
+ *
+ * @since 6.1.0
+ */
+class WP_HTML_Tag_Processor {
+
+ /**
+ * The HTML document to parse.
+ *
+ * @since 6.1.0
+ * @var string
+ */
+ private $html;
+
+ /**
+ * The last query passed to next_tag().
+ *
+ * @since 6.1.0
+ * @var array|null
+ */
+ private $last_query;
+
+ /**
+ * The tag name this processor currently scans for.
+ *
+ * @since 6.1.0
+ * @var string|null
+ */
+ private $sought_tag_name;
+
+ /**
+ * The CSS class name this processor currently scans for.
+ *
+ * @since 6.1.0
+ * @var string|null
+ */
+ private $sought_class_name;
+
+ /**
+ * The match offset this processor currently scans for.
+ *
+ * @since 6.1.0
+ * @var int|null
+ */
+ private $sought_match_offset;
+
+ /**
+ * The updated HTML document.
+ *
+ * @since 6.1.0
+ * @var string
+ */
+ private $updated_html = '';
+
+ /**
+ * How many bytes from the original HTML document were already read.
+ *
+ * @since 6.1.0
+ * @var int
+ */
+ private $parsed_bytes = 0;
+
+ /**
+ * How many bytes from the original HTML document were already treated
+ * with the requested replacements.
+ *
+ * @since 6.1.0
+ * @var int
+ */
+ private $updated_bytes = 0;
+
+	/**
+	 * Byte offset in the input HTML at which the name of the currently
+	 * matched tag starts, or `null` when no tag is currently matched.
+	 *
+	 * @since 6.1.0
+	 * @var int|null
+	 */
+	private $tag_name_starts_at;
+
+	/**
+	 * Byte offset in the input HTML just past the name of the currently
+	 * matched tag, or `null` when no tag is currently matched.
+	 *
+	 * Example:
+	 *
+	 *     <div id="test">...
+	 *     01234
+	 *      ^  ^
+	 *      |  tag_name_ends_at = 4
+	 *      tag_name_starts_at = 1
+	 *
+	 * Declared explicitly because the class assigns and reads it in several
+	 * methods; relying on an undeclared (dynamic) property is deprecated
+	 * as of PHP 8.2.
+	 *
+	 * @since 6.1.0
+	 * @var int|null
+	 */
+	private $tag_name_ends_at;
+
+	/**
+	 * Index of the attributes found in the current tag opener, keyed by
+	 * the attribute name as it is written in the document.
+	 *
+	 * Each entry records byte offsets into the input HTML rather than
+	 * the attribute value itself; values are sliced out on demand.
+	 *
+	 * Example:
+	 *
+	 *     // Given this tag opener:
+	 *     // <div id="test-4" class=outline title="data:text/plain;base64=asdk3nk1j3fo8">
+	 *     //
+	 *     // Arguments: name, value start, value length, start, end, is boolean.
+	 *     $this->attributes = array(
+	 *         'id'    => new WP_HTML_Attribute_Token( 'id', 9, 6, 5, 16, false ),
+	 *         'class' => new WP_HTML_Attribute_Token( 'class', 23, 7, 17, 30, false ),
+	 *         // ... and so on for `title`.
+	 *     );
+	 *
+	 * @since 6.1.0
+	 * @var WP_HTML_Attribute_Token[]
+	 */
+	private $attributes = array();
+
+ /**
+ * Which class names to add or remove from a tag.
+ *
+ * These are tracked separately from attribute updates because they are
+ * semantically distinct, whereas this interface exists for the common
+ * case of adding and removing class names while other attributes are
+ * generally modified as with DOM `setAttribute` calls.
+ *
+ * When modifying an HTML document these will eventually be collapsed
+ * into a single lexical update to replace the `class` attribute.
+ *
+ * Example:
+ *
+ * // Add the `wp-block-group` class, remove the `wp-group` class.
+ * $classname_updates = array(
+ * // Indexed by the class name exactly as the caller supplied it.
+ * 'wp-block-group' => self::ADD_CLASS,
+ * 'wp-group' => self::REMOVE_CLASS,
+ * );
+ *
+ *
+ * @since 6.1.0
+ * @var array Map of class name to one of ADD_CLASS, REMOVE_CLASS, or SKIP_CLASS.
+ */
+ private $classname_updates = array();
+
+ // Operation markers stored as the values of $classname_updates.
+ // ADD_CLASS: append the class when the new `class` value is built.
+ const ADD_CLASS = true;
+ // REMOVE_CLASS: drop the class if it appears in the existing value.
+ const REMOVE_CLASS = false;
+ // SKIP_CLASS: the class was already seen in the existing value, so it must not be appended again.
+ const SKIP_CLASS = null;
+
+ /**
+ * Lexical replacements to apply to input HTML document.
+ *
+ * HTML modifications collapse into lexical replacements in order to
+ * provide an efficient mechanism to update documents lazily and in
+ * order to support a variety of semantic modifications without
+ * building a complicated parsing machinery. That is, it's up to
+ * the calling class to generate the lexical modification from the
+ * semantic change requested.
+ *
+ * Example:
+ *
+ * // Replace an attribute stored with a new value, indices
+ * // sourced from the lazily-parsed HTML recognizer.
+ * $start = $attributes['src']->start;
+ * $end = $attributes['src']->end;
+ * $modifications[] = new WP_HTML_Text_Replacement( $start, $end, get_the_post_thumbnail_url() );
+ *
+ * // Correspondingly, something like this
+ * // will appear in the replacements array.
+ * $replacements = array(
+ * WP_HTML_Text_Replacement( 14, 28, 'https://my-site.my-domain/wp-content/uploads/2014/08/kittens.jpg' )
+ * );
+ *
+ *
+ * @since 6.1.0
+ * @var WP_HTML_Text_Replacement[]
+ */
+ private $attribute_updates = array();
+
+ /**
+ * Constructor.
+ *
+ * Only stores the document; nothing is scanned until `next_tag()` is called.
+ *
+ * @since 6.1.0
+ *
+ * @param string $html HTML to process.
+ */
+ public function __construct( $html ) {
+ $this->html = $html;
+ }
+
+ /**
+ * Finds the next tag matching the $query.
+ *
+ * Every tag opener in the document is visited in order: its attributes are
+ * parsed and it is then compared against the query. When the document runs
+ * out of tags the cursor is moved to the end of the input and `false` is
+ * returned.
+ *
+ * @since 6.1.0
+ *
+ * @param array|string $query {
+ * Which tag name to find, having which class, etc. A plain string is
+ * treated as the tag name to find.
+ *
+ * @type string|null $tag_name Which tag to find, or `null` for "any tag."
+ * @type int|null $match_offset Find the Nth tag matching all search criteria.
+ * 1 for "first" tag, 3 for "third," etc.
+ * Defaults to first tag.
+ * @type string|null $class_name Tag must contain this whole class name to match.
+ * }
+ * @return boolean Whether a tag was matched.
+ */
+ public function next_tag( $query = null ) {
+ $this->parse_query( $query );
+ // Number of tags seen so far in this call that satisfy the query.
+ $already_found = 0;
+
+ do {
+ /*
+ * Unfortunately we can't try to search for only the tag name we want because that might
+ * lead us to skip over other tags and lose track of our place. So we need to search for
+ * _every_ tag and then check after we find one if it's the one we are looking for.
+ */
+ if ( false === $this->parse_next_tag() ) {
+ // No tag is left: park the cursor at the end of the document.
+ $this->parsed_bytes = strlen( $this->html );
+
+ return false;
+ }
+
+ $this->parse_tag_opener_attributes();
+
+ if ( $this->matches() ) {
+ $already_found++;
+ }
+
+ /*
+ * The contents of script, textarea and title elements are skipped so that
+ * tag-like text inside them is not reported as a tag. This runs whether or
+ * not the tag itself matched the query.
+ */
+ // Avoid copying the tag name string when possible.
+ $t = $this->html[ $this->tag_name_starts_at ];
+ if ( 's' === $t || 'S' === $t || 't' === $t || 'T' === $t ) {
+ $tag_name = $this->get_tag();
+
+ if ( 'script' === $tag_name ) {
+ $this->skip_script_data();
+ } elseif ( 'textarea' === $tag_name || 'title' === $tag_name ) {
+ $this->skip_rcdata( $tag_name );
+ }
+ }
+ } while ( $already_found < $this->sought_match_offset );
+
+ return true;
+ }
+
+	/**
+	 * Skips the contents of the title and textarea tags until an appropriate
+	 * tag closer is found.
+	 *
+	 * On return the cursor sits just past the closing tag, or at the end of
+	 * the document when no complete closing tag exists.
+	 *
+	 * @see https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
+	 * @since 6.1.0
+	 *
+	 * @param string $tag_name The lowercase tag name which will close the RCDATA region.
+	 * @return void
+	 */
+	private function skip_rcdata( $tag_name ) {
+		$html       = $this->html;
+		$doc_length = strlen( $html );
+		$tag_length = strlen( $tag_name );
+
+		$at = $this->parsed_bytes;
+
+		while ( true ) {
+			// Never hand strpos() an offset beyond the end of the document.
+			$at = $at < $doc_length ? strpos( $html, '</', $at ) : false;
+
+			/*
+			 * A usable closer needs room for "</", the tag name, and at least
+			 * one more byte terminating the name. If that does not fit here it
+			 * cannot fit at any later position either, so give up.
+			 */
+			if ( false === $at || ( $at + 2 + $tag_length ) >= $doc_length ) {
+				$this->parsed_bytes = $doc_length;
+				return;
+			}
+
+			// Step over the "</".
+			$at += 2;
+
+			/*
+			 * We have to find a case-insensitive match to the tag name.
+			 * Note also that since tag names are limited to US-ASCII
+			 * characters we can ignore any kind of Unicode normalizing
+			 * forms when comparing. If we get a non-ASCII character it
+			 * will never be a match.
+			 */
+			for ( $i = 0; $i < $tag_length; $i++ ) {
+				$tag_char  = $tag_name[ $i ];
+				$html_char = $html[ $at + $i ];
+
+				if ( $html_char !== $tag_char && strtolower( $html_char ) !== $tag_char ) {
+					$at += $i;
+					continue 2;
+				}
+			}
+
+			$at                += $tag_length;
+			$this->parsed_bytes = $at;
+
+			/*
+			 * Ensure we terminate the tag name, otherwise we might,
+			 * for example, accidentally match the sequence
+			 * "</textarearug>" for "</textarea>". The bounds check
+			 * above guarantees this byte exists. Form feed is included
+			 * to agree with the terminators used when reading tag names.
+			 */
+			$c = $html[ $at ];
+			if ( ' ' !== $c && "\t" !== $c && "\f" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
+				continue;
+			}
+
+			$this->skip_tag_closer_attributes();
+			$at = $this->parsed_bytes;
+
+			// The closer was never finished: the rest of the document is RCDATA.
+			if ( $at >= $doc_length ) {
+				$this->parsed_bytes = $doc_length;
+				return;
+			}
+
+			if ( '>' === $html[ $at ] || '/' === $html[ $at ] ) {
+				$this->parsed_bytes++;
+				return;
+			}
+		}
+	}
+
+ /**
+ * Skips the contents of tags, so if we're not seeing the
+ * start of one of these tokens we can proceed to the next
+ * potential match in the text.
+ */
+ if ( ! (
+ $at + 6 < $doc_length &&
+ ( 's' === $html[ $at ] || 'S' === $html[ $at ] ) &&
+ ( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) &&
+ ( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) &&
+ ( 'i' === $html[ $at + 3 ] || 'I' === $html[ $at + 3 ] ) &&
+ ( 'p' === $html[ $at + 4 ] || 'P' === $html[ $at + 4 ] ) &&
+ ( 't' === $html[ $at + 5 ] || 'T' === $html[ $at + 5 ] )
+ ) ) {
+ $at++;
+ continue;
+ }
+
+ /*
+ * We also have to make sure we terminate the script tag opener/closer
+ * to avoid making partial matches on strings like `
' !== $c ) {
+ $at++;
+ continue;
+ }
+
+ if ( 'escaped' === $state && ! $is_closing ) {
+ $state = 'double-escaped';
+ continue;
+ }
+
+ if ( 'double-escaped' === $state && $is_closing ) {
+ $state = 'escaped';
+ continue;
+ }
+
+ if ( $is_closing ) {
+ $this->parsed_bytes = $at;
+ $this->skip_tag_closer_attributes();
+
+ if ( '>' === $html[ $this->parsed_bytes ] ) {
+ $this->parsed_bytes++;
+ return;
+ }
+ }
+
+ $at++;
+ }
+ }
+
+	/**
+	 * Advances the cursor to the next tag opener in the document.
+	 *
+	 * Pending updates for the previously matched tag are flushed first.
+	 * Comments, CDATA sections, DOCTYPE declarations and other markup
+	 * declarations or processing instructions are skipped. On success the
+	 * offsets of the tag name are recorded and the cursor is left right
+	 * after the tag name.
+	 *
+	 * An unterminated comment, CDATA section, or declaration swallows the
+	 * rest of the document, so in that case no further tag is reported.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @return bool Whether a tag opener was found.
+	 */
+	private function parse_next_tag() {
+		$this->after_tag();
+
+		$html       = $this->html;
+		$doc_length = strlen( $html );
+		$at         = $this->parsed_bytes;
+
+		while ( $at < $doc_length ) {
+			$at = strpos( $html, '<', $at );
+			if ( false === $at ) {
+				return false;
+			}
+
+			/*
+			 * HTML tag names must start with [a-zA-Z] otherwise they are not tags.
+			 * For example, "<3" is rendered as text, not a tag opener. This means
+			 * if we have at least one letter following the "<" then we _do_ have
+			 * a tag opener and can process it as such. This is more common than
+			 * HTML comments, DOCTYPE tags, and other structure starting with "<"
+			 * so it's good to check first for the presence of the tag.
+			 *
+			 * Reference:
+			 * * https://html.spec.whatwg.org/multipage/parsing.html#data-state
+			 * * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+			 */
+			$tag_name_prefix_length = strspn( $html, 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', $at + 1 );
+			if ( $tag_name_prefix_length > 0 ) {
+				$at++;
+				$tag_name_length          = $tag_name_prefix_length + strcspn( $html, " \t\f\r\n/>", $at + $tag_name_prefix_length );
+				$this->tag_name_starts_at = $at;
+				$this->tag_name_ends_at   = $at + $tag_name_length;
+				$this->parsed_bytes       = $at + $tag_name_length;
+				return true;
+			}
+
+			// A "<" as the very last byte cannot open anything.
+			if ( $at + 1 >= $doc_length ) {
+				return false;
+			}
+
+			/*
+			 * "<!" transitions to the markup declaration open state.
+			 * https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
+			 */
+			if ( '!' === $html[ $at + 1 ] ) {
+				/*
+				 * "<!--" opens a comment - we can skip to the nearest "-->".
+				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+				 */
+				if (
+					$doc_length > $at + 3 &&
+					'-' === $html[ $at + 2 ] &&
+					'-' === $html[ $at + 3 ]
+				) {
+					$closer_at = strpos( $html, '-->', $at + 4 );
+					if ( false === $closer_at ) {
+						return false;
+					}
+
+					$at = $closer_at + 3;
+					continue;
+				}
+
+				/*
+				 * "<![CDATA[" opens a CDATA section - skip to the nearest "]]>".
+				 * The CDATA is case-sensitive.
+				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+				 */
+				if (
+					$doc_length > $at + 8 &&
+					'[' === $html[ $at + 2 ] &&
+					'C' === $html[ $at + 3 ] &&
+					'D' === $html[ $at + 4 ] &&
+					'A' === $html[ $at + 5 ] &&
+					'T' === $html[ $at + 6 ] &&
+					'A' === $html[ $at + 7 ] &&
+					'[' === $html[ $at + 8 ]
+				) {
+					$closer_at = strpos( $html, ']]>', $at + 9 );
+					if ( false === $closer_at ) {
+						return false;
+					}
+
+					$at = $closer_at + 3;
+					continue;
+				}
+
+				/*
+				 * "<!DOCTYPE" opens a DOCTYPE declaration - skip to the nearest ">".
+				 * These are ASCII-case-insensitive.
+				 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+				 */
+				if (
+					$doc_length > $at + 8 &&
+					'D' === strtoupper( $html[ $at + 2 ] ) &&
+					'O' === strtoupper( $html[ $at + 3 ] ) &&
+					'C' === strtoupper( $html[ $at + 4 ] ) &&
+					'T' === strtoupper( $html[ $at + 5 ] ) &&
+					'Y' === strtoupper( $html[ $at + 6 ] ) &&
+					'P' === strtoupper( $html[ $at + 7 ] ) &&
+					'E' === strtoupper( $html[ $at + 8 ] )
+				) {
+					$closer_at = strpos( $html, '>', $at + 9 );
+					if ( false === $closer_at ) {
+						return false;
+					}
+
+					$at = $closer_at + 1;
+					continue;
+				}
+
+				/*
+				 * Anything else here is an incorrectly-opened comment and transitions
+				 * to the bogus comment state - we can skip to the nearest >.
+				 */
+				$closer_at = strpos( $html, '>', $at + 1 );
+				if ( false === $closer_at ) {
+					return false;
+				}
+
+				$at = $closer_at;
+				continue;
+			}
+
+			/*
+			 * "<" followed by a question mark transitions to a bogus comment
+			 * state - we can skip to the nearest ">".
+			 * https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+			 */
+			if ( '?' === $html[ $at + 1 ] ) {
+				$closer_at = strpos( $html, '>', $at + 2 );
+				if ( false === $closer_at ) {
+					return false;
+				}
+
+				$at = $closer_at + 1;
+				continue;
+			}
+
+			// Not the start of anything we track (e.g. "<3" or a tag closer): move on.
+			$at++;
+		}
+
+		return false;
+	}
+
+	/**
+	 * Consumes every attribute of the current tag opener.
+	 *
+	 * Keeps asking for the next attribute until none is left; each call
+	 * records the attribute it finds as a side effect.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @return void
+	 */
+	private function parse_tag_opener_attributes() {
+		do {
+			$found_attribute = $this->parse_next_attribute();
+		} while ( $found_attribute );
+	}
+
+	/**
+	 * Moves the cursor past every attribute written on a tag closer.
+	 *
+	 * The attributes are parsed in the 'tag-closer' context, in which
+	 * nothing is recorded in the attribute index.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @return void
+	 */
+	private function skip_tag_closer_attributes() {
+		do {
+			$skipped_attribute = $this->parse_next_attribute( 'tag-closer' );
+		} while ( $skipped_attribute );
+	}
+
+	/**
+	 * Parses the next attribute at the cursor and moves the cursor past it.
+	 *
+	 * In the 'tag-opener' context the attribute is recorded in the attribute
+	 * index (first declaration wins when a name is repeated). In any other
+	 * context the attribute is only skipped.
+	 *
+	 * All reads are bounds-checked so that a document ending in the middle
+	 * of a tag is handled without touching bytes past the end of the input.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @param string $context tag-opener or tag-closer.
+	 * @return WP_HTML_Attribute_Token|bool The recorded token in the 'tag-opener' context,
+	 *                                      `true` when an attribute was skipped in another
+	 *                                      context, `false` when no attribute is left.
+	 */
+	private function parse_next_attribute( $context = 'tag-opener' ) {
+		$html       = $this->html;
+		$doc_length = strlen( $html );
+
+		if ( $this->parsed_bytes >= $doc_length ) {
+			return false;
+		}
+
+		// Skip whitespace and slashes.
+		$this->parsed_bytes += strspn( $html, " \t\f\r\n/", $this->parsed_bytes );
+		if ( $this->parsed_bytes >= $doc_length ) {
+			return false;
+		}
+
+		/*
+		 * Treat the equal sign ("=") as a part of the attribute name if it is the
+		 * first encountered byte:
+		 * https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
+		 */
+		$name_length = '=' === $html[ $this->parsed_bytes ]
+			? 1 + strcspn( $html, "=/> \t\f\r\n", $this->parsed_bytes + 1 )
+			: strcspn( $html, "=/> \t\f\r\n", $this->parsed_bytes );
+
+		// No attribute, just tag closer.
+		if ( 0 === $name_length ) {
+			return false;
+		}
+
+		$attribute_start     = $this->parsed_bytes;
+		$attribute_name      = substr( $html, $attribute_start, $name_length );
+		$this->parsed_bytes += $name_length;
+
+		$this->skip_whitespace();
+
+		// A name at the very end of the document is a value-less attribute.
+		$has_value = $this->parsed_bytes < $doc_length && '=' === $html[ $this->parsed_bytes ];
+		if ( $has_value ) {
+			$this->parsed_bytes++;
+			$this->skip_whitespace();
+
+			// An empty string stands in for "end of document" and selects the unquoted branch.
+			$opener = $this->parsed_bytes < $doc_length ? $html[ $this->parsed_bytes ] : '';
+
+			if ( "'" === $opener || '"' === $opener ) {
+				$value_start  = $this->parsed_bytes + 1;
+				$value_length = strcspn( $html, $opener, $value_start );
+
+				// When the closing quote is missing, stop at the end of the document rather than beyond it.
+				$attribute_end = min( $value_start + $value_length + 1, $doc_length );
+			} else {
+				$value_start   = $this->parsed_bytes;
+				$value_length  = strcspn( $html, "> \t\f\r\n", $value_start );
+				$attribute_end = $value_start + $value_length;
+			}
+
+			$this->parsed_bytes = $attribute_end;
+		} else {
+			$value_start   = $this->parsed_bytes;
+			$value_length  = 0;
+			$attribute_end = $attribute_start + $name_length;
+		}
+
+		if ( 'tag-opener' !== $context ) {
+			return true;
+		}
+
+		// If an attribute is listed many times, only use the first declaration and ignore the rest.
+		if ( ! array_key_exists( $attribute_name, $this->attributes ) ) {
+			$this->attributes[ $attribute_name ] = new WP_HTML_Attribute_Token(
+				$attribute_name,
+				$value_start,
+				$value_length,
+				$attribute_start,
+				$attribute_end,
+				! $has_value
+			);
+		}
+
+		return $this->attributes[ $attribute_name ];
+	}
+
+	/**
+	 * Advances the cursor over the run of whitespace bytes (space, tab,
+	 * form feed, carriage return, line feed) that starts at the cursor.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @return void
+	 */
+	private function skip_whitespace() {
+		$whitespace_length   = strspn( $this->html, " \t\f\r\n", $this->parsed_bytes );
+		$this->parsed_bytes += $whitespace_length;
+	}
+
+	/**
+	 * Finishes work on the current tag: flushes its queued changes and
+	 * forgets everything that was remembered about it.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @return void
+	 */
+	private function after_tag() {
+		// Class changes become an attribute update first, then all updates are written out.
+		$this->class_name_updates_to_attributes_updates();
+		$this->apply_attributes_updates();
+
+		// No tag is matched any more.
+		$this->attributes         = array();
+		$this->tag_name_ends_at   = null;
+		$this->tag_name_starts_at = null;
+	}
+
+ /**
+ * Converts class name updates into tag attributes updates
+ * (they are accumulated in different data formats for performance).
+ *
+ * The existing `class` value is walked token by token: classes marked for
+ * removal are dropped, every other class is kept together with the whitespace
+ * that preceded it, and classes marked for addition that were not already
+ * present are appended. The result is queued through `set_attribute()`, or
+ * `remove_attribute()` when no class is left. The queue of class changes is
+ * always emptied.
+ *
+ * This method is only meant to run right before the attribute updates are applied.
+ * The behavior in all other cases is undefined.
+ *
+ * @return void
+ * @since 6.1.0
+ *
+ * @see $classname_updates
+ * @see $attribute_updates
+ */
+ private function class_name_updates_to_attributes_updates() {
+ // Nothing to do, or an update of the whole `class` attribute is already
+ // queued for this tag, in which case the queued class changes are discarded.
+ if ( count( $this->classname_updates ) === 0 || isset( $this->attribute_updates['class'] ) ) {
+ $this->classname_updates = array();
+ return;
+ }
+
+ // Current value of the `class` attribute, or an empty string when the tag has none.
+ $existing_class = isset( $this->attributes['class'] )
+ ? substr( $this->html, $this->attributes['class']->value_starts_at, $this->attributes['class']->value_length )
+ : '';
+
+ /**
+ * Updated "class" attribute value.
+ *
+ * This is incrementally built as we scan through the existing class
+ * attribute, omitting removed classes as we do so, and then appending
+ * added classes at the end. Only when we're done processing will the
+ * value contain the final new value.
+ *
+ * @var string
+ */
+ $class = '';
+
+ /**
+ * Tracks the cursor position in the existing class
+ * attribute value where we're currently parsing.
+ *
+ * @var integer
+ */
+ $at = 0;
+
+ /**
+ * Indicates if we have made any actual modifications to the existing
+ * class attribute value, used to short-circuit string copying.
+ *
+ * It's possible that we are intending to remove certain classes and
+ * add others in such a way that we don't modify the existing value
+ * because calls to `add_class()` and `remove_class()` occur
+ * independent of the input values sent to the WP_HTML_Tag_Processor. That is, we
+ * might call `remove_class()` for a class that isn't already present
+ * and we might call `add_class()` for one that is, in which case we
+ * wouldn't need to break apart the string and rebuild it.
+ *
+ * This flag is set upon the first change that requires a string update.
+ *
+ * @var boolean
+ */
+ $modified = false;
+
+ // Remove unwanted classes by only copying the new ones.
+ while ( $at < strlen( $existing_class ) ) {
+ // Skip to the first non-whitespace character.
+ $ws_at = $at;
+ $ws_length = strspn( $existing_class, " \t\f\r\n", $ws_at );
+ $at += $ws_length;
+
+ // Capture the class name – it's everything until the next whitespace.
+ $name_length = strcspn( $existing_class, " \t\f\r\n", $at );
+ if ( 0 === $name_length ) {
+ // We're done, no more class names.
+ break;
+ }
+
+ $name = substr( $existing_class, $at, $name_length );
+ $at += $name_length;
+
+ // If this class is marked for removal, start processing the next one.
+ $remove_class = (
+ isset( $this->classname_updates[ $name ] ) &&
+ self::REMOVE_CLASS === $this->classname_updates[ $name ]
+ );
+
+ // Once we've seen a class, we should never add it again.
+ // Overwriting the entry with SKIP_CLASS keeps the append loop below from re-adding it.
+ if ( ! $remove_class ) {
+ $this->classname_updates[ $name ] = self::SKIP_CLASS;
+ }
+
+ if ( $remove_class ) {
+ $modified = true;
+ continue;
+ }
+
+ /*
+ * Otherwise, append it to the new "class" attribute value.
+ *
+ * By preserving the existing whitespace instead of only adding a single
+ * space (which is a valid transformation we can make) we'll introduce
+ * fewer changes to the HTML content and hopefully make comparing
+ * before/after easier for people trying to debug the modified output.
+ */
+ $class .= substr( $existing_class, $ws_at, $ws_length );
+ $class .= $name;
+ }
+
+ // Add new classes by appending the ones we haven't already seen.
+ foreach ( $this->classname_updates as $name => $operation ) {
+ if ( self::ADD_CLASS === $operation ) {
+ $modified = true;
+
+ // Separate from whatever is already in the value with a single space.
+ $class .= strlen( $class ) > 0 ? ' ' : '';
+ $class .= $name;
+ }
+ }
+
+ $this->classname_updates = array();
+ // The requested changes turned out to be no-ops: leave the attribute untouched.
+ if ( ! $modified ) {
+ return;
+ }
+
+ if ( strlen( $class ) > 0 ) {
+ $this->set_attribute( 'class', $class );
+ } else {
+ // Every class was removed, so drop the attribute instead of leaving it empty.
+ $this->remove_attribute( 'class' );
+ }
+ }
+
+	/**
+	 * Applies the queued attribute updates to the output document.
+	 *
+	 * For each update, the untouched input between the previously written
+	 * position and the start of the update is copied to the output, followed
+	 * by the replacement text. The queue is emptied afterwards.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @return void
+	 */
+	private function apply_attributes_updates() {
+		if ( 0 === count( $this->attribute_updates ) ) {
+			return;
+		}
+
+		/*
+		 * Attribute updates can be enqueued in any order but as we
+		 * progress through the document to replace them we have to
+		 * make our replacements in the order in which they are found
+		 * in that document.
+		 *
+		 * Sorting the updates ensures we don't make our replacements
+		 * out of order, which could otherwise lead to mangled output,
+		 * partially-duplicate attributes, and overwritten attributes.
+		 *
+		 * The callback names the class via __CLASS__ because the
+		 * array( 'self', ... ) callable form is deprecated as of PHP 8.2.
+		 */
+		usort( $this->attribute_updates, array( __CLASS__, 'sort_start_ascending' ) );
+
+		foreach ( $this->attribute_updates as $diff ) {
+			$this->updated_html .= substr( $this->html, $this->updated_bytes, $diff->start - $this->updated_bytes );
+			$this->updated_html .= $diff->text;
+			$this->updated_bytes = $diff->end;
+		}
+
+		$this->attribute_updates = array();
+	}
+
+	/**
+	 * Comparison callback ordering objects by their `start` property, lowest first.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @param object $a First attribute update.
+	 * @param object $b Second attribute update.
+	 * @return integer Negative, zero, or positive when `$a` starts before, at, or after `$b`.
+	 */
+	private static function sort_start_ascending( $a, $b ) {
+		$first_start  = $a->start;
+		$second_start = $b->start;
+
+		return $first_start - $second_start;
+	}
+
+	/**
+	 * Returns the value of the parsed attribute in the currently-opened tag.
+	 *
+	 * Example:
+	 *
+	 *     $p = new WP_HTML_Tag_Processor( '<div enabled class="test" data-test-id="14">Test</div>' );
+	 *     $p->next_tag( [ 'class_name' => 'test' ] ) === true;
+	 *     $p->get_attribute( 'data-test-id' ) === '14';
+	 *     $p->get_attribute( 'enabled' ) === true;
+	 *     $p->get_attribute( 'aria-label' ) === null;
+	 *
+	 *     $p->next_tag( [] ) === false;
+	 *     $p->get_attribute( 'class' ) === null;
+	 *
+	 * The requested name is lowercased before the lookup. NOTE(review): the
+	 * attribute index is keyed by the name as written in the document, so an
+	 * attribute spelled with uppercase letters there looks like it will not
+	 * be found - confirm whether that is intended.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @param string $name Name of attribute whose value is requested.
+	 * @return string|true|null Value of attribute or `null` if not available.
+	 *                          Boolean attributes return `true`.
+	 */
+	public function get_attribute( $name ) {
+		// Without a matched tag there is nothing to read from.
+		if ( null === $this->tag_name_starts_at ) {
+			return null;
+		}
+
+		$lookup_key = strtolower( $name );
+		$token      = isset( $this->attributes[ $lookup_key ] ) ? $this->attributes[ $lookup_key ] : null;
+
+		if ( null === $token ) {
+			return null;
+		}
+
+		// Attributes written without a value are reported as `true`; all others as their raw text.
+		return true === $token->is_true
+			? true
+			: substr( $this->html, $token->value_starts_at, $token->value_length );
+	}
+
+	/**
+	 * Returns the lowercase name of the currently-opened tag.
+	 *
+	 * Example:
+	 *
+	 *     $p = new WP_HTML_Tag_Processor( '<DIV CLASS="test">Test</DIV>' );
+	 *     $p->next_tag( [] ) === true;
+	 *     $p->get_tag() === 'div';
+	 *
+	 *     $p->next_tag( [] ) === false;
+	 *     $p->get_tag() === null;
+	 *
+	 * @since 6.1.0
+	 *
+	 * @return string|null Lowercased name of current tag in input HTML, or `null` if none currently open.
+	 */
+	public function get_tag() {
+		$name_start = $this->tag_name_starts_at;
+
+		if ( null === $name_start ) {
+			return null;
+		}
+
+		// Slice the name out of the document between its recorded offsets.
+		$raw_name = substr( $this->html, $name_start, $this->tag_name_ends_at - $name_start );
+
+		return strtolower( $raw_name );
+	}
+
+	/**
+	 * Updates or creates a new attribute on the currently matched tag with the value passed.
+	 *
+	 * For boolean attributes special handling is provided:
+	 *  - When `true` is passed as the value, then only the attribute name is added to the tag.
+	 *  - When `false` is passed, the attribute gets removed if it existed before.
+	 *
+	 * The change is only queued here; it is written into the document when the
+	 * processor moves on to the next tag or is converted to a string. Calling
+	 * this when no tag is matched does nothing.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @param string         $name  The attribute name to target.
+	 * @param string|boolean $value The new attribute value.
+	 * @return void
+	 */
+	public function set_attribute( $name, $value ) {
+		if ( null === $this->tag_name_starts_at ) {
+			return;
+		}
+
+		/*
+		 * > The values "true" and "false" are not allowed on boolean attributes.
+		 * > To represent a false value, the attribute has to be omitted altogether.
+		 *     - HTML5 spec, https://html.spec.whatwg.org/#boolean-attributes
+		 */
+		if ( false === $value ) {
+			$this->remove_attribute( $name );
+			return;
+		}
+
+		if ( true === $value ) {
+			$updated_attribute = $name;
+		} else {
+			/*
+			 * The value is written between double quotes, so a literal double
+			 * quote inside it has to become a character reference; otherwise it
+			 * would end the attribute early and let the rest of the value leak
+			 * into the tag as extra markup.
+			 *
+			 * @TODO: What other escaping and sanitization do we need here?
+			 */
+			$escaped_new_value = str_replace( '"', '&quot;', $value );
+			$updated_attribute = "{$name}=\"{$escaped_new_value}\"";
+		}
+
+		if ( isset( $this->attributes[ $name ] ) ) {
+			/*
+			 * Update an existing attribute.
+			 *
+			 * Example - set attribute id to "new" in <div id="initial_id" />:
+			 *    <div id="initial_id"/>
+			 *         ^-------------^
+			 *         start         end
+			 *    replacement: `id="new"`
+			 *
+			 *    Result: <div id="new"/>
+			 */
+			$existing_attribute               = $this->attributes[ $name ];
+			$this->attribute_updates[ $name ] = new WP_HTML_Text_Replacement(
+				$existing_attribute->start,
+				$existing_attribute->end,
+				$updated_attribute
+			);
+		} else {
+			/*
+			 * Create a new attribute at the tag's name end.
+			 *
+			 * Example - add attribute id="new" to <div />:
+			 *    <div/>
+			 *        ^
+			 *        start and end
+			 *    replacement: ` id="new"`
+			 *
+			 *    Result: <div id="new"/>
+			 */
+			$this->attribute_updates[ $name ] = new WP_HTML_Text_Replacement(
+				$this->tag_name_ends_at,
+				$this->tag_name_ends_at,
+				' ' . $updated_attribute
+			);
+		}
+	}
+
+	/**
+	 * Removes an attribute of the currently matched tag.
+	 *
+	 * When the attribute is not present in the document but an addition of
+	 * it was queued earlier for this tag, that queued addition is cancelled
+	 * so the attribute does not appear in the output after all.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @param string $name The attribute name to remove.
+	 * @return void
+	 */
+	public function remove_attribute( $name ) {
+		if ( ! isset( $this->attributes[ $name ] ) ) {
+			/*
+			 * Nothing to remove from the document itself. Any update queued
+			 * under this name can only be an insertion of a new attribute,
+			 * which must not survive the removal request.
+			 */
+			unset( $this->attribute_updates[ $name ] );
+			return;
+		}
+
+		/*
+		 * Removes an existing tag attribute.
+		 *
+		 * Example - remove the attribute id from <div id="initial_id"/>:
+		 *    <div id="initial_id"/>
+		 *         ^-------------^
+		 *         start         end
+		 *    replacement: ``
+		 *
+		 *    Result: <div />
+		 */
+		$this->attribute_updates[ $name ] = new WP_HTML_Text_Replacement(
+			$this->attributes[ $name ]->start,
+			$this->attributes[ $name ]->end,
+			''
+		);
+	}
+
+	/**
+	 * Queues a class name to be added to the currently matched tag.
+	 *
+	 * Does nothing when no tag is currently matched.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @param string $class_name The class name to add.
+	 */
+	public function add_class( $class_name ) {
+		if ( null === $this->tag_name_starts_at ) {
+			return;
+		}
+
+		$this->classname_updates[ $class_name ] = self::ADD_CLASS;
+	}
+
+	/**
+	 * Queues a class name to be removed from the currently matched tag.
+	 *
+	 * Does nothing when no tag is currently matched.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @param string $class_name The class name to remove.
+	 */
+	public function remove_class( $class_name ) {
+		if ( null === $this->tag_name_starts_at ) {
+			return;
+		}
+
+		$this->classname_updates[ $class_name ] = self::REMOVE_CLASS;
+	}
+
+ /**
+ * Returns the document with every update requested so far applied.
+ *
+ * When no tag is currently matched, this simply joins the already-updated
+ * output with the not-yet-copied remainder of the input.
+ *
+ * When a tag is currently matched, its pending updates are written out, the
+ * processor's input is replaced by the updated document, and the attributes of
+ * the current tag are parsed again from that updated markup, leaving the
+ * processor positioned on the same tag.
+ *
+ * @since 6.1.0
+ *
+ * @return string The processed HTML.
+ */
+ public function __toString() {
+ // Parsing either already finished or not started yet.
+ if ( null === $this->tag_name_ends_at ) {
+ return $this->updated_html . substr( $this->html, $this->updated_bytes );
+ }
+
+ /*
+ * Parsing is in progress – let's apply the attribute updates without moving on to the next tag.
+ *
+ * In practice, it means:
+ * 1. Applying the attributes updates to the original HTML
+ * 2. Replacing the original HTML with the updated HTML
+ * 3. Pointing this tag processor to the current tag name's end in that updated HTML
+ */
+
+ // Find tag name's end in the updated markup.
+ // Earlier replacements may have changed lengths, so the offsets are recomputed from the output built so far.
+ $markup_updated_up_to_a_tag_name_end = $this->updated_html . substr( $this->html, $this->updated_bytes, $this->tag_name_ends_at - $this->updated_bytes );
+ $updated_tag_name_ends_at = strlen( $markup_updated_up_to_a_tag_name_end );
+ $tag_name_length = $this->tag_name_ends_at - $this->tag_name_starts_at;
+ $updated_tag_name_starts_at = $updated_tag_name_ends_at - $tag_name_length;
+
+ // Apply attributes updates.
+ $this->updated_html = $markup_updated_up_to_a_tag_name_end;
+ $this->updated_bytes = $this->tag_name_ends_at;
+ $this->class_name_updates_to_attributes_updates();
+ $this->apply_attributes_updates();
+
+ // Replace $this->html with the updated markup.
+ $this->html = $this->updated_html . substr( $this->html, $this->updated_bytes );
+
+ // Rewind this processor to the tag name's end.
+ $this->tag_name_starts_at = $updated_tag_name_starts_at;
+ $this->tag_name_ends_at = $updated_tag_name_ends_at;
+ $this->parsed_bytes = $this->tag_name_ends_at;
+
+ // Restore the previous version of the updated_html as we are not finished with the current_tag yet.
+ $this->updated_html = $markup_updated_up_to_a_tag_name_end;
+ $this->updated_bytes = $updated_tag_name_ends_at;
+
+ // Parse the attributes in the updated markup.
+ // The old index holds offsets into the previous document, so it is rebuilt from scratch.
+ $this->attributes = array();
+ $this->parse_tag_opener_attributes();
+
+ return $this->html;
+ }
+
+	/**
+	 * Prepares tag search criteria from input interface.
+	 *
+	 * A query identical to the previous non-null one is not parsed again.
+	 * The sought tag name is stored lowercased: HTML tag names are ASCII
+	 * case-insensitive and the matcher compares against lowercased bytes
+	 * of the document, so a query such as 'DIV' must also find `<div>`.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @param array|string $query {
+	 *     Which tag name to find, having which class. A plain string is
+	 *     treated as the tag name to find.
+	 *
+	 *     @type string|null $tag_name     Which tag to find, or `null` for "any tag."
+	 *     @type int|null    $match_offset Find the Nth tag matching all search criteria.
+	 *                                     1 for "first" tag, 3 for "third," etc.
+	 *                                     Values that are not positive integers are ignored
+	 *                                     and the first tag is sought.
+	 *     @type string|null $class_name   Tag must contain this class name to match.
+	 * }
+	 * @return void
+	 */
+	private function parse_query( $query ) {
+		if ( null !== $query && $query === $this->last_query ) {
+			return;
+		}
+
+		$this->last_query          = $query;
+		$this->sought_tag_name     = null;
+		$this->sought_class_name   = null;
+		$this->sought_match_offset = 1;
+
+		// A single string value means "find the tag of this name".
+		if ( is_string( $query ) ) {
+			$this->sought_tag_name = strtolower( $query );
+			return;
+		}
+
+		// If not using the string interface we have to pass an associative array.
+		if ( ! is_array( $query ) ) {
+			return;
+		}
+
+		if ( isset( $query['tag_name'] ) && is_string( $query['tag_name'] ) ) {
+			$this->sought_tag_name = strtolower( $query['tag_name'] );
+		}
+
+		if ( isset( $query['class_name'] ) && is_string( $query['class_name'] ) ) {
+			$this->sought_class_name = $query['class_name'];
+		}
+
+		if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) {
+			$this->sought_match_offset = $query['match_offset'];
+		}
+	}
+
+
+	/**
+	 * Checks whether the current tag and its attributes match the search criteria.
+	 *
+	 * The tag name, when one is sought, is compared byte by byte against the
+	 * document. The class name, when one is sought, must appear in the `class`
+	 * attribute value as a whole whitespace-delimited token, compared
+	 * byte-for-byte. An empty sought class name never matches.
+	 *
+	 * @since 6.1.0
+	 *
+	 * @return boolean
+	 */
+	private function matches() {
+		// Do we match a case-insensitive HTML tag name?
+		if ( null !== $this->sought_tag_name ) {
+			/*
+			 * String (byte) length lookup is fast. If they aren't the
+			 * same length then they can't be the same string values.
+			 */
+			$length = $this->tag_name_ends_at - $this->tag_name_starts_at;
+			if ( strlen( $this->sought_tag_name ) !== $length ) {
+				return false;
+			}
+
+			/*
+			 * Otherwise we have to check for each character if they
+			 * are the same, and only `strtolower()` if we have to.
+			 * Presuming that most people will supply lowercase tag
+			 * names and most HTML will contain lowercase tag names,
+			 * most of the time this runs we shouldn't expect to
+			 * actually run the case-folding comparison.
+			 */
+			for ( $i = 0; $i < $length; $i++ ) {
+				$html_char = $this->html[ $this->tag_name_starts_at + $i ];
+				$tag_char  = $this->sought_tag_name[ $i ];
+
+				if ( $html_char !== $tag_char && strtolower( $html_char ) !== $tag_char ) {
+					return false;
+				}
+			}
+		}
+
+		// No class constraint: the tag name check above was the only criterion.
+		if ( null === $this->sought_class_name ) {
+			return true;
+		}
+
+		if ( ! isset( $this->attributes['class'] ) ) {
+			return false;
+		}
+
+		$class_name   = $this->sought_class_name;
+		$class_length = strlen( $class_name );
+
+		/*
+		 * An empty string is not a class token. Bail out early: searching for
+		 * an empty needle would otherwise never advance through the value.
+		 */
+		if ( 0 === $class_length ) {
+			return false;
+		}
+
+		// Do we match a byte-for-byte (case-sensitive and encoding-form-sensitive) class name?
+		$class_start = $this->attributes['class']->value_starts_at;
+		$class_end   = $class_start + $this->attributes['class']->value_length;
+		$class_at    = $class_start;
+
+		/*
+		 * We're going to have to jump through potential matches here because
+		 * it's possible that we have classes containing the class name we're
+		 * looking for. For instance, if we are looking for "even" we don't
+		 * want to be confused when we come to the class "not-even." This is
+		 * secured by ensuring that we find our sought-after class and that
+		 * it's surrounded on both sides by proper boundaries.
+		 *
+		 * A candidate only counts when it lies entirely inside the attribute
+		 * value; one that starts inside it but runs past its end is not a match.
+		 *
+		 * See https://html.spec.whatwg.org/#attributes-3
+		 * See https://html.spec.whatwg.org/#space-separated-tokens
+		 */
+		while (
+			// phpcs:ignore WordPress.CodeAnalysis.AssignmentInCondition.FoundInWhileCondition
+			false !== ( $class_at = strpos( $this->html, $class_name, $class_at ) ) &&
+			$class_at + $class_length <= $class_end
+		) {
+			$after_class = $class_at + $class_length;
+
+			/*
+			 * Verify this class starts at a boundary: either at the very start
+			 * of the value or right after a whitespace byte.
+			 */
+			if ( $class_at > $class_start && 1 !== strspn( $this->html, " \t\f\r\n", $class_at - 1, 1 ) ) {
+				$class_at = $after_class;
+				continue;
+			}
+
+			/*
+			 * Similarly, verify this class ends at a boundary: either at the
+			 * very end of the value or right before a whitespace byte.
+			 */
+			if ( $after_class < $class_end && 1 !== strspn( $this->html, " \t\f\r\n", $after_class, 1 ) ) {
+				$class_at = $after_class;
+				continue;
+			}
+
+			return true;
+		}
+
+		return false;
+	}
+}
diff --git a/lib/experimental/html/class-wp-html-text-replacement.php b/lib/experimental/html/class-wp-html-text-replacement.php
new file mode 100644
index 00000000000000..cbddd483538004
--- /dev/null
+++ b/lib/experimental/html/class-wp-html-text-replacement.php
@@ -0,0 +1,59 @@
+start = $start;
+ $this->end = $end;
+ $this->text = $text;
+ }
+}
diff --git a/lib/experimental/html/index.php b/lib/experimental/html/index.php
new file mode 100644
index 00000000000000..e7d41f8cdf4863
--- /dev/null
+++ b/lib/experimental/html/index.php
@@ -0,0 +1,11 @@
+Text';
+ const HTML_WITH_CLASSES = '
Text
';
+ const HTML_MALFORMED = '
Back to notifications
';
+
+	/**
+	 * Ensures get_tag() yields null while the processor has not been advanced to any tag.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers get_tag
+	 */
+	public function test_get_tag_returns_null_before_finding_tags() {
+		$processor = new WP_HTML_Tag_Processor( '
Test
' );
+		$tag_name  = $processor->get_tag();
+
+		$this->assertNull( $tag_name );
+	}
+
+	/**
+	 * Ensures get_tag() yields null after a next_tag() query that found no match.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers get_tag
+	 */
+	public function test_get_tag_returns_null_when_not_in_open_tag() {
+		$processor = new WP_HTML_Tag_Processor( '
Test
' );
+
+		$found_paragraph = $processor->next_tag( 'p' );
+		$this->assertFalse( $found_paragraph, 'Querying a non-existing tag did not return false' );
+
+		$tag_name = $processor->get_tag();
+		$this->assertNull( $tag_name, 'Accessing a non-existing tag did not return null' );
+	}
+
+	/**
+	 * Ensures get_tag() reports "div" once next_tag( 'div' ) has matched.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers get_tag
+	 */
+	public function test_get_tag_returns_open_tag_name() {
+		$processor = new WP_HTML_Tag_Processor( '
Test
' );
+
+		$found_div = $processor->next_tag( 'div' );
+		$this->assertTrue( $found_div, 'Querying an existing tag did not return true' );
+
+		$tag_name = $processor->get_tag();
+		$this->assertSame( 'div', $tag_name, 'Accessing an existing tag name did not return "div"' );
+	}
+
+	/**
+	 * Ensures get_attribute() yields null while the processor has not been advanced to any tag.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers get_attribute
+	 */
+	public function test_get_attribute_returns_null_before_finding_tags() {
+		$processor   = new WP_HTML_Tag_Processor( '
Test
' );
+		$class_value = $processor->get_attribute( 'class' );
+
+		$this->assertNull( $class_value );
+	}
+
+	/**
+	 * Ensures get_attribute() yields null after a next_tag() query that found no match.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers get_attribute
+	 */
+	public function test_get_attribute_returns_null_when_not_in_open_tag() {
+		$processor = new WP_HTML_Tag_Processor( '
Test
' );
+
+		$found_paragraph = $processor->next_tag( 'p' );
+		$this->assertFalse( $found_paragraph, 'Querying a non-existing tag did not return false' );
+
+		$class_value = $processor->get_attribute( 'class' );
+		$this->assertNull( $class_value, 'Accessing an attribute of a non-existing tag did not return null' );
+	}
+
+	/**
+	 * Ensures get_attribute() yields null for an attribute name the matched tag does not carry.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers get_attribute
+	 */
+	public function test_get_attribute_returns_null_when_attribute_missing() {
+		$processor = new WP_HTML_Tag_Processor( '
Test
' );
+
+		$found_div = $processor->next_tag( 'div' );
+		$this->assertTrue( $found_div, 'Querying an existing tag did not return true' );
+
+		$missing_value = $processor->get_attribute( 'test-id' );
+		$this->assertNull( $missing_value, 'Accessing a non-existing attribute did not return null' );
+	}
+
+	/**
+	 * Ensures get_attribute( 'class' ) returns the string "test" for the matched div.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers get_attribute
+	 */
+	public function test_get_attribute_returns_attribute_value() {
+		$processor = new WP_HTML_Tag_Processor( '
Test
' );
+
+		$found_div = $processor->next_tag( 'div' );
+		$this->assertTrue( $found_div, 'Querying an existing tag did not return true' );
+
+		$class_value = $processor->get_attribute( 'class' );
+		$this->assertSame( 'test', $class_value, 'Accessing a class="test" attribute value did not return "test"' );
+	}
+
+	/**
+	 * Ensures get_attribute() returns boolean true for the "enabled" attribute of the
+	 * tag matched through a class_name query.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers get_attribute
+	 */
+	public function test_get_attribute_returns_true_for_boolean_attribute() {
+		$processor = new WP_HTML_Tag_Processor( '
Test
' );
+		$query     = array( 'class_name' => 'test' );
+
+		$found_tag = $processor->next_tag( $query );
+		$this->assertTrue( $found_tag, 'Querying an existing tag did not return true' );
+
+		$enabled = $processor->get_attribute( 'enabled' );
+		$this->assertTrue( $enabled, 'Accessing a boolean "enabled" attribute value did not return true' );
+	}
+
+	/**
+	 * Ensures attributes that carry an explicit value are returned as that string
+	 * rather than as boolean true.
+	 *
+	 * The expected values asserted below are "enabled", "1" and "true".
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers get_attribute
+	 */
+	public function test_get_attribute_returns_string_for_truthy_attributes() {
+		$p = new WP_HTML_Tag_Processor( '
Test
' );
+		$this->assertTrue( $p->next_tag( array() ), 'Querying an existing tag did not return true' );
+		// The failure message now names the expected string; it previously claimed the expectation was boolean true.
+		$this->assertSame( 'enabled', $p->get_attribute( 'enabled' ), 'Accessing an "enabled" attribute value did not return "enabled"' );
+		$this->assertSame( '1', $p->get_attribute( 'checked' ), 'Accessing a checked=1 attribute value did not return "1"' );
+		$this->assertSame( 'true', $p->get_attribute( 'hidden' ), 'Accessing a hidden="true" attribute value did not return "true"' );
+	}
+
+	/**
+	 * Ensures attributes "a" through "d" are each read as boolean true and "e" as the
+	 * string "test" on the first tag found.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers get_attribute
+	 */
+	public function test_attributes_parser_treats_slash_as_attribute_separator() {
+		$processor = new WP_HTML_Tag_Processor( '
Test
' );
+
+		$found_tag = $processor->next_tag( array() );
+		$this->assertTrue( $found_tag, 'Querying an existing tag did not return true' );
+
+		// The four boolean attributes share one expectation and one failure message.
+		foreach ( array( 'a', 'b', 'c', 'd' ) as $boolean_attribute ) {
+			$this->assertTrue( $processor->get_attribute( $boolean_attribute ), 'Accessing an existing attribute did not return true' );
+		}
+
+		$e_value = $processor->get_attribute( 'e' );
+		$this->assertSame( 'test', $e_value, 'Accessing an existing e="test" did not return "test"' );
+	}
+
+ /**
+ * Ensures casting the processor to a string mid-way applies the updates made so far
+ * and that further updates and next_tag() calls keep working afterwards.
+ *
+ * The test serializes three times: after the first round of updates on the second
+ * tag, after a second round of updates on that same tag, and after moving on to
+ * the third tag.
+ *
+ * @ticket 56299
+ *
+ * @covers __toString
+ */
+ public function test_tostring_applies_the_updates_so_far_and_keeps_the_processor_on_the_current_tag() {
+ $p = new WP_HTML_Tag_Processor( '
Test
' );
+ // First tag: drop its id before any serialization happens.
+ $p->next_tag();
+ $p->remove_attribute( 'id' );
+
+ // Second tag: first round of updates, then serialize.
+ $p->next_tag();
+ $p->set_attribute( 'id', 'div-id-1' );
+ $p->add_class( 'new_class_1' );
+ $this->assertSame(
+ '
Test
',
+ (string) $p,
+ 'Calling __toString after updating the attributes of the second tag returned different HTML than expected'
+ );
+
+ // Still on the second tag (no next_tag() call in between): second round of updates.
+ $p->set_attribute( 'id', 'div-id-2' );
+ $p->add_class( 'new_class_2' );
+ $this->assertSame(
+ '
Test
',
+ (string) $p,
+ 'Calling __toString after updating the attributes of the second tag for the second time returned different HTML than expected'
+ );
+
+ // Third tag: scanning must continue from where the processor was before serialization.
+ $p->next_tag();
+ $p->remove_attribute( 'id' );
+ $this->assertSame(
+ '
Test
',
+ (string) $p,
+ 'Calling __toString after removing the id attribute of the third tag returned different HTML than expected'
+ );
+
+ }
+
+	/**
+	 * Ensures an untouched processor serializes back to exactly the markup it was given.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers __toString
+	 */
+	public function test_tostring_without_updating_any_attributes_returns_the_original_html() {
+		$processor   = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$actual_html = (string) $processor;
+
+		$this->assertSame( self::HTML_SIMPLE, $actual_html );
+	}
+
+	/**
+	 * Ensures next_tag() called without a query returns true on the HTML_SIMPLE fixture.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 */
+	public function test_next_tag_with_no_arguments_should_find_the_next_existing_tag() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$found_tag = $processor->next_tag();
+
+		$this->assertTrue( $found_tag, 'Querying an existing tag did not return true' );
+	}
+
+	/**
+	 * Ensures next_tag( 'p' ) returns false on the HTML_SIMPLE fixture.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 */
+	public function test_next_tag_should_return_false_for_a_non_existing_tag() {
+		$processor       = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$found_paragraph = $processor->next_tag( 'p' );
+
+		$this->assertFalse( $found_paragraph, 'Querying a non-existing tag did not return false' );
+	}
+
+	/**
+	 * Ensures set_attribute() leaves the markup untouched when no tag is currently matched.
+	 *
+	 * Both next_tag() queries are asserted to return false before set_attribute() is
+	 * called; the serialized output is then compared against the original fixture.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_set_attribute_on_a_non_existing_tag_does_not_change_the_markup() {
+		$p = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' );
+		// Asserted to fail as well: this second query runs after the failed query above.
+		$this->assertFalse( $p->next_tag( 'div' ), 'Querying a non-existing tag did not return false' );
+		$p->set_attribute( 'id', 'primary' );
+		$this->assertSame(
+			self::HTML_SIMPLE,
+			(string) $p,
+			'Calling __toString after updating a non-existing tag returned an HTML that was different from the original HTML'
+		);
+	}
+
+	/**
+	 * Ensures set_attribute() with a name not yet present on the tag adds it to the output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_set_attribute_with_a_non_existing_attribute_adds_a_new_attribute_to_the_markup() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$processor->next_tag();
+		$processor->set_attribute( 'test-attribute', 'test-value' );
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Per the HTML spec only the first occurrence of a repeated attribute counts and
+	 * the later ones are ignored, so set_attribute() is expected to update the first.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_update_first_when_duplicated_attribute() {
+		$processor = new WP_HTML_Tag_Processor( '
Text
' );
+		$processor->next_tag();
+		$processor->set_attribute( 'id', 'updated-id' );
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Ensures set_attribute( 'id', … ) on the first tag of HTML_SIMPLE changes the serialized output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_set_attribute_with_an_existing_attribute_name_updates_its_value_in_the_markup() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$processor->next_tag();
+		$processor->set_attribute( 'id', 'new-id' );
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Ensures that walking every tag with next_tag() and setting data-foo on each one
+	 * is reflected in the serialized output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_next_tag_and_set_attribute_in_a_loop_update_all_tags_in_the_markup() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+
+		// Keep advancing until next_tag() reports that no further tag was found.
+		while ( $processor->next_tag() ) {
+			$processor->set_attribute( 'data-foo', 'bar' );
+		}
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+ /**
+ * Removing an attribute that's listed many times, e.g. `<div id="a" id="b" />` should remove
+ * all its instances and output just `<div />`.
+ *
+ * Today, however, WP_HTML_Tag_Processor only removes the first such attribute. It seems like a corner case
+ * and introducing additional complexity to correctly handle this scenario doesn't seem to be worth it.
+ * Let's revisit if and when this becomes a problem.
+ *
+ * This test is in place to confirm this behavior, while incorrect, is well-defined.
+ *
+ * @ticket 56299
+ *
+ * @covers remove_attribute
+ * @covers __toString
+ */
+ public function test_remove_first_when_duplicated_attribute() {
+ $p = new WP_HTML_Tag_Processor( '
Text
' );
+ $p->next_tag();
+ // Only the first `id` is expected to disappear; see the docblock for why this is accepted.
+ $p->remove_attribute( 'id' );
+ $this->assertSame( '
Text
', (string) $p );
+ }
+
+	/**
+	 * Ensures remove_attribute( 'id' ) on the first tag of HTML_SIMPLE changes the serialized output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers remove_attribute
+	 * @covers __toString
+	 */
+	public function test_remove_attribute_with_an_existing_attribute_name_removes_it_from_the_markup() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$processor->next_tag();
+		$processor->remove_attribute( 'id' );
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Ensures removing an attribute the tag does not have leaves HTML_SIMPLE unchanged.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers remove_attribute
+	 * @covers __toString
+	 */
+	public function test_remove_attribute_with_a_non_existing_attribute_name_does_not_change_the_markup() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$processor->next_tag();
+		$processor->remove_attribute( 'no-such-attribute' );
+
+		$actual_html = (string) $processor;
+
+		$this->assertSame( self::HTML_SIMPLE, $actual_html );
+	}
+
+	/**
+	 * Ensures add_class() on the first tag of HTML_SIMPLE is reflected in the serialized output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers add_class
+	 * @covers __toString
+	 */
+	public function test_add_class_creates_a_class_attribute_when_there_is_none() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$processor->next_tag();
+		$processor->add_class( 'foo-class' );
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Ensures two consecutive add_class() calls on the first tag of HTML_SIMPLE are both
+	 * reflected in the serialized output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers add_class
+	 * @covers __toString
+	 */
+	public function test_calling_add_class_twice_creates_a_class_attribute_with_both_class_names_when_there_is_no_class_attribute() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$processor->next_tag();
+
+		foreach ( array( 'foo-class', 'bar-class' ) as $class_name ) {
+			$processor->add_class( $class_name );
+		}
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Ensures remove_class() on the first tag of HTML_SIMPLE leaves the markup unchanged.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers remove_class
+	 * @covers __toString
+	 */
+	public function test_remove_class_does_not_change_the_markup_when_there_is_no_class_attribute() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_SIMPLE );
+		$processor->next_tag();
+		$processor->remove_class( 'foo-class' );
+
+		$actual_html = (string) $processor;
+
+		$this->assertSame( self::HTML_SIMPLE, $actual_html );
+	}
+
+	/**
+	 * Ensures two add_class() calls on the first tag of HTML_WITH_CLASSES are reflected
+	 * in the serialized output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers add_class
+	 * @covers __toString
+	 */
+	public function test_add_class_appends_class_names_to_the_existing_class_attribute_when_one_already_exists() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+		$processor->next_tag();
+
+		foreach ( array( 'foo-class', 'bar-class' ) as $class_name ) {
+			$processor->add_class( $class_name );
+		}
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Ensures remove_class( 'main' ) on the first tag of HTML_WITH_CLASSES is reflected
+	 * in the serialized output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers remove_class
+	 * @covers __toString
+	 */
+	public function test_remove_class_removes_a_single_class_from_the_class_attribute_when_one_exists() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+		$processor->next_tag();
+		$processor->remove_class( 'main' );
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Ensures removing both "main" and "with-border" from the first tag of
+	 * HTML_WITH_CLASSES is reflected in the serialized output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers remove_class
+	 * @covers __toString
+	 */
+	public function test_calling_remove_class_with_all_listed_class_names_removes_the_existing_class_attribute_from_the_markup() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+		$processor->next_tag();
+
+		foreach ( array( 'main', 'with-border' ) as $class_name ) {
+			$processor->remove_class( $class_name );
+		}
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Checks the serialized output after add_class( 'with-border' ) on the first tag
+	 * of HTML_WITH_CLASSES.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers add_class
+	 * @covers __toString
+	 */
+	public function test_add_class_does_not_add_duplicate_class_names() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+		$processor->next_tag();
+		$processor->add_class( 'with-border' );
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Checks the serialized output after add_class( 'main' ) on the first tag of
+	 * HTML_WITH_CLASSES.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers add_class
+	 * @covers __toString
+	 */
+	public function test_add_class_preserves_class_name_order_when_a_duplicate_class_name_is_added() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+		$processor->next_tag();
+		$processor->add_class( 'main' );
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Checks the serialized output after add_class( 'foo-class' ) on the first tag of
+	 * the inline fixture.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers add_class
+	 * @covers __toString
+	 */
+	public function test_add_class_when_there_is_a_class_attribute_with_excessive_whitespaces() {
+		$input_html = '
Text
';
+		$processor  = new WP_HTML_Tag_Processor( $input_html );
+		$processor->next_tag();
+		$processor->add_class( 'foo-class' );
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Checks the serialized output after remove_class( 'with-border' ) on the first tag
+	 * of the inline fixture.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers remove_class
+	 * @covers __toString
+	 */
+	public function test_remove_class_preserves_whitespaces_when_there_is_a_class_attribute_with_excessive_whitespaces() {
+		$input_html = '
Text
';
+		$processor  = new WP_HTML_Tag_Processor( $input_html );
+		$processor->next_tag();
+		$processor->remove_class( 'with-border' );
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Checks the serialized output after removing both "main" and "with-border" from
+	 * the first tag of the inline fixture.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers remove_class
+	 * @covers __toString
+	 */
+	public function test_removing_all_classes_removes_the_existing_class_attribute_from_the_markup_even_when_excessive_whitespaces_are_present() {
+		$input_html = '
Text
';
+		$processor  = new WP_HTML_Tag_Processor( $input_html );
+		$processor->next_tag();
+
+		foreach ( array( 'main', 'with-border' ) as $class_name ) {
+			$processor->remove_class( $class_name );
+		}
+
+		$expected_html = '
Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * set_attribute( 'class', $value ) must win over add_class( $other_value ): the
+	 * resulting class attribute is $value and the add_class() call is disregarded,
+	 * whichever of the two methods is invoked first.
+	 *
+	 * Both call orders are exercised, each on a fresh processor.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers add_class
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_set_attribute_takes_priority_over_add_class() {
+		// Order 1: add_class() first, set_attribute() second.
+		$add_then_set = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+		$add_then_set->next_tag();
+		$add_then_set->add_class( 'add_class' );
+		$add_then_set->set_attribute( 'class', 'set_attribute' );
+
+		$expected_after_add_then_set = '
Text
';
+		$this->assertSame(
+			$expected_after_add_then_set,
+			(string) $add_then_set,
+			'Calling __toString after updating first tag\'s attributes did not return the expected HTML'
+		);
+
+		// Order 2: set_attribute() first, add_class() second.
+		$set_then_add = new WP_HTML_Tag_Processor( self::HTML_WITH_CLASSES );
+		$set_then_add->next_tag();
+		$set_then_add->set_attribute( 'class', 'set_attribute' );
+		$set_then_add->add_class( 'add_class' );
+
+		$expected_after_set_then_add = '
Text
';
+		$this->assertSame(
+			$expected_after_set_then_add,
+			(string) $set_then_add,
+			'Calling __toString after updating second tag\'s attributes did not return the expected HTML'
+		);
+	}
+
+ /**
+ * Chains several queries and every kind of update on one processor and compares the
+ * final serialization against the expected heredoc fixture.
+ *
+ * @ticket 56299
+ *
+ * @covers set_attribute
+ * @covers remove_attribute
+ * @covers add_class
+ * @covers remove_class
+ * @covers __toString
+ */
+ public function test_advanced_use_case() {
+ $input = <<
+
+
+HTML;
+
+ $expected_output = <<
+
+
+HTML;
+
+ $p = new WP_HTML_Tag_Processor( $input );
+ // Step 1: first div gets a data attribute and an extra class.
+ $this->assertTrue( $p->next_tag( 'div' ), 'Querying an existing tag did not return true' );
+ $p->set_attribute( 'data-details', '{ "key": "value" }' );
+ $p->add_class( 'is-processed' );
+ // Step 2: next div carrying the BtnGroup class has that class swapped for two new ones.
+ $this->assertTrue(
+ $p->next_tag(
+ array(
+ 'tag_name' => 'div',
+ 'class_name' => 'BtnGroup',
+ )
+ ),
+ 'Querying an existing tag did not return true'
+ );
+ $p->remove_class( 'BtnGroup' );
+ $p->add_class( 'button-group' );
+ $p->add_class( 'Another-Mixed-Case' );
+ // Step 3: the same query and the same class swap, applied to the following match.
+ $this->assertTrue(
+ $p->next_tag(
+ array(
+ 'tag_name' => 'div',
+ 'class_name' => 'BtnGroup',
+ )
+ ),
+ 'Querying an existing tag did not return true'
+ );
+ $p->remove_class( 'BtnGroup' );
+ $p->add_class( 'button-group' );
+ $p->add_class( 'Another-Mixed-Case' );
+ // Step 4: a button with class "btn", selected with match_offset 3, loses its class attribute.
+ // NOTE(review): presumably match_offset picks the third match counted from here - confirm against next_tag().
+ $this->assertTrue(
+ $p->next_tag(
+ array(
+ 'tag_name' => 'button',
+ 'class_name' => 'btn',
+ 'match_offset' => 3,
+ )
+ ),
+ 'Querying an existing tag did not return true'
+ );
+ $p->remove_attribute( 'class' );
+ // Step 5: this update follows a query asserted to fail.
+ // NOTE(review): presumably a no-op, as in test_set_attribute_on_a_non_existing_tag_does_not_change_the_markup.
+ $this->assertFalse( $p->next_tag( 'non-existent' ), 'Querying a non-existing tag did not return false' );
+ $p->set_attribute( 'class', 'test' );
+ $this->assertSame( $expected_output, (string) $p, 'Calling __toString after updating the attributes did not return the expected HTML' );
+ }
+
+	/**
+	 * Removes the id of a div and rewrites the id of a span, then checks the
+	 * serialized output against the expected markup.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers remove_attribute
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_correctly_parses_html_attributes_wrapped_in_single_quotation_marks() {
+		$input_html = 'Text
';
+		$processor  = new WP_HTML_Tag_Processor( $input_html );
+
+		$div_query = array(
+			'tag_name' => 'div',
+			'id'       => 'first',
+		);
+		$processor->next_tag( $div_query );
+		$processor->remove_attribute( 'id' );
+
+		$span_query = array(
+			'tag_name' => 'span',
+			'id'       => 'second',
+		);
+		$processor->next_tag( $span_query );
+		$processor->set_attribute( 'id', 'single-quote' );
+
+		$expected_html = 'Text
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Checks the serialized output after set_attribute( 'checked', true ) on an input tag.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_set_attribute_with_value_equals_to_true_adds_a_boolean_html_attribute_with_implicit_value() {
+		$input_html = '';
+		$processor  = new WP_HTML_Tag_Processor( $input_html );
+		$processor->next_tag( 'input' );
+		$processor->set_attribute( 'checked', true );
+
+		$expected_html = '';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Checks the serialized output after set_attribute( 'checked', false ) on an input tag.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_setting_a_boolean_attribute_to_false_removes_it_from_the_markup() {
+		$input_html = '';
+		$processor  = new WP_HTML_Tag_Processor( $input_html );
+		$processor->next_tag( 'input' );
+		$processor->set_attribute( 'checked', false );
+
+		$expected_html = '';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Ensures set_attribute( 'checked', false ) on an input tag leaves the input markup unchanged.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_setting_a_missing_attribute_to_false_does_not_change_the_markup() {
+		$original_html = '';
+
+		$processor = new WP_HTML_Tag_Processor( $original_html );
+		$processor->next_tag( 'input' );
+		$processor->set_attribute( 'checked', false );
+
+		$actual_html = (string) $processor;
+
+		$this->assertSame( $original_html, $actual_html );
+	}
+
+	/**
+	 * Checks the serialized output after set_attribute( 'checked', 'checked' ) on an input tag.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_setting_a_boolean_attribute_to_a_string_value_adds_explicit_value_to_the_markup() {
+		$input_html = '';
+		$processor  = new WP_HTML_Tag_Processor( $input_html );
+		$processor->next_tag( 'input' );
+		$processor->set_attribute( 'checked', 'checked' );
+
+		$expected_html = '';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+ /**
+ * @ticket 56299
+ *
+ * @covers get_tag
+ * @covers next_tag
+ */
+ public function test_unclosed_script_tag_should_not_cause_an_infinite_loop() {
+ $p = new WP_HTML_Tag_Processor( '',
+ );
+
+ $examples['Simple uppercase script tag'] = array(
+ '',
+ );
+
+ $examples['Script with a comment opener inside should end at the next script tag closer (dash dash escaped state)'] = array(
+ '-->',
+ );
+
+ $examples['Script with a comment opener and a script tag opener inside should end two script tag closer later (double escaped state)'] = array(
+ '-->',
+ );
+
+ $examples['Double escaped script with a tricky opener'] = array(
+ '">">',
+ );
+
+ $examples['Double escaped script with a tricky closer'] = array(
+ '">',
+ );
+
+ $examples['Double escaped, then escaped, then double escaped'] = array(
+ '',
+ );
+
+ $examples['Script with a commented a script tag opener inside should at the next tag closer (dash dash escaped state)'] = array(
+ '-->',
+ );
+
+ $examples['Script closer with another script tag in closer attributes'] = array(
+ '',
+ );
+
+ $examples['Script closer with attributes'] = array(
+ '',
+ );
+
+ $examples['Script opener with title closer inside'] = array(
+ '',
+ );
+
+ $examples['Complex script with many parsing states'] = array(
+ '-->-->',
+ );
+ return $examples;
+ }
+
+	/**
+	 * Ensures the first tag found is the given RCDATA tag and the second tag found is "div".
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 *
+	 * @dataProvider data_rcdata_state
+	 *
+	 * @param string $rcdata_then_div Markup handed to the processor.
+	 * @param string $rcdata_tag      Tag name expected for the first match.
+	 */
+	public function test_next_tag_ignores_the_contents_of_a_rcdata_tag( $rcdata_then_div, $rcdata_tag ) {
+		$processor = new WP_HTML_Tag_Processor( $rcdata_then_div );
+
+		$processor->next_tag();
+		$first_tag = $processor->get_tag();
+		$this->assertSame( $rcdata_tag, $first_tag, "The first found tag was not '$rcdata_tag'" );
+
+		$processor->next_tag();
+		$second_tag = $processor->get_tag();
+		$this->assertSame( 'div', $second_tag, "The second found tag was not 'div'" );
+	}
+
+ /**
+ * Data provider for test_next_tag_ignores_the_contents_of_a_rcdata_tag().
+ *
+ * @return array {
+ * @type array {
+ * @type string $rcdata_then_div The HTML snippet containing RCDATA and div tags.
+ * @type string $rcdata_tag The RCDATA tag.
+ * }
+ * }
+ */
+ public function data_rcdata_state() {
+ $examples = array();
+ $examples['Simple textarea'] = array(
+ '',
+ 'textarea',
+ );
+
+ $examples['Simple title'] = array(
+ 'Back to notifications',
+ 'title',
+ );
+
+ $examples['Comment opener inside a textarea tag should be ignored'] = array(
+ '
'
+ );
+ $p->next_tag( 'span' );
+ $p->set_attribute( 'class', 'span-class' );
+ $p->next_tag( 'p' );
+ $p->set_attribute( 'class', 'p-class' );
+ $this->assertSame(
+ '123456
789',
+ (string) $p
+ );
+ }
+
+	/**
+	 * Removes the attribute named "Notifications<" from the first span of
+	 * HTML_MALFORMED and checks the serialized output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers remove_attribute
+	 * @covers __toString
+	 */
+	public function test_removing_attributes_works_even_in_malformed_html() {
+		$processor = new WP_HTML_Tag_Processor( self::HTML_MALFORMED );
+		$processor->next_tag( 'span' );
+		$processor->remove_attribute( 'Notifications<' );
+
+		$expected_html = 'Back to notifications
';
+
+		$this->assertSame( $expected_html, (string) $processor );
+	}
+
+	/**
+	 * Sets the id of two consecutive span matches in HTML_MALFORMED and checks the
+	 * serialized output.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers set_attribute
+	 * @covers __toString
+	 */
+	public function test_updating_attributes_works_even_in_malformed_html_1() {
+		$p = new WP_HTML_Tag_Processor( self::HTML_MALFORMED );
+		// First span match.
+		$p->next_tag( 'span' );
+		$p->set_attribute( 'id', 'first' );
+		// Second span match.
+		$p->next_tag( 'span' );
+		$p->set_attribute( 'id', 'second' );
+		$this->assertSame(
+			'Back to notifications
',
+			(string) $p
+		);
+	}
+
+	/**
+	 * Updates the first two tags of each malformed snippet and compares the serialized
+	 * output against the expected markup from the data provider.
+	 *
+	 * @ticket 56299
+	 *
+	 * @covers next_tag
+	 * @covers set_attribute
+	 * @covers add_class
+	 * @covers __toString
+	 *
+	 * @dataProvider data_malformed_tag
+	 *
+	 * @param string $html_input    Markup handed to the processor.
+	 * @param string $html_expected Markup expected after both tags were updated.
+	 */
+	public function test_updating_attributes_works_even_in_malformed_html_2( $html_input, $html_expected ) {
+		$processor = new WP_HTML_Tag_Processor( $html_input );
+
+		// First tag: one attribute plus one class.
+		$processor->next_tag();
+		$processor->set_attribute( 'foo', 'bar' );
+		$processor->add_class( 'firstTag' );
+
+		// Second tag: one class only.
+		$processor->next_tag();
+		$processor->add_class( 'secondTag' );
+
+		$actual_html = (string) $processor;
+
+		$this->assertSame( $html_expected, $actual_html );
+	}
+
+ /**
+ * Data provider for test_updating_attributes_works_even_in_malformed_html_2().
+ *
+ * @return array {
+ * @type array {
+ * @type string $html_input The input HTML snippet.
+ * @type string $html_expected The expected HTML snippet after processing.
+ * }
+ * }
+ */
+ public function data_malformed_tag() {
+ $null_byte = chr( 0 );
+ $examples = array();
+ $examples['Invalid entity inside attribute value'] = array(
+ 'test',
+ 'test',
+ );
+
+ $examples['HTML tag opening inside attribute value'] = array(
+ 'This <is> a <strong is="true">thing.
test',
+ 'This <is> a <strong is="true">thing.
test',
+ );
+
+ $examples['HTML tag brackets in attribute values and data markup'] = array(
+ 'This <is> a <strong is="true">thing.
test',
+ 'This <is> a <strong is="true">thing.
test',
+ );
+
+ $examples['Single and double quotes in attribute value'] = array(
+ 'test',
+ '
test',
+ );
+
+ $examples['Unquoted attribute values'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['Double-quotes escaped in double-quote attribute value'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['Unquoted attribute value'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['Unquoted attribute value with tag-like value'] = array(
+ '
>test',
+ '
>test',
+ );
+
+ $examples['Unquoted attribute value with tag-like value followed by tag-like data'] = array(
+ '
>test',
+ '
>test',
+ );
+
+ $examples['1'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['2'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['4'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['5'] = array(
+ '
code>test',
+ '
code>test',
+ );
+
+ $examples['6'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['7'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['8'] = array(
+ '
id="test">test',
+ '
id="test">test',
+ );
+
+ $examples['9'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['10'] = array(
+ '>test',
+ '>test',
+ );
+
+ $examples['11'] = array(
+ 'The applicative operator <* works well in Haskell; is what?test',
+ 'The applicative operator <* works well in Haskell; is what?test',
+ );
+
+ $examples['12'] = array(
+ '<3 is a heart but is a tag.test',
+ '<3 is a heart but is a tag.test',
+ );
+
+ $examples['13'] = array(
+ 'test',
+ 'test',
+ );
+
+ $examples['14'] = array(
+ 'test',
+ 'test',
+ );
+
+ $examples['15'] = array(
+ ' a HTML Tag]]>test',
+ ' a HTML Tag]]>test',
+ );
+
+ $examples['16'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['17'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['18'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['19'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['20'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['21'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['22'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['23'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['24'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['25'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['Multiple unclosed tags treated as a single tag'] = array(
+ '
+test',
+ '
+test',
+ );
+
+ $examples['27'] = array(
+ '
test',
+ '
test',
+ );
+
+ $examples['28'] = array(
+ '
test',
+ '
test',
+ );
+
+ return $examples;
+ }
+}