Skip to content

Commit

Permalink
Tag Processor: Add ability to stop at tag closers, if requested
Browse files Browse the repository at this point in the history
`WP_HTML_Tag_Processor` introduces the ability to walk through an HTML
document and modify attributes on tag openers.

At times it could be useful to stop also at tag closers in order to
perform augmented inspection and querying of a document.

In this patch we're opening up a mode that allows just that; if
querying with the `[ "tag_closers" => "visit" ]` query parrameter
then in addition to stopping at tag openers, this will allow
stopping at all tag tokens regardless of open or close status.
  • Loading branch information
dmsnell committed Nov 17, 2022
1 parent dbb9487 commit 1652965
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 33 deletions.
136 changes: 103 additions & 33 deletions lib/experimental/html/class-wp-html-tag-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,14 @@ class WP_HTML_Tag_Processor {
*/
private $sought_match_offset;

/**
* Whether to visit tag closers, e.g. </div>, when walking an input document.
*
* @since 6.2.0
* @var boolean
*/
private $stop_on_tag_closers;

/**
* The updated HTML document.
*
Expand Down Expand Up @@ -276,6 +284,29 @@ class WP_HTML_Tag_Processor {
*/
private $tag_name_length;

/**
* Byte offset in input document where current tag token ends.
*
* Example:
* ```
* <div id="test">...
* 0 1 |
* 01234567890123456
* --- tag name ends at 14
* ```
*
* @since 6.2.0
* @var ?int
*/
private $tag_ends_at;

/**
* Whether the current tag is an opening tag, e.g. <div>, or a closing tag, e.g. </div>.
*
* @var boolean
*/
private $is_closing_tag;

/**
* Lazily-built index of attributes found within an HTML tag, keyed by the attribute name.
*
Expand Down Expand Up @@ -412,7 +443,16 @@ public function next_tag( $query = null ) {
return false;
}

$this->parse_tag_opener_attributes();
while ( $this->parse_next_attribute() ) {
continue;
}

$tag_ends_at = strpos( $this->html, '>', $this->parsed_bytes );
if ( false === $tag_ends_at ) {
return false;
}
$this->tag_ends_at = $tag_ends_at;
$this->parsed_bytes = $tag_ends_at;

if ( $this->matches() ) {
++$already_found;
Expand Down Expand Up @@ -495,7 +535,9 @@ private function skip_rcdata( $tag_name ) {
continue;
}

$this->skip_tag_closer_attributes();
while ( $this->parse_next_attribute() ) {
continue;
}
$at = $this->parsed_bytes;
if ( $at >= strlen( $this->html ) ) {
return false;
Expand Down Expand Up @@ -620,12 +662,14 @@ private function skip_script_data() {

if ( $is_closing ) {
$this->parsed_bytes = $at;
$this->skip_tag_closer_attributes();

if ( $this->parsed_bytes >= $doc_length ) {
return false;
}

while ( $this->parse_next_attribute() ) {
continue;
}

if ( '>' === $html[ $this->parsed_bytes ] ) {
++$this->parsed_bytes;
return true;
Expand Down Expand Up @@ -656,6 +700,13 @@ private function parse_next_tag() {
return false;
}

if ( '/' === $this->html[ $at + 1 ] ) {
$this->is_closing_tag = true;
$at++;
} else {
$this->is_closing_tag = false;
}

/*
* HTML tag names must start with [a-zA-Z] otherwise they are not tags.
* For example, "<3" is rendered as text, not a tag opener. This means
Expand Down Expand Up @@ -777,35 +828,13 @@ private function parse_next_tag() {
return false;
}

/**
* Parses all attributes of the current tag.
*
* @since 6.2.0
*/
private function parse_tag_opener_attributes() {
while ( $this->parse_next_attribute() ) {
continue;
}
}

/**
* Skips all attributes of the current tag.
*
* @since 6.2.0
*/
private function skip_tag_closer_attributes() {
while ( $this->parse_next_attribute( 'tag-closer' ) ) {
continue;
}
}

/**
* Parses the next attribute.
*
* @param string $context tag-opener or tag-closer.
* @since 6.2.0
*/
private function parse_next_attribute( $context = 'tag-opener' ) {
private function parse_next_attribute() {
// Skip whitespace and slashes.
$this->parsed_bytes += strspn( $this->html, " \t\f\r\n/", $this->parsed_bytes );
if ( $this->parsed_bytes >= strlen( $this->html ) ) {
Expand Down Expand Up @@ -872,7 +901,7 @@ private function parse_next_attribute( $context = 'tag-opener' ) {
return false;
}

if ( 'tag-opener' !== $context ) {
if ( $this->is_closing_tag ) {
return true;
}

Expand Down Expand Up @@ -914,6 +943,8 @@ private function after_tag() {
$this->apply_attributes_updates();
$this->tag_name_starts_at = null;
$this->tag_name_length = null;
$this->tag_ends_at = null;
$this->is_closing_tag = null;
$this->attributes = array();
}

Expand Down Expand Up @@ -1159,6 +1190,25 @@ public function get_tag() {
return strtoupper( $tag_name );
}

/**
* Indicates if the current tag token is a tag closer.
*
* Example:
* <code>
* $p = new WP_HTML_Tag_Processor( '<div></div>' );
* $p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] );
* $p->is_tag_closer() === false;
*
* $p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] );
* $p->is_tag_closer() === true;
* </code>
*
* @return bool
*/
public function is_tag_closer() {
return $this->is_closing_tag;
}

/**
* Updates or creates a new attribute on the currently matched tag with the value passed.
*
Expand All @@ -1175,8 +1225,8 @@ public function get_tag() {
* @throws Exception When WP_DEBUG is true and the attribute name is invalid.
*/
public function set_attribute( $name, $value ) {
if ( null === $this->tag_name_starts_at ) {
return;
if ( $this->is_closing_tag || null === $this->tag_name_starts_at ) {
return false;
}

/*
Expand Down Expand Up @@ -1286,8 +1336,8 @@ public function set_attribute( $name, $value ) {
* @param string $name The attribute name to remove.
*/
public function remove_attribute( $name ) {
if ( ! isset( $this->attributes[ $name ] ) ) {
return;
if ( $this->is_closing_tag || ! isset( $this->attributes[ $name ] ) ) {
return false;
}

/*
Expand Down Expand Up @@ -1316,6 +1366,10 @@ public function remove_attribute( $name ) {
* @param string $class_name The class name to add.
*/
public function add_class( $class_name ) {
if ( $this->is_closing_tag ) {
return false;
}

if ( null !== $this->tag_name_starts_at ) {
$this->classname_updates[ $class_name ] = self::ADD_CLASS;
}
Expand All @@ -1329,6 +1383,10 @@ public function add_class( $class_name ) {
* @param string $class_name The class name to remove.
*/
public function remove_class( $class_name ) {
if ( $this->is_closing_tag ) {
return false;
}

if ( null !== $this->tag_name_starts_at ) {
$this->classname_updates[ $class_name ] = self::REMOVE_CLASS;
}
Expand Down Expand Up @@ -1392,7 +1450,9 @@ public function get_updated_html() {

// Parse the attributes in the updated markup.
$this->attributes = array();
$this->parse_tag_opener_attributes();
while ( $this->parse_next_attribute() ) {
continue;
}

return $this->html;
}
Expand All @@ -1407,6 +1467,7 @@ public function get_updated_html() {
*
* @type string|null $tag_name Which tag to find, or `null` for "any tag."
* @type string|null $class_name Tag must contain this class name to match.
* @type string $tag_closers "visit" or "skip": whether to stop on tag closers, e.g. </div>.
* }
*/
private function parse_query( $query ) {
Expand All @@ -1418,6 +1479,7 @@ private function parse_query( $query ) {
$this->sought_tag_name = null;
$this->sought_class_name = null;
$this->sought_match_offset = 1;
$this->stop_on_tag_closers = false;

// A single string value means "find the tag of this name".
if ( is_string( $query ) ) {
Expand All @@ -1441,6 +1503,10 @@ private function parse_query( $query ) {
if ( isset( $query['match_offset'] ) && is_int( $query['match_offset'] ) && 0 < $query['match_offset'] ) {
$this->sought_match_offset = $query['match_offset'];
}

if ( isset( $query['tag_closers' ] ) ) {
$this->stop_on_tag_closers = $query['tag_closers'] === 'visit';
}
}


Expand All @@ -1452,6 +1518,10 @@ private function parse_query( $query ) {
* @return boolean
*/
private function matches() {
if ( $this->is_closing_tag && ! $this->stop_on_tag_closers ) {
return false;
}

// Do we match a case-insensitive HTML tag name?
if ( null !== $this->sought_tag_name ) {
/*
Expand Down
33 changes: 33 additions & 0 deletions phpunit/html/wp-html-tag-processor-test.php
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,22 @@ public function test_next_tag_should_return_false_for_a_non_existing_tag() {
$this->assertFalse( $p->next_tag( 'p' ), 'Querying a non-existing tag did not return false' );
}

/**
* @covers next_tag
* @covers is_tag_closer
*/
public function test_next_tag_should_stop_on_closers_only_when_requested() {
$p = new WP_HTML_Tag_Processor( '<div><img /></div>' );
$this->assertTrue( $p->next_tag( [ 'tag_name' => 'div' ] ), 'Did not find desired tag opener' );
$this->assertFalse( $p->next_tag( [ 'tag_name' => 'div' ] ), 'Visited an unwanted tag, a tag closer' );

$p = new WP_HTML_Tag_Processor( '<div><img /></div>' );
$p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] );
$this->assertFalse( $p->is_tag_closer(), 'Indicated a tag opener is a tag closer' );
$this->assertTrue( $p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] ), 'Did not stop at desired tag closer' );
$this->assertTrue( $p->is_tag_closer(), 'Indicated a tag closer is a tag opener' );
}

/**
* @ticket 56299
*
Expand All @@ -255,6 +271,23 @@ public function test_set_attribute_on_a_non_existing_tag_does_not_change_the_mar
);
}

public function test_attribute_ops_on_tag_closer_do_not_change_the_markup() {
$p = new WP_HTML_Tag_Processor( '<div id=3></div invalid-id=4>');
$p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] );
$this->assertFalse( $p->is_tag_closer(), 'Skipped tag opener' );
$p->next_tag( [ 'tag_name' => 'div', 'tag_closers' => 'visit' ] );
$this->assertTrue( $p->is_tag_closer(), 'Skipped tag closer' );
$this->assertFalse( $p->set_attribute( 'id', 'test' ), "Allowed setting an attribute on a tag closer when it shouldn't have" );
$this->assertFalse( $p->remove_attribute( 'invalid-id' ), "Allowed removing an attribute on a tag closer when it shouldn't have" );
$this->assertFalse( $p->add_class( 'sneaky' ), "Allowed adding a class on a tag closer when it shouldn't have" );
$this->assertFalse( $p->remove_class( 'not-appearing-in-this-test' ), "Allowed removing a class on a tag closer when it shouldn't have" );
$this->assertSame(
'<div id=3></div invalid-id=4>',
$p->get_updated_html(),
'Calling get_updated_html after updating a non-existing tag returned an HTML that was different from the original HTML'
);
}

/**
* Passing a double quote inside of an attribute values could lead to an XSS attack as follows:
*
Expand Down

0 comments on commit 1652965

Please sign in to comment.