two`.
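* - HTML with unexpected tag closers, e.g. `<p>one </span> more</p>`.
* - Heading elements which close open heading elements of another level, e.g. `<h1>Closed by </h2>`.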
* - Elements containing text that looks like other tags but isn't, e.g. `<title>The <img> is plaintext</title>`.
* - SCRIPT and STYLE tags containing text that looks like HTML but isn't, e.g. `<script>document.write('<p>Hi</p>');</script>`.
* - SCRIPT content which has been escaped, e.g. `<script><!-- document.write('<script>console.log("hi")</script>') --></script>`.
*
* ### Unsupported Features
*
* This parser does not report parse errors.
*
* Normally, when additional HTML or BODY tags are encountered in a document, if there
* are any additional attributes on them that aren't found on the previous elements,
* the existing HTML and BODY elements adopt those missing attribute values. This
* parser does not add those additional attributes.
*
* In certain situations, elements are moved to a different part of the document in
* a process called "adoption" and "fostering." Because the nodes move to a location
* in the document that the parser had already processed, this parser does not support
* these situations and will bail.
*
* @since 6.4.0
*
* @see WP_HTML_Tag_Processor
* @see https://html.spec.whatwg.org/
*/
class WP_HTML_Processor extends WP_HTML_Tag_Processor {
/**
* The maximum number of bookmarks allowed to exist at any given time.
*
* HTML processing requires more bookmarks than basic tag processing,
* so this class constant from the Tag Processor is overridden.
*
* @since 6.4.0
*
* @var int
*/
const MAX_BOOKMARKS = 100;
/**
* Holds the working state of the parser, including the stack of
* open elements and the stack of active formatting elements.
*
* Initialized in the constructor, which also registers the push and
* pop handlers on the stack of open elements.
*
* @since 6.4.0
*
* @var WP_HTML_Processor_State
*/
private $state;
/**
* Used to create unique bookmark names.
*
* This class sets a bookmark for every tag in the HTML document that it encounters.
* The bookmark name is auto-generated and increments, starting with `1`. These are
* internal bookmarks and are automatically released when the referring WP_HTML_Token
* goes out of scope and is garbage-collected.
*
* @since 6.4.0
*
* @see WP_HTML_Processor::$release_internal_bookmark_on_destruct
*
* @var int
*/
private $bookmark_counter = 0;
/**
* Stores an explanation for why something failed, if it did.
*
* Remains `null` until a failure is recorded; `bail()` sets it to
* `self::ERROR_UNSUPPORTED` before throwing.
*
* @see self::get_last_error
*
* @since 6.4.0
*
* @var string|null
*/
private $last_error = null;
/**
* Stores context for why the parser bailed on unsupported HTML, if it did.
*
* Populated by `bail()` with the very exception instance that it throws.
*
* @see self::get_unsupported_exception
*
* @since 6.7.0
*
* @var WP_HTML_Unsupported_Exception|null
*/
private $unsupported_exception = null;
/**
* Releases a bookmark when PHP garbage-collects its wrapping WP_HTML_Token instance.
*
* This function is created inside the class constructor so that it can be passed to
* the stack of open elements and the stack of active formatting elements without
* exposing it as a public method on the class. The closure accepts a bookmark name
* and forwards it to the parent class's `release_bookmark()`.
*
* @since 6.4.0
*
* @var Closure|null
*/
private $release_internal_bookmark_on_destruct = null;
/**
* Stores stack events which arise during parsing of the
* HTML document, which will then supply the "match" events.
*
* The push and pop handlers registered in the constructor append one
* WP_HTML_Stack_Event to this list for every change to the stack of
* open elements, tagging each as either 'real' or 'virtual'.
*
* @since 6.6.0
*
* @var WP_HTML_Stack_Event[]
*/
private $element_queue = array();
/**
* Stores the current breadcrumbs.
*
* A fragment parser starts with `array( 'HTML', <context node name> )`.
*
* @since 6.7.0
*
* @var string[]
*/
private $breadcrumbs = array();
/**
* Current stack event, if set, representing a matched token.
*
* Because the parser may internally point to a place further along in a document
* than the nodes which have already been processed (some "virtual" nodes may have
* appeared while scanning the HTML document), this will point at the "current" node
* being processed. It comes from the front of the element queue.
*
* @since 6.6.0
*
* @var WP_HTML_Stack_Event|null
*/
private $current_element = null;
/**
* Context node if created as a fragment parser.
*
* Assigned by `create_fragment_at_current_node()` as a clone of the token the
* originating processor was paused on; keeps its `null` default otherwise.
*
* @var WP_HTML_Token|null
*/
private $context_node = null;
/*
* Public Interface Functions
*/
/**
* Creates an HTML processor in the fragment parsing mode.
*
* Use this for cases where you are processing chunks of HTML that
* will be found within a bigger HTML document, such as rendered
* block output that exists within a post, `the_content` inside a
* rendered site layout.
*
* Fragment parsing occurs within a context, which is an HTML element
* that the document will eventually be placed in. It becomes important
* when special elements have different rules than others, such as inside
* a TEXTAREA or a TITLE tag where things that look like tags are text,
* or inside a SCRIPT tag where things that look like HTML syntax are JS.
*
* The context value should be a representation of the tag into which the
* HTML is found. For most cases this will be the body element. The HTML
* form is provided because a context element may have attributes that
* impact the parse, such as with a SCRIPT tag and its `type` attribute.
*
* ## Current HTML Support
*
* - The only supported context is `<body>`, which is the default value.
* - The only supported document encoding is `UTF-8`, which is the default value.
*
* @since 6.4.0
* @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances.
*
* @param string $html Input HTML fragment to process.
* @param string $context Context element for the fragment, must be default of `<body>`.
* @param string $encoding Text encoding of the document; must be default of 'UTF-8'.
* @return static|null The created processor if successful, otherwise null.
*/
public static function create_fragment( $html, $context = '<body>', $encoding = 'UTF-8' ) {
// Anything other than the documented defaults is unsupported for now.
if ( '<body>' !== $context || 'UTF-8' !== $encoding ) {
return null;
}
if ( ! is_string( $html ) ) {
_doing_it_wrong(
__METHOD__,
__( 'The HTML parameter must be a string.' ),
'6.9.0'
);
return null;
}
/*
 * The context element is located by parsing a tiny full document that
 * contains only the context markup. The DOCTYPE keeps that document out
 * of quirks mode; the fragment parser inherits the resulting compat mode.
 */
$context_processor = static::create_full_parser( "<!DOCTYPE html>{$context}", $encoding );
if ( null === $context_processor ) {
return null;
}
/*
 * Remember the last tag that actually appears in the context markup,
 * skipping over nodes the parser created implicitly.
 */
while ( $context_processor->next_tag() ) {
if ( ! $context_processor->is_virtual() ) {
$context_processor->set_bookmark( 'final_node' );
}
}
if (
! $context_processor->has_bookmark( 'final_node' ) ||
! $context_processor->seek( 'final_node' )
) {
_doing_it_wrong( __METHOD__, __( 'No valid context element was detected.' ), '6.8.0' );
return null;
}
return $context_processor->create_fragment_at_current_node( $html );
}
/**
* Creates an HTML processor in the full parsing mode.
*
* It's likely that a fragment parser is more appropriate, unless sending an
* entire HTML document from start to finish. Consider a fragment parser with
* a context node of `<body>`.
*
* UTF-8 is the only allowed encoding. If working with a document that
* isn't UTF-8, first convert the document to UTF-8, then pass in the
* converted HTML.
*
* @param string $html Input HTML document to process.
* @param string|null $known_definite_encoding Optional. If provided, specifies the charset used
* in the input byte stream. Currently must be UTF-8.
* @return static|null The created processor if successful, otherwise null.
*/
public static function create_full_parser( $html, $known_definite_encoding = 'UTF-8' ) {
// UTF-8 is the only encoding this parser understands; reject the rest silently.
$is_supported_encoding = 'UTF-8' === $known_definite_encoding;
if ( ! $is_supported_encoding ) {
return null;
}
// With a valid string in hand, build the parser and record the known encoding.
if ( is_string( $html ) ) {
$parser = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
$parser->state->encoding = $known_definite_encoding;
$parser->state->encoding_confidence = 'certain';
return $parser;
}
// Non-string input is a developer error: report it and give up.
_doing_it_wrong(
__METHOD__,
__( 'The HTML parameter must be a string.' ),
'6.9.0'
);
return null;
}
/**
* Constructor.
*
* Do not use this method. Use the static creator methods instead.
*
* @access private
*
* @since 6.4.0
*
* @see WP_HTML_Processor::create_fragment()
*
* @param string $html HTML to process.
* @param string|null $use_the_static_create_methods_instead This constructor should not be called manually.
*/
public function __construct( $html, $use_the_static_create_methods_instead = null ) {
parent::__construct( $html );
// Direct construction is discouraged; only the static creators know the unlock code.
$was_created_statically = self::CONSTRUCTOR_UNLOCK_CODE === $use_the_static_create_methods_instead;
if ( ! $was_created_statically ) {
$notice = sprintf(
/* translators: %s: WP_HTML_Processor::create_fragment(). */
__( 'Call %s to create an HTML Processor instead of calling the constructor directly.' ),
'WP_HTML_Processor::create_fragment()'
);
_doing_it_wrong( __METHOD__, $notice, '6.4.0' );
}
$this->state = new WP_HTML_Processor_State();
$open_elements = $this->state->stack_of_open_elements;
/*
 * Every push onto the stack of open elements is recorded as a stack event
 * and then switches the parsing namespace to match the pushed node.
 */
$open_elements->set_push_handler(
function ( WP_HTML_Token $token ): void {
$matched_token = $this->state->current_token ?? null;
/*
 * The push is "real" only when a current token exists, that token is
 * not a tag closer, and it shares its node name with the pushed node.
 */
$is_real = null !== $matched_token
&& ! $this->is_tag_closer()
&& $token->node_name === $matched_token->node_name;
$this->element_queue[] = new WP_HTML_Stack_Event(
$token,
WP_HTML_Stack_Event::PUSH,
$is_real ? 'real' : 'virtual'
);
$parsing_namespace = $token->integration_node_type ? 'html' : $token->namespace;
$this->change_parsing_namespace( $parsing_namespace );
}
);
/*
 * Every pop is likewise recorded, after which the parsing namespace is
 * recomputed from whatever node is now the adjusted current node.
 */
$open_elements->set_pop_handler(
function ( WP_HTML_Token $token ): void {
$matched_token = $this->state->current_token ?? null;
/*
 * The pop is "real" only when a current token exists, that token is
 * a tag closer, and it shares its node name with the popped node.
 */
$is_real = null !== $matched_token
&& $this->is_tag_closer()
&& $token->node_name === $matched_token->node_name;
$this->element_queue[] = new WP_HTML_Stack_Event(
$token,
WP_HTML_Stack_Event::POP,
$is_real ? 'real' : 'virtual'
);
// Default to HTML unless a non-integration node dictates another namespace.
$parsing_namespace = 'html';
$adjusted_current_node = $this->get_adjusted_current_node();
if ( $adjusted_current_node && ! $adjusted_current_node->integration_node_type ) {
$parsing_namespace = $adjusted_current_node->namespace;
}
$this->change_parsing_namespace( $parsing_namespace );
}
);
/*
 * Wrapping the call in a closure lets WP_HTML_Token instances
 * release their bookmarks without this class having to expose
 * a public method for doing so.
 */
$this->release_internal_bookmark_on_destruct = function ( string $name ): void {
parent::release_bookmark( $name );
};
}
/**
* Creates a fragment processor at the current node.
*
* HTML Fragment parsing always happens with a context node. HTML Fragment Processors can be
* instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`.
*
* The context node may impact how a fragment of HTML is parsed. For example, consider the HTML
* fragment `<td />Inside TD?`.
*
* A BODY context node will produce the following tree:
*
* └─#text Inside TD?
*
* Notice that the `<td>` tags are completely ignored.
*
* Compare that with an SVG context node that produces the following tree:
*
* ├─svg:td
* └─#text Inside TD?
*
* Here, a `td` node in the `svg` namespace is created, and its self-closing flag is respected.
* This is a peculiarity of parsing HTML in foreign content like SVG.
*
* Finally, consider the tree produced with a TABLE context node:
*
* └─TBODY
* └─TR
* └─TD
* └─#text Inside TD?
*
* These examples demonstrate how important the context node may be when processing an HTML
* fragment. Special care must be taken when processing fragments that are expected to appear
* in specific contexts. SVG and TABLE are good examples, but there are others.
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
*
* @since 6.8.0
*
* @param string $html Input HTML fragment to process.
* @return static|null The created processor if successful, otherwise null.
*/
private function create_fragment_at_current_node( string $html ) {
// The processor must be paused on an opening tag for it to serve as the context element.
if ( $this->get_token_type() !== '#tag' || $this->is_tag_closer() ) {
_doing_it_wrong(
__METHOD__,
__( 'The context element must be a start tag.' ),
'6.8.0'
);
return null;
}
// Name and namespace of the matched element decide which contexts are permitted below.
$tag_name = $this->current_element->token->node_name;
$namespace = $this->current_element->token->namespace;
// Void HTML elements are rejected as context elements.
if ( 'html' === $namespace && self::is_void( $tag_name ) ) {
_doing_it_wrong(
__METHOD__,
sprintf(
// translators: %s: A tag name like INPUT or BR.
__( 'The context element cannot be a void element, found "%s".' ),
$tag_name
),
'6.8.0'
);
return null;
}
/*
* Prevent creating fragments at nodes that require a special tokenizer state.
* This is unsupported by the HTML Processor.
*/
if (
'html' === $namespace &&
in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
) {
_doing_it_wrong(
__METHOD__,
sprintf(
// translators: %s: A tag name like IFRAME or TEXTAREA.
__( 'The context element "%s" is not supported.' ),
$tag_name
),
'6.8.0'
);
return null;
}
// `new static` so that a subclass produces fragment processors of its own type.
$fragment_processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
// The fragment parses under the same compat mode as the processor it was created from.
$fragment_processor->compat_mode = $this->compat_mode;
// @todo Create "fake" bookmarks for non-existent but implied nodes.
// The root node has no source text of its own, so its bookmark is a placeholder span.
$fragment_processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
$root_node = new WP_HTML_Token(
'root-node',
'HTML',
false
);
// The synthetic HTML root becomes the first entry on the fragment's stack of open elements.
$fragment_processor->state->stack_of_open_elements->push( $root_node );
/*
* The context node is a copy of this processor's current token, re-pointed at a
* placeholder bookmark owned by the fragment processor. Its destroy callback is
* cleared; presumably so that discarding the copy does not release a bookmark
* belonging to this processor (NOTE(review): confirm against WP_HTML_Token).
*/
$fragment_processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
$fragment_processor->context_node = clone $this->current_element->token;
$fragment_processor->context_node->bookmark_name = 'context-node';
$fragment_processor->context_node->on_destroy = null;
// Breadcrumbs start at the synthetic root followed by the context element.
$fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name );
// A TEMPLATE context seeds the stack of template insertion modes before the mode is reset.
if ( 'TEMPLATE' === $fragment_processor->context_node->node_name ) {
$fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE;
}
$fragment_processor->reset_insertion_mode_appropriately();
/*
* > Set the parser's form element pointer to the nearest node to the context element that
* > is a form element (going straight up the ancestor chain, and including the element
* > itself, if it is a form element), if any. (If there is no such form element, the
* > form element pointer keeps its initial value, null.)
*
* The search walks this processor's stack of open elements, not the fragment's.
* The matching FORM token is copied with its bookmark name and destroy callback cleared.
*/
foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) {
if ( 'FORM' === $element->node_name && 'html' === $element->namespace ) {
$fragment_processor->state->form_element = clone $element;
$fragment_processor->state->form_element->bookmark_name = null;
$fragment_processor->state->form_element->on_destroy = null;
break;
}
}
$fragment_processor->state->encoding_confidence = 'irrelevant';
/*
* Update the parsing namespace near the end of the process.
* This is important so that any push/pop from the stack of open
* elements does not change the parsing namespace.
*
* Integration nodes parse their content as HTML; every other context
* element parses in its own namespace.
*/
$fragment_processor->change_parsing_namespace(
$this->current_element->token->integration_node_type ? 'html' : $namespace
);
return $fragment_processor;
}
/**
* Stops the parser and terminates its execution when encountering unsupported markup.
*
* @throws WP_HTML_Unsupported_Exception Halts execution of the parser.
*
* @since 6.7.0
*
* @param string $message Explains support is missing in order to parse the current node.
*/
private function bail( string $message ) {
// Recover the raw markup of the token that could not be handled.
$current_token = $this->state->current_token;
$span = $this->bookmarks[ $current_token->bookmark_name ];
$raw_token = substr( $this->html, $span->start, $span->length );
// Collects the node name of every node in the given sequence, in iteration order.
$collect_node_names = static function ( $nodes ): array {
$names = array();
foreach ( $nodes as $node ) {
$names[] = $node->node_name;
}
return $names;
};
$open_elements = $collect_node_names( $this->state->stack_of_open_elements->stack );
$active_formats = $collect_node_names( $this->state->active_formatting_elements->walk_down() );
// Record the failure on the instance before throwing so it can be inspected afterwards.
$this->last_error = self::ERROR_UNSUPPORTED;
$this->unsupported_exception = new WP_HTML_Unsupported_Exception(
$message,
$current_token->node_name,
$span->start,
$raw_token,
$open_elements,
$active_formats
);
throw $this->unsupported_exception;
}
/**
* Returns the last error, if any.
*
* Various situations lead to parsing failure but this class will
* return `false` in all those cases. To determine why something
* failed it's possible to request the last error. This can be
* helpful to know to distinguish whether a given tag couldn't
* be found or if content in the document caused the processor
* to give up and abort processing.
*
* Example
*
* $processor = WP_HTML_Processor::create_fragment( ' |