rtion_mode_appropriately();
return $this->step( self::REPROCESS_CURRENT_NODE );
}
/*
* > Anything else
*/
return $this->step_in_select();
}
/**
* Parses next element in the 'in template' insertion mode.
*
* This internal function performs the 'in template' insertion mode
* logic for the generalized WP_HTML_Processor::step() function.
*
* @since 6.7.0 Stub implementation.
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#parsing-main-intemplate
* @see WP_HTML_Processor::step
*
* @return bool Whether an element was found.
*/
private function step_in_template(): bool {
	$token_name = $this->get_token_name();
	$token_type = $this->get_token_type();
	$is_closer  = $this->is_tag_closer();

	// Tags are keyed as "+NAME" (opener) or "-NAME" (closer); other tokens use their bare name.
	$sigil = '';
	if ( '#tag' === $token_type ) {
		$sigil = $is_closer ? '-' : '+';
	}
	$op = $sigil . $token_name;

	/*
	 * > A character token
	 * > A comment token
	 * > A DOCTYPE token
	 *
	 * These are handed to the "in body" rules.
	 */
	$in_body_ops = array( '#text', '#comment', '#funky-comment', '#presumptuous-tag', 'html' );
	if ( in_array( $op, $in_body_ops, true ) ) {
		return $this->step_in_body();
	}

	/*
	 * > A start tag whose tag name is one of: "base", "basefont", "bgsound", "link",
	 * > "meta", "noframes", "script", "style", "template", "title"
	 * > An end tag whose tag name is "template"
	 *
	 * These are handed to the "in head" rules.
	 */
	$in_head_ops = array(
		'+BASE',
		'+BASEFONT',
		'+BGSOUND',
		'+LINK',
		'+META',
		'+NOFRAMES',
		'+SCRIPT',
		'+STYLE',
		'+TEMPLATE',
		'+TITLE',
		'-TEMPLATE',
	);
	if ( in_array( $op, $in_head_ops, true ) ) {
		return $this->step_in_head();
	}

	/*
	 * Table-related start tags each replace the current template insertion
	 * mode with a dedicated mode before the token is reprocessed:
	 *
	 * > "caption", "colgroup", "tbody", "tfoot", "thead" -> "in table"
	 * > "col"                                            -> "in column group"
	 * > "tr"                                             -> "in table body"
	 * > "td", "th"                                       -> "in row"
	 */
	$dedicated_modes = array(
		'+CAPTION'  => WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE,
		'+COLGROUP' => WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE,
		'+TBODY'    => WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE,
		'+TFOOT'    => WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE,
		'+THEAD'    => WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE,
		'+COL'      => WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP,
		'+TR'       => WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY,
		'+TD'       => WP_HTML_Processor_State::INSERTION_MODE_IN_ROW,
		'+TH'       => WP_HTML_Processor_State::INSERTION_MODE_IN_ROW,
	);
	$has_dedicated_mode = isset( $dedicated_modes[ $op ] );

	/*
	 * > Any other start tag
	 *
	 * Everything that is not a tag closer and has no dedicated mode is
	 * reprocessed using the "in body" insertion mode.
	 */
	if ( $has_dedicated_mode || ! $is_closer ) {
		$next_mode = $has_dedicated_mode
			? $dedicated_modes[ $op ]
			: WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;

		// Swap the top of the template insertion mode stack for the new mode.
		array_pop( $this->state->stack_of_template_insertion_modes );
		$this->state->stack_of_template_insertion_modes[] = $next_mode;
		$this->state->insertion_mode                      = $next_mode;

		return $this->step( self::REPROCESS_CURRENT_NODE );
	}

	/*
	 * > Any other end tag
	 */
	if ( $is_closer ) {
		// Parse error: ignore the token.
		return $this->step();
	}

	/*
	 * > An end-of-file token
	 *
	 * NOTE(review): the two guards above cover both values of `$is_closer`,
	 * so as written nothing below can run. Kept unchanged; confirm where
	 * end-of-file is meant to be handled before relying on this branch.
	 */
	if ( ! $this->state->stack_of_open_elements->contains( 'TEMPLATE' ) ) {
		// Stop parsing.
		return false;
	}

	// @todo Indicate a parse error once it's possible.
	$this->state->stack_of_open_elements->pop_until( 'TEMPLATE' );
	$this->state->active_formatting_elements->clear_up_to_last_marker();
	array_pop( $this->state->stack_of_template_insertion_modes );
	$this->reset_insertion_mode_appropriately();
	return $this->step( self::REPROCESS_CURRENT_NODE );
}
/**
* Parses next element in the 'after body' insertion mode.
*
* This internal function performs the 'after body' insertion mode
* logic for the generalized WP_HTML_Processor::step() function.
*
* @since 6.7.0 Stub implementation.
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#parsing-main-afterbody
* @see WP_HTML_Processor::step
*
* @return bool Whether an element was found.
*/
private function step_after_body(): bool {
	$node_name = $this->get_token_name();

	// Tags are keyed as "+NAME" (opener) or "-NAME" (closer); other tokens use their bare name.
	$sigil = '';
	if ( '#tag' === $this->get_token_type() ) {
		$sigil = $this->is_tag_closer() ? '-' : '+';
	}
	$op = $sigil . $node_name;

	if ( '#text' === $op ) {
		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
		 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 *
		 * > Process the token using the rules for the "in body" insertion mode.
		 *
		 * Any other text continues to the "anything else" handling at the bottom.
		 */
		if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
			return $this->step_in_body();
		}
	} elseif ( in_array( $op, array( '#comment', '#funky-comment', '#presumptuous-tag' ), true ) ) {
		/*
		 * > A comment token
		 *
		 * bail() is expected to throw (see @throws); should it return,
		 * control continues to the "anything else" handling below.
		 */
		$this->bail( 'Content outside of BODY is unsupported.' );
	} elseif ( 'html' === $op ) {
		/*
		 * > A DOCTYPE token
		 */
		// Parse error: ignore the token.
		return $this->step();
	} elseif ( '+HTML' === $op ) {
		/*
		 * > A start tag whose tag name is "html"
		 */
		return $this->step_in_body();
	} elseif ( '-HTML' === $op ) {
		/*
		 * > An end tag whose tag name is "html"
		 *
		 * > If the parser was created as part of the HTML fragment parsing algorithm,
		 * > this is a parse error; ignore the token. (fragment case)
		 * >
		 * > Otherwise, switch the insertion mode to "after after body".
		 */
		if ( isset( $this->context_node ) ) {
			return $this->step();
		}

		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY;
		return true;
	}

	/*
	 * > Anything else
	 * > Parse error. Switch the insertion mode to "in body" and reprocess the token.
	 */
	$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
	return $this->step( self::REPROCESS_CURRENT_NODE );
}
/**
* Parses next element in the 'in frameset' insertion mode.
*
* This internal function performs the 'in frameset' insertion mode
* logic for the generalized WP_HTML_Processor::step() function.
*
* @since 6.7.0 Stub implementation.
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#parsing-main-inframeset
* @see WP_HTML_Processor::step
*
* @return bool Whether an element was found.
*/
private function step_in_frameset(): bool {
	$node_name = $this->get_token_name();

	// Tags are keyed as "+NAME" (opener) or "-NAME" (closer); other tokens use their bare name.
	$sigil = '';
	if ( '#tag' === $this->get_token_type() ) {
		$sigil = $this->is_tag_closer() ? '-' : '+';
	}
	$op = $sigil . $node_name;

	if ( '#text' === $op ) {
		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
		 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 * >
		 * > Insert the character.
		 *
		 * This algorithm effectively strips non-whitespace characters from text and inserts
		 * them under HTML. This is not supported at this time.
		 */
		if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
			return $this->step_in_body();
		}

		// Expected to throw; if it returns, the token is ignored at the bottom.
		$this->bail( 'Non-whitespace characters cannot be handled in frameset.' );
	} elseif (
		'#comment' === $op ||
		'#funky-comment' === $op ||
		'#presumptuous-tag' === $op ||
		'+FRAMESET' === $op
	) {
		/*
		 * > A comment token
		 * > A start tag whose tag name is "frameset"
		 *
		 * Both kinds of token are inserted as-is.
		 */
		$this->insert_html_element( $this->state->current_token );
		return true;
	} elseif ( '+HTML' === $op ) {
		/*
		 * > A start tag whose tag name is "html"
		 */
		return $this->step_in_body();
	} elseif ( '-FRAMESET' === $op ) {
		/*
		 * > An end tag whose tag name is "frameset"
		 *
		 * > If the current node is the root html element, then this is a parse error;
		 * > ignore the token. (fragment case)
		 */
		$open_elements = $this->state->stack_of_open_elements;
		if ( $open_elements->current_node_is( 'HTML' ) ) {
			return $this->step();
		}

		/*
		 * > Otherwise, pop the current node from the stack of open elements.
		 */
		$open_elements->pop();

		/*
		 * > If the parser was not created as part of the HTML fragment parsing algorithm
		 * > (fragment case), and the current node is no longer a frameset element, then
		 * > switch the insertion mode to "after frameset".
		 */
		$is_fragment = isset( $this->context_node );
		if ( ! $is_fragment && ! $open_elements->current_node_is( 'FRAMESET' ) ) {
			$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET;
		}

		return true;
	} elseif ( '+FRAME' === $op ) {
		/*
		 * > A start tag whose tag name is "frame"
		 *
		 * > Insert an HTML element for the token. Immediately pop the
		 * > current node off the stack of open elements.
		 * >
		 * > Acknowledge the token's self-closing flag, if it is set.
		 */
		$this->insert_html_element( $this->state->current_token );
		$this->state->stack_of_open_elements->pop();
		return true;
	} elseif ( '+NOFRAMES' === $op ) {
		/*
		 * > A start tag whose tag name is "noframes"
		 */
		return $this->step_in_head();
	}

	/*
	 * > A DOCTYPE token
	 * > Anything else
	 */
	// Parse error: ignore the token.
	return $this->step();
}
/**
* Parses next element in the 'after frameset' insertion mode.
*
* This internal function performs the 'after frameset' insertion mode
* logic for the generalized WP_HTML_Processor::step() function.
*
* @since 6.7.0 Stub implementation.
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#parsing-main-afterframeset
* @see WP_HTML_Processor::step
*
* @return bool Whether an element was found.
*/
private function step_after_frameset(): bool {
	$node_name = $this->get_token_name();

	// Tags are keyed as "+NAME" (opener) or "-NAME" (closer); other tokens use their bare name.
	$sigil = '';
	if ( '#tag' === $this->get_token_type() ) {
		$sigil = $this->is_tag_closer() ? '-' : '+';
	}
	$op = $sigil . $node_name;

	if ( '#text' === $op ) {
		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
		 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 * >
		 * > Insert the character.
		 *
		 * This algorithm effectively strips non-whitespace characters from text and inserts
		 * them under HTML. This is not supported at this time.
		 */
		if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
			return $this->step_in_body();
		}

		// Expected to throw; if it returns, the token is ignored at the bottom.
		$this->bail( 'Non-whitespace characters cannot be handled in after frameset' );
	} elseif ( in_array( $op, array( '#comment', '#funky-comment', '#presumptuous-tag' ), true ) ) {
		/*
		 * > A comment token
		 */
		$this->insert_html_element( $this->state->current_token );
		return true;
	} elseif ( '+HTML' === $op ) {
		/*
		 * > A start tag whose tag name is "html"
		 */
		return $this->step_in_body();
	} elseif ( '-HTML' === $op ) {
		/*
		 * > An end tag whose tag name is "html"
		 */
		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET;
		return true;
	} elseif ( '+NOFRAMES' === $op ) {
		/*
		 * > A start tag whose tag name is "noframes"
		 */
		return $this->step_in_head();
	}

	/*
	 * > A DOCTYPE token
	 * > Anything else
	 */
	// Parse error: ignore the token.
	return $this->step();
}
/**
* Parses next element in the 'after after body' insertion mode.
*
* This internal function performs the 'after after body' insertion mode
* logic for the generalized WP_HTML_Processor::step() function.
*
* @since 6.7.0 Stub implementation.
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#the-after-after-body-insertion-mode
* @see WP_HTML_Processor::step
*
* @return bool Whether an element was found.
*/
private function step_after_after_body(): bool {
	$node_name = $this->get_token_name();

	// Tags are keyed as "+NAME" (opener) or "-NAME" (closer); other tokens use their bare name.
	$sigil = '';
	if ( '#tag' === $this->get_token_type() ) {
		$sigil = $this->is_tag_closer() ? '-' : '+';
	}
	$op = $sigil . $node_name;

	if ( in_array( $op, array( '#comment', '#funky-comment', '#presumptuous-tag' ), true ) ) {
		/*
		 * > A comment token
		 *
		 * bail() is expected to throw (see @throws); should it return,
		 * control continues to the "anything else" handling below.
		 */
		$this->bail( 'Content outside of HTML is unsupported.' );
	} elseif ( 'html' === $op || '+HTML' === $op ) {
		/*
		 * > A DOCTYPE token
		 * > A start tag whose tag name is "html"
		 *
		 * > Process the token using the rules for the "in body" insertion mode.
		 */
		return $this->step_in_body();
	} elseif ( '#text' === $op && parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
		 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 * >
		 * > Process the token using the rules for the "in body" insertion mode.
		 */
		return $this->step_in_body();
	}

	/*
	 * > Anything else
	 * > Parse error. Switch the insertion mode to "in body" and reprocess the token.
	 *
	 * Non-whitespace text ends up here as well.
	 */
	$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
	return $this->step( self::REPROCESS_CURRENT_NODE );
}
/**
* Parses next element in the 'after after frameset' insertion mode.
*
* This internal function performs the 'after after frameset' insertion mode
* logic for the generalized WP_HTML_Processor::step() function.
*
* @since 6.7.0 Stub implementation.
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#the-after-after-frameset-insertion-mode
* @see WP_HTML_Processor::step
*
* @return bool Whether an element was found.
*/
private function step_after_after_frameset(): bool {
	$node_name = $this->get_token_name();

	// Tags are keyed as "+NAME" (opener) or "-NAME" (closer); other tokens use their bare name.
	$sigil = '';
	if ( '#tag' === $this->get_token_type() ) {
		$sigil = $this->is_tag_closer() ? '-' : '+';
	}
	$op = $sigil . $node_name;

	if ( in_array( $op, array( '#comment', '#funky-comment', '#presumptuous-tag' ), true ) ) {
		/*
		 * > A comment token
		 *
		 * bail() is expected to throw (see @throws); should it return,
		 * the token is ignored at the bottom.
		 */
		$this->bail( 'Content outside of HTML is unsupported.' );
	} elseif ( 'html' === $op || '+HTML' === $op ) {
		/*
		 * > A DOCTYPE token
		 * > A start tag whose tag name is "html"
		 *
		 * > Process the token using the rules for the "in body" insertion mode.
		 */
		return $this->step_in_body();
	} elseif ( '#text' === $op ) {
		/*
		 * > A character token that is one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
		 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
		 * >
		 * > Process the token using the rules for the "in body" insertion mode.
		 *
		 * This algorithm effectively strips non-whitespace characters from text and inserts
		 * them under HTML. This is not supported at this time.
		 */
		if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
			return $this->step_in_body();
		}

		$this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' );
	} elseif ( '+NOFRAMES' === $op ) {
		/*
		 * > A start tag whose tag name is "noframes"
		 */
		return $this->step_in_head();
	}

	/*
	 * > Anything else
	 */
	// Parse error: ignore the token.
	return $this->step();
}
/**
* Parses next element in the 'in foreign content' insertion mode.
*
* This internal function performs the 'in foreign content' insertion mode
* logic for the generalized WP_HTML_Processor::step() function.
*
* @since 6.7.0 Stub implementation.
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#parsing-main-inforeign
* @see WP_HTML_Processor::step
*
* @return bool Whether an element was found.
*/
private function step_in_foreign_content(): bool {
/*
* Overview: builds an operation key from the current token, handles text,
* CDATA, comments and DOCTYPE directly, "breaks out" of foreign content for
* a fixed list of HTML start tags (plus `</br>` and `</p>`), inserts any
* other start tag as a foreign element, and resolves end tags by walking
* the stack of open elements.
*
* Control flow relies on two `goto` labels that both live inside the final
* end-tag block, so statement order in this function matters.
*/
$tag_name = $this->get_token_name();
$token_type = $this->get_token_type();
// Tags are keyed as "+NAME" (opener) or "-NAME" (closer); other tokens use their bare name.
$op_sigil = '#tag' === $token_type ? ( $this->is_tag_closer() ? '-' : '+' ) : '';
$op = "{$op_sigil}{$tag_name}";
/*
* > A start tag whose name is "font", if the token has any attributes named "color", "face", or "size"
*
* This section drawn out above the switch to more easily incorporate
* the additional rules based on the presence of the attributes.
*/
if (
'+FONT' === $op &&
(
null !== $this->get_attribute( 'color' ) ||
null !== $this->get_attribute( 'face' ) ||
null !== $this->get_attribute( 'size' )
)
) {
// Synthetic key: it contains spaces, so it only matches the dedicated case label below.
$op = '+FONT with attributes';
}
switch ( $op ) {
case '#text':
/*
* > A character token that is U+0000 NULL
*
* This is handled by `get_modifiable_text()`.
*/
/*
* Whitespace-only text does not affect the frameset-ok flag.
* It is probably inter-element whitespace, but it may also
* contain character references which decode only to whitespace.
*/
if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
$this->state->frameset_ok = false;
}
$this->insert_foreign_element( $this->state->current_token, false );
return true;
/*
* CDATA sections are alternate wrappers for text content and therefore
* ought to follow the same rules as text nodes.
*/
case '#cdata-section':
/*
* NULL bytes and whitespace do not change the frameset-ok flag.
*
* NOTE(review): the offsets 9 and 12 presumably skip the 9-byte
* `<![CDATA[` opener and additionally the 3-byte `]]>` closer, assuming
* the bookmark spans the whole CDATA token - confirm against the
* bookmark definition.
*/
$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
$cdata_content_start = $current_token->start + 9;
$cdata_content_length = $current_token->length - 12;
// Any byte outside of NULL/whitespace within the content clears the flag.
if ( strspn( $this->html, "\0 \t\n\f\r", $cdata_content_start, $cdata_content_length ) !== $cdata_content_length ) {
$this->state->frameset_ok = false;
}
$this->insert_foreign_element( $this->state->current_token, false );
return true;
/*
* > A comment token
*/
case '#comment':
case '#funky-comment':
case '#presumptuous-tag':
$this->insert_foreign_element( $this->state->current_token, false );
return true;
/*
* > A DOCTYPE token
*/
case 'html':
// Parse error: ignore the token.
return $this->step();
/*
* > A start tag whose tag name is "b", "big", "blockquote", "body", "br", "center",
* > "code", "dd", "div", "dl", "dt", "em", "embed", "h1", "h2", "h3", "h4", "h5",
* > "h6", "head", "hr", "i", "img", "li", "listing", "menu", "meta", "nobr", "ol",
* > "p", "pre", "ruby", "s", "small", "span", "strong", "strike", "sub", "sup",
* > "table", "tt", "u", "ul", "var"
*
* > A start tag whose name is "font", if the token has any attributes named "color", "face", or "size"
*
* > An end tag whose tag name is "br", "p"
*
* Closing BR tags are always reported by the Tag Processor as opening tags.
*/
case '+B':
case '+BIG':
case '+BLOCKQUOTE':
case '+BODY':
case '+BR':
case '+CENTER':
case '+CODE':
case '+DD':
case '+DIV':
case '+DL':
case '+DT':
case '+EM':
case '+EMBED':
case '+H1':
case '+H2':
case '+H3':
case '+H4':
case '+H5':
case '+H6':
case '+HEAD':
case '+HR':
case '+I':
case '+IMG':
case '+LI':
case '+LISTING':
case '+MENU':
case '+META':
case '+NOBR':
case '+OL':
case '+P':
case '+PRE':
case '+RUBY':
case '+S':
case '+SMALL':
case '+SPAN':
case '+STRONG':
case '+STRIKE':
case '+SUB':
case '+SUP':
case '+TABLE':
case '+TT':
case '+U':
case '+UL':
case '+VAR':
case '+FONT with attributes':
case '-BR':
case '-P':
// @todo Indicate a parse error once it's possible.
/*
* Pop open elements until reaching a node whose integration node type
* is 'math' or 'html', or whose namespace is 'html'. That node is left
* on the stack.
*/
foreach ( $this->state->stack_of_open_elements->walk_up() as $current_node ) {
if (
'math' === $current_node->integration_node_type ||
'html' === $current_node->integration_node_type ||
'html' === $current_node->namespace
) {
break;
}
$this->state->stack_of_open_elements->pop();
}
// Jump to the insertion-mode dispatch, whose label sits inside the end-tag block below.
goto in_foreign_content_process_in_current_insertion_mode;
}
/*
* > Any other start tag
*
* Every token that is not a tag closer and was not handled by the switch
* above is inserted here.
*/
if ( ! $this->is_tag_closer() ) {
$this->insert_foreign_element( $this->state->current_token, false );
/*
* > If the token has its self-closing flag set, then run
* > the appropriate steps from the following list:
* >
* > ↪ the token's tag name is "script", and the new current node is in the SVG namespace
* > Acknowledge the token's self-closing flag, and then act as
* > described in the steps for a "script" end tag below.
* >
* > ↪ Otherwise
* > Pop the current node off the stack of open elements and
* > acknowledge the token's self-closing flag.
*
* Since the rules for SCRIPT below indicate to pop the element off of the stack of
* open elements, which is the same for the Otherwise condition, there's no need to
* separate these checks. The difference comes when a parser operates with the scripting
* flag enabled, and executes the script, which this parser does not support.
*/
if ( $this->state->current_token->has_self_closing_flag ) {
$this->state->stack_of_open_elements->pop();
}
return true;
}
/*
* > An end tag whose name is "script", if the current node is an SVG script element.
*
* NOTE(review): the quoted rule speaks of the current node, while this
* condition inspects the current token's name and namespace - confirm
* that this is intended.
*/
if ( $this->is_tag_closer() && 'SCRIPT' === $this->state->current_token->node_name && 'svg' === $this->state->current_token->namespace ) {
$this->state->stack_of_open_elements->pop();
return true;
}
/*
* > Any other end tag
*/
if ( $this->is_tag_closer() ) {
// Start the walk at the current node of the stack of open elements.
$node = $this->state->stack_of_open_elements->current_node();
if ( $tag_name !== $node->node_name ) {
// @todo Indicate a parse error once it's possible.
}
// Loop head: each pass examines `$node`, then moves it one step via walk_up() below.
in_foreign_content_end_tag_loop:
// NOTE(review): `at( 1 )` is presumably the first (root) element on the stack - confirm the index base.
if ( $node === $this->state->stack_of_open_elements->at( 1 ) ) {
return true;
}
/*
* > If node's tag name, converted to ASCII lowercase, is the same as the tag name
* > of the token, pop elements from the stack of open elements until node has
* > been popped from the stack, and then return.
*/
if ( 0 === strcasecmp( $node->node_name, $tag_name ) ) {
// Each iteration pops once; stop as soon as the popped item is `$node`.
foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
$this->state->stack_of_open_elements->pop();
if ( $node === $item ) {
return true;
}
}
}
// Advance `$node` to the first item yielded by walk_up( $node ); only one item is consumed.
foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $item ) {
$node = $item;
break;
}
// Keep looping while the node is outside the HTML namespace.
if ( 'html' !== $node->namespace ) {
goto in_foreign_content_end_tag_loop;
}
/*
* Shared tail: process the token using the rules for the current insertion
* mode. Reached by falling out of the loop above, or by the `goto` in the
* "break out" cases of the switch near the top of this function.
*/
in_foreign_content_process_in_current_insertion_mode:
switch ( $this->state->insertion_mode ) {
case WP_HTML_Processor_State::INSERTION_MODE_INITIAL:
return $this->step_initial();
case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HTML:
return $this->step_before_html();
case WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD:
return $this->step_before_head();
case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD:
return $this->step_in_head();
case WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD_NOSCRIPT:
return $this->step_in_head_noscript();
case WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD:
return $this->step_after_head();
case WP_HTML_Processor_State::INSERTION_MODE_IN_BODY:
return $this->step_in_body();
case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE:
return $this->step_in_table();
case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_TEXT:
return $this->step_in_table_text();
case WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION:
return $this->step_in_caption();
case WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP:
return $this->step_in_column_group();
case WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY:
return $this->step_in_table_body();
case WP_HTML_Processor_State::INSERTION_MODE_IN_ROW:
return $this->step_in_row();
case WP_HTML_Processor_State::INSERTION_MODE_IN_CELL:
return $this->step_in_cell();
case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT:
return $this->step_in_select();
case WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE:
return $this->step_in_select_in_table();
case WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE:
return $this->step_in_template();
case WP_HTML_Processor_State::INSERTION_MODE_AFTER_BODY:
return $this->step_after_body();
case WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET:
return $this->step_in_frameset();
case WP_HTML_Processor_State::INSERTION_MODE_AFTER_FRAMESET:
return $this->step_after_frameset();
case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_BODY:
return $this->step_after_after_body();
case WP_HTML_Processor_State::INSERTION_MODE_AFTER_AFTER_FRAMESET:
return $this->step_after_after_frameset();
// This should be unreachable but PHP doesn't have total type checking on switch.
default:
$this->bail( "Unaware of the requested parsing mode: '{$this->state->insertion_mode}'." );
}
}
// Safety net: every path above is expected to have returned or bailed already.
$this->bail( 'Should not have been able to reach end of IN FOREIGN CONTENT processing. Check HTML API code.' );
// This unnecessary return prevents tools from inaccurately reporting type errors.
return false;
}
/*
* Internal helpers
*/
/**
* Creates a new bookmark for the currently-matched token and returns the generated name.
*
* @since 6.4.0
* @since 6.5.0 Renamed from bookmark_tag() to bookmark_token().
*
* @throws Exception When unable to allocate requested bookmark.
*
* @return string|false Name of created bookmark, or false if unable to create.
*/
private function bookmark_token() {
	// The counter is advanced before the attempt, so a failed attempt still consumes a number.
	$was_bookmarked = parent::set_bookmark( ++$this->bookmark_counter );

	if ( $was_bookmarked ) {
		// Bookmark names are the counter value rendered as a string.
		return "{$this->bookmark_counter}";
	}

	// Record the failure on the processor before raising it to the caller.
	$this->last_error = self::ERROR_EXCEEDED_MAX_BOOKMARKS;
	throw new Exception( 'could not allocate bookmark' );
}
/*
* HTML semantic overrides for Tag Processor
*/
/**
* Indicates the namespace of the current token, or "html" if there is none.
*
* @return string One of "html", "math", or "svg".
*/
public function get_namespace(): string {
	// Prefer the namespace recorded on the current element's token; otherwise defer to the parent class.
	return isset( $this->current_element )
		? $this->current_element->token->namespace
		: parent::get_namespace();
}
/**
* Returns the uppercase name of the matched tag.
*
* The semantic rules for HTML specify that certain tags be reprocessed
* with a different tag name. Because of this, the tag name presented
* by the HTML Processor may differ from the one reported by the HTML
* Tag Processor, which doesn't apply these semantic rules.
*
* Example:
*
* $processor = new WP_HTML_Tag_Processor( '<div class="test">Test</div>' );
* $processor->next_tag() === true;
* $processor->get_tag() === 'DIV';
*
* $processor->next_tag() === false;
* $processor->get_tag() === null;
*
* @since 6.4.0
*
* @return string|null Name of currently matched tag in input HTML, or `null` if none found.
*/
public function get_tag(): ?string {
	// No tag is reported once the processor has recorded an error.
	if ( null !== $this->last_error ) {
		return null;
	}

	// Virtual nodes take their name from the current element's token.
	if ( $this->is_virtual() ) {
		return $this->current_element->token->node_name;
	}

	$reported_name = parent::get_tag();
	if ( 'IMAGE' !== $reported_name ) {
		return $reported_name;
	}

	/*
	 * > A start tag whose tag name is "image"
	 * > Change the token's tag name to "img" and reprocess it. (Don't ask.)
	 *
	 * The rename only applies to elements in the HTML namespace.
	 */
	if ( 'html' === $this->get_namespace() ) {
		return 'IMG';
	}

	return $reported_name;
}
/**
* Indicates if the currently matched tag contains the self-closing flag.
*
* No HTML elements ought to have the self-closing flag and for those, the self-closing
* flag will be ignored. For void elements this is benign because they "self close"
* automatically. For non-void HTML elements though problems will appear if someone
* intends to use a self-closing element in place of that element with an empty body.
* For HTML foreign elements and custom elements the self-closing flag determines if
* they self-close or not.
*
* This function does not determine if a tag is self-closing,
* but only if the self-closing flag is present in the syntax.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return bool Whether the currently matched tag contains the self-closing flag.
*/
public function has_self_closing_flag(): bool {
	// Virtual nodes never report the flag.
	if ( $this->is_virtual() ) {
		return false;
	}

	return parent::has_self_closing_flag();
}
/**
* Returns the node name represented by the token.
*
* This matches the DOM API value `nodeName`. Some values
* are static, such as `#text` for a text node, while others
* are dynamically generated from the token itself.
*
* Dynamic names:
* - Uppercase tag name for tag matches.
* - `html` for DOCTYPE declarations.
*
* Note that if the Tag Processor is not matched on a token
* then this function will return `null`, either because it
* hasn't yet found a token or because it reached the end
* of the document without matching a token.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return string|null Name of the matched token.
*/
public function get_token_name(): ?string {
	// Virtual nodes take their name from the current element's token.
	if ( $this->is_virtual() ) {
		return $this->current_element->token->node_name;
	}

	return parent::get_token_name();
}
/**
* Indicates the kind of matched token, if any.
*
* This differs from `get_token_name()` in that it always
* returns a static string indicating the type, whereas
* `get_token_name()` may return values derived from the
* token itself, such as a tag name or processing
* instruction tag.
*
* Possible values:
* - `#tag` when matched on a tag.
* - `#text` when matched on a text node.
* - `#cdata-section` when matched on a CDATA node.
* - `#comment` when matched on a comment.
* - `#doctype` when matched on a DOCTYPE declaration.
* - `#presumptuous-tag` when matched on an empty tag closer.
* - `#funky-comment` when matched on a funky comment.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return string|null What kind of token is matched, or null.
*/
public function get_token_type(): ?string {
	if ( ! $this->is_virtual() ) {
		return parent::get_token_type();
	}

	/*
	 * This logic comes from the Tag Processor.
	 *
	 * @todo It would be ideal not to repeat this here, but it's not clearly
	 *       better to allow passing a token name to `get_token_type()`.
	 */
	$node_name  = $this->current_element->token->node_name;
	$first_char = $node_name[0];

	// A node name starting with an uppercase ASCII letter is a tag name.
	if ( $first_char >= 'A' && $first_char <= 'Z' ) {
		return '#tag';
	}

	// DOCTYPE tokens carry the node name `html`; any other name is already the type.
	return 'html' === $node_name ? '#doctype' : $node_name;
}
/**
* Returns the value of a requested attribute from a matched tag opener if that attribute exists.
*
* Example:
*
* $p = WP_HTML_Processor::create_fragment( '<div enabled class="test" data-test-id="14">Test</div>' );
* $p->next_token() === true;
* $p->get_attribute( 'data-test-id' ) === '14';
* $p->get_attribute( 'enabled' ) === true;
* $p->get_attribute( 'aria-label' ) === null;
*
* $p->next_tag() === false;
* $p->get_attribute( 'class' ) === null;
*
* @since 6.6.0 Subclassed for HTML Processor.
*
* @param string $name Name of attribute whose value is requested.
* @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
*/
public function get_attribute( $name ) {
	// Virtual nodes report no attributes.
	if ( $this->is_virtual() ) {
		return null;
	}

	return parent::get_attribute( $name );
}
/**
* Updates or creates a new attribute on the currently matched tag with the passed value.
*
* For boolean attributes special handling is provided:
* - When `true` is passed as the value, then only the attribute name is added to the tag.
* - When `false` is passed, the attribute gets removed if it existed before.
*
* For string attributes, the value is escaped using the `esc_attr` function.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $name The attribute name to target.
* @param string|bool $value The new attribute value.
* @return bool Whether an attribute value was set.
*/
public function set_attribute( $name, $value ): bool {
	// Attributes cannot be set on virtual nodes.
	if ( $this->is_virtual() ) {
		return false;
	}

	return parent::set_attribute( $name, $value );
}
/**
* Remove an attribute from the currently-matched tag.
*
* @since 6.6.0 Subclassed for HTML Processor.
*
* @param string $name The attribute name to remove.
* @return bool Whether an attribute was removed.
*/
public function remove_attribute( $name ): bool {
	// Nothing is removed from virtual nodes.
	if ( $this->is_virtual() ) {
		return false;
	}

	return parent::remove_attribute( $name );
}
/**
* Gets lowercase names of all attributes matching a given prefix in the current tag.
*
* Note that matching is case-insensitive. This is in accordance with the spec:
*
* > There must never be two or more attributes on
* > the same start tag whose names are an ASCII
* > case-insensitive match for each other.
* - HTML 5 spec
*
* Example:
*
* $p = new WP_HTML_Tag_Processor( '<div data-ENABLED class="test" DATA-test-id="14">Test</div>' );
* $p->next_tag( array( 'class_name' => 'test' ) ) === true;
* $p->get_attribute_names_with_prefix( 'data-' ) === array( 'data-enabled', 'data-test-id' );
*
* $p->next_tag() === false;
* $p->get_attribute_names_with_prefix( 'data-' ) === null;
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @see https://html.spec.whatwg.org/multipage/syntax.html#attributes-2:ascii-case-insensitive
*
* @param string $prefix Prefix of requested attribute names.
* @return array|null List of attribute names, or `null` when no tag opener is matched.
*/
public function get_attribute_names_with_prefix( $prefix ): ?array {
	// Virtual nodes report no attribute names.
	if ( $this->is_virtual() ) {
		return null;
	}

	return parent::get_attribute_names_with_prefix( $prefix );
}
/**
* Adds a new class name to the currently matched tag.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $class_name The class name to add.
* @return bool Whether the class was set to be added.
*/
public function add_class( $class_name ): bool {
	// When the current token is virtual, report that no class was queued for addition.
	if ( $this->is_virtual() ) {
		return false;
	}

	return parent::add_class( $class_name );
}
/**
* Removes a class name from the currently matched tag.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @param string $class_name The class name to remove.
* @return bool Whether the class was set to be removed.
*/
public function remove_class( $class_name ): bool {
	// When the current token is virtual, report that no class was queued for removal.
	if ( $this->is_virtual() ) {
		return false;
	}

	return parent::remove_class( $class_name );
}
/**
* Returns if a matched tag contains the given ASCII case-insensitive class name.
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @todo When reconstructing active formatting elements with attributes, find a way
* to indicate if the virtually-reconstructed formatting elements contain the
* wanted class name.
*
* @param string $wanted_class Look for this CSS class name, ASCII case-insensitive.
* @return bool|null Whether the matched tag contains the given class name, or null if not matched.
*/
public function has_class( $wanted_class ): ?bool {
	// A virtual token answers null (the "not matched" value) rather than true or false.
	if ( $this->is_virtual() ) {
		return null;
	}

	return parent::has_class( $wanted_class );
}
/**
* Generator for a foreach loop to step through each class name for the matched tag.
*
* This generator function is designed to be used inside a "foreach" loop.
*
* Example:
*
* $p = WP_HTML_Processor::create_fragment( "<div class='free lang-en'>" );
* $p->next_tag();
* foreach ( $p->class_list() as $class_name ) {
* echo "{$class_name} ";
* }
* // Outputs: "free lang-en "
*
* @since 6.6.0 Subclassed for the HTML Processor.
*/
public function class_list() {
	/*
	 * Note that a virtual token produces null here, not an iterable,
	 * so callers should not assume the result can be passed to foreach.
	 */
	if ( $this->is_virtual() ) {
		return null;
	}

	return parent::class_list();
}
/**
* Returns the modifiable text for a matched token, or an empty string.
*
* Modifiable text is text content that may be read and changed without
* changing the HTML structure of the document around it. This includes
* the contents of `#text` nodes in the HTML as well as the inner
* contents of HTML comments, Processing Instructions, and others, even
* though these nodes aren't part of a parsed DOM tree. They also contain
* the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
* other section in an HTML document which cannot contain HTML markup (DATA).
*
* If a token has no modifiable text then an empty string is returned to
* avoid needless crashing or type errors. An empty string does not mean
* that a token has modifiable text, and a token with modifiable text may
* have an empty string (e.g. a comment with no contents).
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return string
*/
public function get_modifiable_text(): string {
	// A virtual token reports an empty string instead of delegating to the parent class.
	if ( $this->is_virtual() ) {
		return '';
	}

	return parent::get_modifiable_text();
}
/**
* Indicates what kind of comment produced the comment node.
*
* Because there are different kinds of HTML syntax which produce
* comments, the Tag Processor tracks and exposes this as a type
* for the comment. Nominally only regular HTML comments exist as
* they are commonly known, but a number of unrelated syntax errors
* also produce comments.
*
* @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
* @see self::COMMENT_AS_CDATA_LOOKALIKE
* @see self::COMMENT_AS_INVALID_HTML
* @see self::COMMENT_AS_HTML_COMMENT
* @see self::COMMENT_AS_PI_NODE_LOOKALIKE
*
* @since 6.6.0 Subclassed for the HTML Processor.
*
* @return string|null
*/
public function get_comment_type(): ?string {
	// A virtual token reports null instead of delegating to the parent class.
	if ( $this->is_virtual() ) {
		return null;
	}

	return parent::get_comment_type();
}
/**
* Removes a bookmark that is no longer needed.
*
* Releasing a bookmark frees up the small
* performance overhead it requires.
*
* @since 6.4.0
*
* @param string $bookmark_name Name of the bookmark to remove.
* @return bool Whether the bookmark already existed before removal.
*/
public function release_bookmark( $bookmark_name ): bool {
	// Caller-supplied names are stored under an underscore-prefixed name in the parent class.
	$prefixed_name = '_' . $bookmark_name;

	return parent::release_bookmark( $prefixed_name );
}
/**
* Moves the internal cursor in the HTML Processor to a given bookmark's location.
*
* Be careful! Seeking backwards to a previous location resets the parser to the
* start of the document and reparses the entire contents up until it finds the
* sought-after bookmarked location.
*
* In order to prevent accidental infinite loops, there's a
* maximum limit on the number of times seek() can be called.
*
* @throws Exception When unable to allocate a bookmark for the next token in the input HTML document.
*
* @since 6.4.0
*
* @param string $bookmark_name Jump to the place in the document identified by this bookmark name.
* @return bool Whether the internal cursor was successfully moved to the bookmark's location.
*/
public function seek( $bookmark_name ): bool {
	// Flush any pending updates to the document before beginning.
	$this->get_updated_html();

	// Caller-supplied names are stored with an underscore prefix.
	$actual_bookmark_name = "_{$bookmark_name}";

	/*
	 * A bookmark that does not exist has no location to move to. Fail early
	 * instead of reading a missing entry: doing so would yield a null start
	 * offset, be classified as a backward seek, and reset the parser state
	 * only to walk the entire document and still return false.
	 */
	if ( ! isset( $this->bookmarks[ $actual_bookmark_name ] ) ) {
		return false;
	}

	// Before any token has been matched the processor is treated as sitting at offset zero.
	$processor_started_at = $this->state->current_token
		? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start
		: 0;
	$bookmark_starts_at   = $this->bookmarks[ $actual_bookmark_name ]->start;
	$direction            = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward';

	/*
	 * If seeking backwards, it's possible that the sought-after bookmark exists within an element
	 * which has been closed before the current cursor; in other words, it has already been removed
	 * from the stack of open elements. This means that it's insufficient to simply pop off elements
	 * from the stack of open elements which appear after the bookmarked location and then jump to
	 * that location, as the elements which were open before won't be re-opened.
	 *
	 * In order to maintain consistency, the HTML Processor rewinds to the start of the document
	 * and reparses everything until it finds the sought-after bookmark.
	 *
	 * There are potentially better ways to do this: cache the parser state for each bookmark and
	 * restore it when seeking; store an immutable and idempotent register of where elements open
	 * and close.
	 *
	 * If caching the parser state it will be essential to properly maintain the cached stack of
	 * open elements and active formatting elements when modifying the document. This could be a
	 * tedious and time-consuming process as well, and so for now will not be performed.
	 *
	 * It may be possible to track bookmarks for where elements open and close, and in doing so
	 * be able to quickly recalculate breadcrumbs for any element in the document. It may even
	 * be possible to remove the stack of open elements and compute it on the fly this way.
	 * If doing this, the parser would need to track the opening and closing locations for all
	 * tokens in the breadcrumb path for any and all bookmarks. By utilizing bookmarks themselves
	 * this list could be automatically maintained while modifying the document. Finding the
	 * breadcrumbs would then amount to traversing that list from the start until the token
	 * being inspected. Once an element closes, if there are no bookmarks pointing to locations
	 * within that element, then all of these locations may be forgotten to save on memory use
	 * and computation time.
	 */
	if ( 'backward' === $direction ) {
		/*
		 * Instead of clearing the parser state and starting fresh, calling the stack methods
		 * maintains the proper flags in the parser.
		 */
		foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
			if ( 'context-node' === $item->bookmark_name ) {
				break;
			}

			$this->state->stack_of_open_elements->remove_node( $item );
		}

		foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
			if ( 'context-node' === $item->bookmark_name ) {
				break;
			}

			$this->state->active_formatting_elements->remove_node( $item );
		}

		// Rewind the underlying Tag Processor and restore the starting parser state.
		parent::seek( 'context-node' );
		$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
		$this->state->frameset_ok    = true;
		$this->element_queue         = array();
		$this->current_element       = null;

		// With a context node, only the first two breadcrumbs are kept; otherwise none are.
		if ( isset( $this->context_node ) ) {
			$this->breadcrumbs = array_slice( $this->breadcrumbs, 0, 2 );
		} else {
			$this->breadcrumbs = array();
		}
	}

	/*
	 * In either direction the current token may already start at the bookmarked
	 * location. The token is checked for existence first, matching the guard
	 * used when computing the starting offset above.
	 */
	if (
		isset( $this->state->current_token ) &&
		$bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start
	) {
		return true;
	}

	// Otherwise step through the document until a token starts at the bookmarked location.
	while ( $this->next_token() ) {
		if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) {
			// Discard queued POP events so the current element is not a closing event.
			while ( isset( $this->current_element ) && WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
				$this->current_element = array_shift( $this->element_queue );
			}
			return true;
		}
	}

	// The document ended without reaching the bookmarked location.
	return false;
}
/**
* Sets a bookmark in the HTML document.
*
* Bookmarks represent specific places or tokens in the HTML
* document, such as a tag opener or closer. When applying
* edits to a document, such as setting an attribute, the
* text offsets of that token may shift; the bookmark is
* kept updated with those shifts and remains stable unless
* the entire span of text in which the token sits is removed.
*
* Release bookmarks when they are no longer needed.
*
* Example:
*
* <main><h2>Surprising fact you may not know!</h2></main>
*       ^  ^
*        \-|-- this `H2` opener bookmark tracks the token
*
* <main class="clickbait"><h2>Surprising fact you may no…
*                         ^  ^
*                          \-|-- it shifts with edits
*
* Bookmarks provide the ability to seek to a previously-scanned
* place in the HTML document. This avoids the need to re-scan
* the entire document.
*
* Example:
*
* <ul><li>One</li><li>Two</li><li>Three</li></ul>
*                             ^^^^
*                             want to note this last item
*
* $p = new WP_HTML_Tag_Processor( $html );
* $in_list = false;
* while ( $p->next_tag( array( 'tag_closers' => $in_list ? 'visit' : 'skip' ) ) ) {
* if ( 'UL' === $p->get_tag() ) {
* if ( $p->is_tag_closer() ) {
* $in_list = false;
* $p->set_bookmark( 'resume' );
* if ( $p->seek( 'last-li' ) ) {
* $p->add_class( 'last-li' );
* }
* $p->seek( 'resume' );
* $p->release_bookmark( 'last-li' );
* $p->release_bookmark( 'resume' );
* } else {
* $in_list = true;
* }
* }
*
* if ( 'LI' === $p->get_tag() ) {
* $p->set_bookmark( 'last-li' );
* }
* }
*
* Bookmarks intentionally hide the internal string offsets
* to which they refer. They are maintained internally as
* updates are applied to the HTML document and therefore
* retain their "position" - the location to which they
* originally pointed. The inability to use bookmarks with
* functions like `substr` is therefore intentional to guard
* against accidentally breaking the HTML.
*
* Because bookmarks allocate memory and require processing
* for every applied update, they are limited and require
* a name. They should not be created with programmatically-made
* names, such as "li_{$index}" with some loop. As a general
* rule they should only be created with string-literal names
* like "start-of-section" or "last-paragraph".
*
* Bookmarks are a powerful tool to enable complicated behavior.
* Consider double-checking that you need this tool if you are
* reaching for it, as inappropriate use could lead to broken
* HTML structure or unwanted processing overhead.
*
* @since 6.4.0
*
* @param string $bookmark_name Identifies this particular bookmark.
* @return bool Whether the bookmark was successfully created.
*/
public function set_bookmark( $bookmark_name ): bool {
	// Caller-supplied names are stored under an underscore-prefixed name in the parent class.
	$prefixed_name = '_' . $bookmark_name;

	return parent::set_bookmark( $prefixed_name );
}
/**
* Checks whether a bookmark with the given name exists.
*
* @since 6.5.0
*
* @param string $bookmark_name Name to identify a bookmark that potentially exists.
* @return bool Whether that bookmark exists.
*/
public function has_bookmark( $bookmark_name ): bool {
	// Look the bookmark up under the same underscore-prefixed name used when setting it.
	$prefixed_name = '_' . $bookmark_name;

	return parent::has_bookmark( $prefixed_name );
}
/*
* HTML Parsing Algorithms
*/
/**
* Closes a P element.
*
* @since 6.4.0
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#close-a-p-element
*/
private function close_a_p_element(): void {
	$paragraph_tag = 'P';

	// Generate implied end tags, treating P itself as excluded.
	$this->generate_implied_end_tags( $paragraph_tag );

	// Then pop the stack of open elements until a P has been popped.
	$this->state->stack_of_open_elements->pop_until( $paragraph_tag );
}
/**
* Closes elements that have implied end tags.
*
* @since 6.4.0
* @since 6.7.0 Full spec support.
*
* @see https://html.spec.whatwg.org/#generate-implied-end-tags
*
* @param string|null $except_for_this_element Perform as if this element doesn't exist in the stack of open elements.
*/
private function generate_implied_end_tags( ?string $except_for_this_element = null ): void {
	// Elements which may be closed implicitly by this algorithm.
	$elements_with_implied_end_tags = array(
		'DD',
		'DT',
		'LI',
		'OPTGROUP',
		'OPTION',
		'P',
		'RB',
		'RP',
		'RT',
		'RTC',
	);

	$no_exclusions = ! isset( $except_for_this_element );

	while ( true ) {
		/*
		 * The current node is nullable. Stop as soon as there is none so that
		 * the name lookup below never reads a property from null.
		 */
		$current_node = $this->state->stack_of_open_elements->current_node();
		if ( null === $current_node ) {
			return;
		}

		// The excluded element, when it is the current node, ends the algorithm.
		if ( ! $no_exclusions && $this->state->stack_of_open_elements->current_node_is( $except_for_this_element ) ) {
			return;
		}

		// Any element outside the list also ends the algorithm.
		if ( ! in_array( $current_node->node_name, $elements_with_implied_end_tags, true ) ) {
			return;
		}

		$this->state->stack_of_open_elements->pop();
	}
}
/**
* Closes elements that have implied end tags, thoroughly.
*
* See the HTML specification for an explanation why this is
* different from generating end tags in the normal sense.
*
* @since 6.4.0
* @since 6.7.0 Full spec support.
*
* @see WP_HTML_Processor::generate_implied_end_tags
* @see https://html.spec.whatwg.org/#generate-implied-end-tags
*/
private function generate_implied_end_tags_thoroughly(): void {
	// The "thorough" list adds table-related elements to the regular implied-end-tag list.
	$elements_with_implied_end_tags = array(
		'CAPTION',
		'COLGROUP',
		'DD',
		'DT',
		'LI',
		'OPTGROUP',
		'OPTION',
		'P',
		'RB',
		'RP',
		'RT',
		'RTC',
		'TBODY',
		'TD',
		'TFOOT',
		'TH',
		'THEAD',
		'TR',
	);

	while ( true ) {
		/*
		 * The current node is nullable. Stop as soon as there is none so that
		 * the name lookup below never reads a property from null.
		 */
		$current_node = $this->state->stack_of_open_elements->current_node();
		if ( null === $current_node ) {
			return;
		}

		// Stop at the first current node that is not in the list.
		if ( ! in_array( $current_node->node_name, $elements_with_implied_end_tags, true ) ) {
			return;
		}

		$this->state->stack_of_open_elements->pop();
	}
}
/**
* Returns the adjusted current node.
*
* > The adjusted current node is the context element if the parser was created as
* > part of the HTML fragment parsing algorithm and the stack of open elements
* > has only one element in it (fragment case); otherwise, the adjusted current
* > node is the current node.
*
* @see https://html.spec.whatwg.org/#adjusted-current-node
*
* @since 6.7.0
*
* @return WP_HTML_Token|null The adjusted current node.
*/
private function get_adjusted_current_node(): ?WP_HTML_Token {
	// Fragment case: a context node exists and the stack holds exactly one element.
	$use_context_node = isset( $this->context_node ) && 1 === $this->state->stack_of_open_elements->count();

	return $use_context_node
		? $this->context_node
		: $this->state->stack_of_open_elements->current_node();
}
/**
* Reconstructs the active formatting elements.
*
* > This has the effect of reopening all the formatting elements that were opened
* > in the current body, cell, or caption (whichever is youngest) that haven't
* > been explicitly closed.
*
* @since 6.4.0
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#reconstruct-the-active-formatting-elements
*
* @return bool Whether any formatting elements needed to be reconstructed.
*/
private function reconstruct_active_formatting_elements(): bool {
	/*
	 * > If there are no entries in the list of active formatting elements, then there is nothing
	 * > to reconstruct; stop this algorithm.
	 */
	if ( 0 === $this->state->active_formatting_elements->count() ) {
		return false;
	}

	$most_recent_entry = $this->state->active_formatting_elements->current_node();

	/*
	 * > If the last (most recently added) entry in the list of active formatting elements is a marker;
	 * > stop this algorithm.
	 */
	if ( 'marker' === $most_recent_entry->node_name ) {
		return false;
	}

	/*
	 * > If the last (most recently added) entry in the list of active formatting elements is an
	 * > element that is in the stack of open elements, then there is nothing to reconstruct;
	 * > stop this algorithm.
	 */
	if ( $this->state->stack_of_open_elements->contains_node( $most_recent_entry ) ) {
		return false;
	}

	// Actual reconstruction is not implemented; hand off to bail() with the reason.
	$this->bail( 'Cannot reconstruct active formatting elements when advancing and rewinding is required.' );
}
/**
* Runs the reset the insertion mode appropriately algorithm.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately
*/
private function reset_insertion_mode_appropriately(): void {
/*
* Overview: iterate the stack of open elements with walk_up() and assign
* `$this->state->insertion_mode` from the first node that matches one of
* the numbered specification rules quoted below. If no rule matches, the
* insertion mode becomes "in body" at the end of the function.
*/
// Set the first node: the first item yielded by walk_down(), or null if nothing is yielded.
$first_node = null;
foreach ( $this->state->stack_of_open_elements->walk_down() as $first_node ) {
break;
}
/*
* > 1. Let _last_ be false.
*/
$last = false;
foreach ( $this->state->stack_of_open_elements->walk_up() as $node ) {
/*
* > 2. Let _node_ be the last node in the stack of open elements.
* > 3. _Loop_: If _node_ is the first node in the stack of open elements, then set _last_
* > to true, and, if the parser was created as part of the HTML fragment parsing
* > algorithm (fragment case), set node to the context element passed to
* > that algorithm.
* > …
*/
if ( $node === $first_node ) {
$last = true;
if ( isset( $this->context_node ) ) {
$node = $this->context_node;
}
}
// All of the following rules are for matching HTML elements.
if ( 'html' !== $node->namespace ) {
continue;
}
switch ( $node->node_name ) {
/*
* > 4. If node is a `select` element, run these substeps:
* > 1. If _last_ is true, jump to the step below labeled done.
* > 2. Let _ancestor_ be _node_.
* > 3. _Loop_: If _ancestor_ is the first node in the stack of open elements,
* > jump to the step below labeled done.
* > 4. Let ancestor be the node before ancestor in the stack of open elements.
* > …
* > 7. Jump back to the step labeled _loop_.
* > 8. _Done_: Switch the insertion mode to "in select" and return.
*/
case 'SELECT':
if ( ! $last ) {
foreach ( $this->state->stack_of_open_elements->walk_up( $node ) as $ancestor ) {
// Only ancestors in the HTML namespace are compared against the names below.
if ( 'html' !== $ancestor->namespace ) {
continue;
}
switch ( $ancestor->node_name ) {
/*
* > 5. If _ancestor_ is a `template` node, jump to the step below
* > labeled _done_.
*/
case 'TEMPLATE':
// Leaves both the inner switch and the ancestor loop; "in select" is then set below.
break 2;
/*
* > 6. If _ancestor_ is a `table` node, switch the insertion mode to
* > "in select in table" and return.
*/
case 'TABLE':
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT_IN_TABLE;
return;
}
}
}
// The "done" step: reached when _last_ is true, on a TEMPLATE ancestor, or when no TABLE ancestor was found.
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_SELECT;
return;
/*
* > 5. If _node_ is a `td` or `th` element and _last_ is false, then switch the
* > insertion mode to "in cell" and return.
*/
case 'TD':
case 'TH':
if ( ! $last ) {
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_CELL;
return;
}
// _last_ is true: no mode is set here; leave the switch and continue the walk.
break;
/*
* > 6. If _node_ is a `tr` element, then switch the insertion mode to "in row"
* > and return.
*/
case 'TR':
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW;
return;
/*
* > 7. If _node_ is a `tbody`, `thead`, or `tfoot` element, then switch the
* > insertion mode to "in table body" and return.
*/
case 'TBODY':
case 'THEAD':
case 'TFOOT':
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE_BODY;
return;
/*
* > 8. If _node_ is a `caption` element, then switch the insertion mode to
* > "in caption" and return.
*/
case 'CAPTION':
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_CAPTION;
return;
/*
* > 9. If _node_ is a `colgroup` element, then switch the insertion mode to
* > "in column group" and return.
*/
case 'COLGROUP':
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_COLUMN_GROUP;
return;
/*
* > 10. If _node_ is a `table` element, then switch the insertion mode to
* > "in table" and return.
*/
case 'TABLE':
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_TABLE;
return;
/*
* > 11. If _node_ is a `template` element, then switch the insertion mode to the
* > current template insertion mode and return.
*/
case 'TEMPLATE':
// The current template insertion mode is the last entry of the template insertion mode stack.
$this->state->insertion_mode = end( $this->state->stack_of_template_insertion_modes );
return;
/*
* > 12. If _node_ is a `head` element and _last_ is false, then switch the
* > insertion mode to "in head" and return.
*/
case 'HEAD':
if ( ! $last ) {
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_HEAD;
return;
}
// _last_ is true: no mode is set here; leave the switch and continue the walk.
break;
/*
* > 13. If _node_ is a `body` element, then switch the insertion mode to "in body"
* > and return.
*/
case 'BODY':
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
return;
/*
* > 14. If _node_ is a `frameset` element, then switch the insertion mode to
* > "in frameset" and return. (fragment case)
*/
case 'FRAMESET':
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_FRAMESET;
return;
/*
* > 15. If _node_ is an `html` element, run these substeps:
* > 1. If the head element pointer is null, switch the insertion mode to
* > "before head" and return. (fragment case)
* > 2. Otherwise, the head element pointer is not null, switch the insertion
* > mode to "after head" and return.
*/
case 'HTML':
$this->state->insertion_mode = isset( $this->state->head_element )
? WP_HTML_Processor_State::INSERTION_MODE_AFTER_HEAD
: WP_HTML_Processor_State::INSERTION_MODE_BEFORE_HEAD;
return;
}
}
/*
* > 16. If _last_ is true, then switch the insertion mode to "in body"
* > and return. (fragment case)
*
* This is only reachable if `$last` is true, as per the fragment parsing case.
*/
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
}
/**
* Runs the adoption agency algorithm.
*
* @since 6.4.0
*
* @throws WP_HTML_Unsupported_Exception When encountering unsupported HTML input.
*
* @see https://html.spec.whatwg.org/#adoption-agency-algorithm
*/
private function run_adoption_agency_algorithm(): void {
// Upper bound on the number of passes of the outer loop below.
$budget = 1000;
// The subject is the tag name of the currently-matched token.
$subject = $this->get_tag();
$current_node = $this->state->stack_of_open_elements->current_node();
if (
// > If the current node is an HTML element whose tag name is subject
$current_node && $subject === $current_node->node_name &&
// > the current node is not in the list of active formatting elements
! $this->state->active_formatting_elements->contains_node( $current_node )
) {
// Simple case: pop the current node and finish.
$this->state->stack_of_open_elements->pop();
return;
}
$outer_loop_counter = 0;
while ( $budget-- > 0 ) {
// Return without further changes once eight passes have already begun.
if ( $outer_loop_counter++ >= 8 ) {
return;
}
/*
* > Let formatting element be the last element in the list of active formatting elements that:
* > - is between the end of the list and the last marker in the list,
* > if any, or the start of the list otherwise,
* > - and has the tag name subject.
*/
$formatting_element = null;
foreach ( $this->state->active_formatting_elements->walk_up() as $item ) {
// A marker ends the search without a match.
if ( 'marker' === $item->node_name ) {
break;
}
if ( $subject === $item->node_name ) {
$formatting_element = $item;
break;
}
}
// > If there is no such element, then return and instead act as described in the "any other end tag" entry above.
if ( null === $formatting_element ) {
// That fallback is not implemented here; the case is handed to bail().
$this->bail( 'Cannot run adoption agency when "any other end tag" is required.' );
}
// > If formatting element is not in the stack of open elements, then this is a parse error; remove the element from the list, and return.
if ( ! $this->state->stack_of_open_elements->contains_node( $formatting_element ) ) {
$this->state->active_formatting_elements->remove_node( $formatting_element );
return;
}
// > If formatting element is in the stack of open elements, but the element is not in scope, then this is a parse error; return.
if ( ! $this->state->stack_of_open_elements->has_element_in_scope( $formatting_element->node_name ) ) {
return;
}
/*
* > Let furthest block be the topmost node in the stack of open elements that is lower in the stack
* > than formatting element, and is an element in the special category. There might not be one.
*/
$is_above_formatting_element = true;
$furthest_block = null;
foreach ( $this->state->stack_of_open_elements->walk_down() as $item ) {
// Skip every item yielded before the formatting element (matched by bookmark name).
if ( $is_above_formatting_element && $formatting_element->bookmark_name !== $item->bookmark_name ) {
continue;
}
// Skip the formatting element itself; only items yielded after it are candidates.
if ( $is_above_formatting_element ) {
$is_above_formatting_element = false;
continue;
}
if ( self::is_special( $item ) ) {
$furthest_block = $item;
break;
}
}
/*
* > If there is no furthest block, then the UA must first pop all the nodes from the bottom of the
* > stack of open elements, from the current node up to and including formatting element, then
* > remove formatting element from the list of active formatting elements, and finally return.
*/
if ( null === $furthest_block ) {
// pop() is called once per item yielded, so the stack shrinks in step with the walk.
foreach ( $this->state->stack_of_open_elements->walk_up() as $item ) {
$this->state->stack_of_open_elements->pop();
if ( $formatting_element->bookmark_name === $item->bookmark_name ) {
$this->state->active_formatting_elements->remove_node( $formatting_element );
return;
}
}
}
// Reached when a furthest block exists: the remaining steps of the algorithm are not implemented.
$this->bail( 'Cannot extract common ancestor in adoption agency algorithm.' );
}
// Reached only if the loop above ends without returning.
$this->bail( 'Cannot run adoption agency when looping required.' );
}
/**
* Runs the "close the cell" algorithm.
*
* > Where the steps above say to close the cell, they mean to run the following algorithm:
* > 1. Generate implied end tags.
* > 2. If the current node is not now a td element or a th element, then this is a parse error.
* > 3. Pop elements from the stack of open elements stack until a td element or a th element has been popped from the stack.
* > 4. Clear the list of active formatting elements up to the last marker.
* > 5. Switch the insertion mode to "in row".
*
* @see https://html.spec.whatwg.org/multipage/parsing.html#close-the-cell
*
* @since 6.7.0
*/
private function close_cell(): void {
// Step 1: generate implied end tags with no excluded element.
$this->generate_implied_end_tags();
// @todo Parse error if the current node is not a "td" or "th" element.
// Step 3: pop once per item walked, stopping right after a TD or TH has been popped.
foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) {
$this->state->stack_of_open_elements->pop();
if ( 'TD' === $element->node_name || 'TH' === $element->node_name ) {
break;
}
}
// Step 4: clear the list of active formatting elements up to the last marker.
$this->state->active_formatting_elements->clear_up_to_last_marker();
// Step 5: switch the insertion mode to "in row".
$this->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_ROW;
}
/**
* Inserts an HTML element on the stack of open elements.
*
* @since 6.4.0
*
* @see https://html.spec.whatwg.org/#insert-a-foreign-element
*
* @param WP_HTML_Token $token Token for the element to push onto the stack of open elements.
*/
private function insert_html_element( WP_HTML_Token $token ): void {
	// Insertion here means pushing the token onto the stack of open elements.
	$open_elements = $this->state->stack_of_open_elements;
	$open_elements->push( $token );
}
/**
* Inserts a foreign element on to the stack of open elements.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#insert-a-foreign-element
*
* @param WP_HTML_Token $token Insert this token. The token's namespace and
* insertion point will be updated correctly.
* @param bool $only_add_to_element_stack Whether to skip the "insert an element at the adjusted
* insertion location" algorithm when adding this element.
*/
private function insert_foreign_element( WP_HTML_Token $token, bool $only_add_to_element_stack ): void {
	// The token takes the namespace of the adjusted current node, or "html" when there is none.
	$parent_node = $this->get_adjusted_current_node();
	if ( $parent_node ) {
		$token->namespace = $parent_node->namespace;
	} else {
		$token->namespace = 'html';
	}

	// Record the integration node type; the MathML check takes precedence over the HTML check.
	if ( $this->is_mathml_integration_point() ) {
		$token->integration_node_type = 'math';
	} elseif ( $this->is_html_integration_point() ) {
		$token->integration_node_type = 'html';
	}

	if ( ! $only_add_to_element_stack ) {
		/*
		 * @todo Implement the "appropriate place for inserting a node" and the
		 *       "insert an element at the adjusted insertion location" algorithms.
		 *
		 * These algorithms mostly impacts DOM tree construction and not the HTML API.
		 * Here, there's no DOM node onto which the element will be appended, so the
		 * parser will skip this step.
		 *
		 * @see https://html.spec.whatwg.org/#insert-an-element-at-the-adjusted-insertion-location
		 */
	}

	// In both cases the token ends up on the stack of open elements.
	$this->insert_html_element( $token );
}
/**
* Inserts a virtual element on the stack of open elements.
*
* @since 6.7.0
*
* @param string $token_name Name of token to create and insert into the stack of open elements.
* @param string|null $bookmark_name Optional. Name to give bookmark for created virtual node.
* Defaults to auto-creating a bookmark name.
* @return WP_HTML_Token Newly-created virtual token.
*/
private function insert_virtual_node( $token_name, $bookmark_name = null ): WP_HTML_Token {
	// The virtual node is anchored where the current token starts.
	$current_span = $this->bookmarks[ $this->state->current_token->bookmark_name ];

	// Use the supplied bookmark name, or generate one when none was given.
	$virtual_bookmark = isset( $bookmark_name ) ? $bookmark_name : $this->bookmark_token();

	// A zero-length span marks the location of the virtual node.
	$this->bookmarks[ $virtual_bookmark ] = new WP_HTML_Span( $current_span->start, 0 );

	$virtual_token = new WP_HTML_Token( $virtual_bookmark, $token_name, false );
	$this->insert_html_element( $virtual_token );

	return $virtual_token;
}
/*
* HTML Specification Helpers
*/
/**
* Indicates if the current token is a MathML integration point.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#mathml-text-integration-point
*
* @return bool Whether the current token is a MathML integration point.
*/
private function is_mathml_integration_point(): bool {
	$token = $this->state->current_token;

	// Without a current token there is nothing to classify.
	if ( ! isset( $token ) ) {
		return false;
	}

	// Only elements in the MathML namespace qualify.
	if ( 'math' !== $token->namespace ) {
		return false;
	}

	// Every qualifying name starts with "M"; reject all other names early.
	$node_name = $token->node_name;
	if ( 'M' !== $node_name[0] ) {
		return false;
	}

	return in_array( $node_name, array( 'MI', 'MO', 'MN', 'MS', 'MTEXT' ), true );
}
/**
* Indicates if the current token is an HTML integration point.
*
* Note that this method must be an instance method with access
* to the current token, since it needs to examine the attributes
* of the currently-matched tag, if it's in the MathML namespace.
* Otherwise it would be required to scan the HTML and ensure that
* no other accounting is overlooked.
*
* @since 6.7.0
*
* @see https://html.spec.whatwg.org/#html-integration-point
*
* @return bool Whether the current token is an HTML integration point.
*/
private function is_html_integration_point(): bool {
	$token = $this->state->current_token;

	// Without a current token there is nothing to classify.
	if ( ! isset( $token ) ) {
		return false;
	}

	$namespace = $token->namespace;

	// Elements in the HTML namespace are never integration points.
	if ( 'html' === $namespace ) {
		return false;
	}

	$node_name = $token->node_name;

	// SVG: only these three element names qualify.
	if ( 'svg' === $namespace ) {
		return in_array( $node_name, array( 'DESC', 'FOREIGNOBJECT', 'TITLE' ), true );
	}

	// MathML: only ANNOTATION-XML qualifies, and only with one of two `encoding` values.
	if ( 'math' === $namespace ) {
		if ( 'ANNOTATION-XML' !== $node_name ) {
			return false;
		}

		$encoding = $this->get_attribute( 'encoding' );
		if ( ! is_string( $encoding ) ) {
			return false;
		}

		// The encoding is compared without regard to ASCII case.
		return (
			0 === strcasecmp( $encoding, 'application/xhtml+xml' ) ||
			0 === strcasecmp( $encoding, 'text/html' )
		);
	}

	// Any other namespace is unexpected.
	$this->bail( 'Should not have reached end of HTML Integration Point detection: check HTML API code.' );

	// This unnecessary return prevents tools from inaccurately reporting type errors.
	return false;
}
/**
* Returns whether an element of a given name is in the HTML special category.
*
* @since 6.4.0
*
* @see https://html.spec.whatwg.org/#special
*
* @param WP_HTML_Token|string $tag_name Node to check, or only its name if in the HTML namespace.
* @return bool Whether the element of the given name is in the special category.
*/
public static function is_special( $tag_name ): bool {
	/*
	 * Build the lookup key. A string is uppercased as-is. A token in the HTML
	 * namespace contributes its uppercased node name; a token in any other
	 * namespace contributes "{namespace} {node_name}" without case changes.
	 */
	if ( is_string( $tag_name ) ) {
		$lookup_key = strtoupper( $tag_name );
	} elseif ( 'html' === $tag_name->namespace ) {
		$lookup_key = strtoupper( $tag_name->node_name );
	} else {
		$lookup_key = "{$tag_name->namespace} {$tag_name->node_name}";
	}

	$special_elements = array(
		// HTML.
		'ADDRESS',
		'APPLET',
		'AREA',
		'ARTICLE',
		'ASIDE',
		'BASE',
		'BASEFONT',
		'BGSOUND',
		'BLOCKQUOTE',
		'BODY',
		'BR',
		'BUTTON',
		'CAPTION',
		'CENTER',
		'COL',
		'COLGROUP',
		'DD',
		'DETAILS',
		'DIR',
		'DIV',
		'DL',
		'DT',
		'EMBED',
		'FIELDSET',
		'FIGCAPTION',
		'FIGURE',
		'FOOTER',
		'FORM',
		'FRAME',
		'FRAMESET',
		'H1',
		'H2',
		'H3',
		'H4',
		'H5',
		'H6',
		'HEAD',
		'HEADER',
		'HGROUP',
		'HR',
		'HTML',
		'IFRAME',
		'IMG',
		'INPUT',
		'KEYGEN',
		'LI',
		'LINK',
		'LISTING',
		'MAIN',
		'MARQUEE',
		'MENU',
		'META',
		'NAV',
		'NOEMBED',
		'NOFRAMES',
		'NOSCRIPT',
		'OBJECT',
		'OL',
		'P',
		'PARAM',
		'PLAINTEXT',
		'PRE',
		'SCRIPT',
		'SEARCH',
		'SECTION',
		'SELECT',
		'SOURCE',
		'STYLE',
		'SUMMARY',
		'TABLE',
		'TBODY',
		'TD',
		'TEMPLATE',
		'TEXTAREA',
		'TFOOT',
		'TH',
		'THEAD',
		'TITLE',
		'TR',
		'TRACK',
		'UL',
		'WBR',
		'XMP',

		// MathML.
		'math MI',
		'math MO',
		'math MN',
		'math MS',
		'math MTEXT',
		'math ANNOTATION-XML',

		// SVG.
		'svg DESC',
		'svg FOREIGNOBJECT',
		'svg TITLE',
	);

	// Strict comparison keeps the match exact.
	return in_array( $lookup_key, $special_elements, true );
}
/**
 * Determines whether a tag name refers to an HTML void element.
 *
 * The HTML specification lists the following void elements:
 *
 * > area, base, br, col, embed, hr, img, input, link, meta, source, track, wbr
 *
 * Beyond that list, this check also reports BASEFONT, BGSOUND, FRAME,
 * KEYGEN, and PARAM as void. The tag name is upper-cased before it is
 * compared, so callers may pass it in any letter case.
 *
 * @since 6.4.0
 *
 * @see https://html.spec.whatwg.org/#void-elements
 *
 * @param string $tag_name Name of HTML tag to check.
 * @return bool Whether the given tag is an HTML Void Element.
 */
public static function is_void( $tag_name ): bool {
	$void_elements = array(
		// Void elements named by the HTML specification.
		'AREA',
		'BASE',
		'BR',
		'COL',
		'EMBED',
		'HR',
		'IMG',
		'INPUT',
		'LINK',
		'META',
		'SOURCE',
		'TRACK',
		'WBR',
		// Additional elements which are still treated as void.
		'BASEFONT',
		'BGSOUND',
		'FRAME',
		'KEYGEN',
		'PARAM',
	);

	// Strict lookup: only an exact match on the upper-cased name counts.
	return in_array( strtoupper( $tag_name ), $void_elements, true );
}
/**
 * Resolves an encoding label to the name of a known encoding.
 *
 * This follows the "get an encoding" algorithm defined in the WHAT-WG
 * Encoding specification: surrounding ASCII whitespace is removed from
 * the label, which is then matched case-insensitively against the list
 * of recognized labels.
 *
 * Example:
 *
 *     'UTF-8' === self::get_encoding( 'utf8' );
 *     'UTF-8' === self::get_encoding( " \tUTF-8 " );
 *     null    === self::get_encoding( 'UTF-7' );
 *     null    === self::get_encoding( 'utf8; charset=' );
 *
 * @see https://encoding.spec.whatwg.org/#concept-encoding-get
 *
 * @todo As this parser only supports UTF-8, only the UTF-8
 *       encodings are detected. Add more as desired, but the
 *       parser will bail on non-UTF-8 encodings.
 *
 * @since 6.7.0
 *
 * @param string $label A string which may specify a known encoding.
 * @return string|null Known encoding if matched, otherwise null.
 */
protected static function get_encoding( string $label ): ?string {
	/*
	 * > Remove any leading and trailing ASCII whitespace from label.
	 *
	 * Lower-casing here lets the lookup below act as the
	 * case-insensitive match the algorithm calls for.
	 */
	$candidate = strtolower( trim( $label, " \t\f\r\n" ) );

	/*
	 * > If label is an ASCII case-insensitive match for any of the labels listed in the
	 * > table below, then return the corresponding encoding; otherwise return failure.
	 */
	$utf8_labels = array(
		'unicode-1-1-utf-8',
		'unicode11utf8',
		'unicode20utf8',
		'utf-8',
		'utf8',
		'x-unicode20utf8',
	);

	if ( in_array( $candidate, $utf8_labels, true ) ) {
		return 'UTF-8';
	}

	return null;
}
/*
* Constants that would pollute the top of the class if they were found there.
*/
/**
* Indicates that the next HTML token should be parsed and processed.
*
* @since 6.4.0
*
* @var string
*/
const PROCESS_NEXT_NODE = 'process-next-node';
/**
* Indicates that the current HTML token should be reprocessed in the newly-selected insertion mode.
*
* Insertion-mode handlers pass this value to WP_HTML_Processor::step()
* after they have switched the insertion mode.
*
* @since 6.4.0
*
* @see WP_HTML_Processor::step
*
* @var string
*/
const REPROCESS_CURRENT_NODE = 'reprocess-current-node';
/**
* Indicates that the current HTML token should be processed without advancing the parser.
*
* @since 6.5.0
*
* @var string
*/
const PROCESS_CURRENT_NODE = 'process-current-node';
/**
* Indicates that the parser encountered unsupported markup and has bailed.
*
* @since 6.4.0
*
* @var string
*/
const ERROR_UNSUPPORTED = 'unsupported';
/**
* Indicates that the parser encountered more HTML tokens than it
* was able to process and has bailed.
*
* @since 6.4.0
*
* @var string
*/
const ERROR_EXCEEDED_MAX_BOOKMARKS = 'exceeded-max-bookmarks';
/**
* Unlock code that must be passed into the constructor to create this class.
*
* This class extends the WP_HTML_Tag_Processor, which has a public class
* constructor. Therefore, it's not possible to have a private constructor here.
*
* This unlock code is used to ensure that anyone calling the constructor is
* doing so with a full understanding that it's intended to be a private API.
*
* @access private
*
* @see WP_HTML_Processor::create_fragment
*
* @var string
*/
const CONSTRUCTOR_UNLOCK_CODE = 'Use WP_HTML_Processor::create_fragment() instead of calling the class constructor directly.';
}