4 * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
5 * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts.
8 * Recent changes to PHP's DOM extension have resulted in some fatal
9 * error conditions with the original version of PH5P. Pending changes,
10 * this lexer will punt to DirectLex if DOM throws an exception.
13 class HTMLPurifier_Lexer_PH5P
extends HTMLPurifier_Lexer_DOMLex
{
15 public function tokenizeHTML($html, $config, $context) {
16 $new_html = $this->normalize($html, $config, $context);
17 $new_html = $this->wrapHTML($new_html, $config, $context);
19 $parser = new HTML5($new_html);
20 $doc = $parser->save();
21 } catch (DOMException
$e) {
22 // Uh oh, it failed. Punt to DirectLex.
23 $lexer = new HTMLPurifier_Lexer_DirectLex();
24 $context->register('PH5PError', $e); // save the error, so we can detect it
25 return $lexer->tokenizeHTML($html, $config, $context); // use original HTML
29 $doc->getElementsByTagName('html')->item(0)-> // <html>
30 getElementsByTagName('body')->item(0)-> // <body>
31 getElementsByTagName('div')->item(0) // <div>
40 Copyright 2007 Jeroen van der Meer <http://jero.net/>
42 Permission is hereby granted, free of charge, to any person obtaining a
43 copy of this software and associated documentation files (the
44 "Software"), to deal in the Software without restriction, including
45 without limitation the rights to use, copy, modify, merge, publish,
46 distribute, sublicense, and/or sell copies of the Software, and to
47 permit persons to whom the Software is furnished to do so, subject to
48 the following conditions:
50 The above copyright notice and this permission notice shall be included
51 in all copies or substantial portions of the Software.
53 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
54 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
55 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
56 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
57 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
58 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
59 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
70 private $content_model;
71 private $escape = false;
72 private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
73 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
74 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
75 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
76 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
77 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
78 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
79 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
80 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
81 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
82 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
83 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
84 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
85 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
86 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
87 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
88 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
89 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
90 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
91 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
92 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
93 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
94 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
95 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
96 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
97 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
98 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
99 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
100 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
101 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
102 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
103 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
104 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
105 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
106 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
107 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
108 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
109 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
110 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
111 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
112 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
113 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
/**
 * Normalises line endings in the input, initialises the tokenizer state and
 * runs the state machine until a state handler clears $this->state.
 *
 * Each iteration dispatches to the method named "<state>State" (for example
 * 'data' => dataState()). The loop ends once the state becomes null, which
 * EOF() does when the end of the input is reached.
 *
 * @param string $data Raw HTML to tokenize.
 */
public function __construct($data) {
    // Collapse CRLF to LF, then drop any remaining lone CR.
    // The replacement must be a string: passing null to str_replace() is
    // deprecated as of PHP 8.1. An empty string yields the same output.
    $data = str_replace("\r\n", "\n", $data);
    $data = str_replace("\r", '', $data);

    // NOTE(review): the cursor and buffer initialisation below is
    // reconstructed from how char()/character()/characters() read
    // $this->char and $this->data; confirm the start offset against the
    // upstream PH5P source.
    $this->char = -1;
    $this->data = $data;
    $this->EOF = strlen($data);
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;

    $this->state = 'data';

    while ($this->state !== null) {
        $this->{$this->state . 'State'}();
    }
}
/**
 * Hands back the result of the tree constructor's own save() call.
 *
 * @return mixed Whatever $this->tree->save() returns; presumably the built
 *               DOM document, since tokenizeHTML() calls
 *               getElementsByTagName() on it (verify against the tree
 *               constructor).
 */
public function save() {
    $built = $this->tree->save();

    return $built;
}
148 private function char() {
149 return ($this->char
< $this->EOF
)
150 ?
$this->data
[$this->char
]
154 private function character($s, $l = 0) {
155 if($s +
$l < $this->EOF
) {
157 return $this->data
[$s];
159 return substr($this->data
, $s, $l);
/**
 * Returns the longest run of characters, starting at byte offset $start of
 * the input, that all belong to the given PCRE character class.
 *
 * The earlier preg_replace() form returned the ENTIRE remainder of the input
 * whenever the first character was outside the class: the pattern did not
 * match, so nothing was replaced. Callers in this file use the length of the
 * result to advance the cursor (bogusCommentState) or to decide whether a
 * numeric entity had any digits (entity), so a non-match must yield ''.
 *
 * @param string $char_class Body of a PCRE bracket expression, e.g. 'A-Za-z'
 *                           or '^>'; inserted verbatim between '[' and ']'.
 * @param int    $start      Byte offset into $this->data to match from.
 * @return string The matched prefix, or '' when nothing matches.
 */
private function characters($char_class, $start) {
    $remainder = substr($this->data, $start);

    // substr() gives false (PHP < 8) or '' when $start is past the end.
    if (!is_string($remainder) || $remainder === '') {
        return '';
    }

    // Anchored, greedy match of the class: the same prefix the old
    // '#^([class]+).*#s' replacement extracted when it did match.
    if (preg_match('#^[' . $char_class . ']+#', $remainder, $match) === 1) {
        return $match[0];
    }

    return '';
}
168 private function dataState() {
169 // Consume the next input character
171 $char = $this->char();
173 if($char === '&' && ($this->content_model
=== self
::PCDATA ||
$this->content_model
=== self
::RCDATA
)) {
174 /* U+0026 AMPERSAND (&)
175 When the content model flag is set to one of the PCDATA or RCDATA
176 states: switch to the entity data state. Otherwise: treat it as per
177 the "anything else" entry below. */
178 $this->state
= 'entityData';
180 } elseif($char === '-') {
181 /* If the content model flag is set to either the RCDATA state or
182 the CDATA state, and the escape flag is false, and there are at
183 least three characters before this one in the input stream, and the
184 last four characters in the input stream, including this one, are
185 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
186 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
187 if(($this->content_model
=== self
::RCDATA ||
$this->content_model
===
188 self
::CDATA
) && $this->escape
=== false &&
189 $this->char
>= 3 && $this->character($this->char
- 4, 4) === '<!--') {
190 $this->escape
= true;
193 /* In any case, emit the input character as a character token. Stay
194 in the data state. */
195 $this->emitToken(array(
196 'type' => self
::CHARACTR
,
200 /* U+003C LESS-THAN SIGN (<) */
201 } elseif($char === '<' && ($this->content_model
=== self
::PCDATA ||
202 (($this->content_model
=== self
::RCDATA ||
203 $this->content_model
=== self
::CDATA
) && $this->escape
=== false))) {
204 /* When the content model flag is set to the PCDATA state: switch
205 to the tag open state.
207 When the content model flag is set to either the RCDATA state or
208 the CDATA state and the escape flag is false: switch to the tag
211 Otherwise: treat it as per the "anything else" entry below. */
212 $this->state
= 'tagOpen';
214 /* U+003E GREATER-THAN SIGN (>) */
215 } elseif($char === '>') {
216 /* If the content model flag is set to either the RCDATA state or
217 the CDATA state, and the escape flag is true, and the last three
218 characters in the input stream including this one are U+002D
219 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
220 set the escape flag to false. */
221 if(($this->content_model
=== self
::RCDATA ||
222 $this->content_model
=== self
::CDATA
) && $this->escape
=== true &&
223 $this->character($this->char
, 3) === '-->') {
224 $this->escape
= false;
227 /* In any case, emit the input character as a character token.
228 Stay in the data state. */
229 $this->emitToken(array(
230 'type' => self
::CHARACTR
,
234 } elseif($this->char
=== $this->EOF
) {
236 Emit an end-of-file token. */
239 } elseif($this->content_model
=== self
::PLAINTEXT
) {
240 /* When the content model flag is set to the PLAINTEXT state
241 THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
242 the text and emit it as a character token. */
243 $this->emitToken(array(
244 'type' => self
::CHARACTR
,
245 'data' => substr($this->data
, $this->char
)
252 THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters that
253 otherwise would also be treated as a character token and emit it
254 as a single character token. Stay in the data state. */
255 $len = strcspn($this->data
, '<&', $this->char
);
256 $char = substr($this->data
, $this->char
, $len);
257 $this->char +
= $len - 1;
259 $this->emitToken(array(
260 'type' => self
::CHARACTR
,
264 $this->state
= 'data';
268 private function entityDataState() {
269 // Attempt to consume an entity.
270 $entity = $this->entity();
272 // If nothing is returned, emit a U+0026 AMPERSAND character token.
273 // Otherwise, emit the character token that was returned.
274 $char = (!$entity) ?
'&' : $entity;
275 $this->emitToken(array(
276 'type' => self
::CHARACTR
,
280 // Finally, switch to the data state.
281 $this->state
= 'data';
284 private function tagOpenState() {
285 switch($this->content_model
) {
288 /* If the next input character is a U+002F SOLIDUS (/) character,
289 consume it and switch to the close tag open state. If the next
290 input character is not a U+002F SOLIDUS (/) character, emit a
291 U+003C LESS-THAN SIGN character token and switch to the data
292 state to process the next input character. */
293 if($this->character($this->char +
1) === '/') {
295 $this->state
= 'closeTagOpen';
298 $this->emitToken(array(
299 'type' => self
::CHARACTR
,
303 $this->state
= 'data';
308 // If the content model flag is set to the PCDATA state
309 // Consume the next input character:
311 $char = $this->char();
314 /* U+0021 EXCLAMATION MARK (!)
315 Switch to the markup declaration open state. */
316 $this->state
= 'markupDeclarationOpen';
318 } elseif($char === '/') {
319 /* U+002F SOLIDUS (/)
320 Switch to the close tag open state. */
321 $this->state
= 'closeTagOpen';
323 } elseif(preg_match('/^[A-Za-z]$/', $char)) {
324 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
325 Create a new start tag token, set its tag name to the lowercase
326 version of the input character (add 0x0020 to the character's code
327 point), then switch to the tag name state. (Don't emit the token
328 yet; further details will be filled in before it is emitted.) */
329 $this->token
= array(
330 'name' => strtolower($char),
331 'type' => self
::STARTTAG
,
335 $this->state
= 'tagName';
337 } elseif($char === '>') {
338 /* U+003E GREATER-THAN SIGN (>)
339 Parse error. Emit a U+003C LESS-THAN SIGN character token and a
340 U+003E GREATER-THAN SIGN character token. Switch to the data state. */
341 $this->emitToken(array(
342 'type' => self
::CHARACTR
,
346 $this->state
= 'data';
348 } elseif($char === '?') {
349 /* U+003F QUESTION MARK (?)
350 Parse error. Switch to the bogus comment state. */
351 $this->state
= 'bogusComment';
355 Parse error. Emit a U+003C LESS-THAN SIGN character token and
356 reconsume the current input character in the data state. */
357 $this->emitToken(array(
358 'type' => self
::CHARACTR
,
363 $this->state
= 'data';
369 private function closeTagOpenState() {
370 $next_node = strtolower($this->characters('A-Za-z', $this->char +
1));
371 $the_same = count($this->tree
->stack
) > 0 && $next_node === end($this->tree
->stack
)->nodeName
;
373 if(($this->content_model
=== self
::RCDATA ||
$this->content_model
=== self
::CDATA
) &&
374 (!$the_same ||
($the_same && (!preg_match('/[\t\n\x0b\x0c >\/]/',
375 $this->character($this->char +
1 +
strlen($next_node))) ||
$this->EOF
=== $this->char
)))) {
376 /* If the content model flag is set to the RCDATA or CDATA states then
377 examine the next few characters. If they do not match the tag name of
378 the last start tag token emitted (case insensitively), or if they do but
379 they are not immediately followed by one of the following characters:
380 * U+0009 CHARACTER TABULATION
381 * U+000A LINE FEED (LF)
382 * U+000B LINE TABULATION
383 * U+000C FORM FEED (FF)
385 * U+003E GREATER-THAN SIGN (>)
388 ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
389 token, a U+002F SOLIDUS character token, and switch to the data state
390 to process the next input character. */
391 $this->emitToken(array(
392 'type' => self
::CHARACTR
,
396 $this->state
= 'data';
399 /* Otherwise, if the content model flag is set to the PCDATA state,
400 or if the next few characters do match that tag name, consume the
401 next input character: */
403 $char = $this->char();
405 if(preg_match('/^[A-Za-z]$/', $char)) {
406 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
407 Create a new end tag token, set its tag name to the lowercase version
408 of the input character (add 0x0020 to the character's code point), then
409 switch to the tag name state. (Don't emit the token yet; further details
410 will be filled in before it is emitted.) */
411 $this->token
= array(
412 'name' => strtolower($char),
413 'type' => self
::ENDTAG
416 $this->state
= 'tagName';
418 } elseif($char === '>') {
419 /* U+003E GREATER-THAN SIGN (>)
420 Parse error. Switch to the data state. */
421 $this->state
= 'data';
423 } elseif($this->char
=== $this->EOF
) {
425 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
426 SOLIDUS character token. Reconsume the EOF character in the data state. */
427 $this->emitToken(array(
428 'type' => self
::CHARACTR
,
433 $this->state
= 'data';
436 /* Parse error. Switch to the bogus comment state. */
437 $this->state
= 'bogusComment';
442 private function tagNameState() {
443 // Consume the next input character:
445 $char = $this->character($this->char
);
447 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
448 /* U+0009 CHARACTER TABULATION
449 U+000A LINE FEED (LF)
450 U+000B LINE TABULATION
451 U+000C FORM FEED (FF)
453 Switch to the before attribute name state. */
454 $this->state
= 'beforeAttributeName';
456 } elseif($char === '>') {
457 /* U+003E GREATER-THAN SIGN (>)
458 Emit the current tag token. Switch to the data state. */
459 $this->emitToken($this->token
);
460 $this->state
= 'data';
462 } elseif($this->char
=== $this->EOF
) {
464 Parse error. Emit the current tag token. Reconsume the EOF
465 character in the data state. */
466 $this->emitToken($this->token
);
469 $this->state
= 'data';
471 } elseif($char === '/') {
472 /* U+002F SOLIDUS (/)
473 Parse error unless this is a permitted slash. Switch to the before
474 attribute name state. */
475 $this->state
= 'beforeAttributeName';
479 Append the current input character to the current tag token's tag name.
480 Stay in the tag name state. */
481 $this->token
['name'] .= strtolower($char);
482 $this->state
= 'tagName';
486 private function beforeAttributeNameState() {
487 // Consume the next input character:
489 $char = $this->character($this->char
);
491 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
492 /* U+0009 CHARACTER TABULATION
493 U+000A LINE FEED (LF)
494 U+000B LINE TABULATION
495 U+000C FORM FEED (FF)
497 Stay in the before attribute name state. */
498 $this->state
= 'beforeAttributeName';
500 } elseif($char === '>') {
501 /* U+003E GREATER-THAN SIGN (>)
502 Emit the current tag token. Switch to the data state. */
503 $this->emitToken($this->token
);
504 $this->state
= 'data';
506 } elseif($char === '/') {
507 /* U+002F SOLIDUS (/)
508 Parse error unless this is a permitted slash. Stay in the before
509 attribute name state. */
510 $this->state
= 'beforeAttributeName';
512 } elseif($this->char
=== $this->EOF
) {
514 Parse error. Emit the current tag token. Reconsume the EOF
515 character in the data state. */
516 $this->emitToken($this->token
);
519 $this->state
= 'data';
523 Start a new attribute in the current tag token. Set that attribute's
524 name to the current input character, and its value to the empty string.
525 Switch to the attribute name state. */
526 $this->token
['attr'][] = array(
527 'name' => strtolower($char),
531 $this->state
= 'attributeName';
535 private function attributeNameState() {
536 // Consume the next input character:
538 $char = $this->character($this->char
);
540 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
541 /* U+0009 CHARACTER TABULATION
542 U+000A LINE FEED (LF)
543 U+000B LINE TABULATION
544 U+000C FORM FEED (FF)
546 Switch to the after attribute name state. */
547 $this->state
= 'afterAttributeName';
549 } elseif($char === '=') {
550 /* U+003D EQUALS SIGN (=)
551 Switch to the before attribute value state. */
552 $this->state
= 'beforeAttributeValue';
554 } elseif($char === '>') {
555 /* U+003E GREATER-THAN SIGN (>)
556 Emit the current tag token. Switch to the data state. */
557 $this->emitToken($this->token
);
558 $this->state
= 'data';
560 } elseif($char === '/' && $this->character($this->char +
1) !== '>') {
561 /* U+002F SOLIDUS (/)
562 Parse error unless this is a permitted slash. Switch to the before
563 attribute name state. */
564 $this->state
= 'beforeAttributeName';
566 } elseif($this->char
=== $this->EOF
) {
568 Parse error. Emit the current tag token. Reconsume the EOF
569 character in the data state. */
570 $this->emitToken($this->token
);
573 $this->state
= 'data';
577 Append the current input character to the current attribute's name.
578 Stay in the attribute name state. */
579 $last = count($this->token
['attr']) - 1;
580 $this->token
['attr'][$last]['name'] .= strtolower($char);
582 $this->state
= 'attributeName';
586 private function afterAttributeNameState() {
587 // Consume the next input character:
589 $char = $this->character($this->char
);
591 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
592 /* U+0009 CHARACTER TABULATION
593 U+000A LINE FEED (LF)
594 U+000B LINE TABULATION
595 U+000C FORM FEED (FF)
597 Stay in the after attribute name state. */
598 $this->state
= 'afterAttributeName';
600 } elseif($char === '=') {
601 /* U+003D EQUALS SIGN (=)
602 Switch to the before attribute value state. */
603 $this->state
= 'beforeAttributeValue';
605 } elseif($char === '>') {
606 /* U+003E GREATER-THAN SIGN (>)
607 Emit the current tag token. Switch to the data state. */
608 $this->emitToken($this->token
);
609 $this->state
= 'data';
611 } elseif($char === '/' && $this->character($this->char +
1) !== '>') {
612 /* U+002F SOLIDUS (/)
613 Parse error unless this is a permitted slash. Switch to the
614 before attribute name state. */
615 $this->state
= 'beforeAttributeName';
617 } elseif($this->char
=== $this->EOF
) {
619 Parse error. Emit the current tag token. Reconsume the EOF
620 character in the data state. */
621 $this->emitToken($this->token
);
624 $this->state
= 'data';
628 Start a new attribute in the current tag token. Set that attribute's
629 name to the current input character, and its value to the empty string.
630 Switch to the attribute name state. */
631 $this->token
['attr'][] = array(
632 'name' => strtolower($char),
636 $this->state
= 'attributeName';
640 private function beforeAttributeValueState() {
641 // Consume the next input character:
643 $char = $this->character($this->char
);
645 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
646 /* U+0009 CHARACTER TABULATION
647 U+000A LINE FEED (LF)
648 U+000B LINE TABULATION
649 U+000C FORM FEED (FF)
651 Stay in the before attribute value state. */
652 $this->state
= 'beforeAttributeValue';
654 } elseif($char === '"') {
655 /* U+0022 QUOTATION MARK (")
656 Switch to the attribute value (double-quoted) state. */
657 $this->state
= 'attributeValueDoubleQuoted';
659 } elseif($char === '&') {
660 /* U+0026 AMPERSAND (&)
661 Switch to the attribute value (unquoted) state and reconsume
662 this input character. */
664 $this->state
= 'attributeValueUnquoted';
666 } elseif($char === '\'') {
667 /* U+0027 APOSTROPHE (')
668 Switch to the attribute value (single-quoted) state. */
669 $this->state
= 'attributeValueSingleQuoted';
671 } elseif($char === '>') {
672 /* U+003E GREATER-THAN SIGN (>)
673 Emit the current tag token. Switch to the data state. */
674 $this->emitToken($this->token
);
675 $this->state
= 'data';
679 Append the current input character to the current attribute's value.
680 Switch to the attribute value (unquoted) state. */
681 $last = count($this->token
['attr']) - 1;
682 $this->token
['attr'][$last]['value'] .= $char;
684 $this->state
= 'attributeValueUnquoted';
688 private function attributeValueDoubleQuotedState() {
689 // Consume the next input character:
691 $char = $this->character($this->char
);
694 /* U+0022 QUOTATION MARK (")
695 Switch to the before attribute name state. */
696 $this->state
= 'beforeAttributeName';
698 } elseif($char === '&') {
699 /* U+0026 AMPERSAND (&)
700 Switch to the entity in attribute value state. */
701 $this->entityInAttributeValueState('double');
703 } elseif($this->char
=== $this->EOF
) {
705 Parse error. Emit the current tag token. Reconsume the character
706 in the data state. */
707 $this->emitToken($this->token
);
710 $this->state
= 'data';
714 Append the current input character to the current attribute's value.
715 Stay in the attribute value (double-quoted) state. */
716 $last = count($this->token
['attr']) - 1;
717 $this->token
['attr'][$last]['value'] .= $char;
719 $this->state
= 'attributeValueDoubleQuoted';
723 private function attributeValueSingleQuotedState() {
724 // Consume the next input character:
726 $char = $this->character($this->char
);
729 /* U+0027 APOSTROPHE (')
730 Switch to the before attribute name state. */
731 $this->state
= 'beforeAttributeName';
733 } elseif($char === '&') {
734 /* U+0026 AMPERSAND (&)
735 Switch to the entity in attribute value state. */
736 $this->entityInAttributeValueState('single');
738 } elseif($this->char
=== $this->EOF
) {
740 Parse error. Emit the current tag token. Reconsume the character
741 in the data state. */
742 $this->emitToken($this->token
);
745 $this->state
= 'data';
749 Append the current input character to the current attribute's value.
750 Stay in the attribute value (single-quoted) state. */
751 $last = count($this->token
['attr']) - 1;
752 $this->token
['attr'][$last]['value'] .= $char;
754 $this->state
= 'attributeValueSingleQuoted';
758 private function attributeValueUnquotedState() {
759 // Consume the next input character:
761 $char = $this->character($this->char
);
763 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
764 /* U+0009 CHARACTER TABULATION
765 U+000A LINE FEED (LF)
766 U+000B LINE TABULATION
767 U+000C FORM FEED (FF)
769 Switch to the before attribute name state. */
770 $this->state
= 'beforeAttributeName';
772 } elseif($char === '&') {
773 /* U+0026 AMPERSAND (&)
774 Switch to the entity in attribute value state. */
775 $this->entityInAttributeValueState();
777 } elseif($char === '>') {
778 /* U+003E GREATER-THAN SIGN (>)
779 Emit the current tag token. Switch to the data state. */
780 $this->emitToken($this->token
);
781 $this->state
= 'data';
785 Append the current input character to the current attribute's value.
786 Stay in the attribute value (unquoted) state. */
787 $last = count($this->token
['attr']) - 1;
788 $this->token
['attr'][$last]['value'] .= $char;
790 $this->state
= 'attributeValueUnquoted';
794 private function entityInAttributeValueState() {
795 // Attempt to consume an entity.
796 $entity = $this->entity();
798 // If nothing is returned, append a U+0026 AMPERSAND character to the
799 // current attribute's value. Otherwise, emit the character token that
805 $last = count($this->token
['attr']) - 1;
806 $this->token
['attr'][$last]['value'] .= $char;
809 private function bogusCommentState() {
810 /* Consume every character up to the first U+003E GREATER-THAN SIGN
811 character (>) or the end of the file (EOF), whichever comes first. Emit
812 a comment token whose data is the concatenation of all the characters
813 starting from and including the character that caused the state machine
814 to switch into the bogus comment state, up to and including the last
815 consumed character before the U+003E character, if any, or up to the
816 end of the file otherwise. (If the comment was started by the end of
817 the file (EOF), the token is empty.) */
818 $data = $this->characters('^>', $this->char
);
819 $this->emitToken(array(
821 'type' => self
::COMMENT
824 $this->char +
= strlen($data);
826 /* Switch to the data state. */
827 $this->state
= 'data';
829 /* If the end of the file was reached, reconsume the EOF character. */
830 if($this->char
=== $this->EOF
) {
831 $this->char
= $this->EOF
- 1;
835 private function markupDeclarationOpenState() {
836 /* If the next two characters are both U+002D HYPHEN-MINUS (-)
837 characters, consume those two characters, create a comment token whose
838 data is the empty string, and switch to the comment state. */
839 if($this->character($this->char +
1, 2) === '--') {
841 $this->state
= 'comment';
842 $this->token
= array(
844 'type' => self
::COMMENT
847 /* Otherwise if the next seven characters are a case-insensitive match
848 for the word "DOCTYPE", then consume those characters and switch to the
850 } elseif(strtolower($this->character($this->char +
1, 7)) === 'doctype') {
852 $this->state
= 'doctype';
854 /* Otherwise, it is a parse error. Switch to the bogus comment state.
855 The next character that is consumed, if any, is the first character
856 that will be in the comment. */
859 $this->state
= 'bogusComment';
863 private function commentState() {
864 /* Consume the next input character: */
866 $char = $this->char();
868 /* U+002D HYPHEN-MINUS (-) */
870 /* Switch to the comment dash state */
871 $this->state
= 'commentDash';
874 } elseif($this->char
=== $this->EOF
) {
875 /* Parse error. Emit the comment token. Reconsume the EOF character
876 in the data state. */
877 $this->emitToken($this->token
);
879 $this->state
= 'data';
883 /* Append the input character to the comment token's data. Stay in
884 the comment state. */
885 $this->token
['data'] .= $char;
889 private function commentDashState() {
890 /* Consume the next input character: */
892 $char = $this->char();
894 /* U+002D HYPHEN-MINUS (-) */
896 /* Switch to the comment end state */
897 $this->state
= 'commentEnd';
900 } elseif($this->char
=== $this->EOF
) {
901 /* Parse error. Emit the comment token. Reconsume the EOF character
902 in the data state. */
903 $this->emitToken($this->token
);
905 $this->state
= 'data';
909 /* Append a U+002D HYPHEN-MINUS (-) character and the input
910 character to the comment token's data. Switch to the comment state. */
911 $this->token
['data'] .= '-'.$char;
912 $this->state
= 'comment';
916 private function commentEndState() {
917 /* Consume the next input character: */
919 $char = $this->char();
922 $this->emitToken($this->token
);
923 $this->state
= 'data';
925 } elseif($char === '-') {
926 $this->token
['data'] .= '-';
928 } elseif($this->char
=== $this->EOF
) {
929 $this->emitToken($this->token
);
931 $this->state
= 'data';
934 $this->token
['data'] .= '--'.$char;
935 $this->state
= 'comment';
939 private function doctypeState() {
940 /* Consume the next input character: */
942 $char = $this->char();
944 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
945 $this->state
= 'beforeDoctypeName';
949 $this->state
= 'beforeDoctypeName';
953 private function beforeDoctypeNameState() {
954 /* Consume the next input character: */
956 $char = $this->char();
958 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
959 // Stay in the before DOCTYPE name state.
961 } elseif(preg_match('/^[a-z]$/', $char)) {
962 $this->token
= array(
963 'name' => strtoupper($char),
964 'type' => self
::DOCTYPE
,
968 $this->state
= 'doctypeName';
970 } elseif($char === '>') {
971 $this->emitToken(array(
973 'type' => self
::DOCTYPE
,
977 $this->state
= 'data';
979 } elseif($this->char
=== $this->EOF
) {
980 $this->emitToken(array(
982 'type' => self
::DOCTYPE
,
987 $this->state
= 'data';
990 $this->token
= array(
992 'type' => self
::DOCTYPE
,
996 $this->state
= 'doctypeName';
1000 private function doctypeNameState() {
1001 /* Consume the next input character: */
1003 $char = $this->char();
1005 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1006 $this->state
= 'AfterDoctypeName';
1008 } elseif($char === '>') {
1009 $this->emitToken($this->token
);
1010 $this->state
= 'data';
1012 } elseif(preg_match('/^[a-z]$/', $char)) {
1013 $this->token
['name'] .= strtoupper($char);
1015 } elseif($this->char
=== $this->EOF
) {
1016 $this->emitToken($this->token
);
1018 $this->state
= 'data';
1021 $this->token
['name'] .= $char;
1024 $this->token
['error'] = ($this->token
['name'] === 'HTML')
1029 private function afterDoctypeNameState() {
1030 /* Consume the next input character: */
1032 $char = $this->char();
1034 if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1035 // Stay in the after DOCTYPE name state.
1037 } elseif($char === '>') {
1038 $this->emitToken($this->token
);
1039 $this->state
= 'data';
1041 } elseif($this->char
=== $this->EOF
) {
1042 $this->emitToken($this->token
);
1044 $this->state
= 'data';
1047 $this->token
['error'] = true;
1048 $this->state
= 'bogusDoctype';
1052 private function bogusDoctypeState() {
1053 /* Consume the next input character: */
1055 $char = $this->char();
1058 $this->emitToken($this->token
);
1059 $this->state
= 'data';
1061 } elseif($this->char
=== $this->EOF
) {
1062 $this->emitToken($this->token
);
1064 $this->state
= 'data';
1067 // Stay in the bogus DOCTYPE state.
/**
 * Try to consume a character entity starting at the current position.
 *
 * The position on entry is remembered in $start. A numeric reference picks
 * a character class ('0-9A-Fa-f' or '0-9') and consumes matching characters
 * with $this->characters(); a named reference consumes '0-9A-Za-z;' and
 * tests successively longer prefixes against $this->entities. When nothing
 * matches, $this->char is reset to $start. On success the decoded text is
 * returned via html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8').
 *
 * NOTE(review): the `case` labels, the assignments to $char, the assignment
 * of $entity in the named branch and the failure return value are on lines
 * not visible in this excerpt -- confirm against the full file.
 * NOTE(review): in_array() is called without the strict flag and performs a
 * linear scan of $this->entities for every candidate prefix.
 */
1071 private function entity() {
1072 $start = $this->char
;
1074 // This section defines how to consume an entity. This definition is
1075 // used when parsing entities in text and in attributes.
1077 // The behaviour depends on the identity of the next character (the
1078 // one immediately after the U+0026 AMPERSAND character):
1080 switch($this->character($this->char +
1)) {
1081 // U+0023 NUMBER SIGN (#)
1084 // The behaviour further depends on the character after the
1085 // U+0023 NUMBER SIGN:
// NOTE(review): this switch reads the same offset ($this->char + 1) as the
// outer switch above. Unless $this->char is advanced on a line not shown
// here, the character tested is again the '#', and the hexadecimal branch
// cannot be selected -- confirm.
1086 switch($this->character($this->char +
1)) {
1087 // U+0078 LATIN SMALL LETTER X
1088 // U+0058 LATIN CAPITAL LETTER X
1091 // Follow the steps below, but using the range of
1092 // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1093 // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1094 // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1095 // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1096 // words, 0-9, A-F, a-f).
1098 $char_class = '0-9A-Fa-f';
1103 // Follow the steps below, but using the range of
1104 // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1105 // NINE (i.e. just 0-9).
1107 $char_class = '0-9';
1111 // Consume as many characters as match the range of characters
1114 $e_name = $this->characters($char_class, $this->char +
$char +
1);
1115 $entity = $this->character($start, $this->char
);
1116 $cond = strlen($e_name) > 0;
1118 // The rest of the parsing happens below.
1123 // Consume the maximum number of characters possible, with the
1124 // consumed characters case-sensitively matching one of the
1125 // identifiers in the first column of the entities table.
1126 $e_name = $this->characters('0-9A-Za-z;', $this->char +
1);
1127 $len = strlen($e_name);
1129 for($c = 1; $c <= $len; $c++
) {
1130 $id = substr($e_name, 0, $c);
1133 if(in_array($id, $this->entities
)) {
1134 if ($e_name[$c-1] !== ';') {
1135 if ($c < $len && $e_name[$c] == ';') {
1136 $this->char++
; // consume extra semicolon
1144 $cond = isset($entity);
1145 // The rest of the parsing happens below.
1150 // If no match can be made, then this is a parse error. No
1151 // characters are consumed, and nothing is returned.
1152 $this->char
= $start;
1156 // Return a character token for the character corresponding to the
1157 // entity name (as given by the second column of the entities table).
1158 return html_entity_decode('&'.$entity.';', ENT_QUOTES
, 'UTF-8');
/**
 * Hand a token to the tree constructor and update the content model.
 *
 * The token is forwarded to $this->tree->emitToken(). In the first branch
 * the value returned by the tree constructor becomes the new
 * $this->content_model; otherwise, when the token is an end tag, the
 * content model is reset to self::PCDATA.
 *
 * NOTE(review): the condition guarding the first branch is on a line not
 * visible in this excerpt -- confirm which return values are accepted.
 */
1161 private function emitToken($token) {
1162 $emit = $this->tree
->emitToken($token);
1165 $this->content_model
= $emit;
1167 } elseif($token['type'] === self
::ENDTAG
) {
1168 $this->content_model
= self
::PCDATA
;
/**
 * Signal the end of input: clears $this->state (set to null) and emits a
 * final token to the tree constructor.
 *
 * NOTE(review): the contents of the emitted token array are on lines not
 * visible in this excerpt -- presumably an end-of-file token; confirm.
 */
1172 private function EOF() {
1173 $this->state
= null;
1174 $this->tree
->emitToken(array(
1180 class HTML5TreeConstructer
{
// Stack of open elements. The phase handlers push the root <html> element
// first (index 0) and treat index 1 as the <body> element.
1181 public $stack = array();
// NOTE(review): $foster_parent is not referenced in the visible part of this
// class; its role should be confirmed against the full file.
1186 private $foster_parent = null;
// List of active formatting elements; entries are element nodes or the
// self::MARKER sentinel.
1187 private $a_formatting = array();
// Head element pointer (set when a <head> element is inserted) and form
// element pointer (set when a <form> element is inserted); null until then.
1189 private $head_pointer = null;
1190 private $form_pointer = null;
// Tag-name lists. Presumably consulted by the scope/category helpers
// (elementInScope(), getElementCategory()), which are not visible in this
// excerpt -- confirm.
1192 private $scoping = array('button','caption','html','marquee','object','table','td','th');
1193 private $formatting = array('a','b','big','em','font','i','nobr','s','small','strike','strong','tt','u');
1194 private $special = array('address','area','base','basefont','bgsound',
1195 'blockquote','body','br','center','col','colgroup','dd','dir','div','dl',
1196 'dt','embed','fieldset','form','frame','frameset','h1','h2','h3','h4','h5',
1197 'h6','head','hr','iframe','image','img','input','isindex','li','link',
1198 'listing','menu','meta','noembed','noframes','noscript','ol','optgroup',
1199 'option','p','param','plaintext','pre','script','select','spacer','style',
1200 'tbody','textarea','tfoot','thead','title','tr','ul','wbr');
1202 // The different phases.
1203 const INIT_PHASE
= 0;
1204 const ROOT_PHASE
= 1;
1205 const MAIN_PHASE
= 2;
1206 const END_PHASE
= 3;
1208 // The different insertion modes for the main phase.
// NOTE(review): other mode constants used by mainPhase() (IN_HEAD, IN_BODY,
// IN_TABLE, IN_TBODY, IN_ROW, IN_CELL) are declared on lines not visible in
// this excerpt.
1209 const BEFOR_HEAD
= 0;
1211 const AFTER_HEAD
= 2;
1214 const IN_CAPTION
= 5;
1215 const IN_CGROUP
= 6;
1219 const IN_SELECT
= 10;
1220 const AFTER_BODY
= 11;
1221 const IN_FRAME
= 12;
1222 const AFTR_FRAME
= 13;
1224 // The different types of elements.
1227 const FORMATTING
= 2;
/**
 * Put the tree constructor into its starting state.
 *
 * Creates the DOMDocument that the parser builds into, and selects the
 * initial phase together with the "before head" insertion mode.
 */
public function __construct() {
    // Configure the document completely before publishing it on the instance.
    $document = new DOMDocument;
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
    $this->mode = self::BEFOR_HEAD;
    $this->phase = self::INIT_PHASE;
}
/**
 * Process a token by routing it to the handler for the current phase.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Whatever the selected phase handler returns; nothing when
 *               the current phase matches none of the known phases.
 */
public function emitToken($token) {
    // Loose comparison on purpose: it mirrors the semantics of a switch
    // statement over the phase value.
    $phase = $this->phase;

    if ($phase == self::INIT_PHASE) {
        return $this->initPhase($token);
    }
    if ($phase == self::ROOT_PHASE) {
        return $this->rootElementPhase($token);
    }
    if ($phase == self::MAIN_PHASE) {
        return $this->mainPhase($token);
    }
    if ($phase == self::END_PHASE) {
        return $this->trailingEndPhase($token);
    }
}
/**
 * Handle a token while the tree constructor is in its initial phase.
 *
 * - Error-flagged tokens, comments, start/end tags, EOF and character data
 *   that is not purely whitespace switch to the root element phase and are
 *   reprocessed there; the result of rootElementPhase() is returned.
 * - A token whose 'error' key is set and false (a correct DOCTYPE) creates
 *   a DOMDocumentType and switches to the root element phase.
 * - Whitespace-only data is appended to the document as a text node.
 *
 * NOTE(review): no visible line appends $doctype to the document; part of
 * this method is not visible in this excerpt -- confirm whether the node is
 * ever attached.
 *
 * @param array $token Token produced by the tokeniser.
 */
1253 private function initPhase($token) {
1254 /* Initially, the tree construction stage must handle each token
1255 emitted from the tokenisation stage as follows: */
1257 /* A DOCTYPE token that is marked as being in error
1261 A character token that is not one of U+0009 CHARACTER TABULATION,
1262 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1264 An end-of-file token */
1265 if((isset($token['error']) && $token['error']) ||
1266 $token['type'] === HTML5
::COMMENT ||
1267 $token['type'] === HTML5
::STARTTAG ||
1268 $token['type'] === HTML5
::ENDTAG ||
1269 $token['type'] === HTML5
::EOF ||
1270 ($token['type'] === HTML5
::CHARACTR
&& isset($token['data']) &&
1271 !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) {
1272 /* This specification does not define how to handle this case. In
1273 particular, user agents may ignore the entirety of this specification
1274 altogether for such documents, and instead invoke special parse modes
1275 with a greater emphasis on backwards compatibility. */
1277 $this->phase
= self
::ROOT_PHASE
;
1278 return $this->rootElementPhase($token);
1280 /* A DOCTYPE token marked as being correct */
1281 } elseif(isset($token['error']) && !$token['error']) {
1282 /* Append a DocumentType node to the Document node, with the name
1283 attribute set to the name given in the DOCTYPE token (which will be
1284 "HTML"), and the other attributes specific to DocumentType objects
1285 set to null, empty lists, or the empty string as appropriate. */
1286 $doctype = new DOMDocumentType(null, null, 'HTML');
1288 /* Then, switch to the root element phase of the tree construction
1290 $this->phase
= self
::ROOT_PHASE
;
1292 /* A character token that is one of U+0009 CHARACTER TABULATION,
1293 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1295 } elseif(isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/',
1297 /* Append that character to the Document node. */
1298 $text = $this->dom
->createTextNode($token['data']);
1299 $this->dom
->appendChild($text);
/**
 * Handle a token in the root element phase.
 *
 * - DOCTYPE tokens are ignored.
 * - Comments and whitespace-only character data are appended directly to
 *   the document.
 * - Non-whitespace character data, start tags, end tags and EOF create the
 *   <html> element, append it to the document, push it onto $this->stack,
 *   switch to the main phase and reprocess the token there.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Result of mainPhase() when the token is reprocessed;
 *               otherwise nothing is returned by the visible branches.
 */
1303 private function rootElementPhase($token) {
1304 /* After the initial phase, as each token is emitted from the tokenisation
1305 stage, it must be processed as described in this section. */
1307 /* A DOCTYPE token */
1308 if($token['type'] === HTML5
::DOCTYPE
) {
1309 // Parse error. Ignore the token.
1311 /* A comment token */
1312 } elseif($token['type'] === HTML5
::COMMENT
) {
1313 /* Append a Comment node to the Document object with the data
1314 attribute set to the data given in the comment token. */
1315 $comment = $this->dom
->createComment($token['data']);
1316 $this->dom
->appendChild($comment);
1318 /* A character token that is one of U+0009 CHARACTER TABULATION,
1319 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1321 } elseif($token['type'] === HTML5
::CHARACTR
&&
1322 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1323 /* Append that character to the Document node. */
1324 $text = $this->dom
->createTextNode($token['data']);
1325 $this->dom
->appendChild($text);
1327 /* A character token that is not one of U+0009 CHARACTER TABULATION,
1328 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
1329 (FF), or U+0020 SPACE
1332 An end-of-file token */
1333 } elseif(($token['type'] === HTML5
::CHARACTR
&&
1334 !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1335 $token['type'] === HTML5
::STARTTAG ||
1336 $token['type'] === HTML5
::ENDTAG ||
1337 $token['type'] === HTML5
::EOF
) {
1338 /* Create an HTMLElement node with the tag name html, in the HTML
1339 namespace. Append it to the Document object. Switch to the main
1340 phase and reprocess the current token. */
1341 $html = $this->dom
->createElement('html');
1342 $this->dom
->appendChild($html);
1343 $this->stack
[] = $html;
1345 $this->phase
= self
::MAIN_PHASE
;
1346 return $this->mainPhase($token);
/**
 * Handle a token in the main phase.
 *
 * - DOCTYPE tokens are ignored.
 * - An <html> start tag copies any attribute not already present onto the
 *   first element of $this->stack.
 * - EOF generates implied end tags.
 * - Every other token is routed to the handler for the current insertion
 *   mode ($this->mode) and that handler's result is returned.
 *
 * NOTE(review): the insertion-mode switch also lists self::END_PHASE, which
 * is a phase constant (value 3), not an insertion mode. If an insertion-mode
 * constant declared outside this excerpt has the same value, that case can
 * never be reached -- confirm.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Result of the selected insertion-mode handler, if any.
 */
1350 private function mainPhase($token) {
1351 /* Tokens in the main phase must be handled as follows: */
1353 /* A DOCTYPE token */
1354 if($token['type'] === HTML5
::DOCTYPE
) {
1355 // Parse error. Ignore the token.
1357 /* A start tag token with the tag name "html" */
1358 } elseif($token['type'] === HTML5
::STARTTAG
&& $token['name'] === 'html') {
1359 /* If this start tag token was not the first start tag token, then
1360 it is a parse error. */
1362 /* For each attribute on the token, check to see if the attribute
1363 is already present on the top element of the stack of open elements.
1364 If it is not, add the attribute and its corresponding value to that
1366 foreach($token['attr'] as $attr) {
1367 if(!$this->stack
[0]->hasAttribute($attr['name'])) {
1368 $this->stack
[0]->setAttribute($attr['name'], $attr['value']);
1372 /* An end-of-file token */
1373 } elseif($token['type'] === HTML5
::EOF
) {
1374 /* Generate implied end tags. */
1375 $this->generateImpliedEndTags();
1377 /* Anything else. */
1379 /* Depends on the insertion mode: */
1380 switch($this->mode
) {
1381 case self
::BEFOR_HEAD
: return $this->beforeHead($token); break;
1382 case self
::IN_HEAD
: return $this->inHead($token); break;
1383 case self
::AFTER_HEAD
: return $this->afterHead($token); break;
1384 case self
::IN_BODY
: return $this->inBody($token); break;
1385 case self
::IN_TABLE
: return $this->inTable($token); break;
1386 case self
::IN_CAPTION
: return $this->inCaption($token); break;
1387 case self
::IN_CGROUP
: return $this->inColumnGroup($token); break;
1388 case self
::IN_TBODY
: return $this->inTableBody($token); break;
1389 case self
::IN_ROW
: return $this->inRow($token); break;
1390 case self
::IN_CELL
: return $this->inCell($token); break;
1391 case self
::IN_SELECT
: return $this->inSelect($token); break;
1392 case self
::AFTER_BODY
: return $this->afterBody($token); break;
1393 case self
::IN_FRAME
: return $this->inFrameset($token); break;
1394 case self
::AFTR_FRAME
: return $this->afterFrameset($token); break;
1395 case self
::END_PHASE
: return $this->trailingEndPhase($token); break;
/**
 * Handle a token in the "before head" insertion mode.
 *
 * - Whitespace-only character data is inserted as text.
 * - Comments are inserted as comment nodes.
 * - A <head> start tag inserts the element, records it in
 *   $this->head_pointer and switches the mode to IN_HEAD.
 * - Any other start tag, an </html> end tag, or other character data first
 *   re-enters this method with a synthesised start tag token and then
 *   reprocesses the original token through inHead().
 * - Any other end tag is ignored.
 *
 * NOTE(review): the character class in the fourth branch has no `+`
 * quantifier, unlike the other whitespace checks. Whitespace-only data is
 * already consumed by the first branch, so this looks harmless -- confirm.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Result of inHead() when the token is reprocessed.
 */
1400 private function beforeHead($token) {
1401 /* Handle the token as follows: */
1403 /* A character token that is one of U+0009 CHARACTER TABULATION,
1404 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1406 if($token['type'] === HTML5
::CHARACTR
&&
1407 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1408 /* Append the character to the current node. */
1409 $this->insertText($token['data']);
1411 /* A comment token */
1412 } elseif($token['type'] === HTML5
::COMMENT
) {
1413 /* Append a Comment node to the current node with the data attribute
1414 set to the data given in the comment token. */
1415 $this->insertComment($token['data']);
1417 /* A start tag token with the tag name "head" */
1418 } elseif($token['type'] === HTML5
::STARTTAG
&& $token['name'] === 'head') {
1419 /* Create an element for the token, append the new element to the
1420 current node and push it onto the stack of open elements. */
1421 $element = $this->insertElement($token);
1423 /* Set the head element pointer to this new element node. */
1424 $this->head_pointer
= $element;
1426 /* Change the insertion mode to "in head". */
1427 $this->mode
= self
::IN_HEAD
;
1429 /* A start tag token whose tag name is one of: "base", "link", "meta",
1430 "script", "style", "title". Or an end tag with the tag name "html".
1431 Or a character token that is not one of U+0009 CHARACTER TABULATION,
1432 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1433 or U+0020 SPACE. Or any other start tag token */
1434 } elseif($token['type'] === HTML5
::STARTTAG ||
1435 ($token['type'] === HTML5
::ENDTAG
&& $token['name'] === 'html') ||
1436 ($token['type'] === HTML5
::CHARACTR
&& !preg_match('/^[\t\n\x0b\x0c ]$/',
1438 /* Act as if a start tag token with the tag name "head" and no
1439 attributes had been seen, then reprocess the current token. */
1440 $this->beforeHead(array(
1442 'type' => HTML5
::STARTTAG
,
1446 return $this->inHead($token);
1448 /* Any other end tag */
1449 } elseif($token['type'] === HTML5
::ENDTAG
) {
1450 /* Parse error. Ignore the token. */
/**
 * Handle a token in the "in head" insertion mode.
 *
 * - Whitespace-only character data, or any character data while the
 *   current node is a title/style/script element, is inserted as text.
 * - Comments are inserted as comment nodes.
 * - End tags for title/style/script pop the stack and return HTML5::PCDATA.
 * - <title> returns HTML5::RCDATA; <style> and <script> return HTML5::CDATA
 *   after inserting the element.
 * - <base>, <link> and <meta> are inserted without staying on the stack
 *   when a head pointer exists.
 * - </head> pops the head element when it is the current node and switches
 *   the mode to AFTER_HEAD.
 * - A stray <head> start tag or any end tag other than </html> is ignored.
 * - Anything else leaves the head and is reprocessed by afterHead().
 *
 * NOTE(review): the <script> branch and the two isSameNode() checks
 * dereference $this->head_pointer without the null check used by the
 * title/style/base branches -- confirm it cannot be null on those paths.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed A content-model constant for title/style/script handling,
 *               or the result of afterHead() when the token is reprocessed.
 */
1454 private function inHead($token) {
1455 /* Handle the token as follows: */
1457 /* A character token that is one of U+0009 CHARACTER TABULATION,
1458 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1461 THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
1462 or script element, append the character to the current node regardless
1464 if(($token['type'] === HTML5
::CHARACTR
&&
1465 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
(
1466 $token['type'] === HTML5
::CHARACTR
&& in_array(end($this->stack
)->nodeName
,
1467 array('title', 'style', 'script')))) {
1468 /* Append the character to the current node. */
1469 $this->insertText($token['data']);
1471 /* A comment token */
1472 } elseif($token['type'] === HTML5
::COMMENT
) {
1473 /* Append a Comment node to the current node with the data attribute
1474 set to the data given in the comment token. */
1475 $this->insertComment($token['data']);
1477 } elseif($token['type'] === HTML5
::ENDTAG
&&
1478 in_array($token['name'], array('title', 'style', 'script'))) {
1479 array_pop($this->stack
);
1480 return HTML5
::PCDATA
;
1482 /* A start tag with the tag name "title" */
1483 } elseif($token['type'] === HTML5
::STARTTAG
&& $token['name'] === 'title') {
1484 /* Create an element for the token and append the new element to the
1485 node pointed to by the head element pointer, or, if that is null
1486 (innerHTML case), to the current node. */
1487 if($this->head_pointer
!== null) {
1488 $element = $this->insertElement($token, false);
1489 $this->head_pointer
->appendChild($element);
1492 $element = $this->insertElement($token);
1495 /* Switch the tokeniser's content model flag to the RCDATA state. */
1496 return HTML5
::RCDATA
;
1498 /* A start tag with the tag name "style" */
1499 } elseif($token['type'] === HTML5
::STARTTAG
&& $token['name'] === 'style') {
1500 /* Create an element for the token and append the new element to the
1501 node pointed to by the head element pointer, or, if that is null
1502 (innerHTML case), to the current node. */
1503 if($this->head_pointer
!== null) {
1504 $element = $this->insertElement($token, false);
1505 $this->head_pointer
->appendChild($element);
1508 $this->insertElement($token);
1511 /* Switch the tokeniser's content model flag to the CDATA state. */
1512 return HTML5
::CDATA
;
1514 /* A start tag with the tag name "script" */
1515 } elseif($token['type'] === HTML5
::STARTTAG
&& $token['name'] === 'script') {
1516 /* Create an element for the token. */
// NOTE(review): unlike the title/style/base branches, head_pointer is
// dereferenced here without a null check.
1517 $element = $this->insertElement($token, false);
1518 $this->head_pointer
->appendChild($element);
1520 /* Switch the tokeniser's content model flag to the CDATA state. */
1521 return HTML5
::CDATA
;
1523 /* A start tag with the tag name "base", "link", or "meta" */
1524 } elseif($token['type'] === HTML5
::STARTTAG
&& in_array($token['name'],
1525 array('base', 'link', 'meta'))) {
1526 /* Create an element for the token and append the new element to the
1527 node pointed to by the head element pointer, or, if that is null
1528 (innerHTML case), to the current node. */
1529 if($this->head_pointer
!== null) {
1530 $element = $this->insertElement($token, false);
1531 $this->head_pointer
->appendChild($element);
1532 array_pop($this->stack
);
1535 $this->insertElement($token);
1538 /* An end tag with the tag name "head" */
1539 } elseif($token['type'] === HTML5
::ENDTAG
&& $token['name'] === 'head') {
1540 /* If the current node is a head element, pop the current node off
1541 the stack of open elements. */
1542 if($this->head_pointer
->isSameNode(end($this->stack
))) {
1543 array_pop($this->stack
);
1545 /* Otherwise, this is a parse error. */
1550 /* Change the insertion mode to "after head". */
1551 $this->mode
= self
::AFTER_HEAD
;
1553 /* A start tag with the tag name "head" or an end tag except "html". */
1554 } elseif(($token['type'] === HTML5
::STARTTAG
&& $token['name'] === 'head') ||
1555 ($token['type'] === HTML5
::ENDTAG
&& $token['name'] !== 'html')) {
1556 // Parse error. Ignore the token.
1560 /* If the current node is a head element, act as if an end tag
1561 token with the tag name "head" had been seen. */
1562 if($this->head_pointer
->isSameNode(end($this->stack
))) {
1563 $this->inHead(array(
1565 'type' => HTML5
::ENDTAG
1568 /* Otherwise, change the insertion mode to "after head". */
1570 $this->mode
= self
::AFTER_HEAD
;
1573 /* Then, reprocess the current token. */
1574 return $this->afterHead($token);
/**
 * Handle a token in the "after head" insertion mode.
 *
 * - Whitespace-only character data is inserted as text.
 * - Comments are inserted as comment nodes.
 * - <body> inserts the element and switches the mode to IN_BODY.
 * - <frameset> inserts the element and switches the mode to IN_FRAME.
 * - <base>, <link>, <meta>, <script>, <style> and <title> switch the mode
 *   back to IN_HEAD and are reprocessed by inHead().
 * - Anything else re-enters this method with a synthesised start tag token
 *   and then reprocesses the original token through inBody().
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Result of inHead() or inBody() when the token is
 *               reprocessed; nothing in the other visible branches.
 */
1578 private function afterHead($token) {
1579 /* Handle the token as follows: */
1581 /* A character token that is one of U+0009 CHARACTER TABULATION,
1582 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1584 if($token['type'] === HTML5
::CHARACTR
&&
1585 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
1586 /* Append the character to the current node. */
1587 $this->insertText($token['data']);
1589 /* A comment token */
1590 } elseif($token['type'] === HTML5
::COMMENT
) {
1591 /* Append a Comment node to the current node with the data attribute
1592 set to the data given in the comment token. */
1593 $this->insertComment($token['data']);
1595 /* A start tag token with the tag name "body" */
1596 } elseif($token['type'] === HTML5
::STARTTAG
&& $token['name'] === 'body') {
1597 /* Insert a body element for the token. */
1598 $this->insertElement($token);
1600 /* Change the insertion mode to "in body". */
1601 $this->mode
= self
::IN_BODY
;
1603 /* A start tag token with the tag name "frameset" */
1604 } elseif($token['type'] === HTML5
::STARTTAG
&& $token['name'] === 'frameset') {
1605 /* Insert a frameset element for the token. */
1606 $this->insertElement($token);
1608 /* Change the insertion mode to "in frameset". */
1609 $this->mode
= self
::IN_FRAME
;
1611 /* A start tag token whose tag name is one of: "base", "link", "meta",
1612 "script", "style", "title" */
1613 } elseif($token['type'] === HTML5
::STARTTAG
&& in_array($token['name'],
1614 array('base', 'link', 'meta', 'script', 'style', 'title'))) {
1615 /* Parse error. Switch the insertion mode back to "in head" and
1616 reprocess the token. */
1617 $this->mode
= self
::IN_HEAD
;
1618 return $this->inHead($token);
1622 /* Act as if a start tag token with the tag name "body" and no
1623 attributes had been seen, and then reprocess the current token. */
1624 $this->afterHead(array(
1626 'type' => HTML5
::STARTTAG
,
1630 return $this->inBody($token);
1634 private function inBody($token) {
1635 /* Handle the token as follows: */
1637 switch($token['type']) {
1638 /* A character token */
1639 case HTML5
::CHARACTR
:
1640 /* Reconstruct the active formatting elements, if any. */
1641 $this->reconstructActiveFormattingElements();
1643 /* Append the token's character to the current node. */
1644 $this->insertText($token['data']);
1647 /* A comment token */
1648 case HTML5
::COMMENT
:
1649 /* Append a Comment node to the current node with the data
1650 attribute set to the data given in the comment token. */
1651 $this->insertComment($token['data']);
1654 case HTML5
::STARTTAG
:
1655 switch($token['name']) {
1656 /* A start tag token whose tag name is one of: "script",
1658 case 'script': case 'style':
1659 /* Process the token as if the insertion mode had been "in
1661 return $this->inHead($token);
1664 /* A start tag token whose tag name is one of: "base", "link",
1666 case 'base': case 'link': case 'meta': case 'title':
1667 /* Parse error. Process the token as if the insertion mode
1668 had been "in head". */
1669 return $this->inHead($token);
1672 /* A start tag token with the tag name "body" */
1674 /* Parse error. If the second element on the stack of open
1675 elements is not a body element, or, if the stack of open
1676 elements has only one node on it, then ignore the token.
1678 if(count($this->stack
) === 1 ||
$this->stack
[1]->nodeName
!== 'body') {
1681 /* Otherwise, for each attribute on the token, check to see
1682 if the attribute is already present on the body element (the
1683 second element) on the stack of open elements. If it is not,
1684 add the attribute and its corresponding value to that
1687 foreach($token['attr'] as $attr) {
1688 if(!$this->stack
[1]->hasAttribute($attr['name'])) {
1689 $this->stack
[1]->setAttribute($attr['name'], $attr['value']);
1695 /* A start tag whose tag name is one of: "address",
1696 "blockquote", "center", "dir", "div", "dl", "fieldset",
1697 "listing", "menu", "ol", "p", "ul" */
1698 case 'address': case 'blockquote': case 'center': case 'dir':
1699 case 'div': case 'dl': case 'fieldset': case 'listing':
1700 case 'menu': case 'ol': case 'p': case 'ul':
1701 /* If the stack of open elements has a p element in scope,
1702 then act as if an end tag with the tag name p had been
1704 if($this->elementInScope('p')) {
1705 $this->emitToken(array(
1707 'type' => HTML5
::ENDTAG
1711 /* Insert an HTML element for the token. */
1712 $this->insertElement($token);
1715 /* A start tag whose tag name is "form" */
1717 /* If the form element pointer is not null, ignore the
1718 token with a parse error. */
1719 if($this->form_pointer
!== null) {
1724 /* If the stack of open elements has a p element in
1725 scope, then act as if an end tag with the tag name p
1727 if($this->elementInScope('p')) {
1728 $this->emitToken(array(
1730 'type' => HTML5
::ENDTAG
1734 /* Insert an HTML element for the token, and set the
1735 form element pointer to point to the element created. */
1736 $element = $this->insertElement($token);
1737 $this->form_pointer
= $element;
1741 /* A start tag whose tag name is "li", "dd" or "dt" */
1742 case 'li': case 'dd': case 'dt':
1743 /* If the stack of open elements has a p element in scope,
1744 then act as if an end tag with the tag name p had been
1746 if($this->elementInScope('p')) {
1747 $this->emitToken(array(
1749 'type' => HTML5
::ENDTAG
1753 $stack_length = count($this->stack
) - 1;
1755 for($n = $stack_length; 0 <= $n; $n--) {
1756 /* 1. Initialise node to be the current node (the
1757 bottommost node of the stack). */
1759 $node = $this->stack
[$n];
1760 $cat = $this->getElementCategory($node->tagName
);
1762 /* 2. If node is an li, dd or dt element, then pop all
1763 the nodes from the current node up to node, including
1764 node, then stop this algorithm. */
1765 if($token['name'] === $node->tagName ||
($token['name'] !== 'li'
1766 && ($node->tagName
=== 'dd' ||
$node->tagName
=== 'dt'))) {
1767 for($x = $stack_length; $x >= $n ; $x--) {
1768 array_pop($this->stack
);
1774 /* 3. If node is not in the formatting category, and is
1775 not in the phrasing category, and is not an address or
1776 div element, then stop this algorithm. */
1777 if($cat !== self
::FORMATTING
&& $cat !== self
::PHRASING
&&
1778 $node->tagName
!== 'address' && $node->tagName
!== 'div') {
1783 /* Finally, insert an HTML element with the same tag
1784 name as the token's. */
1785 $this->insertElement($token);
1788 /* A start tag token whose tag name is "plaintext" */
1790 /* If the stack of open elements has a p element in scope,
1791 then act as if an end tag with the tag name p had been
1793 if($this->elementInScope('p')) {
1794 $this->emitToken(array(
1796 'type' => HTML5
::ENDTAG
1800 /* Insert an HTML element for the token. */
1801 $this->insertElement($token);
1803 return HTML5
::PLAINTEXT
;
1806 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1808 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1809 /* If the stack of open elements has a p element in scope,
1810 then act as if an end tag with the tag name p had been seen. */
1811 if($this->elementInScope('p')) {
1812 $this->emitToken(array(
1814 'type' => HTML5
::ENDTAG
1818 /* If the stack of open elements has in scope an element whose
1819 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1820 this is a parse error; pop elements from the stack until an
1821 element with one of those tag names has been popped from the
1823 while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1824 array_pop($this->stack
);
1827 /* Insert an HTML element for the token. */
1828 $this->insertElement($token);
1831 /* A start tag whose tag name is "a" */
1833 /* If the list of active formatting elements contains
1834 an element whose tag name is "a" between the end of the
1835 list and the last marker on the list (or the start of
1836 the list if there is no marker on the list), then this
1837 is a parse error; act as if an end tag with the tag name
1838 "a" had been seen, then remove that element from the list
1839 of active formatting elements and the stack of open
1840 elements if the end tag didn't already remove it (it
1841 might not have if the element is not in table scope). */
1842 $leng = count($this->a_formatting
);
1844 for($n = $leng - 1; $n >= 0; $n--) {
1845 if($this->a_formatting
[$n] === self
::MARKER
) {
1848 } elseif($this->a_formatting
[$n]->nodeName
=== 'a') {
1849 $this->emitToken(array(
1851 'type' => HTML5
::ENDTAG
1857 /* Reconstruct the active formatting elements, if any. */
1858 $this->reconstructActiveFormattingElements();
1860 /* Insert an HTML element for the token. */
1861 $el = $this->insertElement($token);
1863 /* Add that element to the list of active formatting
1865 $this->a_formatting
[] = $el;
1868 /* A start tag whose tag name is one of: "b", "big", "em", "font",
1869 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1870 case 'b': case 'big': case 'em': case 'font': case 'i':
1871 case 'nobr': case 's': case 'small': case 'strike':
1872 case 'strong': case 'tt': case 'u':
1873 /* Reconstruct the active formatting elements, if any. */
1874 $this->reconstructActiveFormattingElements();
1876 /* Insert an HTML element for the token. */
1877 $el = $this->insertElement($token);
1879 /* Add that element to the list of active formatting
1881 $this->a_formatting
[] = $el;
1884 /* A start tag token whose tag name is "button" */
1886 /* If the stack of open elements has a button element in scope,
1887 then this is a parse error; act as if an end tag with the tag
1888 name "button" had been seen, then reprocess the token. (We don't
1889 do that. Unnecessary.) */
1890 if($this->elementInScope('button')) {
1891 $this->inBody(array(
1893 'type' => HTML5
::ENDTAG
1897 /* Reconstruct the active formatting elements, if any. */
1898 $this->reconstructActiveFormattingElements();
1900 /* Insert an HTML element for the token. */
1901 $this->insertElement($token);
1903 /* Insert a marker at the end of the list of active
1904 formatting elements. */
1905 $this->a_formatting
[] = self
::MARKER
;
1908 /* A start tag token whose tag name is one of: "marquee", "object" */
1909 case 'marquee': case 'object':
1910 /* Reconstruct the active formatting elements, if any. */
1911 $this->reconstructActiveFormattingElements();
1913 /* Insert an HTML element for the token. */
1914 $this->insertElement($token);
1916 /* Insert a marker at the end of the list of active
1917 formatting elements. */
1918 $this->a_formatting
[] = self
::MARKER
;
1921 /* A start tag token whose tag name is "xmp" */
1923 /* Reconstruct the active formatting elements, if any. */
1924 $this->reconstructActiveFormattingElements();
1926 /* Insert an HTML element for the token. */
1927 $this->insertElement($token);
1929 /* Switch the content model flag to the CDATA state. */
1930 return HTML5
::CDATA
;
1933 /* A start tag whose tag name is "table" */
1935 /* If the stack of open elements has a p element in scope,
1936 then act as if an end tag with the tag name p had been seen. */
1937 if($this->elementInScope('p')) {
1938 $this->emitToken(array(
1940 'type' => HTML5
::ENDTAG
1944 /* Insert an HTML element for the token. */
1945 $this->insertElement($token);
1947 /* Change the insertion mode to "in table". */
1948 $this->mode
= self
::IN_TABLE
;
1951 /* A start tag whose tag name is one of: "area", "basefont",
1952 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1953 case 'area': case 'basefont': case 'bgsound': case 'br':
1954 case 'embed': case 'img': case 'param': case 'spacer':
1956 /* Reconstruct the active formatting elements, if any. */
1957 $this->reconstructActiveFormattingElements();
1959 /* Insert an HTML element for the token. */
1960 $this->insertElement($token);
1962 /* Immediately pop the current node off the stack of open elements. */
1963 array_pop($this->stack
);
1966 /* A start tag whose tag name is "hr" */
1968 /* If the stack of open elements has a p element in scope,
1969 then act as if an end tag with the tag name p had been seen. */
1970 if($this->elementInScope('p')) {
1971 $this->emitToken(array(
1973 'type' => HTML5
::ENDTAG
1977 /* Insert an HTML element for the token. */
1978 $this->insertElement($token);
1980 /* Immediately pop the current node off the stack of open elements. */
1981 array_pop($this->stack
);
1984 /* A start tag whose tag name is "image" */
1986 /* Parse error. Change the token's tag name to "img" and
1987 reprocess it. (Don't ask.) */
1988 $token['name'] = 'img';
1989 return $this->inBody($token);
1992 /* A start tag whose tag name is "input" */
1994 /* Reconstruct the active formatting elements, if any. */
1995 $this->reconstructActiveFormattingElements();
1997 /* Insert an input element for the token. */
1998 $element = $this->insertElement($token, false);
2000 /* If the form element pointer is not null, then associate the
2001 input element with the form element pointed to by the form
2003 $this->form_pointer
!== null
2004 ?
$this->form_pointer
->appendChild($element)
2005 : end($this->stack
)->appendChild($element);
2007 /* Pop that input element off the stack of open elements. */
2008 array_pop($this->stack
);
2011 /* A start tag whose tag name is "isindex" */
2016 /* If the form element pointer is not null,
2017 then ignore the token. */
2018 if($this->form_pointer
=== null) {
2019 /* Act as if a start tag token with the tag name "form" had
2021 $this->inBody(array(
2023 'type' => HTML5
::STARTTAG
,
2027 /* Act as if a start tag token with the tag name "hr" had
2029 $this->inBody(array(
2031 'type' => HTML5
::STARTTAG
,
2035 /* Act as if a start tag token with the tag name "p" had
2037 $this->inBody(array(
2039 'type' => HTML5
::STARTTAG
,
2043 /* Act as if a start tag token with the tag name "label"
2045 $this->inBody(array(
2047 'type' => HTML5
::STARTTAG
,
2051 /* Act as if a stream of character tokens had been seen. */
2052 $this->insertText('This is a searchable index. '.
2053 'Insert your search keywords here: ');
2055 /* Act as if a start tag token with the tag name "input"
2056 had been seen, with all the attributes from the "isindex"
2057 token, except with the "name" attribute set to the value
2058 "isindex" (ignoring any explicit "name" attribute). */
2059 $attr = $token['attr'];
2060 $attr[] = array('name' => 'name', 'value' => 'isindex');
2062 $this->inBody(array(
2064 'type' => HTML5
::STARTTAG
,
2068 /* Act as if a stream of character tokens had been seen
2069 (see below for what they should say). */
2070 $this->insertText('This is a searchable index. '.
2071 'Insert your search keywords here: ');
2073 /* Act as if an end tag token with the tag name "label"
2075 $this->inBody(array(
2077 'type' => HTML5
::ENDTAG
2080 /* Act as if an end tag token with the tag name "p" had
2082 $this->inBody(array(
2084 'type' => HTML5
::ENDTAG
2087 /* Act as if a start tag token with the tag name "hr" had
2089 $this->inBody(array(
2091 'type' => HTML5
::ENDTAG
2094 /* Act as if an end tag token with the tag name "form" had
2096 $this->inBody(array(
2098 'type' => HTML5
::ENDTAG
2103 /* A start tag whose tag name is "textarea" */
2105 $this->insertElement($token);
2107 /* Switch the tokeniser's content model flag to the
2109 return HTML5
::RCDATA
;
2112 /* A start tag whose tag name is one of: "iframe", "noembed",
2114 case 'iframe': case 'noembed': case 'noframes':
2115 $this->insertElement($token);
2117 /* Switch the tokeniser's content model flag to the CDATA state. */
2118 return HTML5
::CDATA
;
2121 /* A start tag whose tag name is "select" */
2123 /* Reconstruct the active formatting elements, if any. */
2124 $this->reconstructActiveFormattingElements();
2126 /* Insert an HTML element for the token. */
2127 $this->insertElement($token);
2129 /* Change the insertion mode to "in select". */
2130 $this->mode
= self
::IN_SELECT
;
2133 /* A start or end tag whose tag name is one of: "caption", "col",
2134 "colgroup", "frame", "frameset", "head", "option", "optgroup",
2135 "tbody", "td", "tfoot", "th", "thead", "tr". */
2136 case 'caption': case 'col': case 'colgroup': case 'frame':
2137 case 'frameset': case 'head': case 'option': case 'optgroup':
2138 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2140 // Parse error. Ignore the token.
2143 /* A start or end tag whose tag name is one of: "event-source",
2144 "section", "nav", "article", "aside", "header", "footer",
2145 "datagrid", "command" */
2146 case 'event-source': case 'section': case 'nav': case 'article':
2147 case 'aside': case 'header': case 'footer': case 'datagrid':
2149 // Work in progress!
2152 /* A start tag token not covered by the previous entries */
2154 /* Reconstruct the active formatting elements, if any. */
2155 $this->reconstructActiveFormattingElements();
2157 $this->insertElement($token, true, true);
2163 switch($token['name']) {
2164 /* An end tag with the tag name "body" */
2166 /* If the second element in the stack of open elements is
2167 not a body element, this is a parse error. Ignore the token.
2169 if(count($this->stack
) < 2 ||
$this->stack
[1]->nodeName
!== 'body') {
2172 /* If the current node is not the body element, then this
2173 is a parse error. */
2174 } elseif(end($this->stack
)->nodeName
!== 'body') {
2178 /* Change the insertion mode to "after body". */
2179 $this->mode
= self
::AFTER_BODY
;
2182 /* An end tag with the tag name "html" */
2184 /* Act as if an end tag with tag name "body" had been seen,
2185 then, if that token wasn't ignored, reprocess the current
2187 $this->inBody(array(
2189 'type' => HTML5
::ENDTAG
2192 return $this->afterBody($token);
2195 /* An end tag whose tag name is one of: "address", "blockquote",
2196 "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2197 "ol", "pre", "ul" */
2198 case 'address': case 'blockquote': case 'center': case 'dir':
2199 case 'div': case 'dl': case 'fieldset': case 'listing':
2200 case 'menu': case 'ol': case 'pre': case 'ul':
2201 /* If the stack of open elements has an element in scope
2202 with the same tag name as that of the token, then generate
2203 implied end tags. */
2204 if($this->elementInScope($token['name'])) {
2205 $this->generateImpliedEndTags();
2207 /* Now, if the current node is not an element with
2208 the same tag name as that of the token, then this
2209 is a parse error. */
2212 /* If the stack of open elements has an element in
2213 scope with the same tag name as that of the token,
2214 then pop elements from this stack until an element
2215 with that tag name has been popped from the stack. */
2216 for($n = count($this->stack
) - 1; $n >= 0; $n--) {
2217 if($this->stack
[$n]->nodeName
=== $token['name']) {
2221 array_pop($this->stack
);
2226 /* An end tag whose tag name is "form" */
2228 /* If the stack of open elements has an element in scope
2229 with the same tag name as that of the token, then generate
2230 implied end tags. */
2231 if($this->elementInScope($token['name'])) {
2232 $this->generateImpliedEndTags();
2236 if(end($this->stack
)->nodeName
!== $token['name']) {
2237 /* Now, if the current node is not an element with the
2238 same tag name as that of the token, then this is a parse
2243 /* Otherwise, if the current node is an element with
2244 the same tag name as that of the token pop that element
2246 array_pop($this->stack
);
2249 /* In any case, set the form element pointer to null. */
2250 $this->form_pointer
= null;
2253 /* An end tag whose tag name is "p" */
2255 /* If the stack of open elements has a p element in scope,
2256 then generate implied end tags, except for p elements. */
2257 if($this->elementInScope('p')) {
2258 $this->generateImpliedEndTags(array('p'));
2260 /* If the current node is not a p element, then this is
2264 /* If the stack of open elements has a p element in
2265 scope, then pop elements from this stack until the stack
2266 no longer has a p element in scope. */
2267 for($n = count($this->stack
) - 1; $n >= 0; $n--) {
2268 if($this->elementInScope('p')) {
2269 array_pop($this->stack
);
2278 /* An end tag whose tag name is "dd", "dt", or "li" */
2279 case 'dd': case 'dt': case 'li':
2280 /* If the stack of open elements has an element in scope
2281 whose tag name matches the tag name of the token, then
2282 generate implied end tags, except for elements with the
2283 same tag name as the token. */
2284 if($this->elementInScope($token['name'])) {
2285 $this->generateImpliedEndTags(array($token['name']));
2287 /* If the current node is not an element with the same
2288 tag name as the token, then this is a parse error. */
2291 /* If the stack of open elements has an element in scope
2292 whose tag name matches the tag name of the token, then
2293 pop elements from this stack until an element with that
2294 tag name has been popped from the stack. */
2295 for($n = count($this->stack
) - 1; $n >= 0; $n--) {
2296 if($this->stack
[$n]->nodeName
=== $token['name']) {
2300 array_pop($this->stack
);
2305 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2307 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2308 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2310 /* If the stack of open elements has in scope an element whose
2311 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2312 generate implied end tags. */
2313 if($this->elementInScope($elements)) {
2314 $this->generateImpliedEndTags();
2316 /* Now, if the current node is not an element with the same
2317 tag name as that of the token, then this is a parse error. */
2320 /* If the stack of open elements has in scope an element
2321 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2322 "h6", then pop elements from the stack until an element
2323 with one of those tag names has been popped from the stack. */
2324 while($this->elementInScope($elements)) {
2325 array_pop($this->stack
);
2330 /* An end tag whose tag name is one of: "a", "b", "big", "em",
2331 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2332 case 'a': case 'b': case 'big': case 'em': case 'font':
2333 case 'i': case 'nobr': case 's': case 'small': case 'strike':
2334 case 'strong': case 'tt': case 'u':
2335 /* 1. Let the formatting element be the last element in
2336 the list of active formatting elements that:
2337 * is between the end of the list and the last scope
2338 marker in the list, if any, or the start of the list
2340 * has the same tag name as the token.
2343 for($a = count($this->a_formatting
) - 1; $a >= 0; $a--) {
2344 if($this->a_formatting
[$a] === self
::MARKER
) {
2347 } elseif($this->a_formatting
[$a]->tagName
=== $token['name']) {
2348 $formatting_element = $this->a_formatting
[$a];
2349 $in_stack = in_array($formatting_element, $this->stack
, true);
2355 /* If there is no such node, or, if that node is
2356 also in the stack of open elements but the element
2357 is not in scope, then this is a parse error. Abort
2358 these steps. The token is ignored. */
2359 if(!isset($formatting_element) ||
($in_stack &&
2360 !$this->elementInScope($token['name']))) {
2363 /* Otherwise, if there is such a node, but that node
2364 is not in the stack of open elements, then this is a
2365 parse error; remove the element from the list, and
2366 abort these steps. */
2367 } elseif(isset($formatting_element) && !$in_stack) {
2368 unset($this->a_formatting
[$fe_af_pos]);
2369 $this->a_formatting
= array_merge($this->a_formatting
);
2373 /* 2. Let the furthest block be the topmost node in the
2374 stack of open elements that is lower in the stack
2375 than the formatting element, and is not an element in
2376 the phrasing or formatting categories. There might
2378 $fe_s_pos = array_search($formatting_element, $this->stack
, true);
2379 $length = count($this->stack
);
2381 for($s = $fe_s_pos +
1; $s < $length; $s++
) {
2382 $category = $this->getElementCategory($this->stack
[$s]->nodeName
);
2384 if($category !== self
::PHRASING
&& $category !== self
::FORMATTING
) {
2385 $furthest_block = $this->stack
[$s];
2389 /* 3. If there is no furthest block, then the UA must
2390 skip the subsequent steps and instead just pop all
2391 the nodes from the bottom of the stack of open
2392 elements, from the current node up to the formatting
2393 element, and remove the formatting element from the
2394 list of active formatting elements. */
2395 if(!isset($furthest_block)) {
2396 for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2397 array_pop($this->stack
);
2400 unset($this->a_formatting
[$fe_af_pos]);
2401 $this->a_formatting
= array_merge($this->a_formatting
);
2405 /* 4. Let the common ancestor be the element
2406 immediately above the formatting element in the stack
2407 of open elements. */
2408 $common_ancestor = $this->stack
[$fe_s_pos - 1];
2410 /* 5. If the furthest block has a parent node, then
2411 remove the furthest block from its parent node. */
2412 if($furthest_block->parentNode
!== null) {
2413 $furthest_block->parentNode
->removeChild($furthest_block);
2416 /* 6. Let a bookmark note the position of the
2417 formatting element in the list of active formatting
2418 elements relative to the elements on either side
2419 of it in the list. */
2420 $bookmark = $fe_af_pos;
2422 /* 7. Let node and last node be the furthest block.
2423 Follow these steps: */
2424 $node = $furthest_block;
2425 $last_node = $furthest_block;
2428 for($n = array_search($node, $this->stack
, true) - 1; $n >= 0; $n--) {
2429 /* 7.1 Let node be the element immediately
2430 prior to node in the stack of open elements. */
2431 $node = $this->stack
[$n];
2433 /* 7.2 If node is not in the list of active
2434 formatting elements, then remove node from
2435 the stack of open elements and then go back
2437 if(!in_array($node, $this->a_formatting
, true)) {
2438 unset($this->stack
[$n]);
2439 $this->stack
= array_merge($this->stack
);
2446 /* 7.3 Otherwise, if node is the formatting
2447 element, then go to the next step in the overall
2449 if($node === $formatting_element) {
2452 /* 7.4 Otherwise, if last node is the furthest
2453 block, then move the aforementioned bookmark to
2454 be immediately after the node in the list of
2455 active formatting elements. */
2456 } elseif($last_node === $furthest_block) {
2457 $bookmark = array_search($node, $this->a_formatting
, true) +
1;
2460 /* 7.5 If node has any children, perform a
2461 shallow clone of node, replace the entry for
2462 node in the list of active formatting elements
2463 with an entry for the clone, replace the entry
2464 for node in the stack of open elements with an
2465 entry for the clone, and let node be the clone. */
2466 if($node->hasChildNodes()) {
2467 $clone = $node->cloneNode();
2468 $s_pos = array_search($node, $this->stack
, true);
2469 $a_pos = array_search($node, $this->a_formatting
, true);
2471 $this->stack
[$s_pos] = $clone;
2472 $this->a_formatting
[$a_pos] = $clone;
2476 /* 7.6 Insert last node into node, first removing
2477 it from its previous parent node if any. */
2478 if($last_node->parentNode
!== null) {
2479 $last_node->parentNode
->removeChild($last_node);
2482 $node->appendChild($last_node);
2484 /* 7.7 Let last node be node. */
2488 /* 8. Insert whatever last node ended up being in
2489 the previous step into the common ancestor node,
2490 first removing it from its previous parent node if
2492 if($last_node->parentNode
!== null) {
2493 $last_node->parentNode
->removeChild($last_node);
2496 $common_ancestor->appendChild($last_node);
2498 /* 9. Perform a shallow clone of the formatting
2500 $clone = $formatting_element->cloneNode();
2502 /* 10. Take all of the child nodes of the furthest
2503 block and append them to the clone created in the
2505 while($furthest_block->hasChildNodes()) {
2506 $child = $furthest_block->firstChild
;
2507 $furthest_block->removeChild($child);
2508 $clone->appendChild($child);
2511 /* 11. Append that clone to the furthest block. */
2512 $furthest_block->appendChild($clone);
2514 /* 12. Remove the formatting element from the list
2515 of active formatting elements, and insert the clone
2516 into the list of active formatting elements at the
2517 position of the aforementioned bookmark. */
2518 $fe_af_pos = array_search($formatting_element, $this->a_formatting
, true);
2519 unset($this->a_formatting
[$fe_af_pos]);
2520 $this->a_formatting
= array_merge($this->a_formatting
);
2522 $af_part1 = array_slice($this->a_formatting
, 0, $bookmark - 1);
2523 $af_part2 = array_slice($this->a_formatting
, $bookmark, count($this->a_formatting
));
2524 $this->a_formatting
= array_merge($af_part1, array($clone), $af_part2);
2526 /* 13. Remove the formatting element from the stack
2527 of open elements, and insert the clone into the stack
2528 of open elements immediately after (i.e. in a more
2529 deeply nested position than) the position of the
2530 furthest block in that stack. */
2531 $fe_s_pos = array_search($formatting_element, $this->stack
, true);
2532 $fb_s_pos = array_search($furthest_block, $this->stack
, true);
2533 unset($this->stack
[$fe_s_pos]);
2535 $s_part1 = array_slice($this->stack
, 0, $fb_s_pos);
2536 $s_part2 = array_slice($this->stack
, $fb_s_pos +
1, count($this->stack
));
2537 $this->stack
= array_merge($s_part1, array($clone), $s_part2);
2539 /* 14. Jump back to step 1 in this series of steps. */
2540 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2544 /* An end tag token whose tag name is one of: "button",
2545 "marquee", "object" */
2546 case 'button': case 'marquee': case 'object':
2547 /* If the stack of open elements has an element in scope whose
2548 tag name matches the tag name of the token, then generate implied
2550 if($this->elementInScope($token['name'])) {
2551 $this->generateImpliedEndTags();
2553 /* Now, if the current node is not an element with the same
2554 tag name as the token, then this is a parse error. */
2557 /* Now, if the stack of open elements has an element in scope
2558 whose tag name matches the tag name of the token, then pop
2559 elements from the stack until that element has been popped from
2560 the stack, and clear the list of active formatting elements up
2561 to the last marker. */
2562 for($n = count($this->stack
) - 1; $n >= 0; $n--) {
2563 if($this->stack
[$n]->nodeName
=== $token['name']) {
2567 array_pop($this->stack
);
2570 $marker = end(array_keys($this->a_formatting
, self
::MARKER
, true));
2572 for($n = count($this->a_formatting
) - 1; $n > $marker; $n--) {
2573 array_pop($this->a_formatting
);
2578 /* Or an end tag whose tag name is one of: "area", "basefont",
2579 "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2580 "input", "isindex", "noembed", "noframes", "param", "select",
2581 "spacer", "table", "textarea", "wbr" */
2582 case 'area': case 'basefont': case 'bgsound': case 'br':
2583 case 'embed': case 'hr': case 'iframe': case 'image':
2584 case 'img': case 'input': case 'isindex': case 'noembed':
2585 case 'noframes': case 'param': case 'select': case 'spacer':
2586 case 'table': case 'textarea': case 'wbr':
2587 // Parse error. Ignore the token.
2590 /* An end tag token not covered by the previous entries */
2592 for($n = count($this->stack
) - 1; $n >= 0; $n--) {
2593 /* Initialise node to be the current node (the bottommost
2594 node of the stack). */
2595 $node = end($this->stack
);
2597 /* If node has the same tag name as the end tag token,
2599 if($token['name'] === $node->nodeName
) {
2600 /* Generate implied end tags. */
2601 $this->generateImpliedEndTags();
2603 /* If the tag name of the end tag token does not
2604 match the tag name of the current node, this is a
2608 /* Pop all the nodes from the current node up to
2609 node, including node, then stop this algorithm. */
2610 for($x = count($this->stack
) - $n; $x >= $n; $x--) {
2611 array_pop($this->stack
);
2615 $category = $this->getElementCategory($node);
2617 if($category !== self
::SPECIAL
&& $category !== self
::SCOPING
) {
2618 /* Otherwise, if node is in neither the formatting
2619 category nor the phrasing category, then this is a
2620 parse error. Stop this algorithm. The end tag token
/**
 * Handles one token while the insertion mode is "in table".
 *
 * Table-structure tags are handled here (switching the insertion mode where
 * needed). Anything else is a parse error that is processed with the
 * "in body" rules, after a foster parent has been selected when the current
 * node is a table-structure element.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read for
 *                     start/end tags and 'data' for character and comment
 *                     tokens.
 * @return mixed False when an end tag "table" is ignored; otherwise the value
 *               returned by the handler the token was passed on to, or null
 *               when the token was fully handled here.
 */
private function inTable($token) {
    $clear = array('html', 'table');

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'colgroup') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'col') {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'colgroup',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('tbody', 'tfoot', 'thead'), true)) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    in_array($token['name'], array('td', 'th', 'tr'), true)) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'tbody',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'table') {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $ignored = $this->inTable(array(
            'name' => 'table',
            'type' => HTML5::ENDTAG
        )) === false;

        // An ignored end tag leaves the stack and mode untouched, so
        // re-dispatching the start tag would presumably land in this same
        // branch again; drop the token instead.
        if($ignored) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            return false;
        }

        /* Generate implied end tags. */
        $this->generateImpliedEndTags();

        /* Now, if the current node is not a table element, then this
        is a parse error. (Not reported.) */

        /* Pop elements from this stack until a table element has been
        popped from the stack. */
        while(true) {
            $current = end($this->stack)->nodeName;
            array_pop($this->stack);

            if($current === 'table') {
                break;
            }
        }

        /* Reset the insertion mode appropriately. */
        $this->resetInsertionMode();

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
    'tfoot', 'th', 'thead', 'tr'), true)) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if(in_array(end($this->stack)->nodeName,
        array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
            /* The foster parent element is the parent element of the last
            table element in the stack of open elements, if there is a
            table element and it has such a parent element. If there is no
            table element in the stack of open elements (innerHTML case),
            then the foster parent element is the first element in the
            stack of open elements (the html element). Otherwise, if there
            is a table element in the stack of open elements, but the last
            table element in the stack of open elements has no parent, or
            its parent node is not an element, then the foster parent
            element is the element before the last table element in the
            stack of open elements. */
            for($n = count($this->stack) - 1; $n >= 0; $n--) {
                if($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if(!isset($table)) {
                $this->foster_parent = $this->stack[0];

            // The parent must be an element to qualify; testing only for
            // non-null would make the "parent is not an element" rule dead.
            } elseif($table->parentNode !== null &&
            $table->parentNode->nodeType === XML_ELEMENT_NODE) {
                $this->foster_parent = $table->parentNode;

            } else {
                // $n still indexes the table found by the loop above.
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        // inBody() can return a tokeniser content model constant (e.g. for
        // "textarea"); hand it back to the caller instead of dropping it.
        return $this->inBody($token);
    }
}
/**
 * Handles one token while the insertion mode is "in caption".
 *
 * @param array $token Token array; 'type' is always read and 'name' is read
 *                     for start/end tags.
 * @return mixed The value returned by the handler the token was passed on
 *               to, or null when the token was handled or ignored here.
 */
private function inCaption($token) {
    /* An end tag whose tag name is "caption" */
    if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. (Not reported.) */

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) || ($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) {
        // Parse error. Ignore the token. ("td" was missing from this list,
        // so a stray </td> fell through to the "in body" rules.)

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        // inBody() can return a tokeniser content model constant; hand it
        // back to the caller instead of dropping it.
        return $this->inBody($token);
    }
}
/**
 * Handles one token while the insertion mode is "in column group".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read for
 *                     character and comment tokens, 'name' for tags.
 * @return mixed The result of inTable() when the token is passed on after an
 *               implied </colgroup>; null otherwise.
 */
private function inColumnGroup($token) {
    $kind = $token['type'];

    /* Whitespace-only character token (tab, LF, VT, FF or space):
    append it to the current node as text. */
    if($kind === HTML5::CHARACTR
    && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* Comment token: append a Comment node carrying the token's data
    to the current node. */
    if($kind === HTML5::COMMENT) {
        $note = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($note);
        return;
    }

    $opening = ($kind === HTML5::STARTTAG);
    $closing = ($kind === HTML5::ENDTAG);
    // Only tag tokens are asked for a name.
    $tag = ($opening || $closing) ? $token['name'] : null;

    /* <col>: insert the element and immediately pop it off the stack of
    open elements again. */
    if($opening && $tag === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    /* </colgroup>: ignored when the current node is the root html element
    (innerHTML case); otherwise pop the current node and switch the
    insertion mode to "in table". */
    if($closing && $tag === 'colgroup') {
        if(end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }
        return;
    }

    /* </col>: parse error, the token is ignored. */
    if($closing && $tag === 'col') {
        return;
    }

    /* Anything else: act as if </colgroup> had been seen, then reprocess
    the current token with the "in table" rules. */
    $impliedEnd = array(
        'name' => 'colgroup',
        'type' => HTML5::ENDTAG
    );
    $this->inColumnGroup($impliedEnd);

    return $this->inTable($token);
}
/**
 * Handles one token while the insertion mode is "in table body".
 *
 * @param array $token Token array; 'type' is always read and 'name' is read
 *                     for start/end tags.
 * @return mixed The value returned by the handler the token was passed on
 *               to, or null when the token was handled or ignored here.
 */
private function inTableBody($token) {
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(array(
            'name' => 'tr',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    // Fixed: the list spelled "tfoot" as "tfoor", and the "table" clause
    // tested for a start tag although the rule is about the end tag.
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'), true)) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(array(
                'name' => end($this->stack)->nodeName,
                'type' => HTML5::ENDTAG
            ));

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'), true)) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        // Pass the delegate's result back rather than dropping it.
        return $this->inTable($token);
    }
}
/**
 * Handles one token while the insertion mode is "in row".
 *
 * @param array $token Token array; 'type' is always read and 'name' is read
 *                     for start/end tags.
 * @return mixed The value returned by the handler the token was passed on
 *               to, or null when the token was handled or ignored here.
 */
private function inRow($token) {
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'), true)) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. */
        // Same test the end tag "tr" branch uses to decide to ignore.
        if(!$this->elementInScope('tr', true)) {
            return;
        }

        $this->inRow(array(
            'name' => 'tr',
            'type' => HTML5::ENDTAG
        ));

        // The implied </tr> switched the mode to "in table body", so the
        // token is reprocessed with those rules (not the "in cell" rules).
        return $this->inTableBody($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise, act as if an end tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        } else {
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            // Reprocess with the "in table body" rules, as above.
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'), true)) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        // Pass the delegate's result back rather than dropping it.
        return $this->inTable($token);
    }
}
/**
 * Handles one token while the insertion mode is "in cell".
 *
 * @param array $token Token from the tokeniser; 'type' is always read,
 *                     'name' is read for start/end tag tokens.
 * @return mixed Whatever inRow() returns when the token is reprocessed
 *               there; null when the token is handled (or ignored) here.
 */
private function inCell($token) {
    $type = $token['type'];

    /* An end tag whose tag name is one of: "td", "th" */
    if($type === HTML5::ENDTAG &&
    ($token['name'] === 'td' || $token['name'] === 'th')) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            return;
        }

        /* Generate implied end tags, except for elements with the same
        tag name as the token. */
        $this->generateImpliedEndTags(array($token['name']));

        /* A current node with a different tag name is a parse error at
        this point; it is not reported. */

        /* Pop elements from the stack until an element with the same
        tag name as the token has been popped. */
        do {
            $popped = array_pop($this->stack);
        } while($popped->nodeName !== $token['name']);

        /* Clear the list of active formatting elements up to the last
        marker. */
        $this->clearTheActiveFormattingElementsUpToTheLastMarker();

        /* Switch the insertion mode to "in row". (The current node
        will be a tr element at this point.) */
        $this->mode = self::IN_ROW;
        return;
    }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr".
    (The original listed this branch twice; the second copy could never
    be reached and has been dropped.) */
    if($type === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token. */
        if(!$this->elementInScope(array('td', 'th'), true)) {
            return;
        }

        /* Otherwise, close the cell and reprocess the current token. */
        $this->closeCell();
        return $this->inRow($token);
    }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    if($type === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html'), true)) {
        /* Parse error. Ignore the token. */
        return;
    }

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    if($type === HTML5::ENDTAG && in_array($token['name'],
    array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            return;
        }

        /* Otherwise, close the cell and reprocess the current token. */
        $this->closeCell();
        return $this->inRow($token);
    }

    /* Anything else: process the token as if the insertion mode was
    "in body". */
    $this->inBody($token);
}
/**
 * Handles one token while the insertion mode is "in select".
 *
 * @param array $token Token from the tokeniser; 'type' is always read,
 *                     'data' for character/comment tokens and 'name'
 *                     for tag tokens.
 * @return void
 */
private function inSelect($token) {
    $type = $token['type'];
    // Not every token carries a name; read it defensively so the
    // name-based branches below never touch an undefined key.
    $name = isset($token['name']) ? $token['name'] : null;

    /* A character token */
    if($type === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif($type === HTML5::STARTTAG && $name === 'option') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif($type === HTML5::STARTTAG && $name === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if(end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'optgroup',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif($type === HTML5::ENDTAG && $name === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        if($elements_in_stack >= 2 &&
        $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
        $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token.
        The current node is re-read here: the implied "option" end tag
        above may have shortened the stack, and the node's name (not the
        node object) is what has to be compared. */
        if(end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif($type === HTML5::ENDTAG && $name === 'option') {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if(end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif($type === HTML5::ENDTAG && $name === 'select') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if($this->elementInScope($name, true)) {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            do {
                $popped = array_pop($this->stack);
            } while($popped->nodeName !== 'select');

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    } elseif($type === HTML5::STARTTAG && $name === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(array(
            'name' => 'select',
            'type' => HTML5::ENDTAG
        ));

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif($type === HTML5::ENDTAG && in_array($name, array('caption',
    'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'), true)) {
        /* Parse error (not reported). */

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if($this->elementInScope($name, true)) {
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Handles one token while the insertion mode is "after body".
 *
 * @param array $token Token from the tokeniser.
 * @return mixed The result of inBody() when an unexpected token forces a
 *               switch back to "in body"; null otherwise.
 */
private function afterBody($token) {
    $kind = $token['type'];

    /* Character data made up solely of tab, line feed, line tabulation,
    form feed or space: handle it exactly as "in body" would. */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* A comment token: the Comment node is attached to the first element
    in the stack of open elements (the html element). */
    if($kind === HTML5::COMMENT) {
        $this->stack[0]->appendChild(
            $this->dom->createComment($token['data'])
        );
        return;
    }

    /* An end tag with the tag name "html": move on to the trailing end
    phase. (The innerHTML case, where this token would be ignored as a
    parse error, is not handled here.) */
    if($kind === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else: parse error. Set the insertion mode to "in body"
    and reprocess the token. */
    $this->mode = self::IN_BODY;
    return $this->inBody($token);
}
/**
 * Handles one token while the insertion mode is "in frameset".
 *
 * @param array $token Token from the tokeniser; 'type' is always read,
 *                     'data' for character/comment tokens and 'name'
 *                     for tag tokens.
 * @return void
 */
private function inFrameset($token) {
    $type = $token['type'];
    // Tokens without a name are possible here; the type is tested first
    // and the name is read defensively to avoid an undefined-key access.
    $name = isset($token['name']) ? $token['name'] : null;

    /* A character token consisting only of tab, line feed, line
    tabulation, form feed or space.
    NOTE(review): the accompanying spec text also lists U+000D CARRIAGE
    RETURN, which this pattern does not match - confirm whether CRs are
    normalised away before tokenisation. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    } elseif($type === HTML5::STARTTAG && $name === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif($type === HTML5::ENDTAG && $name === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName !== 'html') {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the current node is no longer a frameset element, change
            the insertion mode to "after frameset". While an enclosing
            frameset is still open (nested framesets) the mode must stay
            "in frameset". */
            if(end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif($type === HTML5::STARTTAG && $name === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif($type === HTML5::STARTTAG && $name === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Handles one token while the insertion mode is "after frameset".
 *
 * @param array $token Token from the tokeniser; 'type' is always read,
 *                     'data' for character/comment tokens and 'name'
 *                     for tag tokens.
 * @return void
 */
private function afterFrameset($token) {
    $type = $token['type'];
    // The type is tested before the name, and the name is read
    // defensively, so nameless tokens never cause an undefined-key access.
    $name = isset($token['name']) ? $token['name'] : null;

    /* A character token consisting only of tab, line feed, line
    tabulation, form feed or space. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    } elseif($type === HTML5::ENDTAG && $name === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif($type === HTML5::STARTTAG && $name === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes one token emitted after the main phase has ended (the
 * "trailing end" phase).
 *
 * @param array $token Token from the tokeniser.
 * @return mixed The result of mainPhase() when the token forces a switch
 *               back to the main phase; null otherwise.
 */
private function trailingEndPhase($token) {
    $type = $token['type'];

    // True only for character tokens made up solely of tab, line feed,
    // line tabulation, form feed or space.
    $is_whitespace = $type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']);

    /* A DOCTYPE token */
    if($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif($type === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A whitespace-only character token */
    } elseif($is_whitespace) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is NOT whitespace-only, or a start tag
    token, or an end tag token. (The original tested for whitespace here
    as well, so stray text after the document end never reached this
    branch.) */
    } elseif(($type === HTML5::CHARACTR && !$is_whitespace) ||
    $type === HTML5::STARTTAG || $type === HTML5::ENDTAG) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif($type === HTML5::EOF) {
        /* Nothing left to do. */
    }
}
/**
 * Creates a DOM element for a start tag token, appends it at the current
 * insertion point and pushes it onto the stack of open elements.
 *
 * @param array $token  Start tag token; 'name' and 'attr' (a list of
 *                      arrays with 'name' and 'value') are read.
 * @param bool  $append Accepted for interface compatibility; not used.
 * @param bool  $check  When true, the tag name is first reduced to
 *                      something libxml2 accepts.
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true, $check = false) {
    $tag_name = $token['name'];

    // Proprietary workaround for libxml2's limitations with tag names
    if($check) {
        // Slightly modified HTML5 tag-name modification,
        // removing anything that's not an ASCII letter, digit, or hyphen
        $tag_name = preg_replace('/[^a-z0-9-]/i', '', $tag_name);
        // Remove leading hyphens and numbers
        $tag_name = ltrim($tag_name, '-0..9');
        // In theory, this should never be needed, but just in case:
        // fall back to an arbitrary generic element
        if($tag_name === '') {
            $tag_name = 'span';
        }
    }

    $element = $this->dom->createElement($tag_name);

    // The first occurrence of an attribute wins; later duplicates are dropped.
    foreach($token['attr'] as $attribute) {
        if($element->hasAttribute($attribute['name'])) {
            continue;
        }
        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
/**
 * Creates a text node holding $data and appends it at the current
 * insertion point.
 *
 * @param string $data Character data for the new text node.
 * @return void
 */
private function insertText($data) {
    $this->appendToRealParent($this->dom->createTextNode($data));
}
/**
 * Creates a comment node holding $data and appends it at the current
 * insertion point.
 *
 * @param string $data Comment text.
 * @return void
 */
private function insertComment($data) {
    $this->appendToRealParent($this->dom->createComment($data));
}
/**
 * Appends a node at the proper place in the tree: under the current node
 * normally, or via the foster parent when one has been set.
 *
 * The foster parent is consumed by the call: it is reset to null once the
 * node has been placed.
 *
 * @param DOMNode $node Node to insert.
 * @return void
 */
private function appendToRealParent($node) {
    // Ordinary case: no foster parenting is pending, so the node goes
    // under the current node (the last entry of the stack).
    if($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* If the foster parent element is the parent element of the
    last table element in the stack of open elements, then the new
    node must be inserted immediately before the last table element
    in the stack of open elements in the foster parent element;
    otherwise, the new node must be appended to the foster parent
    element. */
    $last_table = null;

    for($i = count($this->stack) - 1; $i >= 0; $i--) {
        $candidate = $this->stack[$i];

        if($candidate->nodeName === 'table' &&
        $candidate->parentNode !== null) {
            $last_table = $candidate;
            break;
        }
    }

    if($last_table !== null &&
    $this->foster_parent->isSameNode($last_table->parentNode)) {
        $this->foster_parent->insertBefore($node, $last_table);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
/**
 * Tells whether the stack of open elements has a given element "in scope"
 * or, with $table set, "in table scope".
 *
 * @param string|array $el    Tag name to look for, or an array of tag
 *                            names (true if any one of them is in scope).
 * @param bool         $table True for the "in table scope" variant.
 * @return bool True when a matching element is found before a scope
 *              boundary is hit; false otherwise (including when the
 *              stack is empty).
 */
private function elementInScope($el, $table = false) {
    if(is_array($el)) {
        foreach($el as $element) {
            if($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the root. */
    for($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        /* 2. If node is the target node, terminate in a match state. */
        if($node->tagName === $el) {
            return true;
        }

        /* 3. Otherwise, if node is a table element, terminate in a failure
        state. */
        if($node->tagName === 'table') {
            return false;
        }

        /* 4. Otherwise, if the algorithm is the "has an element in scope"
        variant (rather than the "has an element in table scope" variant),
        and node is one of the following, terminate in a failure state.
        (The original applied this list to the table-scope variant
        instead, which is the opposite of what this step describes.) */
        if($table !== true && in_array($node->tagName, array('caption', 'td',
        'th', 'button', 'marquee', 'object'), true)) {
            return false;
        }

        /* 5. Otherwise, if node is an html element (root element), terminate
        in a failure state. (This can only happen if the node is the topmost
        node of the stack of open elements, and prevents the next step from
        being invoked if there are no more elements in the stack.) */
        if($node === $node->ownerDocument->documentElement) {
            return false;
        }

        /* Otherwise, continue with the previous entry in the stack. */
    }

    // Stack exhausted without a match: report failure explicitly rather
    // than falling off the end of the function.
    return false;
}
/**
 * Reopens formatting elements that are still listed as active but are no
 * longer on the stack of open elements, by cloning them under the
 * current node.
 *
 * Assumes the list of active formatting elements is indexed 0..n-1, as
 * the original implementation did.
 *
 * @return false|null False when there is nothing to reconstruct; null
 *                    after elements have been reconstructed.
 */
private function reconstructActiveFormattingElements() {
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if($formatting_elements === 0) {
        return false;
    }

    /* 2./3. Let entry be the last (most recently added) entry. If it is a
    marker, or an element that is in the stack of open elements, there is
    nothing to reconstruct. */
    $entry = end($this->a_formatting);

    if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* 4.-6. Rewind: step back through the list for as long as the
    previous entry is neither a marker nor an element that is still on
    the stack of open elements. $a ends up on the earliest entry that
    has to be reopened, never on the marker or open element that stopped
    the rewind (cloning that entry is what the original did when the
    rewind ended on one). */
    $a = $formatting_elements - 1;

    while($a > 0) {
        $previous = $this->a_formatting[$a - 1];

        if($previous === self::MARKER ||
        in_array($previous, $this->stack, true)) {
            break;
        }

        $a--;
    }

    /* 7.-11. Advance: reopen every entry from $a up to the last one. */
    for(; $a < $formatting_elements; $a++) {
        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $this->a_formatting[$a]->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;
    }
}
/**
 * Removes entries from the end of the list of active formatting elements
 * up to and including the most recent marker.
 *
 * If the list holds no marker at all it is simply emptied; the loop stops
 * as soon as nothing is left to remove.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker() {
    while(!empty($this->a_formatting)) {
        /* 1./2. Take the last (most recently added) entry and remove it
        from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if($entry === self::MARKER) {
            break;
        }
    }
}
/**
 * Generates implied end tags: pops the current node for as long as it is
 * a dd, dt, li, p, td, th or tr element that is not excluded.
 *
 * This implementation pops the elements directly rather than feeding
 * synthetic end tag tokens back through the parser.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 * @return void
 */
private function generateImpliedEndTags($exclude = array()) {
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    // end() yields false on an empty stack; stop there instead of reading
    // a property off a non-object.
    while(($current = end($this->stack)) !== false &&
    in_array($current->nodeName, $elements, true)) {
        array_pop($this->stack);
    }
}
/**
 * Classifies an element by looking its tag name up in the special,
 * scoping and formatting lists, in that order.
 *
 * @param DOMElement $node Element to classify.
 * @return int One of self::SPECIAL, self::SCOPING, self::FORMATTING or,
 *             when the tag name is in none of the lists, self::PHRASING.
 */
private function getElementCategory($node) {
    $tag = $node->tagName;

    if(in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if(in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    if(in_array($tag, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
/**
 * Clears the stack of open elements back to a table context: pops
 * elements until the current node's name is one of $elements.
 *
 * Popping anything here corresponds to a parse error, which is not
 * reported.
 *
 * @param array $elements Tag names at which popping stops.
 * @return void
 */
private function clearStackToTableContext($elements) {
    while(!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
/**
 * Resets the insertion mode appropriately: walks the stack of open
 * elements from the current node towards the root and picks the mode
 * that belongs to the first element it recognises.
 *
 * @return void
 */
private function resetInsertionMode() {
    // Insertion mode selected by element name (steps 4 to 13). Note that
    // a head element selects "in body", not "in head".
    $mode_for = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME,
    );

    for($n = count($this->stack) - 1; $n >= 0; $n--) {
        /* 2. Let node be the next node up the stack of open elements. */
        $node = $this->stack[$n];

        /* 3. "last" is true once node is the first node in the stack of
        open elements. (The innerHTML context-element substitution is not
        implemented.) */
        $last = $this->stack[0]->isSameNode($node);

        $name = $node->nodeName;

        /* 4.-13. A recognised element decides the mode. */
        if(isset($mode_for[$name])) {
            $this->mode = $mode_for[$name];
            return;
        }

        /* 14. If node is an html element, then: if the head element
        pointer is null, switch the insertion mode to "before head",
        otherwise, switch the insertion mode to "after head". */
        if($name === 'html') {
            $this->mode = ($this->head_pointer === null)
                ? self::BEFOR_HEAD
                : self::AFTER_HEAD;
            return;
        }

        /* 15. If last is true, then set the insertion mode to "in body"
        and abort these steps. */
        if($last) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
/**
 * Closes the currently open table cell, if there is one.
 *
 * If the stack of open elements has a td element in table scope (checked
 * first) or else a th element, the method acts as if an end tag token
 * with that tag name had been seen. With neither in scope it does nothing.
 *
 * @return void
 */
private function closeCell() {
    if($this->elementInScope('td', true)) {
        $cell = 'td';
    } elseif($this->elementInScope('th', true)) {
        $cell = 'th';
    } else {
        return;
    }

    $this->inCell(array(
        'name' => $cell,
        'type' => HTML5::ENDTAG
    ));
}
3902 public function save() {