Delete asserts, fixes #97.
[htmlpurifier/darkodev.git] / maintenance / PH5P.php
blob9d83dcbf5581b89a4020d8ab7f3d5d59e3364c15
1 <?php
2 class HTML5
// Input markup being tokenised (assigned in the constructor).
4 private $data;
// Index into $data of the current input character; the constructor starts it at -1.
5 private $char;
// Byte length of $data; a position equal to this value means end of file.
6 private $EOF;
// Name of the current tokeniser state ('data', 'tagOpen', ...); null stops the main loop.
7 private $state;
// Tree constructor (HTML5TreeConstructer) that receives every emitted token.
8 private $tree;
// Token currently under construction (tag, comment or DOCTYPE array).
9 private $token;
// Current content model flag: one of PCDATA, RCDATA, CDATA, PLAINTEXT below.
10 private $content_model;
// Escape flag consulted by the data state in the RCDATA/CDATA content models.
11 private $escape = false;
// Entity names looked up by entity(); most are listed both with and without
// the trailing semicolon.
12 private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
13 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
14 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
15 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
16 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
17 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
18 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
19 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
20 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
21 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
22 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
23 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
24 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
25 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
26 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
27 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
28 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
29 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
30 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
31 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
32 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
33 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
34 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
35 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
36 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
37 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
38 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
39 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
40 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
41 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
42 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
43 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
44 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
45 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
46 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
47 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
48 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
49 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
50 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
51 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
52 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
53 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
// Content model flags (values stored in $content_model).
55 const PCDATA = 0;
56 const RCDATA = 1;
57 const CDATA = 2;
58 const PLAINTEXT = 3;
// Token types (the 'type' key of every emitted token array).
60 const DOCTYPE = 0;
61 const STARTTAG = 1;
62 const ENDTAG = 2;
63 const COMMENT = 3;
64 const CHARACTR = 4;
65 const EOF = 5;
/**
 * Normalises the newlines of the input, initialises the tokeniser and runs
 * the state machine until a state handler sets the state to null.
 *
 * @param string $data Markup to tokenise.
 */
public function __construct($data)
{
    // Input-stream preprocessing: CRLF pairs and lone CRs both become LF.
    // (The previous code assigned the second replacement to a misspelled
    // variable, so lone CRs were never normalised, and passed null as the
    // replacement string.)
    $data = str_replace(array("\r\n", "\r"), "\n", $data);

    $this->data = $data;
    $this->char = -1;
    $this->EOF = strlen($data);
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;

    $this->state = 'data';

    // Each state handler is a method named "<state>State"; EOF() ends the
    // loop by setting the state to null.
    while ($this->state !== null) {
        $this->{$this->state.'State'}();
    }
}
/**
 * Returns whatever the tree constructor's save() produces.
 *
 * @return mixed Result of HTML5TreeConstructer::save().
 */
public function save()
{
    $tree = $this->tree;

    return $tree->save();
}
/**
 * Returns the character at the current position.
 *
 * @return string|false The current character, or false once the position
 *                      has reached the end of the input.
 */
private function char()
{
    if ($this->char >= $this->EOF) {
        return false;
    }

    return $this->data[$this->char];
}
/**
 * Reads from the input without moving the current position.
 *
 * @param int $s Offset of the first character to read.
 * @param int $l Number of characters to read; 0 (the default) reads the
 *               single character at $s.
 *
 * @return string|null The requested character(s), or null when the request
 *                     does not lie completely inside the input.
 */
private function character($s, $l = 0)
{
    // A negative offset would silently read from the end of the string.
    if ($s < 0) {
        return null;
    }

    if ($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // The substring occupies offsets $s .. $s + $l - 1, so it fits as long
    // as $s + $l does not exceed the input length. (The previous "<" test
    // rejected a substring ending exactly at the end of the input.)
    return ($s + $l <= $this->EOF) ? substr($this->data, $s, $l) : null;
}
/**
 * Returns the longest run of characters, beginning at $start, that all
 * belong to the given character class.
 *
 * @param string $char_class Body of a PCRE character class, e.g. 'A-Za-z'
 *                           or '^>' (inserted between '[' and ']').
 * @param int    $start      Offset in the input at which the run begins.
 *
 * @return string The matching run, or '' when the character at $start is
 *                not in the class or $start is at/after the end of input.
 */
private function characters($char_class, $start)
{
    $rest = substr($this->data, $start);

    if ($rest === false || $rest === '') {
        return '';
    }

    // preg_replace() hands back its subject unchanged when the pattern does
    // not match, which used to return the entire remaining input for an
    // empty run. preg_match() lets the no-match case be reported as ''.
    if (preg_match('#^['.$char_class.']+#', $rest, $match)) {
        return $match[0];
    }

    return '';
}
/**
 * Data state: consumes the next input character and either switches to the
 * entity / tag-open states, maintains the escape flag, signals end of file,
 * or emits character tokens.
 *
 * Unlike the specification, ordinary text is emitted as one token per run
 * of characters rather than one token per character.
 */
private function dataState()
{
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    // The escape flag only applies in the RCDATA and CDATA content models.
    $escapable = ($this->content_model === self::RCDATA
        || $this->content_model === self::CDATA);

    if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif ($char === '-') {
        /* If the escape flag is false and the last four characters,
        including this one, are "<!--", set the escape flag to true. The
        four characters occupy offsets char-3 .. char (the previous code
        started one character too early and could never match). */
        if ($escapable && $this->escape === false && $this->char >= 3
            && substr($this->data, $this->char - 3, 4) === '<!--') {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    /* U+003C LESS-THAN SIGN (<) */
    } elseif ($char === '<' && ($this->content_model === self::PCDATA ||
    ($escapable && $this->escape === false))) {
        /* PCDATA, or RCDATA/CDATA with the escape flag off: switch to the
        tag open state. Otherwise: treat it as per the "anything else"
        entry below. */
        $this->state = 'tagOpen';

    /* U+003E GREATER-THAN SIGN (>) */
    } elseif ($char === '>') {
        /* If the escape flag is true and the last three characters,
        including this one, are "-->", set the escape flag to false. They
        occupy offsets char-2 .. char (the previous code looked forward
        from the current character instead of backward). */
        if ($escapable && $this->escape === true && $this->char >= 2
            && substr($this->data, $this->char - 2, 3) === '-->') {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif ($this->char === $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif ($this->content_model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => substr($this->data, $this->char)
        ));

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
        otherwise would also be treated as a character token and emit it
        as a single character token. Stay in the data state. */

        // In RCDATA/CDATA the run must also stop at '-' and '>' so the
        // escape-flag checks above get to see those characters.
        $stops = $escapable ? '<&->' : '<&';

        // When the current character is itself a '<' or '&' that has to be
        // treated as text (e.g. '&' in CDATA), the run length is 0; emit at
        // least that one character so the position always advances.
        // Without this the tokeniser re-read the same character forever.
        $len = max(1, strcspn($this->data, $stops, $this->char));
        $char = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

        $this->state = 'data';
    }
}
/**
 * Entity data state: tries to consume an entity after a '&' seen in the
 * data state and emits the result as a character token.
 */
private function entityDataState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, emit a U+0026 AMPERSAND character token.
    // Otherwise, emit the character token that was returned.
    $char = (!$entity) ? '&' : $entity;

    // emitToken() reads $token['type'], so the character has to be wrapped
    // in a token array like every other character token (the bare string
    // that used to be passed here is not a valid token).
    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => $char
    ));

    // Finally, switch to the data state.
    $this->state = 'data';
}
/**
 * Tag open state: decides what a '<' seen in the data state introduces.
 *
 * In the RCDATA/CDATA content models only "</" is treated as markup; in
 * PCDATA the next character selects between a markup declaration, an end
 * tag, a start tag, a bogus comment or literal text.
 */
private function tagOpenState()
{
    $model = $this->content_model;

    if ($model === self::RCDATA || $model === self::CDATA) {
        // A following '/' is consumed and leads to the close tag open
        // state; anything else means the '<' was plain text.
        if ($this->character($this->char + 1) === '/') {
            $this->char++;
            $this->state = 'closeTagOpen';
            return;
        }

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '<'
        ));
        $this->state = 'data';
        return;
    }

    if ($model !== self::PCDATA) {
        // No handling is defined for any other content model.
        return;
    }

    // PCDATA: consume the next input character.
    $this->char++;
    $char = $this->char();

    // "<!" - markup declaration (comment or DOCTYPE).
    if ($char === '!') {
        $this->state = 'markupDeclarationOpen';
        return;
    }

    // "</" - end tag.
    if ($char === '/') {
        $this->state = 'closeTagOpen';
        return;
    }

    // ASCII letter - start a new start tag token with a lowercased name.
    // It is emitted later, once the whole tag has been read.
    if (preg_match('/^[A-Za-z]$/', $char)) {
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::STARTTAG,
            'attr' => array()
        );
        $this->state = 'tagName';
        return;
    }

    // "<>" - parse error; both characters are emitted as text.
    if ($char === '>') {
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '<>'
        ));
        $this->state = 'data';
        return;
    }

    // "<?" - parse error; handled as a bogus comment.
    if ($char === '?') {
        $this->state = 'bogusComment';
        return;
    }

    // Anything else - parse error: emit the '<' as text and reconsume the
    // current character in the data state.
    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => '<'
    ));
    $this->char--;
    $this->state = 'data';
}
/**
 * Close tag open state: entered after "</".
 *
 * In the RCDATA/CDATA content models the "</" only counts as an end tag
 * when it is followed by the name of the node on top of the tree
 * constructor's stack and then by an allowed terminator; otherwise "</" is
 * emitted as text. In every other case the next character is consumed and
 * starts an end tag token, is ignored ('>'), or leads to a bogus comment.
 */
private function closeTagOpenState()
{
    $candidate = strtolower($this->characters('A-Za-z', $this->char + 1));
    $matches_open = count($this->tree->stack) > 0
        && $candidate === end($this->tree->stack)->nodeName;

    $raw_text = ($this->content_model === self::RCDATA
        || $this->content_model === self::CDATA);

    // Decide whether the "</" has to be treated as literal text.
    $as_text = false;
    if ($raw_text) {
        if (!$matches_open) {
            $as_text = true;
        } else {
            // Character right after the candidate tag name.
            $follower = $this->character($this->char + 1 + strlen($candidate));
            $as_text = !preg_match('/[\t\n\x0b\x0c >\/]/', $follower)
                || $this->EOF === $this->char;
        }
    }

    if ($as_text) {
        // Parse error: emit "</" and carry on in the data state.
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->state = 'data';
        return;
    }

    // PCDATA, or the name matched: consume the next input character.
    $this->char++;
    $char = $this->char();

    // ASCII letter - start a new end tag token with a lowercased name; it
    // is emitted later by the tag name state.
    if (preg_match('/^[A-Za-z]$/', $char)) {
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::ENDTAG
        );
        $this->state = 'tagName';
        return;
    }

    // "</>" - parse error; nothing is emitted.
    if ($char === '>') {
        $this->state = 'data';
        return;
    }

    // EOF - parse error: emit "</" and reconsume the EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else - parse error; treat as a bogus comment.
    $this->state = 'bogusComment';
}
/**
 * Tag name state: accumulates the (lowercased) tag name of the current tag
 * token until whitespace, '/', '>' or end of file is reached.
 */
private function tagNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Tab, LF, VT, FF or space ends the name; attributes may follow.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '>' completes the tag: emit it and return to the data state.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // EOF - parse error: emit the tag as it stands and reconsume the EOF
    // in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // '/' - parse error unless it is a permitted slash.
    if ($char === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // Anything else belongs to the tag name.
    $this->token['name'] .= strtolower($char);
    $this->state = 'tagName';
}
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * until an attribute name starts, the tag is closed, or the input ends.
 */
private function beforeAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Whitespace (tab, LF, VT, FF, space) and '/' (a parse error unless it
    // is a permitted slash) both keep the tokeniser in this state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char) || $char === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '>' completes the tag: emit it and return to the data state.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // EOF - parse error: emit the tag and reconsume the EOF in the data
    // state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute whose name begins with the
    // current character (lowercased) and whose value is not yet set.
    $attribute = array(
        'name' => strtolower($char),
        'value' => null
    );
    $this->token['attr'][] = $attribute;
    $this->state = 'attributeName';
}
/**
 * Attribute name state: accumulates the (lowercased) name of the most
 * recently started attribute of the current tag token.
 */
private function attributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Whitespace ends the name: switch to the after attribute name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'afterAttributeName';
        return;
    }

    // '=' - a value follows.
    if ($char === '=') {
        $this->state = 'beforeAttributeValue';
        return;
    }

    // '>' completes the tag: emit it and return to the data state.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // '/' not directly followed by '>' - parse error unless it is a
    // permitted slash; look for the next attribute.
    if ($char === '/' && $this->character($this->char + 1) !== '>') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // EOF - parse error: emit the tag and reconsume the EOF in the data
    // state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is appended to the name of the newest attribute.
    $newest = count($this->token['attr']) - 1;
    $this->token['attr'][$newest]['name'] .= strtolower($char);
    $this->state = 'attributeName';
}
/**
 * After attribute name state: entered when whitespace follows an attribute
 * name; decides whether a value, another attribute or the end of the tag
 * comes next.
 */
private function afterAttributeNameState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Further whitespace: stay in this state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'afterAttributeName';
        return;
    }

    // '=' - a value follows.
    if ($char === '=') {
        $this->state = 'beforeAttributeValue';
        return;
    }

    // '>' completes the tag: emit it and return to the data state.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // '/' not directly followed by '>' - parse error unless it is a
    // permitted slash; look for the next attribute.
    if ($char === '/' && $this->character($this->char + 1) !== '>') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // EOF - parse error: emit the tag and reconsume the EOF in the data
    // state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute whose name begins with the
    // current character (lowercased) and whose value is not yet set.
    $attribute = array(
        'name' => strtolower($char),
        'value' => null
    );
    $this->token['attr'][] = $attribute;
    $this->state = 'attributeName';
}
/**
 * Before attribute value state: entered after the '=' of an attribute;
 * skips whitespace and selects the quoted or unquoted value state.
 */
private function beforeAttributeValueState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif ($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif ($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. This branch was missing: input ending
        right after '=' fell through to "anything else" and moved the
        position past the end of the input, where no state ever detects
        EOF again. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
/**
 * Attribute value (double-quoted) state: accumulates the value of the
 * newest attribute until the closing '"' or end of file.
 */
private function attributeValueDoubleQuotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Closing quote: the value is complete.
    if ($char === '"') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '&' may start an entity; the handler is invoked directly, so the
    // tokeniser remains in this state afterwards.
    if ($char === '&') {
        $this->entityInAttributeValueState('double');
        return;
    }

    // EOF - parse error: emit the tag and reconsume the EOF in the data
    // state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is appended to the value of the newest attribute.
    $newest = count($this->token['attr']) - 1;
    $this->token['attr'][$newest]['value'] .= $char;
    $this->state = 'attributeValueDoubleQuoted';
}
/**
 * Attribute value (single-quoted) state: accumulates the value of the
 * newest attribute until the closing apostrophe or end of file.
 */
private function attributeValueSingleQuotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Closing apostrophe: the value is complete.
    if ($char === '\'') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '&' may start an entity; the handler is invoked directly, so the
    // tokeniser remains in this state afterwards.
    if ($char === '&') {
        $this->entityInAttributeValueState('single');
        return;
    }

    // EOF - parse error: emit the tag and reconsume the EOF in the data
    // state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is appended to the value of the newest attribute.
    $newest = count($this->token['attr']) - 1;
    $this->token['attr'][$newest]['value'] .= $char;
    $this->state = 'attributeValueSingleQuoted';
}
/**
 * Attribute value (unquoted) state: accumulates the value of the newest
 * attribute until whitespace, '>' or end of file.
 */
private function attributeValueUnquotedState()
{
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif ($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the entity in attribute value state. */
        $this->entityInAttributeValueState('non');

    } elseif ($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif ($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. This branch was missing, so an
        unterminated unquoted value (e.g. "<a href=x" at the end of the
        input) stayed in this state forever. ">=" also recovers when the
        position has already moved past the end of the input. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
/**
 * Entity in attribute value: tries to consume an entity after a '&' found
 * inside an attribute value and appends the result to that value.
 *
 * The attribute value states call this method directly (passing a quoting
 * hint that is not used here), so $this->state is left untouched and the
 * tokeniser continues in the calling state.
 */
private function entityInAttributeValueState()
{
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, append a U+0026 AMPERSAND character to the
    // current attribute's value. Otherwise, append the returned character.
    $char = (!$entity)
        ? '&'
        : $entity;

    // The text belongs to the attribute that is being built. It used to be
    // passed to emitToken() as a bare string, which dropped it from the
    // attribute value and handed the tree constructor an invalid token.
    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
/**
 * Bogus comment state: emits everything from the current character up to
 * (not including) the next '>' - or up to the end of the input - as a
 * comment token, then returns to the data state.
 */
private function bogusCommentState()
{
    /* Consume every character up to the first U+003E GREATER-THAN SIGN
    character (>) or the end of the file (EOF), whichever comes first. The
    comment starts with the character that caused the switch into this
    state. (If the comment was started by the end of the file (EOF), the
    token is empty.) */

    // Measure the run with strcspn() so that an empty comment such as "<!>"
    // yields '' (going through characters() returned the whole remaining
    // input when the first character was already '>').
    $from = min($this->char, $this->EOF);
    $len = strcspn($this->data, '>', $from);
    $data = (string) substr($this->data, $from, $len);

    $this->emitToken(array(
        'data' => $data,
        'type' => self::COMMENT
    ));

    // Leave the position on the '>' (or at EOF); the data state advances
    // past it when it consumes its next character.
    $this->char = $from + $len;

    /* Switch to the data state. */
    $this->state = 'data';

    /* If the end of the file was reached, reconsume the EOF character. */
    if ($this->char >= $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
/**
 * Markup declaration open state: entered after "<!"; distinguishes a
 * comment ("--"), a DOCTYPE declaration and everything else (bogus
 * comment).
 */
private function markupDeclarationOpenState()
{
    $next = $this->char + 1;

    // "<!--": consume both hyphens and start a comment token with no data.
    if ($this->character($next, 2) === '--') {
        $this->char += 2;
        $this->state = 'comment';
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        return;
    }

    // "<!DOCTYPE" in any letter case: consume the seven letters.
    if (strtolower($this->character($next, 7)) === 'doctype') {
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    // Parse error: the next character consumed, if any, becomes the first
    // character of a bogus comment.
    $this->char++;
    $this->state = 'bogusComment';
}
/**
 * Comment state: appends characters to the comment token's data until a
 * '-' or the end of the input is seen.
 */
private function commentState()
{
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    // '-' may begin the closing "-->".
    if ($char === '-') {
        $this->state = 'commentDash';
        return;
    }

    // EOF - parse error: emit the comment and reconsume the EOF in the
    // data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is comment text; the state does not change.
    $this->token['data'] .= $char;
}
/**
 * Comment dash state: entered after a single '-' inside a comment.
 */
private function commentDashState()
{
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    // Second '-': the comment may be ending.
    if ($char === '-') {
        $this->state = 'commentEnd';
        return;
    }

    // EOF - parse error: emit the comment and reconsume the EOF in the
    // data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // The lone '-' was ordinary comment text: append it together with the
    // current character and go back to the comment state.
    $this->token['data'] .= '-'.$char;
    $this->state = 'comment';
}
/**
 * Comment end state: entered after "--" inside a comment.
 */
private function commentEndState()
{
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    // "-->" closes the comment: emit it and return to the data state.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // A further '-': one hyphen becomes comment text, and the tokeniser
    // keeps waiting for '>' in this state.
    if ($char === '-') {
        $this->token['data'] .= '-';
        return;
    }

    // EOF: emit the comment and reconsume the EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else: the "--" was comment text after all.
    $this->token['data'] .= '--'.$char;
    $this->state = 'comment';
}
/**
 * DOCTYPE state: entered after "<!DOCTYPE"; consumes one whitespace
 * character if present and moves on to the before DOCTYPE name state.
 */
private function doctypeState()
{
    /* Consume the next input character: */
    $this->char++;

    // A non-whitespace character is handed back so that the next state
    // consumes it again.
    if (!preg_match('/^[\t\n\x0b\x0c ]$/', $this->char())) {
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
/**
 * Before DOCTYPE name state: skips whitespace, then starts a DOCTYPE token
 * with the first character of the name (lowercase ASCII letters are
 * uppercased). A '>' or end of file here yields a nameless DOCTYPE token
 * flagged as erroneous.
 */
private function beforeDoctypeNameState()
{
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    // Whitespace: stay in the before DOCTYPE name state.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        return;
    }

    // Token emitted when the declaration ends before any name was seen.
    $nameless = array(
        'name' => null,
        'type' => self::DOCTYPE,
        'error' => true
    );

    if ($char === '>') {
        $this->emitToken($nameless);
        $this->state = 'data';
        return;
    }

    if ($this->char === $this->EOF) {
        // Reconsume the EOF in the data state.
        $this->emitToken($nameless);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // First character of the name. The token starts out flagged as an
    // error; the DOCTYPE name state re-evaluates the flag.
    $first = preg_match('/^[a-z]$/', $char) ? strtoupper($char) : $char;

    $this->token = array(
        'name' => $first,
        'type' => self::DOCTYPE,
        'error' => true
    );
    $this->state = 'doctypeName';
}
/**
 * DOCTYPE name state: accumulates the DOCTYPE name (lowercase ASCII letters
 * are uppercased) until whitespace, '>' or end of file, and keeps the
 * token's error flag in sync with the name.
 */
private function doctypeNameState()
{
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    $at_eof = ($this->char === $this->EOF);

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Whitespace ends the name.
        $this->state = 'AfterDoctypeName';

    } elseif ($char === '>') {
        // End of the declaration: emit the token.
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif (preg_match('/^[a-z]$/', $char)) {
        $this->token['name'] .= strtoupper($char);

    } elseif ($at_eof) {
        // Emit the token and reconsume the EOF in the data state.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        $this->token['name'] .= $char;
    }

    // Runs after every branch (including the ones that already emitted the
    // token): the token is erroneous unless its name is exactly "HTML".
    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
/**
 * After DOCTYPE name state: only whitespace, '>' or end of file may follow
 * the name; anything else marks the DOCTYPE token as erroneous.
 */
private function afterDoctypeNameState()
{
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    // Whitespace: nothing to do, the state does not change.
    if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        return;
    }

    // '>' ends the declaration: emit the token.
    if ($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // EOF: emit the token and reconsume the EOF in the data state.
    if ($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Unexpected content after the name.
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
/**
 * Bogus DOCTYPE state: discards characters until '>' or end of file, then
 * emits the DOCTYPE token.
 */
private function bogusDoctypeState()
{
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    $at_eof = ($this->char === $this->EOF);

    if ($char !== '>' && !$at_eof) {
        // Stay in the bogus DOCTYPE state; the character is dropped.
        return;
    }

    $this->emitToken($this->token);

    if ($char !== '>') {
        // End of file: reconsume it in the data state.
        $this->char--;
    }

    $this->state = 'data';
}
/**
 * Attempts to consume an entity. On entry the current position is on the
 * U+0026 AMPERSAND (&) that may start it.
 *
 * On success the position is left on the last character of the entity
 * (including a terminating ';' when present) and the decoded text is
 * returned. On failure no characters are consumed - the position stays on
 * the '&' - and false is returned.
 *
 * @return string|false Decoded entity text (as produced by
 *                      html_entity_decode()), or false when no entity
 *                      starts at the current position.
 */
private function entity()
{
    $start = $this->char;

    // The behaviour depends on the character immediately after the '&'.
    if ($this->character($start + 1) === '#') {
        // Numeric reference. The character AFTER the '#' selects
        // hexadecimal ("&#x41;") or decimal ("&#65;") notation. (The
        // previous code looked at the '#' a second time, so the hex branch
        // was never taken, and it did not consume the digits.)
        $marker = $this->character($start + 2);
        $is_hex = ($marker === 'x' || $marker === 'X');

        $digits_at = $start + ($is_hex ? 3 : 2);
        $allowed = $is_hex ? '0123456789ABCDEFabcdef' : '0123456789';

        // Consume as many characters as match the range given above.
        $len = ($digits_at < $this->EOF)
            ? strspn($this->data, $allowed, $digits_at)
            : 0;

        if ($len === 0) {
            // Parse error: nothing is consumed and nothing is returned.
            $this->char = $start;
            return false;
        }

        $this->char = $digits_at + $len - 1;

        // A terminating semicolon is part of the reference.
        if ($this->character($this->char + 1) === ';') {
            $this->char++;
        }

        $entity = '#'.($is_hex ? 'x' : '').substr($this->data, $digits_at, $len);

    } else {
        // Named reference: consume the maximum number of characters that
        // case-sensitively match an identifier in the entities table.
        $name_at = $start + 1;
        $run = ($name_at < $this->EOF)
            ? strspn(
                $this->data,
                '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz;',
                $name_at
            )
            : 0;

        // Try the longest candidate first so that "&lt;" matches "lt;"
        // rather than "lt" and "&notin;" is not cut short at "not". (The
        // previous code stopped at the first, i.e. shortest, match.)
        $entity = null;
        for ($c = $run; $c >= 1; $c--) {
            $id = substr($this->data, $name_at, $c);

            if (in_array($id, $this->entities, true)) {
                $entity = $id;
                break;
            }
        }

        if ($entity === null) {
            // Parse error: nothing is consumed and nothing is returned.
            $this->char = $start;
            return false;
        }

        $this->char = $start + strlen($entity);

        // Table entries may already end in ';'; strip it so the reference
        // built below carries exactly one semicolon.
        $entity = rtrim($entity, ';');
    }

    // Return the character(s) corresponding to the entity.
    return html_entity_decode('&'.$entity.';', ENT_QUOTES, 'UTF-8');
}
/**
 * Hands a token to the tree constructor and updates the tokeniser's
 * content model from the outcome.
 *
 * When the tree constructor answers with an integer, that integer becomes
 * the new content model. Otherwise an end tag token resets the content
 * model to PCDATA; any other token leaves it untouched.
 *
 * @param array $token Token to emit; its 'type' key is read here.
 */
private function emitToken($token)
{
    $result = $this->tree->emitToken($token);

    if (is_int($result)) {
        $this->content_model = $result;
        return;
    }

    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
/**
 * Clears the tokeniser state and passes an end-of-file token to the tree
 * constructor.
 */
private function EOF()
{
    $this->state = null;

    $eof = array('type' => self::EOF);
    $this->tree->emitToken($eof);
}
1142 class HTML5TreeConstructer
/** Stack of open elements; the root html element is pushed first. */
public $stack = array();

/** Current tree construction phase (one of the *_PHASE constants). */
private $phase;

/** Current insertion mode, consulted while in the main phase. */
private $mode;

/** DOMDocument that receives the nodes being built. */
private $dom;

/** NOTE(review): not used in the visible part of the file; presumably the
    foster parent element for misplaced table content - confirm. */
private $foster_parent = null;

/** List of active formatting elements; holds element nodes and
    self::MARKER entries. */
private $a_formatting = array();

/** The head element pointer, or null when no head element was created. */
private $head_pointer = null;

/** The form element pointer, or null when no form element is open. */
private $form_pointer = null;

/* Tag name lists. NOTE(review): presumably these back the element
   categories (SCOPING, FORMATTING, SPECIAL) declared below - confirm
   against getElementCategory(). */
private $scoping = array(
    'button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th'
);

private $formatting = array(
    'a', 'b', 'big', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike',
    'strong', 'tt', 'u'
);

private $special = array(
    'address', 'area', 'base', 'basefont', 'bgsound', 'blockquote', 'body',
    'br', 'center', 'col', 'colgroup', 'dd', 'dir', 'div', 'dl', 'dt',
    'embed', 'fieldset', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3',
    'h4', 'h5', 'h6', 'head', 'hr', 'iframe', 'image', 'img', 'input',
    'isindex', 'li', 'link', 'listing', 'menu', 'meta', 'noembed',
    'noframes', 'noscript', 'ol', 'optgroup', 'option', 'p', 'param',
    'plaintext', 'pre', 'script', 'select', 'spacer', 'style', 'tbody',
    'textarea', 'tfoot', 'thead', 'title', 'tr', 'ul', 'wbr'
);

// The different phases.
const INIT_PHASE = 0;
const ROOT_PHASE = 1;
const MAIN_PHASE = 2;
const END_PHASE  = 3;

// The different insertion modes for the main phase.
const BEFOR_HEAD = 0;
const IN_HEAD    = 1;
const AFTER_HEAD = 2;
const IN_BODY    = 3;
const IN_TABLE   = 4;
const IN_CAPTION = 5;
const IN_CGROUP  = 6;
const IN_TBODY   = 7;
const IN_ROW     = 8;
const IN_CELL    = 9;
const IN_SELECT  = 10;
const AFTER_BODY = 11;
const IN_FRAME   = 12;
const AFTR_FRAME = 13;

// The different types of elements.
const SPECIAL    = 0;
const SCOPING    = 1;
const FORMATTING = 2;
const PHRASING   = 3;

// Entry stored in the list of active formatting elements as a marker.
const MARKER = 0;
/**
 * Starts tree construction in the initial phase with the "before head"
 * insertion mode, and prepares an empty UTF-8 DOMDocument that keeps
 * white space, substitutes entities and has strict error checking
 * switched off.
 */
public function __construct()
{
    $document = new DOMDocument;

    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;
    $this->dom = $document;
}
/**
 * Process tag tokens: routes a token to the handler of the current tree
 * construction phase.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Whatever the phase handler returns; null when the phase
 *               matches none of the known phases.
 */
public function emitToken($token)
{
    switch ($this->phase) {
        case self::INIT_PHASE:
            return $this->initPhase($token);

        case self::ROOT_PHASE:
            return $this->rootElementPhase($token);

        case self::MAIN_PHASE:
            return $this->mainPhase($token);

        case self::END_PHASE:
            return $this->trailingEndPhase($token);
    }
}
/**
 * Handles a token while the tree constructor is in its initial phase.
 *
 *  - An erroneous DOCTYPE, a comment, a start tag, an end tag, an
 *    end-of-file token or non-whitespace character data switches to the
 *    root element phase and reprocesses the token there.
 *  - A correct DOCTYPE appends a DocumentType node named "HTML" to the
 *    document and switches to the root element phase.
 *  - Whitespace-only character data is appended to the document.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Result of rootElementPhase() when the token is
 *               reprocessed, null otherwise.
 */
private function initPhase($token)
{
    /* Initially, the tree construction stage must handle each token
    emitted from the tokenisation stage as follows: */

    /* A DOCTYPE token that is marked as being in error
    A comment token
    A start tag token
    An end tag token
    A character token that is not one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE
    An end-of-file token */
    if ((isset($token['error']) && $token['error']) ||
        $token['type'] === HTML5::COMMENT ||
        $token['type'] === HTML5::STARTTAG ||
        $token['type'] === HTML5::ENDTAG ||
        $token['type'] === HTML5::EOF ||
        ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
        !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))) {
        /* This specification does not define how to handle this case. In
        particular, user agents may ignore the entirety of this specification
        altogether for such documents, and instead invoke special parse modes
        with a greater emphasis on backwards compatibility. */

        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);

    /* A DOCTYPE token marked as being correct */
    } elseif (isset($token['error']) && !$token['error']) {
        /* Append a DocumentType node to the Document node, with the name
        attribute set to the name given in the DOCTYPE token (which will be
        "HTML"), and the other attributes specific to DocumentType objects
        set to null, empty lists, or the empty string as appropriate. */
        // DOMDocumentType cannot be built with "new"; the DOM extension
        // creates doctype nodes through DOMImplementation.
        $implementation = new DOMImplementation();
        $doctype = $implementation->createDocumentType('HTML');

        if ($doctype instanceof DOMDocumentType) {
            $this->dom->appendChild($doctype);
        }

        /* Then, switch to the root element phase of the tree construction
        stage. */
        $this->phase = self::ROOT_PHASE;

    /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
    } elseif (isset($token['data']) && preg_match('/^[\t\n\x0b\x0c ]+$/',
        $token['data'])) {
        /* Append that character to the Document node. */
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
    }
}
/**
 * Handles a token while the tree constructor waits for the root element.
 *
 *  - DOCTYPE tokens are ignored (parse error).
 *  - Comments are appended to the document as Comment nodes.
 *  - Whitespace-only character data is appended to the document.
 *  - Other character data, start tags, end tags and end-of-file create the
 *    html element, push it on the stack of open elements, switch to the
 *    main phase and reprocess the token there.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Result of mainPhase() when the token is reprocessed, null
 *               otherwise.
 */
private function rootElementPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token */
    if ($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.
        return;
    }

    /* A comment token */
    if ($type === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $this->dom->appendChild($this->dom->createComment($token['data']));
        return;
    }

    if ($type === HTML5::CHARACTR) {
        /* A character token that is one of U+0009 CHARACTER TABULATION,
        U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
        or U+0020 SPACE */
        if (preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
            /* Append that character to the Document node. */
            $this->dom->appendChild($this->dom->createTextNode($token['data']));
            return;
        }

        // Any other character data falls through to root creation.
    } elseif ($type !== HTML5::STARTTAG &&
        $type !== HTML5::ENDTAG &&
        $type !== HTML5::EOF) {
        // No rule for this kind of token in this phase.
        return;
    }

    /* Create an HTMLElement node with the tag name html, in the HTML
    namespace. Append it to the Document object. Switch to the main
    phase and reprocess the current token. */
    $root = $this->dom->createElement('html');
    $this->dom->appendChild($root);
    $this->stack[] = $root;

    $this->phase = self::MAIN_PHASE;
    return $this->mainPhase($token);
}
/**
 * Handles a token during the main phase.
 *
 *  - DOCTYPE tokens are ignored (parse error).
 *  - An "html" start tag copies every attribute that the root element
 *    ($this->stack[0]) does not have yet onto that element.
 *  - End-of-file generates implied end tags.
 *  - Everything else is forwarded to the handler of the current insertion
 *    mode.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Result of the insertion mode handler, null for the tokens
 *               handled here directly.
 */
private function mainPhase($token)
{
    /* Tokens in the main phase must be handled as follows: */
    $type = $token['type'];

    /* A DOCTYPE token */
    if ($type === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.
        return;
    }

    /* A start tag token with the tag name "html" */
    if ($type === HTML5::STARTTAG && $token['name'] === 'html') {
        /* If this start tag token was not the first start tag token, then
        it is a parse error. */

        /* For each attribute on the token, check to see if the attribute
        is already present on the top element of the stack of open elements.
        If it is not, add the attribute and its corresponding value to that
        element. */
        foreach ($token['attr'] as $attr) {
            if (!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }

        return;
    }

    /* An end-of-file token */
    if ($type === HTML5::EOF) {
        /* Generate implied end tags. */
        $this->generateImpliedEndTags();
        return;
    }

    /* Anything else: depends on the insertion mode. */
    // The former "case self::END_PHASE" was removed from this switch:
    // END_PHASE is a phase, not an insertion mode, and its value (3) is
    // the same as IN_BODY, so that case could never be selected.
    switch ($this->mode) {
        case self::BEFOR_HEAD:
            return $this->beforeHead($token);

        case self::IN_HEAD:
            return $this->inHead($token);

        case self::AFTER_HEAD:
            return $this->afterHead($token);

        case self::IN_BODY:
            return $this->inBody($token);

        case self::IN_TABLE:
            return $this->inTable($token);

        case self::IN_CAPTION:
            return $this->inCaption($token);

        case self::IN_CGROUP:
            return $this->inColumnGroup($token);

        case self::IN_TBODY:
            return $this->inTableBody($token);

        case self::IN_ROW:
            return $this->inRow($token);

        case self::IN_CELL:
            return $this->inCell($token);

        case self::IN_SELECT:
            return $this->inSelect($token);

        case self::AFTER_BODY:
            return $this->afterBody($token);

        case self::IN_FRAME:
            return $this->inFrameset($token);

        case self::AFTR_FRAME:
            return $this->afterFrameset($token);
    }
}
/**
 * Handles a token in the "before head" insertion mode.
 *
 *  - Whitespace-only character data is appended to the current node.
 *  - Comments are appended to the current node.
 *  - A "head" start tag inserts the element, records it as the head
 *    element pointer and switches to the "in head" insertion mode.
 *  - Any other start tag, an "html" end tag or other character data acts
 *    as if a "head" start tag had been seen, then reprocesses the token
 *    with inHead().
 *  - Everything else (in particular any other end tag) is ignored.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Result of inHead() when the token is reprocessed, null
 *               otherwise.
 */
private function beforeHead($token)
{
    /* Handle the token as follows: */
    $type = $token['type'];

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);
        return;
    }

    /* A comment token */
    if ($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);
        return;
    }

    /* A start tag token with the tag name "head" */
    if ($type === HTML5::STARTTAG && $token['name'] === 'head') {
        /* Create an element for the token, append the new element to the
        current node, push it onto the stack of open elements and set the
        head element pointer to this new element node. */
        $this->head_pointer = $this->insertElement($token);

        /* Change the insertion mode to "in head". */
        $this->mode = self::IN_HEAD;
        return;
    }

    /* Any other start tag token. Or an end tag with the tag name "html".
    Or a character token that is not one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE. */
    $impliesHead = $type === HTML5::STARTTAG
        || ($type === HTML5::ENDTAG && $token['name'] === 'html')
        || ($type === HTML5::CHARACTR &&
            !preg_match('/^[\t\n\x0b\x0c ]$/', $token['data']));

    if ($impliesHead) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $impliedHead = array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        );
        $this->beforeHead($impliedHead);

        return $this->inHead($token);
    }

    /* Any other end tag: parse error. Ignore the token. */
}
/**
 * Handles a token in the "in head" insertion mode.
 *
 * Character data that is whitespace-only, or that arrives while the
 * current node is a title, style or script element, is appended to the
 * current node. Comments are appended to the current node. The title,
 * style, script, base, link and meta start tags create an element under
 * the head element (or under the current node when there is no head
 * element pointer). Closing title/style/script pops the current node.
 * Closing head, or any token not listed here, leaves the "in head"
 * insertion mode.
 *
 * The head element pointer may be null (innerHTML case); every branch
 * that reads it checks for that first.
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed A content model constant (HTML5::PCDATA, HTML5::RCDATA or
 *               HTML5::CDATA) when the tokeniser has to switch content
 *               model, the result of afterHead() when the token is
 *               reprocessed, null otherwise.
 */
private function inHead($token)
{
    /* Handle the token as follows: */
    $type = $token['type'];
    $has_head = ($this->head_pointer !== null);

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE.

    THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
    or script element, append the character to the current node regardless
    of its content. */
    if (($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
        $type === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
        array('title', 'style', 'script'), true))) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "title", "style" or "script" */
    } elseif ($type === HTML5::ENDTAG &&
        in_array($token['name'], array('title', 'style', 'script'), true)) {
        array_pop($this->stack);
        return HTML5::PCDATA;

    /* A start tag with the tag name "title" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'title') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($has_head) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the RCDATA state. */
        return HTML5::RCDATA;

    /* A start tag with the tag name "style" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'style') {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($has_head) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "script" */
    } elseif ($type === HTML5::STARTTAG && $token['name'] === 'script') {
        /* Create an element for the token. As for title and style, fall
        back to the current node when there is no head element pointer
        instead of dereferencing null. */
        if ($has_head) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif ($type === HTML5::STARTTAG && in_array($token['name'],
        array('base', 'link', 'meta'), true)) {
        /* Create an element for the token and append the new element to the
        node pointed to by the head element pointer, or, if that is null
        (innerHTML case), to the current node. */
        if ($has_head) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
            array_pop($this->stack);

        } else {
            // NOTE(review): unlike the branch above, the element is not
            // popped here; confirm against insertElement() whether it is
            // left on the stack of open elements.
            $this->insertElement($token);
        }

    /* An end tag with the tag name "head" */
    } elseif ($type === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is a head element, pop the current node off
        the stack of open elements. Otherwise, this is a parse error. */
        if ($has_head && $this->head_pointer->isSameNode(end($this->stack))) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif (($type === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($type === HTML5::ENDTAG && $token['name'] !== 'html')) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is a head element, act as if an end tag
        token with the tag name "head" had been seen. */
        if ($has_head && $this->head_pointer->isSameNode(end($this->stack))) {
            $this->inHead(array(
                'name' => 'head',
                'type' => HTML5::ENDTAG
            ));

        /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
/**
 * Handles a token in the "after head" insertion mode.
 *
 *  - Whitespace-only character data and comments are appended to the
 *    current node.
 *  - A "body" start tag inserts the element and switches to "in body".
 *  - A "frameset" start tag inserts the element and switches to
 *    "in frameset".
 *  - A base, link, meta, script, style or title start tag switches back to
 *    "in head" and is reprocessed there.
 *  - Anything else acts as if a "body" start tag had been seen and is then
 *    reprocessed with inBody().
 *
 * @param array $token Token produced by the tokeniser.
 * @return mixed Result of inHead() or inBody() when the token is
 *               reprocessed, null otherwise.
 */
private function afterHead($token)
{
    /* Handle the token as follows: */
    $type = $token['type'];

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);
        return;
    }

    /* A comment token */
    if ($type === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data attribute
        set to the data given in the comment token. */
        $this->insertComment($token['data']);
        return;
    }

    if ($type === HTML5::STARTTAG) {
        /* A start tag token with the tag name "body" */
        if ($token['name'] === 'body') {
            /* Insert a body element for the token. */
            $this->insertElement($token);

            /* Change the insertion mode to "in body". */
            $this->mode = self::IN_BODY;
            return;
        }

        /* A start tag token with the tag name "frameset" */
        if ($token['name'] === 'frameset') {
            /* Insert a frameset element for the token. */
            $this->insertElement($token);

            /* Change the insertion mode to "in frameset". */
            $this->mode = self::IN_FRAME;
            return;
        }

        /* A start tag token whose tag name is one of: "base", "link", "meta",
        "script", "style", "title" */
        $headOnly = array('base', 'link', 'meta', 'script', 'style', 'title');

        if (in_array($token['name'], $headOnly)) {
            /* Parse error. Switch the insertion mode back to "in head" and
            reprocess the token. */
            $this->mode = self::IN_HEAD;
            return $this->inHead($token);
        }
    }

    /* Anything else */
    /* Act as if a start tag token with the tag name "body" and no
    attributes had been seen, and then reprocess the current token. */
    $impliedBody = array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    );
    $this->afterHead($impliedBody);

    return $this->inBody($token);
}
1605 private function inBody($token)
1607 /* Handle the token as follows: */
1609 switch($token['type']) {
1610 /* A character token */
1611 case HTML5::CHARACTR:
1612 /* Reconstruct the active formatting elements, if any. */
1613 $this->reconstructActiveFormattingElements();
1615 /* Append the token's character to the current node. */
1616 $this->insertText($token['data']);
1617 break;
1619 /* A comment token */
1620 case HTML5::COMMENT:
1621 /* Append a Comment node to the current node with the data
1622 attribute set to the data given in the comment token. */
1623 $this->insertComment($token['data']);
1624 break;
1626 case HTML5::STARTTAG:
1627 switch($token['name']) {
1628 /* A start tag token whose tag name is one of: "script",
1629 "style" */
1630 case 'script': case 'style':
1631 /* Process the token as if the insertion mode had been "in
1632 head". */
1633 return $this->inHead($token);
1634 break;
1636 /* A start tag token whose tag name is one of: "base", "link",
1637 "meta", "title" */
1638 case 'base': case 'link': case 'meta': case 'title':
1639 /* Parse error. Process the token as if the insertion mode
1640 had been "in head". */
1641 return $this->inHead($token);
1642 break;
1644 /* A start tag token with the tag name "body" */
1645 case 'body':
1646 /* Parse error. If the second element on the stack of open
1647 elements is not a body element, or, if the stack of open
1648 elements has only one node on it, then ignore the token.
1649 (innerHTML case) */
1650 if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1651 // Ignore
1653 /* Otherwise, for each attribute on the token, check to see
1654 if the attribute is already present on the body element (the
1655 second element) on the stack of open elements. If it is not,
1656 add the attribute and its corresponding value to that
1657 element. */
1658 } else {
1659 foreach($token['attr'] as $attr) {
1660 if(!$this->stack[1]->hasAttribute($attr['name'])) {
1661 $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1665 break;
1667 /* A start tag whose tag name is one of: "address",
1668 "blockquote", "center", "dir", "div", "dl", "fieldset",
1669 "listing", "menu", "ol", "p", "ul" */
1670 case 'address': case 'blockquote': case 'center': case 'dir':
1671 case 'div': case 'dl': case 'fieldset': case 'listing':
1672 case 'menu': case 'ol': case 'p': case 'ul':
1673 /* If the stack of open elements has a p element in scope,
1674 then act as if an end tag with the tag name p had been
1675 seen. */
1676 if($this->elementInScope('p')) {
1677 $this->emitToken(array(
1678 'name' => 'p',
1679 'type' => HTML5::ENDTAG
1683 /* Insert an HTML element for the token. */
1684 $this->insertElement($token);
1685 break;
1687 /* A start tag whose tag name is "form" */
1688 case 'form':
1689 /* If the form element pointer is not null, ignore the
1690 token with a parse error. */
1691 if($this->form_pointer !== null) {
1692 // Ignore.
1694 /* Otherwise: */
1695 } else {
1696 /* If the stack of open elements has a p element in
1697 scope, then act as if an end tag with the tag name p
1698 had been seen. */
1699 if($this->elementInScope('p')) {
1700 $this->emitToken(array(
1701 'name' => 'p',
1702 'type' => HTML5::ENDTAG
1706 /* Insert an HTML element for the token, and set the
1707 form element pointer to point to the element created. */
1708 $element = $this->insertElement($token);
1709 $this->form_pointer = $element;
1711 break;
1713 /* A start tag whose tag name is "li", "dd" or "dt" */
1714 case 'li': case 'dd': case 'dt':
1715 /* If the stack of open elements has a p element in scope,
1716 then act as if an end tag with the tag name p had been
1717 seen. */
1718 if($this->elementInScope('p')) {
1719 $this->emitToken(array(
1720 'name' => 'p',
1721 'type' => HTML5::ENDTAG
1725 $stack_length = count($this->stack) - 1;
1727 for($n = $stack_length; 0 <= $n; $n--) {
1728 /* 1. Initialise node to be the current node (the
1729 bottommost node of the stack). */
1730 $stop = false;
1731 $node = $this->stack[$n];
1732 $cat = $this->getElementCategory($node->tagName);
1734 /* 2. If node is an li, dd or dt element, then pop all
1735 the nodes from the current node up to node, including
1736 node, then stop this algorithm. */
1737 if($token['name'] === $node->tagName || ($token['name'] !== 'li'
1738 && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1739 for($x = $stack_length; $x >= $n ; $x--) {
1740 array_pop($this->stack);
1743 break;
1746 /* 3. If node is not in the formatting category, and is
1747 not in the phrasing category, and is not an address or
1748 div element, then stop this algorithm. */
1749 if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1750 $node->tagName !== 'address' && $node->tagName !== 'div') {
1751 break;
1755 /* Finally, insert an HTML element with the same tag
1756 name as the token's. */
1757 $this->insertElement($token);
1758 break;
1760 /* A start tag token whose tag name is "plaintext" */
1761 case 'plaintext':
1762 /* If the stack of open elements has a p element in scope,
1763 then act as if an end tag with the tag name p had been
1764 seen. */
1765 if($this->elementInScope('p')) {
1766 $this->emitToken(array(
1767 'name' => 'p',
1768 'type' => HTML5::ENDTAG
1772 /* Insert an HTML element for the token. */
1773 $this->insertElement($token);
1775 return HTML5::PLAINTEXT;
1776 break;
1778 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1779 "h5", "h6" */
1780 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1781 /* If the stack of open elements has a p element in scope,
1782 then act as if an end tag with the tag name p had been seen. */
1783 if($this->elementInScope('p')) {
1784 $this->emitToken(array(
1785 'name' => 'p',
1786 'type' => HTML5::ENDTAG
1790 /* If the stack of open elements has in scope an element whose
1791 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1792 this is a parse error; pop elements from the stack until an
1793 element with one of those tag names has been popped from the
1794 stack. */
1795 while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1796 array_pop($this->stack);
1799 /* Insert an HTML element for the token. */
1800 $this->insertElement($token);
1801 break;
1803 /* A start tag whose tag name is "a" */
1804 case 'a':
1805 /* If the list of active formatting elements contains
1806 an element whose tag name is "a" between the end of the
1807 list and the last marker on the list (or the start of
1808 the list if there is no marker on the list), then this
1809 is a parse error; act as if an end tag with the tag name
1810 "a" had been seen, then remove that element from the list
1811 of active formatting elements and the stack of open
1812 elements if the end tag didn't already remove it (it
1813 might not have if the element is not in table scope). */
1814 $leng = count($this->a_formatting);
1816 for($n = $leng - 1; $n >= 0; $n--) {
1817 if($this->a_formatting[$n] === self::MARKER) {
1818 break;
1820 } elseif($this->a_formatting[$n]->nodeName === 'a') {
1821 $this->emitToken(array(
1822 'name' => 'a',
1823 'type' => HTML5::ENDTAG
1825 break;
1829 /* Reconstruct the active formatting elements, if any. */
1830 $this->reconstructActiveFormattingElements();
1832 /* Insert an HTML element for the token. */
1833 $el = $this->insertElement($token);
1835 /* Add that element to the list of active formatting
1836 elements. */
1837 $this->a_formatting[] = $el;
1838 break;
1840 /* A start tag whose tag name is one of: "b", "big", "em", "font",
1841 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1842 case 'b': case 'big': case 'em': case 'font': case 'i':
1843 case 'nobr': case 's': case 'small': case 'strike':
1844 case 'strong': case 'tt': case 'u':
1845 /* Reconstruct the active formatting elements, if any. */
1846 $this->reconstructActiveFormattingElements();
1848 /* Insert an HTML element for the token. */
1849 $el = $this->insertElement($token);
1851 /* Add that element to the list of active formatting
1852 elements. */
1853 $this->a_formatting[] = $el;
1854 break;
1856 /* A start tag token whose tag name is "button" */
1857 case 'button':
1858 /* If the stack of open elements has a button element in scope,
1859 then this is a parse error; act as if an end tag with the tag
1860 name "button" had been seen, then reprocess the token. (We don't
1861 do that. Unnecessary.) */
1862 if($this->elementInScope('button')) {
1863 $this->inBody(array(
1864 'name' => 'button',
1865 'type' => HTML5::ENDTAG
1869 /* Reconstruct the active formatting elements, if any. */
1870 $this->reconstructActiveFormattingElements();
1872 /* Insert an HTML element for the token. */
1873 $this->insertElement($token);
1875 /* Insert a marker at the end of the list of active
1876 formatting elements. */
1877 $this->a_formatting[] = self::MARKER;
1878 break;
1880 /* A start tag token whose tag name is one of: "marquee", "object" */
1881 case 'marquee': case 'object':
1882 /* Reconstruct the active formatting elements, if any. */
1883 $this->reconstructActiveFormattingElements();
1885 /* Insert an HTML element for the token. */
1886 $this->insertElement($token);
1888 /* Insert a marker at the end of the list of active
1889 formatting elements. */
1890 $this->a_formatting[] = self::MARKER;
1891 break;
1893 /* A start tag token whose tag name is "xmp" */
1894 case 'xmp':
1895 /* Reconstruct the active formatting elements, if any. */
1896 $this->reconstructActiveFormattingElements();
1898 /* Insert an HTML element for the token. */
1899 $this->insertElement($token);
1901 /* Switch the content model flag to the CDATA state. */
1902 return HTML5::CDATA;
1903 break;
1905 /* A start tag whose tag name is "table" */
1906 case 'table':
1907 /* If the stack of open elements has a p element in scope,
1908 then act as if an end tag with the tag name p had been seen. */
1909 if($this->elementInScope('p')) {
1910 $this->emitToken(array(
1911 'name' => 'p',
1912 'type' => HTML5::ENDTAG
1916 /* Insert an HTML element for the token. */
1917 $this->insertElement($token);
1919 /* Change the insertion mode to "in table". */
1920 $this->mode = self::IN_TABLE;
1921 break;
1923 /* A start tag whose tag name is one of: "area", "basefont",
1924 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1925 case 'area': case 'basefont': case 'bgsound': case 'br':
1926 case 'embed': case 'img': case 'param': case 'spacer':
1927 case 'wbr':
1928 /* Reconstruct the active formatting elements, if any. */
1929 $this->reconstructActiveFormattingElements();
1931 /* Insert an HTML element for the token. */
1932 $this->insertElement($token);
1934 /* Immediately pop the current node off the stack of open elements. */
1935 array_pop($this->stack);
1936 break;
1938 /* A start tag whose tag name is "hr" */
1939 case 'hr':
1940 /* If the stack of open elements has a p element in scope,
1941 then act as if an end tag with the tag name p had been seen. */
1942 if($this->elementInScope('p')) {
1943 $this->emitToken(array(
1944 'name' => 'p',
1945 'type' => HTML5::ENDTAG
1949 /* Insert an HTML element for the token. */
1950 $this->insertElement($token);
1952 /* Immediately pop the current node off the stack of open elements. */
1953 array_pop($this->stack);
1954 break;
1956 /* A start tag whose tag name is "image" */
1957 case 'image':
1958 /* Parse error. Change the token's tag name to "img" and
1959 reprocess it. (Don't ask.) */
1960 $token['name'] = 'img';
1961 return $this->inBody($token);
1962 break;
1964 /* A start tag whose tag name is "input" */
1965 case 'input':
1966 /* Reconstruct the active formatting elements, if any. */
1967 $this->reconstructActiveFormattingElements();
1969 /* Insert an input element for the token. */
1970 $element = $this->insertElement($token, false);
1972 /* If the form element pointer is not null, then associate the
1973 input element with the form element pointed to by the form
1974 element pointer. */
1975 $this->form_pointer !== null
1976 ? $this->form_pointer->appendChild($element)
1977 : end($this->stack)->appendChild($element);
1979 /* Pop that input element off the stack of open elements. */
1980 array_pop($this->stack);
1981 break;
1983 /* A start tag whose tag name is "isindex" */
1984 case 'isindex':
1985 /* Parse error. */
1986 // w/e
1988 /* If the form element pointer is not null,
1989 then ignore the token. */
1990 if($this->form_pointer === null) {
1991 /* Act as if a start tag token with the tag name "form" had
1992 been seen. */
1993 $this->inBody(array(
1994 'name' => 'body',
1995 'type' => HTML5::STARTTAG,
1996 'attr' => array()
1999 /* Act as if a start tag token with the tag name "hr" had
2000 been seen. */
2001 $this->inBody(array(
2002 'name' => 'hr',
2003 'type' => HTML5::STARTTAG,
2004 'attr' => array()
2007 /* Act as if a start tag token with the tag name "p" had
2008 been seen. */
2009 $this->inBody(array(
2010 'name' => 'p',
2011 'type' => HTML5::STARTTAG,
2012 'attr' => array()
2015 /* Act as if a start tag token with the tag name "label"
2016 had been seen. */
2017 $this->inBody(array(
2018 'name' => 'label',
2019 'type' => HTML5::STARTTAG,
2020 'attr' => array()
2023 /* Act as if a stream of character tokens had been seen. */
2024 $this->insertText('This is a searchable index. '.
2025 'Insert your search keywords here: ');
2027 /* Act as if a start tag token with the tag name "input"
2028 had been seen, with all the attributes from the "isindex"
2029 token, except with the "name" attribute set to the value
2030 "isindex" (ignoring any explicit "name" attribute). */
2031 $attr = $token['attr'];
2032 $attr[] = array('name' => 'name', 'value' => 'isindex');
2034 $this->inBody(array(
2035 'name' => 'input',
2036 'type' => HTML5::STARTTAG,
2037 'attr' => $attr
2040 /* Act as if a stream of character tokens had been seen
2041 (see below for what they should say). */
2042 $this->insertText('This is a searchable index. '.
2043 'Insert your search keywords here: ');
2045 /* Act as if an end tag token with the tag name "label"
2046 had been seen. */
2047 $this->inBody(array(
2048 'name' => 'label',
2049 'type' => HTML5::ENDTAG
2052 /* Act as if an end tag token with the tag name "p" had
2053 been seen. */
2054 $this->inBody(array(
2055 'name' => 'p',
2056 'type' => HTML5::ENDTAG
2059 /* Act as if a start tag token with the tag name "hr" had
2060 been seen. */
2061 $this->inBody(array(
2062 'name' => 'hr',
2063 'type' => HTML5::ENDTAG
2066 /* Act as if an end tag token with the tag name "form" had
2067 been seen. */
2068 $this->inBody(array(
2069 'name' => 'form',
2070 'type' => HTML5::ENDTAG
2073 break;
2075 /* A start tag whose tag name is "textarea" */
2076 case 'textarea':
2077 $this->insertElement($token);
2079 /* Switch the tokeniser's content model flag to the
2080 RCDATA state. */
2081 return HTML5::RCDATA;
2082 break;
2084 /* A start tag whose tag name is one of: "iframe", "noembed",
2085 "noframes" */
2086 case 'iframe': case 'noembed': case 'noframes':
2087 $this->insertElement($token);
2089 /* Switch the tokeniser's content model flag to the CDATA state. */
2090 return HTML5::CDATA;
2091 break;
2093 /* A start tag whose tag name is "select" */
2094 case 'select':
2095 /* Reconstruct the active formatting elements, if any. */
2096 $this->reconstructActiveFormattingElements();
2098 /* Insert an HTML element for the token. */
2099 $this->insertElement($token);
2101 /* Change the insertion mode to "in select". */
2102 $this->mode = self::IN_SELECT;
2103 break;
2105 /* A start or end tag whose tag name is one of: "caption", "col",
2106 "colgroup", "frame", "frameset", "head", "option", "optgroup",
2107 "tbody", "td", "tfoot", "th", "thead", "tr". */
2108 case 'caption': case 'col': case 'colgroup': case 'frame':
2109 case 'frameset': case 'head': case 'option': case 'optgroup':
2110 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2111 case 'tr':
2112 // Parse error. Ignore the token.
2113 break;
2115 /* A start or end tag whose tag name is one of: "event-source",
2116 "section", "nav", "article", "aside", "header", "footer",
2117 "datagrid", "command" */
2118 case 'event-source': case 'section': case 'nav': case 'article':
2119 case 'aside': case 'header': case 'footer': case 'datagrid':
2120 case 'command':
2121 // Work in progress!
2122 break;
2124 /* A start tag token not covered by the previous entries */
2125 default:
2126 /* Reconstruct the active formatting elements, if any. */
2127 $this->reconstructActiveFormattingElements();
2129 $this->insertElement($token);
2130 break;
2132 break;
2134 case HTML5::ENDTAG:
2135 switch($token['name']) {
2136 /* An end tag with the tag name "body" */
2137 case 'body':
2138 /* If the second element in the stack of open elements is
2139 not a body element, this is a parse error. Ignore the token.
2140 (innerHTML case) */
2141 if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2142 // Ignore.
2144 /* If the current node is not the body element, then this
2145 is a parse error. */
2146 } elseif(end($this->stack)->nodeName !== 'body') {
2147 // Parse error.
2150 /* Change the insertion mode to "after body". */
2151 $this->mode = self::AFTER_BODY;
2152 break;
2154 /* An end tag with the tag name "html" */
2155 case 'html':
2156 /* Act as if an end tag with tag name "body" had been seen,
2157 then, if that token wasn't ignored, reprocess the current
2158 token. */
2159 $this->inBody(array(
2160 'name' => 'body',
2161 'type' => HTML5::ENDTAG
2164 return $this->afterBody($token);
2165 break;
2167 /* An end tag whose tag name is one of: "address", "blockquote",
2168 "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2169 "ol", "pre", "ul" */
2170 case 'address': case 'blockquote': case 'center': case 'dir':
2171 case 'div': case 'dl': case 'fieldset': case 'listing':
2172 case 'menu': case 'ol': case 'pre': case 'ul':
2173 /* If the stack of open elements has an element in scope
2174 with the same tag name as that of the token, then generate
2175 implied end tags. */
2176 if($this->elementInScope($token['name'])) {
2177 $this->generateImpliedEndTags();
2179 /* Now, if the current node is not an element with
2180 the same tag name as that of the token, then this
2181 is a parse error. */
2182 // w/e
2184 /* If the stack of open elements has an element in
2185 scope with the same tag name as that of the token,
2186 then pop elements from this stack until an element
2187 with that tag name has been popped from the stack. */
2188 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2189 if($this->stack[$n]->nodeName === $token['name']) {
2190 $n = -1;
2193 array_pop($this->stack);
2196 break;
2198 /* An end tag whose tag name is "form" */
2199 case 'form':
2200 /* If the stack of open elements has an element in scope
2201 with the same tag name as that of the token, then generate
2202 implied end tags. */
2203 if($this->elementInScope($token['name'])) {
2204 $this->generateImpliedEndTags();
2208 if(end($this->stack)->nodeName !== $token['name']) {
2209 /* Now, if the current node is not an element with the
2210 same tag name as that of the token, then this is a parse
2211 error. */
2212 // w/e
2214 } else {
2215 /* Otherwise, if the current node is an element with
2216 the same tag name as that of the token pop that element
2217 from the stack. */
2218 array_pop($this->stack);
2221 /* In any case, set the form element pointer to null. */
2222 $this->form_pointer = null;
2223 break;
2225 /* An end tag whose tag name is "p" */
2226 case 'p':
2227 /* If the stack of open elements has a p element in scope,
2228 then generate implied end tags, except for p elements. */
2229 if($this->elementInScope('p')) {
2230 $this->generateImpliedEndTags(array('p'));
2232 /* If the current node is not a p element, then this is
2233 a parse error. */
2234 // k
2236 /* If the stack of open elements has a p element in
2237 scope, then pop elements from this stack until the stack
2238 no longer has a p element in scope. */
2239 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2240 if($this->elementInScope('p')) {
2241 array_pop($this->stack);
2243 } else {
2244 break;
2248 break;
2250 /* An end tag whose tag name is "dd", "dt", or "li" */
2251 case 'dd': case 'dt': case 'li':
2252 /* If the stack of open elements has an element in scope
2253 whose tag name matches the tag name of the token, then
2254 generate implied end tags, except for elements with the
2255 same tag name as the token. */
2256 if($this->elementInScope($token['name'])) {
2257 $this->generateImpliedEndTags(array($token['name']));
2259 /* If the current node is not an element with the same
2260 tag name as the token, then this is a parse error. */
2261 // w/e
2263 /* If the stack of open elements has an element in scope
2264 whose tag name matches the tag name of the token, then
2265 pop elements from this stack until an element with that
2266 tag name has been popped from the stack. */
2267 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2268 if($this->stack[$n]->nodeName === $token['name']) {
2269 $n = -1;
2272 array_pop($this->stack);
2275 break;
2277 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2278 "h5", "h6" */
2279 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2280 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2282 /* If the stack of open elements has in scope an element whose
2283 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2284 generate implied end tags. */
2285 if($this->elementInScope($elements)) {
2286 $this->generateImpliedEndTags();
2288 /* Now, if the current node is not an element with the same
2289 tag name as that of the token, then this is a parse error. */
2290 // w/e
2292 /* If the stack of open elements has in scope an element
2293 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2294 "h6", then pop elements from the stack until an element
2295 with one of those tag names has been popped from the stack. */
2296 while($this->elementInScope($elements)) {
2297 array_pop($this->stack);
2300 break;
2302 /* An end tag whose tag name is one of: "a", "b", "big", "em",
2303 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2304 case 'a': case 'b': case 'big': case 'em': case 'font':
2305 case 'i': case 'nobr': case 's': case 'small': case 'strike':
2306 case 'strong': case 'tt': case 'u':
2307 /* 1. Let the formatting element be the last element in
2308 the list of active formatting elements that:
2309 * is between the end of the list and the last scope
2310 marker in the list, if any, or the start of the list
2311 otherwise, and
2312 * has the same tag name as the token.
2314 while(true) {
2315 for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2316 if($this->a_formatting[$a] === self::MARKER) {
2317 break;
2319 } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2320 $formatting_element = $this->a_formatting[$a];
2321 $in_stack = in_array($formatting_element, $this->stack, true);
2322 $fe_af_pos = $a;
2323 break;
2327 /* If there is no such node, or, if that node is
2328 also in the stack of open elements but the element
2329 is not in scope, then this is a parse error. Abort
2330 these steps. The token is ignored. */
2331 if(!isset($formatting_element) || ($in_stack &&
2332 !$this->elementInScope($token['name']))) {
2333 break;
2335 /* Otherwise, if there is such a node, but that node
2336 is not in the stack of open elements, then this is a
2337 parse error; remove the element from the list, and
2338 abort these steps. */
2339 } elseif(isset($formatting_element) && !$in_stack) {
2340 unset($this->a_formatting[$fe_af_pos]);
2341 $this->a_formatting = array_merge($this->a_formatting);
2342 break;
2345 /* 2. Let the furthest block be the topmost node in the
2346 stack of open elements that is lower in the stack
2347 than the formatting element, and is not an element in
2348 the phrasing or formatting categories. There might
2349 not be one. */
2350 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2351 $length = count($this->stack);
2353 for($s = $fe_s_pos + 1; $s < $length; $s++) {
2354 $category = $this->getElementCategory($this->stack[$s]->nodeName);
2356 if($category !== self::PHRASING && $category !== self::FORMATTING) {
2357 $furthest_block = $this->stack[$s];
2361 /* 3. If there is no furthest block, then the UA must
2362 skip the subsequent steps and instead just pop all
2363 the nodes from the bottom of the stack of open
2364 elements, from the current node up to the formatting
2365 element, and remove the formatting element from the
2366 list of active formatting elements. */
2367 if(!isset($furthest_block)) {
2368 for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2369 array_pop($this->stack);
2372 unset($this->a_formatting[$fe_af_pos]);
2373 $this->a_formatting = array_merge($this->a_formatting);
2374 break;
2377 /* 4. Let the common ancestor be the element
2378 immediately above the formatting element in the stack
2379 of open elements. */
2380 $common_ancestor = $this->stack[$fe_s_pos - 1];
2382 /* 5. If the furthest block has a parent node, then
2383 remove the furthest block from its parent node. */
2384 if($furthest_block->parentNode !== null) {
2385 $furthest_block->parentNode->removeChild($furthest_block);
2388 /* 6. Let a bookmark note the position of the
2389 formatting element in the list of active formatting
2390 elements relative to the elements on either side
2391 of it in the list. */
2392 $bookmark = $fe_af_pos;
2394 /* 7. Let node and last node be the furthest block.
2395 Follow these steps: */
2396 $node = $furthest_block;
2397 $last_node = $furthest_block;
2399 while(true) {
2400 for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2401 /* 7.1 Let node be the element immediately
2402 prior to node in the stack of open elements. */
2403 $node = $this->stack[$n];
2405 /* 7.2 If node is not in the list of active
2406 formatting elements, then remove node from
2407 the stack of open elements and then go back
2408 to step 1. */
2409 if(!in_array($node, $this->a_formatting, true)) {
2410 unset($this->stack[$n]);
2411 $this->stack = array_merge($this->stack);
2413 } else {
2414 break;
2418 /* 7.3 Otherwise, if node is the formatting
2419 element, then go to the next step in the overall
2420 algorithm. */
2421 if($node === $formatting_element) {
2422 break;
2424 /* 7.4 Otherwise, if last node is the furthest
2425 block, then move the aforementioned bookmark to
2426 be immediately after the node in the list of
2427 active formatting elements. */
2428 } elseif($last_node === $furthest_block) {
2429 $bookmark = array_search($node, $this->a_formatting, true) + 1;
2432 /* 7.5 If node has any children, perform a
2433 shallow clone of node, replace the entry for
2434 node in the list of active formatting elements
2435 with an entry for the clone, replace the entry
2436 for node in the stack of open elements with an
2437 entry for the clone, and let node be the clone. */
2438 if($node->hasChildNodes()) {
2439 $clone = $node->cloneNode();
2440 $s_pos = array_search($node, $this->stack, true);
2441 $a_pos = array_search($node, $this->a_formatting, true);
2443 $this->stack[$s_pos] = $clone;
2444 $this->a_formatting[$a_pos] = $clone;
2445 $node = $clone;
2448 /* 7.6 Insert last node into node, first removing
2449 it from its previous parent node if any. */
2450 if($last_node->parentNode !== null) {
2451 $last_node->parentNode->removeChild($last_node);
2454 $node->appendChild($last_node);
2456 /* 7.7 Let last node be node. */
2457 $last_node = $node;
2460 /* 8. Insert whatever last node ended up being in
2461 the previous step into the common ancestor node,
2462 first removing it from its previous parent node if
2463 any. */
2464 if($last_node->parentNode !== null) {
2465 $last_node->parentNode->removeChild($last_node);
2468 $common_ancestor->appendChild($last_node);
2470 /* 9. Perform a shallow clone of the formatting
2471 element. */
2472 $clone = $formatting_element->cloneNode();
2474 /* 10. Take all of the child nodes of the furthest
2475 block and append them to the clone created in the
2476 last step. */
2477 while($furthest_block->hasChildNodes()) {
2478 $child = $furthest_block->firstChild;
2479 $furthest_block->removeChild($child);
2480 $clone->appendChild($child);
2483 /* 11. Append that clone to the furthest block. */
2484 $furthest_block->appendChild($clone);
2486 /* 12. Remove the formatting element from the list
2487 of active formatting elements, and insert the clone
2488 into the list of active formatting elements at the
2489 position of the aforementioned bookmark. */
2490 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2491 unset($this->a_formatting[$fe_af_pos]);
2492 $this->a_formatting = array_merge($this->a_formatting);
2494 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2495 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2496 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2498 /* 13. Remove the formatting element from the stack
2499 of open elements, and insert the clone into the stack
2500 of open elements immediately after (i.e. in a more
2501 deeply nested position than) the position of the
2502 furthest block in that stack. */
2503 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2504 $fb_s_pos = array_search($furthest_block, $this->stack, true);
2505 unset($this->stack[$fe_s_pos]);
2507 $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2508 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2509 $this->stack = array_merge($s_part1, array($clone), $s_part2);
2511 /* 14. Jump back to step 1 in this series of steps. */
2512 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2514 break;
2516 /* An end tag token whose tag name is one of: "button",
2517 "marquee", "object" */
2518 case 'button': case 'marquee': case 'object':
2519 /* If the stack of open elements has an element in scope whose
2520 tag name matches the tag name of the token, then generate implied
2521 tags. */
2522 if($this->elementInScope($token['name'])) {
2523 $this->generateImpliedEndTags();
2525 /* Now, if the current node is not an element with the same
2526 tag name as the token, then this is a parse error. */
2527 // k
2529 /* Now, if the stack of open elements has an element in scope
2530 whose tag name matches the tag name of the token, then pop
2531 elements from the stack until that element has been popped from
2532 the stack, and clear the list of active formatting elements up
2533 to the last marker. */
2534 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2535 if($this->stack[$n]->nodeName === $token['name']) {
2536 $n = -1;
2539 array_pop($this->stack);
2542 $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2544 for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2545 array_pop($this->a_formatting);
2548 break;
2550 /* Or an end tag whose tag name is one of: "area", "basefont",
2551 "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2552 "input", "isindex", "noembed", "noframes", "param", "select",
2553 "spacer", "table", "textarea", "wbr" */
2554 case 'area': case 'basefont': case 'bgsound': case 'br':
2555 case 'embed': case 'hr': case 'iframe': case 'image':
2556 case 'img': case 'input': case 'isindex': case 'noembed':
2557 case 'noframes': case 'param': case 'select': case 'spacer':
2558 case 'table': case 'textarea': case 'wbr':
2559 // Parse error. Ignore the token.
2560 break;
2562 /* An end tag token not covered by the previous entries */
2563 default:
2564 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2565 /* Initialise node to be the current node (the bottommost
2566 node of the stack). */
2567 $node = end($this->stack);
2569 /* If node has the same tag name as the end tag token,
2570 then: */
2571 if($token['name'] === $node->nodeName) {
2572 /* Generate implied end tags. */
2573 $this->generateImpliedEndTags();
2575 /* If the tag name of the end tag token does not
2576 match the tag name of the current node, this is a
2577 parse error. */
2578 // k
2580 /* Pop all the nodes from the current node up to
2581 node, including node, then stop this algorithm. */
2582 for($x = count($this->stack) - $n; $x >= $n; $x--) {
2583 array_pop($this->stack);
2586 } else {
2587 $category = $this->getElementCategory($node);
2589 if($category !== self::SPECIAL && $category !== self::SCOPING) {
2590 /* Otherwise, if node is in neither the formatting
2591 category nor the phrasing category, then this is a
2592 parse error. Stop this algorithm. The end tag token
2593 is ignored. */
2594 return false;
2598 break;
2600 break;
/**
 * Processes one token while the tree builder is in the "in table"
 * insertion mode.
 *
 * Whitespace-only character tokens and comments are appended to the
 * current node (the last entry of $this->stack). Table-structure start
 * tags open the matching element and switch $this->mode; a "col" start
 * tag first implies a "colgroup", and "td"/"th"/"tr" first imply a
 * "tbody". Every other token is handed to inBody(), after a foster
 * parent has been selected when the current node is a table, tbody,
 * tfoot, thead or tr element.
 *
 * @param array $token Token to process. 'type' is always read; 'data'
 *                     is read for character and comment tokens and
 *                     'name' for start and end tags.
 * @return mixed False when a "table" end tag (real or implied) is
 *               ignored because no table element is in table scope;
 *               whatever inTableBody() or mainPhase() returns when the
 *               token is reprocessed there; null otherwise.
 */
private function inTable($token)
{
    $clear = array('html', 'table');

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'colgroup') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'col') {
        /* Act as if a start tag token with the tag name "colgroup"
        had been seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'colgroup',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('tbody', 'tfoot', 'thead'), true)) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    in_array($token['name'], array('td', 'th', 'tr'), true)) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'tbody',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'table') {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $ignored = $this->inTable(array(
            'name' => 'table',
            'type' => HTML5::ENDTAG
        )) === false;

        /* The end tag branch below returns false only when it ignores the
        token (no table element in table scope). In that case the insertion
        mode has not changed, so the start tag must be dropped as well
        instead of being handed back to mainPhase(). */
        if($ignored) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            return false;

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a table element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a table element has been
            popped from the stack. */
            while(true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($current === 'table') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
    'tfoot', 'th', 'thead', 'tr'), true)) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if(in_array(end($this->stack)->nodeName,
        array('table', 'tbody', 'tfoot', 'thead', 'tr'), true)) {
            /* Find the last table element in the stack of open elements.
            When the loop breaks, $n is the index of that element. */
            for($n = count($this->stack) - 1; $n >= 0; $n--) {
                if($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if(!isset($table)) {
                /* No table element in the stack (innerHTML case): the
                foster parent is the first element in the stack of open
                elements (the html element). */
                $this->foster_parent = $this->stack[0];

            } elseif($table->parentNode !== null &&
            $table->parentNode->nodeType === XML_ELEMENT_NODE) {
                /* The foster parent is the parent element of the last
                table element, provided that parent really is an element. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* The table has no parent, or its parent node is not an
                element: the foster parent is the element before the last
                table element in the stack of open elements. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        $this->inBody($token);
    }
}
/**
 * Processes one token while the tree builder is in the "in caption"
 * insertion mode.
 *
 * A "caption" end tag closes the caption and switches $this->mode back
 * to "in table". Table-structure start tags and a "table" end tag first
 * imply a "caption" end tag and are then reprocessed by inTable(). Stray
 * table-related end tags are ignored, and every other token is handled
 * by inBody().
 *
 * @param array $token Token to process. 'type' is always read; 'name'
 *                     is read for start and end tags.
 * @return mixed Whatever inTable() returns when the token is reprocessed
 *               there; null otherwise.
 */
private function inCaption($token)
{
    /* An end tag whose tag name is "caption" */
    if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) || ($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        // NOTE(review): the token is reprocessed even when the implied
        // end tag was ignored; the "wasn't ignored" condition is not checked.
        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'), true)) {
        // Parse error. Ignore the token. "td" belongs in this list: without
        // it a stray </td> would be processed by inBody() instead.

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
/**
 * Processes one token while the tree builder is in the "in column group"
 * insertion mode.
 *
 * Whitespace-only character tokens and comments are appended to the
 * current node (the last entry of $this->stack). A "col" start tag
 * inserts the element and pops it again at once. A "colgroup" end tag
 * pops the current node and switches $this->mode to "in table", unless
 * the current node is the html element. A "col" end tag is ignored. Any
 * other token implies a "colgroup" end tag and is then passed to
 * inTable().
 *
 * @param array $token Token to process. 'type' is always read; 'data'
 *                     is read for character and comment tokens and
 *                     'name' for start and end tags.
 * @return mixed Whatever inTable() returns for the "anything else" case;
 *               null otherwise.
 */
private function inColumnGroup($token)
{
    $type = $token['type'];

    /* Character token consisting solely of U+0009, U+000A, U+000B,
    U+000C or U+0020: append it to the current node. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* Comment token: append a Comment node carrying the token's data to
    the current node. */
    if($type === HTML5::COMMENT) {
        $commentNode = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($commentNode);
        return;
    }

    $isStartTag = $type === HTML5::STARTTAG;
    $isEndTag = $type === HTML5::ENDTAG;

    /* Only tag tokens are asked for a name; other token types are never
    indexed by 'name'. */
    $tagName = ($isStartTag || $isEndTag) ? $token['name'] : null;

    /* Start tag "col": insert a col element, then immediately pop it off
    the stack of open elements. */
    if($isStartTag && $tagName === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    /* End tag "colgroup" */
    if($isEndTag && $tagName === 'colgroup') {
        /* When the current node is the root html element this is a parse
        error and the token is ignored (innerHTML case). Otherwise pop the
        current node (which will be a colgroup element) and switch the
        insertion mode to "in table". */
        if(end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

        return;
    }

    /* End tag "col": parse error, ignore the token. */
    if($isEndTag && $tagName === 'col') {
        return;
    }

    /* Anything else: act as if an end tag with the tag name "colgroup"
    had been seen, then reprocess the current token in "in table". */
    $impliedEndTag = array(
        'name' => 'colgroup',
        'type' => HTML5::ENDTAG
    );
    $this->inColumnGroup($impliedEndTag);

    return $this->inTable($token);
}
/**
 * Processes a token while the insertion mode is "in table body".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start and end tags.
 * @return mixed Whatever the handler the token was reprocessed by returns,
 *               or null when the token was handled (or ignored) here.
 */
private function inTableBody($token)
{
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(array(
            'name' => 'tr',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if($this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(array(
                'name' => end($this->stack)->nodeName,
                'type' => HTML5::ENDTAG
            ));

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
/**
 * Processes a token while the insertion mode is "in row".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start and end tags.
 * @return mixed Whatever inTableBody() returns when the token is
 *               reprocessed after an implied </tr>, otherwise null.
 */
private function inRow($token)
{
    $clear = array('tr', 'html');

    /* A start tag whose tag name is one of: "th", "td" */
    if($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if($this->elementInScope($token['name'], true)) {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The
        implied </tr> is ignored exactly when no tr is in table scope, so
        that is checked up front. */
        if($this->elementInScope('tr', true)) {
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            /* The row is closed and the mode is now "in table body", so the
            token must be reprocessed by the table body rules; the cell
            rules would drop it. */
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. The token is likewise ignored when there is no
        tr in table scope to close. */
        if($this->elementInScope($token['name'], true) &&
        $this->elementInScope('tr', true)) {
            /* Otherwise, act as if an end tag with the tag name "tr" had
            been seen, then reprocess the current token. */
            $this->inRow(array(
                'name' => 'tr',
                'type' => HTML5::ENDTAG
            ));

            /* Reprocessing through the cell rules would bounce straight
            back here and never terminate; use the table body rules. */
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
/**
 * Processes a token while the insertion mode is "in cell".
 *
 * @param array $token Token array; 'type' is always read, 'name' is read
 *                     for start and end tags.
 * @return mixed Whatever inRow() returns when the cell was closed and the
 *               token reprocessed, otherwise null.
 */
private function inCell($token)
{
    /* An end tag whose tag name is one of: "td", "th" */
    if($token['type'] === HTML5::ENDTAG &&
    ($token['name'] === 'td' || $token['name'] === 'th')) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if($this->elementInScope($token['name'], true)) {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* Now, if the current node is not an element with the same tag
            name as the token, then this is a parse error (not reported). */

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === $token['name']) {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if($this->elementInScope(array('td', 'th'), true)) {
            /* Otherwise, close the cell and reprocess the current token. */
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html'))) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if($this->elementInScope($token['name'], true)) {
            /* Otherwise, close the cell and reprocess the current token. */
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
/**
 * Processes a token while the insertion mode is "in select".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function inSelect($token)
{
    /* Synthetic end tags reused by several branches below. */
    $end_option = array('name' => 'option', 'type' => HTML5::ENDTAG);
    $end_select = array('name' => 'select', 'type' => HTML5::ENDTAG);

    /* A character token */
    if($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect($end_option);
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect($end_option);
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if(end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'optgroup',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        if($elements_in_stack >= 2 &&
        $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
        $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
            $this->inSelect($end_option);
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. The current node is re-read here because the
        step above may have popped an option element. */
        if(end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if(end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'select') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if($this->elementInScope($token['name'], true)) {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            while(true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($current === 'select') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect($end_select);

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
        /* Parse error. */

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            $this->inSelect($end_select);

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes a token while the insertion mode is "after body".
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for end tags.
 * @return mixed Result of inBody() when the token falls through to the
 *               "anything else" rule, otherwise null.
 */
private function afterBody($token)
{
    $type = $token['type'];

    /* Whitespace-only character data (tab, LF, VT, FF, space) is handled
    exactly as it would be in the "in body" mode. */
    if($type === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* Comments are attached to the first element on the stack of open
    elements (the html element). */
    if($type === HTML5::COMMENT) {
        $this->stack[0]->appendChild(
            $this->dom->createComment($token['data'])
        );
        return;
    }

    /* </html> moves the parser into the trailing end phase. (The innerHTML
    case, where this would be a parse error, is not distinguished.) */
    if($type === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else: parse error. Go back to "in body" and reprocess. */
    $this->mode = self::IN_BODY;
    return $this->inBody($token);
}
/**
 * Processes a token while the insertion mode is "in frameset".
 *
 * The token type is always tested before the tag name so that tokens
 * without a 'name' key are never indexed by it.
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function inFrameset($token)
{
    /* A character token consisting only of whitespace */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName !== 'html') {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the current node is no longer a frameset element, change
            the insertion mode to "after frameset". When an outer frameset
            is still open the mode must stay "in frameset" so its remaining
            children are still accepted. */
            if(end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes a token while the insertion mode is "after frameset".
 *
 * The token type is always tested before the tag name so that tokens
 * without a 'name' key are never indexed by it.
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens, 'name' for tags.
 * @return void
 */
private function afterFrameset($token)
{
    /* A character token consisting only of whitespace */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes a token emitted by the tokeniser after the main phase ended.
 *
 * @param array $token Token array; 'type' is always read, 'data' is read
 *                     for character and comment tokens.
 * @return mixed Result of mainPhase() when the parser switches back to the
 *               main phase to reprocess the token, otherwise null.
 */
private function trailingEndPhase($token)
{
    /* A DOCTYPE token */
    if($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A character token consisting only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    } elseif($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is NOT whitespace-only. Or a start tag token.
    Or an end tag token. The character test must be negated here: the
    whitespace-only case was consumed by the branch above. */
    } elseif(($token['type'] === HTML5::CHARACTR &&
    !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
    $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif($token['type'] === HTML5::EOF) {
        /* Nothing left to do. */
    }
}
/**
 * Creates an element for a tag token, attaches it to the tree and pushes
 * it onto the stack of open elements.
 *
 * @param array $token  Tag token; 'name' is the element name and 'attr' a
 *                      list of arrays with 'name' and 'value' keys.
 * @param bool  $append Not used by this method.
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true)
{
    $element = $this->dom->createElement($token['name']);

    foreach($token['attr'] as $attribute) {
        /* When an attribute name repeats, the first occurrence wins. */
        if($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    array_push($this->stack, $element);

    return $element;
}
/**
 * Creates a text node holding $data and attaches it via
 * appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 * @return void
 */
private function insertText($data)
{
    $this->appendToRealParent($this->dom->createTextNode($data));
}
/**
 * Creates a comment node holding $data and attaches it via
 * appendToRealParent().
 *
 * @param string $data Comment text.
 * @return void
 */
private function insertComment($data)
{
    $this->appendToRealParent($this->dom->createComment($data));
}
/**
 * Attaches a node to the tree, honouring a pending foster parent.
 *
 * Without a foster parent the node becomes a child of the current node.
 * With one, the node is placed relative to the foster parent and the
 * foster parent is then cleared.
 *
 * @param DOMNode $node Node to attach.
 * @return void
 */
private function appendToRealParent($node)
{
    if($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* Find the last table element on the stack of open elements that is
    attached to a parent. */
    $table = null;
    $index = count($this->stack);

    while(--$index >= 0) {
        $candidate = $this->stack[$index];

        if($candidate->nodeName === 'table' && $candidate->parentNode !== null) {
            $table = $candidate;
            break;
        }
    }

    /* If the foster parent is the parent of that table, the node goes
    immediately before the table; otherwise it is appended to the foster
    parent. */
    if($table !== null && $this->foster_parent->isSameNode($table->parentNode)) {
        $this->foster_parent->insertBefore($node, $table);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
/**
 * Tells whether the stack of open elements has an element in scope.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names,
 *                            in which case any one of them being in scope
 *                            is enough.
 * @param bool         $table True for the "has an element in table scope"
 *                            variant, false for the plain "in scope" one.
 * @return bool True when a matching element is in scope.
 */
private function elementInScope($el, $table = false)
{
    if(is_array($el)) {
        foreach($el as $element) {
            if($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    /* 1. Initialise node to be the current node (the bottommost node of
    the stack), then walk towards the top of the stack. */
    for($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        if($node->tagName === $el) {
            /* 2. If node is the target node, terminate in a match state. */
            return true;

        } elseif($node->tagName === 'table') {
            /* 3. Otherwise, if node is a table element, terminate in a
            failure state. */
            return false;

        } elseif($table === false && in_array($node->tagName, array('caption',
        'td', 'th', 'button', 'marquee', 'object'))) {
            /* 4. Otherwise, if the algorithm is the "has an element in scope"
            variant (rather than the "has an element in table scope" variant),
            and node is one of the following, terminate in a failure state. */
            return false;

        } elseif($node === $node->ownerDocument->documentElement) {
            /* 5. Otherwise, if node is an html element (root element),
            terminate in a failure state. */
            return false;
        }

        /* Otherwise, set node to the previous entry in the stack of open
        elements and return to step 2. */
    }

    /* Stack exhausted (or empty) without a match. */
    return false;
}
/**
 * Reconstructs the active formatting elements.
 *
 * Every entry after the last marker / still-open element in the list of
 * active formatting elements is shallow-cloned, appended to the current
 * node, pushed onto the stack of open elements, and the list entry is
 * replaced by the clone.
 *
 * Assumes the list of active formatting elements is indexed 0..n-1
 * without gaps.
 *
 * @return false|null False when there is nothing to reconstruct, null
 *                    after a reconstruction.
 */
private function reconstructActiveFormattingElements()
{
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if($formatting_elements === 0) {
        return false;
    }

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    $entry = end($this->a_formatting);

    if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* 3-6. Rewind: move $a towards the start of the list for as long as
    the entry before it is neither a marker nor an open element. $a ends
    up on the first entry that has to be reconstructed, never on the
    marker / open element itself. */
    $a = $formatting_elements - 1;

    while($a > 0) {
        $previous = $this->a_formatting[$a - 1];

        if($previous === self::MARKER || in_array($previous, $this->stack, true)) {
            break;
        }

        $a--;
    }

    /* 7-11. Advance: reconstruct every entry from $a to the end of the
    list. */
    for(; $a < $formatting_elements; $a++) {
        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $this->a_formatting[$a]->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;
    }
}
/**
 * Clears the list of active formatting elements up to the last marker.
 *
 * Entries are removed from the end of the list until a marker has been
 * removed. If the list holds no marker it is emptied and the method
 * returns instead of looping forever.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker()
{
    while(count($this->a_formatting) > 0) {
        /* 1-2. Take the last (most recently added) entry off the list. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if($entry === self::MARKER) {
            break;
        }
    }
}
/**
 * Generates implied end tags.
 *
 * While the current node is a dd, dt, li, p, td, th or tr element that is
 * not listed in $exclude, it is popped from the stack of open elements.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 * @return void
 */
private function generateImpliedEndTags(array $exclude = array())
{
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    /* The count() guard keeps end() from being dereferenced on an empty
    stack. */
    while(count($this->stack) > 0 &&
    in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
/**
 * Classifies a tag name as special, scoping, formatting or phrasing.
 *
 * The special, scoping and formatting lists are consulted in that order;
 * a name found in none of them is phrasing.
 *
 * @param string $name Tag name to classify.
 * @return mixed One of self::SPECIAL, self::SCOPING, self::FORMATTING or
 *               self::PHRASING.
 */
private function getElementCategory($name)
{
    if(in_array($name, $this->special)) {
        return self::SPECIAL;
    }

    if(in_array($name, $this->scoping)) {
        return self::SCOPING;
    }

    if(in_array($name, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
/**
 * Clears the stack of open elements back to a table context.
 *
 * Elements are popped until the current node's name is one of $elements.
 *
 * @param array $elements Tag names at which popping stops.
 * @return void
 */
private function clearStackToTableContext($elements)
{
    while(!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
/**
 * Resets the insertion mode appropriately.
 *
 * Walks the stack of open elements from the current node upwards and sets
 * $this->mode according to the first node that determines a mode.
 *
 * @return void
 */
private function resetInsertionMode()
{
    /* Node names that map directly onto an insertion mode (steps 4-13).
    Note that "head" deliberately maps to "in body", not "in head". */
    $modes = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME
    );

    /* 1. Let last be false. */
    $last = false;

    /* 2. Let node be the last node in the stack of open elements, then
    move one entry up per iteration. */
    for($n = count($this->stack) - 1; $n >= 0; $n--) {
        $node = $this->stack[$n];

        /* 3. If node is the first node in the stack of open elements, then
        set last to true. */
        if($this->stack[0]->isSameNode($node)) {
            $last = true;
        }

        $name = $node->nodeName;

        /* 4-13. Names with a fixed insertion mode. */
        if(array_key_exists($name, $modes)) {
            $this->mode = $modes[$name];
            return;
        }

        /* 14. If node is an html element, then: if the head element
        pointer is null, switch the insertion mode to "before head",
        otherwise, switch the insertion mode to "after head". */
        if($name === 'html') {
            if($this->head_pointer === null) {
                $this->mode = self::BEFOR_HEAD;
            } else {
                $this->mode = self::AFTER_HEAD;
            }

            return;
        }

        /* 15. If last is true, then set the insertion mode to "in body"
        and abort these steps. (innerHTML case) */
        if($last) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
/**
 * Closes the currently open table cell.
 *
 * If a td element is in table scope, acts as if a </td> end tag had been
 * seen; otherwise, if a th element is in table scope, acts as if a </th>
 * end tag had been seen. Does nothing when neither is in scope.
 *
 * @return void
 */
private function closeCell()
{
    if($this->elementInScope('td', true)) {
        $cell = 'td';
    } elseif($this->elementInScope('th', true)) {
        $cell = 'th';
    } else {
        return;
    }

    $this->inCell(array(
        'name' => $cell,
        'type' => HTML5::ENDTAG
    ));
}
/**
 * Returns the DOM object held in $this->dom.
 *
 * @return mixed The value of $this->dom.
 */
public function save()
{
    return $this->dom;
}