"MDL-12304, fix double text"
[moodle-linuxchix.git] / lib / htmlpurifier / HTMLPurifier / Lexer / PH5P.php
blobb6762379141b156905ad8c2bba70f677051edbc2
1 <?php
3 require_once 'HTMLPurifier/Lexer/DOMLex.php';
5 /**
6 * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
7 * Requires PHP5, and occupies space in the HTML5 pseudo-namespace (may
8 * cause conflicts, sorry).
9 */
class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex {

    /**
     * Tokenizes an HTML fragment by parsing it with the bundled HTML5
     * parser and walking the resulting DOM.
     *
     * The input is normalized and wrapped by the parent lexer helpers
     * before parsing; only the first <div> inside <html><body> of the
     * parsed document is converted to tokens.
     *
     * @param string $html    HTML to tokenize
     * @param mixed  $config  passed through to normalize()/wrapHTML()
     * @param mixed  $context passed through to normalize()/wrapHTML()
     * @return array tokens collected by tokenizeDOM()
     */
    public function tokenizeHTML($html, $config, &$context) {
        $normalized = $this->normalize($html, $config, $context);
        $wrapped    = $this->wrapHTML($normalized, $config, $context);

        $html5    = new HTML5($wrapped);
        $document = $html5->save();

        // Drill down to the wrapper element: <html> -> <body> -> <div>.
        $htmlNode = $document->getElementsByTagName('html')->item(0);
        $bodyNode = $htmlNode->getElementsByTagName('body')->item(0);
        $divNode  = $bodyNode->getElementsByTagName('div')->item(0);

        $tokens = array();
        $this->tokenizeDOM($divNode, $tokens);
        return $tokens;
    }
}
31 Copyright 2007 Jeroen van der Meer <http://jero.net/>
33 Permission is hereby granted, free of charge, to any person obtaining a
34 copy of this software and associated documentation files (the
35 "Software"), to deal in the Software without restriction, including
36 without limitation the rights to use, copy, modify, merge, publish,
37 distribute, sublicense, and/or sell copies of the Software, and to
38 permit persons to whom the Software is furnished to do so, subject to
39 the following conditions:
41 The above copyright notice and this permission notice shall be included
42 in all copies or substantial portions of the Software.
44 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
45 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
46 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
47 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
48 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
49 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
50 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
54 class HTML5 {
/** Input being tokenized, as stored by the constructor. */
private $data;

/** Index into $data of the character currently being processed (-1 before the first read). */
private $char;

/** Length of $data; a cursor equal to this value means end of input. */
private $EOF;

/** Name of the current tokenizer state; the method "<state>State" is run next, null stops the loop. */
private $state;

/** HTML5TreeConstructer instance created by the constructor. */
private $tree;

/** Token currently being assembled (tag, comment or doctype) before it is emitted. */
private $token;

/** Current content model flag: one of PCDATA, RCDATA, CDATA or PLAINTEXT. */
private $content_model;

/** Escape flag used by the data state while in RCDATA/CDATA content. */
private $escape = false;
63 private $entities = array('AElig;','AElig','AMP;','AMP','Aacute;','Aacute',
64 'Acirc;','Acirc','Agrave;','Agrave','Alpha;','Aring;','Aring','Atilde;',
65 'Atilde','Auml;','Auml','Beta;','COPY;','COPY','Ccedil;','Ccedil','Chi;',
66 'Dagger;','Delta;','ETH;','ETH','Eacute;','Eacute','Ecirc;','Ecirc','Egrave;',
67 'Egrave','Epsilon;','Eta;','Euml;','Euml','GT;','GT','Gamma;','Iacute;',
68 'Iacute','Icirc;','Icirc','Igrave;','Igrave','Iota;','Iuml;','Iuml','Kappa;',
69 'LT;','LT','Lambda;','Mu;','Ntilde;','Ntilde','Nu;','OElig;','Oacute;',
70 'Oacute','Ocirc;','Ocirc','Ograve;','Ograve','Omega;','Omicron;','Oslash;',
71 'Oslash','Otilde;','Otilde','Ouml;','Ouml','Phi;','Pi;','Prime;','Psi;',
72 'QUOT;','QUOT','REG;','REG','Rho;','Scaron;','Sigma;','THORN;','THORN',
73 'TRADE;','Tau;','Theta;','Uacute;','Uacute','Ucirc;','Ucirc','Ugrave;',
74 'Ugrave','Upsilon;','Uuml;','Uuml','Xi;','Yacute;','Yacute','Yuml;','Zeta;',
75 'aacute;','aacute','acirc;','acirc','acute;','acute','aelig;','aelig',
76 'agrave;','agrave','alefsym;','alpha;','amp;','amp','and;','ang;','apos;',
77 'aring;','aring','asymp;','atilde;','atilde','auml;','auml','bdquo;','beta;',
78 'brvbar;','brvbar','bull;','cap;','ccedil;','ccedil','cedil;','cedil',
79 'cent;','cent','chi;','circ;','clubs;','cong;','copy;','copy','crarr;',
80 'cup;','curren;','curren','dArr;','dagger;','darr;','deg;','deg','delta;',
81 'diams;','divide;','divide','eacute;','eacute','ecirc;','ecirc','egrave;',
82 'egrave','empty;','emsp;','ensp;','epsilon;','equiv;','eta;','eth;','eth',
83 'euml;','euml','euro;','exist;','fnof;','forall;','frac12;','frac12',
84 'frac14;','frac14','frac34;','frac34','frasl;','gamma;','ge;','gt;','gt',
85 'hArr;','harr;','hearts;','hellip;','iacute;','iacute','icirc;','icirc',
86 'iexcl;','iexcl','igrave;','igrave','image;','infin;','int;','iota;',
87 'iquest;','iquest','isin;','iuml;','iuml','kappa;','lArr;','lambda;','lang;',
88 'laquo;','laquo','larr;','lceil;','ldquo;','le;','lfloor;','lowast;','loz;',
89 'lrm;','lsaquo;','lsquo;','lt;','lt','macr;','macr','mdash;','micro;','micro',
90 'middot;','middot','minus;','mu;','nabla;','nbsp;','nbsp','ndash;','ne;',
91 'ni;','not;','not','notin;','nsub;','ntilde;','ntilde','nu;','oacute;',
92 'oacute','ocirc;','ocirc','oelig;','ograve;','ograve','oline;','omega;',
93 'omicron;','oplus;','or;','ordf;','ordf','ordm;','ordm','oslash;','oslash',
94 'otilde;','otilde','otimes;','ouml;','ouml','para;','para','part;','permil;',
95 'perp;','phi;','pi;','piv;','plusmn;','plusmn','pound;','pound','prime;',
96 'prod;','prop;','psi;','quot;','quot','rArr;','radic;','rang;','raquo;',
97 'raquo','rarr;','rceil;','rdquo;','real;','reg;','reg','rfloor;','rho;',
98 'rlm;','rsaquo;','rsquo;','sbquo;','scaron;','sdot;','sect;','sect','shy;',
99 'shy','sigma;','sigmaf;','sim;','spades;','sub;','sube;','sum;','sup1;',
100 'sup1','sup2;','sup2','sup3;','sup3','sup;','supe;','szlig;','szlig','tau;',
101 'there4;','theta;','thetasym;','thinsp;','thorn;','thorn','tilde;','times;',
102 'times','trade;','uArr;','uacute;','uacute','uarr;','ucirc;','ucirc',
103 'ugrave;','ugrave','uml;','uml','upsih;','upsilon;','uuml;','uuml','weierp;',
104 'xi;','yacute;','yacute','yen;','yen','yuml;','yuml','zeta;','zwj;','zwnj;');
// Content model flags (stored in $content_model).
const PCDATA    = 0;
const RCDATA    = 1;
const CDATA     = 2;
const PLAINTEXT = 3;

// Token types (stored under the 'type' key of a token array).
const DOCTYPE  = 0;
const STARTTAG = 1;
const ENDTAG   = 2;
const COMMENT  = 3;
const CHARACTR = 4;
const EOF      = 5;
/**
 * Stores the input, resets the tokenizer and runs the state machine
 * until a state handler sets $this->state to null.
 *
 * @param string $data HTML source to parse
 */
public function __construct($data) {
    // Normalise newlines before tokenizing: CRLF pairs and lone CRs both
    // become a single LF. (The previous code assigned the lone-CR
    // replacement to a misspelled variable, so it never took effect.)
    $data = str_replace(array("\r\n", "\r"), "\n", $data);

    $this->data = $data;
    $this->char = -1;
    $this->EOF = strlen($data);
    $this->tree = new HTML5TreeConstructer;
    $this->content_model = self::PCDATA;

    $this->state = 'data';

    // Each state handler is a method named "<state>State"; it consumes
    // input and selects the next state.
    while($this->state !== null) {
        $this->{$this->state.'State'}();
    }
}
/**
 * Returns whatever the tree constructor's save() produces for the
 * parsed input.
 */
public function save() {
    $builder = $this->tree;
    return $builder->save();
}
/**
 * Returns the character at the current cursor position, or false once
 * the cursor has reached the end of the input.
 */
private function char() {
    if($this->char < $this->EOF) {
        return $this->data[$this->char];
    }

    return false;
}
/**
 * Looks at the input without moving the cursor.
 *
 * @param int $s index of the first character to read
 * @param int $l 0 to read a single character, otherwise the number of
 *               characters to read
 * @return string|null the requested character(s), or null when the
 *                     requested range is not entirely inside the input
 */
private function character($s, $l = 0) {
    // A negative start would silently address the buffer from its end.
    if($s < 0) {
        return null;
    }

    if($l === 0) {
        return ($s < $this->EOF) ? $this->data[$s] : null;
    }

    // The range covers indexes $s .. $s + $l - 1, so it is valid when it
    // ends exactly at the end of the input as well (<=, not <).
    if($s + $l <= $this->EOF) {
        return substr($this->data, $s, $l);
    }

    return null;
}
/**
 * Returns the longest run of characters, starting at $start, that all
 * belong to the given character class.
 *
 * @param string $char_class body of a PCRE character class, e.g.
 *                           'A-Za-z' or '^>'
 * @param int    $start      index in the input where the run must begin
 * @return string the matching run, or '' when the character at $start is
 *                not in the class (previously the whole remaining input
 *                was returned in that case because the pattern did not
 *                match and nothing was replaced)
 */
private function characters($char_class, $start) {
    $rest = (string) substr($this->data, $start);

    if(preg_match('#^['.$char_class.']+#', $rest, $match)) {
        return $match[0];
    }

    return '';
}
/**
 * Data state: consumes one character (or a run of plain characters) and
 * either emits character tokens or switches to the entity / tag open
 * states, depending on the content model and the escape flag.
 */
private function dataState() {
    // Consume the next input character
    $this->char++;
    $char = $this->char();

    $rcdata_or_cdata = ($this->content_model === self::RCDATA ||
        $this->content_model === self::CDATA);

    if($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
        /* U+0026 AMPERSAND (&)
        When the content model flag is set to one of the PCDATA or RCDATA
        states: switch to the entity data state. Otherwise: treat it as per
        the "anything else" entry below. */
        $this->state = 'entityData';

    } elseif($char === '-') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is false, and there are at
        least three characters before this one in the input stream, and the
        last four characters in the input stream, including this one, are
        "<!--", then set the escape flag to true.

        The window INCLUDES the current character, so it starts three
        characters back (not four). */
        if($rcdata_or_cdata && $this->escape === false &&
        $this->char >= 3 && substr($this->data, $this->char - 3, 4) === '<!--') {
            $this->escape = true;
        }

        /* In any case, emit the input character as a character token. Stay
        in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    /* U+003C LESS-THAN SIGN (<) */
    } elseif($char === '<' && ($this->content_model === self::PCDATA ||
    ($rcdata_or_cdata && $this->escape === false))) {
        /* When the content model flag is set to the PCDATA state, or to
        the RCDATA/CDATA state with the escape flag false: switch to the
        tag open state. Otherwise: treat it as per the "anything else"
        entry below. */
        $this->state = 'tagOpen';

    /* U+003E GREATER-THAN SIGN (>) */
    } elseif($char === '>') {
        /* If the content model flag is set to either the RCDATA state or
        the CDATA state, and the escape flag is true, and the last three
        characters in the input stream including this one are "-->", set
        the escape flag to false.

        The window ENDS at the current character, so it starts two
        characters back. */
        if($rcdata_or_cdata && $this->escape === true &&
        $this->char >= 2 && substr($this->data, $this->char - 2, 3) === '-->') {
            $this->escape = false;
        }

        /* In any case, emit the input character as a character token.
        Stay in the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

    } elseif($this->char === $this->EOF) {
        /* EOF
        Emit an end-of-file token. */
        $this->EOF();

    } elseif($this->content_model === self::PLAINTEXT) {
        /* When the content model flag is set to the PLAINTEXT state
        THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
        the text and emit it as a character token. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => substr($this->data, $this->char)
        ));

        $this->EOF();

    } else {
        /* Anything else
        THIS DIFFERS GREATLY FROM THE SPEC: Get as many characters that
        otherwise would also be treated as a character token and emit them
        as a single character token. Stay in the data state.

        The current character is always part of the run, so the scan for
        the next special character starts one position later. Scanning
        from the current position yields a zero-length run when the
        current character is itself '&' (CDATA) or '<' (escaped text),
        which left the cursor in place and never terminated.

        In RCDATA/CDATA the run also stops before '-' and '>' so that the
        escape flag checks above get to see those characters. */
        $stops = $rcdata_or_cdata ? '<&->' : '<&';
        $len = 1 + strcspn($this->data, $stops, $this->char + 1);
        $char = substr($this->data, $this->char, $len);
        $this->char += $len - 1;

        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => $char
        ));

        $this->state = 'data';
    }
}
/**
 * Entity data state: tries to consume an entity after '&' and emits the
 * result as a character token, then returns to the data state.
 */
private function entityDataState() {
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, emit a U+0026 AMPERSAND character token.
    // Otherwise, emit the character token that was returned.
    // "Nothing" is tested explicitly: a plain truthiness test would also
    // reject the string "0" (e.g. from "&#48;").
    // NOTE(review): assumes entity() signals failure with false/null/'' —
    // its return statements are not visible here; confirm.
    $nothing = ($entity === false || $entity === null || $entity === '');
    $char = $nothing ? '&' : $entity;

    $this->emitToken(array(
        'type' => self::CHARACTR,
        'data' => $char
    ));

    // Finally, switch to the data state.
    $this->state = 'data';
}
/**
 * Tag open state: decides what follows a '<' based on the content model.
 */
private function tagOpenState() {
    switch($this->content_model) {
        case self::RCDATA:
        case self::CDATA:
            /* Only "</" is meaningful here. Anything else means the '<'
            was literal text: emit it and let the data state process the
            next character. */
            $slash_follows = ($this->character($this->char + 1) === '/');

            if(!$slash_follows) {
                $this->emitToken(array(
                    'type' => self::CHARACTR,
                    'data' => '<'
                ));

                $this->state = 'data';
                break;
            }

            // Consume the solidus and continue in the close tag open state.
            $this->char++;
            $this->state = 'closeTagOpen';
        break;

        case self::PCDATA:
            // Consume the next input character:
            $this->char++;
            $char = $this->char();

            if($char === '!') {
                // "<!": markup declaration (comment, DOCTYPE, ...).
                $this->state = 'markupDeclarationOpen';
                break;
            }

            if($char === '/') {
                // "</": an end tag may follow.
                $this->state = 'closeTagOpen';
                break;
            }

            if(preg_match('/^[A-Za-z]$/', $char)) {
                /* A letter starts a new start tag token whose name begins
                with the lowercased letter. The token is not emitted yet;
                the tag name state fills in the rest. */
                $this->token = array(
                    'name' => strtolower($char),
                    'type' => self::STARTTAG,
                    'attr' => array()
                );

                $this->state = 'tagName';
                break;
            }

            if($char === '>') {
                // "<>" is a parse error: both characters are emitted as text.
                $this->emitToken(array(
                    'type' => self::CHARACTR,
                    'data' => '<>'
                ));

                $this->state = 'data';
                break;
            }

            if($char === '?') {
                // "<?" is a parse error handled as a bogus comment.
                $this->state = 'bogusComment';
                break;
            }

            /* Anything else is a parse error: emit the '<' as text and
            reconsume the current character in the data state. */
            $this->emitToken(array(
                'type' => self::CHARACTR,
                'data' => '<'
            ));

            $this->char--;
            $this->state = 'data';
        break;
    }
}
/**
 * Close tag open state: entered after "</". In RCDATA/CDATA content the
 * sequence only counts as an end tag when it names the element on top of
 * the tree constructor's stack; otherwise "</" is emitted as text.
 */
private function closeTagOpenState() {
    $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
    $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;

    $restricted = ($this->content_model === self::RCDATA ||
        $this->content_model === self::CDATA);

    /* In RCDATA/CDATA the "</" is literal text when the following letters
    do not match the open element's name, or when they match but are not
    followed by a tab, LF, VT, FF, space, '>' or '/'. */
    $literal = $restricted && (
        !$the_same ||
        !preg_match('/[\t\n\x0b\x0c >\/]/',
            $this->character($this->char + 1 + strlen($next_node))) ||
        $this->EOF === $this->char
    );

    if($literal) {
        // Parse error: emit "</" and go back to the data state.
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));

        $this->state = 'data';
        return;
    }

    /* PCDATA, or the name matched: consume the next input character. */
    $this->char++;
    $char = $this->char();

    if(preg_match('/^[A-Za-z]$/', $char)) {
        /* A letter starts a new end tag token named after the lowercased
        letter; the tag name state completes it before it is emitted. */
        $this->token = array(
            'name' => strtolower($char),
            'type' => self::ENDTAG
        );

        $this->state = 'tagName';
        return;
    }

    if($char === '>') {
        // "</>" is a parse error that produces nothing.
        $this->state = 'data';
        return;
    }

    if($this->char === $this->EOF) {
        /* Parse error at EOF: emit "</" as text and reconsume the EOF in
        the data state. */
        $this->emitToken(array(
            'type' => self::CHARACTR,
            'data' => '</'
        ));

        $this->char--;
        $this->state = 'data';
        return;
    }

    /* Any other character: parse error, treat as a bogus comment. */
    $this->state = 'bogusComment';
}
/**
 * Tag name state: accumulates the lowercased tag name of the current tag
 * token until whitespace, '/', '>' or the end of input is reached.
 */
private function tagNameState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Tab, LF, VT, FF or space ends the name; attributes may follow.
    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '>' completes the tag: emit it and return to the data state.
    if($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // End of input is a parse error: emit what we have and let the data
    // state reconsume the EOF.
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    // '/' (parse error unless it is a permitted slash) also ends the name.
    if($char === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // Anything else is part of the tag name; stay in this state.
    $this->token['name'] .= strtolower($char);
    $this->state = 'tagName';
}
/**
 * Before attribute name state: skips whitespace and slashes inside a tag
 * and starts a new attribute on the first other character.
 */
private function beforeAttributeNameState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Tab, LF, VT, FF and space are skipped; stay in this state.
    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '>' completes the tag: emit it and return to the data state.
    if($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // '/' is a parse error unless it is a permitted slash; either way it
    // is skipped and we stay in this state.
    if($char === '/') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // End of input is a parse error: emit the tag and let the data state
    // reconsume the EOF.
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else starts a new attribute whose name begins with the
    // lowercased character and which has no value yet.
    $this->token['attr'][] = array(
        'name' => strtolower($char),
        'value' => null
    );

    $this->state = 'attributeName';
}
/**
 * Attribute name state: accumulates the lowercased name of the attribute
 * most recently added to the current tag token.
 */
private function attributeNameState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the before
        attribute name state.

        The switch happens whether or not the slash is permitted (i.e.
        followed by '>'). Restricting this branch to the non-permitted
        case sent "/>" to the "anything else" branch, which appended the
        slash to the attribute name (<input disabled/> produced an
        attribute called "disabled/"). */
        $this->state = 'beforeAttributeName';

    } elseif($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's name.
        Stay in the attribute name state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['name'] .= strtolower($char);

        $this->state = 'attributeName';
    }
}
/**
 * After attribute name state: entered when whitespace follows an
 * attribute name; decides whether a value, another attribute or the end
 * of the tag comes next.
 */
private function afterAttributeNameState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the after attribute name state. */
        $this->state = 'afterAttributeName';

    } elseif($char === '=') {
        /* U+003D EQUALS SIGN (=)
        Switch to the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($char === '/') {
        /* U+002F SOLIDUS (/)
        Parse error unless this is a permitted slash. Switch to the
        before attribute name state.

        The switch happens whether or not the slash is permitted (i.e.
        followed by '>'). Restricting this branch to the non-permitted
        case sent "/>" to the "anything else" branch, which started a new
        attribute literally named "/" (e.g. for <input disabled />). */
        $this->state = 'beforeAttributeName';

    } elseif($this->char === $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token. Reconsume the EOF
        character in the data state. */
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';

    } else {
        /* Anything else
        Start a new attribute in the current tag token. Set that attribute's
        name to the current input character, and its value to the empty string.
        Switch to the attribute name state. */
        $this->token['attr'][] = array(
            'name' => strtolower($char),
            'value' => null
        );

        $this->state = 'attributeName';
    }
}
/**
 * Before attribute value state: entered after '='; skips whitespace and
 * selects the quoted or unquoted value state.
 */
private function beforeAttributeValueState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Stay in the before attribute value state. */
        $this->state = 'beforeAttributeValue';

    } elseif($char === '"') {
        /* U+0022 QUOTATION MARK (")
        Switch to the attribute value (double-quoted) state. */
        $this->state = 'attributeValueDoubleQuoted';

    } elseif($char === '&') {
        /* U+0026 AMPERSAND (&)
        Switch to the attribute value (unquoted) state and reconsume
        this input character. */
        $this->char--;
        $this->state = 'attributeValueUnquoted';

    } elseif($char === '\'') {
        /* U+0027 APOSTROPHE (')
        Switch to the attribute value (single-quoted) state. */
        $this->state = 'attributeValueSingleQuoted';

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token and reconsume the EOF in
        the data state. Without this branch the cursor moved past the end
        of the input, where no later "=== EOF" test can match. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Switch to the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
/**
 * Attribute value (double-quoted) state: collects characters into the
 * value of the most recently added attribute until the closing '"'.
 */
private function attributeValueDoubleQuotedState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Closing quote: the value is complete.
    if($char === '"') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '&' may start an entity; the helper appends the result to the
    // value and the state is left unchanged.
    if($char === '&') {
        $this->entityInAttributeValueState('double');
        return;
    }

    // End of input is a parse error: emit the tag and let the data state
    // reconsume the EOF.
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else belongs to the value; stay in this state.
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $char;

    $this->state = 'attributeValueDoubleQuoted';
}
/**
 * Attribute value (single-quoted) state: collects characters into the
 * value of the most recently added attribute until the closing "'".
 */
private function attributeValueSingleQuotedState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    // Closing apostrophe: the value is complete.
    if($char === '\'') {
        $this->state = 'beforeAttributeName';
        return;
    }

    // '&' may start an entity; the helper appends the result to the
    // value and the state is left unchanged.
    if($char === '&') {
        $this->entityInAttributeValueState('single');
        return;
    }

    // End of input is a parse error: emit the tag and let the data state
    // reconsume the EOF.
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);

        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else belongs to the value; stay in this state.
    $index = count($this->token['attr']) - 1;
    $this->token['attr'][$index]['value'] .= $char;

    $this->state = 'attributeValueSingleQuoted';
}
/**
 * Attribute value (unquoted) state: collects characters into the value of
 * the most recently added attribute until whitespace, '>' or the end of
 * input.
 */
private function attributeValueUnquotedState() {
    // Consume the next input character:
    $this->char++;
    $char = $this->character($this->char);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        /* U+0009 CHARACTER TABULATION
        U+000A LINE FEED (LF)
        U+000B LINE TABULATION
        U+000C FORM FEED (FF)
        U+0020 SPACE
        Switch to the before attribute name state. */
        $this->state = 'beforeAttributeName';

    } elseif($char === '&') {
        /* U+0026 AMPERSAND (&)
        Consume an entity into the attribute value; the state is left
        unchanged. */
        $this->entityInAttributeValueState();

    } elseif($char === '>') {
        /* U+003E GREATER-THAN SIGN (>)
        Emit the current tag token. Switch to the data state. */
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif($this->char >= $this->EOF) {
        /* EOF
        Parse error. Emit the current tag token and reconsume the EOF in
        the data state. Without this branch a truncated tag such as
        "<a href=x" kept appending nothing to the value and never left
        this state. ">=" also covers a cursor that already moved past the
        end of the input in the before attribute value state. */
        $this->emitToken($this->token);

        $this->char = $this->EOF - 1;
        $this->state = 'data';

    } else {
        /* Anything else
        Append the current input character to the current attribute's value.
        Stay in the attribute value (unquoted) state. */
        $last = count($this->token['attr']) - 1;
        $this->token['attr'][$last]['value'] .= $char;

        $this->state = 'attributeValueUnquoted';
    }
}
/**
 * Consumes an entity inside an attribute value and appends the result to
 * the value of the most recently added attribute. The tokenizer state is
 * not changed, so the calling attribute value state simply continues.
 *
 * Callers in this class pass a quote-type argument ('double'/'single');
 * it is not declared here and is ignored.
 */
private function entityInAttributeValueState() {
    // Attempt to consume an entity.
    $entity = $this->entity();

    // If nothing is returned, append a U+0026 AMPERSAND character to the
    // current attribute's value. Otherwise, append the character that was
    // returned. "Nothing" is tested explicitly: a plain truthiness test
    // would also reject the string "0" (e.g. from "&#48;").
    // NOTE(review): assumes entity() signals failure with false/null/'' —
    // its return statements are not visible here; confirm.
    $nothing = ($entity === false || $entity === null || $entity === '');
    $char = $nothing ? '&' : $entity;

    $last = count($this->token['attr']) - 1;
    $this->token['attr'][$last]['value'] .= $char;
}
/**
 * Bogus comment state: turns everything from the current character up to
 * (not including) the next '>' or the end of input into a comment token.
 */
private function bogusCommentState() {
    /* Consume every character up to the first U+003E GREATER-THAN SIGN
    character (>) or the end of the file (EOF), whichever comes first. Emit
    a comment token whose data is the concatenation of all the characters
    starting from and including the character that caused the state machine
    to switch into the bogus comment state, up to and including the last
    consumed character before the U+003E character, if any, or up to the
    end of the file otherwise.

    strcspn() yields 0 when the current character is already '>' (as in
    "<!>"), giving an empty comment. The regex helper used previously
    returned the whole remaining input in that case, swallowing the rest
    of the document into the comment. */
    $len = strcspn($this->data, '>', $this->char);
    $data = (string) substr($this->data, $this->char, $len);

    $this->emitToken(array(
        'data' => $data,
        'type' => self::COMMENT
    ));

    // The cursor now sits on the '>' (or at EOF); the data state's own
    // increment steps over it.
    $this->char += $len;

    /* Switch to the data state. */
    $this->state = 'data';

    /* If the end of the file was reached, reconsume the EOF character. */
    if($this->char >= $this->EOF) {
        $this->char = $this->EOF - 1;
    }
}
/**
 * Markup declaration open state: entered after "<!"; recognises the
 * start of a comment or of a DOCTYPE, anything else is a bogus comment.
 */
private function markupDeclarationOpenState() {
    $after = $this->char + 1;

    // "--" opens a comment: consume both hyphens and start an empty
    // comment token.
    if($this->character($after, 2) === '--') {
        $this->char += 2;
        $this->state = 'comment';
        $this->token = array(
            'data' => null,
            'type' => self::COMMENT
        );
        return;
    }

    // A case-insensitive "DOCTYPE" is consumed and handled by the
    // DOCTYPE state.
    if(strtolower($this->character($after, 7)) === 'doctype') {
        $this->char += 7;
        $this->state = 'doctype';
        return;
    }

    // Otherwise this is a parse error. The next character, if any,
    // becomes the first character of a bogus comment.
    $this->char++;
    $this->state = 'bogusComment';
}
/**
 * Comment state: appends characters to the comment token until a hyphen
 * or the end of input is seen.
 */
private function commentState() {
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    // A hyphen may be the start of the closing "-->".
    if($char === '-') {
        $this->state = 'commentDash';
        return;
    }

    // End of input is a parse error: emit the comment and let the data
    // state reconsume the EOF.
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // Anything else is comment text; the state does not change.
    $this->token['data'] .= $char;
}
/**
 * Comment dash state: entered after a single '-' inside a comment.
 */
private function commentDashState() {
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    // A second hyphen: the comment may be ending.
    if($char === '-') {
        $this->state = 'commentEnd';
        return;
    }

    // End of input is a parse error: emit the comment and let the data
    // state reconsume the EOF.
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // The earlier hyphen was ordinary text: add it together with the
    // current character and go back to the comment state.
    $this->token['data'] .= '-'.$char;
    $this->state = 'comment';
}
/**
 * Comment end state: entered after "--" inside a comment.
 */
private function commentEndState() {
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    // "-->" closes the comment: emit it and return to the data state.
    if($char === '>') {
        $this->emitToken($this->token);
        $this->state = 'data';
        return;
    }

    // One more hyphen: keep one as text and stay in this state.
    if($char === '-') {
        $this->token['data'] .= '-';
        return;
    }

    // End of input: emit the comment and let the data state reconsume
    // the EOF.
    if($this->char === $this->EOF) {
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';
        return;
    }

    // The two hyphens were ordinary text: add them together with the
    // current character and go back to the comment state.
    $this->token['data'] .= '--'.$char;
    $this->state = 'comment';
}
/**
 * DOCTYPE state: entered after "<!DOCTYPE". Consumes one whitespace
 * character if present, then moves on to the before DOCTYPE name state.
 */
private function doctypeState() {
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    // A non-whitespace character is put back so that the next state
    // reconsumes it.
    if(!preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        $this->char--;
    }

    $this->state = 'beforeDoctypeName';
}
/**
 * Before DOCTYPE name state: skips whitespace, then starts a DOCTYPE
 * token on the first name character (or emits a nameless one on '>' or
 * end of input). New tokens start with the error flag set.
 */
private function beforeDoctypeNameState() {
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    // Whitespace: stay in the before DOCTYPE name state.
    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        return;
    }

    $is_lower = preg_match('/^[a-z]$/', $char);
    $closes = ($char === '>');

    // '>' or end of input before any name: emit a nameless DOCTYPE
    // token. At end of input the EOF is reconsumed by the data state.
    if(!$is_lower && ($closes || $this->char === $this->EOF)) {
        $this->emitToken(array(
            'name' => null,
            'type' => self::DOCTYPE,
            'error' => true
        ));

        if(!$closes) {
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // First character of the name; lowercase ASCII letters are
    // uppercased, every other character is stored as is.
    $this->token = array(
        'name' => $is_lower ? strtoupper($char) : $char,
        'type' => self::DOCTYPE,
        'error' => true
    );

    $this->state = 'doctypeName';
}
/**
 * DOCTYPE name state: accumulates the DOCTYPE name and, after every
 * character, refreshes the token's error flag (false only while the
 * name is exactly "HTML").
 */
private function doctypeNameState() {
    /* Consume the next input character: */
    $this->char++;
    $char = $this->char();

    $at_eof = ($this->char === $this->EOF);

    if(preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
        // Whitespace ends the name.
        $this->state = 'AfterDoctypeName';

    } elseif($char === '>') {
        // '>' completes the DOCTYPE: emit it and return to the data state.
        $this->emitToken($this->token);
        $this->state = 'data';

    } elseif(preg_match('/^[a-z]$/', $char)) {
        // Lowercase ASCII letters are stored uppercased.
        $this->token['name'] .= strtoupper($char);

    } elseif($at_eof) {
        // End of input: emit the token and let the data state reconsume
        // the EOF.
        $this->emitToken($this->token);
        $this->char--;
        $this->state = 'data';

    } else {
        // Any other character is appended unchanged.
        $this->token['name'] .= $char;
    }

    $this->token['error'] = ($this->token['name'] !== 'HTML');
}
/**
 * After-DOCTYPE-name state.
 *
 * Skips trailing whitespace after the name. '>' or end of input emits the
 * current DOCTYPE token; any other character marks the token as erroneous
 * and switches to the bogus DOCTYPE state.
 */
private function afterDoctypeNameState()
{
    // Consume the next input character.
    $this->char++;
    $next = $this->char();

    if (preg_match('/^[\t\n\x0b\x0c ]$/', $next)) {
        // Whitespace: stay in the after DOCTYPE name state.
        return;
    }

    if ($next === '>' || $this->char === $this->EOF) {
        $this->emitToken($this->token);

        // Reached via end of input rather than '>': step back so the data
        // state sees the end of input again.
        if ($next !== '>') {
            $this->char--;
        }

        $this->state = 'data';
        return;
    }

    // Unexpected content after the name.
    $this->token['error'] = true;
    $this->state = 'bogusDoctype';
}
/**
 * Bogus-DOCTYPE state.
 *
 * Discards characters until '>' or end of input, then emits the current
 * DOCTYPE token and returns to the data state.
 */
private function bogusDoctypeState()
{
    // Consume the next input character.
    $this->char++;
    $closed = ($this->char() === '>');

    if (!$closed && $this->char !== $this->EOF) {
        // Neither terminator seen: stay in the bogus DOCTYPE state.
        return;
    }

    $this->emitToken($this->token);

    // Terminated by end of input: step back so the data state sees it again.
    if (!$closed) {
        $this->char--;
    }

    $this->state = 'data';
}
/**
 * Tries to consume an entity starting at the current position, which is
 * expected to be the U+0026 AMPERSAND. Used for entities in text and in
 * attribute values.
 *
 * @return string|false The decoded replacement text, or false when nothing
 *                      matched; in that case the input position is restored
 *                      to where it was on entry.
 */
private function entity()
{
    $start = $this->char;

    // The behaviour depends on the character immediately after the '&'.
    switch ($this->character($this->char + 1)) {
        // U+0023 NUMBER SIGN (#): numeric character reference.
        case '#':
            // The behaviour further depends on the character AFTER the '#',
            // i.e. two positions past the '&'. (Testing position +1 here
            // would re-read the '#' itself and the hexadecimal branch could
            // never be taken.)
            switch ($this->character($this->char + 2)) {
                // U+0078 / U+0058: hexadecimal reference (0-9, A-F, a-f).
                case 'x':
                case 'X':
                    $char = 1; // one extra prefix character to skip
                    $char_class = '0-9A-Fa-f';
                    break;

                // Anything else: decimal reference (0-9 only).
                default:
                    $char = 0;
                    $char_class = '0-9';
                    break;
            }

            // Consume as many characters as match the chosen range,
            // starting after "#" (decimal) or "#x" (hexadecimal).
            $this->char++;
            $e_name = $this->characters($char_class, $this->char + $char + 1);
            // NOTE(review): the slice taken here depends on character()'s
            // (start, length) semantics, defined elsewhere in this class;
            // confirm it yields the reference text expected by the
            // html_entity_decode() call at the end.
            $entity = $this->character($start, $this->char);
            $cond = strlen($e_name) > 0;
            break;

        // Anything else: named entity.
        default:
            // Take the run of name characters, then find the shortest
            // prefix that case-sensitively matches a known entity name.
            $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
            $len = strlen($e_name);

            for ($c = 1; $c <= $len; $c++) {
                $id = substr($e_name, 0, $c);
                $this->char++;

                if (in_array($id, $this->entities)) {
                    if ($e_name[$c - 1] !== ';') {
                        if ($c < $len && $e_name[$c] === ';') {
                            $this->char++; // consume extra semicolon
                        }
                    }

                    $entity = $id;
                    break;
                }
            }

            $cond = isset($entity);
            break;
    }

    if (!$cond) {
        // No match: parse error. No characters are consumed and nothing is
        // returned.
        $this->char = $start;
        return false;
    }

    // Return the character(s) corresponding to the matched entity.
    return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
}
/**
 * Hands a token to the tree constructor and updates the tokenizer's
 * content model from the outcome.
 *
 * An integer returned by the tree constructor becomes the new content
 * model; otherwise an end tag token resets the content model to PCDATA.
 *
 * @param array $token Token to emit; its 'type' key is read here.
 */
private function emitToken($token)
{
    $requested = $this->tree->emitToken($token);

    if (is_int($requested)) {
        $this->content_model = $requested;
        return;
    }

    if ($token['type'] === self::ENDTAG) {
        $this->content_model = self::PCDATA;
    }
}
/**
 * Ends tokenization: clears the current state and sends an end-of-file
 * token straight to the tree constructor.
 */
private function EOF()
{
    $eofToken = array(
        'type' => self::EOF
    );

    $this->state = null;
    $this->tree->emitToken($eofToken);
}
1171 class HTML5TreeConstructer {
/** Stack of open elements. */
public $stack = array();

/** Current tree construction phase (one of the *_PHASE constants). */
private $phase;

/** Current insertion mode within the main phase. */
private $mode;

/** The DOMDocument being built. */
private $dom;

/** Foster parent node; initially unset. */
private $foster_parent = null;

/** List of active formatting elements (may contain self::MARKER entries). */
private $a_formatting = array();

/** The head element, once one has been inserted. */
private $head_pointer = null;

/** The form element pointer; null while no form has been opened. */
private $form_pointer = null;

/** Tag names in the "scoping" element category. */
private $scoping = array(
    'button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th'
);

/** Tag names in the "formatting" element category. */
private $formatting = array(
    'a', 'b', 'big', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike',
    'strong', 'tt', 'u'
);

/** Tag names in the "special" element category. */
private $special = array(
    'address', 'area', 'base', 'basefont', 'bgsound', 'blockquote', 'body',
    'br', 'center', 'col', 'colgroup', 'dd', 'dir', 'div', 'dl', 'dt',
    'embed', 'fieldset', 'form', 'frame', 'frameset', 'h1', 'h2', 'h3', 'h4',
    'h5', 'h6', 'head', 'hr', 'iframe', 'image', 'img', 'input', 'isindex',
    'li', 'link', 'listing', 'menu', 'meta', 'noembed', 'noframes',
    'noscript', 'ol', 'optgroup', 'option', 'p', 'param', 'plaintext', 'pre',
    'script', 'select', 'spacer', 'style', 'tbody', 'textarea', 'tfoot',
    'thead', 'title', 'tr', 'ul', 'wbr'
);

// Tree construction phases.
const INIT_PHASE = 0;
const ROOT_PHASE = 1;
const MAIN_PHASE = 2;
const END_PHASE  = 3;

// Insertion modes used during the main phase.
const BEFOR_HEAD = 0;
const IN_HEAD    = 1;
const AFTER_HEAD = 2;
const IN_BODY    = 3;
const IN_TABLE   = 4;
const IN_CAPTION = 5;
const IN_CGROUP  = 6;
const IN_TBODY   = 7;
const IN_ROW     = 8;
const IN_CELL    = 9;
const IN_SELECT  = 10;
const AFTER_BODY = 11;
const IN_FRAME   = 12;
const AFTR_FRAME = 13;

// Element categories.
const SPECIAL    = 0;
const SCOPING    = 1;
const FORMATTING = 2;
const PHRASING   = 3;

// Sentinel stored in the list of active formatting elements.
const MARKER = 0;
/**
 * Starts in the initial phase / "before head" insertion mode and creates
 * the empty UTF-8 DOMDocument the tree is built into.
 */
public function __construct()
{
    $document = new DOMDocument;
    $document->encoding = 'UTF-8';
    $document->preserveWhiteSpace = true;
    $document->substituteEntities = true;
    $document->strictErrorChecking = false;

    $this->dom = $document;
    $this->phase = self::INIT_PHASE;
    $this->mode = self::BEFOR_HEAD;
}
/**
 * Routes a token to the handler for the current tree construction phase.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed Whatever the phase handler returns; null when the current
 *               phase matches none of the known phases.
 */
public function emitToken($token)
{
    $phase = $this->phase;

    if ($phase == self::INIT_PHASE) {
        return $this->initPhase($token);
    }

    if ($phase == self::ROOT_PHASE) {
        return $this->rootElementPhase($token);
    }

    if ($phase == self::MAIN_PHASE) {
        return $this->mainPhase($token);
    }

    if ($phase == self::END_PHASE) {
        return $this->trailingEndPhase($token);
    }

    return null;
}
/**
 * Initial phase of tree construction.
 *
 * - An erroneous DOCTYPE, a comment, a start/end tag, end-of-file, or a
 *   character token containing something other than whitespace switches to
 *   the root element phase and is reprocessed there.
 * - A correct DOCTYPE only switches to the root element phase.
 * - Whitespace-only character data is appended to the document.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed The root element phase's result when the token is
 *               reprocessed there, otherwise null.
 */
private function initPhase($token)
{
    $type = $token['type'];
    $flaggedError = isset($token['error']) && $token['error'];

    $nonWhitespaceText = $type === HTML5::CHARACTR
        && isset($token['data'])
        && !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']);

    if ($flaggedError
        || $type === HTML5::COMMENT
        || $type === HTML5::STARTTAG
        || $type === HTML5::ENDTAG
        || $type === HTML5::EOF
        || $nonWhitespaceText
    ) {
        /* The specification does not define how to handle this case; the
        token is simply handed on to the root element phase. */
        $this->phase = self::ROOT_PHASE;
        return $this->rootElementPhase($token);
    }

    /* A DOCTYPE token marked as being correct */
    if (isset($token['error']) && !$token['error']) {
        /* No DocumentType node is attached to the document: the previous
        code instantiated a DOMDocumentType here but never appended it, so
        that dead object creation has been dropped. Only the phase switch
        has any effect. */
        $this->phase = self::ROOT_PHASE;
        return null;
    }

    /* A character token consisting solely of U+0009, U+000A, U+000B,
    U+000C or U+0020 */
    if (isset($token['data'])
        && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append that character data to the Document node. */
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
    }

    return null;
}
/**
 * Root element phase of tree construction.
 *
 * DOCTYPE tokens are ignored, comments and whitespace-only text are
 * appended to the document, and anything else creates the <html> root
 * element, switches to the main phase and is reprocessed there.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed The main phase's result when the token is reprocessed
 *               there, otherwise null.
 */
private function rootElementPhase($token)
{
    $type = $token['type'];

    /* A DOCTYPE token: parse error, ignore the token. */
    if ($type === HTML5::DOCTYPE) {
        return;
    }

    /* A comment token: append a Comment node to the Document object. */
    if ($type === HTML5::COMMENT) {
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);
        return;
    }

    $isText = ($type === HTML5::CHARACTR);

    /* Whitespace-only character data: append it to the Document node. */
    if ($isText && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $text = $this->dom->createTextNode($token['data']);
        $this->dom->appendChild($text);
        return;
    }

    /* Any other character data, a start tag, an end tag, or end-of-file:
    create the html element, append it to the document, push it onto the
    stack of open elements, then switch to the main phase and reprocess
    the current token. */
    if ($isText
        || $type === HTML5::STARTTAG
        || $type === HTML5::ENDTAG
        || $type === HTML5::EOF
    ) {
        $root = $this->dom->createElement('html');
        $this->dom->appendChild($root);
        $this->stack[] = $root;

        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);
    }
}
/**
 * Main phase of tree construction.
 *
 * DOCTYPE tokens are ignored, a repeated <html> start tag only contributes
 * attributes that the root element lacks, end-of-file generates implied
 * end tags, and every other token is dispatched on the current insertion
 * mode.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed The insertion-mode handler's result, or null for tokens
 *               handled directly here.
 */
private function mainPhase($token)
{
    /* A DOCTYPE token */
    if ($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A start tag token with the tag name "html" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
        /* For each attribute on the token, add it to the first element on
        the stack of open elements unless it is already present there. */
        foreach ($token['attr'] as $attr) {
            if (!$this->stack[0]->hasAttribute($attr['name'])) {
                $this->stack[0]->setAttribute($attr['name'], $attr['value']);
            }
        }

    /* An end-of-file token */
    } elseif ($token['type'] === HTML5::EOF) {
        /* Generate implied end tags. */
        $this->generateImpliedEndTags();

    /* Anything else. */
    } else {
        /* Depends on the insertion mode. A former trailing
        "case self::END_PHASE" has been dropped: END_PHASE is a phase
        constant whose value (3) equals IN_BODY, so that case could never
        be reached. */
        switch ($this->mode) {
            case self::BEFOR_HEAD: return $this->beforeHead($token);
            case self::IN_HEAD:    return $this->inHead($token);
            case self::AFTER_HEAD: return $this->afterHead($token);
            case self::IN_BODY:    return $this->inBody($token);
            case self::IN_TABLE:   return $this->inTable($token);
            case self::IN_CAPTION: return $this->inCaption($token);
            case self::IN_CGROUP:  return $this->inColumnGroup($token);
            case self::IN_TBODY:   return $this->inTableBody($token);
            case self::IN_ROW:     return $this->inRow($token);
            case self::IN_CELL:    return $this->inCell($token);
            case self::IN_SELECT:  return $this->inSelect($token);
            case self::AFTER_BODY: return $this->afterBody($token);
            case self::IN_FRAME:   return $this->inFrameset($token);
            case self::AFTR_FRAME: return $this->afterFrameset($token);
        }
    }
}
/**
 * "Before head" insertion mode.
 *
 * Whitespace and comments are inserted as-is, a <head> start tag creates
 * the head element, and any other start tag, an </html> end tag or other
 * character data implies a <head> before being reprocessed "in head".
 * Remaining end tags are ignored.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed The "in head" handler's result when the token is
 *               reprocessed there, otherwise null.
 */
private function beforeHead($token)
{
    $type = $token['type'];
    $isText = ($type === HTML5::CHARACTR);
    $isStartTag = ($type === HTML5::STARTTAG);

    /* Whitespace-only character data: append it to the current node. */
    if ($isText && preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    /* A start tag token with the tag name "head" */
    if ($isStartTag && $token['name'] === 'head') {
        /* Insert the element, remember it as the head element and change
        the insertion mode to "in head". */
        $this->head_pointer = $this->insertElement($token);
        $this->mode = self::IN_HEAD;
        return;
    }

    /* Any other start tag, an end tag named "html", or character data
    that is not a single whitespace character. */
    $impliesHead = $isStartTag
        || ($type === HTML5::ENDTAG && $token['name'] === 'html')
        || ($isText && !preg_match('/^[\t\n\x0b\x0c ]$/', $token['data']));

    if ($impliesHead) {
        /* Act as if a start tag token with the tag name "head" and no
        attributes had been seen, then reprocess the current token. */
        $this->beforeHead(array(
            'name' => 'head',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inHead($token);
    }

    /* Any other end tag: parse error, ignore the token. */
}
/**
 * "In head" insertion mode.
 *
 * Handles whitespace, comments and the head-level elements (title, style,
 * script, base, link, meta), closes the head on </head>, and hands every
 * other token on to the "after head" mode.
 *
 * The head element pointer may be null; every branch that uses it falls
 * back to the current node (or treats the head as "not current") instead
 * of dereferencing null.
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed A content model constant (HTML5::PCDATA, HTML5::RCDATA or
 *               HTML5::CDATA) when the tokenizer must switch models, the
 *               "after head" handler's result when the token is
 *               reprocessed there, otherwise null.
 */
private function inHead($token)
{
    /* Whitespace-only character data.

    THIS DIFFERS FROM THE SPEC: if the current node is a title, style or
    script element, character data is appended to it regardless of its
    content. */
    if (($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
        $token['type'] === HTML5::CHARACTR && in_array(end($this->stack)->nodeName,
        array('title', 'style', 'script')))) {
        /* Append the character data to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node. */
        $this->insertComment($token['data']);

    /* An end tag closing title, style or script */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        in_array($token['name'], array('title', 'style', 'script'))) {
        array_pop($this->stack);
        return HTML5::PCDATA;

    /* A start tag with the tag name "title" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
        /* Append the new element to the head element, or, if there is no
        head element pointer, to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $element = $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the RCDATA state. */
        return HTML5::RCDATA;

    /* A start tag with the tag name "style" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "script" */
    } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
        /* Same null guard as the title/style branches: without a head
        element pointer the element goes to the current node. */
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);

        } else {
            $this->insertElement($token);
        }

        /* Switch the tokeniser's content model flag to the CDATA state. */
        return HTML5::CDATA;

    /* A start tag with the tag name "base", "link", or "meta" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array($token['name'],
        array('base', 'link', 'meta'))) {
        if ($this->head_pointer !== null) {
            $element = $this->insertElement($token, false);
            $this->head_pointer->appendChild($element);
            array_pop($this->stack);

        } else {
            $this->insertElement($token);
        }

    /* An end tag with the tag name "head" */
    } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
        /* If the current node is the head element, pop it off the stack of
        open elements. Otherwise this is a parse error. */
        if ($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))) {
            array_pop($this->stack);
        }

        /* Change the insertion mode to "after head". */
        $this->mode = self::AFTER_HEAD;

    /* A start tag with the tag name "head" or an end tag except "html". */
    } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
        ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* If the current node is the head element, act as if an end tag
        token with the tag name "head" had been seen. */
        if ($this->head_pointer !== null &&
            $this->head_pointer->isSameNode(end($this->stack))) {
            $this->inHead(array(
                'name' => 'head',
                'type' => HTML5::ENDTAG
            ));

        /* Otherwise, change the insertion mode to "after head". */
        } else {
            $this->mode = self::AFTER_HEAD;
        }

        /* Then, reprocess the current token. */
        return $this->afterHead($token);
    }
}
/**
 * "After head" insertion mode.
 *
 * Whitespace and comments are inserted as-is, <body> and <frameset> open
 * their respective modes, stray head-level start tags are sent back to
 * "in head", and anything else implies a <body> before being reprocessed
 * "in body".
 *
 * @param array $token Token produced by the tokenizer.
 * @return mixed The result of the handler the token is reprocessed by,
 *               otherwise null.
 */
private function afterHead($token)
{
    $type = $token['type'];
    $isStartTag = ($type === HTML5::STARTTAG);

    /* Whitespace-only character data: append it to the current node. */
    if ($type === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->insertText($token['data']);
        return;
    }

    /* A comment token: append a Comment node to the current node. */
    if ($type === HTML5::COMMENT) {
        $this->insertComment($token['data']);
        return;
    }

    /* A start tag token with the tag name "body" */
    if ($isStartTag && $token['name'] === 'body') {
        $this->insertElement($token);
        $this->mode = self::IN_BODY;
        return;
    }

    /* A start tag token with the tag name "frameset" */
    if ($isStartTag && $token['name'] === 'frameset') {
        $this->insertElement($token);
        $this->mode = self::IN_FRAME;
        return;
    }

    /* A start tag token whose tag name is one of: "base", "link", "meta",
    "script", "style", "title" */
    if ($isStartTag && in_array($token['name'],
        array('base', 'link', 'meta', 'script', 'style', 'title'))) {
        /* Parse error. Switch the insertion mode back to "in head" and
        reprocess the token. */
        $this->mode = self::IN_HEAD;
        return $this->inHead($token);
    }

    /* Anything else: act as if a start tag token with the tag name "body"
    and no attributes had been seen, then reprocess the current token. */
    $this->afterHead(array(
        'name' => 'body',
        'type' => HTML5::STARTTAG,
        'attr' => array()
    ));

    return $this->inBody($token);
}
1625 private function inBody($token) {
1626 /* Handle the token as follows: */
1628 switch($token['type']) {
1629 /* A character token */
1630 case HTML5::CHARACTR:
1631 /* Reconstruct the active formatting elements, if any. */
1632 $this->reconstructActiveFormattingElements();
1634 /* Append the token's character to the current node. */
1635 $this->insertText($token['data']);
1636 break;
1638 /* A comment token */
1639 case HTML5::COMMENT:
1640 /* Append a Comment node to the current node with the data
1641 attribute set to the data given in the comment token. */
1642 $this->insertComment($token['data']);
1643 break;
1645 case HTML5::STARTTAG:
1646 switch($token['name']) {
1647 /* A start tag token whose tag name is one of: "script",
1648 "style" */
1649 case 'script': case 'style':
1650 /* Process the token as if the insertion mode had been "in
1651 head". */
1652 return $this->inHead($token);
1653 break;
1655 /* A start tag token whose tag name is one of: "base", "link",
1656 "meta", "title" */
1657 case 'base': case 'link': case 'meta': case 'title':
1658 /* Parse error. Process the token as if the insertion mode
1659 had been "in head". */
1660 return $this->inHead($token);
1661 break;
1663 /* A start tag token with the tag name "body" */
1664 case 'body':
1665 /* Parse error. If the second element on the stack of open
1666 elements is not a body element, or, if the stack of open
1667 elements has only one node on it, then ignore the token.
1668 (innerHTML case) */
1669 if(count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
1670 // Ignore
1672 /* Otherwise, for each attribute on the token, check to see
1673 if the attribute is already present on the body element (the
1674 second element) on the stack of open elements. If it is not,
1675 add the attribute and its corresponding value to that
1676 element. */
1677 } else {
1678 foreach($token['attr'] as $attr) {
1679 if(!$this->stack[1]->hasAttribute($attr['name'])) {
1680 $this->stack[1]->setAttribute($attr['name'], $attr['value']);
1684 break;
1686 /* A start tag whose tag name is one of: "address",
1687 "blockquote", "center", "dir", "div", "dl", "fieldset",
1688 "listing", "menu", "ol", "p", "ul" */
1689 case 'address': case 'blockquote': case 'center': case 'dir':
1690 case 'div': case 'dl': case 'fieldset': case 'listing':
1691 case 'menu': case 'ol': case 'p': case 'ul':
1692 /* If the stack of open elements has a p element in scope,
1693 then act as if an end tag with the tag name p had been
1694 seen. */
1695 if($this->elementInScope('p')) {
1696 $this->emitToken(array(
1697 'name' => 'p',
1698 'type' => HTML5::ENDTAG
1702 /* Insert an HTML element for the token. */
1703 $this->insertElement($token);
1704 break;
1706 /* A start tag whose tag name is "form" */
1707 case 'form':
1708 /* If the form element pointer is not null, ignore the
1709 token with a parse error. */
1710 if($this->form_pointer !== null) {
1711 // Ignore.
1713 /* Otherwise: */
1714 } else {
1715 /* If the stack of open elements has a p element in
1716 scope, then act as if an end tag with the tag name p
1717 had been seen. */
1718 if($this->elementInScope('p')) {
1719 $this->emitToken(array(
1720 'name' => 'p',
1721 'type' => HTML5::ENDTAG
1725 /* Insert an HTML element for the token, and set the
1726 form element pointer to point to the element created. */
1727 $element = $this->insertElement($token);
1728 $this->form_pointer = $element;
1730 break;
1732 /* A start tag whose tag name is "li", "dd" or "dt" */
1733 case 'li': case 'dd': case 'dt':
1734 /* If the stack of open elements has a p element in scope,
1735 then act as if an end tag with the tag name p had been
1736 seen. */
1737 if($this->elementInScope('p')) {
1738 $this->emitToken(array(
1739 'name' => 'p',
1740 'type' => HTML5::ENDTAG
1744 $stack_length = count($this->stack) - 1;
1746 for($n = $stack_length; 0 <= $n; $n--) {
1747 /* 1. Initialise node to be the current node (the
1748 bottommost node of the stack). */
1749 $stop = false;
1750 $node = $this->stack[$n];
1751 $cat = $this->getElementCategory($node->tagName);
1753 /* 2. If node is an li, dd or dt element, then pop all
1754 the nodes from the current node up to node, including
1755 node, then stop this algorithm. */
1756 if($token['name'] === $node->tagName || ($token['name'] !== 'li'
1757 && ($node->tagName === 'dd' || $node->tagName === 'dt'))) {
1758 for($x = $stack_length; $x >= $n ; $x--) {
1759 array_pop($this->stack);
1762 break;
1765 /* 3. If node is not in the formatting category, and is
1766 not in the phrasing category, and is not an address or
1767 div element, then stop this algorithm. */
1768 if($cat !== self::FORMATTING && $cat !== self::PHRASING &&
1769 $node->tagName !== 'address' && $node->tagName !== 'div') {
1770 break;
1774 /* Finally, insert an HTML element with the same tag
1775 name as the token's. */
1776 $this->insertElement($token);
1777 break;
1779 /* A start tag token whose tag name is "plaintext" */
1780 case 'plaintext':
1781 /* If the stack of open elements has a p element in scope,
1782 then act as if an end tag with the tag name p had been
1783 seen. */
1784 if($this->elementInScope('p')) {
1785 $this->emitToken(array(
1786 'name' => 'p',
1787 'type' => HTML5::ENDTAG
1791 /* Insert an HTML element for the token. */
1792 $this->insertElement($token);
1794 return HTML5::PLAINTEXT;
1795 break;
1797 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
1798 "h5", "h6" */
1799 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
1800 /* If the stack of open elements has a p element in scope,
1801 then act as if an end tag with the tag name p had been seen. */
1802 if($this->elementInScope('p')) {
1803 $this->emitToken(array(
1804 'name' => 'p',
1805 'type' => HTML5::ENDTAG
1809 /* If the stack of open elements has in scope an element whose
1810 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
1811 this is a parse error; pop elements from the stack until an
1812 element with one of those tag names has been popped from the
1813 stack. */
1814 while($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
1815 array_pop($this->stack);
1818 /* Insert an HTML element for the token. */
1819 $this->insertElement($token);
1820 break;
1822 /* A start tag whose tag name is "a" */
1823 case 'a':
1824 /* If the list of active formatting elements contains
1825 an element whose tag name is "a" between the end of the
1826 list and the last marker on the list (or the start of
1827 the list if there is no marker on the list), then this
1828 is a parse error; act as if an end tag with the tag name
1829 "a" had been seen, then remove that element from the list
1830 of active formatting elements and the stack of open
1831 elements if the end tag didn't already remove it (it
1832 might not have if the element is not in table scope). */
1833 $leng = count($this->a_formatting);
1835 for($n = $leng - 1; $n >= 0; $n--) {
1836 if($this->a_formatting[$n] === self::MARKER) {
1837 break;
1839 } elseif($this->a_formatting[$n]->nodeName === 'a') {
1840 $this->emitToken(array(
1841 'name' => 'a',
1842 'type' => HTML5::ENDTAG
1844 break;
1848 /* Reconstruct the active formatting elements, if any. */
1849 $this->reconstructActiveFormattingElements();
1851 /* Insert an HTML element for the token. */
1852 $el = $this->insertElement($token);
1854 /* Add that element to the list of active formatting
1855 elements. */
1856 $this->a_formatting[] = $el;
1857 break;
1859 /* A start tag whose tag name is one of: "b", "big", "em", "font",
1860 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
1861 case 'b': case 'big': case 'em': case 'font': case 'i':
1862 case 'nobr': case 's': case 'small': case 'strike':
1863 case 'strong': case 'tt': case 'u':
1864 /* Reconstruct the active formatting elements, if any. */
1865 $this->reconstructActiveFormattingElements();
1867 /* Insert an HTML element for the token. */
1868 $el = $this->insertElement($token);
1870 /* Add that element to the list of active formatting
1871 elements. */
1872 $this->a_formatting[] = $el;
1873 break;
1875 /* A start tag token whose tag name is "button" */
1876 case 'button':
1877 /* If the stack of open elements has a button element in scope,
1878 then this is a parse error; act as if an end tag with the tag
1879 name "button" had been seen, then reprocess the token. (We don't
1880 do that. Unnecessary.) */
1881 if($this->elementInScope('button')) {
1882 $this->inBody(array(
1883 'name' => 'button',
1884 'type' => HTML5::ENDTAG
1888 /* Reconstruct the active formatting elements, if any. */
1889 $this->reconstructActiveFormattingElements();
1891 /* Insert an HTML element for the token. */
1892 $this->insertElement($token);
1894 /* Insert a marker at the end of the list of active
1895 formatting elements. */
1896 $this->a_formatting[] = self::MARKER;
1897 break;
1899 /* A start tag token whose tag name is one of: "marquee", "object" */
1900 case 'marquee': case 'object':
1901 /* Reconstruct the active formatting elements, if any. */
1902 $this->reconstructActiveFormattingElements();
1904 /* Insert an HTML element for the token. */
1905 $this->insertElement($token);
1907 /* Insert a marker at the end of the list of active
1908 formatting elements. */
1909 $this->a_formatting[] = self::MARKER;
1910 break;
1912 /* A start tag token whose tag name is "xmp" */
1913 case 'xmp':
1914 /* Reconstruct the active formatting elements, if any. */
1915 $this->reconstructActiveFormattingElements();
1917 /* Insert an HTML element for the token. */
1918 $this->insertElement($token);
1920 /* Switch the content model flag to the CDATA state. */
1921 return HTML5::CDATA;
1922 break;
1924 /* A start tag whose tag name is "table" */
1925 case 'table':
1926 /* If the stack of open elements has a p element in scope,
1927 then act as if an end tag with the tag name p had been seen. */
1928 if($this->elementInScope('p')) {
1929 $this->emitToken(array(
1930 'name' => 'p',
1931 'type' => HTML5::ENDTAG
1935 /* Insert an HTML element for the token. */
1936 $this->insertElement($token);
1938 /* Change the insertion mode to "in table". */
1939 $this->mode = self::IN_TABLE;
1940 break;
1942 /* A start tag whose tag name is one of: "area", "basefont",
1943 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
1944 case 'area': case 'basefont': case 'bgsound': case 'br':
1945 case 'embed': case 'img': case 'param': case 'spacer':
1946 case 'wbr':
1947 /* Reconstruct the active formatting elements, if any. */
1948 $this->reconstructActiveFormattingElements();
1950 /* Insert an HTML element for the token. */
1951 $this->insertElement($token);
1953 /* Immediately pop the current node off the stack of open elements. */
1954 array_pop($this->stack);
1955 break;
1957 /* A start tag whose tag name is "hr" */
1958 case 'hr':
1959 /* If the stack of open elements has a p element in scope,
1960 then act as if an end tag with the tag name p had been seen. */
1961 if($this->elementInScope('p')) {
1962 $this->emitToken(array(
1963 'name' => 'p',
1964 'type' => HTML5::ENDTAG
1968 /* Insert an HTML element for the token. */
1969 $this->insertElement($token);
1971 /* Immediately pop the current node off the stack of open elements. */
1972 array_pop($this->stack);
1973 break;
1975 /* A start tag whose tag name is "image" */
1976 case 'image':
1977 /* Parse error. Change the token's tag name to "img" and
1978 reprocess it. (Don't ask.) */
1979 $token['name'] = 'img';
1980 return $this->inBody($token);
1981 break;
1983 /* A start tag whose tag name is "input" */
1984 case 'input':
1985 /* Reconstruct the active formatting elements, if any. */
1986 $this->reconstructActiveFormattingElements();
1988 /* Insert an input element for the token. */
1989 $element = $this->insertElement($token, false);
1991 /* If the form element pointer is not null, then associate the
1992 input element with the form element pointed to by the form
1993 element pointer. */
1994 $this->form_pointer !== null
1995 ? $this->form_pointer->appendChild($element)
1996 : end($this->stack)->appendChild($element);
1998 /* Pop that input element off the stack of open elements. */
1999 array_pop($this->stack);
2000 break;
2002 /* A start tag whose tag name is "isindex" */
2003 case 'isindex':
2004 /* Parse error. */
2005 // w/e
2007 /* If the form element pointer is not null,
2008 then ignore the token. */
2009 if($this->form_pointer === null) {
2010 /* Act as if a start tag token with the tag name "form" had
2011 been seen. */
2012 $this->inBody(array(
2013 'name' => 'body',
2014 'type' => HTML5::STARTTAG,
2015 'attr' => array()
2018 /* Act as if a start tag token with the tag name "hr" had
2019 been seen. */
2020 $this->inBody(array(
2021 'name' => 'hr',
2022 'type' => HTML5::STARTTAG,
2023 'attr' => array()
2026 /* Act as if a start tag token with the tag name "p" had
2027 been seen. */
2028 $this->inBody(array(
2029 'name' => 'p',
2030 'type' => HTML5::STARTTAG,
2031 'attr' => array()
2034 /* Act as if a start tag token with the tag name "label"
2035 had been seen. */
2036 $this->inBody(array(
2037 'name' => 'label',
2038 'type' => HTML5::STARTTAG,
2039 'attr' => array()
2042 /* Act as if a stream of character tokens had been seen. */
2043 $this->insertText('This is a searchable index. '.
2044 'Insert your search keywords here: ');
2046 /* Act as if a start tag token with the tag name "input"
2047 had been seen, with all the attributes from the "isindex"
2048 token, except with the "name" attribute set to the value
2049 "isindex" (ignoring any explicit "name" attribute). */
2050 $attr = $token['attr'];
2051 $attr[] = array('name' => 'name', 'value' => 'isindex');
2053 $this->inBody(array(
2054 'name' => 'input',
2055 'type' => HTML5::STARTTAG,
2056 'attr' => $attr
2059 /* Act as if a stream of character tokens had been seen
2060 (see below for what they should say). */
2061 $this->insertText('This is a searchable index. '.
2062 'Insert your search keywords here: ');
2064 /* Act as if an end tag token with the tag name "label"
2065 had been seen. */
2066 $this->inBody(array(
2067 'name' => 'label',
2068 'type' => HTML5::ENDTAG
2071 /* Act as if an end tag token with the tag name "p" had
2072 been seen. */
2073 $this->inBody(array(
2074 'name' => 'p',
2075 'type' => HTML5::ENDTAG
2078 /* Act as if a start tag token with the tag name "hr" had
2079 been seen. */
2080 $this->inBody(array(
2081 'name' => 'hr',
2082 'type' => HTML5::ENDTAG
2085 /* Act as if an end tag token with the tag name "form" had
2086 been seen. */
2087 $this->inBody(array(
2088 'name' => 'form',
2089 'type' => HTML5::ENDTAG
2092 break;
2094 /* A start tag whose tag name is "textarea" */
2095 case 'textarea':
2096 $this->insertElement($token);
2098 /* Switch the tokeniser's content model flag to the
2099 RCDATA state. */
2100 return HTML5::RCDATA;
2101 break;
2103 /* A start tag whose tag name is one of: "iframe", "noembed",
2104 "noframes" */
2105 case 'iframe': case 'noembed': case 'noframes':
2106 $this->insertElement($token);
2108 /* Switch the tokeniser's content model flag to the CDATA state. */
2109 return HTML5::CDATA;
2110 break;
2112 /* A start tag whose tag name is "select" */
2113 case 'select':
2114 /* Reconstruct the active formatting elements, if any. */
2115 $this->reconstructActiveFormattingElements();
2117 /* Insert an HTML element for the token. */
2118 $this->insertElement($token);
2120 /* Change the insertion mode to "in select". */
2121 $this->mode = self::IN_SELECT;
2122 break;
2124 /* A start or end tag whose tag name is one of: "caption", "col",
2125 "colgroup", "frame", "frameset", "head", "option", "optgroup",
2126 "tbody", "td", "tfoot", "th", "thead", "tr". */
2127 case 'caption': case 'col': case 'colgroup': case 'frame':
2128 case 'frameset': case 'head': case 'option': case 'optgroup':
2129 case 'tbody': case 'td': case 'tfoot': case 'th': case 'thead':
2130 case 'tr':
2131 // Parse error. Ignore the token.
2132 break;
2134 /* A start or end tag whose tag name is one of: "event-source",
2135 "section", "nav", "article", "aside", "header", "footer",
2136 "datagrid", "command" */
2137 case 'event-source': case 'section': case 'nav': case 'article':
2138 case 'aside': case 'header': case 'footer': case 'datagrid':
2139 case 'command':
2140 // Work in progress!
2141 break;
2143 /* A start tag token not covered by the previous entries */
2144 default:
2145 /* Reconstruct the active formatting elements, if any. */
2146 $this->reconstructActiveFormattingElements();
2148 $this->insertElement($token);
2149 break;
2151 break;
2153 case HTML5::ENDTAG:
2154 switch($token['name']) {
2155 /* An end tag with the tag name "body" */
2156 case 'body':
2157 /* If the second element in the stack of open elements is
2158 not a body element, this is a parse error. Ignore the token.
2159 (innerHTML case) */
2160 if(count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2161 // Ignore.
2163 /* If the current node is not the body element, then this
2164 is a parse error. */
2165 } elseif(end($this->stack)->nodeName !== 'body') {
2166 // Parse error.
2169 /* Change the insertion mode to "after body". */
2170 $this->mode = self::AFTER_BODY;
2171 break;
2173 /* An end tag with the tag name "html" */
2174 case 'html':
2175 /* Act as if an end tag with tag name "body" had been seen,
2176 then, if that token wasn't ignored, reprocess the current
2177 token. */
2178 $this->inBody(array(
2179 'name' => 'body',
2180 'type' => HTML5::ENDTAG
2183 return $this->afterBody($token);
2184 break;
2186 /* An end tag whose tag name is one of: "address", "blockquote",
2187 "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2188 "ol", "pre", "ul" */
2189 case 'address': case 'blockquote': case 'center': case 'dir':
2190 case 'div': case 'dl': case 'fieldset': case 'listing':
2191 case 'menu': case 'ol': case 'pre': case 'ul':
2192 /* If the stack of open elements has an element in scope
2193 with the same tag name as that of the token, then generate
2194 implied end tags. */
2195 if($this->elementInScope($token['name'])) {
2196 $this->generateImpliedEndTags();
2198 /* Now, if the current node is not an element with
2199 the same tag name as that of the token, then this
2200 is a parse error. */
2201 // w/e
2203 /* If the stack of open elements has an element in
2204 scope with the same tag name as that of the token,
2205 then pop elements from this stack until an element
2206 with that tag name has been popped from the stack. */
2207 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2208 if($this->stack[$n]->nodeName === $token['name']) {
2209 $n = -1;
2212 array_pop($this->stack);
2215 break;
2217 /* An end tag whose tag name is "form" */
2218 case 'form':
2219 /* If the stack of open elements has an element in scope
2220 with the same tag name as that of the token, then generate
2221 implied end tags. */
2222 if($this->elementInScope($token['name'])) {
2223 $this->generateImpliedEndTags();
2227 if(end($this->stack)->nodeName !== $token['name']) {
2228 /* Now, if the current node is not an element with the
2229 same tag name as that of the token, then this is a parse
2230 error. */
2231 // w/e
2233 } else {
2234 /* Otherwise, if the current node is an element with
2235 the same tag name as that of the token pop that element
2236 from the stack. */
2237 array_pop($this->stack);
2240 /* In any case, set the form element pointer to null. */
2241 $this->form_pointer = null;
2242 break;
2244 /* An end tag whose tag name is "p" */
2245 case 'p':
2246 /* If the stack of open elements has a p element in scope,
2247 then generate implied end tags, except for p elements. */
2248 if($this->elementInScope('p')) {
2249 $this->generateImpliedEndTags(array('p'));
2251 /* If the current node is not a p element, then this is
2252 a parse error. */
2253 // k
2255 /* If the stack of open elements has a p element in
2256 scope, then pop elements from this stack until the stack
2257 no longer has a p element in scope. */
2258 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2259 if($this->elementInScope('p')) {
2260 array_pop($this->stack);
2262 } else {
2263 break;
2267 break;
2269 /* An end tag whose tag name is "dd", "dt", or "li" */
2270 case 'dd': case 'dt': case 'li':
2271 /* If the stack of open elements has an element in scope
2272 whose tag name matches the tag name of the token, then
2273 generate implied end tags, except for elements with the
2274 same tag name as the token. */
2275 if($this->elementInScope($token['name'])) {
2276 $this->generateImpliedEndTags(array($token['name']));
2278 /* If the current node is not an element with the same
2279 tag name as the token, then this is a parse error. */
2280 // w/e
2282 /* If the stack of open elements has an element in scope
2283 whose tag name matches the tag name of the token, then
2284 pop elements from this stack until an element with that
2285 tag name has been popped from the stack. */
2286 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2287 if($this->stack[$n]->nodeName === $token['name']) {
2288 $n = -1;
2291 array_pop($this->stack);
2294 break;
2296 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2297 "h5", "h6" */
2298 case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
2299 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2301 /* If the stack of open elements has in scope an element whose
2302 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2303 generate implied end tags. */
2304 if($this->elementInScope($elements)) {
2305 $this->generateImpliedEndTags();
2307 /* Now, if the current node is not an element with the same
2308 tag name as that of the token, then this is a parse error. */
2309 // w/e
2311 /* If the stack of open elements has in scope an element
2312 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2313 "h6", then pop elements from the stack until an element
2314 with one of those tag names has been popped from the stack. */
2315 while($this->elementInScope($elements)) {
2316 array_pop($this->stack);
2319 break;
2321 /* An end tag whose tag name is one of: "a", "b", "big", "em",
2322 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2323 case 'a': case 'b': case 'big': case 'em': case 'font':
2324 case 'i': case 'nobr': case 's': case 'small': case 'strike':
2325 case 'strong': case 'tt': case 'u':
2326 /* 1. Let the formatting element be the last element in
2327 the list of active formatting elements that:
2328 * is between the end of the list and the last scope
2329 marker in the list, if any, or the start of the list
2330 otherwise, and
2331 * has the same tag name as the token.
2333 while(true) {
2334 for($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
2335 if($this->a_formatting[$a] === self::MARKER) {
2336 break;
2338 } elseif($this->a_formatting[$a]->tagName === $token['name']) {
2339 $formatting_element = $this->a_formatting[$a];
2340 $in_stack = in_array($formatting_element, $this->stack, true);
2341 $fe_af_pos = $a;
2342 break;
2346 /* If there is no such node, or, if that node is
2347 also in the stack of open elements but the element
2348 is not in scope, then this is a parse error. Abort
2349 these steps. The token is ignored. */
2350 if(!isset($formatting_element) || ($in_stack &&
2351 !$this->elementInScope($token['name']))) {
2352 break;
2354 /* Otherwise, if there is such a node, but that node
2355 is not in the stack of open elements, then this is a
2356 parse error; remove the element from the list, and
2357 abort these steps. */
2358 } elseif(isset($formatting_element) && !$in_stack) {
2359 unset($this->a_formatting[$fe_af_pos]);
2360 $this->a_formatting = array_merge($this->a_formatting);
2361 break;
2364 /* 2. Let the furthest block be the topmost node in the
2365 stack of open elements that is lower in the stack
2366 than the formatting element, and is not an element in
2367 the phrasing or formatting categories. There might
2368 not be one. */
2369 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2370 $length = count($this->stack);
2372 for($s = $fe_s_pos + 1; $s < $length; $s++) {
2373 $category = $this->getElementCategory($this->stack[$s]->nodeName);
2375 if($category !== self::PHRASING && $category !== self::FORMATTING) {
2376 $furthest_block = $this->stack[$s];
2380 /* 3. If there is no furthest block, then the UA must
2381 skip the subsequent steps and instead just pop all
2382 the nodes from the bottom of the stack of open
2383 elements, from the current node up to the formatting
2384 element, and remove the formatting element from the
2385 list of active formatting elements. */
2386 if(!isset($furthest_block)) {
2387 for($n = $length - 1; $n >= $fe_s_pos; $n--) {
2388 array_pop($this->stack);
2391 unset($this->a_formatting[$fe_af_pos]);
2392 $this->a_formatting = array_merge($this->a_formatting);
2393 break;
2396 /* 4. Let the common ancestor be the element
2397 immediately above the formatting element in the stack
2398 of open elements. */
2399 $common_ancestor = $this->stack[$fe_s_pos - 1];
2401 /* 5. If the furthest block has a parent node, then
2402 remove the furthest block from its parent node. */
2403 if($furthest_block->parentNode !== null) {
2404 $furthest_block->parentNode->removeChild($furthest_block);
2407 /* 6. Let a bookmark note the position of the
2408 formatting element in the list of active formatting
2409 elements relative to the elements on either side
2410 of it in the list. */
2411 $bookmark = $fe_af_pos;
2413 /* 7. Let node and last node be the furthest block.
2414 Follow these steps: */
2415 $node = $furthest_block;
2416 $last_node = $furthest_block;
2418 while(true) {
2419 for($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
2420 /* 7.1 Let node be the element immediately
2421 prior to node in the stack of open elements. */
2422 $node = $this->stack[$n];
2424 /* 7.2 If node is not in the list of active
2425 formatting elements, then remove node from
2426 the stack of open elements and then go back
2427 to step 1. */
2428 if(!in_array($node, $this->a_formatting, true)) {
2429 unset($this->stack[$n]);
2430 $this->stack = array_merge($this->stack);
2432 } else {
2433 break;
2437 /* 7.3 Otherwise, if node is the formatting
2438 element, then go to the next step in the overall
2439 algorithm. */
2440 if($node === $formatting_element) {
2441 break;
2443 /* 7.4 Otherwise, if last node is the furthest
2444 block, then move the aforementioned bookmark to
2445 be immediately after the node in the list of
2446 active formatting elements. */
2447 } elseif($last_node === $furthest_block) {
2448 $bookmark = array_search($node, $this->a_formatting, true) + 1;
2451 /* 7.5 If node has any children, perform a
2452 shallow clone of node, replace the entry for
2453 node in the list of active formatting elements
2454 with an entry for the clone, replace the entry
2455 for node in the stack of open elements with an
2456 entry for the clone, and let node be the clone. */
2457 if($node->hasChildNodes()) {
2458 $clone = $node->cloneNode();
2459 $s_pos = array_search($node, $this->stack, true);
2460 $a_pos = array_search($node, $this->a_formatting, true);
2462 $this->stack[$s_pos] = $clone;
2463 $this->a_formatting[$a_pos] = $clone;
2464 $node = $clone;
2467 /* 7.6 Insert last node into node, first removing
2468 it from its previous parent node if any. */
2469 if($last_node->parentNode !== null) {
2470 $last_node->parentNode->removeChild($last_node);
2473 $node->appendChild($last_node);
2475 /* 7.7 Let last node be node. */
2476 $last_node = $node;
2479 /* 8. Insert whatever last node ended up being in
2480 the previous step into the common ancestor node,
2481 first removing it from its previous parent node if
2482 any. */
2483 if($last_node->parentNode !== null) {
2484 $last_node->parentNode->removeChild($last_node);
2487 $common_ancestor->appendChild($last_node);
2489 /* 9. Perform a shallow clone of the formatting
2490 element. */
2491 $clone = $formatting_element->cloneNode();
2493 /* 10. Take all of the child nodes of the furthest
2494 block and append them to the clone created in the
2495 last step. */
2496 while($furthest_block->hasChildNodes()) {
2497 $child = $furthest_block->firstChild;
2498 $furthest_block->removeChild($child);
2499 $clone->appendChild($child);
2502 /* 11. Append that clone to the furthest block. */
2503 $furthest_block->appendChild($clone);
2505 /* 12. Remove the formatting element from the list
2506 of active formatting elements, and insert the clone
2507 into the list of active formatting elements at the
2508 position of the aforementioned bookmark. */
2509 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
2510 unset($this->a_formatting[$fe_af_pos]);
2511 $this->a_formatting = array_merge($this->a_formatting);
2513 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
2514 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
2515 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
2517 /* 13. Remove the formatting element from the stack
2518 of open elements, and insert the clone into the stack
2519 of open elements immediately after (i.e. in a more
2520 deeply nested position than) the position of the
2521 furthest block in that stack. */
2522 $fe_s_pos = array_search($formatting_element, $this->stack, true);
2523 $fb_s_pos = array_search($furthest_block, $this->stack, true);
2524 unset($this->stack[$fe_s_pos]);
2526 $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
2527 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
2528 $this->stack = array_merge($s_part1, array($clone), $s_part2);
2530 /* 14. Jump back to step 1 in this series of steps. */
2531 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
2533 break;
2535 /* An end tag token whose tag name is one of: "button",
2536 "marquee", "object" */
2537 case 'button': case 'marquee': case 'object':
2538 /* If the stack of open elements has an element in scope whose
2539 tag name matches the tag name of the token, then generate implied
2540 tags. */
2541 if($this->elementInScope($token['name'])) {
2542 $this->generateImpliedEndTags();
2544 /* Now, if the current node is not an element with the same
2545 tag name as the token, then this is a parse error. */
2546 // k
2548 /* Now, if the stack of open elements has an element in scope
2549 whose tag name matches the tag name of the token, then pop
2550 elements from the stack until that element has been popped from
2551 the stack, and clear the list of active formatting elements up
2552 to the last marker. */
2553 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2554 if($this->stack[$n]->nodeName === $token['name']) {
2555 $n = -1;
2558 array_pop($this->stack);
2561 $marker = end(array_keys($this->a_formatting, self::MARKER, true));
2563 for($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
2564 array_pop($this->a_formatting);
2567 break;
2569 /* Or an end tag whose tag name is one of: "area", "basefont",
2570 "bgsound", "br", "embed", "hr", "iframe", "image", "img",
2571 "input", "isindex", "noembed", "noframes", "param", "select",
2572 "spacer", "table", "textarea", "wbr" */
2573 case 'area': case 'basefont': case 'bgsound': case 'br':
2574 case 'embed': case 'hr': case 'iframe': case 'image':
2575 case 'img': case 'input': case 'isindex': case 'noembed':
2576 case 'noframes': case 'param': case 'select': case 'spacer':
2577 case 'table': case 'textarea': case 'wbr':
2578 // Parse error. Ignore the token.
2579 break;
2581 /* An end tag token not covered by the previous entries */
2582 default:
2583 for($n = count($this->stack) - 1; $n >= 0; $n--) {
2584 /* Initialise node to be the current node (the bottommost
2585 node of the stack). */
2586 $node = end($this->stack);
2588 /* If node has the same tag name as the end tag token,
2589 then: */
2590 if($token['name'] === $node->nodeName) {
2591 /* Generate implied end tags. */
2592 $this->generateImpliedEndTags();
2594 /* If the tag name of the end tag token does not
2595 match the tag name of the current node, this is a
2596 parse error. */
2597 // k
2599 /* Pop all the nodes from the current node up to
2600 node, including node, then stop this algorithm. */
2601 for($x = count($this->stack) - $n; $x >= $n; $x--) {
2602 array_pop($this->stack);
2605 } else {
2606 $category = $this->getElementCategory($node);
2608 if($category !== self::SPECIAL && $category !== self::SCOPING) {
2609 /* Otherwise, if node is in neither the formatting
2610 category nor the phrasing category, then this is a
2611 parse error. Stop this algorithm. The end tag token
2612 is ignored. */
2613 return false;
2617 break;
2619 break;
/**
 * Processes one token while the insertion mode is "in table".
 *
 * Table-structure tags are handled here; everything else is treated as a
 * parse error and forwarded to inBody(), after a foster parent has been
 * chosen when the current node is a table-structure element.
 *
 * @param array $token Token array with a 'type' key, a 'name' key for tags
 *                     (start tags also carry 'attr'), and a 'data' key for
 *                     character and comment tokens.
 * @return mixed false when a "table" end tag (real or implied) had to be
 *               ignored; otherwise the result of the handler the token was
 *               handed to for reprocessing, or null.
 */
private function inTable($token) {
    $clear = array('html', 'table');

    /* A character token that is one of one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

    /* A start tag whose tag name is "caption" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'caption') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

    /* A start tag whose tag name is "colgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'colgroup') {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

    /* A start tag whose tag name is "col" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'col') {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'colgroup',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        $this->inColumnGroup($token);

    /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('tbody', 'tfoot', 'thead'))) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

    /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    in_array($token['name'], array('td', 'th', 'tr'))) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(array(
            'name' => 'tbody',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inTableBody($token);

    /* A start tag whose tag name is "table" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'table') {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $ignored = $this->inTable(array(
            'name' => 'table',
            'type' => HTML5::ENDTAG
        )) === false;

        /* The end tag branch below returns false when it ignores the
        token. In that case nothing was closed, so the start tag must not
        be reprocessed; report it as ignored as well. */
        if($ignored) {
            return false;
        }

        return $this->mainPhase($token);

    /* An end tag whose tag name is "table" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            return false;

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a table element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a table element has been
            popped from the stack. */
            while(true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($current === 'table') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td',
    'tfoot', 'th', 'thead', 'tr'))) {
        // Parse error. Ignore the token.

    /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if(in_array(end($this->stack)->nodeName,
        array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
            /* The foster parent element is the parent element of the last
            table element in the stack of open elements, if there is a
            table element and it has such a parent element. If there is no
            table element in the stack of open elements (innerHTML case),
            then the foster parent element is the first element in the
            stack of open elements (the html element). Otherwise, if there
            is a table element in the stack of open elements, but the last
            table element in the stack of open elements has no parent, or
            its parent node is not an element, then the foster parent
            element is the element before the last table element in the
            stack of open elements. */
            for($n = count($this->stack) - 1; $n >= 0; $n--) {
                if($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if(!isset($table)) {
                /* No table on the stack: use the first element. */
                $this->foster_parent = $this->stack[0];

            } elseif($table->parentNode !== null &&
            $table->parentNode->nodeType === XML_ELEMENT_NODE) {
                /* The table has a parent and that parent is an element. */
                $this->foster_parent = $table->parentNode;

            } else {
                /* Parent missing or not an element: $n still holds the
                table's stack index because the loop exited via break. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        $this->inBody($token);
    }
}
/**
 * Processes one token while the insertion mode is "in caption".
 *
 * @param array $token Token array with a 'type' key and, for tags, a
 *                     'name' key.
 * @return mixed The result of inTable() when the token is reprocessed
 *               there; null otherwise.
 */
private function inCaption($token) {
    /* An end tag whose tag name is "caption" */
    if($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
    name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) || ($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'table')) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $this->inCaption(array(
            'name' => 'caption',
            'type' => HTML5::ENDTAG
        ));

        return $this->inTable($token);

    /* An end tag whose tag name is one of: "body", "col", "colgroup",
    "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) {
        // Parse error. Ignore the token.
        // ("td" was previously missing from this list, so a stray </td>
        // fell through to the "in body" rules instead of being ignored.)

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
/**
 * Processes one token while the insertion mode is "in column group".
 *
 * Whitespace and comments are appended to the current node, "col" start
 * tags are inserted and immediately popped, and a "colgroup" end tag closes
 * the group. Any other token implies a "colgroup" end tag and is then
 * handed to inTable().
 *
 * @param array $token Token array with a 'type' key, a 'name' key for tags
 *                     and a 'data' key for character and comment tokens.
 * @return mixed The result of inTable() for reprocessed tokens; null for
 *               tokens handled here.
 */
private function inColumnGroup($token) {
    $kind = $token['type'];

    /* A character token consisting only of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE: append it to the current node. */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* A comment token: append a Comment node carrying the token's data
    to the current node. */
    if($kind === HTML5::COMMENT) {
        $commentNode = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($commentNode);
        return;
    }

    /* A start tag whose tag name is "col": insert a col element, then
    immediately pop it off the stack of open elements. */
    if($kind === HTML5::STARTTAG && $token['name'] === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    /* An end tag whose tag name is "colgroup" */
    if($kind === HTML5::ENDTAG && $token['name'] === 'colgroup') {
        /* If the current node is the root html element this is a parse
        error and the token is ignored (innerHTML case). Otherwise pop
        the current node (which will be a colgroup element) and switch
        the insertion mode to "in table". */
        if(end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }
        return;
    }

    /* An end tag whose tag name is "col": parse error, ignore the token. */
    if($kind === HTML5::ENDTAG && $token['name'] === 'col') {
        return;
    }

    /* Anything else: act as if an end tag with the tag name "colgroup"
    had been seen, and then reprocess the current token. */
    $this->inColumnGroup(array(
        'name' => 'colgroup',
        'type' => HTML5::ENDTAG
    ));

    return $this->inTable($token);
}
/**
 * Processes one token while the insertion mode is "in table body".
 *
 * Fixes relative to the previous revision:
 *  - the start-tag list contained the misspelling 'tfoor' instead of
 *    'tfoot';
 *  - the "end tag whose tag name is table" case tested for a start tag.
 *
 * @param array $token Token array ('type', plus 'name'/'attr' for tags).
 * @return mixed Result of the handler the token was reprocessed by, or
 *               null when the token was consumed or ignored here.
 */
private function inTableBody($token) {
    $clear = array('tbody', 'tfoot', 'thead', 'html');

    /* A start tag whose tag name is "tr" */
    if($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
        /* Clear the stack back to a table body context. */
        $this->clearStackToTableContext($clear);

        /* Insert a tr element for the token, then switch the insertion
        mode to "in row". */
        $this->insertElement($token);
        $this->mode = self::IN_ROW;

    /* A start tag whose tag name is one of: "th", "td" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Parse error. Act as if a start tag with the tag name "tr" had
        been seen, then reprocess the current token. */
        $this->inTableBody(array(
            'name' => 'tr',
            'type' => HTML5::STARTTAG,
            'attr' => array()
        ));

        return $this->inRow($token);

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node from the stack of open elements. Switch
            the insertion mode to "in table". */
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead'))) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* If the stack of open elements does not have a tbody, thead, or
        tfoot element in table scope, this is a parse error. Ignore the
        token. (innerHTML case) */
        if(!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table body context. */
            $this->clearStackToTableContext($clear);

            /* Act as if an end tag with the same tag name as the current
            node ("tbody", "tfoot", or "thead") had been seen, then
            reprocess the current token. */
            $this->inTableBody(array(
                'name' => end($this->stack)->nodeName,
                'type' => HTML5::ENDTAG
            ));

            return $this->mainPhase($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
/**
 * Processes one token while the insertion mode is "in row".
 *
 * Fixes relative to the previous revision:
 *  - after the implied </tr>, the current token was reprocessed by
 *    inCell(). The row has just been closed and the mode set to "in table
 *    body", so start tags were silently dropped and end tags such as
 *    </tbody> bounced between inCell() and inRow() without terminating.
 *    The token is now reprocessed by inTableBody();
 *  - the "end tag whose tag name is table" case named in the comment was
 *    missing from the condition;
 *  - 'tr' was listed among the ignored end tags although </tr> is handled
 *    by an earlier branch.
 *
 * @param array $token Token array ('type', plus 'name'/'attr' for tags).
 * @return mixed Result of the handler the token was reprocessed by, or
 *               null when the token was consumed or ignored here.
 */
private function inRow($token) {
    $clear = array('tr', 'html');
    $end_tr = array(
        'name' => 'tr',
        'type' => HTML5::ENDTAG
    );

    /* A start tag whose tag name is one of: "th", "td" */
    if($token['type'] === HTML5::STARTTAG &&
    ($token['name'] === 'th' || $token['name'] === 'td')) {
        /* Clear the stack back to a table row context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in cell". */
        $this->insertElement($token);
        $this->mode = self::IN_CELL;

        /* Insert a marker at the end of the list of active formatting
        elements. */
        $this->a_formatting[] = self::MARKER;

    /* An end tag whose tag name is "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Clear the stack back to a table row context. */
            $this->clearStackToTableContext($clear);

            /* Pop the current node (which will be a tr element) from the
            stack of open elements. Switch the insertion mode to "in table
            body". */
            array_pop($this->stack);
            $this->mode = self::IN_TBODY;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
    } elseif(($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr'))) ||
    ($token['type'] === HTML5::ENDTAG && $token['name'] === 'table')) {
        /* Act as if an end tag with the tag name "tr" had been seen, then,
        if that token wasn't ignored, reprocess the current token. The
        implied </tr> is ignored exactly when no tr is in table scope. */
        if(!$this->elementInScope('tr', true)) {
            // Ignore.

        } else {
            $this->inRow($end_tr);

            /* The row is closed and the mode is now "in table body". */
            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    in_array($token['name'], array('tbody', 'tfoot', 'thead'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Act as if an end tag with the tag name "tr" had been seen,
            then reprocess the current token in "in table body". */
            $this->inRow($end_tr);

            return $this->inTableBody($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th'))) {
        /* Parse error. Ignore the token. */

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in table". */
        $this->inTable($token);
    }
}
/**
 * Processes one token while the insertion mode is "in cell".
 *
 * The previous revision contained the start-tag branch ("caption", "col",
 * "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr") twice in a row;
 * the second copy could never be reached and has been removed.
 *
 * @param array $token Token array ('type', plus 'name'/'attr' for tags).
 * @return mixed Result of inRow() when the token is reprocessed there,
 *               otherwise null.
 */
private function inCell($token) {
    /* An end tag whose tag name is one of: "td", "th" */
    if($token['type'] === HTML5::ENDTAG &&
    ($token['name'] === 'td' || $token['name'] === 'th')) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token, then this is a
        parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Generate implied end tags, except for elements with the same
            tag name as the token. */
            $this->generateImpliedEndTags(array($token['name']));

            /* If the current node is now not an element with the same tag
            name as the token, this is a parse error (not reported). */

            /* Pop elements from this stack until an element with the same
            tag name as the token has been popped from the stack. */
            while(true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($node === $token['name']) {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in row". (The current node
            will be a tr element at this point.) */
            $this->mode = self::IN_ROW;
        }

    /* A start tag whose tag name is one of: "caption", "col", "colgroup",
    "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif($token['type'] === HTML5::STARTTAG && in_array($token['name'],
    array('caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th',
    'thead', 'tr'))) {
        /* If the stack of open elements does not have a td or th element
        in table scope, then this is a parse error; ignore the token.
        (innerHTML case) */
        if(!$this->elementInScope(array('td', 'th'), true)) {
            // Ignore.

        /* Otherwise, close the cell and reprocess the current token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* An end tag whose tag name is one of: "body", "caption", "col",
    "colgroup", "html" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('body', 'caption', 'col', 'colgroup', 'html'))) {
        /* Parse error. Ignore the token. */

    /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
    "thead", "tr" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('table', 'tbody', 'tfoot', 'thead', 'tr'))) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as that of the token (which can only
        happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
        then this is a parse error and the token must be ignored. */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise, close the cell and reprocess the current token. */
        } else {
            $this->closeCell();
            return $this->inRow($token);
        }

    /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
/**
 * Processes one token while the insertion mode is "in select".
 *
 * Fixes relative to the previous revision:
 *  - the </optgroup> branch compared a stack entry (a DOM node) with the
 *    string 'optgroup', which is never identical, so the optgroup element
 *    was never popped. The node name is compared now, and the current node
 *    is re-read after the implied </option> instead of using an index
 *    computed before it;
 *  - the <select> and table-related end-tag branches read $token['name']
 *    before checking the token type; the type is tested first, as in the
 *    other branches, so tokens without a 'name' key are not indexed.
 *
 * @param array $token Token array ('type', plus 'data' or 'name'/'attr').
 * @return void
 */
private function inSelect($token) {
    /* A character token */
    if($token['type'] === HTML5::CHARACTR) {
        /* Append the token's character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* A start tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'optgroup') {
        /* If the current node is an option element, act as if an end tag
        with the tag name "option" had been seen. */
        if(end($this->stack)->nodeName === 'option') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, act as if an end tag
        with the tag name "optgroup" had been seen. */
        if(end($this->stack)->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'optgroup',
                'type' => HTML5::ENDTAG
            ));
        }

        /* Insert an HTML element for the token. */
        $this->insertElement($token);

    /* An end tag token whose tag name is "optgroup" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'optgroup') {
        /* First, if the current node is an option element, and the node
        immediately before it in the stack of open elements is an optgroup
        element, then act as if an end tag with the tag name "option" had
        been seen. */
        $elements_in_stack = count($this->stack);

        if($elements_in_stack >= 2 &&
        $this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
        $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup') {
            $this->inSelect(array(
                'name' => 'option',
                'type' => HTML5::ENDTAG
            ));
        }

        /* If the current node is an optgroup element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. The current node is looked up again here because
        the implied </option> above may have changed it. */
        if(end($this->stack)->nodeName === 'optgroup') {
            array_pop($this->stack);
        }

    /* An end tag token whose tag name is "option" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'option') {
        /* If the current node is an option element, then pop that node
        from the stack of open elements. Otherwise, this is a parse error,
        ignore the token. */
        if(end($this->stack)->nodeName === 'option') {
            array_pop($this->stack);
        }

    /* An end tag whose tag name is "select" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'select') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if(!$this->elementInScope($token['name'], true)) {
            // Ignore.

        /* Otherwise: */
        } else {
            /* Pop elements from the stack of open elements until a select
            element has been popped from the stack. */
            while(true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if($current === 'select') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

    /* A start tag whose tag name is "select" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'select') {
        /* Parse error. Act as if the token had been an end tag with the
        tag name "select" instead. */
        $this->inSelect(array(
            'name' => 'select',
            'type' => HTML5::ENDTAG
        ));

    /* An end tag whose tag name is one of: "caption", "table", "tbody",
    "tfoot", "thead", "tr", "td", "th" */
    } elseif($token['type'] === HTML5::ENDTAG && in_array($token['name'],
    array('caption', 'table', 'tbody', 'tfoot', 'thead', 'tr', 'td', 'th'))) {
        /* Parse error. */

        /* If the stack of open elements has an element in table scope with
        the same tag name as that of the token, then act as if an end tag
        with the tag name "select" had been seen, and reprocess the token.
        Otherwise, ignore the token. */
        if($this->elementInScope($token['name'], true)) {
            $this->inSelect(array(
                'name' => 'select',
                'type' => HTML5::ENDTAG
            ));

            $this->mainPhase($token);
        }

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes one token while the insertion mode is "after body".
 *
 * @param array $token Token array ('type', plus 'data' or 'name').
 * @return mixed Result of inBody() for the "anything else" case,
 *               otherwise null.
 */
private function afterBody($token) {
    $kind = $token['type'];

    /* Whitespace-only character token (TAB, LF, VT, FF, SPACE): handled
    exactly as it would be in "in body". */
    if($kind === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        $this->inBody($token);
        return;
    }

    /* Comment token: the comment node goes onto the first element in the
    stack of open elements (the html element). */
    if($kind === HTML5::COMMENT) {
        $remark = $this->dom->createComment($token['data']);
        $this->stack[0]->appendChild($remark);
        return;
    }

    /* </html>: switch to the trailing end phase. (The innerHTML case, in
    which the token would be ignored instead, is not handled here.) */
    if($kind === HTML5::ENDTAG && $token['name'] === 'html') {
        $this->phase = self::END_PHASE;
        return;
    }

    /* Anything else: parse error. Fall back to "in body" and reprocess the
    token there. */
    $this->mode = self::IN_BODY;
    return $this->inBody($token);
}
/**
 * Processes one token while the insertion mode is "in frameset".
 *
 * Fixes relative to the previous revision:
 *  - after </frameset> the mode was switched to "after frameset"
 *    unconditionally; it is now switched only when the new current node is
 *    no longer a frameset element, so nested framesets keep accepting
 *    <frame> and <frameset> children;
 *  - tag branches tested $token['name'] before $token['type']; the type is
 *    tested first, consistent with the other insertion-mode handlers, so
 *    tokens without a 'name' key are not indexed.
 *
 * @param array $token Token array ('type', plus 'data' or 'name'/'attr').
 * @return void
 */
private function inFrameset($token) {
    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* A start tag with the tag name "frameset" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'frameset') {
        $this->insertElement($token);

    /* An end tag with the tag name "frameset" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'frameset') {
        /* If the current node is the root html element, then this is a
        parse error; ignore the token. (innerHTML case) */
        if(end($this->stack)->nodeName === 'html') {
            // Ignore

        } else {
            /* Otherwise, pop the current node from the stack of open
            elements. */
            array_pop($this->stack);

            /* If the current node is no longer a frameset element, then
            change the insertion mode to "after frameset". */
            if(end($this->stack)->nodeName !== 'frameset') {
                $this->mode = self::AFTR_FRAME;
            }
        }

    /* A start tag with the tag name "frame" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'frame') {
        /* Insert an HTML element for the token. */
        $this->insertElement($token);

        /* Immediately pop the current node off the stack of open elements. */
        array_pop($this->stack);

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes one token while the insertion mode is "after frameset".
 *
 * The tag branches previously tested $token['name'] before $token['type'];
 * the type is now tested first, consistent with the other insertion-mode
 * handlers, so tokens without a 'name' key are not indexed.
 *
 * @param array $token Token array ('type', plus 'data' or 'name').
 * @return void
 */
private function afterFrameset($token) {
    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if($token['type'] === HTML5::CHARACTR &&
    preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) {
        /* Append the character to the current node. */
        $this->insertText($token['data']);

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $this->insertComment($token['data']);

    /* An end tag with the tag name "html" */
    } elseif($token['type'] === HTML5::ENDTAG &&
    $token['name'] === 'html') {
        /* Switch to the trailing end phase. */
        $this->phase = self::END_PHASE;

    /* A start tag with the tag name "noframes" */
    } elseif($token['type'] === HTML5::STARTTAG &&
    $token['name'] === 'noframes') {
        /* Process the token as if the insertion mode had been "in body". */
        $this->inBody($token);

    /* Anything else */
    } else {
        /* Parse error. Ignore the token. */
    }
}
/**
 * Processes one token during the trailing end phase, i.e. after the main
 * phase has finished.
 *
 * Fix relative to the previous revision: the branch meant for character
 * tokens that are NOT pure whitespace repeated the whitespace test without
 * negating it. It could therefore never match, and such tokens were
 * dropped instead of sending the parser back to the main phase.
 *
 * @param array $token Token array ('type', plus 'data' for comment and
 *                     character tokens).
 * @return mixed Result of mainPhase() when the parser switches back to the
 *               main phase, otherwise null.
 */
private function trailingEndPhase($token) {
    /* True only for character tokens consisting solely of TAB, LF, VT, FF
    or SPACE. */
    $is_whitespace = $token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']);

    /* A DOCTYPE token */
    if($token['type'] === HTML5::DOCTYPE) {
        // Parse error. Ignore the token.

    /* A comment token */
    } elseif($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the Document object with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        $this->dom->appendChild($comment);

    /* A whitespace-only character token */
    } elseif($is_whitespace) {
        /* Process the token as it would be processed in the main phase. */
        $this->mainPhase($token);

    /* A character token that is not whitespace-only. Or a start tag token.
    Or an end tag token. */
    } elseif($token['type'] === HTML5::CHARACTR ||
    $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG) {
        /* Parse error. Switch back to the main phase and reprocess the
        token. */
        $this->phase = self::MAIN_PHASE;
        return $this->mainPhase($token);

    /* An end-of-file token */
    } elseif($token['type'] === HTML5::EOF) {
        /* Nothing left to do. */
    }
}
/**
 * Creates an element for a start-tag token, attaches it to the tree via
 * appendToRealParent() and pushes it onto the stack of open elements.
 *
 * @param array $token  Start-tag token; 'name' and 'attr' are read. Each
 *                      entry of 'attr' is an array with 'name' and 'value'.
 * @param bool  $append Not used by this method body.
 * @return DOMElement The newly created element.
 */
private function insertElement($token, $append = true) {
    $element = $this->dom->createElement($token['name']);

    /* The first occurrence of an attribute name wins; later duplicates
    are skipped. */
    foreach($token['attr'] as $attribute) {
        if($element->hasAttribute($attribute['name'])) {
            continue;
        }

        $element->setAttribute($attribute['name'], $attribute['value']);
    }

    $this->appendToRealParent($element);
    $this->stack[] = $element;

    return $element;
}
/**
 * Wraps the given string in a text node and attaches it to the tree via
 * appendToRealParent().
 *
 * @param string $data Character data for the new text node.
 * @return void
 */
private function insertText($data) {
    $this->appendToRealParent($this->dom->createTextNode($data));
}
/**
 * Wraps the given string in a comment node and attaches it to the tree via
 * appendToRealParent().
 *
 * @param string $data Content of the new comment node.
 * @return void
 */
private function insertComment($data) {
    $this->appendToRealParent($this->dom->createComment($data));
}
/**
 * Attaches a node either to the current node or, when a foster parent is
 * pending, to that foster parent. The foster parent is cleared after use.
 *
 * @param DOMNode $node Node to attach.
 * @return void
 */
private function appendToRealParent($node) {
    /* Ordinary case: no foster parent pending, so the node becomes the
    last child of the current node. */
    if($this->foster_parent === null) {
        end($this->stack)->appendChild($node);
        return;
    }

    /* Look for the last table element in the stack of open elements that
    has a parent node. */
    $last_table = null;

    for($i = count($this->stack) - 1; $i >= 0; $i--) {
        $candidate = $this->stack[$i];

        if($candidate->nodeName === 'table' &&
        $candidate->parentNode !== null) {
            $last_table = $candidate;
            break;
        }
    }

    /* If the foster parent is that table's parent, the node goes
    immediately before the table; otherwise it is appended to the foster
    parent. */
    if($last_table !== null &&
    $this->foster_parent->isSameNode($last_table->parentNode)) {
        $this->foster_parent->insertBefore($node, $last_table);
    } else {
        $this->foster_parent->appendChild($node);
    }

    $this->foster_parent = null;
}
/**
 * Tells whether the stack of open elements has an element in scope
 * ($table === false) or in table scope ($table === true).
 *
 * Fixes relative to the previous revision:
 *  - the barrier formed by caption, td, th, button, marquee and object
 *    belongs to the plain "in scope" variant only, but was applied when
 *    $table was true, i.e. to the table-scope variant;
 *  - when the loop finished without a decision the function fell off the
 *    end and returned null; it now returns false explicitly.
 *
 * @param string|array $el    Tag name to look for, or a list of tag names
 *                            of which any one may match.
 * @param bool         $table True selects the "in table scope" variant.
 * @return bool True when a matching element is in scope.
 */
private function elementInScope($el, $table = false) {
    if(is_array($el)) {
        foreach($el as $element) {
            if($this->elementInScope($element, $table)) {
                return true;
            }
        }

        return false;
    }

    $leng = count($this->stack);

    for($n = 0; $n < $leng; $n++) {
        /* 1. Initialise node to be the current node (the bottommost node of
        the stack), then walk towards the top on each iteration. */
        $node = $this->stack[$leng - 1 - $n];

        if($node->tagName === $el) {
            /* 2. If node is the target node, terminate in a match state. */
            return true;

        } elseif($node->tagName === 'table') {
            /* 3. Otherwise, if node is a table element, terminate in a failure
            state. */
            return false;

        } elseif($table === false && in_array($node->tagName, array('caption', 'td',
        'th', 'button', 'marquee', 'object'))) {
            /* 4. Otherwise, if the algorithm is the "has an element in scope"
            variant (rather than the "has an element in table scope" variant),
            and node is one of the following, terminate in a failure state. */
            return false;

        } elseif($node === $node->ownerDocument->documentElement) {
            /* 5. Otherwise, if node is an html element (root element), terminate
            in a failure state. */
            return false;
        }

        /* Otherwise, set node to the previous entry in the stack of open
        elements and return to step 2. */
    }

    /* Stack exhausted (or empty) without finding the element. */
    return false;
}
/**
 * Re-opens the formatting elements that are in the list of active
 * formatting elements but no longer in the stack of open elements, by
 * cloning them onto the current node.
 *
 * Fix relative to the previous revision: when the backwards walk stopped
 * at a marker or at an element still on the stack (step 6), step 7 was
 * skipped because $step_seven was only ever set on the other exit path.
 * The marker or open element itself was then cloned (a fatal error for a
 * marker). Step 7 is now skipped only when the walk reached the start of
 * the list (the "jump to step 8" case).
 *
 * @return false|null False when there was nothing to reconstruct, null
 *                    after elements have been reconstructed.
 */
private function reconstructActiveFormattingElements() {
    /* 1. If there are no entries in the list of active formatting elements,
    then there is nothing to reconstruct; stop this algorithm. */
    $formatting_elements = count($this->a_formatting);

    if($formatting_elements === 0) {
        return false;
    }

    /* 3. Let entry be the last (most recently added) element in the list
    of active formatting elements. */
    $entry = end($this->a_formatting);

    /* 2. If the last (most recently added) entry in the list of active
    formatting elements is a marker, or if it is an element that is in the
    stack of open elements, then there is nothing to reconstruct; stop this
    algorithm. */
    if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
        return false;
    }

    /* Step 7 runs first unless step 4 jumps directly to step 8. */
    $step_seven = true;

    for($a = $formatting_elements - 1; $a >= 0; true) {
        /* 4. If there are no entries before entry in the list of active
        formatting elements, then jump to step 8. */
        if($a === 0) {
            $step_seven = false;
            break;
        }

        /* 5. Let entry be the entry one earlier than entry in the list of
        active formatting elements. */
        $a--;
        $entry = $this->a_formatting[$a];

        /* 6. If entry is neither a marker nor an element that is also in
        the stack of open elements, go to step 4. Otherwise continue with
        step 7. */
        if($entry === self::MARKER || in_array($entry, $this->stack, true)) {
            break;
        }
    }

    while(true) {
        /* 7. Let entry be the element one later than entry in the list of
        active formatting elements. */
        if($step_seven === true) {
            $a++;
            $entry = $this->a_formatting[$a];
        }

        /* 8. Perform a shallow clone of the element entry to obtain clone. */
        $clone = $entry->cloneNode();

        /* 9. Append clone to the current node and push it onto the stack
        of open elements so that it is the new current node. */
        end($this->stack)->appendChild($clone);
        $this->stack[] = $clone;

        /* 10. Replace the entry for entry in the list with an entry for
        clone. */
        $this->a_formatting[$a] = $clone;

        /* 11. If the entry for clone in the list of active formatting
        elements is not the last entry in the list, return to step 7. */
        if(end($this->a_formatting) !== $clone) {
            $step_seven = true;
        } else {
            break;
        }
    }
}
/**
 * Removes entries from the end of the list of active formatting elements
 * up to and including the most recent marker.
 *
 * The previous revision looped with while(true) and exited only on a
 * marker, so a list without any marker was emptied and the loop then never
 * ended. The loop now also stops once the list is empty.
 *
 * @return void
 */
private function clearTheActiveFormattingElementsUpToTheLastMarker() {
    while(!empty($this->a_formatting)) {
        /* 1./2. Take the last (most recently added) entry and remove it
        from the list of active formatting elements. */
        $entry = array_pop($this->a_formatting);

        /* 3. If entry was a marker, then stop the algorithm at this point.
        The list has been cleared up to the last marker. */
        if($entry === self::MARKER) {
            break;
        }
    }
}
/**
 * Pops the current node off the stack of open elements for as long as it
 * is a dd, dt, li, p, td, th or tr element that is not listed in $exclude.
 *
 * Changes relative to the previous revision: an assigned but never read
 * local was removed, and the loop stops on an empty stack instead of
 * reading a property of a non-object.
 *
 * @param array $exclude Tag names that must not be closed implicitly.
 * @return void
 */
private function generateImpliedEndTags($exclude = array()) {
    $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);

    while(!empty($this->stack) &&
    in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
/**
 * Classifies a node by its tag name, checking the special, scoping and
 * formatting lists in that order.
 *
 * @param DOMElement $node Element whose tagName is looked up.
 * @return mixed self::SPECIAL, self::SCOPING, self::FORMATTING, or
 *               self::PHRASING when the tag name is in none of the lists.
 */
private function getElementCategory($node) {
    $tag = $node->tagName;

    if(in_array($tag, $this->special)) {
        return self::SPECIAL;
    }

    if(in_array($tag, $this->scoping)) {
        return self::SCOPING;
    }

    if(in_array($tag, $this->formatting)) {
        return self::FORMATTING;
    }

    return self::PHRASING;
}
/**
 * Pops elements off the stack of open elements until the current node's
 * name is one of the given names.
 *
 * @param array $elements Node names at which popping stops; the callers
 *                        visible in this file always include 'html'.
 * @return void
 */
private function clearStackToTableContext($elements) {
    /* Any element popped here represents a parse error, which is not
    reported. */
    while(!in_array(end($this->stack)->nodeName, $elements)) {
        array_pop($this->stack);
    }
}
/**
 * Re-derives the insertion mode ($this->mode) from the stack of open
 * elements.
 *
 * The stack is walked from its last node towards its first. The first
 * node whose name has a defined mapping decides the mode and ends the
 * walk:
 *
 *   select              -> IN_SELECT
 *   td, th              -> IN_CELL
 *   tr                  -> IN_ROW
 *   tbody, thead, tfoot -> IN_TBODY
 *   caption             -> IN_CAPTION
 *   colgroup            -> IN_CGROUP
 *   table               -> IN_TABLE
 *   head, body          -> IN_BODY ("in body" even for head)
 *   frameset            -> IN_FRAME
 *   html                -> BEFOR_HEAD when $this->head_pointer is null,
 *                          AFTER_HEAD otherwise
 *
 * If the first node of the stack is reached without any of the names
 * above matching, the mode falls back to IN_BODY. With an empty stack
 * the mode is left untouched.
 *
 * @return void
 */
private function resetInsertionMode() {
    /* Node names that translate directly into an insertion mode. */
    $modeFor = array(
        'select'   => self::IN_SELECT,
        'td'       => self::IN_CELL,
        'th'       => self::IN_CELL,
        'tr'       => self::IN_ROW,
        'tbody'    => self::IN_TBODY,
        'thead'    => self::IN_TBODY,
        'tfoot'    => self::IN_TBODY,
        'caption'  => self::IN_CAPTION,
        'colgroup' => self::IN_CGROUP,
        'table'    => self::IN_TABLE,
        'head'     => self::IN_BODY,
        'body'     => self::IN_BODY,
        'frameset' => self::IN_FRAME,
    );

    /* Becomes true once the walk has arrived at the first node in the
    stack of open elements. */
    $reachedFirst = false;
    $index = count($this->stack);

    while(--$index >= 0) {
        $current = $this->stack[$index];

        if($this->stack[0]->isSameNode($current)) {
            $reachedFirst = true;
        }

        $name = $current->nodeName;

        /* Direct name-to-mode matches: switch mode and stop. */
        if(array_key_exists($name, $modeFor)) {
            $this->mode = $modeFor[$name];
            return;
        }

        /* The html element depends on whether a head element has been
        recorded yet. */
        if($name === 'html') {
            $this->mode = ($this->head_pointer === null)
                ? self::BEFOR_HEAD
                : self::AFTER_HEAD;
            return;
        }

        /* Nothing matched and there is no further node to inspect:
        fall back to "in body". */
        if($reachedFirst) {
            $this->mode = self::IN_BODY;
            return;
        }
    }
}
/**
 * Closes the currently open table cell, if there is one.
 *
 * Checks for a td element in table scope first, then for a th element.
 * For the first one found, an end tag token carrying that name is passed
 * to inCell(). When neither is in table scope nothing is done.
 *
 * @return void
 */
private function closeCell() {
    /* td takes precedence; th is only looked up when no td is in table
    scope. */
    if($this->elementInScope('td', true)) {
        $cellName = 'td';
    } elseif($this->elementInScope('th', true)) {
        $cellName = 'th';
    } else {
        return;
    }

    /* Act as if an end tag token with that tag name had been seen. */
    $endTag = array(
        'name' => $cellName,
        'type' => HTML5::ENDTAG
    );

    $this->inCell($endTag);
}
3882 public function save() {
3883 return $this->dom;