Fixed tiny issue with Tokenizer
[haanga.git] / lib / Haanga / Compiler / Tokenizer.php
blobbfa50b01c8b6d20edfded53d87d0f849ceea94c4
1 <?php
2 /*
3 +---------------------------------------------------------------------------------+
4 | Copyright (c) 2010 César Rodas and Menéame Comunicacions S.L. |
5 +---------------------------------------------------------------------------------+
6 | Redistribution and use in source and binary forms, with or without |
7 | modification, are permitted provided that the following conditions are met: |
8 | 1. Redistributions of source code must retain the above copyright |
9 | notice, this list of conditions and the following disclaimer. |
10 | |
11 | 2. Redistributions in binary form must reproduce the above copyright |
12 | notice, this list of conditions and the following disclaimer in the |
13 | documentation and/or other materials provided with the distribution. |
14 | |
15 | 3. All advertising materials mentioning features or use of this software |
16 | must display the following acknowledgement: |
17 | This product includes software developed by César D. Rodas. |
18 | |
19 | 4. Neither the name of the César D. Rodas nor the |
20 | names of its contributors may be used to endorse or promote products |
21 | derived from this software without specific prior written permission. |
22 | |
23 | THIS SOFTWARE IS PROVIDED BY CÉSAR D. RODAS ''AS IS'' AND ANY |
24 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
25 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
26 | DISCLAIMED. IN NO EVENT SHALL CÉSAR D. RODAS BE LIABLE FOR ANY |
27 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
28 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
29 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
30 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
31 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
32 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE |
33 +---------------------------------------------------------------------------------+
34 | Authors: César Rodas <crodas@php.net> |
35 +---------------------------------------------------------------------------------+
/**
 * Short alias of the parser class.
 *
 * Adds no behaviour of its own; it exists so that the parser's token
 * constants can be referenced through a shorter name (HG_Parser::T_*).
 */
class HG_Parser extends Haanga_Compiler_Parser
{
    /* intentionally empty: only used to make references to constants easier */
}
44 /**
45 * Hand-written Tokenizer class inspired by SQLite's tokenize.c
48 class Haanga_Compiler_Tokenizer
50 /* they are case sensitive and sorted! */
51 static $keywords = array(
52 'AND' => HG_Parser::T_AND,
53 'FALSE' => HG_Parser::T_FALSE,
54 'NOT' => HG_Parser::T_NOT,
55 'OR' => HG_Parser::T_OR,
56 'TRUE' => HG_Parser::T_TRUE,
57 '_(' => HG_Parser::T_INTL,
58 'as' => HG_Parser::T_AS,
59 'autoescape' => HG_Parser::T_AUTOESCAPE,
60 'block' => HG_Parser::T_BLOCK,
61 'by' => HG_Parser::T_BY,
62 'else' => HG_Parser::T_ELSE,
63 'empty' => HG_Parser::T_EMPTY,
64 'extends' => HG_Parser::T_EXTENDS,
65 'filter' => HG_Parser::T_FILTER,
66 'for' => HG_Parser::T_FOR,
67 'if' => HG_Parser::T_IF,
68 'ifchanged' => HG_Parser::T_IFCHANGED,
69 'ifequal' => HG_Parser::T_IFEQUAL,
70 'ifnotequal' => HG_Parser::T_IFNOTEQUAL,
71 'in' => HG_Parser::T_IN,
72 'include' => HG_Parser::T_INCLUDE,
73 'load' => HG_Parser::T_LOAD,
74 'not' => HG_Parser::T_NOT,
75 'regroup' => HG_Parser::T_REGROUP,
76 'set' => HG_Parser::T_SET,
77 'spacefull' => HG_Parser::T_SPACEFULL,
78 'step' => HG_Parser::T_STEP,
79 'with' => HG_Parser::T_WITH,
82 /* common operations */
83 static $operators_single = array(
84 '!' => HG_Parser::T_NOT,
85 '%' => HG_Parser::T_MOD,
86 '&' => HG_Parser::T_BITWISE,
87 '(' => HG_Parser::T_LPARENT,
88 ')' => HG_Parser::T_RPARENT,
89 '*' => HG_Parser::T_TIMES,
90 '+' => HG_Parser::T_PLUS,
91 ',' => HG_Parser::T_COMMA,
92 '-' => HG_Parser::T_MINUS,
93 '.' => HG_Parser::T_DOT,
94 '/' => HG_Parser::T_DIV,
95 ':' => HG_Parser::T_COLON,
96 '<' => HG_Parser::T_LT,
97 '=' => HG_Parser::T_ASSIGN,
98 '>' => HG_Parser::T_GT,
99 '[' => HG_Parser::T_BRACKETS_OPEN,
100 ']' => HG_Parser::T_BRACKETS_CLOSE,
101 '|' => HG_Parser::T_PIPE,
103 static $operators = array(
104 '!==' => HG_Parser::T_NE,
105 '!=' => HG_Parser::T_NE,
106 '&&' => HG_Parser::T_AND,
107 '->' => HG_Parser::T_OBJ,
108 '..' => HG_Parser::T_DOTDOT,
109 '<<' => HG_Parser::T_BITWISE,
110 '<=' => HG_Parser::T_LE,
111 '===' => HG_Parser::T_EQ,
112 '==' => HG_Parser::T_EQ,
113 '>=' => HG_Parser::T_GE,
114 '>>' => HG_Parser::T_BITWISE,
115 '||' => HG_Parser::T_OR,
118 static $close_tags = array();
120 static $open_tag = "{%";
121 static $end_tag = "%}";
122 static $open_comment = "{#";
123 static $end_comment = "#}";
124 static $open_print = "{{";
125 static $end_print = "}}";
/* Lexer state: opening-tag table, plus the value, token id and mode
 * (one of the IN_* constants) produced by the last yylex() call. */
public $open_tags;
public $value;
public $token;
public $status = self::IN_NONE;

/* Assigned by the constructor. They are declared here explicitly because
 * creating properties dynamically is deprecated since PHP 8.2; they stay
 * public, which is the visibility dynamic properties had. */
public $data;     /* template source being tokenized          */
public $compiler; /* compiler object handed to the constructor */
public $line;     /* current line number, starts at 1          */
public $N;        /* byte offset of the cursor inside $data    */
public $file;     /* file name used in error messages          */
public $length;   /* strlen($data)                             */
132 const IN_NONE = 0;
133 const IN_HTML = 1;
134 const IN_TAG = 2;
135 const IN_ECHO = 3;
/**
 * Prepare a tokenizer for one template.
 *
 * @param string $data     Template source to tokenize
 * @param mixed  $compiler Compiler object, stored as-is in $this->compiler
 * @param string $file     File name, only used when building error messages
 */
function __construct($data, $compiler, $file)
{
    /* input buffer plus cursor/line bookkeeping */
    $this->data     = $data;
    $this->compiler = $compiler;
    $this->line     = 1;
    $this->N        = 0;
    $this->file     = $file;
    $this->length   = strlen($data);

    /* The delimiter tables are rebuilt on every construction from the
     * static delimiter strings, so changes to those are picked up. */
    self::$close_tags = array(
        self::$end_tag   => HG_Parser::T_TAG_CLOSE,
        self::$end_print => HG_Parser::T_PRINT_CLOSE,
    );

    $this->open_tags = array(
        self::$open_tag     => HG_Parser::T_TAG_OPEN,
        self::$open_print   => HG_Parser::T_PRINT_OPEN,
        self::$open_comment => HG_Parser::T_COMMENT,
    );
}
/**
 * Produce the next token.
 *
 * On success $this->token and $this->value describe the token and TRUE is
 * returned. FALSE is returned once the input is exhausted.
 *
 * @return bool
 *
 * @throws Haanga_Compiler_Exception when the input ends inside a tag,
 *         a print block or a comment
 */
function yylex()
{
    $this->token = NULL;

    if ($this->length == $this->N) {
        if ($this->status != self::IN_NONE && $this->status != self::IN_HTML) {
            $this->Error("Unexpected end");
        }
        return FALSE;
    }

    if ($this->status == self::IN_NONE) {
        $i = &$this->N;
        /* only a short window is needed to recognise a delimiter */
        $data = substr($this->data, $i, 12);

        foreach ($this->open_tags as $value => $token) {
            $len = strlen($value);
            if (strncmp($data, $value, $len) != 0) {
                continue;
            }

            $this->value = $value;
            $this->token = $token;
            $i += $len;

            switch ($this->token) {
            case HG_Parser::T_TAG_OPEN:
                $this->status = self::IN_TAG;
                break;
            case HG_Parser::T_COMMENT:
                $pos = strpos($this->data, self::$end_comment, $i);
                if ($pos === FALSE) {
                    $this->Error("unexpected end");
                }

                /* substr() takes a length, not an end offset */
                $this->value = substr($this->data, $i, $pos - $i);
                /* keep line numbers right for errors after the comment */
                $this->line += substr_count($this->value, "\n");
                $this->status = self::IN_NONE;
                /* skip the closing delimiter, whatever its length is */
                $i = $pos + strlen(self::$end_comment);
                break;
            case HG_Parser::T_PRINT_OPEN:
                $this->status = self::IN_ECHO;
                break;
            }

            return TRUE;
        }

        /* no delimiter at the cursor: plain text follows */
        $this->status = self::IN_HTML;
    }

    switch ($this->status) {
    case self::IN_TAG:
    case self::IN_ECHO:
        $this->yylex_main();
        break;
    default:
        $this->yylex_html();
    }

    if (empty($this->token)) {
        if ($this->status != self::IN_NONE && $this->status != self::IN_HTML) {
            $this->Error("Unexpected end");
        }
        return FALSE;
    }

    return TRUE;
}
/**
 * Emit everything from the cursor up to the nearest opening delimiter
 * (or up to the end of the input when there is none) as one T_HTML token,
 * and add the newlines it contains to the line counter.
 */
function yylex_html()
{
    $offset  = &$this->N;
    $nearest = NULL;

    /* find the closest occurrence of any opening delimiter */
    foreach (array_keys($this->open_tags) as $marker) {
        $found = strpos($this->data, $marker, $offset);
        if ($found !== FALSE && ($nearest === NULL || $found < $nearest)) {
            $nearest = $found;
        }
    }

    $this->token = HG_Parser::T_HTML;

    if ($nearest === NULL) {
        /* no more template markup: the rest of the input is text */
        $this->value = substr($this->data, $offset);
        $offset      = $this->length;
    } else {
        $this->value  = substr($this->data, $offset, $nearest - $offset);
        $this->status = self::IN_NONE;
        $offset       = $nearest;
    }

    $this->line += substr_count($this->value, "\n");
}
/**
 * Tokenize the inside of a tag or print block.
 *
 * Scans from the cursor until one token has been produced: a quoted
 * string, a number, a concatenation dot (a dot preceded by whitespace),
 * or whatever getTag(), getOperator() and getAlpha() recognise. When the
 * token closes the tag or print block the status goes back to IN_NONE.
 *
 * Every read of the input buffer is bounds-checked so that truncated
 * templates end in a tokenizer error rather than in an out-of-range
 * string offset access.
 *
 * @throws Haanga_Compiler_Exception on unclosed strings, malformed
 *         numbers and unexpected characters
 */
function yylex_main()
{
    $data = &$this->data;

    for ($i=&$this->N; is_null($this->token) && $i < $this->length; ++$i) {
        switch ($data[$i]) {

        /* strings {{{ */
        case '"':
        case "'":
            $end   = $data[$i];
            $value = "";
            while (TRUE) {
                if (++$i >= $this->length) {
                    $this->Error("unclosed string");
                }
                $char = $data[$i];
                if ($char === $end) {
                    break; /* closing quote; the outer loop steps over it */
                }
                if ($char === "\\") {
                    /* escape sequence: needs one more byte */
                    if (++$i >= $this->length) {
                        $this->Error("unclosed string");
                    }
                    switch ($data[$i]) {
                    case "n":
                        $value .= "\n";
                        break;
                    case "t":
                        $value .= "\t";
                        break;
                    default:
                        $value .= $data[$i];
                    }
                    continue;
                }
                if ($char === "\n") {
                    $this->line++;
                }
                $value .= $char;
            }
            $this->value = $value;
            $this->token = HG_Parser::T_STRING;
            break;
        /* }}} */

        /* number {{{ */
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            $value = "";
            $dot   = FALSE;
            for ($e=0; $i < $this->length; ++$e, ++$i) {
                switch ($data[$i]) {
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                    $value .= $data[$i];
                    break;
                case '.':
                    if (!$dot) {
                        $value .= ".";
                        $dot    = TRUE;
                    } else {
                        $this->Error("Invalid number");
                    }
                    break;
                default:
                    break 2; /* leave the digit loop */
                }
            }

            /* byte following the number; empty when the input ends here */
            $next = $i < $this->length ? $data[$i] : '';
            if ((!$this->is_token_end($next) && !isset(self::$operators_single[$next]))
                || $value[$e-1] == '.') {
                $this->Error("Unexpected '{$next}'");
            }
            $this->value = $value;
            $this->token = HG_Parser::T_NUMERIC;
            break 2; /* cursor already sits on the next byte */
        /* }}} */

        case "\n": case " ": case "\t": case "\r": case "\f":
            for (; is_null($this->token) && $i < $this->length; ++$i) {
                switch ($data[$i]) {
                case "\n":
                    $this->line++;
                case " ": case "\t": case "\r": case "\f":
                    break;
                case '.':
                    /* a single dot after whitespace is concatenation */
                    if (!isset($data[$i+1]) || $data[$i+1] != '.') {
                        $this->token = HG_Parser::T_CONCAT;
                        $this->value = '.';
                        $i++;
                        return;
                    }
                    /* ".." falls through and is left for getOperator() */
                default:
                    /* step back: the outer loop advances the cursor again */
                    --$i;
                    break 2;
                }
            }
            break; /* whitespaces are ignored */

        default:
            if (!$this->getTag() && !$this->getOperator()) {
                $alpha = $this->getAlpha();
                if ($alpha === FALSE) {
                    $this->Error("error: unexpected ".substr($data, $i));
                }
                static $tag=NULL;
                if (!$tag) {
                    $tag = Haanga_Extension::getInstance('Tag');
                }
                /* a registered custom tag supplies its own token id */
                $value = $tag->isValid($alpha);
                $this->token = $value ? $value : HG_Parser::T_ALPHA;
                $this->value = $alpha;
            }
            break 2; /* the helpers moved the cursor themselves */
        }
    }

    if ($this->token == HG_Parser::T_TAG_CLOSE ||
        $this->token == HG_Parser::T_PRINT_CLOSE) {
        $this->status = self::IN_NONE;
    }
}
/**
 * Try to read a closing delimiter, a keyword or an "end..." custom tag
 * at the cursor.
 *
 * On success the token and value are set, the cursor is moved past the
 * match and TRUE is returned; otherwise nothing changes and FALSE is
 * returned.
 *
 * @return bool
 */
function getTag()
{
    $i    = &$this->N;
    $data = substr($this->data, $i, 12);

    foreach (self::$close_tags as $value => $token) {
        $len = strlen($value);
        if (strncmp($data, $value, $len) == 0) {
            $this->token = $token;
            $this->value = $value;
            $i += $len;
            return TRUE;
        }
    }

    foreach (self::$keywords as $value => $token) {
        $len = strlen($value);
        $cmp = strncmp($data, $value, $len);

        if ($cmp < 0) {
            /* The keyword table is sorted, so nothing further can match.
             * Test the sign only: strncmp() is not guaranteed to return
             * exactly -1 before PHP 8.2. */
            break;
        }
        if ($cmp > 0) {
            continue;
        }
        if (isset($data[$len]) && !$this->is_token_end($data[$len])) {
            /* probably a variable name TRUEfoo (and not TRUE) */
            continue;
        }

        $this->token = $token;
        $this->value = $value;
        $i += $len;
        return TRUE;
    }

    /* /end([a-zA-Z][a-zA-Z0-9]*)/ */
    if (strncmp($data, "end", 3) == 0) {
        $this->value = $this->getAlpha();
        $this->token = HG_Parser::T_CUSTOM_END;
        return TRUE;
    }

    return FALSE;
}
/**
 * Abort tokenizing with a message that carries the file name and the
 * current line number.
 *
 * @param string $text Description of the problem
 *
 * @throws Haanga_Compiler_Exception always
 */
function Error($text)
{
    $message = "{$text} in {$this->file}:{$this->line}";

    throw new Haanga_Compiler_Exception($message);
}
/**
 * Try to read an operator at the cursor.
 *
 * Multi-character operators are tried first, in the order they are
 * declared in self::$operators (longer spellings are listed before their
 * prefixes, e.g. '!==' before '!='), then single-character ones.
 * On success the token and value are set, the cursor is moved past the
 * operator and TRUE is returned; otherwise FALSE is returned.
 *
 * @return bool
 */
function getOperator()
{
    $i    = &$this->N;
    $data = substr($this->data, $i, 12);

    /* No early exit here: self::$operators is not strictly sorted, so
     * stopping on the first "smaller" comparison could skip a valid
     * operator. The table is tiny, a full scan costs nothing. */
    foreach (self::$operators as $value => $token) {
        $len = strlen($value);
        if (strncmp($data, $value, $len) == 0) {
            $this->token = $token;
            $this->value = $value;
            $i += $len;
            return TRUE;
        }
    }

    /* single-character operators: direct key lookup, independent of the
     * order of the table */
    $char = $this->data[$i];
    if (isset(self::$operators_single[$char])) {
        $this->token = self::$operators_single[$char];
        $this->value = $char;
        $i += 1;
        return TRUE;
    }

    return FALSE;
}
/**
 * Tell whether $letter terminates an identifier-like token, that is,
 * whether it is anything other than [a-zA-Z0-9_].
 *
 * This is what keeps an identifier such as "TRUEfoo" from being read as
 * the keyword TRUE followed by "foo".
 *
 * @param string $letter Single character to classify
 *
 * @return bool TRUE when $letter cannot be part of an identifier
 */
protected function is_token_end($letter)
{
    if ('a' <= $letter && $letter <= 'z') {
        return FALSE;
    }
    if ('A' <= $letter && $letter <= 'Z') {
        return FALSE;
    }
    if ('0' <= $letter && $letter <= '9') {
        return FALSE;
    }

    return $letter != "_";
}
/**
 * Read an identifier, [a-zA-Z_][a-zA-Z0-9_]*, at the cursor.
 *
 * @return string|bool The identifier, with the cursor moved past it, or
 *                     FALSE (cursor untouched) when the byte at the
 *                     cursor cannot start an identifier
 */
function getAlpha()
{
    $i    = &$this->N;
    $data = &$this->data;

    $first     = $data[$i];
    $is_letter = ('a' <= $first && $first <= 'z')
              || ('A' <= $first && $first <= 'Z');
    if (!$is_letter && $first != '_') {
        return FALSE;
    }

    /* length of the run of identifier bytes, limited to the known input */
    $run = strspn(
        $data,
        'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_',
        $i,
        $this->length - $i
    );

    $value = substr($data, $i, $run);
    $i    += $run;

    return $value;
}
/**
 * Line number the tokenizer has reached so far (counting starts at 1).
 *
 * @return int
 */
function getLine()
{
    $current = $this->line;

    return $current;
}
/**
 * Tokenize and parse a template in one go.
 *
 * Feeds every token produced by the tokenizer to a fresh parser and
 * returns what the parser collected in its $body property.
 *
 * @param string $template Template source
 * @param mixed  $compiler Compiler object, handed to tokenizer and parser
 * @param string $file     File name used in error messages
 *
 * @return array Parser body cast to array
 *
 * @throws Exception whatever the tokenizer or the parser raised first
 */
static function init($template, $compiler, $file='')
{
    $lexer  = new Haanga_Compiler_Tokenizer($template, $compiler, $file);
    $parser = new Haanga_Compiler_Parser($lexer, $file);

    $parser->compiler = $compiler;

    try {
        while ($lexer->yylex()) {
            $parser->doParse($lexer->token, $lexer->value);
        }
    } catch (Exception $e) {
        /* destroy the parser */
        try {
            $parser->doParse(0, 0);
        } catch (Exception $ignored) {
            /* Cleanup is best effort. The exception gets its own variable
             * so that it cannot replace the original error re-thrown
             * below. */
        }
        throw $e; /* re-throw the original exception */
    }

    $parser->doParse(0, 0);

    return (array)$parser->body;
}