3 * base include file for SimpleTest
5 * @subpackage MockObjects
6 * @version $Id: parser.php,v 1.72 2006/01/04 03:15:05 lastcraft Exp $
10 * Lexer mode stack constants
// Event codes handed to the parser callbacks as their second argument.
// Each one is only defined if the including script has not already done so.

// A pattern matched and a new mode was pushed onto the stack.
if (! defined('LEXER_ENTER')) {
    define('LEXER_ENTER', 1);
}
// A pattern matched without changing the mode.
if (! defined('LEXER_MATCHED')) {
    define('LEXER_MATCHED', 2);
}
// Text between matches that no pattern recognised.
if (! defined('LEXER_UNMATCHED')) {
    define('LEXER_UNMATCHED', 3);
}
// A pattern matched and the current mode is about to be popped.
if (! defined('LEXER_EXIT')) {
    define('LEXER_EXIT', 4);
}
// A pattern matched that is handled in a one-token special mode.
if (! defined('LEXER_SPECIAL')) {
    define('LEXER_SPECIAL', 5);
}
30 * Compounded regular expression. Any of
31 * the contained patterns could match and
32 * when one does, its label is returned.
34 * @subpackage WebTester
43 * Constructor. Starts with no patterns.
44 * @param boolean $case True for case sensitive, false
/**
 *    Constructor. Starts with no patterns.
 *    @param boolean $case    True for case sensitive, false
 *                            for insensitive.
 *    @access public
 */
function __construct($case) {
    // Read back by _getPerlMatchingFlags() to choose the regex flags.
    $this->_case = $case;
    $this->_patterns = array();
    $this->_labels = array();
    // Null means "not built yet"; see _getCompoundedRegex().
    $this->_regex = null;
}

/**
 *    Legacy PHP 4 style constructor, kept so that existing
 *    code calling it by name keeps working. PHP 8 no longer
 *    runs a method named after the class on "new", hence
 *    the real work lives in __construct().
 *    @param boolean $case    True for case sensitive.
 *    @access public
 */
function ParallelRegex($case) {
    // Class-qualified so a subclass overriding __construct() cannot recurse.
    ParallelRegex::__construct($case);
}
56 * Adds a pattern with an optional label.
57 * @param string $pattern Perl style regex, but ( and )
58 * lose the usual meaning.
59 * @param string $label Label of regex to be returned
/**
 *    Adds a pattern with an optional label. The pattern and
 *    its label are stored at the same index so that a
 *    matching group can be mapped back to its label.
 *    @param string $pattern      Perl style regex, but ( and )
 *                                lose the usual meaning.
 *    @param string $label        Label of regex to be returned
 *                                on a match. Defaults to true.
 *    @access public
 */
function addPattern($pattern, $label = true) {
    $next = count($this->_patterns);
    $this->_patterns[$next] = $pattern;
    $this->_labels[$next] = $label;
}
71 * Attempts to match all patterns at once against
73 * @param string $subject String to match against.
74 * @param string $match First matched portion of
76 * @return boolean True on success.
/**
 *    Attempts to match all patterns at once against
 *    a string.
 *    @param string $subject      String to match against.
 *    @param string $match        First matched portion of
 *                                subject, filled in by reference.
 *    @return boolean/string      False if nothing matched, otherwise
 *                                the label of the pattern that
 *                                matched (true for unlabelled ones).
 *    @access public
 */
function match($subject, &$match) {
    if (count($this->_patterns) == 0) {
        return false;
    }
    if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
        // NOTE(review): the failure branch was not visible in the damaged
        // source; clearing $match is reconstructed - confirm against upstream.
        $match = "";
        return false;
    }
    $match = $matches[0];
    // Group N of the compound regex belongs to pattern N - 1.
    for ($i = 1, $groups = count($matches); $i < $groups; $i++) {
        // Compare against '' rather than testing truthiness, otherwise
        // a matched token of "0" would be skipped and lose its label.
        if ($matches[$i] !== '') {
            return $this->_labels[$i - 1];
        }
    }
    return true;
}
97 * Compounds the patterns into a single
98 * regular expression separated with the
99 * "or" operator. Caches the regex.
100 * Will automatically escape (, ) and / tokens.
101 * @return string Compound regex built from the patterns in order.
/**
 *    Compounds the patterns into a single
 *    regular expression separated with the
 *    "or" operator. Caches the regex.
 *    Will automatically escape (, ) and / tokens.
 *    @return string       Compound regex including delimiters
 *                         and flags.
 *    @access private
 */
function _getCompoundedRegex() {
    if ($this->_regex == null) {
        // Build into a local list so the stored patterns stay exactly
        // as they were supplied and are never escaped twice.
        $groups = array();
        foreach ($this->_patterns as $pattern) {
            // Each pattern becomes one capturing group; which group
            // captured tells match() which label to return.
            $groups[] = '(' . str_replace(
                    array('/', '(', ')'),
                    array('\/', '\(', '\)'),
                    $pattern) . ')';
        }
        $this->_regex = "/" . implode("|", $groups) . "/" . $this->_getPerlMatchingFlags();
    }
    return $this->_regex;
}
118 * Accessor for perl regex mode flags to use.
119 * @return string Perl regex flags.
/**
 *    Accessor for perl regex mode flags to use. The "i"
 *    flag is appended when matching is case insensitive.
 *    @return string       Perl regex flags.
 *    @access private
 */
function _getPerlMatchingFlags() {
    if ($this->_case) {
        return "msS";
    }
    return "msSi";
}
128 * States for a stack machine.
129 * @package SimpleTest
130 * @subpackage WebTester
class SimpleStateStack {
    // List of state names; the last entry is the current state.
    var $_stack;

    /**
     *    Constructor. Starts in named state.
     *    @param string $start        Starting state name.
     *    @access public
     */
    function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     *    Legacy PHP 4 style constructor, kept for callers that
     *    invoke it by name. PHP 8 does not run it on "new".
     *    @param string $start        Starting state name.
     *    @access public
     */
    function SimpleStateStack($start) {
        // Class-qualified so a subclass overriding __construct() cannot recurse.
        SimpleStateStack::__construct($start);
    }

    /**
     *    Accessor for current state.
     *    @return string       State.
     *    @access public
     */
    function getCurrent() {
        return $this->_stack[count($this->_stack) - 1];
    }

    /**
     *    Adds a state to the stack and sets it
     *    to be the current state.
     *    @param string $state        New state.
     *    @access public
     */
    function enter($state) {
        array_push($this->_stack, $state);
    }

    /**
     *    Leaves the current state and reverts
     *    to the previous one.
     *    @return boolean    False if we drop off
     *                       the bottom of the list.
     *    @access public
     */
    function leave() {
        // The starting state is never popped.
        if (count($this->_stack) == 1) {
            return false;
        }
        array_pop($this->_stack);
        return true;
    }
}
180 * Accepts text and breaks it into tokens.
181 * Some optimisation to make sure the
182 * content is only scanned by the PHP regex
183 * parser once. Lexer modes must not start
184 * with leading underscores.
185 * @package SimpleTest
186 * @subpackage WebTester
196 * Sets up the lexer in case insensitive matching
198 * @param SimpleSaxParser $parser Handling strategy by
200 * @param string $start Starting handler.
201 * @param boolean $case True for case sensitive.
/**
 *    Sets up the lexer in case insensitive matching
 *    by default.
 *    @param SimpleSaxParser $parser  Handling strategy by
 *                                    reference.
 *    @param string $start            Starting handler.
 *    @param boolean $case            True for case sensitive.
 *    @access public
 */
function __construct(&$parser, $start = "accept", $case = false) {
    $this->_case = $case;
    $this->_regexes = array();
    $this->_parser = &$parser;
    // Plain "new": assigning "new" by reference is a parse error
    // from PHP 7.0 onwards.
    $this->_mode = new SimpleStateStack($start);
    // Every mode is handled by the parser method of the same name
    // until remapped with mapHandler().
    $this->_mode_handlers = array($start => $start);
}

/**
 *    Legacy PHP 4 style constructor. Subclasses call this by
 *    name, so it is kept and forwards to __construct().
 *    @param SimpleSaxParser $parser  Handling strategy by
 *                                    reference.
 *    @param string $start            Starting handler.
 *    @param boolean $case            True for case sensitive.
 *    @access public
 */
function SimpleLexer(&$parser, $start = "accept", $case = false) {
    // Class-qualified so that a subclass defining its own
    // __construct() and calling this method cannot recurse.
    SimpleLexer::__construct($parser, $start, $case);
}
213 * Adds a token search pattern for a particular
214 * parsing mode. The pattern does not change the
216 * @param string $pattern Perl style regex, but ( and )
217 * lose the usual meaning.
218 * @param string $mode Should only apply this
219 * pattern when dealing with
220 * this type of input.
/**
 *    Adds a token search pattern for a particular
 *    parsing mode. The pattern does not change the
 *    current mode.
 *    @param string $pattern      Perl style regex, but ( and )
 *                                lose the usual meaning.
 *    @param string $mode         Should only apply this
 *                                pattern when dealing with
 *                                this type of input.
 *    @access public
 */
function addPattern($pattern, $mode = "accept") {
    // One compound regex per mode, created on first use.
    if (! isset($this->_regexes[$mode])) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    $this->_regexes[$mode]->addPattern($pattern);
    // Default handler is the parser method named after the mode.
    if (! isset($this->_mode_handlers[$mode])) {
        $this->_mode_handlers[$mode] = $mode;
    }
}
234 * Adds a pattern that will enter a new parsing
235 * mode. Useful for entering parenthesis, strings,
237 * @param string $pattern Perl style regex, but ( and )
238 * lose the usual meaning.
239 * @param string $mode Should only apply this
240 * pattern when dealing with
241 * this type of input.
242 * @param string $new_mode Change parsing to this new
/**
 *    Adds a pattern that will enter a new parsing
 *    mode. Useful for entering parenthesis, strings,
 *    tags, etc.
 *    @param string $pattern      Perl style regex, but ( and )
 *                                lose the usual meaning.
 *    @param string $mode         Should only apply this
 *                                pattern when dealing with
 *                                this type of input.
 *    @param string $new_mode     Change parsing to this new
 *                                nested mode.
 *    @access public
 */
function addEntryPattern($pattern, $mode, $new_mode) {
    if (! isset($this->_regexes[$mode])) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    // The label carries the mode to switch to when the pattern matches.
    $this->_regexes[$mode]->addPattern($pattern, $new_mode);
    if (! isset($this->_mode_handlers[$new_mode])) {
        $this->_mode_handlers[$new_mode] = $new_mode;
    }
}
257 * Adds a pattern that will exit the current mode
258 * and re-enter the previous one.
259 * @param string $pattern Perl style regex, but ( and )
260 * lose the usual meaning.
261 * @param string $mode Mode to leave.
/**
 *    Adds a pattern that will exit the current mode
 *    and re-enter the previous one.
 *    @param string $pattern      Perl style regex, but ( and )
 *                                lose the usual meaning.
 *    @param string $mode         Mode to leave.
 *    @access public
 */
function addExitPattern($pattern, $mode) {
    if (! isset($this->_regexes[$mode])) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    // "__exit" is the reserved label recognised by _isModeEnd().
    $this->_regexes[$mode]->addPattern($pattern, "__exit");
    if (! isset($this->_mode_handlers[$mode])) {
        $this->_mode_handlers[$mode] = $mode;
    }
}
275 * Adds a pattern that has a special mode. Acts as an entry
276 * and exit pattern in one go, effectively calling a special
277 * parser handler for this token only.
278 * @param string $pattern Perl style regex, but ( and )
279 * lose the usual meaning.
280 * @param string $mode Should only apply this
281 * pattern when dealing with
282 * this type of input.
283 * @param string $special Use this mode for this one token.
/**
 *    Adds a pattern that has a special mode. Acts as an entry
 *    and exit pattern in one go, effectively calling a special
 *    parser handler for this token only.
 *    @param string $pattern      Perl style regex, but ( and )
 *                                lose the usual meaning.
 *    @param string $mode         Should only apply this
 *                                pattern when dealing with
 *                                this type of input.
 *    @param string $special      Use this mode for this one token.
 *    @access public
 */
function addSpecialPattern($pattern, $mode, $special) {
    if (! isset($this->_regexes[$mode])) {
        $this->_regexes[$mode] = new ParallelRegex($this->_case);
    }
    // A leading underscore marks the label as a one-token mode;
    // _decodeSpecial() strips it again.
    $this->_regexes[$mode]->addPattern($pattern, "_$special");
    if (! isset($this->_mode_handlers[$special])) {
        $this->_mode_handlers[$special] = $special;
    }
}
297 * Adds a mapping from a mode to another handler.
298 * @param string $mode Mode to be remapped.
299 * @param string $handler New target handler.
/**
 *    Adds a mapping from a mode to another handler, replacing
 *    any handler already registered for that mode.
 *    @param string $mode        Mode to be remapped.
 *    @param string $handler     New target handler.
 *    @access public
 */
function mapHandler($mode, $handler) {
    $this->_mode_handlers[$mode] = $handler;
}
307 * Splits the page text into tokens. Will fail
308 * if the handlers report an error or if no
309 * content is consumed. If successful then each
310 * unparsed and parsed token invokes a call to the
312 * @param string $raw Raw HTML text.
313 * @return boolean True on success, else false.
/**
 *    Splits the page text into tokens. Will fail
 *    if the handlers report an error or if no
 *    content is consumed. If successful then each
 *    unparsed and parsed token invokes a call to the
 *    held listener.
 *    @param string $raw        Raw HTML text.
 *    @return boolean           True on success, else false.
 *    @access public
 */
function parse($raw) {
    if (! isset($this->_parser)) {
        return false;
    }
    $length = strlen($raw);
    while (is_array($parsed = $this->_reduce($raw))) {
        list($raw, $unmatched, $matched, $mode) = $parsed;
        if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
            return false;
        }
        // NOTE(review): this early exit was not visible in the damaged
        // source and is reconstructed - confirm against upstream.
        if ($raw === '') {
            return true;
        }
        // A match that consumed nothing would loop forever.
        if (strlen($raw) == $length) {
            return false;
        }
        $length = strlen($raw);
    }
    // _reduce() is documented to return false on a parsing error
    // and true when nothing more matches.
    if (! $parsed) {
        return false;
    }
    // Whatever is left over is handed on as unmatched text.
    return $this->_invokeParser($raw, LEXER_UNMATCHED);
}
341 * Sends the matched token and any leading unmatched
342 * text to the parser changing the lexer to a new
343 * mode if one is listed.
344 * @param string $unmatched Unmatched leading portion.
345 * @param string $matched Actual token match.
346 * @param string $mode Mode after match. A boolean
347 * false mode causes no change.
348 * @return boolean False if there was any error
/**
 *    Sends the matched token and any leading unmatched
 *    text to the parser changing the lexer to a new
 *    mode if one is listed.
 *    @param string $unmatched    Unmatched leading portion.
 *    @param string $matched      Actual token match.
 *    @param string $mode         Mode after match. A boolean
 *                                false mode causes no change.
 *    @return boolean             False if there was any error
 *                                from the parser.
 *    @access private
 */
function _dispatchTokens($unmatched, $matched, $mode = false) {
    // Leading text always goes to the handler of the current mode.
    if (! $this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
        return false;
    }
    // Unlabelled patterns yield a boolean and leave the mode alone.
    if (is_bool($mode)) {
        return $this->_invokeParser($matched, LEXER_MATCHED);
    }
    // Exit: the token is reported to the mode being left, then popped.
    if ($this->_isModeEnd($mode)) {
        if (! $this->_invokeParser($matched, LEXER_EXIT)) {
            return false;
        }
        return $this->_mode->leave();
    }
    // Special: enter the mode for this one token only, then pop it.
    if ($this->_isSpecialMode($mode)) {
        $this->_mode->enter($this->_decodeSpecial($mode));
        if (! $this->_invokeParser($matched, LEXER_SPECIAL)) {
            return false;
        }
        return $this->_mode->leave();
    }
    // Entry: the token is reported to the newly entered mode.
    $this->_mode->enter($mode);
    return $this->_invokeParser($matched, LEXER_ENTER);
}
377 * Tests to see if the new mode is actually to leave
378 * the current mode and pop an item from the matching
380 * @param string $mode Mode to test.
381 * @return boolean True if this is the exit mode.
/**
 *    Tests to see if the new mode is actually to leave
 *    the current mode and pop an item from the matching
 *    mode stack.
 *    @param string $mode    Mode to test.
 *    @return boolean        True if this is the exit mode.
 *    @access private
 */
function _isModeEnd($mode) {
    return ($mode === "__exit");
}
389 * Test to see if the mode is one where this mode
390 * is entered for this token only and automatically
391 * leaves immediately afterwards.
392 * @param string $mode Mode to test.
393 * @return boolean True if this is a special (single token) mode.
/**
 *    Test to see if the mode is one where this mode
 *    is entered for this token only and automatically
 *    leaves immediately afterwards. Such modes are
 *    marked with a leading underscore.
 *    @param string $mode    Mode to test.
 *    @return boolean        True if this is a special mode.
 *    @access private
 */
function _isSpecialMode($mode) {
    return (strncmp($mode, "_", 1) === 0);
}
401 * Strips the magic underscore marking single token
403 * @param string $mode Mode to decode.
404 * @return string Underlying mode name.
/**
 *    Strips the magic underscore marking single token
 *    modes.
 *    @param string $mode    Mode to decode.
 *    @return string         Underlying mode name.
 *    @access private
 */
function _decodeSpecial($mode) {
    return substr($mode, 1);
}
412 * Calls the parser method named after the current
413 * mode. Empty content will be ignored. The lexer
414 * has a parser handler for each mode in the lexer.
415 * @param string $content Text parsed.
416 * @param boolean $is_match Token is recognised rather
417 * than unparsed data.
/**
 *    Calls the parser method named after the current
 *    mode. Empty content will be ignored. The lexer
 *    has a parser handler for each mode in the lexer.
 *    @param string $content        Text parsed.
 *    @param integer $is_match      One of the LEXER_* event
 *                                  codes describing the token.
 *    @return boolean               Result of the handler, or true
 *                                  when there was nothing to send.
 *    @access private
 */
function _invokeParser($content, $is_match) {
    // substr() can yield false on older PHP versions, so both
    // forms of "nothing" are skipped.
    if (($content === '') || ($content === false)) {
        return true;
    }
    $handler = $this->_mode_handlers[$this->_mode->getCurrent()];
    return $this->_parser->$handler($content, $is_match);
}
429 * Tries to match a chunk of text and if successful
430 * removes the recognised chunk and any leading
431 * unparsed data. Empty strings will not be matched.
432 * @param string $raw The subject to parse. This is the
433 * content that will be eaten.
434 * @return array/boolean Four item list of the remaining
435 * content, the unparsed leading content,
436 * the recognised token and finally the
437 * action the parser is to take.
438 * True if no match, false if there
439 * is a parsing error.
/**
 *    Tries to match a chunk of text and if successful
 *    removes the recognised chunk and any leading
 *    unparsed data. Empty strings will not be matched.
 *    @param string $raw         The subject to parse. This is the
 *                               content that will be eaten.
 *    @return array/boolean      Four item list of the remaining
 *                               content, the unparsed leading
 *                               content, the recognised token and
 *                               finally the action the parser is
 *                               to take. True if no match, false
 *                               if there is a parsing error.
 *    @access private
 */
function _reduce($raw) {
    $current = $this->_mode->getCurrent();
    // A mode that never had a pattern registered has no regex object;
    // report that as a parsing error instead of calling a method on null.
    if (! isset($this->_regexes[$current])) {
        return false;
    }
    if ($action = $this->_regexes[$current]->match($raw, $match)) {
        // Everything in front of the token is handed on as unparsed text.
        $skipped = strpos($raw, $match);
        $unparsed = substr($raw, 0, $skipped);
        $raw = substr($raw, $skipped + strlen($match));
        return array($raw, $unparsed, $match, $action);
    }
    return true;
}
454 * Breaks HTML into SAX events.
455 * @package SimpleTest
456 * @subpackage WebTester
class SimpleHtmlLexer extends SimpleLexer {

    /**
     *    Sets up the lexer with case insensitive matching
     *    and adds the HTML handlers.
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @access public
     */
    function __construct(&$parser) {
        // Starts in 'text' mode; the parent's default for $case
        // gives case insensitive matching.
        $this->SimpleLexer($parser, 'text');
        $this->mapHandler('text', 'acceptTextToken');
        $this->_addSkipping();
        foreach ($this->_getParsedTags() as $tag) {
            $this->_addTag($tag);
        }
        $this->_addInTagTokens();
    }

    /**
     *    Legacy PHP 4 style constructor, kept for callers that
     *    invoke it by name. PHP 8 does not run it on "new".
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @access public
     */
    function SimpleHtmlLexer(&$parser) {
        // Class-qualified so a subclass overriding __construct() cannot recurse.
        SimpleHtmlLexer::__construct($parser);
    }

    /**
     *    List of parsed tags. Others are ignored.
     *    @return array        List of searched for tags.
     *    @access private
     */
    function _getParsedTags() {
        return array('a', 'title', 'form', 'input', 'button', 'textarea', 'select',
                'option', 'frameset', 'frame', 'label');
    }

    /**
     *    The lexer has to skip certain sections such
     *    as server code, client code and styles.
     *    @access private
     */
    function _addSkipping() {
        $this->mapHandler('css', 'ignore');
        $this->addEntryPattern('<style', 'text', 'css');
        $this->addExitPattern('</style>', 'css');
        $this->mapHandler('js', 'ignore');
        $this->addEntryPattern('<script', 'text', 'js');
        $this->addExitPattern('</script>', 'js');
        $this->mapHandler('comment', 'ignore');
        $this->addEntryPattern('<!--', 'text', 'comment');
        $this->addExitPattern('-->', 'comment');
    }

    /**
     *    Pattern matches to start and end a tag.
     *    @param string $tag          Name of tag to scan for.
     *    @access private
     */
    function _addTag($tag) {
        // Closing tags are handled as a single token; opening tags
        // switch to 'tag' mode so the attributes can be read.
        $this->addSpecialPattern("</$tag>", 'text', 'acceptEndToken');
        $this->addEntryPattern("<$tag", 'text', 'tag');
    }

    /**
     *    Pattern matches to parse the inside of a tag
     *    including the attributes and their quoting.
     *    @access private
     */
    function _addInTagTokens() {
        $this->mapHandler('tag', 'acceptStartToken');
        $this->addSpecialPattern('\s+', 'tag', 'ignore');
        $this->_addAttributeTokens();
        $this->addExitPattern('/>', 'tag');
        $this->addExitPattern('>', 'tag');
    }

    /**
     *    Matches attributes that are either single quoted,
     *    double quoted or unquoted.
     *    @access private
     */
    function _addAttributeTokens() {
        // Double quoted: a backslash-escaped quote does not end the value.
        $this->mapHandler('dq_attribute', 'acceptAttributeToken');
        $this->addEntryPattern('=\s*"', 'tag', 'dq_attribute');
        $this->addPattern("\\\\\"", 'dq_attribute');
        $this->addExitPattern('"', 'dq_attribute');
        // Single quoted, same scheme.
        $this->mapHandler('sq_attribute', 'acceptAttributeToken');
        $this->addEntryPattern("=\s*'", 'tag', 'sq_attribute');
        $this->addPattern("\\\\'", 'sq_attribute');
        $this->addExitPattern("'", 'sq_attribute');
        // Unquoted: the whole "=value" run is one special token.
        $this->mapHandler('uq_attribute', 'acceptAttributeToken');
        $this->addSpecialPattern('=\s*[^>\s]*', 'tag', 'uq_attribute');
    }
}
547 * Converts HTML tokens into selected SAX events.
548 * @package SimpleTest
549 * @subpackage WebTester
551 class SimpleHtmlSaxParser
{
556 var $_current_attribute;
560 * @param SimpleSaxListener $listener SAX event handler.
/**
 *    Sets the listener.
 *    @param SimpleSaxListener $listener    SAX event handler.
 *    @access public
 */
function __construct(&$listener) {
    $this->_listener = &$listener;
    $this->_lexer = &$this->createLexer($this);
    // NOTE(review): one statement here was not visible in the damaged
    // source; initialising _tag is reconstructed from its use in
    // acceptStartToken() - confirm against upstream.
    $this->_tag = '';
    $this->_attributes = array();
    $this->_current_attribute = '';
}

/**
 *    Legacy PHP 4 style constructor, kept for callers that
 *    invoke it by name. PHP 8 does not run it on "new".
 *    @param SimpleSaxListener $listener    SAX event handler.
 *    @access public
 */
function SimpleHtmlSaxParser(&$listener) {
    // Class-qualified so a subclass overriding __construct() cannot recurse.
    SimpleHtmlSaxParser::__construct($listener);
}
572 * Runs the content through the lexer which
573 * should call back to the acceptors.
574 * @param string $raw Page text to parse.
575 * @return boolean False if parse error.
/**
 *    Runs the content through the lexer which
 *    should call back to the acceptors.
 *    @param string $raw      Page text to parse.
 *    @return boolean         False if parse error.
 *    @access public
 */
function parse($raw) {
    return $this->_lexer->parse($raw);
}
583 * Sets up the matching lexer. Starts in 'text' mode.
584 * @param SimpleSaxParser $parser Event generator, usually $self.
585 * @return SimpleLexer Lexer suitable for this parser.
/**
 *    Sets up the matching lexer. Starts in 'text' mode.
 *    @param SimpleSaxParser $parser    Event generator, usually $self.
 *    @return SimpleLexer               Lexer suitable for this parser.
 *    @access public
 */
function &createLexer(&$parser) {
    // Plain "new": assigning "new" by reference is a parse error
    // from PHP 7.0 onwards.
    $lexer = new SimpleHtmlLexer($parser);
    // Returned through a variable because the function returns by reference.
    return $lexer;
}
595 * Accepts a token from the tag mode. If the
596 * starting element completes then the element
597 * is dispatched and the current attributes
598 * set back to empty. The element or attribute
599 * name is converted to lower case.
600 * @param string $token Incoming characters.
601 * @param integer $event Lexer event type.
602 * @return boolean False if parse error.
605 function acceptStartToken($token, $event) {
606 if ($event == LEXER_ENTER
) {
607 $this->_tag
= strtolower(substr($token, 1));
610 if ($event == LEXER_EXIT
) {
611 $success = $this->_listener
->startElement(
615 $this->_attributes
= array();
619 $this->_current_attribute
= strtolower(SimpleHtmlSaxParser
::decodeHtml($token));
620 $this->_attributes
[$this->_current_attribute
] = '';
626 * Accepts a token from the end tag mode.
627 * The element name is converted to lower case.
628 * @param string $token Incoming characters.
629 * @param integer $event Lexer event type.
630 * @return boolean False if parse error.
/**
 *    Accepts a token from the end tag mode.
 *    The element name is converted to lower case.
 *    @param string $token     Incoming characters.
 *    @param integer $event    Lexer event type.
 *    @return boolean          False if parse error.
 *    @access public
 */
function acceptEndToken($token, $event) {
    // Pull the tag name out of "</name>".
    if (! preg_match('/<\/(.*)>/', $token, $matches)) {
        return false;
    }
    return $this->_listener->endElement(strtolower($matches[1]));
}
641 * Part of the tag data.
642 * @param string $token Incoming characters.
643 * @param integer $event Lexer event type.
644 * @return boolean False if parse error.
/**
 *    Part of the tag data. Appends the decoded text to the
 *    value of the attribute currently being read.
 *    @param string $token     Incoming characters.
 *    @param integer $event    Lexer event type.
 *    @return boolean          False if parse error.
 *    @access public
 */
function acceptAttributeToken($token, $event) {
    // Text inside a quoted value.
    if ($event == LEXER_UNMATCHED) {
        $this->_attributes[$this->_current_attribute] .=
                SimpleHtmlSaxParser::decodeHtml($token);
    }
    // Unquoted value: the token still carries its leading "=".
    if ($event == LEXER_SPECIAL) {
        $this->_attributes[$this->_current_attribute] .=
                preg_replace('/^=\s*/' , '', SimpleHtmlSaxParser::decodeHtml($token));
    }
    return true;
}
660 * A character entity.
661 * @param string $token Incoming characters.
662 * @param integer $event Lexer event type.
663 * @return boolean False if parse error.
/**
 *    A character entity.
 *    NOTE(review): the body appears to be empty in the source, so
 *    this returns null rather than the documented boolean - confirm.
 *    @param string $token    Incoming characters.
 *    @param integer $event   Lexer event type.
 *    @return boolean         False if parse error.
 *    @access public
 */
function acceptEntityToken($token, $event) {
}
670 * Character data between tags regarded as
672 * @param string $token Incoming characters.
673 * @param integer $event Lexer event type.
674 * @return boolean False if parse error.
/**
 *    Character data between tags regarded as
 *    important. Passed straight on to the listener.
 *    @param string $token     Incoming characters.
 *    @param integer $event    Lexer event type.
 *    @return boolean          False if parse error.
 *    @access public
 */
function acceptTextToken($token, $event) {
    return $this->_listener->addContent($token);
}
682 * Incoming data to be ignored.
683 * @param string $token Incoming characters.
684 * @param integer $event Lexer event type.
685 * @return boolean False if parse error.
/**
 *    Incoming data to be ignored.
 *    @param string $token     Incoming characters.
 *    @param integer $event    Lexer event type.
 *    @return boolean          Always true, so that skipped
 *                             content never counts as a parse error.
 *    @access public
 */
function ignore($token, $event) {
    // NOTE(review): the return statement was not visible in the damaged
    // source and is reconstructed from the documented contract.
    return true;
}
693 * Decodes any HTML entities.
694 * @param string $html Incoming HTML.
695 * @return string Outgoing plain text.
/**
 *    Decodes any HTML entities.
 *    NOTE(review): this is invoked as SimpleHtmlSaxParser::decodeHtml();
 *    on PHP 8 such a call made without an instance requires the
 *    method to be declared static.
 *    @param string $html        Incoming HTML.
 *    @return string             Outgoing plain text.
 *    @access public
 */
function decodeHtml($html) {
    // The entity => character map is built once and reused on later calls.
    static $translations;
    if (! isset($translations)) {
        $translations = array_flip(get_html_translation_table(HTML_ENTITIES));
    }
    return strtr($html, $translations);
}
708 * Turns HTML into the text a browser would display. Images
709 * are converted to their alt text and tags are suppressed.
710 * Entities are converted to their visible representation.
711 * @param string $html HTML to convert.
712 * @return string Plain text.
716 function normalise($html) {
717 $text = preg_replace('|<!--.*?-->|', '', $html);
718 $text = preg_replace('|<img.*?alt\s*=\s*"(.*?)".*?>|', ' \1 ', $text);
719 $text = preg_replace('|<img.*?alt\s*=\s*\'(.*?)\'.*?>|', ' \1 ', $text);
720 $text = preg_replace('|<img.*?alt\s*=\s*([a-zA-Z_]+).*?>|', ' \1 ', $text);
721 $text = preg_replace('|<.*?>|', '', $text);
722 $text = SimpleHtmlSaxParser
::decodeHtml($text);
723 $text = preg_replace('|\s+|', ' ', $text);
730 * @package SimpleTest
731 * @subpackage WebTester
class SimpleSaxListener {

    /**
     *    Sets the document to write to. The base class
     *    has nothing to set up.
     *    @access public
     */
    function __construct() {
    }

    /**
     *    Legacy PHP 4 style constructor, kept for callers that
     *    invoke it by name.
     *    @access public
     */
    function SimpleSaxListener() {
    }

    /**
     *    Start of element event. The base implementation is
     *    empty and returns null.
     *    NOTE(review): presumably meant to be overridden - confirm.
     *    @param string $name        Element name.
     *    @param hash $attributes    Name value pairs.
     *                               Attributes without content
     *                               are marked as true.
     *    @return boolean            False on parse error.
     *    @access public
     */
    function startElement($name, $attributes) {
    }

    /**
     *    End of element event. Empty in the base class.
     *    @param string $name        Element name.
     *    @return boolean            False on parse error.
     *    @access public
     */
    function endElement($name) {
    }

    /**
     *    Unparsed, but relevant data. Empty in the base class.
     *    @param string $text        May include unparsed tags.
     *    @return boolean            False on parse error.
     *    @access public
     */
    function addContent($text) {
    }
}