lib/htmlpurifier/HTMLPurifier/Lexer.php

   1 <?php
   2
   3 require_once 'HTMLPurifier/Token.php';
   4 require_once 'HTMLPurifier/Encoder.php';
   5 require_once 'HTMLPurifier/EntityParser.php';
   6
   7 HTMLPurifier_ConfigSchema::define(
   8     'Core', 'AcceptFullDocuments', true, 'bool',
   9     'This parameter determines whether or not the filter should accept full '.
  10     'HTML documents, not just HTML fragments.  When on, it will '.
  11     'drop all sections except the content between body.'
  12 );
  13
  14 /**
  15  * Forgivingly lexes HTML (SGML-style) markup into tokens.
  16  *
  17  * A lexer parses a string of SGML-style markup and converts them into
  18  * corresponding tokens.  It doesn't check for well-formedness, although its
  19  * internal mechanism may make this automatic (such as the case of
  20  * HTMLPurifier_Lexer_DOMLex).  There are several implementations to choose
  21  * from.
  22  *
  23  * A lexer is HTML-oriented: it might work with XML, but it's not
  24  * recommended, as we adhere to a subset of the specification for optimization
  25  * reasons.
  26  *
  27  * This class should not be directly instantiated, but you may use create() to
  28  * retrieve a default copy of the lexer.  Being a supertype, this class
  29  * does not actually define any implementation, but offers commonly used
  30  * convenience functions for subclasses.
  31  *
  32  * @note The unit tests will instantiate this class for testing purposes, as
  33  *       many of the utility functions require a class to be instantiated.
  34  *       Be careful when porting this class to PHP 5.
  35  *
  36  * @par
  37  *
  38  * @note
  39  * We use tokens rather than create a DOM representation because DOM would:
  40  *
  41  * @par
  42  *  -# Require more processing power to create,
  43  *  -# Require recursion to iterate,
  44  *  -# Must be compatible with PHP 5's DOM (otherwise duplication),
  45  *  -# Has the entire document structure (html and body not needed), and
  46  *  -# Has unknown readability improvement.
  47  *
  48  * @par
  49  * What the last item means is that the functions for manipulating tokens are
  50  * already fairly compact, and when well-commented, more abstraction may not
  51  * be needed.
  52  *
  53  * @see HTMLPurifier_Token
  54  */
  55 class HTMLPurifier_Lexer
  56 {
  57
  58     function HTMLPurifier_Lexer() {
  59         $this->_entity_parser = new HTMLPurifier_EntityParser();
  60     }
  61
  62
  63     /**
  64      * Most common entity to raw value conversion table for special entities.
  65      * @protected
  66      */
  67     var $_special_entity2str =
  68             array(
  69                     '&quot;' => '"',
  70                     '&amp;'  => '&',
  71                     '&lt;'   => '<',
  72                     '&gt;'   => '>',
  73                     '&#39;'  => "'",
  74                     '&#039;' => "'",
  75                     '&#x27;' => "'"
  76             );
  77
  78     /**
  79      * Parses special entities into the proper characters.
  80      *
  81      * This string will translate escaped versions of the special characters
  82      * into the correct ones.
  83      *
  84      * @warning
  85      * You should be able to treat the output of this function as
  86      * completely parsed, but that's only because all other entities should
  87      * have been handled previously in substituteNonSpecialEntities()
  88      *
  89      * @param $string String character data to be parsed.
  90      * @returns Parsed character data.
  91      */
  92     function parseData($string) {
  93
  94         // following functions require at least one character
  95         if ($string === '') return '';
  96
  97         // subtracts amps that cannot possibly be escaped
  98         $num_amp = substr_count($string, '&') - substr_count($string, '& ') -
  99             ($string[strlen($string)-1] === '&' ? 1 : 0);
 100
 101         if (!$num_amp) return $string; // abort if no entities
 102         $num_esc_amp = substr_count($string, '&amp;');
 103         $string = strtr($string, $this->_special_entity2str);
 104
 105         // code duplication for sake of optimization, see above
 106         $num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
 107             ($string[strlen($string)-1] === '&' ? 1 : 0);
 108
 109         if ($num_amp_2 <= $num_esc_amp) return $string;
 110
 111         // hmm... now we have some uncommon entities. Use the callback.
 112         $string = $this->_entity_parser->substituteSpecialEntities($string);
 113         return $string;
 114     }
 115
 116     /**
 117      * Lexes an HTML string into tokens.
 118      *
 119      * @param $string String HTML.
 120      * @return HTMLPurifier_Token array representation of HTML.
 121      */
 122     function tokenizeHTML($string, $config, &$context) {
 123         trigger_error('Call to abstract class', E_USER_ERROR);
 124     }
 125
 126     /**
 127      * Retrieves or sets the default Lexer as a Prototype Factory.
 128      *
 129      * Depending on what PHP version you are running, the abstract base
 130      * Lexer class will determine which concrete Lexer is best for you:
 131      * HTMLPurifier_Lexer_DirectLex for PHP 4, and HTMLPurifier_Lexer_DOMLex
 132      * for PHP 5 and beyond.
 133      *
 134      * Passing the optional prototype lexer parameter will override the
 135      * default with your own implementation.  A copy/reference of the prototype
 136      * lexer will now be returned when you request a new lexer.
 137      *
 138      * @static
 139      *
 140      * @note
 141      * Though it is possible to call this factory method from subclasses,
 142      * such usage is not recommended.
 143      *
 144      * @param $prototype Optional prototype lexer.
 145      * @return Concrete lexer.
 146      */
 147     function create($prototype = null) {
 148         // we don't really care if it's a reference or a copy
 149         static $lexer = null;
 150         if ($prototype) {
 151             $lexer = $prototype;
 152         }
 153         if (empty($lexer)) {
 154             if (version_compare(PHP_VERSION, "5", ">=") && // check for PHP5
 155                 class_exists('DOMDocument')) { // check for DOM support
 156                 require_once 'HTMLPurifier/Lexer/DOMLex.php';
 157                 $lexer = new HTMLPurifier_Lexer_DOMLex();
 158             } else {
 159                 require_once 'HTMLPurifier/Lexer/DirectLex.php';
 160                 $lexer = new HTMLPurifier_Lexer_DirectLex();
 161             }
 162         }
 163         return $lexer;
 164     }
 165
 166     /**
 167      * Translates CDATA sections into regular sections (through escaping).
 168      *
 169      * @static
 170      * @protected
 171      * @param $string HTML string to process.
 172      * @returns HTML with CDATA sections escaped.
 173      */
 174     function escapeCDATA($string) {
 175         return preg_replace_callback(
 176             '/<!\[CDATA\[(.+?)\]\]>/',
 177             array('HTMLPurifier_Lexer', 'CDATACallback'),
 178             $string
 179         );
 180     }
 181
 182     /**
 183      * Callback function for escapeCDATA() that does the work.
 184      *
 185      * @static
 186      * @warning Though this is public in order to let the callback happen,
 187      *          calling it directly is not recommended.
 188      * @params $matches PCRE matches array, with index 0 the entire match
 189      *                  and 1 the inside of the CDATA section.
 190      * @returns Escaped internals of the CDATA section.
 191      */
 192     function CDATACallback($matches) {
 193         // not exactly sure why the character set is needed, but whatever
 194         return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
 195     }
 196
 197     /**
 198      * Takes a piece of HTML and normalizes it by converting entities, fixing
 199      * encoding, extracting bits, and other good stuff.
 200      */
 201     function normalize($html, $config, &$context) {
 202
 203         // extract body from document if applicable
 204         if ($config->get('Core', 'AcceptFullDocuments')) {
 205             $html = $this->extractBody($html);
 206         }
 207
 208         // escape CDATA
 209         $html = $this->escapeCDATA($html);
 210
 211         // expand entities that aren't the big five
 212         $html = $this->_entity_parser->substituteNonSpecialEntities($html);
 213
 214         // clean into wellformed UTF-8 string for an SGML context: this has
 215         // to be done after entity expansion because the entities sometimes
 216         // represent non-SGML characters (horror, horror!)
 217         $html = HTMLPurifier_Encoder::cleanUTF8($html);
 218
 219         return $html;
 220     }
 221
 222     /**
 223      * Takes a string of HTML (fragment or document) and returns the content
 224      */
 225     function extractBody($html) {
 226         $matches = array();
 227         $result = preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches);
 228         if ($result) {
 229             return $matches[1];
 230         } else {
 231             return $html;
 232         }
 233     }
 234
 235 }
 236
 237 ?>