vendor/TextParsers/html2text.php

   1 <?php
   2 /**
   3  * html2text converts HTML Markup to Markdown [1]. It also supports
   4  * Markdown Extra [2] by Michel Fortin [3].
   5  *
   6  * It started as a port of Aaron Swartz' [4] html2text.py [5] but
   7  * got a long way since. This is more than a mere port now!
   8  *
   9  * [1]: http://daringfireball.net/projects/markdown
  10  * [2]: http://www.michelf.com/projects/php-markdown/extra/
  11  * [3]: http://www.michelf.com/
  12  * [4]: http://www.aaronsw.com/
  13  * [5]: http://www.aaronsw.com/2002/html2text/
  14  *
  15  * @version 1.4
  16  * @author Milian Wolff (mail@milianw.de,http://milianw.de)
  17  * @license LGPL, see LICENSE.txt and below
  18  * @copyright (C) 2007  Milian Wolff
  19  *
  20  * This library is free software; you can redistribute it and/or
  21  * modify it under the terms of the GNU Lesser General Public
  22  * License as published by the Free Software Foundation; either
  23  * version 2.1 of the License, or (at your option) any later version.
  24  *
  25  * This library is distributed in the hope that it will be useful,
  26  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  27  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  28  * Lesser General Public License for more details.
  29  *
  30  * You should have received a copy of the GNU Lesser General Public
  31  * License along with this library; if not, write to the Free Software
  32  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  33  */
  34 class html2text {
  35     # input html
  36     var $html = '';
  37     # output markdown
  38     var $outtext = '';
  39     # some control structures
  40     var $p_p = 0;
  41     var $start = 1;
  42     var $space = 0;
  43     var $force_html = false;
  44     var $force_html_start = array('tag'=>'','parents'=>0);
  45     # links
  46     var $a = array();
  47     var $astack = array();
  48     var $acount = 0;
  49     # lists
  50     var $list = array ();
  51     var $list_depth = 0;
  52     var $lastWasNL = false;
  53     # indenting and appending
  54     var $append = '';
  55     var $indent = '';
  56     # these elements will be dropped with all subelements
  57     var $drop = array(
  58         'script',
  59         'head',
  60         'style',
  61         'form',
  62     );
  63     # these elements will be quietly ignored, their children will be parsed
  64     var $ignore = array(
  65         'wrapper', # important!
  66         'html', # closing html tag
  67         'body', # closing body tag
  68         'thead',
  69         'tbody',
  70         'tfoot',
  71     );
  72     # these elements can have some attributes
  73     var $has_attrs = array(
  74         # tag => list of allowed attrs
  75         'h1' => array('id'),
  76         'h2' => array('id'),
  77         'h3' => array('id'),
  78         'h4' => array('id'),
  79         'h5' => array('id'),
  80         'h6' => array('id'),
  81         'a' => array('href','title'),
  82         'img' => array('src','alt','title'),
  83         # tables
  84         'th' => array('align'),
  85         'td' => array('align'),
  86         # footnotes
  87         'sup' => array('id'),
  88         'footnote' => array('nr'),
  89         # abbreviations
  90         'abbr' => array('title'),
  91         'acronym' => array('title'),
  92     );
  93     # table
  94     var $max_len = array();
  95     var $align = array();
  96     var $cols = array();
  97     var $rows = array();
  98     var $col = 0;
  99     var $row = 0;
 100     var $header = array();
 101     # parents
 102     var $parents = array();
 103     # abbreviations
 104     var $abbrs = array();
 105     # buffer
 106     var $buffer = array();
 107     var $buffer_lvl = 0;
 108     # options
 109     var $LINKS_EACH_PARAGRAPH;
 110     var $BODY_WIDTH;
 111     var $KEEP_HTML;
 112     # global xml parser
 113     var $xml_parser;
 114     /**
 115      * setup the xml_parser
 116      * $links_each_paragraph: if set to true, the list of links will be
 117      * displayed after each paragraph, else it will be displayed on the end of
 118      * the file
 119      * $body_width: if set to an integer greater than 0 the output text will be
 120      * wrapped to that width (in characters)
 121      * $keep_html: if set to true, all unrecognized html tags will be kept, else
 122      * they'll be removed
 123      *
 124      * @param bool $links_each_paragraph default true
 125      * @param integer $body_width default 0
 126      * @param bool $keep_html default true
 127      * @return void
 128      */
 129     function html2text($links_each_paragraph = true,$body_width = 0,$keep_html = true) {
 130         $this->LINKS_EACH_PARAGRAPH = $links_each_paragraph;
 131         $this->BODY_WIDTH = $body_width;
 132         $this->KEEP_HTML = $keep_html;
 133         $this->xml_parser = xml_parser_create();
 134         xml_set_object($this->xml_parser, $this);
 135         xml_parser_set_option($this->xml_parser, XML_OPTION_CASE_FOLDING, 0);
 136         xml_set_element_handler($this->xml_parser, 'starttag', 'endtag');
 137         xml_set_character_data_handler($this->xml_parser, 'handle_data');
 138         xml_set_default_handler($this->xml_parser,'handle_default');
 139     }
 140     /**
 141      * parse a html string to text
 142      *
 143      * @param string $html
 144      * @return string
 145      */
 146     function load_string($html) {
 147         $html = trim($html);
 148         if(empty($html)){
 149             return '';
 150         }
 151         # use unix style newlines
 152         $html = str_replace("\r","\n",str_replace("\r\n","\n",$html));
 153         # remove doctype and xml tags
 154         $html = preg_replace('#^.*<body[^>]*>#Us','<html><body>',$html);
 155         /*
 156          * cope  with bad html
 157          */
 158         $html = preg_replace('/<img(?!.*\/>)([^>]*)>/Us','<img$1 />',$html);
 159         $html = preg_replace('#&(?!amp;)#','&amp;',$html);
 160         $html = str_replace('<','&lt;',$html);
 161         $html = preg_replace('#&lt;([a-z]+[^>]*) ?/>#Us','<$1 />',$html);
 162         # unmatched tags (poor performance)
 163         preg_match_all('#&lt;(([a-z]|h[1-6])+)(?= |>)#',$html,$matches);
 164         foreach($matches[1] as $tag){
 165             $html = preg_replace('#&lt;'.$tag.'( |>)(.*)&lt;/'.$tag.'>#Us','<'.$tag.'$1$2</'.$tag.'>',$html,1);
 166         }
 167         # encode < to &lt; and & to &amp; inside <pre>|<code>
 168         $html = preg_replace_callback('#(<pre[^>]*>\s*<code[^>]*>|<code[^>]*>|<pre[^>]*>)(.*)(</pre>\s*</code>|</code>|</pre>)#Us',
 169                 create_function(
 170                      '$matches',
 171                      'return $matches[1].str_replace(\'<\',\'&lt;\',$matches[2]).$matches[3];'
 172                  ),$html);
 173         # handle empty attributes (e.g. <input checked>)
 174         $html = preg_replace_callback('#<([a-z]+)(?>[^>]* [^=]+(?> [^>]*)?) ?/?>#s',array(&$this,'parse_empty_attribs'),$html);
 175         # fake wrapper
 176         $html = '<wrapper>'.$html.'</wrapper>';
 177         # footnotes
 178         $html = preg_replace_callback('#<div class="footnotes">\s*<hr />\s*<ol>\s*(<li id="fn:\d+">.+</li>)\s*</ol>\s*</div>#Us',array(&$this,'footnotes'),$html);
 179         # last newline inside <pre> should not be parsed
 180         $html = preg_replace('#\n</code></pre>#s','</code></pre>',$html);
 181         # some html elements should not be parsed if their children wont be parsed:
 182         if($this->KEEP_HTML){
 183             # <ul|ol><li class="asdf">, complex because we need to handle nested lists
 184             if(preg_match('#<li [^>]+>#',$html)){
 185                 preg_match_all('#(?:<li [^>]+>|</?(?:ul|ol)[^>]*>)#',$html,$matches,PREG_OFFSET_CAPTURE);
 186                 $lists = array();
 187                 $offset = 0;
 188                 $ins = ' forcehtml="1"';
 189                 $add = strlen($ins);
 190                 foreach($matches[0] as $k => $a){
 191                     if(substr($a[0],0,3) == '<li'){
 192                         $list = &$lists[count($lists)-1];
 193                         if(!$list['forced']){
 194                             $list['forced'] = true;
 195                             $html = substr_replace($html,$ins,$list['offset']+$offset,0);
 196                             $offset += $add;
 197                         }
 198                     } else {
 199                         if(substr($a[0],0,2) == '</'){ # close tag
 200                             array_pop($lists);
 201                         } else { # open tag
 202                             array_push($lists,array(
 203                                 'offset' => $a[1]+3,
 204                                 'forced' => strstr($a[0],'forcehtml='),
 205                             ));
 206                         }
 207                     }
 208                 }
 209             }
 210             # <pre><code class="asdf">
 211             $html = preg_replace('#(?><pre>)\s*(<code .+>)#Us','<pre forcehtml="1">$1',$html);
 212         }
 213         $this->html = $html;
 214         # ok, now lets start parsing!
 215         #echo dump($html);
 216         $this->parse();
 217         return $this->close();
 218     }
 219     /**
 220      * clean up footnotes
 221      *
 222      * @param array $matches
 223      * @return string
 224      */
 225     function footnotes($matches){
 226         # remove footnote link
 227         $matches = preg_replace('@<a href="#fnref:\d+" rev="footnote"[^>]*>&amp;#8617;</a>@U','',$matches[1]);
 228         # remove empty paragraph
 229         $matches = str_replace('<p></p>','',$matches);
 230         # wrap in footnotes tag
 231         $matches = '<footnotes>'.$matches.'</footnotes>';
 232         # <li id="fn:1">...</li> -> <footnote nr="1">...</footnote>
 233         $matches = str_replace('<li id="fn:','<footnote nr="',$matches);
 234         return preg_replace('#</li>\s*(<footnote|</footnotes)#s','</footnote>$1',$matches);
 235     }
 236     /**
 237      * @param array $matches
 238      * @return string
 239      */
 240     function parse_empty_attribs($matches){
 241         if(preg_match('#^<[a-z]+(?: [a-z]+=(?:"[^"]*"|\'[^\']*\'))+ ?/?>$#s',$matches[0])){
 242             # mismatch, this tag is correct
 243             return $matches[0];
 244         }
 245         echo dump($matches[0]);
 246         die();
 247         $rep = $this->KEEP_HTML ? '$1="$1"' : '';
 248         return '<'.$matches[1].preg_replace('#(?<= )([^ =>]{2,})(?= |$)#Us',$rep,$matches[2]).'>';
 249     }
 250     /**
 251      * parse a html file to text
 252      *
 253      * @param string $file
 254      * @return string
 255      */
 256     function load_file($file) {
 257         $contents = file_get_contents($file);
 258         if(!$contents){
 259              trigger_error('could not open XML input',E_USER_WARNING);
 260              return false;
 261         }
 262         return $this->load_string($contents);
 263     }
 264     /**
 265      * start parsing html to text
 266      *
 267      * @param void
 268      * @return void
 269      */
 270     function parse() {
 271         $html = explode("\n", $this->html);
 272         foreach ($html as $line) {
 273             if (!xml_parse($this->xml_parser, $line . "\n")) {
 274                 $errcode = xml_get_error_code($this->xml_parser);
 275                 trigger_error(sprintf("XML error #%d: %s at line %d:<br /><pre><code>%s</code></pre>", $errcode,xml_error_string($errcode), xml_get_current_line_number($this->xml_parser),htmlspecialchars($line)),E_USER_WARNING);
 276                 #return;
 277             }
 278         }
 279     }
 280     /**
 281      * close parser and return text
 282      *
 283      * @param void
 284      * @return string
 285      */
 286     function close() {
 287         xml_parser_free($this->xml_parser);
 288         $this->pbr();
 289         $this->o('', false, 'end');
 290         $this->out("\n");
 291         $this->links();
 292         # blockquotes
 293         $this->outtext = preg_replace_callback('#^(\s*)((> )+)#m',array(&$this,'cleanup_bq'),$this->outtext);
 294         # cleanup
 295         $this->outtext = str_replace('&amp;','&',str_replace('&lt;','<',str_replace('&gt;','>',$this->outtext)));
 296         # empty lines (not preformatted)
 297         $this->outtext = preg_replace('#^\s{1,4}$#m','',$this->outtext);
 298         # empty quoted lines
 299         $this->outtext = preg_replace('#^(>+)\s{1,5}$#m','$1',$this->outtext);
 300         return rtrim($this->optwrap($this->outtext));
 301     }
 302     /**
 303      * replace "> > > " with ">>> "
 304      *
 305      * @param array $m matches
 306      * @return string
 307      */
 308     function cleanup_bq($m){
 309         return $m[1].str_repeat('>',strlen($m[2])/2).' ';
 310     }
 311     /**
 312      * handles html comments
 313      *
 314      * @param resource $parser
 315      * @param string $data
 316      * @return void
 317      */
 318     function handle_default($parser,$data){
 319         if(substr($data,0,4) == '<!--' && substr($data,-3) == '-->'){
 320             $this->outtext .= "\n\n".$data."\n";
 321         }
 322     }
 323     /**
 324      * adds pure data to the output (e.g. <p>DATA</p>)
 325      *
 326      * @param resource $parser
 327      * @param string $data
 328      * @return void
 329      */
 330     function handle_data($parser, $data) {
 331         $this->o($data, true);
 332     }
 333     /**
 334      * start tags (e.g. <p>)
 335      *
 336      * @param resource $parser
 337      * @param string $tag
 338      * @param array $attrs
 339      * @return void
 340      */
 341     function starttag($parser, $tag, $attrs) {
 342         $this->handle_tag($tag, $attrs, true);
 343     }
 344     /**
 345      * end tags (e.g. </p>)
 346      *
 347      * @param resource $parser
 348      * @param string $tag
 349      * @return void
 350      */
 351     function endtag($parser, $tag) {
 352         $this->handle_tag($tag, null, false);
 353     }
 354     /**
 355      * force html output of all children
 356      *
 357      * @param $tag
 358      * @return void
 359      */
 360     function force_html($tag){
 361         $this->force_html = true;
 362         $this->force_html_start = array(
 363             'tag' => $tag,
 364             'parents' => isset($this->parents[$tag]) ? strlen($this->parents[$tag]) : 0
 365         );
 366     }
 367     /**
 368      * parsing logic based on tag name
 369      *
 370      * @param string $tag
 371      * @param array $attrs
 372      * @param bool $start
 373      * @return void
 374      */
 375     function handle_tag($tag, $attrs, $start) {
 376         if(in_array($tag,$this->drop)){ # drop tags with content
 377             if($start){
 378                 $this->buffer();
 379             } else {
 380                 $this->unbuffer();
 381             }
 382             return;
 383         }
 384         if(in_array($tag,$this->ignore)){ # drop tags but keep content
 385             return;
 386         }
 387         # keeping the original html
 388         if($this->KEEP_HTML){
 389             if($start){
 390                 # is the force html attr set?
 391                 if(!$this->force_html && isset($attrs['forcehtml'])){
 392                     $this->force_html($tag);
 393                 }
 394                 # we'll have to keep this tag
 395                 if($this->force_html) {
 396                     $this->keep_tag($tag,$attrs,$start,true);
 397                     return;
 398                 } else {
 399                     # tag has attrs which can't be converted
 400                     if(!empty($attrs) && $this->keep_tag($tag,$attrs,$start)){
 401                         return;
 402                     }
 403                 }
 404             } else {
 405                 if($this->force_html){
 406                     $this->keep_tag($tag,$attrs,$start,true);
 407                     if($tag == $this->force_html_start['tag'] && strlen($this->parents[$tag]) == $this->force_html_start['parents']){
 408                         $this->force_html = false;
 409                     }
 410                     return;
 411                 } elseif($this->parent($tag,'kept') && $this->keep_tag($tag,$attrs,$start)) {
 412                     return;
 413                 }
 414             }
 415         }
 416         switch ($tag) {
 417             case 'h1' :
 418             case 'h2' :
 419             case 'h3' :
 420             case 'h4' :
 421             case 'h5' :
 422             case 'h6' :
 423                 $this->p();
 424                 if ($start) {
 425                     $this->o(str_repeat('#', intval($tag[1])) . ' ');
 426                     if(!empty($attrs['id'])){
 427                         $this->append = ' {#'.$attrs['id'].'}';
 428                     }
 429                 } else {
 430                     $this->out($this->append);
 431                     $this->append = '';
 432                 }
 433                 break;
 434             case 'div' :
 435                 $this->p();
 436                 break;
 437             case 'p' :
 438                 $this->p();
 439                 break;
 440             case 'br' :
 441                 if ($start) {
 442                     $this->o("  \n");
 443                 }
 444                 break;
 445             case 'hr' :
 446                 if ($start) {
 447                     $this->p();
 448                     $this->o('* * *');
 449                     $this->p();
 450                 }
 451                 break;
 452             case 'blockquote' :
 453                 $this->indent('> ',$start);
 454                 if ($start) {
 455                     $this->start = true;
 456                     $this->out("\n\n".$this->indent);
 457                 }
 458                 break;
 459             case 'em' :
 460             case 'i' :
 461             case 'u' :
 462                 $this->o('_');
 463                 break;
 464             case 'strong' :
 465             case 'b' :
 466                 $this->o('**');
 467                 break;
 468             # footnotes
 469             case 'sup':
 470                 if($start){
 471                     if(count($attrs) != 1 || !isset($attrs['id']) || !preg_match('#^fnref:(\d+)$#',$attrs['id'],$matches)){
 472                         # keep tag
 473                         $this->keep_tag($tag,$attrs,$start,true);
 474                         return;
 475                     }
 476                     # parse footnote
 477                     $this->out('[^'.$matches[1].']');
 478                     # omit output of link (<a href="#fn:1" rel="footnote">1</a>)
 479                     $this->buffer();
 480                 } else {
 481                     # last sup was not parsed -> keep tag
 482                     if(!$this->parent('sup')){
 483                         $this->keep_tag($tag,$attrs,$start);
 484                         return;
 485                     }
 486                     # sup was parsed -> reset buffer
 487                     $this->unbuffer();
 488                 }
 489                 break;
 490             case 'footnotes':
 491                 $this->p();
 492                 break;
 493             case 'footnote':
 494                 if($start){
 495                     $this->o('[^'.$attrs['nr']."]:\n".$this->indent.'    ');
 496                     $this->start = true;
 497                 }
 498                 $this->indent('    ',$start);
 499                 break;
 500             case 'a':
 501                 if($start) {
 502                     # buffer to check for inline links like <foo@bar.com> and the like
 503                     if (isset ($attrs['href'])) {
 504                         $this->buffer();
 505                         array_push($this->astack, $attrs);
 506                     } else {
 507                         array_push($this->astack, null);
 508                     }
 509                 } else {
 510                     if($this->astack) {
 511                         $a = array_pop($this->astack);
 512                         if ($a) {
 513                             # for emails
 514                             $a['href'] = $this->decode($a['href']);
 515                             $buffer = $this->unbuffer();
 516                             $buffer_check = $this->decode(trim($buffer));
 517                             if((substr($a['href'],0,7) == 'mailto:' && 'mailto:'.$buffer_check == $a['href']) || $a['href'] == $buffer_check){
 518                             # inline link
 519                                 $this->out('<'.$buffer_check.'>',true);
 520                             } else {
 521                             # block link
 522                                 $this->previousIndex($a);
 523                                 $this->out('['.$buffer.']['.$a['count'].']',true);
 524                             }
 525                         }
 526                     }
 527                 }
 528                 break;
 529             # abbrevations
 530             case 'abbr':
 531             case 'acronym':
 532                 if($start){
 533                     $this->buffer();
 534                     array_push($this->abbrs,isset($attrs['title'])?$attrs['title']:'');
 535                 } else {
 536                     $abbr = $this->unbuffer();
 537                     $def = array_pop($this->abbrs);
 538                     # only add abbr if its not already defined
 539                     if(!isset($this->abbrs[$abbr])){
 540                         $this->abbrs[$abbr] = $def;
 541                     }
 542                     $this->o($abbr);
 543                 }
 544                 break;
 545             case 'img' :
 546                 if ($start) {
 547                     if (isset ($attrs['src'])) {
 548                         $attrs['href'] = $attrs['src'];
 549                         $alt = '';
 550                         if (isset ($attrs['alt'])) {
 551                             $alt = $attrs['alt'];
 552                         } elseif(isset($attrs['title'])){
 553                             $alt = $attrs['title'];
 554                         }
 555                         $this->previousIndex($attrs);
 556                         $this->o('!['.$alt.'][' . $attrs['count'] . ']');
 557                     }
 558                 }
 559                 break;
 560             case 'code':
 561                 # do we have to keep this tag?
 562                 # or is a parent <pre> element existing?
 563                 if($this->keep_tag($tag,$attrs,$start) || $this->parent('pre')){
 564                     return;
 565                 }
 566                 # convert to `code` and handle backticks inside code block
 567                 # <code>foo`bar</code> has to get ``foo`bar`` and so forth
 568                 if($start){
 569                     $this->buffer();
 570                 } else {
 571                     $str = $this->unbuffer();
 572                     preg_match_all('#`+#',$str,$matches);
 573                     if(!empty($matches[0])){
 574                         rsort($matches[0]);
 575                         $len = strlen($matches[0][0])+1;
 576                     } else {
 577                         $len = 1;
 578                     }
 579                     $ticks = str_repeat('`',$len);
 580                     $this->out($ticks.$str.$ticks);
 581                 }
 582                 break;
 583             case 'dl' :
 584                 # note: if <dl> gets parsed, its direct children (<dd> and <dt>) will be parsed as well
 585                 if ($start) {
 586                     $this->p();
 587                 }
 588                 break;
 589             case 'dd' :
 590                 # is the parent dl parsed?
 591                 if(!$this->parent('dl')){
 592                     $this->keep_tag($tag,$attrs,$start,true);
 593                     return;
 594                 }
 595                 if ($start) {
 596                     $this->o(':   ');
 597                     $this->start = true;
 598                 } else {
 599                     $this->outtext .= "\n";
 600                     $this->pbr();
 601                 }
 602                 $this->indent('    ',$start);
 603                 break;
 604             case 'dt' :
 605                 # is the parent dl parsed?
 606                 if(!$this->parent('dl')){
 607                     $this->keep_tag($tag,$attrs,$start,true);
 608                     return;
 609                 }
 610                 if (!$start) {
 611                     $this->pbr();
 612                 }
 613                 break;
 614             case 'ol' :
 615             case 'ul' :
 616                 # note: if this element gets parsed, its direct children <li>s will be parsed as well
 617                 if ($start) {
 618                     array_push($this->list, array (
 619                         'name' => $tag,
 620                         'num' => 0
 621                     ));
 622                 } else {
 623                     array_pop($this->list);
 624                     $this->pbr();
 625                 }
 626                 break;
 627             case 'li' :
 628                 if ($this->list) {
 629                     $li = &$this->list[count($this->list) - 1];
 630                 }
 631                 # not inside a list or the list tag was not parsed
 632                 if(!isset($li) || !$this->parent($li['name'])){
 633                     $this->keep_tag($tag,$attrs,$start,true);
 634                     return;
 635                 }
 636                 if ($start) {
 637                     $this->pbr();
 638                     if($li['name'] == 'ul'){
 639                         $this->o('*  ');
 640                     } else {
 641                         $li['num']++;
 642                         /**
 643                         * @todo line up <ol><li>s > 9 correctly.
 644                         */
 645                         $this->o($li['num'].'. ');
 646                     }
 647                     $this->start = true;
 648                     $this->indent('   ',$start);
 649                 } else {
 650                     $this->indent('   ',$start);
 651                 }
 652                 break;
 653             case 'table':
 654                 # NOTE: if the <table> tag gets parsed, all its children will be as well!
 655
 656                 # finally: parse the whole table
 657                 if(!$start){
 658                     $this->outtext .= "\n\n";
 659                     $separator = array();
 660                     # seperator with correct align identifikators
 661                     foreach($this->cols as $col => $arr){
 662                         $this->max_len[$col] = max($arr);
 663                         $left = $right = '';
 664                         switch($this->align[$col]){
 665                             case 'center':
 666                                 $right = ':';
 667                             case 'left':
 668                                 $left = ':';
 669                                 break;
 670                             case 'right':
 671                                 $right = ':';
 672                                 break;
 673                         }
 674                         array_push($separator,$left.str_repeat('-',$this->max_len[$col]).$right);
 675                     }
 676                     $separator = '| '.implode(' | ',$separator).' |';
 677                     # set equal width
 678                     array_walk($this->rows,array(&$this,'fill_td'));
 679                     $rows = $this->rows;
 680                     foreach($rows as $row => $cols){
 681                         $this->pbr();
 682                         $this->o('| '.implode(' | ',$cols).' |');
 683                         if(in_array($row,$this->header)){
 684                             $this->pbr();
 685                             $this->o($separator);
 686                         }
 687                     }
 688                     $this->cols = array();
 689                     $this->rows = array();
 690                     $this->align = array();
 691                     $this->pbr();
 692                 }
 693                 break;
 694             case 'tr':
 695                 # not inside a table or the parent table was not parsed
 696                 if(!$this->parent('table')){
 697                     $this->keep_tag($tag,$attrs,$start,true);
 698                     return;
 699                 }
 700                 if($start){
 701                     $this->row++;
 702                 } else {
 703                     $this->col = 0;
 704                 }
 705                 break;
 706             case 'th':
 707                 # not inside a table or the parent table was not parsed
 708                 if(!$this->parent('table')){
 709                     $this->keep_tag($tag,$attrs,$start,true);
 710                     return;
 711                 }
 712                 if($start){
 713                     if(!in_array($this->row,$this->header)){
 714                         array_push($this->header,$this->row);
 715                     }
 716                     $this->col++;
 717                     $this->align[$this->col] = !empty($attrs['align']) ? $attrs['align'] : null;
 718                 }
 719                 break;
 720             case 'td':
 721                 # not inside a table or the parent table was not parsed
 722                 if(!$this->parent('table')){
 723                     $this->keep_tag($tag,$attrs,$start,true);
 724                     return;
 725                 }
 726                 if($start){
 727                     $this->col++;
 728                     if(!empty($attrs['align']) && is_null($this->align[$this->col])){
 729                         $this->align[$this->col] = $attrs['align'];
 730                         if($attrs['align'] == 'center'){
 731                             $this->max_len[$this->col] +=2;
 732                         }
 733                     }
 734                 }
 735                 break;
 736             case 'pre':
 737                 $this->indent('    ',$start,true);
 738                 if ($start) {
 739                     $this->pbr();
 740                 }
 741                 break;
 742             default:
 743                 $this->keep_tag($tag,$attrs,$start,true);
 744                 return;
 745         }
 746         # if we want to keep all non convertible html this function has to know if some parent elemts
 747         # were parsed or not (also some elements need to know if)
 748         if($start){
 749             if(!isset($this->parents[$tag])){
 750                 $this->parents[$tag] = '1';
 751             } else {
 752                 $this->parents[$tag] .= '1';
 753             }
 754         } else {
 755             if($this->LINKS_EACH_PARAGRAPH && in_array($tag,array('p','ul','blockquote','ol','dl','table','h1','h2','h3','h4','h5','h6'))){
 756                 $this->links();
 757             }
 758             $this->parents[$tag] = substr($this->parents[$tag],0,-1);
 759         }
 760         return;
 761     }
 762     /**
 763      * adds a string to the output ($this->outtext)
 764      * also copes with tables
 765      *
 766      * @param string $str
 767      * @return void
 768      */
 769     function out($str) {
 770         # buffering
 771         if($this->buffer_lvl){
 772             $this->buffer[$this->buffer_lvl] .= $str;
 773             return;
 774         }
 775         # this is for tables (see php markdown extra by michel fortin)
 776         if(($this->parent('th') || $this->parent('td'))){
 777             $str = trim($str);
 778             if(!isset($this->rows[$this->row][$this->col])){
 779                 $this->rows[$this->row][$this->col] = $str;
 780             } else {
 781                 $this->rows[$this->row][$this->col] .= $str;
 782             }
 783             if(!isset($this->cols[$this->col][$this->row])){
 784                 $this->cols[$this->col][$this->row] = strlen($str);
 785             } else {
 786                 $this->cols[$this->col][$this->row] += strlen($str);
 787             }
 788             return;
 789         }
 790         $this->outtext .= $str;
 791     }
 792     /**
 793      * further parse the output and add newlines, remove whitespaces and such
 794      *
 795      * @param string $data
 796      * @param bool $puredata
 797      * @param string $force
 798      * @return void
 799      */
 800     function o($data, $puredata = false, $force = false) {
 801         if($this->parent('table') && trim($data) == ''){ # drop whitespaces inside tables
 802             return;
 803         } elseif ($puredata && !$this->parent('code','both') && !$this->parent('pre','both')) { # keep whitespace for code
 804             $data = preg_replace('#\s+#', ' ', $data);
 805         }
 806         if (!$data && !$force) {
 807             return;
 808         }
 809         if (!empty($this->indent)) {
 810             $data = str_replace("\n", "\n".$this->indent, $data);
 811         }
 812         if ($this->start) {
 813             if($data == ' '){
 814                 return;
 815             }
 816             $this->p_p = 0;
 817             $this->start = 0;
 818         }
 819         if ($force == 'end') {
 820             # It's the end.
 821             $this->p_p = 0;
 822             $this->out("\n");
 823         }
 824         if ($this->p_p) {
 825             if($data == ' '){
 826                 return;
 827             }
 828             $data = ltrim($data);
 829             $this->out(str_repeat("\n".$this->indent, $this->p_p));
 830         }
 831         $this->p_p = 0;
 832         $this->out($data);
 833         if($data){
 834             $this->lastWasNL = substr($data, -1) == "\n";
 835         }
 836     }
 837     /**
 838      * display block links after paragraph etc.
 839      * also handle abbrs
 840      *
 841      * @param void
 842      * @return void
 843      */
 844     function links(){
 845         $this->abbrs();
 846         if(empty($this->a)){
 847             return; # no links stored
 848         }
 849         $pre = '';
 850         $this->out("\n\n");
 851         foreach($this->a as $links){
 852                 /**
 853                  * @todo  base href
 854                  */
 855                 foreach($links as $link){
 856                     $a = $pre.' [' . $link['count'] . ']: ' . $link['href'];
 857                     if (isset ($link['title'])) {
 858                         $a .= ' (' . $link['title'] . ')';
 859                     }
 860                     $this->out($a."\n");
 861                 }
 862         }
 863         $this->a = array();
 864         $this->out("\n");
 865         $this->lastWasNL = true;
 866     }
 867     /**
 868      * display abbr list
 869      *
 870      * @param void
 871      * @return void
 872      */
 873     function abbrs(){
 874         if(empty($this->abbrs)){
 875             return; # no abbrs stored
 876         }
 877         $this->out("\n\n");
 878         foreach($this->abbrs as $abbr => $def){
 879             $this->out('*['.$abbr.']: '.$def."\n");
 880         }
 881         $this->abbrs = array();
 882         $this->out("\n");
 883         $this->lastWasNL = true;
 884     }
 885     /**
 886      * if the link is already set use its count, else increase acount
 887      *
 888      * @param array &$attrs link attributes
 889      * @return void
 890      */
 891     function previousIndex(&$attrs) {
 892         # check for existing link
 893         if(isset($this->a[$attrs['href']])){
 894             foreach($this->a[$attrs['href']] as $a){
 895                 if (!empty($attrs['title']) || !empty($a['title'])){
 896                     if($a['title'] == $attrs['title']) {
 897                         $attrs = $a;
 898                         return;
 899                     }
 900                 } else {
 901                     $attrs = $a;
 902                     return;
 903                 }
 904             }
 905         }
 906         # if we come here, no matching link was found
 907         $this->acount++;
 908         $attrs['count'] = $this->acount;
 909         if(isset($this->a[$attrs['href']])){
 910             array_push($this->a[$attrs['href']],$attrs);
 911         } else {
 912             $this->a[$attrs['href']] = array($attrs);
 913         }
 914     }
 915     /**
 916      * handles bad html to avoid xml parse errors
 917      *
 918      * @param string $html
 919      * @return string
 920      */
 921     function handle_bad_html($html){
 922         return preg_replace_callback('#&lt;([a-z1-6]+)( [^>]*)?>(.*(?R).*)&lt;/\\1>#Us',array(&$this,'replace_bad_html'),$html);
 923     }
 924     /**
 925      * callback function which is used in handle_bad_html()
 926      *
 927      * @param array $matches
 928      * @return string
 929      */
 930     function replace_bad_html($matches){
 931         # recursion
 932         $matches[3] = $this->handle_bad_html($matches[3]);
 933         return '<'.$matches[1].$matches[2].'>'.$matches[3].'</'.$matches[1].'>';
 934     }
 935     /**
 936      * if the option BODY_WIDTH is set, this option will wrap text to the
 937      * provided width
 938      *
 939      * @param string $text
 940      * @return string
 941      *
 942      * @todo wrapping of code (also kept code blocks)
 943      */
 944     function optwrap($text) {
 945         if ($this->BODY_WIDTH < 30) {
 946             return $text;
 947         }
 948         $result = '';
 949         $split = explode("\n", $text);
 950         foreach ($split as $para) {
 951             if (strlen($para) > 0) {
 952                 if (preg_match('#^(\s*):   #',$para,$indent)) { # definition lists
 953                     $indent = isset($indent[1]) ? $indent[1] : '';
 954                     $result .= wordwrap($para, $this->BODY_WIDTH - strlen($indent) - 4, "\n".$indent.'    ')."\n";
 955                 } elseif(preg_match('#^(\s*>+)#',$para,$indent)){ # blockquote
 956                     $result .= wordwrap($para,$this->BODY_WIDTH - (strlen($indent[0])+1),"\n".$indent[0].' ')."\n";
 957                 } elseif(preg_match('#^\s*\|#',$para)){ # table
 958                     $result .= $para."\n"; # dont wrap
 959                 } elseif(preg_match('#^(\s*)\*#',$para,$indent)) { # list item @todo: ol
 960                     $indent = isset($indent[1]) ? $indent[1] : '';
 961                     $indent.= '   ';
 962                     $result .= wordwrap($para,$this->BODY_WIDTH - strlen($indent),"\n".$indent). "\n";
 963                 } elseif(preg_match('#^ \[[^\]]+\]:#',$para)){ # block links
 964                     # don't wrap at the moment
 965                     $result .= $para."\n";
 966                     continue;
 967                 } else { # something else
 968                     preg_match('#^\s+#',$para,$indent);
 969                     $indent = isset($indent[0]) ? $indent[0] : '';
 970                     $result .= wordwrap($para,$this->BODY_WIDTH - strlen($indent),"\n".$indent). "\n";
 971                 }
 972             } else {
 973                 $result .= "\n";
 974             }
 975         }
 976         return $result;
 977     }
 978     /**
 979      * handles html tags which are not represented by the parser logic
 980      * if $this->KEEP_HTML is set to true, the tag will be appended to the
 981      * output and `markdown="1"` added to its attributes
 982      *
 983      * @param string $tag
 984      * @param array $attrs
 985      * @param bool $start
 986      * @param array $known_attrs these attrs can be handled by markdown
 987      * @return bool
 988      */
 989     function keep_tag($tag,$attrs,$start,$force = false){
 990         if(!$force && !$this->KEEP_HTML){
 991             return false;
 992         }
 993
 994         # start tag
 995         if($start){
 996             # if there is a attr which cannot be handled by markdown
 997             # this tag will be kept.
 998             if(isset($this->has_attrs[$tag])){
 999                 $known_attrs = $this->has_attrs[$tag];
1000             } else {
1001                 $known_attrs = array();
1002             }
1003             if(!$force && count($known_attrs) >= count($attrs)){
1004                 if(empty($attrs) || count(array_diff(array_keys($attrs),$known_attrs)) == 0){
1005                     # tag can be handled by markdown!
1006                     return false;
1007                 }
1008             }
1009             $attr = '';
1010             if(!empty($attrs)){
1011                 foreach($attrs as $key => $value){
1012                     if($key == 'forcehtml'){
1013                         continue;
1014                     }
1015                     $attr.=' '.$key.'="'.$value.'"';
1016                 }
1017             }
1018             if(!$force && in_array($tag,array('div','center','li','dt','dd'))){
1019                 $attr.= ' markdown="1"';
1020             } elseif(!$this->force_html) {
1021                 $this->force_html($tag);
1022             }
1023             $this->o('<'.$tag.$attr.'>',true);
1024             # add to list of parents:
1025             if(isset($this->parents[$tag])){
1026                 $this->parents[$tag] .= '2';
1027             } else {
1028                 $this->parents[$tag] = '2';
1029             }
1030         # close tag
1031         } else {
1032             if(!$force && !$this->parent($tag,'kept')){
1033                 # the start tag of this element was not parsed
1034                 return false;
1035             }
1036             $this->o('</'.$tag.'>');
1037             $this->parents[$tag] = substr($this->parents[$tag],0,-1);
1038             # newlines after </tag>
1039             if(in_array($tag,array('th','td','dt','dd','li','p'))){
1040                 $this->o("\n");
1041             }
1042         }
1043         # newlines after <tag> and </tag>
1044         if(in_array($tag,array('div','center','table','tr','ul','ol','dl','pre'))){
1045             $this->o("\n");
1046         }
1047         return true;
1048     }
1049     /**
1050      * outputs a cell widened to the proper width
1051      *
1052      * @param array &$row
1053      * @return void
1054      */
1055     function fill_td(&$row){
1056         $len = 0;
1057         foreach($row as $col => $cont){
1058             $width = $this->max_len[$col];
1059             switch($this->align[$col]){
1060                 case 'center':
1061                     $width += 2;
1062                     $row[$col] = str_pad($row[$col],$width,' ',STR_PAD_BOTH);
1063                     break;
1064                 case 'left':
1065                     $width++;
1066                 default:
1067                     $row[$col] = str_pad($row[$col],$width,' ');
1068                     break;
1069                 case 'right':
1070                     $width++;
1071                     $row[$col] = str_pad($row[$col],$width,' ',STR_PAD_LEFT);
1072                     break;
1073             }
1074         }
1075     }
1076     /**
1077      * some sort of <br />
1078      *
1079      * @param void
1080      * @return void
1081      */
1082     function pbr() {
1083         if ($this->p_p == 0) {
1084             $this->p_p = 1;
1085         }
1086     }
1087     /**
1088      * text <p> (e.g. newlines after output)
1089      *
1090      * @param void
1091      * @return void
1092      */
1093     function p() {
1094         if($this->parent('table')){
1095             return;
1096         }
1097         $this->p_p = 2;
1098     }
1099     /**
1100      * add $indent before each line
1101      *
1102      * @param string $indent
1103      * @param bool $start wether it's an opening tag or a closing one
1104      * @param bool $output shall $indent be outputted? (only if $start is true)
1105      * @return void
1106      */
1107     function indent($indent,$start,$output=false){
1108         if($start){
1109             if($output){
1110                 $this->o($indent);
1111             }
1112             $this->indent .= $indent;
1113         } else {
1114             $len = strlen($indent);
1115             if($len >= strlen($this->indent)){
1116                 $this->indent = '';
1117             } else {
1118                 $this->indent = substr($this->indent,0,-$len);
1119             }
1120         }
1121     }
1122     /**
1123      * checks if a parent element exists
1124      * use $type to check for a parsed parent element or a kept element
1125      * @param string $parent name of the parent tag
1126      * @param string $type either 'parsed' or 'kept' or 'both'
1127      * @return bool
1128      */
1129     function parent($parent,$type = 'parsed'){
1130         if(!isset($this->parents[$parent])){
1131             return false;
1132         }
1133         if($type != 'both'){
1134             $type = $type == 'parsed' ? '1' : '2';
1135             return substr($this->parents[$parent],-1) === $type;
1136         } else {
1137             return !empty($this->parents[$parent]);
1138         }
1139     }
1140     /**
1141      * start buffer
1142      *
1143      * @param void
1144      * @return void
1145      */
1146     function buffer(){
1147         if($this->p_p){
1148             $this->out(str_repeat("\n".$this->indent, $this->p_p));
1149             $this->p_p = 0;
1150         }
1151         $this->buffer_lvl++;
1152         $this->buffer[$this->buffer_lvl] = '';
1153     }
1154     /**
1155      * end buffer and return buffered output
1156      *
1157      * @param void
1158      * @return string
1159      */
1160     function unbuffer(){
1161         $out = $this->buffer[$this->buffer_lvl];
1162         unset($this->buffer[$this->buffer_lvl]);
1163         $this->buffer_lvl--;
1164         return $out;
1165     }
1166     /**
1167      * decode email
1168      *
1169      * @author derernst@gmx.ch <http://www.php.net/manual/en/function.html-entity-decode.php#68536>
1170      */
1171     function decode($text,$quote_style = ENT_NOQUOTES){
1172         if (function_exists('html_entity_decode')) {
1173             $text = html_entity_decode($text, $quote_style, 'ISO-8859-1'); // NOTE: UTF-8 does not work!
1174         }
1175         else {
1176             $trans_tbl = get_html_translation_table(HTML_ENTITIES, $quote_style);
1177             $trans_tbl = array_flip($trans_tbl);
1178             $text = strtr($text, $trans_tbl);
1179         }
1180         $text = preg_replace('~&#x([0-9a-f]+);~ei', 'chr(hexdec("\\1"))', $text);
1181         $text = preg_replace('~&#([0-9]+);~e', 'chr("\\1")', $text);
1182         return $text;
1183     }
1184 }
1185 ?>