New records could not hold multiple habtm associates using ::set(). Fixes #160.
[akelos.git] / vendor / TextParsers / html2text.php
bloba95b1736cdf82d8d2a4f3ca60b2c18aa169902c2
1 <?php
2 /**
3 * html2text converts HTML Markup to Markdown [1]. It also supports
4 * Markdown Extra [2] by Michel Fortin [3].
6 * It started as a port of Aaron Swartz' [4] html2text.py [5] but
7 * got a long way since. This is more than a mere port now!
9 * [1]: http://daringfireball.net/projects/markdown
10 * [2]: http://www.michelf.com/projects/php-markdown/extra/
11 * [3]: http://www.michelf.com/
12 * [4]: http://www.aaronsw.com/
13 * [5]: http://www.aaronsw.com/2002/html2text/
15 * @version 1.4
16 * @author Milian Wolff (mail@milianw.de,http://milianw.de)
17 * @license LGPL, see LICENSE.txt and below
18 * @copyright (C) 2007 Milian Wolff
20 * This library is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU Lesser General Public
22 * License as published by the Free Software Foundation; either
23 * version 2.1 of the License, or (at your option) any later version.
25 * This library is distributed in the hope that it will be useful,
26 * but WITHOUT ANY WARRANTY; without even the implied warranty of
27 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
28 * Lesser General Public License for more details.
30 * You should have received a copy of the GNU Lesser General Public
31 * License along with this library; if not, write to the Free Software
32 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
class html2text {
    # HTML source that is currently being converted
    var $html = '';
    # Markdown text produced so far
    var $outtext = '';

    # --- output state ---------------------------------------------------
    # number of newlines still owed before the next piece of output
    var $p_p = 0;
    # set while a lone leading blank of a fresh block has to be swallowed
    var $start = 1;
    # not referenced by any method visible in this file section
    var $space = 0;
    # true while tags are passed through as raw HTML
    var $force_html = false;
    # tag (plus its nesting depth) that switched raw HTML mode on
    var $force_html_start = array('tag'=>'','parents'=>0);

    # --- links ----------------------------------------------------------
    # collected reference links, grouped by href
    var $a = array();
    # attributes of the currently open <a> elements (null when no href)
    var $astack = array();
    # running number used for reference links
    var $acount = 0;

    # --- lists ----------------------------------------------------------
    # stack of open lists, each entry holds 'name' (ul|ol) and 'num'
    var $list = array ();
    # not referenced by any method visible in this file section
    var $list_depth = 0;
    # whether the last emitted data ended with a newline
    var $lastWasNL = false;

    # --- indenting and appending ----------------------------------------
    # text emitted when the current heading is closed (e.g. " {#id}")
    var $append = '';
    # prefix inserted after every newline of the output
    var $indent = '';

    # these elements will be dropped together with all their children
    var $drop = array(
        'script',
        'head',
        'style',
        'form',
    );
    # these elements are silently skipped, their children are still parsed
    var $ignore = array(
        'wrapper', # important!
        'html', # closing html tag
        'body', # closing body tag
        'thead',
        'tbody',
        'tfoot',
    );
    # attributes Markdown can express, listed per tag (tag => allowed attrs)
    var $has_attrs = array(
        'h1' => array('id'),
        'h2' => array('id'),
        'h3' => array('id'),
        'h4' => array('id'),
        'h5' => array('id'),
        'h6' => array('id'),
        'a' => array('href','title'),
        'img' => array('src','alt','title'),
        # tables
        'th' => array('align'),
        'td' => array('align'),
        # footnotes
        'sup' => array('id'),
        'footnote' => array('nr'),
        # abbreviations
        'abbr' => array('title'),
        'acronym' => array('title'),
    );

    # --- table state ----------------------------------------------------
    # widest cell per column
    var $max_len = array();
    # alignment per column
    var $align = array();
    # cell lengths, indexed [column][row]
    var $cols = array();
    # cell contents, indexed [row][column]
    var $rows = array();
    # current column / row counters
    var $col = 0;
    var $row = 0;
    # numbers of the rows that contain <th> cells
    var $header = array();

    # per tag a string of marks: '1' = converted to Markdown, '2' = kept as HTML
    var $parents = array();
    # pending abbreviation definitions
    var $abbrs = array();
    # stack of output buffers and the index of the active one (0 = none)
    var $buffer = array();
    var $buffer_lvl = 0;

    # --- options (set by the constructor) --------------------------------
    var $LINKS_EACH_PARAGRAPH;
    var $BODY_WIDTH;
    var $KEEP_HTML;

    # expat parser resource shared by all parsing methods
    var $xml_parser;
/**
 * Store the options and set up the XML parser with this object's handlers.
 *
 * $links_each_paragraph: if true, the list of links is printed after each
 * paragraph, otherwise once at the end of the document
 * $body_width: width (in characters) the output is wrapped to; optwrap()
 * leaves the text unwrapped for values below 30
 * $keep_html: if true, HTML that cannot be expressed in Markdown is kept,
 * otherwise it is removed
 *
 * PHP 8.0 no longer treats a method named like its class as constructor,
 * so the real work lives in __construct().
 *
 * @param bool $links_each_paragraph default true
 * @param integer $body_width default 0
 * @param bool $keep_html default true
 * @return void
 */
function __construct($links_each_paragraph = true,$body_width = 0,$keep_html = true) {
    $this->LINKS_EACH_PARAGRAPH = $links_each_paragraph;
    $this->BODY_WIDTH = $body_width;
    $this->KEEP_HTML = $keep_html;
    $this->xml_parser = xml_parser_create();
    # keep tag names exactly as written in the document
    xml_parser_set_option($this->xml_parser, XML_OPTION_CASE_FOLDING, 0);
    # array callables bind the handlers to this object without xml_set_object(),
    # which is deprecated as of PHP 8.4
    xml_set_element_handler($this->xml_parser, array($this, 'starttag'), array($this, 'endtag'));
    xml_set_character_data_handler($this->xml_parser, array($this, 'handle_data'));
    xml_set_default_handler($this->xml_parser, array($this, 'handle_default'));
}

/**
 * PHP 4 style constructor, kept so that existing explicit calls keep working.
 *
 * @param bool $links_each_paragraph default true
 * @param integer $body_width default 0
 * @param bool $keep_html default true
 * @return void
 */
function html2text($links_each_paragraph = true,$body_width = 0,$keep_html = true) {
    $this->__construct($links_each_paragraph, $body_width, $keep_html);
}
/**
 * Convert an HTML string to Markdown.
 *
 * The markup is first massaged into something the XML parser accepts
 * (entities, unclosed <img>, unmatched tags, <pre>/<code> contents), then
 * parsed; the finished text is returned by close().
 *
 * @param string $html
 * @return string the Markdown text, '' for blank input
 */
function load_string($html) {
    $html = trim($html);
    # strict comparison: empty() would also discard the input "0"
    if($html === ''){
        return '';
    }
    # use unix style newlines
    $html = str_replace("\r","\n",str_replace("\r\n","\n",$html));
    # remove doctype and xml tags
    $html = preg_replace('#^.*<body[^>]*>#Us','<html><body>',$html);
    /**
     * cope with bad html
     */
    $html = preg_replace('/<img(?!.*\/>)([^>]*)>/Us','<img$1 />',$html);
    $html = preg_replace('#&(?!amp;)#','&amp;',$html);
    $html = str_replace('<','&lt;',$html);
    $html = preg_replace('#&lt;([a-z]+[^>]*) ?/>#Us','<$1 />',$html);
    # unmatched tags (poor performance): only tags that have a closing
    # counterpart are turned back into real markup
    preg_match_all('#&lt;(([a-z]|h[1-6])+)(?= |>)#',$html,$matches);
    foreach($matches[1] as $tag){
        $html = preg_replace('#&lt;'.$tag.'( |>)(.*)&lt;/'.$tag.'>#Us','<'.$tag.'$1$2</'.$tag.'>',$html,1);
    }
    # encode < to &lt; inside <pre>|<code>
    # (closure instead of create_function(), which was removed in PHP 8.0)
    $html = preg_replace_callback('#(<pre[^>]*>\s*<code[^>]*>|<code[^>]*>|<pre[^>]*>)(.*)(</pre>\s*</code>|</code>|</pre>)#Us',
        function ($m) {
            return $m[1].str_replace('<','&lt;',$m[2]).$m[3];
        },$html);
    # handle empty attributes (e.g. <input checked>)
    # NOTE(review): [^=]+ inside the atomic group also consumes the closing
    # '>', so this pattern looks unable to match at all - verify before
    # relying on parse_empty_attribs() being reached from here.
    $html = preg_replace_callback('#<([a-z]+)(?>[^>]* [^=]+(?> [^>]*)?) ?/?>#s',array($this,'parse_empty_attribs'),$html);
    # fake wrapper
    $html = '<wrapper>'.$html.'</wrapper>';
    # footnotes
    $html = preg_replace_callback('#<div class="footnotes">\s*<hr />\s*<ol>\s*(<li id="fn:\d+">.+</li>)\s*</ol>\s*</div>#Us',array($this,'footnotes'),$html);
    # last newline inside <pre> should not be parsed
    $html = preg_replace('#\n</code></pre>#s','</code></pre>',$html);
    # some html elements should not be parsed if their children wont be parsed:
    if($this->KEEP_HTML){
        # <ul|ol><li class="asdf">, complex because we need to handle nested lists
        if(preg_match('#<li [^>]+>#',$html)){
            preg_match_all('#(?:<li [^>]+>|</?(?:ul|ol)[^>]*>)#',$html,$matches,PREG_OFFSET_CAPTURE);
            $ins = ' forcehtml="1"';
            # stack of the lists that are open at the current position
            $lists = array();
            # positions (in the unmodified string) that receive the marker
            $inserts = array();
            foreach($matches[0] as $match){
                $text = $match[0];
                $pos = $match[1];
                if(substr($text,0,3) == '<li'){
                    # an <li> with attributes forces its list to stay HTML;
                    # a stray <li> outside of any list is left alone
                    $top = count($lists)-1;
                    if($top >= 0 && !$lists[$top]['forced']){
                        $lists[$top]['forced'] = true;
                        $inserts[] = $lists[$top]['offset'];
                    }
                } elseif(substr($text,0,2) == '</'){ # close tag
                    array_pop($lists);
                } else { # open tag
                    array_push($lists,array(
                        'offset' => $pos+3, # right behind "<ul" / "<ol"
                        'forced' => strstr($text,'forcehtml=') !== false,
                    ));
                }
            }
            # insert back to front so that earlier positions stay valid, no
            # matter in which order the lists were marked
            rsort($inserts);
            foreach($inserts as $pos){
                $html = substr_replace($html,$ins,$pos,0);
            }
        }
        # <pre><code class="asdf">
        $html = preg_replace('#(?><pre>)\s*(<code .+>)#Us','<pre forcehtml="1">$1',$html);
    }
    $this->html = $html;
    # ok, now lets start parsing!
    $this->parse();
    return $this->close();
}
/**
 * Callback of load_string(): rewrite a footnote list into
 * <footnotes><footnote nr="N">...</footnote>...</footnotes>.
 *
 * @param array $matches regex match, index 1 holds the <li> elements
 * @return string
 */
function footnotes($matches){
    $items = $matches[1];
    # drop the "back to text" links
    $items = preg_replace('@<a href="#fnref:\d+" rev="footnote"[^>]*>&amp;#8617;</a>@U','',$items);
    # paragraphs that only held such a link are empty now
    $items = str_replace('<p></p>','',$items);
    # <li id="fn:1">...</li> -> <footnote nr="1">...</footnote>
    $wrapped = str_replace('<li id="fn:','<footnote nr="','<footnotes>'.$items.'</footnotes>');
    # only an </li> directly in front of the next footnote (or the end)
    # closes a footnote
    return preg_replace('#</li>\s*(<footnote|</footnotes)#s','</footnote>$1',$wrapped);
}
/**
 * Callback of load_string(): give attributes without a value
 * (e.g. <input checked>) an XML conforming form.
 *
 * With KEEP_HTML the attribute becomes name="name", otherwise it is
 * removed. Attributes that already carry a value are left untouched.
 *
 * The previous version printed debug output and called die() here, and
 * read a capture group that the calling pattern never defines.
 *
 * @param array $matches regex match, index 0 holds the complete tag
 * @return string the repaired tag
 */
function parse_empty_attribs($matches){
    $tag = $matches[0];
    if(preg_match('#^<[a-z]+(?: [a-z]+=(?:"[^"]*"|\'[^\']*\'))+ ?/?>$#s',$tag)){
        # mismatch, this tag is correct
        return $tag;
    }
    # split into tag name, attribute string and optional self closing slash
    if(!preg_match('#^<([a-z]+)(.*?)\s*(/?)>$#s',$tag,$parts)){
        # not a single well delimited tag, better leave it alone
        return $tag;
    }
    $keep = $this->KEEP_HTML;
    $attrs = preg_replace_callback(
        '#\s+([^\s=/>"\']+)(\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>]+))?#s',
        function ($a) use ($keep) {
            if(isset($a[2]) && $a[2] !== ''){
                # attribute has a value already
                return $a[0];
            }
            return $keep ? ' '.$a[1].'="'.$a[1].'"' : '';
        },
        $parts[2]
    );
    return '<'.$parts[1].$attrs.($parts[3] !== '' ? ' /' : '').'>';
}
/**
 * Convert the contents of an HTML file to Markdown.
 *
 * @param string $file path (or URL) readable by file_get_contents()
 * @return string|false the Markdown text, false if the file could not be read
 */
function load_file($file) {
    $contents = file_get_contents($file);
    # only a real read failure is an error; an empty file (or one that just
    # contains "0") is valid input and handled by load_string()
    if($contents === false){
        trigger_error('could not open XML input',E_USER_WARNING);
        return false;
    }
    return $this->load_string($contents);
}
/**
 * Feed the prepared HTML line by line to the XML parser.
 *
 * A parse error is reported as E_USER_WARNING; parsing then continues
 * with the next line.
 *
 * @param void
 * @return void
 */
function parse() {
    foreach (explode("\n", $this->html) as $line) {
        if (xml_parse($this->xml_parser, $line . "\n")) {
            continue;
        }
        $code = xml_get_error_code($this->xml_parser);
        $message = sprintf(
            "XML error #%d: %s at line %d:<br /><pre><code>%s</code></pre>",
            $code,
            xml_error_string($code),
            xml_get_current_line_number($this->xml_parser),
            htmlspecialchars($line)
        );
        trigger_error($message, E_USER_WARNING);
    }
}
/**
 * Release the parser, flush pending output and return the final text.
 *
 * @param void
 * @return string
 */
function close() {
    xml_parser_free($this->xml_parser);

    # flush whatever is still pending, then the remaining links
    $this->pbr();
    $this->o('', false, 'end');
    $this->out("\n");
    $this->links();

    $text = $this->outtext;
    # blockquotes: "> > > " becomes ">>> "
    $text = preg_replace_callback('#^(\s*)((> )+)#m',array($this,'cleanup_bq'),$text);
    # decode the entities used during parsing (replaced one after another,
    # &amp; last)
    $text = str_replace(array('&gt;','&lt;','&amp;'),array('>','<','&'),$text);
    # empty lines (not preformatted)
    $text = preg_replace('#^\s{1,4}$#m','',$text);
    # empty quoted lines
    $text = preg_replace('#^(>+)\s{1,5}$#m','$1',$text);
    $this->outtext = $text;

    return rtrim($this->optwrap($this->outtext));
}
/**
 * Callback of close(): replace "> > > " with ">>> ".
 *
 * @param array $m matches: 1 = leading whitespace, 2 = the run of "> "
 * @return string
 */
function cleanup_bq($m){
    # every quote level takes two characters in the matched run
    $levels = strlen($m[2])/2;
    $markers = str_repeat('>',$levels);
    return $m[1].$markers.' ';
}
/**
 * Default handler of the XML parser: HTML comments are copied to the
 * output as they are, everything else is ignored.
 *
 * @param resource $parser
 * @param string $data
 * @return void
 */
function handle_default($parser,$data){
    $is_comment = substr($data,0,4) == '<!--' && substr($data,-3) == '-->';
    if(!$is_comment){
        return;
    }
    $this->outtext .= "\n\n".$data."\n";
}
/**
 * Character data handler of the XML parser (e.g. the DATA of <p>DATA</p>);
 * passes the text on to o() flagged as pure data.
 *
 * @param resource $parser
 * @param string $data
 * @return void
 */
function handle_data($parser, $data) {
    $is_pure_data = true;
    $this->o($data, $is_pure_data);
}
/**
 * Element handler of the XML parser for opening tags (e.g. <p>).
 *
 * @param resource $parser
 * @param string $tag
 * @param array $attrs
 * @return void
 */
function starttag($parser, $tag, $attrs) {
    $is_start = true;
    $this->handle_tag($tag, $attrs, $is_start);
}
/**
 * Element handler of the XML parser for closing tags (e.g. </p>);
 * closing tags carry no attributes.
 *
 * @param resource $parser
 * @param string $tag
 * @return void
 */
function endtag($parser, $tag) {
    $is_start = false;
    $this->handle_tag($tag, null, $is_start);
}
/**
 * Switch to raw HTML output and remember which element started it.
 *
 * Besides the tag name the current nesting depth of that tag is stored,
 * which handle_tag() compares against when a closing tag arrives.
 *
 * @param string $tag
 * @return void
 */
function force_html($tag){
    $depth = 0;
    if(isset($this->parents[$tag])){
        $depth = strlen($this->parents[$tag]);
    }
    $this->force_html = true;
    $this->force_html_start = array(
        'tag' => $tag,
        'parents' => $depth
    );
}
/**
 * Parsing logic based on the tag name.
 *
 * Called for every opening and closing tag. Depending on the tag and the
 * KEEP_HTML option the tag is dropped, ignored, kept as HTML (keep_tag())
 * or converted to its Markdown counterpart. Every tag that was converted
 * is recorded in $this->parents with the mark '1'.
 *
 * Changes compared to the previous version:
 *  - column alignment, column width and parent marks are only read when
 *    they exist (tables without <th>/align raised "undefined key" errors)
 *  - block indentation uses four spaces and <br /> two trailing spaces,
 *    which is what Markdown requires for code blocks, nested block content
 *    and hard line breaks
 *
 * @param string $tag
 * @param array $attrs attributes of an opening tag, null for closing tags
 * @param bool $start true for an opening tag
 * @return void
 */
function handle_tag($tag, $attrs, $start) {
    # indentation of code blocks and of content nested in list items,
    # definitions and footnotes
    $block_indent = '    ';

    if(in_array($tag,$this->drop)){ # drop tags with content
        if($start){
            $this->buffer();
        } else {
            # the buffered content is thrown away
            $this->unbuffer();
        }
        return;
    }
    if(in_array($tag,$this->ignore)){ # drop tags but keep content
        return;
    }
    # keeping the original html
    if($this->KEEP_HTML){
        if($start){
            # is the force html attr set?
            if(!$this->force_html && isset($attrs['forcehtml'])){
                $this->force_html($tag);
            }
            # we'll have to keep this tag
            if($this->force_html) {
                $this->keep_tag($tag,$attrs,$start,true);
                return;
            } else {
                # tag has attrs which can't be converted
                if(!empty($attrs) && $this->keep_tag($tag,$attrs,$start)){
                    return;
                }
            }
        } else {
            if($this->force_html){
                $this->keep_tag($tag,$attrs,$start,true);
                # the element that switched raw HTML on has been closed
                if($tag == $this->force_html_start['tag'] && strlen($this->parents[$tag]) == $this->force_html_start['parents']){
                    $this->force_html = false;
                }
                return;
            } elseif($this->parent($tag,'kept') && $this->keep_tag($tag,$attrs,$start)) {
                return;
            }
        }
    }
    switch ($tag) {
        case 'h1' :
        case 'h2' :
        case 'h3' :
        case 'h4' :
        case 'h5' :
        case 'h6' :
            $this->p();
            if ($start) {
                $this->o(str_repeat('#', intval($tag[1])) . ' ');
                if(!empty($attrs['id'])){
                    # header id (Markdown Extra), printed when the heading closes
                    $this->append = ' {#'.$attrs['id'].'}';
                }
            } else {
                $this->out($this->append);
                $this->append = '';
            }
            break;
        case 'div' :
            $this->p();
            break;
        case 'p' :
            $this->p();
            break;
        case 'br' :
            if ($start) {
                # two trailing spaces make a hard line break
                $this->o("  \n");
            }
            break;
        case 'hr' :
            if ($start) {
                $this->p();
                $this->o('* * *');
                $this->p();
            }
            break;
        case 'blockquote' :
            $this->indent('> ',$start);
            if ($start) {
                $this->start = true;
                $this->out("\n\n".$this->indent);
            }
            break;
        case 'em' :
        case 'i' :
        case 'u' :
            $this->o('_');
            break;
        case 'strong' :
        case 'b' :
            $this->o('**');
            break;
        # footnotes
        case 'sup':
            if($start){
                if(count($attrs) != 1 || !isset($attrs['id']) || !preg_match('#^fnref:(\d+)$#',$attrs['id'],$matches)){
                    # keep tag
                    $this->keep_tag($tag,$attrs,$start,true);
                    return;
                }
                # parse footnote
                $this->out('[^'.$matches[1].']');
                # omit output of link (<a href="#fn:1" rel="footnote">1</a>)
                $this->buffer();
            } else {
                # last sup was not parsed -> keep tag
                if(!$this->parent('sup')){
                    $this->keep_tag($tag,$attrs,$start);
                    return;
                }
                # sup was parsed -> reset buffer
                $this->unbuffer();
            }
            break;
        case 'footnotes':
            $this->p();
            break;
        case 'footnote':
            if($start){
                $this->o('[^'.$attrs['nr']."]:\n".$this->indent.$block_indent);
                $this->start = true;
            }
            $this->indent($block_indent,$start);
            break;
        case 'a':
            if($start) {
                # buffer to check for inline links like <foo@bar.com> and the like
                if (isset ($attrs['href'])) {
                    $this->buffer();
                    array_push($this->astack, $attrs);
                } else {
                    array_push($this->astack, null);
                }
            } else {
                if($this->astack) {
                    $a = array_pop($this->astack);
                    if ($a) {
                        # for emails
                        $a['href'] = $this->decode($a['href']);
                        $buffer = $this->unbuffer();
                        $buffer_check = $this->decode(trim($buffer));
                        if((substr($a['href'],0,7) == 'mailto:' && 'mailto:'.$buffer_check == $a['href']) || $a['href'] == $buffer_check){
                            # inline link
                            $this->out('<'.$buffer_check.'>',true);
                        } else {
                            # block link
                            $this->previousIndex($a);
                            $this->out('['.$buffer.']['.$a['count'].']',true);
                        }
                    }
                }
            }
            break;
        # abbreviations
        case 'abbr':
        case 'acronym':
            if($start){
                $this->buffer();
                array_push($this->abbrs,isset($attrs['title'])?$attrs['title']:'');
            } else {
                $abbr = $this->unbuffer();
                $def = array_pop($this->abbrs);
                # only add abbr if its not already defined
                if(!isset($this->abbrs[$abbr])){
                    $this->abbrs[$abbr] = $def;
                }
                $this->o($abbr);
            }
            break;
        case 'img' :
            if ($start) {
                if (isset ($attrs['src'])) {
                    # images share the reference list with links
                    $attrs['href'] = $attrs['src'];
                    $alt = '';
                    if (isset ($attrs['alt'])) {
                        $alt = $attrs['alt'];
                    } elseif(isset($attrs['title'])){
                        $alt = $attrs['title'];
                    }
                    $this->previousIndex($attrs);
                    $this->o('!['.$alt.'][' . $attrs['count'] . ']');
                }
            }
            break;
        case 'code':
            # do we have to keep this tag?
            # or is a parent <pre> element existing?
            if($this->keep_tag($tag,$attrs,$start) || $this->parent('pre')){
                return;
            }
            # convert to `code` and handle backticks inside code block
            # <code>foo`bar</code> has to get ``foo`bar`` and so forth
            if($start){
                $this->buffer();
            } else {
                $str = $this->unbuffer();
                preg_match_all('#`+#',$str,$matches);
                if(!empty($matches[0])){
                    # longest run of backticks first
                    rsort($matches[0]);
                    $len = strlen($matches[0][0])+1;
                } else {
                    $len = 1;
                }
                $ticks = str_repeat('`',$len);
                $this->out($ticks.$str.$ticks);
            }
            break;
        case 'dl' :
            # note: if <dl> gets parsed, its direct children (<dd> and <dt>) will be parsed as well
            if ($start) {
                $this->p();
            }
            break;
        case 'dd' :
            # is the parent dl parsed?
            if(!$this->parent('dl')){
                $this->keep_tag($tag,$attrs,$start,true);
                return;
            }
            if ($start) {
                $this->o(': ');
                $this->start = true;
            } else {
                $this->outtext .= "\n";
                $this->pbr();
            }
            $this->indent($block_indent,$start);
            break;
        case 'dt' :
            # is the parent dl parsed?
            if(!$this->parent('dl')){
                $this->keep_tag($tag,$attrs,$start,true);
                return;
            }
            if (!$start) {
                $this->pbr();
            }
            break;
        case 'ol' :
        case 'ul' :
            # note: if this element gets parsed, its direct children <li>s will be parsed as well
            if ($start) {
                array_push($this->list, array (
                    'name' => $tag,
                    'num' => 0
                ));
            } else {
                array_pop($this->list);
                $this->pbr();
            }
            break;
        case 'li' :
            if ($this->list) {
                # reference, the item counter of the list is changed below
                $li = &$this->list[count($this->list) - 1];
            }
            # not inside a list or the list tag was not parsed
            if(!isset($li) || !$this->parent($li['name'])){
                $this->keep_tag($tag,$attrs,$start,true);
                return;
            }
            if ($start) {
                $this->pbr();
                if($li['name'] == 'ul'){
                    $this->o('* ');
                } else {
                    $li['num']++;
                    /**
                     * @todo line up <ol><li>s > 9 correctly.
                     */
                    $this->o($li['num'].'. ');
                }
                $this->start = true;
                $this->indent($block_indent,$start);
            } else {
                $this->indent($block_indent,$start);
            }
            break;
        case 'table':
            # NOTE: if the <table> tag gets parsed, all its children will be as well!
            #
            # finally: parse the whole table
            if(!$start){
                $this->outtext .= "\n\n";
                $separator = array();
                # separator with correct align identifiers
                foreach($this->cols as $col => $arr){
                    $this->max_len[$col] = max($arr);
                    $left = $right = '';
                    # columns without <th>/align have no alignment entry
                    $align = isset($this->align[$col]) ? $this->align[$col] : null;
                    switch($align){
                        case 'center':
                            $right = ':';
                            # intended fall through: centered needs both colons
                        case 'left':
                            $left = ':';
                            break;
                        case 'right':
                            $right = ':';
                            break;
                    }
                    array_push($separator,$left.str_repeat('-',$this->max_len[$col]).$right);
                }
                $separator = '| '.implode(' | ',$separator).' |';
                # set equal width
                array_walk($this->rows,array($this,'fill_td'));
                $rows = $this->rows;
                foreach($rows as $row => $cols){
                    $this->pbr();
                    $this->o('| '.implode(' | ',$cols).' |');
                    if(in_array($row,$this->header)){
                        $this->pbr();
                        $this->o($separator);
                    }
                }
                $this->cols = array();
                $this->rows = array();
                $this->align = array();
                $this->pbr();
            }
            break;
        case 'tr':
            # not inside a table or the parent table was not parsed
            if(!$this->parent('table')){
                $this->keep_tag($tag,$attrs,$start,true);
                return;
            }
            if($start){
                $this->row++;
            } else {
                $this->col = 0;
            }
            break;
        case 'th':
            # not inside a table or the parent table was not parsed
            if(!$this->parent('table')){
                $this->keep_tag($tag,$attrs,$start,true);
                return;
            }
            if($start){
                if(!in_array($this->row,$this->header)){
                    array_push($this->header,$this->row);
                }
                $this->col++;
                $this->align[$this->col] = !empty($attrs['align']) ? $attrs['align'] : null;
            }
            break;
        case 'td':
            # not inside a table or the parent table was not parsed
            if(!$this->parent('table')){
                $this->keep_tag($tag,$attrs,$start,true);
                return;
            }
            if($start){
                $this->col++;
                # the first cell with an align attribute defines the column
                # alignment (isset() is false for missing and for null entries)
                if(!empty($attrs['align']) && !isset($this->align[$this->col])){
                    $this->align[$this->col] = $attrs['align'];
                    if($attrs['align'] == 'center'){
                        if(!isset($this->max_len[$this->col])){
                            $this->max_len[$this->col] = 0;
                        }
                        $this->max_len[$this->col] +=2;
                    }
                }
            }
            break;
        case 'pre':
            $this->indent($block_indent,$start,true);
            if ($start) {
                $this->pbr();
            }
            break;
        default:
            $this->keep_tag($tag,$attrs,$start,true);
            return;
    }
    # if we want to keep all non convertible html this function has to know if some parent elements
    # were parsed or not (also some elements need to know if)
    if($start){
        if(!isset($this->parents[$tag])){
            $this->parents[$tag] = '1';
        } else {
            $this->parents[$tag] .= '1';
        }
    } else {
        if($this->LINKS_EACH_PARAGRAPH && in_array($tag,array('p','ul','blockquote','ol','dl','table','h1','h2','h3','h4','h5','h6'))){
            $this->links();
        }
        # a closing tag without a recorded opening tag has nothing to pop
        if(isset($this->parents[$tag])){
            $this->parents[$tag] = substr($this->parents[$tag],0,-1);
        }
    }
    return;
}
/**
 * Low level output: append a string to the active buffer, to the current
 * table cell or to $this->outtext - in that order of precedence.
 *
 * @param string $str
 * @return void
 */
function out($str) {
    # buffering
    if($this->buffer_lvl){
        $this->buffer[$this->buffer_lvl] .= $str;
        return;
    }

    $inside_cell = $this->parent('th') || $this->parent('td');
    if(!$inside_cell){
        $this->outtext .= $str;
        return;
    }

    # this is for tables (see php markdown extra by michel fortin):
    # collect the cell text and its length, the table is rendered once
    # it is closed
    $str = trim($str);
    $r = $this->row;
    $c = $this->col;
    if(isset($this->rows[$r][$c])){
        $this->rows[$r][$c] .= $str;
    } else {
        $this->rows[$r][$c] = $str;
    }
    $length = strlen($str);
    if(isset($this->cols[$c][$r])){
        $this->cols[$c][$r] += $length;
    } else {
        $this->cols[$c][$r] = $length;
    }
}
/**
 * High level output: takes care of pending newlines, indentation and
 * whitespace before the data is handed to out().
 *
 * Emptiness is tested with strict comparisons. The former truthiness
 * tests (!$data) also discarded the text "0", e.g. the content of
 * <td>0</td>.
 *
 * @param string $data
 * @param bool $puredata true for character data coming from the document;
 * its whitespace is collapsed unless it is inside <code> or <pre>
 * @param string $force pass 'end' to flush at the end of the document;
 * any true value lets empty data through
 * @return void
 */
function o($data, $puredata = false, $force = false) {
    if($this->parent('table') && trim($data) == ''){ # drop whitespaces inside tables
        return;
    } elseif ($puredata && !$this->parent('code','both') && !$this->parent('pre','both')) { # keep whitespace for code
        $data = preg_replace('#\s+#', ' ', $data);
    }
    $is_empty = ($data === '' || $data === null || $data === false);
    if ($is_empty && !$force) {
        return;
    }
    if (!empty($this->indent)) {
        $data = str_replace("\n", "\n".$this->indent, $data);
    }
    if ($this->start) {
        # a lone blank at the very beginning of a block is dropped
        if($data == ' '){
            return;
        }
        $this->p_p = 0;
        $this->start = 0;
    }
    if ($force == 'end') {
        # It's the end.
        $this->p_p = 0;
        $this->out("\n");
    }
    if ($this->p_p) {
        if($data == ' '){
            return;
        }
        # emit the newlines that are still owed, the data starts the new line
        $data = ltrim($data);
        $this->out(str_repeat("\n".$this->indent, $this->p_p));
    }
    $this->p_p = 0;
    $this->out($data);
    if($data !== '' && $data !== null && $data !== false){
        $this->lastWasNL = substr($data, -1) == "\n";
    }
}
/**
 * Print the collected reference links (after the pending abbreviation
 * definitions) and forget them afterwards.
 *
 * @param void
 * @return void
 */
function links(){
    $this->abbrs();
    if(empty($this->a)){
        return; # no links stored
    }
    $this->out("\n\n");
    /**
     * @todo base href
     */
    foreach($this->a as $same_href){
        foreach($same_href as $link){
            # one definition per line, the title is optional
            $line = ' [' . $link['count'] . ']: ' . $link['href'];
            if (isset ($link['title'])) {
                $line .= ' (' . $link['title'] . ')';
            }
            $this->out($line."\n");
        }
    }
    $this->a = array();
    $this->out("\n");
    $this->lastWasNL = true;
}
/**
 * Print the collected abbreviation definitions ("*[abbr]: definition")
 * and forget them afterwards.
 *
 * @param void
 * @return void
 */
function abbrs(){
    if(empty($this->abbrs)){
        return; # no abbrs stored
    }
    $this->out("\n\n");
    foreach($this->abbrs as $short => $long){
        $definition = '*['.$short.']: '.$long;
        $this->out($definition."\n");
    }
    $this->abbrs = array();
    $this->out("\n");
    $this->lastWasNL = true;
}
/**
 * Give a link its reference number: an already collected link with the
 * same href and title is reused, otherwise $this->acount is increased and
 * the link is stored.
 *
 * Titles are only read when they exist; comparing a link with title to
 * one without used to access an undefined array key.
 *
 * @param array &$attrs link attributes, needs 'href'; 'count' is set on return
 * @return void
 */
function previousIndex(&$attrs) {
    $href = $attrs['href'];
    # check for existing link
    if(isset($this->a[$href])){
        $title = isset($attrs['title']) ? $attrs['title'] : '';
        foreach($this->a[$href] as $a){
            $known_title = isset($a['title']) ? $a['title'] : '';
            if (!empty($title) || !empty($known_title)){
                # titled links only match links with the same title
                if($known_title == $title) {
                    $attrs = $a;
                    return;
                }
            } else {
                $attrs = $a;
                return;
            }
        }
    }
    # if we come here, no matching link was found
    $this->acount++;
    $attrs['count'] = $this->acount;
    if(isset($this->a[$href])){
        array_push($this->a[$href],$attrs);
    } else {
        $this->a[$href] = array($attrs);
    }
}
/**
 * Turn escaped tag pairs (&lt;tag ...>...&lt;/tag>) back into real tags to
 * avoid xml parse errors; replace_bad_html() handles every match.
 *
 * NOTE(review): the recursion (?R) inside the pattern is not optional, so
 * the pattern may never match - confirm before relying on this method.
 *
 * @param string $html
 * @return string
 */
function handle_bad_html($html){
    $pattern = '#&lt;([a-z1-6]+)( [^>]*)?>(.*(?R).*)&lt;/\\1>#Us';
    $callback = array($this,'replace_bad_html');
    return preg_replace_callback($pattern,$callback,$html);
}
/**
 * Callback function which is used in handle_bad_html(): rebuilds the tag
 * pair around its (recursively repaired) content.
 *
 * @param array $matches 1 = tag name, 2 = attributes, 3 = content
 * @return string
 */
function replace_bad_html($matches){
    $name = $matches[1];
    # recursion: the content may hold further escaped pairs
    $inner = $this->handle_bad_html($matches[3]);
    $matches[3] = $inner;
    return '<'.$name.$matches[2].'>'.$inner.'</'.$name.'>';
}
/**
 * Wrap the text to BODY_WIDTH characters; widths below 30 switch
 * wrapping off.
 *
 * Every line is wrapped on its own. Continuation lines repeat the
 * indentation (or quote prefix) of the line they belong to; table rows
 * and link definitions are never wrapped.
 *
 * @param string $text
 * @return string
 *
 * @todo wrapping of code (also kept code blocks)
 */
function optwrap($text) {
    $width = $this->BODY_WIDTH;
    if ($width < 30) {
        return $text;
    }
    $wrapped = '';
    foreach (explode("\n", $text) as $para) {
        if (strlen($para) == 0) {
            # keep empty lines
            $wrapped .= "\n";
            continue;
        }
        if (preg_match('#^(\s*): #',$para,$m)) { # definition lists
            $lead = isset($m[1]) ? $m[1] : '';
            $wrapped .= wordwrap($para, $width - strlen($lead) - 4, "\n".$lead.' ')."\n";
        } elseif(preg_match('#^(\s*>+)#',$para,$m)){ # blockquote
            $quote = $m[0];
            $wrapped .= wordwrap($para,$width - (strlen($quote)+1),"\n".$quote.' ')."\n";
        } elseif(preg_match('#^\s*\|#',$para)){ # table
            $wrapped .= $para."\n"; # dont wrap
        } elseif(preg_match('#^(\s*)\*#',$para,$m)) { # list item @todo: ol
            $lead = (isset($m[1]) ? $m[1] : '').' ';
            $wrapped .= wordwrap($para,$width - strlen($lead),"\n".$lead). "\n";
        } elseif(preg_match('#^ \[[^\]]+\]:#',$para)){ # block links
            # don't wrap at the moment
            $wrapped .= $para."\n";
        } else { # something else
            preg_match('#^\s+#',$para,$m);
            $lead = isset($m[0]) ? $m[0] : '';
            $wrapped .= wordwrap($para,$width - strlen($lead),"\n".$lead). "\n";
        }
    }
    return $wrapped;
}
/**
 * Output a tag as HTML instead of converting it to Markdown.
 *
 * Without $force the tag is only kept if KEEP_HTML is set and, for an
 * opening tag, if it carries an attribute Markdown cannot express (for a
 * closing tag: if its opening tag was kept). Kept opening tags are
 * recorded in $this->parents with the mark '2'. div, center, li, dt and
 * dd get markdown="1" when they are kept without $force; any other kept
 * tag switches raw HTML mode on.
 *
 * @param string $tag
 * @param array $attrs
 * @param bool $start true for an opening tag
 * @param bool $force keep the tag unconditionally
 * @return bool true if the tag was written to the output
 */
function keep_tag($tag,$attrs,$start,$force = false){
    if(!$force && !$this->KEEP_HTML){
        return false;
    }

    if($start){
        # start tag: if there is an attr which cannot be handled by
        # markdown this tag will be kept
        $known_attrs = isset($this->has_attrs[$tag]) ? $this->has_attrs[$tag] : array();
        if(!$force && count($known_attrs) >= count($attrs)
            && (empty($attrs) || count(array_diff(array_keys($attrs),$known_attrs)) == 0)){
            # tag can be handled by markdown!
            return false;
        }
        $attr = '';
        if(!empty($attrs)){
            foreach($attrs as $key => $value){
                # internal marker, never part of the output
                if($key != 'forcehtml'){
                    $attr.=' '.$key.'="'.$value.'"';
                }
            }
        }
        if(!$force && in_array($tag,array('div','center','li','dt','dd'))){
            $attr.= ' markdown="1"';
        } elseif(!$this->force_html) {
            $this->force_html($tag);
        }
        $this->o('<'.$tag.$attr.'>',true);
        # add to list of parents:
        if(isset($this->parents[$tag])){
            $this->parents[$tag] .= '2';
        } else {
            $this->parents[$tag] = '2';
        }
    } else {
        # close tag
        if(!$force && !$this->parent($tag,'kept')){
            # the start tag of this element was not parsed
            return false;
        }
        $this->o('</'.$tag.'>');
        $this->parents[$tag] = substr($this->parents[$tag],0,-1);
        # newlines after </tag>
        if(in_array($tag,array('th','td','dt','dd','li','p'))){
            $this->o("\n");
        }
    }

    # newlines after <tag> and </tag>
    if(in_array($tag,array('div','center','table','tr','ul','ol','dl','pre'))){
        $this->o("\n");
    }
    return true;
}
/**
 * array_walk() callback used when a table is closed: pads every cell of a
 * row to the width of its column.
 *
 * Aligned columns are wider than their longest cell (one character for
 * left/right, two for center) to make room for the colons of the
 * separator line. Columns without alignment information are padded like
 * left aligned text without that extra character; the alignment is only
 * read when it exists (it used to raise an "undefined key" error).
 *
 * @param array &$row cells of one row, indexed by column
 * @return void
 */
function fill_td(&$row){
    foreach($row as $col => $cont){
        $width = isset($this->max_len[$col]) ? $this->max_len[$col] : strlen($cont);
        $align = isset($this->align[$col]) ? $this->align[$col] : null;
        switch($align){
            case 'center':
                $width += 2;
                $row[$col] = str_pad($cont,$width,' ',STR_PAD_BOTH);
                break;
            case 'left':
                $width++;
                # intended fall through: left aligned cells are padded on the right
            default:
                $row[$col] = str_pad($cont,$width,' ');
                break;
            case 'right':
                $width++;
                $row[$col] = str_pad($cont,$width,' ',STR_PAD_LEFT);
                break;
        }
    }
}
/**
 * Some sort of <br />: request one newline before the next output,
 * unless newlines are pending already.
 *
 * @param void
 * @return void
 */
function pbr() {
    if ($this->p_p != 0) {
        return;
    }
    $this->p_p = 1;
}
/**
 * Text <p>: request two newlines before the next output. Does nothing
 * inside of a parsed table.
 *
 * @param void
 * @return void
 */
function p() {
    if(!$this->parent('table')){
        $this->p_p = 2;
    }
}
/**
 * Add $indent to (opening tag) or remove it from (closing tag) the prefix
 * that is put in front of each output line.
 *
 * @param string $indent
 * @param bool $start whether it's an opening tag or a closing one
 * @param bool $output shall $indent be outputted? (only if $start is true)
 * @return void
 */
function indent($indent,$start,$output=false){
    if($start){
        if($output){
            $this->o($indent);
        }
        $this->indent .= $indent;
        return;
    }
    # closing tag: cut as many characters off the end as were added
    $cut = strlen($indent);
    $this->indent = ($cut >= strlen($this->indent)) ? '' : substr($this->indent,0,-$cut);
}
/**
 * Check whether an element of the given name is currently open.
 *
 * With 'parsed' or 'kept' only the innermost open element of that name
 * counts: it must have been converted to Markdown (mark '1') or kept as
 * HTML (mark '2'). 'both' accepts either.
 *
 * @param string $parent name of the parent tag
 * @param string $type either 'parsed' or 'kept' or 'both'
 * @return bool
 */
function parent($parent,$type = 'parsed'){
    if(!isset($this->parents[$parent])){
        return false;
    }
    $marks = $this->parents[$parent];
    if($type == 'both'){
        return !empty($marks);
    }
    $wanted = ($type == 'parsed') ? '1' : '2';
    return substr($marks,-1) === $wanted;
}
/**
 * Start a new output buffer; out() collects everything in it until
 * unbuffer() is called. Pending newlines are written first so that they
 * do not end up inside of the buffer.
 *
 * @param void
 * @return void
 */
function buffer(){
    $pending = $this->p_p;
    if($pending){
        $this->out(str_repeat("\n".$this->indent, $pending));
        $this->p_p = 0;
    }
    $level = ++$this->buffer_lvl;
    $this->buffer[$level] = '';
}
/**
 * End the innermost buffer and return what was collected in it.
 *
 * @param void
 * @return string
 */
function unbuffer(){
    $level = $this->buffer_lvl;
    $collected = $this->buffer[$level];
    unset($this->buffer[$level]);
    $this->buffer_lvl = $level - 1;
    return $collected;
}
/**
 * Decode HTML entities, used to compare (obfuscated) e-mail addresses and
 * URLs with the text of their link.
 *
 * Named entities are decoded as ISO-8859-1; numeric entities that are
 * left over afterwards are turned into single bytes with chr().
 *
 * The numeric entities used to be replaced with the /e modifier of
 * preg_replace(), which no longer exists in PHP 7.0 and made this method
 * return NULL. Callbacks do the same job now.
 *
 * @author derernst@gmx.ch <http://www.php.net/manual/en/function.html-entity-decode.php#68536>
 *
 * @param string $text
 * @param int $quote_style one of the ENT_* quote constants
 * @return string
 */
function decode($text,$quote_style = ENT_NOQUOTES){
    $text = html_entity_decode($text, $quote_style, 'ISO-8859-1'); // NOTE: UTF-8 does not work!
    # hexadecimal entities, e.g. &#x40;
    $text = preg_replace_callback('~&#x([0-9a-f]+);~i', function ($m) {
        return chr((int) hexdec($m[1]));
    }, $text);
    # decimal entities, e.g. &#64;
    $text = preg_replace_callback('~&#([0-9]+);~', function ($m) {
        return chr((int) $m[1]);
    }, $text);
    return $text;
}