Fixing file upload params ($_FILES) normalization. Closes #75
[akelos.git] / vendor / TextParsers / markdown.php
blob49b418678370010b1edfcfbf7ef3ed2cd61d10e2
1 <?php
3 # Markdown - A text-to-HTML conversion tool for web writers
5 # PHP Markdown
6 # Copyright (c) 2004-2006 Michel Fortin
7 # <http://www.michelf.com/projects/php-markdown/>
9 # Original Markdown
10 # Copyright (c) 2004-2006 John Gruber
11 # <http://daringfireball.net/projects/markdown/>
15 define( 'MARKDOWN_VERSION', "1.0.1e" ); # Thu 28 Dec 2006
19 # Global default settings:
22 # Change to ">" for HTML output
# Used as the default value of Markdown_Parser::$empty_element_suffix, which
# is appended when the parser emits <hr, <br and <img tags.
23 define( 'MARKDOWN_EMPTY_ELEMENT_SUFFIX', " />");
25 # Define the width of a tab for code blocks.
# Used as the default value of Markdown_Parser::$tab_width.
26 define( 'MARKDOWN_TAB_WIDTH', 4 );
30 # WordPress settings:
33 # Change to false to remove Markdown from posts and/or comments.
34 define( 'MARKDOWN_WP_POSTS', true );
35 define( 'MARKDOWN_WP_COMMENTS', true );
39 ### Standard Function Interface ###
# Name of the class that Markdown() instantiates the first time it is called.
41 define( 'MARKDOWN_PARSER_CLASS', 'Markdown_Parser' );
function Markdown($text) {
	# Convert Markdown-formatted $text to HTML and return the result.
	#
	# A single parser object is shared by all calls: it is created on the
	# first call from the class named by MARKDOWN_PARSER_CLASS and reused
	# afterwards.
	static $parser = null;

	if ($parser === null) {
		$class_name = MARKDOWN_PARSER_CLASS;
		$parser = new $class_name;
	}

	# Delegate the actual conversion to the parser.
	return $parser->transform($text);
}
60 # Markdown Parser Class
63 class Markdown_Parser {
65 # Regex to match balanced [brackets].
66 # Needed to insert a maximum bracked depth while converting to PHP.
67 var $nested_brackets_depth = 6;
68 var $nested_brackets;
70 # Table of hash values for escaped characters:
71 var $escape_chars = '\`*_{}[]()>#+-.!';
72 var $escape_table = array();
73 var $backslash_escape_table = array();
75 # Change to ">" for HTML output.
76 var $empty_element_suffix = MARKDOWN_EMPTY_ELEMENT_SUFFIX;
77 var $tab_width = MARKDOWN_TAB_WIDTH;
function __construct() {
	# Constructor. Initializes the member variables the parser relies on.
	#
	# This is declared as __construct() because PHP 8 no longer treats a
	# method named after its class as a constructor; without it none of
	# the setup below would run on `new Markdown_Parser`.
	$this->_initDetab();

	# Build the regex fragment matching balanced [brackets], nested up to
	# $nested_brackets_depth levels.
	$this->nested_brackets =
		str_repeat('(?>[^\[\]]+|\[', $this->nested_brackets_depth).
		str_repeat('\])*', $this->nested_brackets_depth);

	# Map every escapable character, and its backslash-escaped form, to
	# the MD5 hash of the character. The preg_split() pattern splits
	# between characters, yielding them one at a time.
	foreach (preg_split('/(?!^|$)/', $this->escape_chars) as $char) {
		$hash = md5($char);
		$this->escape_table[$char] = $hash;
		$this->backslash_escape_table["\\$char"] = $hash;
	}

	# Sort document, block, and span gamut in ascendent priority order.
	asort($this->document_gamut);
	asort($this->block_gamut);
	asort($this->span_gamut);
}

function Markdown_Parser() {
	# Legacy (PHP 4 style) constructor name, kept so existing code that
	# calls it explicitly keeps working. It only forwards to __construct().
	$this->__construct();
}
104 # Internal hashes used during transformation.
105 var $urls = array();
106 var $titles = array();
107 var $html_blocks = array();
108 var $html_hashes = array(); # Contains both blocks and span hashes.
function transform($text) {
	# Main function: convert a whole Markdown document to HTML.
	#
	# The order of the steps below is essential. Link and image
	# substitutions need to happen before
	# _EscapeSpecialCharsWithinTagAttributes(), so that any *'s or _'s in
	# the <a> and <img> tags get encoded.

	# Reset the per-document tables so that definitions and hashes from a
	# previously transformed text cannot leak into this one (e.g. on an
	# index page showing several articles).
	$this->urls = array();
	$this->titles = array();
	$this->html_blocks = array();
	$this->html_hashes = array();

	# Normalize DOS and Mac line endings to Unix ones, then make sure the
	# text ends with a couple of newlines.
	$text = str_replace(array("\r\n", "\r"), "\n", $text) . "\n\n";

	# Convert all tabs to spaces.
	$text = $this->detab($text);

	# Turn block-level HTML blocks into hash entries.
	$text = $this->hashHTMLBlocks($text);

	# Empty out lines made only of spaces and tabs, so that later patterns
	# can match consecutive blank lines with a plain /\n+/.
	$text = preg_replace('/^[ \t]+$/m', '', $text);

	# Run the document gamut methods in their stored order.
	foreach (array_keys($this->document_gamut) as $step) {
		$text = $this->$step($text);
	}

	return $text . "\n";
}
154 var $document_gamut = array(
155 # Strip link definitions, store in hashes.
156 "stripLinkDefinitions" => 20,
158 "runBasicBlockGamut" => 30,
159 "unescapeSpecialChars" => 90,
163 function stripLinkDefinitions($text) {
165 # Strips link definitions from text, stores the URLs and titles in
166 # hash references.
# Every definition matched by the pattern below is passed to
# _stripLinkDefinitions_callback(), and the callback's return value takes
# the definition's place in $text. The resulting text is returned.
# Definitions may be indented by up to tab_width - 1 spaces.
# NOTE(review): the optional title group is closed by `)?` (listing line
# 186) but no matching opener is visible here; this listing appears to have
# dropped a pattern line (listing line 180) -- confirm against upstream.
168 $less_than_tab = $this->tab_width - 1;
170 # Link defs are in the form: ^[id]: url "optional title"
171 $text = preg_replace_callback('{
172 ^[ ]{0,'.$less_than_tab.'}\[(.+)\][ ]?: # id = $1
173 [ \t]*
174 \n? # maybe *one* newline
175 [ \t]*
176 <?(\S+?)>? # url = $2
177 [ \t]*
178 \n? # maybe one newline
179 [ \t]*
181 (?<=\s) # lookbehind for whitespace
182 ["(]
183 (.*?) # title = $3
184 [")]
185 [ \t]*
186 )? # title is optional
187 (?:\n+|\Z)
188 }xm',
189 array(&$this, '_stripLinkDefinitions_callback'),
190 $text);
191 return $text;
function _stripLinkDefinitions_callback($matches) {
	# Record one link definition and remove it from the text.
	#
	# $matches[1] is the link id (stored lower-cased), $matches[2] the URL
	# and $matches[3], when present, the title.
	$id = strtolower($matches[1]);

	$this->urls[$id] = $this->encodeAmpsAndAngles($matches[2]);

	if (isset($matches[3])) {
		# Double quotes are encoded so the title can be used as an
		# attribute value later on.
		$this->titles[$id] = str_replace('"', '&quot;', $matches[3]);
	}

	# The definition itself is replaced by nothing.
	return '';
}
202 function hashHTMLBlocks($text) {
# Replaces raw HTML found in $text with hash keys and returns the text.
# Five passes run in order: nested block-level tags, block-level tags in
# general, <hr>, standalone HTML comments, and <? / <% processor
# instructions. Every match goes through _hashHTMLBlocks_callback().
# NOTE(review): several of the multi-line patterns below have unbalanced
# groups as shown here (e.g. `)*` / `)` closers without visible openers and
# alternatives without `|`); this listing appears to have dropped short
# pattern lines -- confirm against upstream before editing any pattern.
203 $less_than_tab = $this->tab_width - 1;
205 # Hashify HTML blocks:
206 # We only want to do this for block-level HTML tags, such as headers,
207 # lists, and tables. That's because we still want to wrap <p>s around
208 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
209 # phrase emphasis, and spans. The list of tags we're looking for is
210 # hard-coded:
# $block_tags_a (used by the first, nested-block pass) additionally lists
# ins and del; $block_tags_b (second pass) does not.
211 $block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'.
212 'script|noscript|form|fieldset|iframe|math|ins|del';
213 $block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'.
214 'script|noscript|form|fieldset|iframe|math';
216 # Regular expression for the content of a block tag.
# $content is assembled with str_repeat() so the nested-tag alternative is
# repeated $nested_tags_level times.
217 $nested_tags_level = 4;
218 $attr = '
219 (?> # optional tag attributes
220 \s # starts with whitespace
222 [^>"/]+ # text outside quotes
224 /+(?!>) # slash not followed by ">"
226 "[^"]*" # text inside double quotes (tolerate ">")
228 \'[^\']*\' # text inside single quotes (tolerate ">")
232 $content =
233 str_repeat('
235 [^<]+ # content without tag
237 <\2 # nested opening tag
238 '.$attr.' # attributes
242 >', $nested_tags_level). # end of opening tag
243 '.*?'. # last level nested tag content
244 str_repeat('
245 </\2\s*> # closing nested tag
248 <(?!/\2\s*> # other tags with a different name
250 )*',
251 $nested_tags_level);
253 # First, look for nested blocks, e.g.:
254 # <div>
255 # <div>
256 # tags for inner block must be indented.
257 # </div>
258 # </div>
260 # The outermost tags must start at the left margin for this to match, and
261 # the inner nested divs must be indented.
262 # We need to do this before the next, more liberal match, because the next
263 # match will start at the first `<div>` and stop at the first `</div>`.
264 $text = preg_replace_callback('{
265 ( # save in $1
266 ^ # start of line (with /m)
267 <('.$block_tags_a.')# start tag = $2
268 '.$attr.'>\n # attributes followed by > and \n
269 '.$content.' # content, support nesting
270 </\2> # the matching end tag
271 [ \t]* # trailing spaces/tabs
272 (?=\n+|\Z) # followed by a newline or end of document
274 }xm',
275 array(&$this, '_hashHTMLBlocks_callback'),
276 $text);
279 # Match from `\n<tag>` to `</tag>\n`, handling nested tags in between.
281 $text = preg_replace_callback('{
282 ( # save in $1
283 ^ # start of line (with /m)
284 <('.$block_tags_b.')# start tag = $2
285 '.$attr.'> # attributes followed by >
286 '.$content.' # content, support nesting
287 </\2> # the matching end tag
288 [ \t]* # trailing spaces/tabs
289 (?=\n+|\Z) # followed by a newline or end of document
291 }xm',
292 array(&$this, '_hashHTMLBlocks_callback'),
293 $text);
295 # Special case just for <hr />. It was easier to make a special case than
296 # to make the other regex more complicated.
297 $text = preg_replace_callback('{
299 (?<=\n\n) # Starting after a blank line
300 | # or
301 \A\n? # the beginning of the doc
303 ( # save in $1
304 [ ]{0,'.$less_than_tab.'}
305 <(hr) # start tag = $2
306 \b # word break
307 ([^<>])*? #
308 /?> # the matching end tag
309 [ \t]*
310 (?=\n{2,}|\Z) # followed by a blank line or end of document
312 }x',
313 array(&$this, '_hashHTMLBlocks_callback'),
314 $text);
316 # Special case for standalone HTML comments:
317 $text = preg_replace_callback('{
319 (?<=\n\n) # Starting after a blank line
320 | # or
321 \A\n? # the beginning of the doc
323 ( # save in $1
324 [ ]{0,'.$less_than_tab.'}
325 (?s:
326 <!-- .*? -->
328 [ \t]*
329 (?=\n{2,}|\Z) # followed by a blank line or end of document
331 }x',
332 array(&$this, '_hashHTMLBlocks_callback'),
333 $text);
335 /* PHP and ASP-style processor instructions (<? and <%...%>)*/
336 $text = preg_replace_callback('{
338 (?<=\n\n) # Starting after a blank line
339 | # or
340 \A\n? # the beginning of the doc
342 ( # save in $1
343 [ ]{0,'.$less_than_tab.'}
344 (?s:
345 <([?%]) # $2
349 [ \t]*
350 (?=\n{2,}|\Z) # followed by a blank line or end of document
352 }x',
353 array(&$this, '_hashHTMLBlocks_callback'),
354 $text);
356 return $text;
function _hashHTMLBlocks_callback($matches) {
	# Replace the HTML captured in $matches[1] with its block hash key,
	# surrounded by blank lines.
	return "\n\n" . $this->hashBlock($matches[1]) . "\n\n";
}
function hashBlock($text) {
	# Store a block-level piece of HTML and return the key that stands in
	# for it in the text. Functions that insert a block-level tag pass it
	# through here so it is protected from later transformations, which
	# removes the need to call _HashHTMLBlocks at every step.

	# Expand any hash keys already present in $text first, so the stored
	# HTML never needs to be unhashed more than once at the end.
	$html = $this->unhash($text);

	# The key is the MD5 of the expanded HTML. Blocks are registered in
	# both tables; html_hashes holds blocks as well as spans.
	$digest = md5($html);
	$this->html_hashes[$digest] = $html;
	$this->html_blocks[$digest] = $html;

	return $digest;
}
function hashSpan($text) {
	# Store a span-level piece of HTML and return the key that stands in
	# for it in the text. Functions that insert a span-level element pass
	# it through here so it is protected from later transformations,
	# which blocks invalid nested overlap.

	# Expand any hash keys already present in $text first, so the stored
	# HTML never needs to be unhashed more than once at the end.
	$html = $this->unhash($text);

	# Spans are registered in html_hashes only (not in html_blocks).
	$digest = md5($html);
	$this->html_hashes[$digest] = $html;

	return $digest;
}
400 var $block_gamut = array(
402 # These are all the transformations that form block-level
403 # tags like paragraphs, headers, and list items.
405 "doHeaders" => 10,
406 "doHorizontalRules" => 20,
408 "doLists" => 40,
409 "doCodeBlocks" => 50,
410 "doBlockQuotes" => 60,
function runBlockGamut($text) {
	# Run the block gamut transformations on $text, hashing raw HTML first.
	#
	# Raw HTML has to be hashed again for each block, not only once at the
	# start of the document: hashed blocks can be part of list items and
	# may have been indented, in which case an earlier hashHTMLBlocks()
	# pass would have taken them for code blocks.
	$hashed = $this->hashHTMLBlocks($text);

	return $this->runBasicBlockGamut($hashed);
}
function runBasicBlockGamut($text) {
	# Run the block gamut transformations without hashing HTML blocks.
	# Useful when the HTML blocks are known to be hashed already, as in
	# the first whole-document pass.
	foreach (array_keys($this->block_gamut) as $step) {
		$text = $this->$step($text);
	}

	# Finally form paragraphs and restore hashed blocks.
	return $this->formParagraphs($text);
}
function doHorizontalRules($text) {
	# Replace horizontal-rule lines with the hash key of an <hr> tag.
	#
	# A rule is a line holding three or more *, - or _ markers, optionally
	# separated by single spaces and indented by at most two spaces.
	$rule_patterns = array(
		'{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}mx',
		'{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}mx',
		'{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}mx',
	);

	$hr_key = $this->hashBlock("<hr$this->empty_element_suffix");

	return preg_replace($rule_patterns, "\n" . $hr_key . "\n", $text);
}
455 var $span_gamut = array(
457 # These are all the transformations that occur *within* block-level
458 # tags like paragraphs, headers, and list items.
460 "escapeSpecialCharsWithinTagAttributes" => -20,
461 "doCodeSpans" => -10,
462 "encodeBackslashEscapes" => -5,
464 # Process anchor and image tags. Images must come first,
465 # because ![foo][f] looks like an anchor.
466 "doImages" => 10,
467 "doAnchors" => 20,
469 # Make links out of things like `<http://example.com/>`
470 # Must come after doAnchors, because you can use < and >
471 # delimiters in inline links like [this](<url>).
472 "doAutoLinks" => 30,
473 "encodeAmpsAndAngles" => 40,
475 "doItalicsAndBold" => 50,
476 "doHardBreaks" => 60,
function runSpanGamut($text) {
	# Run every span gamut transformation on $text, in the order the
	# methods are stored in $this->span_gamut, and return the result.
	foreach (array_keys($this->span_gamut) as $step) {
		$text = $this->$step($text);
	}

	return $text;
}
function doHardBreaks($text) {
	# Do hard breaks: two or more spaces followed by a newline become the
	# hash key of a <br> tag (the hashed tag text itself ends in a newline).
	return preg_replace(
		'/ {2,}\n/',
		$this->hashSpan("<br$this->empty_element_suffix\n"),
		$text);
}
function escapeSpecialCharsWithinTagAttributes($text) {
	# Within tags -- meaning between < and > -- encode [\ ` * _] so they
	# don't conflict with their use in Markdown for code, italics and
	# strong. Each such character is replaced by its MD5 checksum from
	# $this->escape_table.
	$table = $this->escape_table;
	$rebuilt = '';

	foreach ($this->tokenizeHTML($text) as $token) {
		if ($token[0] == 'tag') {
			# Replacements are applied in array order: backslash first,
			# then backtick, asterisk and underscore.
			$token[1] = str_replace(
				array('\\', '`', '*', '_'),
				array($table['\\'], $table['`'], $table['*'], $table['_']),
				$token[1]);
		}
		# Tokens of any other type are copied through unchanged.
		$rebuilt .= $token[1];
	}

	return $rebuilt;
}
522 function doAnchors($text) {
524 # Turn Markdown link shortcuts into XHTML <a> tags.
# Two passes run over $text: reference-style links are handed to
# _doAnchors_reference_callback(), then inline-style links to
# _DoAnchors_inline_callback(). The third pass (bare [link text] shortcuts)
# is commented out. Returns the processed text.
# NOTE(review): both patterns below show unbalanced groups (the "wrap whole
# match in $1" group is never closed and no literal brackets surround the
# link text / id); this listing appears to have dropped short pattern
# lines -- confirm against upstream before editing.
527 # First, handle reference-style links: [link text] [id]
529 $text = preg_replace_callback('{
530 ( # wrap whole match in $1
532 ('.$this->nested_brackets.') # link text = $2
535 [ ]? # one optional space
536 (?:\n[ ]*)? # one optional newline followed by spaces
539 (.*?) # id = $3
542 }xs',
543 array(&$this, '_doAnchors_reference_callback'), $text);
546 # Next, inline-style links: [link text](url "optional title")
548 $text = preg_replace_callback('{
549 ( # wrap whole match in $1
551 ('.$this->nested_brackets.') # link text = $2
553 \( # literal paren
554 [ \t]*
555 <?(.*?)>? # href = $3
556 [ \t]*
557 ( # $4
558 ([\'"]) # quote char = $5
559 (.*?) # Title = $6
560 \5 # matching quote
561 [ \t]* # ignore any spaces/tabs between closing quote and )
562 )? # title is optional
565 }xs',
566 array(&$this, '_DoAnchors_inline_callback'), $text);
569 # Last, handle reference-style shortcuts: [link text]
570 # These must come last in case you've also got [link test][1]
571 # or [link test](/foo)
573 // $text = preg_replace_callback('{
574 // ( # wrap whole match in $1
575 // \[
576 // ([^\[\]]+) # link text = $2; can\'t contain [ or ]
577 // \]
578 // )
579 // }xs',
580 // array(&$this, '_doAnchors_reference_callback'), $text);
582 return $text;
function _doAnchors_reference_callback($matches) {
	# Build the <a> tag for one reference-style link.
	#
	# $matches[1] is the whole match, $matches[2] the link text and
	# $matches[3] the link id. Returns the hash key of the generated
	# anchor, or the whole match untouched when the id is not defined.
	$whole_match = $matches[1];
	$link_text   = $matches[2];
	$link_id     = isset($matches[3]) ? $matches[3] : "";

	# An empty id means the link text doubles as the id, as in [this][].
	if ($link_id == "") {
		$link_id = $link_text;
	}

	# Lower-case the id and turn embedded newlines into spaces.
	$link_id = preg_replace('{[ ]?\n}', ' ', strtolower($link_id));

	if (!isset($this->urls[$link_id])) {
		return $whole_match;
	}

	$url = $this->encodeAmpsAndAngles($this->urls[$link_id]);
	$anchor = "<a href=\"$url\"";

	if (isset($this->titles[$link_id])) {
		$title = $this->encodeAmpsAndAngles($this->titles[$link_id]);
		$anchor .= " title=\"$title\"";
	}

	$anchor .= ">" . $this->runSpanGamut($link_text) . "</a>";

	return $this->hashSpan($anchor);
}
function _doAnchors_inline_callback($matches) {
	# Build the <a> tag for one inline-style link.
	#
	# $matches[2] is the link text, $matches[3] the URL and $matches[6],
	# when present, the title. Returns the hash key of the generated
	# anchor.
	$url = $this->encodeAmpsAndAngles($matches[3]);

	$result = "<a href=\"$url\"";
	if (isset($matches[6])) {
		# Encode double quotes so the title is a valid attribute value.
		$title = str_replace('"', '&quot;', $matches[6]);
		$title = $this->encodeAmpsAndAngles($title);
		$result .= " title=\"$title\"";
	}

	# The span gamut is run over the link text once only; a second pass
	# over already-processed text is redundant work.
	$link_text = $this->runSpanGamut($matches[2]);
	$result .= ">$link_text</a>";

	return $this->hashSpan($result);
}
640 function doImages($text) {
642 # Turn Markdown image shortcuts into <img> tags.
# Two passes run over $text: reference-style images are handed to
# _doImages_reference_callback(), then inline images to
# _doImages_inline_callback(). Returns the processed text.
# NOTE(review): both patterns below show unbalanced groups (the "wrap whole
# match in $1" group is never closed and the literal `![`, `]`, `)` pieces
# are not visible); this listing appears to have dropped short pattern
# lines -- confirm against upstream before editing.
645 # First, handle reference-style labeled images: ![alt text][id]
647 $text = preg_replace_callback('{
648 ( # wrap whole match in $1
650 ('.$this->nested_brackets.') # alt text = $2
653 [ ]? # one optional space
654 (?:\n[ ]*)? # one optional newline followed by spaces
657 (.*?) # id = $3
661 }xs',
662 array(&$this, '_doImages_reference_callback'), $text);
665 # Next, handle inline images: ![alt text](url "optional title")
666 # Don't forget: encode * and _
668 $text = preg_replace_callback('{
669 ( # wrap whole match in $1
671 ('.$this->nested_brackets.') # alt text = $2
673 \s? # One optional whitespace character
674 \( # literal paren
675 [ \t]*
676 <?(\S+?)>? # src url = $3
677 [ \t]*
678 ( # $4
679 ([\'"]) # quote char = $5
680 (.*?) # title = $6
681 \5 # matching quote
682 [ \t]*
683 )? # title is optional
686 }xs',
687 array(&$this, '_doImages_inline_callback'), $text);
689 return $text;
function _doImages_reference_callback($matches) {
	# Build the <img> tag for one reference-style image.
	#
	# $matches[1] is the whole match, $matches[2] the alt text and
	# $matches[3] the link id. Returns the hash key of the generated tag,
	# or the whole match untouched when there's no such link id.
	$whole_match = $matches[1];
	$alt_text    = $matches[2];

	# An empty id means the alt text doubles as the id, as in ![this][].
	$link_id = strtolower($matches[3]);
	if ($link_id == "") {
		$link_id = strtolower($alt_text);
	}

	if (!isset($this->urls[$link_id])) {
		return $whole_match;
	}

	$alt_text = str_replace('"', '&quot;', $alt_text);
	$url = $this->urls[$link_id];

	$tag = "<img src=\"$url\" alt=\"$alt_text\"";
	if (isset($this->titles[$link_id])) {
		$title = $this->titles[$link_id];
		$tag .= " title=\"$title\"";
	}
	$tag .= $this->empty_element_suffix;

	return $this->hashSpan($tag);
}
function _doImages_inline_callback($matches) {
	# Build the <img> tag for one inline image.
	#
	# $matches[2] is the alt text, $matches[3] the image URL and
	# $matches[6], when present, the title. Returns the hash key of the
	# generated tag.
	$alt_text = str_replace('"', '&quot;', $matches[2]);
	$url = $matches[3];

	$tag = "<img src=\"$url\" alt=\"$alt_text\"";

	if (isset($matches[6])) {
		# Encode double quotes so the title is a valid attribute value.
		$title = str_replace('"', '&quot;', $matches[6]);
		$tag .= " title=\"$title\"";
	}

	$tag .= $this->empty_element_suffix;

	return $this->hashSpan($tag);
}
736 function doHeaders($text) {
# Converts Setext-style and atx-style headers in $text and returns the
# result. Matches are handed to _doHeaders_callback_setext_h1(),
# _doHeaders_callback_setext_h2() and _doHeaders_callback_atx().
# NOTE(review): listing line 762 of the atx pattern is not visible here;
# the pattern as shown does not consume the end of the header line --
# confirm against upstream before editing.
737 # Setext-style headers:
738 # Header 1
739 # ========
741 # Header 2
742 # --------
744 $text = preg_replace_callback('{ ^(.+)[ \t]*\n=+[ \t]*\n+ }mx',
745 array(&$this, '_doHeaders_callback_setext_h1'), $text);
746 $text = preg_replace_callback('{ ^(.+)[ \t]*\n-+[ \t]*\n+ }mx',
747 array(&$this, '_doHeaders_callback_setext_h2'), $text);
749 # atx-style headers:
750 # # Header 1
751 # ## Header 2
752 # ## Header 2 with closing hashes ##
753 # ...
754 # ###### Header 6
756 $text = preg_replace_callback('{
757 ^(\#{1,6}) # $1 = string of #\'s
758 [ \t]*
759 (.+?) # $2 = Header text
760 [ \t]*
761 \#* # optional closing #\'s (not counted)
763 }xm',
764 array(&$this, '_doHeaders_callback_atx'), $text);
766 return $text;
function _doHeaders_callback_setext_h1($matches) {
	# Wrap the header text ($matches[1]) in <h1> after running the span
	# gamut on it; return the block hash key followed by a blank line.
	$heading = "<h1>" . $this->runSpanGamut($matches[1]) . "</h1>";

	return $this->hashBlock($heading) . "\n\n";
}
function _doHeaders_callback_setext_h2($matches) {
	# Wrap the header text ($matches[1]) in <h2> after running the span
	# gamut on it; return the block hash key followed by a blank line.
	$heading = "<h2>" . $this->runSpanGamut($matches[1]) . "</h2>";

	return $this->hashBlock($heading) . "\n\n";
}
function _doHeaders_callback_atx($matches) {
	# Build an <hN> header where N is the number of leading # characters
	# ($matches[1]) and the content is $matches[2] run through the span
	# gamut; return the block hash key followed by a blank line.
	$depth = strlen($matches[1]);
	$content = $this->runSpanGamut($matches[2]);

	return $this->hashBlock("<h$depth>" . $content . "</h$depth>") . "\n\n";
}
780 function doLists($text) {
782 # Form HTML ordered (numbered) and unordered (bulleted) lists.
# The whole-list pattern is built and applied once per marker kind
# (unordered first, then ordered); each match is handed to
# _doLists_callback(). Which wrapper pattern is used depends on
# $this->list_level, i.e. on whether we are already inside a list.
# Returns the processed text.
# NOTE(review): $marker_any is assigned below but not referenced again in
# this method. The $whole_list pattern also shows unclosed groups as
# listed here; this listing appears to have dropped short pattern lines --
# confirm against upstream before editing.
784 $less_than_tab = $this->tab_width - 1;
786 # Re-usable patterns to match list item bullets and number markers:
787 $marker_ul = '[*+-]';
788 $marker_ol = '\d+[.]';
789 $marker_any = "(?:$marker_ul|$marker_ol)";
791 $markers = array($marker_ul, $marker_ol);
793 foreach ($markers as $marker) {
794 # Re-usable pattern to match any entirel ul or ol list:
795 $whole_list = '
796 ( # $1 = whole list
797 ( # $2
798 [ ]{0,'.$less_than_tab.'}
799 ('.$marker.') # $3 = first list item marker
800 [ \t]+
802 (?s:.+?)
803 ( # $4
806 \n{2,}
807 (?=\S)
808 (?! # Negative lookahead for another list item marker
809 [ \t]*
810 '.$marker.'[ \t]+
814 '; // mx
816 # We use a different prefix before nested lists than top-level lists.
817 # See extended comment in _ProcessListItems().
819 if ($this->list_level) {
820 $text = preg_replace_callback('{
822 '.$whole_list.'
823 }mx',
824 array(&$this, '_doLists_callback'), $text);
826 else {
827 $text = preg_replace_callback('{
828 (?:(?<=\n)\n|\A\n?) # Must eat the newline
829 '.$whole_list.'
830 }mx',
831 array(&$this, '_doLists_callback'), $text);
835 return $text;
function _doLists_callback($matches) {
	# Turn one matched list into a hashed <ul> or <ol> block.
	#
	# $matches[1] is the whole list and $matches[3] its first item marker.

	# Patterns matching list item bullets and number markers:
	$marker_ul = '[*+-]';
	$marker_ol = '\d+[.]';

	# The first marker decides the list type, and items are then split on
	# markers of that same type only.
	$list_type = preg_match("/$marker_ul/", $matches[3]) ? "ul" : "ol";
	$item_marker = ($list_type == "ul") ? $marker_ul : $marker_ol;

	# Turn double returns into triple returns, so that we can make a
	# paragraph for the last item in a list, if necessary:
	$list = preg_replace("/\n{2,}/", "\n\n\n", $matches[1]);

	$items = $this->processListItems($list, $item_marker);
	$block_key = $this->hashBlock("<$list_type>\n" . $items . "</$list_type>");

	return "\n" . $block_key . "\n\n";
}
857 var $list_level = 0;
function processListItems($list_str, $marker_any) {
	# Process the contents of a single ordered or unordered list,
	# splitting it into individual list items. Each item is rewritten by
	# _processListItems_callback(); the rewritten list is returned.
	#
	# $this->list_level tracks list nesting: it is incremented on entry
	# and decremented on exit, so zero means "not inside a list". This
	# matters because outside of a list, text such as
	#
	#     I recommend upgrading to version
	#     8. Oops, now this line is treated
	#     as a sub-list.
	#
	# must stay a single paragraph even though its second line starts
	# with a digit-period-space sequence, whereas inside a list (or
	# sub-list) such a line starts a sub-list. It is a kludge, but this
	# aspect of Markdown's syntax is hard to parse perfectly.
	$this->list_level++;

	# Trim trailing blank lines:
	$trimmed = preg_replace("/\n{2,}\\z/", "\n", $list_str);

	$item_pattern = '{
		(\n)?                           # leading line = $1
		(^[ \t]*)                       # leading whitespace = $2
		('.$marker_any.') [ \t]+        # list marker = $3
		((?s:.+?)                       # list item text = $4
		(\n{1,2}))
		(?= \n* (\z | \2 ('.$marker_any.') [ \t]+))
		}xm';

	$processed = preg_replace_callback(
		$item_pattern,
		array(&$this, '_processListItems_callback'),
		$trimmed);

	$this->list_level--;

	return $processed;
}
function _processListItems_callback($matches) {
	# Convert one matched list item into an <li> element.
	#
	# $matches[1] is the optional blank line before the item and
	# $matches[4] the item text.
	$body = $matches[4];

	# An item preceded by a blank line, or containing one, is block-level
	# content; anything else is treated as span-level content.
	$is_block_item = $matches[1] || preg_match('/\n{2,}/', $body);

	if ($is_block_item) {
		$body = $this->runBlockGamut($this->outdent($body));
	}
	else {
		# Recursion for sub-lists:
		$body = $this->doLists($this->outdent($body));
		$body = preg_replace('/\n+$/', '', $body);
		$body = $this->runSpanGamut($body);
	}

	return "<li>" . $body . "</li>\n";
}
922 function doCodeBlocks($text) {
924 # Process Markdown `<pre><code>` blocks.
# Every indented block matched by the pattern below is handed to
# _doCodeBlocks_callback(); the processed text is returned.
# NOTE(review): group $1 is opened below but its closer (and the repeated
# inner group around the line pattern) is not visible; this listing
# appears to have dropped short pattern lines -- confirm against upstream
# before editing.
926 $text = preg_replace_callback('{
927 (?:\n\n|\A)
928 ( # $1 = the code block -- one or more lines, starting with a space/tab
930 (?:[ ]{'.$this->tab_width.'} | \t) # Lines must start with a tab or a tab-width of spaces
931 .*\n+
934 ((?=^[ ]{0,'.$this->tab_width.'}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
935 }xm',
936 array(&$this, '_doCodeBlocks_callback'), $text);
938 return $text;
function _doCodeBlocks_callback($matches) {
	# Turn one matched code block ($matches[1]) into a hashed
	# <pre><code> block surrounded by blank lines.
	$code = $this->outdent($matches[1]);
	$code = $this->encodeCode($code);

	# Trim leading newlines and trailing whitespace.
	$code = preg_replace(array('/\A\n+/', '/\n+\z/'), '', $code);

	$block_key = $this->hashBlock("<pre><code>" . $code . "\n</code></pre>");

	return "\n\n" . $block_key . "\n\n";
}
function doCodeSpans($text) {
	# Convert backtick-quoted text into <code></code> spans.
	#
	# * Any number of backticks may serve as the delimiter, which makes
	#   it possible to put literal backticks inside the span:
	#
	#       Just type ``foo `bar` baz`` at the prompt.
	#
	#   becomes:
	#
	#       <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
	#
	#   If the code needs three consecutive backticks, use four as the
	#   delimiter, and so on.
	#
	# * Spaces next to the delimiters allow literal backticks at the
	#   edges of the span:
	#
	#       ... type `` `bar` `` ...
	#
	#   becomes:
	#
	#       ... type <code>`bar`</code> ...
	#
	# Each match is handed to _doCodeSpans_callback().
	$span_pattern = '@
		(?<!\\\)	# Character before opening ` can\'t be a backslash
		(`+)		# $1 = Opening run of `
		(.+?)		# $2 = The code block
		(?<!`)
		\1			# Matching closer
		(?!`)
		@xs';

	return preg_replace_callback(
		$span_pattern,
		array(&$this, '_doCodeSpans_callback'),
		$text);
}
function _doCodeSpans_callback($matches) {
	# Turn the content of one code span ($matches[2]) into a hashed
	# <code> element.

	# Strip leading, then trailing, spaces and tabs.
	$code = preg_replace(
		array('/^[ \t]*/', '/[ \t]*$/'),
		'',
		$matches[2]);

	$code = $this->encodeCode($code);

	return $this->hashSpan("<code>$code</code>");
}
function encodeCode($_) {
	# Encode/escape certain characters inside Markdown code runs.
	# In code these characters are literals and lose their special
	# Markdown meanings.
	#
	# Ampersands are all encoded first (HTML entities are not entities
	# within a Markdown code span), then the angle brackets. The
	# replacements are applied in array order, so the ampersands
	# introduced by &lt; and &gt; are not encoded again.
	return str_replace(
		array('&', '<', '>'),
		array('&amp;', '&lt;', '&gt;'),
		$_);
}
1022 function doItalicsAndBold($text) {
# Converts strong and regular emphasis markers in $text and returns the
# result. Strong matches are handed to _doItalicAndBold_strong_callback(),
# then the remaining emphasis matches to _doItalicAndBold_em_callback().
# NOTE(review): the strong pattern below shows unbalanced groups and
# alternatives without visible `|` separators; this listing appears to
# have dropped short pattern lines -- confirm against upstream before
# editing.
1023 # <strong> must go first:
1024 $text = preg_replace_callback('{
1025 ( # $1: Marker
1026 (?<!\*\*) \* | # (not preceded by two chars of
1027 (?<!__) _ # the same marker)
1030 (?=\S) # Not followed by whitespace
1031 (?!\1\1) # or two others marker chars.
1032 ( # $2: Content
1034 [^*_]+? # Anthing not em markers.
1036 # Balence any regular emphasis inside.
1037 \1 (?=\S) .+? (?<=\S) \1
1039 (?! \1 ) . # Allow unbalenced * and _.
1042 (?<=\S) \1\1 # End mark not preceded by whitespace.
1043 }sx',
1044 array(&$this, '_doItalicAndBold_strong_callback'), $text);
1045 # Then <em>:
1046 $text = preg_replace_callback(
1047 '{ ( (?<!\*)\* | (?<!_)_ ) (?=\S) (?! \1) (.+?) (?<=\S) \1 }sx',
1048 array(&$this, '_doItalicAndBold_em_callback'), $text);
1050 return $text;
function _doItalicAndBold_em_callback($matches) {
	# Wrap the emphasized content ($matches[2]) in <em> after running the
	# span gamut on it; return the span hash key.
	$inner = $this->runSpanGamut($matches[2]);

	return $this->hashSpan("<em>$inner</em>");
}
function _doItalicAndBold_strong_callback($matches) {
	# Wrap the strongly emphasized content ($matches[2]) in <strong> after
	# running the span gamut on it; return the span hash key.
	$inner = $this->runSpanGamut($matches[2]);

	return $this->hashSpan("<strong>$inner</strong>");
}
1064 function doBlockQuotes($text) {
# Converts ">"-quoted blocks in $text and returns the result. Each matched
# block is handed to _doBlockQuotes_callback().
# NOTE(review): group $1 is opened below but neither its closer nor the
# repeated inner group is visible; this listing appears to have dropped
# short pattern lines -- confirm against upstream before editing.
1065 $text = preg_replace_callback('/
1066 ( # Wrap whole match in $1
1068 ^[ \t]*>[ \t]? # ">" at the start of a line
1069 .+\n # rest of the first line
1070 (.+\n)* # subsequent consecutive lines
1071 \n* # blanks
1074 /xm',
1075 array(&$this, '_doBlockQuotes_callback'), $text);
1077 return $text;
function _doBlockQuotes_callback($matches) {
	# Turn one matched blockquote ($matches[1]) into a hashed
	# <blockquote> block followed by a blank line.

	# Trim one level of quoting, and empty out whitespace-only lines.
	$quoted = preg_replace(
		array('/^[ \t]*>[ \t]?/m', '/^[ \t]+$/m'),
		'',
		$matches[1]);

	# Recurse: the quoted text is itself block-level Markdown.
	$quoted = $this->runBlockGamut($quoted);

	# Indent every line of the result.
	$quoted = preg_replace('/^/m', " ", $quoted);

	# These leading spaces cause problem with <pre> content,
	# so we need to fix that:
	$quoted = preg_replace_callback(
		'{(\s*<pre>.+?</pre>)}sx',
		array(&$this, '_DoBlockQuotes_callback2'),
		$quoted);

	return $this->hashBlock("<blockquote>\n$quoted\n</blockquote>") . "\n\n";
}
function _doBlockQuotes_callback2($matches) {
	# Remove the leading indentation from every line of a <pre> section
	# ($matches[1]) found inside a blockquote, and return the section.
	return preg_replace('/^ /m', '', $matches[1]);
}
function formParagraphs($text) {
	# Wrap paragraphs in <p> tags and restore hashed HTML blocks.
	#
	# Params:
	#   $text - string to process with html <p> tags
	#
	# Returns the chunks of $text joined by blank lines.

	# Strip leading and trailing lines:
	$text = preg_replace(array('/\A\n+/', '/\n+\z/'), '', $text);

	# Split on runs of blank lines, ignoring empty pieces.
	$grafs = preg_split('/\n{2,}/', $text, -1, PREG_SPLIT_NO_EMPTY);

	foreach ($grafs as $idx => $graf) {
		if (isset($this->html_blocks[$graf])) {
			# The chunk is the key of a hashed HTML block: put the
			# stored block back in its place.
			$grafs[$idx] = $this->html_blocks[$graf];
		}
		else {
			# Ordinary text: run the span gamut, replace any leading
			# spaces/tabs with the opening <p>, close the tag, and
			# expand the hashes the span gamut produced.
			$para = $this->runSpanGamut($graf);
			$para = preg_replace('/^([ \t]*)/', "<p>", $para) . "</p>";
			$grafs[$idx] = $this->unhash($para);
		}
	}

	return implode("\n\n", $grafs);
}
function encodeAmpsAndAngles($text) {
	# Smart processing for ampersands and angle brackets that need to be
	# encoded. Returns the encoded text.
	#
	# The two patterns are applied in order:
	#  1. an ampersand that does not start something shaped like an
	#     entity becomes &amp; (ampersand-encoding based entirely on Nat
	#     Irons's Amputator MT plugin:
	#     http://bumppo.net/projects/amputator/);
	#  2. a naked "<" -- one not followed by a letter, "/", "?", "$" or
	#     "!" -- becomes &lt;.
	$patterns = array(
		'/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/',
		'{<(?![a-z/?\$!])}i',
	);
	$replacements = array('&amp;', '&lt;');

	return preg_replace($patterns, $replacements, $text);
}
function encodeBackslashEscapes($text) {
	# Parameter: String.
	# Returns:   The string with every backslash escape sequence listed
	#            in $this->backslash_escape_table replaced by its hash.
	#
	# Replacements are applied in table order, so the escaped backslash
	# (first in the table) is processed before the other sequences.
	$table = $this->backslash_escape_table;

	return str_replace(array_keys($table), array_values($table), $text);
}
function doAutoLinks($text) {
	#
	# Convert automatic links -- <http://example.com/> and
	# <address@domain.foo> -- into anchors.
	#
	#   Parameter:  String of Markdown text.
	#   Returns:    The string with both kinds of automatic link replaced.
	#

	# URLs: the text between the angle brackets becomes both the href
	# and the link text.
	$text = preg_replace('{<((https?|ftp|dict):[^\'">\s]+)>}',
		'<a href="\1">\1</a>', $text);

	# Email addresses: <address@domain.foo>
	# The address (without the optional "mailto:" prefix) must be capture
	# group 1, because _doAutoLinks_callback reads $matches[1].
	$text = preg_replace_callback('{
		<
		(?:mailto:)?
		(
			[-.\w\x80-\xFF]+
			\@
			[-a-z0-9\x80-\xFF]+(\.[-a-z0-9\x80-\xFF]+)*\.[a-z]+
		)
		>
		}xi',
		array(&$this, '_doAutoLinks_callback'), $text);

	return $text;
}
function _doAutoLinks_callback($matches) {
	#
	# Replacement callback for the e-mail pattern in doAutoLinks.
	#
	#   Parameter:  Match array; element 1 is the address.
	#   Returns:    Whatever hashSpan() returns for the encoded link.
	#
	$plain = $this->unescapeSpecialChars($matches[1]);
	$link = $this->encodeEmailAddress($plain);

	return $this->hashSpan($link);
}
function encodeEmailAddress($addr) {
	#
	#   Input: an email address, e.g. "foo@example.com"
	#
	#   Output: the email address as a mailto link, with each character
	#       of the address encoded as either a decimal or hex entity, in
	#       the hopes of foiling most address harvesting spam bots. E.g.:
	#
	#     <p><a href="&#109;&#x61;&#105;&#x6c;&#116;&#x6f;&#58;&#x66;o&#111;
	#        &#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;&#101;&#46;&#x63;&#111;
	#        &#x6d;">&#x66;o&#111;&#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;
	#        &#101;&#46;&#x63;&#111;&#x6d;</a></p>
	#
	#   Based on a filter by Matthew Wickline, posted to BBEdit-Talk.
	#   With some optimizations by Milian Wolff.
	#
	$addr = "mailto:" . $addr;
	# Zero-width split between every pair of bytes (no /u modifier).
	$chars = preg_split('/(?<!^)(?!$)/', $addr);
	$seed = (int)abs(crc32($addr) / strlen($addr)); # Deterministic seed.

	foreach ($chars as $key => $char) {
		$ord = ord($char);
		# Ignore non-ascii chars.
		if ($ord >= 128) {
			continue;
		}

		$r = ($seed * (1 + $key)) % 100; # Pseudo-random function.

		# roughly 10% raw, 45% hex, 45% dec
		# '@' *must* be encoded. I insist.
		# '"', '&', '<' and '>' must be encoded as well: a raw one would
		# break out of the href attribute or the surrounding markup.
		if ($r > 90 && strpos('@"&<>', $char) === false) {
			continue; # leave this character as it is
		}

		if ($r < 45) {
			$chars[$key] = '&#x'.dechex($ord).';';
		} else {
			$chars[$key] = '&#'.$ord.';';
		}
	}

	$addr = implode('', $chars);
	$text = implode('', array_slice($chars, 7)); # text without `mailto:`
	$addr = "<a href=\"$addr\">$text</a>";

	return $addr;
}
function unescapeSpecialChars($text) {
	#
	# Swap back in all the special characters we've hidden: every value
	# of $this->escape_table found in $text is replaced by its key.
	#
	$table = $this->escape_table;
	$hidden = array_values($table);
	$originals = array_keys($table);

	return str_replace($hidden, $originals, $text);
}
function tokenizeHTML($str) {
	#
	#   Parameter:  String containing HTML + Markdown markup.
	#   Returns:    An array of the tokens comprising the input
	#               string. Each token is either a tag or a run of text
	#               between tags. Each element of the array is a
	#               two-element array; the first is either 'tag' or 'text';
	#               the second is the actual value.
	#   Note:       Markdown code spans are taken into account: no tag token is
	#               generated within a code span.
	#
	$tokens = array();

	while ($str != "") {
		#
		# Each loop iteration searches for either the next tag or the next
		# opening code span marker. If a code span marker is found, the
		# code span is extracted in its entirety and results in an extra
		# text token.
		#
		$parts = preg_split('{
				(
					(?<![`\\\\])
					`+						# code span marker
				|
					<!--    .*?     -->		# comment
				|
					<\?.*?\?> | <%.*?%>		# processing instruction
				|
					<[/!$]?[-a-zA-Z0-9:]+	# regular tags
					(?:
						\s
						(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*
					)?
					>
				)
			}xs', $str, 2, PREG_SPLIT_DELIM_CAPTURE);

		# preg_split returns false when the regex engine fails; keep the
		# remaining input as plain text instead of dropping it.
		if ($parts === false) {
			$tokens[] = array('text', $str);
			break;
		}

		# Create token from text preceding tag.
		if ($parts[0] != "") {
			$tokens[] = array('text', $parts[0]);
		}

		# Check if we reach the end.
		if (count($parts) < 3) {
			break;
		}

		# Create token from tag or code span.
		# (Square-bracket offset: the curly-brace form no longer parses
		# on PHP 8.)
		if ($parts[1][0] == "`") {
			$tokens[] = array('text', $parts[1]);
			$str = $parts[2];

			# Skip the whole code span, pass as text token.
			# NOTE(review): this pattern has no "s" modifier, so it can
			# only match when the rest of the input is a single line --
			# confirm whether that is intended.
			if (preg_match('/^(.*(?<!`\\\\)'.$parts[1].'(?!`))(.*)$/',
				$str, $matches))
			{
				$tokens[] = array('text', $matches[1]);
				$str = $matches[2];
			}
		} else {
			$tokens[] = array('tag', $parts[1]);
			$str = $parts[2];
		}
	}

	return $tokens;
}
function outdent($text) {
	#
	# Remove one level of line-leading tabs or spaces: at the start of
	# every line, strip either one tab or one to $this->tab_width spaces.
	#
	$width = $this->tab_width;
	$indent = '/^(\t|[ ]{1,' . $width . '})/m';

	return preg_replace($indent, "", $text);
}
# Name of the string length function used by detab. `_initDetab` replaces it
# with a function able to handle UTF-8 if the default one does not exist.
var $utf8_strlen = 'mb_strlen';

function detab($text) {
	#
	# Replace tabs with the appropriate amount of space.
	#
	# Every line is cut into the pieces delimited by tab characters, then
	# rebuilt by inserting, before each piece after the first, as many
	# spaces as are needed to reach the next multiple of $this->tab_width.
	# Note that a "\n" is appended after every line, the last one included.
	#
	$length_of = $this->utf8_strlen; # best strlen function for UTF-8.
	$width = $this->tab_width;
	$result = "";

	foreach (explode("\n", $text) as $raw_line) {
		$pieces = explode("\t", $raw_line);

		# Start from the text before the first tab...
		$expanded = array_shift($pieces);

		# ...and pad up to the next tab stop before each further piece.
		foreach ($pieces as $piece) {
			$padding = $width - $length_of($expanded, 'UTF-8') % $width;
			$expanded .= str_repeat(" ", $padding) . $piece;
		}

		$result .= "$expanded\n";
	}

	return $result;
}
function _initDetab() {
	#
	# Make sure the `utf8_strlen` property names an existing function.
	#
	# If the function it currently names (probably `mb_strlen`) is not
	# available, the property is pointed at `Markdown_UTF8_strlen`, which is
	# declared here unless it already exists. That fallback loosely counts
	# the number of UTF-8 characters with a regular expression.
	#
	if (!function_exists($this->utf8_strlen)) {
		$this->utf8_strlen = 'Markdown_UTF8_strlen';

		if (!function_exists($this->utf8_strlen)) {
			function Markdown_UTF8_strlen($text) {
				return preg_match_all('/[\x00-\xBF]|[\xC0-\xFF][\x80-\xBF]*/',
					$text, $m);
			}
		}
	}
}
function unhash($text) {
	#
	# Swap back in all the tags hashed by _HashHTMLBlocks: every key of
	# $this->html_hashes found in $text is replaced by its value.
	#
	$hashes = $this->html_hashes;
	$placeholders = array_keys($hashes);
	$markup = array_values($hashes);

	return str_replace($placeholders, $markup, $text);
}
1418 PHP Markdown
1419 ============
1421 Description
1422 -----------
1424 This is a PHP translation of the original Markdown formatter written in
1425 Perl by John Gruber.
1427 Markdown is a text-to-HTML filter; it translates an easy-to-read /
1428 easy-to-write structured text format into HTML. Markdown's text format
1429 is most similar to that of plain text email, and supports features such
1430 as headers, *emphasis*, code blocks, blockquotes, and links.
1432 Markdown's syntax is designed not as a generic markup language, but
1433 specifically to serve as a front-end to (X)HTML. You can use span-level
1434 HTML tags anywhere in a Markdown document, and you can use block-level
1435 HTML tags (like <div> and <table>) as well.
1437 For more information about Markdown's syntax, see:
1439 <http://daringfireball.net/projects/markdown/>
1442 Bugs
1443 ----
1445 To file bug reports please send email to:
1447 <michel.fortin@michelf.com>
1449 Please include with your report: (1) the example input; (2) the output you
1450 expected; (3) the output Markdown actually produced.
1453 Version History
1454 ---------------
1456 See the readme file for detailed release notes for this version.
1458 1.0.1e (28 Dec 2006)
1460 1.0.1d (1 Dec 2006)
1462 1.0.1c (9 Dec 2005)
1464 1.0.1b (6 Jun 2005)
1466 1.0.1a (15 Apr 2005)
1468 1.0.1 (16 Dec 2004)
1470 1.0 (21 Aug 2004)
1473 Author & Contributors
1474 ---------------------
1476 Original Markdown by John Gruber
1477 <http://daringfireball.net/>
1479 PHP port and extras by Michel Fortin
1480 <http://www.michelf.com/>
1483 Copyright and License
1484 ---------------------
1486 Copyright (c) 2004-2006 Michel Fortin
1487 <http://www.michelf.com/>
1488 All rights reserved.
1490 Copyright (c) 2003-2006 John Gruber
1491 <http://daringfireball.net/>
1492 All rights reserved.
1494 Redistribution and use in source and binary forms, with or without
1495 modification, are permitted provided that the following conditions are
1496 met:
1498 * Redistributions of source code must retain the above copyright notice,
1499 this list of conditions and the following disclaimer.
1501 * Redistributions in binary form must reproduce the above copyright
1502 notice, this list of conditions and the following disclaimer in the
1503 documentation and/or other materials provided with the distribution.
1505 * Neither the name "Markdown" nor the names of its contributors may
1506 be used to endorse or promote products derived from this software
1507 without specific prior written permission.
1509 This software is provided by the copyright holders and contributors "as
1510 is" and any express or implied warranties, including, but not limited
1511 to, the implied warranties of merchantability and fitness for a
1512 particular purpose are disclaimed. In no event shall the copyright owner
1513 or contributors be liable for any direct, indirect, incidental, special,
1514 exemplary, or consequential damages (including, but not limited to,
1515 procurement of substitute goods or services; loss of use, data, or
1516 profits; or business interruption) however caused and on any theory of
1517 liability, whether in contract, strict liability, or tort (including
1518 negligence or otherwise) arising in any way out of the use of this
1519 software, even if advised of the possibility of such damage.