#!/usr/bin/env perl

#
# Markdown -- A text-to-HTML conversion tool for web writers
#
# Copyright (c) 2004 John Gruber
# <http://daringfireball.net/projects/markdown/>
#
# This copy: sunny256-utils.git / markdown
# (blob bf077a7f3f1f5584d729b8e6f900c03d732d6b76)
#


package Markdown;
require 5.006_000;
use strict;
use warnings;

use Digest::MD5 qw(md5_hex);

# Package variable, readable from outside as $Markdown::VERSION.
our $VERSION = '1.0.1';
# Tue 14 Dec 2004

## Disabled; causes problems under Perl 5.6.1:
# use utf8;
# binmode( STDOUT, ":utf8" );  # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html


#
# Global default settings:
#
my $g_empty_element_suffix = " />";     # Change to ">" for HTML output
my $g_tab_width = 4;


#
# Globals:
#

# Regex to match balanced [brackets]. See Friedl's
# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
# The variable is declared before it is assigned because the pattern
# refers to itself through the deferred (??{ ... }) construct.
my $g_nested_brackets;
$g_nested_brackets = qr{
    (?>                                 # Atomic matching
       [^\[\]]+                         # Anything other than brackets
     |
       \[
         (??{ $g_nested_brackets })     # Recursive set of nested brackets
       \]
    )*
}x;


# Table of hash values for escaped characters:
# each special character maps to the MD5 hex digest of itself, which is
# used as a placeholder in the text while other transformations run.
my %g_escape_table;
foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
    $g_escape_table{$char} = md5_hex($char);
}


# Global hashes, used by various utility routines
my %g_urls;
my %g_titles;
my %g_html_blocks;

# Used to track when we're inside an ordered or unordered list
# (see _ProcessListItems() for details):
my $g_list_level = 0;


#### Blosxom plug-in interface ##########################################

# Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
# which posts Markdown should process, using a "meta-markup: markdown"
# header. If it's set to 0 (the default), Markdown will process all
# entries.
my $g_blosxom_use_meta = 0;
# Blosxom plug-in hook: returning true tells Blosxom the plug-in is active.
sub start { 1; }

#
# Blosxom plug-in hook, called once per story.
#
# Parameters (as passed by Blosxom): package, path, filename, and
# references to the raw story, its title and its body. Only the body
# reference is used; it is rewritten in place with the Markdown output.
#
# When $g_blosxom_use_meta is set, the body is converted only if the meta
# plug-in set $meta::markup to "markdown" (case-insensitive, surrounding
# whitespace allowed). Otherwise every story is converted.
#
# Returns: 1.
#
sub story {
    my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;

    if ( (! $g_blosxom_use_meta) or
         (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
       ) {
        $$body_ref = Markdown($$body_ref);
    }
    return 1;
}
#### Movable Type plug-in interface #####################################
eval {require MT};  # Test to see if we're running in MT.
unless ($@) {
    require MT;
    MT->import;
    require MT::Template::Context;
    MT::Template::Context->import;

    eval {require MT::Plugin};  # Test to see if we're running >= MT 3.0.
    unless ($@) {
        require MT::Plugin;
        MT::Plugin->import;
        my $plugin = MT::Plugin->new({
            name        => "Markdown",
            description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
            doc_link    => 'http://daringfireball.net/projects/markdown/'
        });
        MT->add_plugin( $plugin );
    }

    # <MTMarkdownOptions output="..."> container tag: remembers the requested
    # output mode (lower-cased) in the context stash, then builds its contents.
    MT::Template::Context->add_container_tag(MarkdownOptions => sub {
        my $ctx     = shift;
        my $args    = shift;
        my $builder = $ctx->stash('builder');
        my $tokens  = $ctx->stash('tokens');

        if (defined ($args->{'output'}) ) {
            $ctx->stash('markdown_output', lc $args->{'output'});
        }

        defined (my $str = $builder->build($ctx, $tokens) )
            or return $ctx->error($builder->errstr);
        $str;       # return value
    });

    # Plain Markdown text filter. The stashed 'markdown_output' value selects
    # HTML-style empty tags ("html...") or passes the text through untouched
    # ("raw"); anything else means the default XHTML-style output.
    MT->add_text_filter('markdown' => {
        label     => 'Markdown',
        docs      => 'http://daringfireball.net/projects/markdown/',
        on_format => sub {
            my $text = shift;
            my $ctx  = shift;
            my $raw  = 0;
            if (defined $ctx) {
                my $output = $ctx->stash('markdown_output');
                if (defined $output  &&  $output =~ m/^html/i) {
                    $g_empty_element_suffix = ">";
                    $ctx->stash('markdown_output', '');
                }
                elsif (defined $output  &&  $output eq 'raw') {
                    $raw = 1;
                    $ctx->stash('markdown_output', '');
                }
                else {
                    $raw = 0;
                    $g_empty_element_suffix = " />";
                }
            }
            $text = $raw ? $text : Markdown($text);
            $text;
        },
    });

    # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
    my $smartypants;

    {
        no warnings "once";
        $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
    }

    if ($smartypants) {
        MT->add_text_filter('markdown_with_smartypants' => {
            label     => 'Markdown With SmartyPants',
            docs      => 'http://daringfireball.net/projects/markdown/',
            on_format => sub {
                my $text = shift;
                my $ctx  = shift;
                if (defined $ctx) {
                    my $output = $ctx->stash('markdown_output');
                    # Same "html..." test as the plain 'markdown' filter
                    # above, so both filters accept the same option values.
                    if (defined $output  &&  $output =~ m/^html/i) {
                        $g_empty_element_suffix = ">";
                    }
                    else {
                        $g_empty_element_suffix = " />";
                    }
                }
                $text = Markdown($text);
                $text = $smartypants->($text, '1');
            },
        });
    }
}
else {
#### BBEdit/command-line text filter interface ##########################
# Needs to be hidden from MT (and Blosxom when running in static mode).

    # We're only using $blosxom::version once; tell Perl not to warn us:
    no warnings 'once';
    unless ( defined($blosxom::version) ) {
        use warnings;

        #### Check for command-line switches: #################
        my %cli_opts;
        use Getopt::Long;
        # pass_through leaves unrecognised arguments in @ARGV, where the
        # <> read below picks them up.
        Getopt::Long::Configure('pass_through');
        GetOptions(\%cli_opts,
            'version',
            'shortversion',
            'html4tags',
        );
        if ($cli_opts{'version'}) {         # Version info
            print "\nThis is Markdown, version $VERSION.\n";
            print "Copyright 2004 John Gruber\n";
            print "http://daringfireball.net/projects/markdown/\n\n";
            exit 0;
        }
        if ($cli_opts{'shortversion'}) {    # Just the version number string.
            print $VERSION;
            exit 0;
        }
        if ($cli_opts{'html4tags'}) {       # Use HTML tag style instead of XHTML
            $g_empty_element_suffix = ">";
        }


        #### Process incoming text: ###########################
        my $text = do {
            local $/;               # Slurp the whole file
            <>;
        };
        print Markdown($text);
    }
}
sub Markdown {
#
# Main function. Converts a Markdown source string to (X)HTML.
#
# Parameter: the Markdown text (undef is treated as an empty string).
# Returns:   the converted text with one trailing newline appended.
#
# The order in which other subs are called here is essential. Link and
# image substitutions need to happen before _EscapeSpecialChars(), so
# that any *'s or _'s in the <a> and <img> tags get encoded.
#
    my $text = shift;
    $text = '' unless defined $text;

    # Clear the global hashes. If we don't clear these, you get conflicts
    # from other articles when generating a page which contains more than
    # one article (e.g. an index page that shows the N most recent
    # articles):
    %g_urls        = ();
    %g_titles      = ();
    %g_html_blocks = ();


    # Standardize line endings:
    $text =~ s{\r\n}{\n}g;  # DOS to Unix
    $text =~ s{\r}{\n}g;    # Mac to Unix

    # Make sure $text ends with a couple of newlines:
    $text .= "\n\n";

    # Convert all tabs to spaces.
    $text = _Detab($text);

    # Strip any lines consisting only of spaces and tabs.
    # This makes subsequent regexen easier to write, because we can
    # match consecutive blank lines with /\n+/ instead of something
    # contorted like /[ \t]*\n+/ .
    $text =~ s/^[ \t]+$//mg;

    # Turn block-level HTML blocks into hash entries
    $text = _HashHTMLBlocks($text);

    # Strip link definitions, store in hashes.
    $text = _StripLinkDefinitions($text);

    $text = _RunBlockGamut($text);

    $text = _UnescapeSpecialChars($text);

    return $text . "\n";
}
sub _StripLinkDefinitions {
#
# Strips link definitions from text, stores the URLs and titles in
# the %g_urls and %g_titles hashes (keyed by lower-cased link id).
#
# Parameter: the document text.
# Returns:   the text with every link definition removed.
#
    my $text = shift;
    my $less_than_tab = $g_tab_width - 1;

    # Link defs are in the form: ^[id]: url "optional title"
    # There is no /g on the substitution: each pass of the loop removes
    # one definition and records it, until none are left.
    while ($text =~ s{
                        ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
                          [ \t]*
                          \n?               # maybe *one* newline
                          [ \t]*
                        <?(\S+?)>?          # url = $2
                          [ \t]*
                          \n?               # maybe one newline
                          [ \t]*
                        (?:
                            (?<=\s)         # lookbehind for whitespace
                            ["(]
                            (.+?)           # title = $3
                            [")]
                            [ \t]*
                        )?  # title is optional
                        (?:\n+|\Z)
                    }
                    {}mx) {
        # Copy the captures before calling anything else.
        my ($id, $url, $title) = (lc $1, $2, $3);   # Link IDs are case-insensitive

        $g_urls{$id} = _EncodeAmpsAndAngles( $url );
        # "defined" rather than a truth test, so that a title of "0" is kept.
        if (defined $title) {
            $title =~ s/"/&quot;/g;
            $g_titles{$id} = $title;
        }
    }

    return $text;
}
sub _HashHTMLBlocks {
#
# Replaces each block-level HTML element in the text with the MD5 hex
# digest of its source, surrounded by blank lines, and stores the source
# in %g_html_blocks under that digest.
#
# Parameter: the document text.
# Returns:   the text with the HTML blocks replaced by their keys.
#
    my $text = shift;
    my $less_than_tab = $g_tab_width - 1;

    # Shared replacement for all four substitutions below: store the block
    # under its digest and return the digest as a paragraph of its own.
    my $stash_block = sub {
        my $block = shift;
        my $key   = md5_hex($block);
        $g_html_blocks{$key} = $block;
        return "\n\n" . $key . "\n\n";
    };

    # Hashify HTML blocks:
    # We only want to do this for block-level HTML tags, such as headers,
    # lists, and tables. That's because we still want to wrap <p>s around
    # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
    # phrase emphasis, and spans. The list of tags we're looking for is
    # hard-coded:
    my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
    my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;

    # First, look for nested blocks, e.g.:
    #   <div>
    #       <div>
    #       tags for inner block must be indented.
    #       </div>
    #   </div>
    #
    # The outermost tags must start at the left margin for this to match, and
    # the inner nested divs must be indented.
    # We need to do this before the next, more liberal match, because the next
    # match will start at the first `<div>` and stop at the first `</div>`.
    $text =~ s{
                (                       # save in $1
                    ^                   # start of line  (with /m)
                    <($block_tags_a)    # start tag = $2
                    \b                  # word break
                    (.*\n)*?            # any number of lines, minimally matching
                    </\2>               # the matching end tag
                    [ \t]*              # trailing spaces/tabs
                    (?=\n+|\Z)  # followed by a newline or end of document
                )
            }{
                $stash_block->($1);
            }egmx;


    #
    # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
    #
    $text =~ s{
                (                       # save in $1
                    ^                   # start of line  (with /m)
                    <($block_tags_b)    # start tag = $2
                    \b                  # word break
                    (.*\n)*?            # any number of lines, minimally matching
                    .*</\2>             # the matching end tag
                    [ \t]*              # trailing spaces/tabs
                    (?=\n+|\Z)  # followed by a newline or end of document
                )
            }{
                $stash_block->($1);
            }egmx;

    # Special case just for <hr />. It was easier to make a special case than
    # to make the other regex more complicated.
    $text =~ s{
                (?:
                    (?<=\n\n)       # Starting after a blank line
                    |               # or
                    \A\n?           # the beginning of the doc
                )
                (                       # save in $1
                    [ ]{0,$less_than_tab}
                    <(hr)               # start tag = $2
                    \b                  # word break
                    ([^<>])*?           #
                    /?>                 # the matching end tag
                    [ \t]*
                    (?=\n{2,}|\Z)       # followed by a blank line or end of document
                )
            }{
                $stash_block->($1);
            }egx;

    # Special case for standalone HTML comments:
    $text =~ s{
                (?:
                    (?<=\n\n)       # Starting after a blank line
                    |               # or
                    \A\n?           # the beginning of the doc
                )
                (                       # save in $1
                    [ ]{0,$less_than_tab}
                    (?s:
                        <!
                        (--.*?--\s*)+
                        >
                    )
                    [ \t]*
                    (?=\n{2,}|\Z)       # followed by a blank line or end of document
                )
            }{
                $stash_block->($1);
            }egx;


    return $text;
}
sub _RunBlockGamut {
#
# These are all the transformations that form block-level
# tags like paragraphs, headers, and list items.
#
# Parameter: the text to transform.
# Returns:   the transformed text.
#
    my $text = shift;

    $text = _DoHeaders($text);

    # Do Horizontal Rules:
    # three or more *, - or _ on a line by themselves, each optionally
    # separated by single spaces, with at most two leading spaces.
    $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
    $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
    $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;

    $text = _DoLists($text);

    $text = _DoCodeBlocks($text);

    $text = _DoBlockQuotes($text);

    # We already ran _HashHTMLBlocks() before, in Markdown(), but that
    # was to escape raw HTML in the original Markdown source. This time,
    # we're escaping the markup we've just created, so that we don't wrap
    # <p> tags around block-level tags.
    $text = _HashHTMLBlocks($text);

    $text = _FormParagraphs($text);

    return $text;
}
sub _RunSpanGamut {
#
# These are all the transformations that occur *within* block-level
# tags like paragraphs, headers, and list items.
#
# Parameter: the text of one block.
# Returns:   the text with span-level markup converted.
#
    my $text = shift;

    $text = _DoCodeSpans($text);

    $text = _EscapeSpecialChars($text);

    # Process anchor and image tags. Images must come first,
    # because ![foo][f] looks like an anchor.
    $text = _DoImages($text);
    $text = _DoAnchors($text);

    # Make links out of things like `<http://example.com/>`
    # Must come after _DoAnchors(), because you can use < and >
    # delimiters in inline links like [this](<url>).
    $text = _DoAutoLinks($text);

    $text = _EncodeAmpsAndAngles($text);

    $text = _DoItalicsAndBold($text);

    # Do hard breaks:
    # two or more spaces before a newline become a <br> tag.
    $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;

    return $text;
}
sub _EscapeSpecialChars {
#
# Splits the text into HTML tags and the text between them, then:
#   * inside tags, replaces * and _ with their escape-table placeholders;
#   * outside tags, processes backslash escapes.
#
# Parameter: the text to process.
# Returns:   the text rebuilt from the processed tokens.
#
    my $text = shift;
    my $tokens = _TokenizeHTML($text);

    $text = '';   # rebuild $text from the tokens
#   my $in_pre = 0;  # Keep track of when we're inside <pre> or <code> tags.
#   my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;

    foreach my $cur_token (@$tokens) {
        my ($type, $content) = @$cur_token;
        if ($type eq "tag") {
            # Within tags, encode * and _ so they don't conflict
            # with their use in Markdown for italics and strong.
            # We're replacing each such character with its
            # corresponding MD5 checksum value; this is likely
            # overkill, but it should prevent us from colliding
            # with the escape values by accident.
            $content =~ s! \* !$g_escape_table{'*'}!gx;
            $content =~ s! _  !$g_escape_table{'_'}!gx;
        } else {
            $content = _EncodeBackslashEscapes($content);
        }
        $text .= $content;
    }
    return $text;
}
sub _DoAnchors {
#
# Turn Markdown link shortcuts into XHTML <a> tags.
#
# Parameter: the text of one block.
# Returns:   the text with reference-style and inline links converted.
#            Reference links whose id is not in %g_urls are left as is.
#
    my $text = shift;

    #
    # First, handle reference-style links: [link text] [id]
    #
    $text =~ s{
        (                   # wrap whole match in $1
          \[
            ($g_nested_brackets)    # link text = $2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
            (.*?)       # id = $3
          \]
        )
    }{
        my $result;
        my $whole_match = $1;
        my $link_text   = $2;
        my $link_id     = lc $3;

        if ($link_id eq "") {
            $link_id = lc $link_text;     # for shortcut links like [this][].
        }

        if (defined $g_urls{$link_id}) {
            my $url = $g_urls{$link_id};
            $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
            $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics/bold.
            $result = "<a href=\"$url\"";
            if ( defined $g_titles{$link_id} ) {
                my $title = $g_titles{$link_id};
                $title =~ s! \* !$g_escape_table{'*'}!gx;
                $title =~ s!  _ !$g_escape_table{'_'}!gx;
                $result .=  " title=\"$title\"";
            }
            $result .= ">$link_text</a>";
        }
        else {
            $result = $whole_match;
        }
        $result;
    }xsge;

    #
    # Next, inline-style links: [link text](url "optional title")
    #
    $text =~ s{
        (               # wrap whole match in $1
          \[
            ($g_nested_brackets)    # link text = $2
          \]
          \(            # literal paren
            [ \t]*
            <?(.*?)>?   # href = $3
            [ \t]*
            (           # $4
              (['"])    # quote char = $5
              (.*?)     # Title = $6
              \5        # matching quote
            )?          # title is optional
          \)
        )
    }{
        my $result;
        my $whole_match = $1;
        my $link_text   = $2;
        my $url         = $3;
        my $title       = $6;

        $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
        $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics/bold.
        $result = "<a href=\"$url\"";

        if (defined $title) {
            $title =~ s/"/&quot;/g;
            $title =~ s! \* !$g_escape_table{'*'}!gx;
            $title =~ s!  _ !$g_escape_table{'_'}!gx;
            $result .=  " title=\"$title\"";
        }

        $result .= ">$link_text</a>";

        $result;
    }xsge;

    return $text;
}
sub _DoImages {
#
# Turn Markdown image shortcuts into <img> tags.
#
# Parameter: the text of one block.
# Returns:   the text with reference-style and inline images converted.
#            Reference images whose id is not in %g_urls are left as is.
#
    my $text = shift;

    #
    # First, handle reference-style labeled images: ![alt text][id]
    #
    $text =~ s{
        (               # wrap whole match in $1
          !\[
            (.*?)       # alt text = $2
          \]

          [ ]?              # one optional space
          (?:\n[ ]*)?       # one optional newline followed by spaces

          \[
            (.*?)       # id = $3
          \]

        )
    }{
        my $result;
        my $whole_match = $1;
        my $alt_text    = $2;
        my $link_id     = lc $3;

        if ($link_id eq "") {
            $link_id = lc $alt_text;     # for shortcut links like ![this][].
        }

        $alt_text =~ s/"/&quot;/g;
        if (defined $g_urls{$link_id}) {
            my $url = $g_urls{$link_id};
            $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
            $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics/bold.
            $result = "<img src=\"$url\" alt=\"$alt_text\"";
            if (defined $g_titles{$link_id}) {
                my $title = $g_titles{$link_id};
                $title =~ s! \* !$g_escape_table{'*'}!gx;
                $title =~ s!  _ !$g_escape_table{'_'}!gx;
                $result .=  " title=\"$title\"";
            }
            $result .= $g_empty_element_suffix;
        }
        else {
            # If there's no such link ID, leave intact:
            $result = $whole_match;
        }

        $result;
    }xsge;

    #
    # Next, handle inline images:  ![alt text](url "optional title")
    # Don't forget: encode * and _
    #
    $text =~ s{
        (               # wrap whole match in $1
          !\[
            (.*?)       # alt text = $2
          \]
          \(            # literal paren
            [ \t]*
            <?(\S+?)>?  # src url = $3
            [ \t]*
            (           # $4
              (['"])    # quote char = $5
              (.*?)     # title = $6
              \5        # matching quote
              [ \t]*
            )?          # title is optional
          \)
        )
    }{
        my $whole_match = $1;
        my $alt_text    = $2;
        my $url         = $3;
        # NOTE(review): the title defaults to '' and the attribute is always
        # written, so an inline image without a title gets title="". The
        # old "if (defined $title)" test could never be false; it has been
        # removed but the output is deliberately kept the same.
        my $title       = defined $6 ? $6 : '';

        $alt_text =~ s/"/&quot;/g;
        $title    =~ s/"/&quot;/g;
        $url =~ s! \* !$g_escape_table{'*'}!gx;     # We've got to encode these to avoid
        $url =~ s!  _ !$g_escape_table{'_'}!gx;     # conflicting with italics/bold.
        $title =~ s! \* !$g_escape_table{'*'}!gx;
        $title =~ s!  _ !$g_escape_table{'_'}!gx;

        "<img src=\"$url\" alt=\"$alt_text\" title=\"$title\"" . $g_empty_element_suffix;
    }xsge;

    return $text;
}
sub _DoHeaders {
#
# Converts Setext-style and atx-style headers to <h1>..<h6> elements.
# The header text itself is run through _RunSpanGamut().
#
# Parameter: the text to transform.
# Returns:   the text with headers converted.
#
    my $text = shift;

    # Setext-style headers:
    #     Header 1
    #     ========
    #
    #     Header 2
    #     --------
    #
    $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
        "<h1>"  .  _RunSpanGamut($1)  .  "</h1>\n\n";
    }egmx;

    $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
        "<h2>"  .  _RunSpanGamut($1)  .  "</h2>\n\n";
    }egmx;


    # atx-style headers:
    #   # Header 1
    #   ## Header 2
    #   ## Header 2 with closing hashes ##
    #   ...
    #   ###### Header 6
    #
    $text =~ s{
            ^(\#{1,6})  # $1 = string of #'s
            [ \t]*
            (.+?)       # $2 = Header text
            [ \t]*
            \#*         # optional closing #'s (not counted)
            \n+
        }{
            my $h_level = length($1);
            "<h$h_level>"  .  _RunSpanGamut($2)  .  "</h$h_level>\n\n";
        }egmx;

    return $text;
}
sub _DoLists {
#
# Form HTML ordered (numbered) and unordered (bulleted) lists.
#
# Parameter: the text to transform.
# Returns:   the text with each list replaced by a <ul> or <ol> element.
#
    my $text = shift;
    my $less_than_tab = $g_tab_width - 1;

    # Re-usable patterns to match list item bullets and number markers:
    my $marker_ul  = qr/[*+-]/;
    my $marker_ol  = qr/\d+[.]/;
    my $marker_any = qr/(?:$marker_ul|$marker_ol)/;

    # Re-usable pattern to match any entire ul or ol list:
    my $whole_list = qr{
        (                               # $1 = whole list
          (                             # $2
            [ ]{0,$less_than_tab}
            (${marker_any})             # $3 = first list item marker
            [ \t]+
          )
          (?s:.+?)
          (                             # $4
              \z
            |
              \n{2,}
              (?=\S)
              (?!                       # Negative lookahead for another list item marker
                [ \t]*
                ${marker_any}[ \t]+
              )
          )
        )
    }mx;

    # Builds the replacement for one matched list. Takes the whole list
    # text and its first item marker; the marker decides <ul> versus <ol>.
    my $render_list = sub {
        my ($list, $first_marker) = @_;
        my $list_type = ($first_marker =~ m/$marker_ul/) ? "ul" : "ol";
        # Turn double returns into triple returns, so that we can make a
        # paragraph for the last item in a list, if necessary:
        $list =~ s/\n{2,}/\n\n\n/g;
        my $result = _ProcessListItems($list, $marker_any);
        return "<$list_type>\n" . $result . "</$list_type>\n";
    };

    # We use a different prefix before nested lists than top-level lists.
    # See extended comment in _ProcessListItems().
    #
    # Note: There's a bit of duplication here. My original implementation
    # created a scalar regex pattern as the conditional result of the test on
    # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
    # substitution once, using the scalar as the pattern. This worked,
    # everywhere except when running under MT on my hosting account at Pair
    # Networks. There, this caused all rebuilds to be killed by the reaper (or
    # perhaps they crashed, but that seems incredibly unlikely given that the
    # same script on the same server ran fine *except* under MT). I've spent
    # more time trying to figure out why this is happening than I'd like to
    # admit. My only guess, backed up by the fact that this workaround works,
    # is that Perl optimizes the substitution when it can figure out that the
    # pattern will never change, and when this optimization isn't on, we run
    # afoul of the reaper. Thus, the slightly redundant code that uses two
    # static s/// patterns rather than one conditional pattern.
    # (Only the replacement is shared; both patterns remain static.)

    if ($g_list_level) {
        $text =~ s{
                ^
                $whole_list
            }{
                $render_list->($1, $3);
            }egmx;
    }
    else {
        $text =~ s{
                (?:(?<=\n\n)|\A\n?)
                $whole_list
            }{
                $render_list->($1, $3);
            }egmx;
    }


    return $text;
}
sub _ProcessListItems {
#
# Process the contents of a single ordered or unordered list, splitting it
# into individual list items.
#
# Parameters: the list text, and the compiled pattern matching any list
#             item marker.
# Returns:    the items as a sequence of <li>...</li> lines.
#
    my $list_str   = shift;
    my $marker_any = shift;


    # The $g_list_level global keeps track of when we're inside a list.
    # Each time we enter a list, we increment it; when we leave a list,
    # we decrement. If it's zero, we're not in a list anymore.
    #
    # We do this because when we're not inside a list, we want to treat
    # something like this:
    #
    #       I recommend upgrading to version
    #       8. Oops, now this line is treated
    #       as a sub-list.
    #
    # As a single paragraph, despite the fact that the second line starts
    # with a digit-period-space sequence.
    #
    # Whereas when we're inside a list (or sub-list), that line will be
    # treated as the start of a sub-list. What a kludge, huh? This is
    # an aspect of Markdown's syntax that's hard to parse perfectly
    # without resorting to mind-reading. Perhaps the solution is to
    # change the syntax rules such that sub-lists must start with a
    # starting cardinal number; e.g. "1." or "a.".

    $g_list_level++;

    # trim trailing blank lines:
    $list_str =~ s/\n{2,}\z/\n/;


    $list_str =~ s{
        (\n)?                           # leading line = $1
        (^[ \t]*)                       # leading whitespace = $2
        ($marker_any) [ \t]+            # list marker = $3
        ((?s:.+?)                       # list item text   = $4
        (\n{1,2}))
        (?= \n* (\z | \2 ($marker_any) [ \t]+))
    }{
        my $item         = $4;
        my $leading_line = $1;

        # An item preceded by a blank line, or containing one, is treated
        # as block-level content; otherwise it is span-level content that
        # may still contain a nested list.
        if ($leading_line or ($item =~ m/\n{2,}/)) {
            $item = _RunBlockGamut(_Outdent($item));
        }
        else {
            # Recursion for sub-lists:
            $item = _DoLists(_Outdent($item));
            chomp $item;
            $item = _RunSpanGamut($item);
        }

        "<li>" . $item . "</li>\n";
    }egmx;

    $g_list_level--;
    return $list_str;
}
sub _DoCodeBlocks {
#
# Process Markdown `<pre><code>` blocks.
#
# Parameter: the text to transform.
# Returns:   the text with each indented block wrapped in
#            <pre><code>...</code></pre>, its content outdented one
#            level and encoded by _EncodeCode().
#
    my $text = shift;

    $text =~ s{
            (?:\n\n|\A)
            (               # $1 = the code block -- one or more lines, starting with a space/tab
              (?:
                (?:[ ]{$g_tab_width} | \t)  # Lines must start with a tab or a tab-width of spaces
                .*\n+
              )+
            )
            ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
        }{
            my $codeblock = $1;

            $codeblock = _EncodeCode(_Outdent($codeblock));
            $codeblock = _Detab($codeblock);
            $codeblock =~ s/\A\n+//; # trim leading newlines
            $codeblock =~ s/\s+\z//; # trim trailing whitespace

            "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
        }egmx;

    return $text;
}
sub _DoCodeSpans {
#
#   *   Backtick quotes are used for <code></code> spans.
#
#   *   You can use multiple backticks as the delimiters if you want to
#       include literal backticks in the code span. So, this input:
#
#         Just type ``foo `bar` baz`` at the prompt.
#
#       Will translate to:
#
#         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
#
#       There's no arbitrary limit to the number of backticks you
#       can use as delimiters. If you need three consecutive backticks
#       in your code, use four for delimiters, etc.
#
#   *   You can use spaces to get literal backticks at the edges:
#
#         ... type `` `bar` `` ...
#
#       Turns to:
#
#         ... type <code>`bar`</code> ...
#
# Parameter: the text of one block.
# Returns:   the text with code spans converted and their content encoded
#            by _EncodeCode().
#
    my $text = shift;

    $text =~ s@
            (`+)        # $1 = Opening run of `
            (.+?)       # $2 = The code block
            (?<!`)
            \1          # Matching closer
            (?!`)
        @
            my $c = "$2";
            $c =~ s/^[ \t]*//g; # leading whitespace
            $c =~ s/[ \t]*$//g; # trailing whitespace
            $c = _EncodeCode($c);
            "<code>$c</code>";
        @egsx;

    return $text;
}
sub _EncodeCode {
#
# Encode/escape certain characters inside Markdown code runs.
# The point is that in code, these characters are literals,
# and lose their special Markdown meanings.
#
# Parameter: the raw code text.
# Returns:   the encoded text.
#
    local $_ = shift;

    # Encode all ampersands; HTML entities are not
    # entities within a Markdown code span.
    s/&/&amp;/g;

    # Encode $'s, but only if we're running under Blosxom.
    # (Blosxom interpolates Perl variables in article bodies.)
    {
        no warnings 'once';
        if (defined($blosxom::version)) {
            s/\$/&#036;/g;
        }
    }


    # Do the angle bracket song and dance:
    s! <  !&lt;!gx;
    s! >  !&gt;!gx;

    # Now, escape characters that are magic in Markdown:
    # (the braces are backslash-escaped because an unescaped literal
    # left brace in a pattern is deprecated in newer perls)
    s! \* !$g_escape_table{'*'}!gx;
    s! _  !$g_escape_table{'_'}!gx;
    s! \{ !$g_escape_table{'{'}!gx;
    s! \} !$g_escape_table{'}'}!gx;
    s! \[ !$g_escape_table{'['}!gx;
    s! \] !$g_escape_table{']'}!gx;
    s! \\ !$g_escape_table{'\\'}!gx;

    return $_;
}
sub _DoItalicsAndBold {
#
# Converts **text** / __text__ to <strong> and *text* / _text_ to <em>.
# The delimiters must be directly next to non-space characters on the
# inside.
#
# Parameter: the text of one block.
# Returns:   the text with emphasis markup converted.
#
    my $text = shift;

    # <strong> must go first:
    $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
        {<strong>$2</strong>}gsx;

    $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
        {<em>$2</em>}gsx;

    return $text;
}
sub _DoBlockQuotes {
#
# Converts email-style quoted blocks (lines starting with ">") into
# <blockquote> elements. The quoted content is processed recursively
# through _RunBlockGamut().
#
# Parameter: the text to transform.
# Returns:   the text with block quotes converted.
#
    my $text = shift;

    $text =~ s{
          (                             # Wrap whole match in $1
            (
              ^[ \t]*>[ \t]?            # '>' at the start of a line
                .+\n                    # rest of the first line
              (.+\n)*                   # subsequent consecutive lines
              \n*                       # blanks
            )+
          )
        }{
            my $bq = $1;
            $bq =~ s/^[ \t]*>[ \t]?//gm;    # trim one level of quoting
            $bq =~ s/^[ \t]+$//mg;          # trim whitespace-only lines
            $bq = _RunBlockGamut($bq);      # recurse

            # Indent every line of the quoted content. This needs /m:
            # without it only the first line was indented, while the
            # <pre> fix-up below strips the indent from every line and so
            # ate two real leading spaces from code lines.
            $bq =~ s/^/  /mg;
            # These leading spaces screw with <pre> content, so we need to fix that:
            $bq =~ s{
                    (\s*<pre>.+?</pre>)
                }{
                    my $pre = $1;
                    $pre =~ s/^  //mg;
                    $pre;
                }egsx;

            "<blockquote>\n$bq\n</blockquote>\n\n";
        }egmx;


    return $text;
}
sub _FormParagraphs {
#
#   Params:
#       $text - string to process with html <p> tags
#
#   Returns: the text with each blank-line-separated chunk either wrapped
#   in <p>...</p> (after span-level processing) or, when the chunk is a key
#   of %g_html_blocks, replaced by the stored HTML block.
#
    my $text = shift;

    # Strip leading and trailing lines:
    $text =~ s/\A\n+//;
    $text =~ s/\n+\z//;

    my @grafs = split(/\n{2,}/, $text);

    # A named loop variable is used instead of $_ because the subs called
    # from here localise and modify $_ themselves.
    foreach my $graf (@grafs) {
        if (defined( $g_html_blocks{$graf} )) {
            # Unhashify HTML blocks
            $graf = $g_html_blocks{$graf};
        }
        else {
            # Wrap <p> tags.
            $graf = _RunSpanGamut($graf);
            $graf =~ s/^([ \t]*)/<p>/;
            $graf .= "</p>";
        }
    }

    return join "\n\n", @grafs;
}
sub _EncodeAmpsAndAngles {
#
# Smart processing for ampersands and angle brackets that need to be encoded.
#
# Parameter: the text to process.
# Returns:   the text with bare "&" turned into "&amp;" (ampersands that
#            already start an entity are left alone) and with "<" turned
#            into "&lt;" unless it is followed by a letter, "/", "?", "$"
#            or "!".
#
    my $text = shift;

    # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
    #   http://bumppo.net/projects/amputator/
    $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;

    # Encode naked <'s
    $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;

    return $text;
}
sub _EncodeBackslashEscapes {
#
#   Parameter:  String.
#   Returns:    The string, after processing the following backslash
#               escape sequences: a backslash followed by one of
#               \ ` * _ { } [ ] ( ) > # + - . !
#               Each sequence is replaced by the character's placeholder
#               from %g_escape_table.
#
    local $_ = shift;

    # Same character list, in the same order, as the one used to build
    # %g_escape_table. The backslash comes first in the list because
    # escaped backslashes must be processed before everything else.
    foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
        s/\\\Q$char\E/$g_escape_table{$char}/g;
    }

    return $_;
}
sub _DoAutoLinks {
#
# Turns <http://...>, <https://...> and <ftp://...> into links, and
# <address@domain> (optionally prefixed with "mailto:") into an
# obfuscated mailto link built by _EncodeEmailAddress().
#
# Parameter: the text of one block.
# Returns:   the text with automatic links converted.
#
    my $text = shift;

    $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;

    # Email addresses: <address@domain.foo>
    $text =~ s{
        <
        (?:mailto:)?
        (
            [-.\w]+
            \@
            [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
        )
        >
    }{
        _EncodeEmailAddress( _UnescapeSpecialChars($1) );
    }egix;

    return $text;
}
sub _EncodeEmailAddress {
#
#   Input: an email address, e.g. "foo@example.com"
#
#   Output: the email address as a mailto link, with each character
#       of the address encoded as either a decimal or hex entity, in
#       the hopes of foiling most address harvesting spam bots. E.g.:
#
#     <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
#       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
#       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
#
#   Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
#   mailing list: <http://tinyurl.com/yu7ue>
#
#   The output is random: the same address encodes differently on every
#   call. Perl seeds rand() by itself, so srand is not called here.
#
    my $addr = shift;

    my @encode = (
        sub { '&#' .                 ord(shift)   . ';' },   # decimal entity
        sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },   # hex entity
        sub {                            shift          },   # raw character
    );

    $addr = "mailto:" . $addr;

    $addr =~ s{(.)}{
        my $char = $1;
        if ( $char eq '@' ) {
            # this *must* be encoded. I insist.
            # "int rand 2" picks decimal or hex; the former "int rand 1"
            # was always 0, so the hex form was never used.
            $char = $encode[int rand 2]->($char);
        } elsif ( $char ne ':' ) {
            # leave ':' alone (to spot mailto: later)
            my $r = rand;
            # roughly 10% raw, 45% hex, 45% dec
            $char = (
                $r > .9   ?  $encode[2]->($char)  :
                $r < .45  ?  $encode[1]->($char)  :
                             $encode[0]->($char)
            );
        }
        $char;
    }gex;

    $addr = qq{<a href="$addr">$addr</a>};
    $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part

    return $addr;
}
sub _UnescapeSpecialChars {
#
# Swap back in all the special characters we've hidden.
#
# Parameter: the text containing %g_escape_table placeholders.
# Returns:   the text with every placeholder replaced by its character.
#
    my $text = shift;

    # keys() rather than each(): each() keeps its position in the hash
    # between calls, which is easy to trip over.
    foreach my $char (keys %g_escape_table) {
        my $hash = $g_escape_table{$char};
        $text =~ s/$hash/$char/g;
    }
    return $text;
}
sub _TokenizeHTML {
#
#   Parameter:  String containing HTML markup.
#   Returns:    Reference to an array of the tokens comprising the input
#               string. Each token is either a tag (possibly with nested,
#               tags contained therein, such as <a href="<MTFoo>">, or a
#               run of text between tags. Each element of the array is a
#               two-element array; the first is either 'tag' or 'text';
#               the second is the actual value.
#
#
#   Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
#       <http://www.bradchoate.com/past/mtregex.php>
#
    my $str = shift;
    my $pos = 0;                # end of the previous token
    my $len = length $str;
    my @tokens;

    # Tags nested inside tags are matched up to $depth levels deep by
    # repeating the same sub-pattern inside itself.
    my $depth = 6;
    my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x  $depth);
    my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) |  # comment
                   (?s: <\? .*? \?> ) |              # processing instruction
                   $nested_tags/ix;                   # nested tags

    while ($str =~ m/($match)/g) {
        my $whole_tag = $1;
        my $sec_start = pos $str;
        my $tag_start = $sec_start - length $whole_tag;
        if ($pos < $tag_start) {
            # Text between the previous token and this tag.
            push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
        }
        push @tokens, ['tag', $whole_tag];
        $pos = pos $str;
    }
    # Trailing text after the last tag (or the whole string, if no tags).
    push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
    return \@tokens;
}
sub _Outdent {
#
# Remove one level of line-leading tabs or spaces
#
# Parameter: the text to outdent.
# Returns:   the text with, on every line, one leading tab or up to
#            $g_tab_width leading spaces removed.
#
    my $text = shift;

    $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
    return $text;
}
sub _Detab {
#
# Cribbed from a post by Bart Lateur:
# <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
#
# Parameter: the text to process.
# Returns:   the text with each tab replaced by enough spaces to reach
#            the next multiple of $g_tab_width, counted from the start of
#            the line.
#
    my $text = shift;

    $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
    return $text;
}


# The file must end with a true value so that it can be loaded with
# require by plug-in hosts.
1;
__END__


=pod

=head1 NAME

B<Markdown>


=head1 SYNOPSIS

B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<--shortversion> ]
    [ I<file> ... ]


=head1 DESCRIPTION

Markdown is a text-to-HTML filter; it translates an easy-to-read /
easy-to-write structured text format into HTML. Markdown's text format
is most similar to that of plain text email, and supports features such
as headers, *emphasis*, code blocks, blockquotes, and links.

Markdown's syntax is designed not as a generic markup language, but
specifically to serve as a front-end to (X)HTML. You can use span-level
HTML tags anywhere in a Markdown document, and you can use block level
HTML tags (like <div> and <table>) as well.

For more information about Markdown's syntax, see:

    http://daringfireball.net/projects/markdown/


=head1 OPTIONS

Use "--" to end switch parsing. For example, to open a file named "-z", use:

    Markdown.pl -- -z

=over 4


=item B<--html4tags>

Use HTML 4 style for empty element tags, e.g.:

    <br>

instead of Markdown's default XHTML style tags, e.g.:

    <br />


=item B<-v>, B<--version>

Display Markdown's version number and copyright information.


=item B<-s>, B<--shortversion>

Display the short-form version number.


=back



=head1 BUGS

To file bug reports or feature requests (other than topics listed in the
Caveats section above) please send email to:

    support@daringfireball.net

Please include with your report: (1) the example input; (2) the output
you expected; (3) the output Markdown actually produced.


=head1 VERSION HISTORY

See the readme file for detailed release notes for this version.

1.0.1 - 14 Dec 2004

1.0 - 28 Aug 2004


=head1 AUTHOR

    John Gruber
    http://daringfireball.net

    PHP port and other contributions by Michel Fortin
    http://michelf.com


=head1 COPYRIGHT AND LICENSE

Copyright (c) 2003-2004 John Gruber
<http://daringfireball.net/>
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.

* Neither the name "Markdown" nor the names of its contributors may
  be used to endorse or promote products derived from this software
  without specific prior written permission.

This software is provided by the copyright holders and contributors "as
is" and any express or implied warranties, including, but not limited
to, the implied warranties of merchantability and fitness for a
particular purpose are disclaimed. In no event shall the copyright owner
or contributors be liable for any direct, indirect, incidental, special,
exemplary, or consequential damages (including, but not limited to,
procurement of substitute goods or services; loss of use, data, or
profits; or business interruption) however caused and on any theory of
liability, whether in contract, strict liability, or tort (including
negligence or otherwise) arising in any way out of the use of this
software, even if advised of the possibility of such damage.

=cut