markdown2.py

   1 #!/usr/bin/env python
   2 # Copyright (c) 2007-2008 ActiveState Corp.
   3 # License: MIT (http://www.opensource.org/licenses/mit-license.php)
   4
   5 r"""A fast and complete Python implementation of Markdown.
   6
   7 [from http://daringfireball.net/projects/markdown/]
   8 > Markdown is a text-to-HTML filter; it translates an easy-to-read /
   9 > easy-to-write structured text format into HTML.  Markdown's text
  10 > format is most similar to that of plain text email, and supports
  11 > features such as headers, *emphasis*, code blocks, blockquotes, and
  12 > links.
  13 >
  14 > Markdown's syntax is designed not as a generic markup language, but
  15 > specifically to serve as a front-end to (X)HTML. You can use span-level
  16 > HTML tags anywhere in a Markdown document, and you can use block level
  17 > HTML tags (like <div> and <table> as well).
  18
  19 Module usage:
  20
  21     >>> import markdown2
  22     >>> markdown2.markdown("*boo!*")  # or use `html = markdown_path(PATH)`
  23     u'<p><em>boo!</em></p>\n'
  24
  25     >>> markdowner = Markdown()
  26     >>> markdowner.convert("*boo!*")
  27     u'<p><em>boo!</em></p>\n'
  28     >>> markdowner.convert("**boom!**")
  29     u'<p><strong>boom!</strong></p>\n'
  30
  31 This implementation of Markdown implements the full "core" syntax plus a
  32 number of extras (e.g., code syntax coloring, footnotes) as described on
  33 <http://code.google.com/p/python-markdown2/wiki/Extras>.
  34 """
  35
  36 cmdln_desc = """A fast and complete Python implementation of Markdown, a
  37 text-to-HTML conversion tool for web writers.
  38 """
  39
  40 # Dev Notes:
  41 # - There is already a Python markdown processor
  42 #   (http://www.freewisdom.org/projects/python-markdown/).
  43 # - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm
  44 #   not yet sure if there implications with this. Compare 'pydoc sre'
  45 #   and 'perldoc perlre'.
  46
  47 __version_info__ = (1, 0, 1, 12, 'a') # first three nums match Markdown.pl
  48 __version__ = '1.0.1.12a'
  49 __author__ = "Trent Mick"
  50
  51 import os
  52 import sys
  53 from pprint import pprint
  54 import re
  55 import logging
  56 try:
  57     from hashlib import md5
  58 except ImportError:
  59     from md5 import md5
  60 import optparse
  61 from random import random
  62 import codecs
  63
  64
  65
  66 #---- Python version compat
  67
  68 if sys.version_info[:2] < (2,4):
  69     from sets import Set as set
  70     def reversed(sequence):
  71         for i in sequence[::-1]:
  72             yield i
  73     def _unicode_decode(s, encoding, errors='xmlcharrefreplace'):
  74         return unicode(s, encoding, errors)
  75 else:
  76     def _unicode_decode(s, encoding, errors='strict'):
  77         return s.decode(encoding, errors)
  78
  79
  80 #---- globals
  81
  82 DEBUG = False
  83 log = logging.getLogger("markdown")
  84
  85 DEFAULT_TAB_WIDTH = 4
  86
  87 # Table of hash values for escaped characters:
  88 def _escape_hash(s):
  89     # Lame attempt to avoid possible collision with someone actually
  90     # using the MD5 hexdigest of one of these chars in there text.
  91     # Other ideas: random.random(), uuid.uuid()
  92     #return md5(s).hexdigest()   # Markdown.pl effectively does this.
  93     return 'md5:'+md5(s).hexdigest()
  94 g_escape_table = dict([(ch, _escape_hash(ch))
  95                        for ch in '\\`*_{}[]()>#+-.!'])
  96
  97
  98
  99 #---- exceptions
 100
 101 class MarkdownError(Exception):
 102     pass
 103
 104
 105
 106 #---- public api
 107
 108 def markdown_path(path, encoding="utf-8",
 109                   html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
 110                   safe_mode=None, extras=None, link_patterns=None,
 111                   use_file_vars=False):
 112     text = codecs.open(path, 'r', encoding).read()
 113     return Markdown(html4tags=html4tags, tab_width=tab_width,
 114                     safe_mode=safe_mode, extras=extras,
 115                     link_patterns=link_patterns,
 116                     use_file_vars=use_file_vars).convert(text)
 117
 118 def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH,
 119              safe_mode=None, extras=None, link_patterns=None,
 120              use_file_vars=False):
 121     return Markdown(html4tags=html4tags, tab_width=tab_width,
 122                     safe_mode=safe_mode, extras=extras,
 123                     link_patterns=link_patterns,
 124                     use_file_vars=use_file_vars).convert(text)
 125
 126 class Markdown(object):
 127     # The dict of "extras" to enable in processing -- a mapping of
 128     # extra name to argument for the extra. Most extras do not have an
 129     # argument, in which case the value is None.
 130     #
 131     # This can be set via (a) subclassing and (b) the constructor
 132     # "extras" argument.
 133     extras = None
 134
 135     urls = None
 136     titles = None
 137     html_blocks = None
 138     html_spans = None
 139     html_removed_text = "[HTML_REMOVED]"  # for compat with markdown.py
 140
 141     # Used to track when we're inside an ordered or unordered list
 142     # (see _ProcessListItems() for details):
 143     list_level = 0
 144
 145     _ws_only_line_re = re.compile(r"^[ \t]+$", re.M)
 146
 147     def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
 148                  extras=None, link_patterns=None, use_file_vars=False):
 149         if html4tags:
 150             self.empty_element_suffix = ">"
 151         else:
 152             self.empty_element_suffix = " />"
 153         self.tab_width = tab_width
 154
 155         # For compatibility with earlier markdown2.py and with
 156         # markdown.py's safe_mode being a boolean,
 157         #   safe_mode == True -> "replace"
 158         if safe_mode is True:
 159             self.safe_mode = "replace"
 160         else:
 161             self.safe_mode = safe_mode
 162
 163         if self.extras is None:
 164             self.extras = {}
 165         elif not isinstance(self.extras, dict):
 166             self.extras = dict([(e, None) for e in self.extras])
 167         if extras:
 168             if not isinstance(extras, dict):
 169                 extras = dict([(e, None) for e in extras])
 170             self.extras.update(extras)
 171         assert isinstance(self.extras, dict)
 172         self._instance_extras = self.extras.copy()
 173         self.link_patterns = link_patterns
 174         self.use_file_vars = use_file_vars
 175         self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M)
 176
 177     def reset(self):
 178         self.urls = {}
 179         self.titles = {}
 180         self.html_blocks = {}
 181         self.html_spans = {}
 182         self.list_level = 0
 183         self.extras = self._instance_extras.copy()
 184         if "footnotes" in self.extras:
 185             self.footnotes = {}
 186             self.footnote_ids = []
 187
 188     def convert(self, text):
 189         """Convert the given text."""
 190         # Main function. The order in which other subs are called here is
 191         # essential. Link and image substitutions need to happen before
 192         # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
 193         # and <img> tags get encoded.
 194
 195         # Clear the global hashes. If we don't clear these, you get conflicts
 196         # from other articles when generating a page which contains more than
 197         # one article (e.g. an index page that shows the N most recent
 198         # articles):
 199         self.reset()
 200
 201         if not isinstance(text, unicode):
 202             #TODO: perhaps shouldn't presume UTF-8 for string input?
 203             text = unicode(text, 'utf-8')
 204
 205         if self.use_file_vars:
 206             # Look for emacs-style file variable hints.
 207             emacs_vars = self._get_emacs_vars(text)
 208             if "markdown-extras" in emacs_vars:
 209                 splitter = re.compile("[ ,]+")
 210                 for e in splitter.split(emacs_vars["markdown-extras"]):
 211                     if '=' in e:
 212                         ename, earg = e.split('=', 1)
 213                         try:
 214                             earg = int(earg)
 215                         except ValueError:
 216                             pass
 217                     else:
 218                         ename, earg = e, None
 219                     self.extras[ename] = earg
 220
 221         # Standardize line endings:
 222         text = re.sub("\r\n|\r", "\n", text)
 223
 224         # Make sure $text ends with a couple of newlines:
 225         text += "\n\n"
 226
 227         # Convert all tabs to spaces.
 228         text = self._detab(text)
 229
 230         # Strip any lines consisting only of spaces and tabs.
 231         # This makes subsequent regexen easier to write, because we can
 232         # match consecutive blank lines with /\n+/ instead of something
 233         # contorted like /[ \t]*\n+/ .
 234         text = self._ws_only_line_re.sub("", text)
 235
 236         if self.safe_mode:
 237             text = self._hash_html_spans(text)
 238
 239         # Turn block-level HTML blocks into hash entries
 240         text = self._hash_html_blocks(text, raw=True)
 241
 242         # Strip link definitions, store in hashes.
 243         if "footnotes" in self.extras:
 244             # Must do footnotes first because an unlucky footnote defn
 245             # looks like a link defn:
 246             #   [^4]: this "looks like a link defn"
 247             text = self._strip_footnote_definitions(text)
 248         text = self._strip_link_definitions(text)
 249
 250         text = self._run_block_gamut(text)
 251
 252         text = self._unescape_special_chars(text)
 253
 254         if "footnotes" in self.extras:
 255             text = self._add_footnotes(text)
 256
 257         if self.safe_mode:
 258             text = self._unhash_html_spans(text)
 259
 260         text += "\n"
 261         return text
 262
 263     _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE)
 264     # This regular expression is intended to match blocks like this:
 265     #    PREFIX Local Variables: SUFFIX
 266     #    PREFIX mode: Tcl SUFFIX
 267     #    PREFIX End: SUFFIX
 268     # Some notes:
 269     # - "[ \t]" is used instead of "\s" to specifically exclude newlines
 270     # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does
 271     #   not like anything other than Unix-style line terminators.
 272     _emacs_local_vars_pat = re.compile(r"""^
 273         (?P<prefix>(?:[^\r\n|\n|\r])*?)
 274         [\ \t]*Local\ Variables:[\ \t]*
 275         (?P<suffix>.*?)(?:\r\n|\n|\r)
 276         (?P<content>.*?\1End:)
 277         """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
 278
 279     def _get_emacs_vars(self, text):
 280         """Return a dictionary of emacs-style local variables.
 281
 282         Parsing is done loosely according to this spec (and according to
 283         some in-practice deviations from this):
 284         http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables
 285         """
 286         emacs_vars = {}
 287         SIZE = pow(2, 13) # 8kB
 288
 289         # Search near the start for a '-*-'-style one-liner of variables.
 290         head = text[:SIZE]
 291         if "-*-" in head:
 292             match = self._emacs_oneliner_vars_pat.search(head)
 293             if match:
 294                 emacs_vars_str = match.group(1)
 295                 assert '\n' not in emacs_vars_str
 296                 emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';')
 297                                   if s.strip()]
 298                 if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]:
 299                     # While not in the spec, this form is allowed by emacs:
 300                     #   -*- Tcl -*-
 301                     # where the implied "variable" is "mode". This form
 302                     # is only allowed if there are no other variables.
 303                     emacs_vars["mode"] = emacs_var_strs[0].strip()
 304                 else:
 305                     for emacs_var_str in emacs_var_strs:
 306                         try:
 307                             variable, value = emacs_var_str.strip().split(':', 1)
 308                         except ValueError:
 309                             log.debug("emacs variables error: malformed -*- "
 310                                       "line: %r", emacs_var_str)
 311                             continue
 312                         # Lowercase the variable name because Emacs allows "Mode"
 313                         # or "mode" or "MoDe", etc.
 314                         emacs_vars[variable.lower()] = value.strip()
 315
 316         tail = text[-SIZE:]
 317         if "Local Variables" in tail:
 318             match = self._emacs_local_vars_pat.search(tail)
 319             if match:
 320                 prefix = match.group("prefix")
 321                 suffix = match.group("suffix")
 322                 lines = match.group("content").splitlines(0)
 323                 #print "prefix=%r, suffix=%r, content=%r, lines: %s"\
 324                 #      % (prefix, suffix, match.group("content"), lines)
 325
 326                 # Validate the Local Variables block: proper prefix and suffix
 327                 # usage.
 328                 for i, line in enumerate(lines):
 329                     if not line.startswith(prefix):
 330                         log.debug("emacs variables error: line '%s' "
 331                                   "does not use proper prefix '%s'"
 332                                   % (line, prefix))
 333                         return {}
 334                     # Don't validate suffix on last line. Emacs doesn't care,
 335                     # neither should we.
 336                     if i != len(lines)-1 and not line.endswith(suffix):
 337                         log.debug("emacs variables error: line '%s' "
 338                                   "does not use proper suffix '%s'"
 339                                   % (line, suffix))
 340                         return {}
 341
 342                 # Parse out one emacs var per line.
 343                 continued_for = None
 344                 for line in lines[:-1]: # no var on the last line ("PREFIX End:")
 345                     if prefix: line = line[len(prefix):] # strip prefix
 346                     if suffix: line = line[:-len(suffix)] # strip suffix
 347                     line = line.strip()
 348                     if continued_for:
 349                         variable = continued_for
 350                         if line.endswith('\\'):
 351                             line = line[:-1].rstrip()
 352                         else:
 353                             continued_for = None
 354                         emacs_vars[variable] += ' ' + line
 355                     else:
 356                         try:
 357                             variable, value = line.split(':', 1)
 358                         except ValueError:
 359                             log.debug("local variables error: missing colon "
 360                                       "in local variables entry: '%s'" % line)
 361                             continue
 362                         # Do NOT lowercase the variable name, because Emacs only
 363                         # allows "mode" (and not "Mode", "MoDe", etc.) in this block.
 364                         value = value.strip()
 365                         if value.endswith('\\'):
 366                             value = value[:-1].rstrip()
 367                             continued_for = variable
 368                         else:
 369                             continued_for = None
 370                         emacs_vars[variable] = value
 371
 372         # Unquote values.
 373         for var, val in emacs_vars.items():
 374             if len(val) > 1 and (val.startswith('"') and val.endswith('"')
 375                or val.startswith('"') and val.endswith('"')):
 376                 emacs_vars[var] = val[1:-1]
 377
 378         return emacs_vars
 379
 380     # Cribbed from a post by Bart Lateur:
 381     # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
 382     _detab_re = re.compile(r'(.*?)\t', re.M)
 383     def _detab_sub(self, match):
 384         g1 = match.group(1)
 385         return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width))
 386     def _detab(self, text):
 387         r"""Remove (leading?) tabs from a file.
 388
 389             >>> m = Markdown()
 390             >>> m._detab("\tfoo")
 391             '    foo'
 392             >>> m._detab("  \tfoo")
 393             '    foo'
 394             >>> m._detab("\t  foo")
 395             '      foo'
 396             >>> m._detab("  foo")
 397             '  foo'
 398             >>> m._detab("  foo\n\tbar\tblam")
 399             '  foo\n    bar blam'
 400         """
 401         if '\t' not in text:
 402             return text
 403         return self._detab_re.subn(self._detab_sub, text)[0]
 404
 405     _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del'
 406     _strict_tag_block_re = re.compile(r"""
 407         (                       # save in \1
 408             ^                   # start of line  (with re.M)
 409             <(%s)               # start tag = \2
 410             \b                  # word break
 411             (.*\n)*?            # any number of lines, minimally matching
 412             </\2>               # the matching end tag
 413             [ \t]*              # trailing spaces/tabs
 414             (?=\n+|\Z)          # followed by a newline or end of document
 415         )
 416         """ % _block_tags_a,
 417         re.X | re.M)
 418
 419     _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math'
 420     _liberal_tag_block_re = re.compile(r"""
 421         (                       # save in \1
 422             ^                   # start of line  (with re.M)
 423             <(%s)               # start tag = \2
 424             \b                  # word break
 425             (.*\n)*?            # any number of lines, minimally matching
 426             .*</\2>             # the matching end tag
 427             [ \t]*              # trailing spaces/tabs
 428             (?=\n+|\Z)          # followed by a newline or end of document
 429         )
 430         """ % _block_tags_b,
 431         re.X | re.M)
 432
 433     def _hash_html_block_sub(self, match, raw=False):
 434         html = match.group(1)
 435         if raw and self.safe_mode:
 436             html = self._sanitize_html(html)
 437         key = _hash_text(html)
 438         self.html_blocks[key] = html
 439         return "\n\n" + key + "\n\n"
 440
 441     def _hash_html_blocks(self, text, raw=False):
 442         """Hashify HTML blocks
 443
 444         We only want to do this for block-level HTML tags, such as headers,
 445         lists, and tables. That's because we still want to wrap <p>s around
 446         "paragraphs" that are wrapped in non-block-level tags, such as anchors,
 447         phrase emphasis, and spans. The list of tags we're looking for is
 448         hard-coded.
 449
 450         @param raw {boolean} indicates if these are raw HTML blocks in
 451             the original source. It makes a difference in "safe" mode.
 452         """
 453         if '<' not in text:
 454             return text
 455
 456         # Pass `raw` value into our calls to self._hash_html_block_sub.
 457         hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw)
 458
 459         # First, look for nested blocks, e.g.:
 460         #   <div>
 461         #       <div>
 462         #       tags for inner block must be indented.
 463         #       </div>
 464         #   </div>
 465         #
 466         # The outermost tags must start at the left margin for this to match, and
 467         # the inner nested divs must be indented.
 468         # We need to do this before the next, more liberal match, because the next
 469         # match will start at the first `<div>` and stop at the first `</div>`.
 470         text = self._strict_tag_block_re.sub(hash_html_block_sub, text)
 471
 472         # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
 473         text = self._liberal_tag_block_re.sub(hash_html_block_sub, text)
 474
 475         # Special case just for <hr />. It was easier to make a special
 476         # case than to make the other regex more complicated.
 477         if "<hr" in text:
 478             _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width)
 479             text = _hr_tag_re.sub(hash_html_block_sub, text)
 480
 481         # Special case for standalone HTML comments:
 482         if "<!--" in text:
 483             start = 0
 484             while True:
 485                 # Delimiters for next comment block.
 486                 try:
 487                     start_idx = text.index("<!--", start)
 488                 except ValueError, ex:
 489                     break
 490                 try:
 491                     end_idx = text.index("-->", start_idx) + 3
 492                 except ValueError, ex:
 493                     break
 494
 495                 # Start position for next comment block search.
 496                 start = end_idx
 497
 498                 # Validate whitespace before comment.
 499                 if start_idx:
 500                     # - Up to `tab_width - 1` spaces before start_idx.
 501                     for i in range(self.tab_width - 1):
 502                         if text[start_idx - 1] != ' ':
 503                             break
 504                         start_idx -= 1
 505                         if start_idx == 0:
 506                             break
 507                     # - Must be preceded by 2 newlines or hit the start of
 508                     #   the document.
 509                     if start_idx == 0:
 510                         pass
 511                     elif start_idx == 1 and text[0] == '\n':
 512                         start_idx = 0  # to match minute detail of Markdown.pl regex
 513                     elif text[start_idx-2:start_idx] == '\n\n':
 514                         pass
 515                     else:
 516                         break
 517
 518                 # Validate whitespace after comment.
 519                 # - Any number of spaces and tabs.
 520                 while end_idx < len(text):
 521                     if text[end_idx] not in ' \t':
 522                         break
 523                     end_idx += 1
 524                 # - Must be following by 2 newlines or hit end of text.
 525                 if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'):
 526                     continue
 527
 528                 # Escape and hash (must match `_hash_html_block_sub`).
 529                 html = text[start_idx:end_idx]
 530                 if raw and self.safe_mode:
 531                     html = self._sanitize_html(html)
 532                 key = _hash_text(html)
 533                 self.html_blocks[key] = html
 534                 text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:]
 535
 536         if "xml" in self.extras:
 537             # Treat XML processing instructions and namespaced one-liner
 538             # tags as if they were block HTML tags. E.g., if standalone
 539             # (i.e. are their own paragraph), the following do not get
 540             # wrapped in a <p> tag:
 541             #    <?foo bar?>
 542             #
 543             #    <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/>
 544             _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width)
 545             text = _xml_oneliner_re.sub(hash_html_block_sub, text)
 546
 547         return text
 548
 549     def _strip_link_definitions(self, text):
 550         # Strips link definitions from text, stores the URLs and titles in
 551         # hash references.
 552         less_than_tab = self.tab_width - 1
 553
 554         # Link defs are in the form:
 555         #   [id]: url "optional title"
 556         _link_def_re = re.compile(r"""
 557             ^[ ]{0,%d}\[(.+)\]: # id = \1
 558               [ \t]*
 559               \n?               # maybe *one* newline
 560               [ \t]*
 561             <?(.+?)>?           # url = \2
 562               [ \t]*
 563             (?:
 564                 \n?             # maybe one newline
 565                 [ \t]*
 566                 (?<=\s)         # lookbehind for whitespace
 567                 ['"(]
 568                 ([^\n]*)        # title = \3
 569                 ['")]
 570                 [ \t]*
 571             )?  # title is optional
 572             (?:\n+|\Z)
 573             """ % less_than_tab, re.X | re.M | re.U)
 574         return _link_def_re.sub(self._extract_link_def_sub, text)
 575
 576     def _extract_link_def_sub(self, match):
 577         id, url, title = match.groups()
 578         key = id.lower()    # Link IDs are case-insensitive
 579         self.urls[key] = self._encode_amps_and_angles(url)
 580         if title:
 581             self.titles[key] = title.replace('"', '&quot;')
 582         return ""
 583
 584     def _extract_footnote_def_sub(self, match):
 585         id, text = match.groups()
 586         text = _dedent(text, skip_first_line=not text.startswith('\n')).strip()
 587         normed_id = re.sub(r'\W', '-', id)
 588         # Ensure footnote text ends with a couple newlines (for some
 589         # block gamut matches).
 590         self.footnotes[normed_id] = text + "\n\n"
 591         return ""
 592
 593     def _strip_footnote_definitions(self, text):
 594         """A footnote definition looks like this:
 595
 596             [^note-id]: Text of the note.
 597
 598                 May include one or more indented paragraphs.
 599
 600         Where,
 601         - The 'note-id' can be pretty much anything, though typically it
 602           is the number of the footnote.
 603         - The first paragraph may start on the next line, like so:
 604
 605             [^note-id]:
 606                 Text of the note.
 607         """
 608         less_than_tab = self.tab_width - 1
 609         footnote_def_re = re.compile(r'''
 610             ^[ ]{0,%d}\[\^(.+)\]:   # id = \1
 611             [ \t]*
 612             (                       # footnote text = \2
 613               # First line need not start with the spaces.
 614               (?:\s*.*\n+)
 615               (?:
 616                 (?:[ ]{%d} | \t)  # Subsequent lines must be indented.
 617                 .*\n+
 618               )*
 619             )
 620             # Lookahead for non-space at line-start, or end of doc.
 621             (?:(?=^[ ]{0,%d}\S)|\Z)
 622             ''' % (less_than_tab, self.tab_width, self.tab_width),
 623             re.X | re.M)
 624         return footnote_def_re.sub(self._extract_footnote_def_sub, text)
 625
 626
 627     _hr_res = [
 628         re.compile(r"^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$", re.M),
 629         re.compile(r"^[ ]{0,2}([ ]?\-[ ]?){3,}[ \t]*$", re.M),
 630         re.compile(r"^[ ]{0,2}([ ]?\_[ ]?){3,}[ \t]*$", re.M),
 631     ]
 632
 633     def _run_block_gamut(self, text):
 634         # These are all the transformations that form block-level
 635         # tags like paragraphs, headers, and list items.
 636
 637         text = self._do_headers(text)
 638
 639         # Do Horizontal Rules:
 640         hr = "\n<hr"+self.empty_element_suffix+"\n"
 641         for hr_re in self._hr_res:
 642             text = hr_re.sub(hr, text)
 643
 644         text = self._do_lists(text)
 645
 646         if "pyshell" in self.extras:
 647             text = self._prepare_pyshell_blocks(text)
 648
 649         text = self._do_code_blocks(text)
 650
 651         text = self._do_block_quotes(text)
 652
 653         # We already ran _HashHTMLBlocks() before, in Markdown(), but that
 654         # was to escape raw HTML in the original Markdown source. This time,
 655         # we're escaping the markup we've just created, so that we don't wrap
 656         # <p> tags around block-level tags.
 657         text = self._hash_html_blocks(text)
 658
 659         text = self._form_paragraphs(text)
 660
 661         return text
 662
 663     def _pyshell_block_sub(self, match):
 664         lines = match.group(0).splitlines(0)
 665         _dedentlines(lines)
 666         indent = ' ' * self.tab_width
 667         s = ('\n' # separate from possible cuddled paragraph
 668              + indent + ('\n'+indent).join(lines)
 669              + '\n\n')
 670         return s
 671
 672     def _prepare_pyshell_blocks(self, text):
 673         """Ensure that Python interactive shell sessions are put in
 674         code blocks -- even if not properly indented.
 675         """
 676         if ">>>" not in text:
 677             return text
 678
 679         less_than_tab = self.tab_width - 1
 680         _pyshell_block_re = re.compile(r"""
 681             ^([ ]{0,%d})>>>[ ].*\n   # first line
 682             ^(\1.*\S+.*\n)*         # any number of subsequent lines
 683             ^\n                     # ends with a blank line
 684             """ % less_than_tab, re.M | re.X)
 685
 686         return _pyshell_block_re.sub(self._pyshell_block_sub, text)
 687
 688     def _run_span_gamut(self, text):
 689         # These are all the transformations that occur *within* block-level
 690         # tags like paragraphs, headers, and list items.
 691
 692         text = self._do_code_spans(text)
 693
 694         text = self._escape_special_chars(text)
 695
 696         # Process anchor and image tags.
 697         text = self._do_links(text)
 698
 699         # Make links out of things like `<http://example.com/>`
 700         # Must come after _do_links(), because you can use < and >
 701         # delimiters in inline links like [this](<url>).
 702         text = self._do_auto_links(text)
 703
 704         if "link-patterns" in self.extras:
 705             text = self._do_link_patterns(text)
 706
 707         text = self._encode_amps_and_angles(text)
 708
 709         text = self._do_italics_and_bold(text)
 710
 711         # Do hard breaks:
 712         text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text)
 713
 714         return text
 715
 716     # "Sorta" because auto-links are identified as "tag" tokens.
 717     _sorta_html_tokenize_re = re.compile(r"""
 718         (
 719             # tag
 720             </?
 721             (?:\w+)                                     # tag name
 722             (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))*  # attributes
 723             \s*/?>
 724             |
 725             # auto-link (e.g., <http://www.activestate.com/>)
 726             <\w+[^>]*>
 727             |
 728             <!--.*?-->      # comment
 729             |
 730             <\?.*?\?>       # processing instruction
 731         )
 732         """, re.X)
 733
 734     def _escape_special_chars(self, text):
 735         # Python markdown note: the HTML tokenization here differs from
 736         # that in Markdown.pl, hence the behaviour for subtle cases can
 737         # differ (I believe the tokenizer here does a better job because
 738         # it isn't susceptible to unmatched '<' and '>' in HTML tags).
 739         # Note, however, that '>' is not allowed in an auto-link URL
 740         # here.
 741         escaped = []
 742         is_html_markup = False
 743         for token in self._sorta_html_tokenize_re.split(text):
 744             if is_html_markup:
 745                 # Within tags/HTML-comments/auto-links, encode * and _
 746                 # so they don't conflict with their use in Markdown for
 747                 # italics and strong.  We're replacing each such
 748                 # character with its corresponding MD5 checksum value;
 749                 # this is likely overkill, but it should prevent us from
 750                 # colliding with the escape values by accident.
 751                 escaped.append(token.replace('*', g_escape_table['*'])
 752                                     .replace('_', g_escape_table['_']))
 753             else:
 754                 escaped.append(self._encode_backslash_escapes(token))
 755             is_html_markup = not is_html_markup
 756         return ''.join(escaped)
 757
 758     def _hash_html_spans(self, text):
 759         # Used for safe_mode.
 760
 761         def _is_auto_link(s):
 762             if ':' in s and self._auto_link_re.match(s):
 763                 return True
 764             elif '@' in s and self._auto_email_link_re.match(s):
 765                 return True
 766             return False
 767
 768         tokens = []
 769         is_html_markup = False
 770         for token in self._sorta_html_tokenize_re.split(text):
 771             if is_html_markup and not _is_auto_link(token):
 772                 sanitized = self._sanitize_html(token)
 773                 key = _hash_text(sanitized)
 774                 self.html_spans[key] = sanitized
 775                 tokens.append(key)
 776             else:
 777                 tokens.append(token)
 778             is_html_markup = not is_html_markup
 779         return ''.join(tokens)
 780
 781     def _unhash_html_spans(self, text):
 782         for key, sanitized in self.html_spans.items():
 783             text = text.replace(key, sanitized)
 784         return text
 785
 786     def _sanitize_html(self, s):
 787         if self.safe_mode == "replace":
 788             return self.html_removed_text
 789         elif self.safe_mode == "escape":
 790             replacements = [
 791                 ('&', '&amp;'),
 792                 ('<', '&lt;'),
 793                 ('>', '&gt;'),
 794             ]
 795             for before, after in replacements:
 796                 s = s.replace(before, after)
 797             return s
 798         else:
 799             raise MarkdownError("invalid value for 'safe_mode': %r (must be "
 800                                 "'escape' or 'replace')" % self.safe_mode)
 801
 802     _tail_of_inline_link_re = re.compile(r'''
 803           # Match tail of: [text](/url/) or [text](/url/ "title")
 804           \(            # literal paren
 805             [ \t]*
 806             (?P<url>            # \1
 807                 <.*?>
 808                 |
 809                 .*?
 810             )
 811             [ \t]*
 812             (                   # \2
 813               (['"])            # quote char = \3
 814               (?P<title>.*?)
 815               \3                # matching quote
 816             )?                  # title is optional
 817           \)
 818         ''', re.X | re.S)
 819     _tail_of_reference_link_re = re.compile(r'''
 820           # Match tail of: [text][id]
 821           [ ]?          # one optional space
 822           (?:\n[ ]*)?   # one optional newline followed by spaces
 823           \[
 824             (?P<id>.*?)
 825           \]
 826         ''', re.X | re.S)
 827
 828     def _do_links(self, text):
 829         """Turn Markdown link shortcuts into XHTML <a> and <img> tags.
 830
 831         This is a combination of Markdown.pl's _DoAnchors() and
 832         _DoImages(). They are done together because that simplified the
 833         approach. It was necessary to use a different approach than
 834         Markdown.pl because of the lack of atomic matching support in
 835         Python's regex engine used in $g_nested_brackets.
 836         """
 837         MAX_LINK_TEXT_SENTINEL = 300
 838
 839         # `anchor_allowed_pos` is used to support img links inside
 840         # anchors, but not anchors inside anchors. An anchor's start
 841         # pos must be `>= anchor_allowed_pos`.
 842         anchor_allowed_pos = 0
 843
 844         curr_pos = 0
 845         while True: # Handle the next link.
 846             # The next '[' is the start of:
 847             # - an inline anchor:   [text](url "title")
 848             # - a reference anchor: [text][id]
 849             # - an inline img:      ![text](url "title")
 850             # - a reference img:    ![text][id]
 851             # - a footnote ref:     [^id]
 852             #   (Only if 'footnotes' extra enabled)
 853             # - a footnote defn:    [^id]: ...
 854             #   (Only if 'footnotes' extra enabled) These have already
 855             #   been stripped in _strip_footnote_definitions() so no
 856             #   need to watch for them.
 857             # - a link definition:  [id]: url "title"
 858             #   These have already been stripped in
 859             #   _strip_link_definitions() so no need to watch for them.
 860             # - not markup:         [...anything else...
 861             try:
 862                 start_idx = text.index('[', curr_pos)
 863             except ValueError:
 864                 break
 865             text_length = len(text)
 866
 867             # Find the matching closing ']'.
 868             # Markdown.pl allows *matching* brackets in link text so we
 869             # will here too. Markdown.pl *doesn't* currently allow
 870             # matching brackets in img alt text -- we'll differ in that
 871             # regard.
 872             bracket_depth = 0
 873             for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL,
 874                                             text_length)):
 875                 ch = text[p]
 876                 if ch == ']':
 877                     bracket_depth -= 1
 878                     if bracket_depth < 0:
 879                         break
 880                 elif ch == '[':
 881                     bracket_depth += 1
 882             else:
 883                 # Closing bracket not found within sentinel length.
 884                 # This isn't markup.
 885                 curr_pos = start_idx + 1
 886                 continue
 887             link_text = text[start_idx+1:p]
 888
 889             # Possibly a footnote ref?
 890             if "footnotes" in self.extras and link_text.startswith("^"):
 891                 normed_id = re.sub(r'\W', '-', link_text[1:])
 892                 if normed_id in self.footnotes:
 893                     self.footnote_ids.append(normed_id)
 894                     result = '<sup class="footnote-ref" id="fnref-%s">' \
 895                              '<a href="#fn-%s">%s</a></sup>' \
 896                              % (normed_id, normed_id, len(self.footnote_ids))
 897                     text = text[:start_idx] + result + text[p+1:]
 898                 else:
 899                     # This id isn't defined, leave the markup alone.
 900                     curr_pos = p+1
 901                 continue
 902
 903             # Now determine what this is by the remainder.
 904             p += 1
 905             if p == text_length:
 906                 return text
 907
 908             # Inline anchor or img?
 909             if text[p] == '(': # attempt at perf improvement
 910                 match = self._tail_of_inline_link_re.match(text, p)
 911                 if match:
 912                     # Handle an inline anchor or img.
 913                     is_img = start_idx > 0 and text[start_idx-1] == "!"
 914                     if is_img:
 915                         start_idx -= 1
 916
 917                     url, title = match.group("url"), match.group("title")
 918                     if url and url[0] == '<':
 919                         url = url[1:-1]  # '<url>' -> 'url'
 920                     # We've got to encode these to avoid conflicting
 921                     # with italics/bold.
 922                     url = url.replace('*', g_escape_table['*']) \
 923                              .replace('_', g_escape_table['_'])
 924                     if title:
 925                         title_str = ' title="%s"' \
 926                             % title.replace('*', g_escape_table['*']) \
 927                                    .replace('_', g_escape_table['_']) \
 928                                    .replace('"', '&quot;')
 929                     else:
 930                         title_str = ''
 931                     if is_img:
 932                         result = '<img src="%s" alt="%s"%s%s' \
 933                             % (url, link_text.replace('"', '&quot;'),
 934                                title_str, self.empty_element_suffix)
 935                         curr_pos = start_idx + len(result)
 936                         text = text[:start_idx] + result + text[match.end():]
 937                     elif start_idx >= anchor_allowed_pos:
 938                         result_head = '<a href="%s"%s>' % (url, title_str)
 939                         result = '%s%s</a>' % (result_head, link_text)
 940                         # <img> allowed from curr_pos on, <a> from
 941                         # anchor_allowed_pos on.
 942                         curr_pos = start_idx + len(result_head)
 943                         anchor_allowed_pos = start_idx + len(result)
 944                         text = text[:start_idx] + result + text[match.end():]
 945                     else:
 946                         # Anchor not allowed here.
 947                         curr_pos = start_idx + 1
 948                     continue
 949
 950             # Reference anchor or img?
 951             else:
 952                 match = self._tail_of_reference_link_re.match(text, p)
 953                 if match:
 954                     # Handle a reference-style anchor or img.
 955                     is_img = start_idx > 0 and text[start_idx-1] == "!"
 956                     if is_img:
 957                         start_idx -= 1
 958                     link_id = match.group("id").lower()
 959                     if not link_id:
 960                         link_id = link_text.lower()  # for links like [this][]
 961                     if link_id in self.urls:
 962                         url = self.urls[link_id]
 963                         # We've got to encode these to avoid conflicting
 964                         # with italics/bold.
 965                         url = url.replace('*', g_escape_table['*']) \
 966                                  .replace('_', g_escape_table['_'])
 967                         title = self.titles.get(link_id)
 968                         if title:
 969                             title = title.replace('*', g_escape_table['*']) \
 970                                          .replace('_', g_escape_table['_'])
 971                             title_str = ' title="%s"' % title
 972                         else:
 973                             title_str = ''
 974                         if is_img:
 975                             result = '<img src="%s" alt="%s"%s%s' \
 976                                 % (url, link_text.replace('"', '&quot;'),
 977                                    title_str, self.empty_element_suffix)
 978                             curr_pos = start_idx + len(result)
 979                             text = text[:start_idx] + result + text[match.end():]
 980                         elif start_idx >= anchor_allowed_pos:
 981                             result = '<a href="%s"%s>%s</a>' \
 982                                 % (url, title_str, link_text)
 983                             result_head = '<a href="%s"%s>' % (url, title_str)
 984                             result = '%s%s</a>' % (result_head, link_text)
 985                             # <img> allowed from curr_pos on, <a> from
 986                             # anchor_allowed_pos on.
 987                             curr_pos = start_idx + len(result_head)
 988                             anchor_allowed_pos = start_idx + len(result)
 989                             text = text[:start_idx] + result + text[match.end():]
 990                         else:
 991                             # Anchor not allowed here.
 992                             curr_pos = start_idx + 1
 993                     else:
 994                         # This id isn't defined, leave the markup alone.
 995                         curr_pos = match.end()
 996                     continue
 997
 998             # Otherwise, it isn't markup.
 999             curr_pos = start_idx + 1
1000
1001         return text
1002
1003
1004     _setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M)
1005     def _setext_h_sub(self, match):
1006         n = {"=": 1, "-": 2}[match.group(2)[0]]
1007         demote_headers = self.extras.get("demote-headers")
1008         if demote_headers:
1009             n = min(n + demote_headers, 6)
1010         return "<h%d>%s</h%d>\n\n" \
1011                % (n, self._run_span_gamut(match.group(1)), n)
1012
1013     _atx_h_re = re.compile(r'''
1014         ^(\#{1,6})  # \1 = string of #'s
1015         [ \t]*
1016         (.+?)       # \2 = Header text
1017         [ \t]*
1018         (?<!\\)     # ensure not an escaped trailing '#'
1019         \#*         # optional closing #'s (not counted)
1020         \n+
1021         ''', re.X | re.M)
1022     def _atx_h_sub(self, match):
1023         n = len(match.group(1))
1024         demote_headers = self.extras.get("demote-headers")
1025         if demote_headers:
1026             n = min(n + demote_headers, 6)
1027         return "<h%d>%s</h%d>\n\n" \
1028                % (n, self._run_span_gamut(match.group(2)), n)
1029
1030     def _do_headers(self, text):
1031         # Setext-style headers:
1032         #     Header 1
1033         #     ========
1034         #
1035         #     Header 2
1036         #     --------
1037         text = self._setext_h_re.sub(self._setext_h_sub, text)
1038
1039         # atx-style headers:
1040         #   # Header 1
1041         #   ## Header 2
1042         #   ## Header 2 with closing hashes ##
1043         #   ...
1044         #   ###### Header 6
1045         text = self._atx_h_re.sub(self._atx_h_sub, text)
1046
1047         return text
1048
1049
1050     _marker_ul_chars  = '*+-'
1051     _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars
1052     _marker_ul = '(?:[%s])' % _marker_ul_chars
1053     _marker_ol = r'(?:\d+\.)'
1054
1055     def _list_sub(self, match):
1056         lst = match.group(1)
1057         lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol"
1058         result = self._process_list_items(lst)
1059         if self.list_level:
1060             return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type)
1061         else:
1062             return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type)
1063
1064     def _do_lists(self, text):
1065         # Form HTML ordered (numbered) and unordered (bulleted) lists.
1066
1067         for marker_pat in (self._marker_ul, self._marker_ol):
1068             # Re-usable pattern to match any entire ul or ol list:
1069             less_than_tab = self.tab_width - 1
1070             whole_list = r'''
1071                 (                   # \1 = whole list
1072                   (                 # \2
1073                     [ ]{0,%d}
1074                     (%s)            # \3 = first list item marker
1075                     [ \t]+
1076                   )
1077                   (?:.+?)
1078                   (                 # \4
1079                       \Z
1080                     |
1081                       \n{2,}
1082                       (?=\S)
1083                       (?!           # Negative lookahead for another list item marker
1084                         [ \t]*
1085                         %s[ \t]+
1086                       )
1087                   )
1088                 )
1089             ''' % (less_than_tab, marker_pat, marker_pat)
1090
1091             # We use a different prefix before nested lists than top-level lists.
1092             # See extended comment in _process_list_items().
1093             #
1094             # Note: There's a bit of duplication here. My original implementation
1095             # created a scalar regex pattern as the conditional result of the test on
1096             # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
1097             # substitution once, using the scalar as the pattern. This worked,
1098             # everywhere except when running under MT on my hosting account at Pair
1099             # Networks. There, this caused all rebuilds to be killed by the reaper (or
1100             # perhaps they crashed, but that seems incredibly unlikely given that the
1101             # same script on the same server ran fine *except* under MT. I've spent
1102             # more time trying to figure out why this is happening than I'd like to
1103             # admit. My only guess, backed up by the fact that this workaround works,
1104             # is that Perl optimizes the substition when it can figure out that the
1105             # pattern will never change, and when this optimization isn't on, we run
1106             # afoul of the reaper. Thus, the slightly redundant code to that uses two
1107             # static s/// patterns rather than one conditional pattern.
1108
1109             if self.list_level:
1110                 sub_list_re = re.compile("^"+whole_list, re.X | re.M | re.S)
1111                 text = sub_list_re.sub(self._list_sub, text)
1112             else:
1113                 list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list,
1114                                      re.X | re.M | re.S)
1115                 text = list_re.sub(self._list_sub, text)
1116
1117         return text
1118
1119     _list_item_re = re.compile(r'''
1120         (\n)?               # leading line = \1
1121         (^[ \t]*)           # leading whitespace = \2
1122         (%s) [ \t]+         # list marker = \3
1123         ((?:.+?)            # list item text = \4
1124          (\n{1,2}))         # eols = \5
1125         (?= \n* (\Z | \2 (%s) [ \t]+))
1126         ''' % (_marker_any, _marker_any),
1127         re.M | re.X | re.S)
1128
1129     _last_li_endswith_two_eols = False
1130     def _list_item_sub(self, match):
1131         item = match.group(4)
1132         leading_line = match.group(1)
1133         leading_space = match.group(2)
1134         if leading_line or "\n\n" in item or self._last_li_endswith_two_eols:
1135             item = self._run_block_gamut(self._outdent(item))
1136         else:
1137             # Recursion for sub-lists:
1138             item = self._do_lists(self._outdent(item))
1139             if item.endswith('\n'):
1140                 item = item[:-1]
1141             item = self._run_span_gamut(item)
1142         self._last_li_endswith_two_eols = (len(match.group(5)) == 2)
1143         return "<li>%s</li>\n" % item
1144
1145     def _process_list_items(self, list_str):
1146         # Process the contents of a single ordered or unordered list,
1147         # splitting it into individual list items.
1148
1149         # The $g_list_level global keeps track of when we're inside a list.
1150         # Each time we enter a list, we increment it; when we leave a list,
1151         # we decrement. If it's zero, we're not in a list anymore.
1152         #
1153         # We do this because when we're not inside a list, we want to treat
1154         # something like this:
1155         #
1156         #       I recommend upgrading to version
1157         #       8. Oops, now this line is treated
1158         #       as a sub-list.
1159         #
1160         # As a single paragraph, despite the fact that the second line starts
1161         # with a digit-period-space sequence.
1162         #
1163         # Whereas when we're inside a list (or sub-list), that line will be
1164         # treated as the start of a sub-list. What a kludge, huh? This is
1165         # an aspect of Markdown's syntax that's hard to parse perfectly
1166         # without resorting to mind-reading. Perhaps the solution is to
1167         # change the syntax rules such that sub-lists must start with a
1168         # starting cardinal number; e.g. "1." or "a.".
1169         self.list_level += 1
1170         self._last_li_endswith_two_eols = False
1171         list_str = list_str.rstrip('\n') + '\n'
1172         list_str = self._list_item_re.sub(self._list_item_sub, list_str)
1173         self.list_level -= 1
1174         return list_str
1175
1176     def _get_pygments_lexer(self, lexer_name):
1177         try:
1178             from pygments import lexers, util
1179         except ImportError:
1180             return None
1181         try:
1182             return lexers.get_lexer_by_name(lexer_name)
1183         except util.ClassNotFound:
1184             return None
1185
1186     def _color_with_pygments(self, codeblock, lexer, **formatter_opts):
1187         import pygments
1188         import pygments.formatters
1189
1190         class HtmlCodeFormatter(pygments.formatters.HtmlFormatter):
1191             def _wrap_code(self, inner):
1192                 """A function for use in a Pygments Formatter which
1193                 wraps in <code> tags.
1194                 """
1195                 yield 0, "<code>"
1196                 for tup in inner:
1197                     yield tup
1198                 yield 0, "</code>"
1199
1200             def wrap(self, source, outfile):
1201                 """Return the source with a code, pre, and div."""
1202                 return self._wrap_div(self._wrap_pre(self._wrap_code(source)))
1203
1204         formatter = HtmlCodeFormatter(cssclass="codehilite", **formatter_opts)
1205         return pygments.highlight(codeblock, lexer, formatter)
1206
1207     def _code_block_sub(self, match):
1208         codeblock = match.group(1)
1209         codeblock = self._outdent(codeblock)
1210         codeblock = self._detab(codeblock)
1211         codeblock = codeblock.lstrip('\n')  # trim leading newlines
1212         codeblock = codeblock.rstrip()      # trim trailing whitespace
1213
1214         if "code-color" in self.extras and codeblock.startswith(":::"):
1215             lexer_name, rest = codeblock.split('\n', 1)
1216             lexer_name = lexer_name[3:].strip()
1217             lexer = self._get_pygments_lexer(lexer_name)
1218             codeblock = rest.lstrip("\n")   # Remove lexer declaration line.
1219             if lexer:
1220                 formatter_opts = self.extras['code-color'] or {}
1221                 colored = self._color_with_pygments(codeblock, lexer,
1222                                                     **formatter_opts)
1223                 return "\n\n%s\n\n" % colored
1224
1225         codeblock = self._encode_code(codeblock)
1226         return "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock
1227
1228     def _do_code_blocks(self, text):
1229         """Process Markdown `<pre><code>` blocks."""
1230         code_block_re = re.compile(r'''
1231             (?:\n\n|\A)
1232             (               # $1 = the code block -- one or more lines, starting with a space/tab
1233               (?:
1234                 (?:[ ]{%d} | \t)  # Lines must start with a tab or a tab-width of spaces
1235                 .*\n+
1236               )+
1237             )
1238             ((?=^[ ]{0,%d}\S)|\Z)   # Lookahead for non-space at line-start, or end of doc
1239             ''' % (self.tab_width, self.tab_width),
1240             re.M | re.X)
1241
1242         return code_block_re.sub(self._code_block_sub, text)
1243
1244
1245     # Rules for a code span:
1246     # - backslash escapes are not interpreted in a code span
1247     # - to include one or or a run of more backticks the delimiters must
1248     #   be a longer run of backticks
1249     # - cannot start or end a code span with a backtick; pad with a
1250     #   space and that space will be removed in the emitted HTML
1251     # See `test/tm-cases/escapes.text` for a number of edge-case
1252     # examples.
1253     _code_span_re = re.compile(r'''
1254             (?<!\\)
1255             (`+)        # \1 = Opening run of `
1256             (?!`)       # See Note A test/tm-cases/escapes.text
1257             (.+?)       # \2 = The code block
1258             (?<!`)
1259             \1          # Matching closer
1260             (?!`)
1261         ''', re.X | re.S)
1262
1263     def _code_span_sub(self, match):
1264         c = match.group(2).strip(" \t")
1265         c = self._encode_code(c)
1266         return "<code>%s</code>" % c
1267
1268     def _do_code_spans(self, text):
1269         #   *   Backtick quotes are used for <code></code> spans.
1270         #
1271         #   *   You can use multiple backticks as the delimiters if you want to
1272         #       include literal backticks in the code span. So, this input:
1273         #
1274         #         Just type ``foo `bar` baz`` at the prompt.
1275         #
1276         #       Will translate to:
1277         #
1278         #         <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1279         #
1280         #       There's no arbitrary limit to the number of backticks you
1281         #       can use as delimters. If you need three consecutive backticks
1282         #       in your code, use four for delimiters, etc.
1283         #
1284         #   *   You can use spaces to get literal backticks at the edges:
1285         #
1286         #         ... type `` `bar` `` ...
1287         #
1288         #       Turns to:
1289         #
1290         #         ... type <code>`bar`</code> ...
1291         return self._code_span_re.sub(self._code_span_sub, text)
1292
1293     def _encode_code(self, text):
1294         """Encode/escape certain characters inside Markdown code runs.
1295         The point is that in code, these characters are literals,
1296         and lose their special Markdown meanings.
1297         """
1298         replacements = [
1299             # Encode all ampersands; HTML entities are not
1300             # entities within a Markdown code span.
1301             ('&', '&amp;'),
1302             # Do the angle bracket song and dance:
1303             ('<', '&lt;'),
1304             ('>', '&gt;'),
1305             # Now, escape characters that are magic in Markdown:
1306             ('*', g_escape_table['*']),
1307             ('_', g_escape_table['_']),
1308             ('{', g_escape_table['{']),
1309             ('}', g_escape_table['}']),
1310             ('[', g_escape_table['[']),
1311             (']', g_escape_table[']']),
1312             ('\\', g_escape_table['\\']),
1313         ]
1314         for before, after in replacements:
1315             text = text.replace(before, after)
1316         return text
1317
1318     _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S)
1319     _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S)
1320     _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S)
1321     _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S)
1322     def _do_italics_and_bold(self, text):
1323         # <strong> must go first:
1324         if "code-friendly" in self.extras:
1325             text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text)
1326             text = self._code_friendly_em_re.sub(r"<em>\1</em>", text)
1327         else:
1328             text = self._strong_re.sub(r"<strong>\2</strong>", text)
1329             text = self._em_re.sub(r"<em>\2</em>", text)
1330         return text
1331
1332
1333     _block_quote_re = re.compile(r'''
1334         (                           # Wrap whole match in \1
1335           (
1336             ^[ \t]*>[ \t]?          # '>' at the start of a line
1337               .+\n                  # rest of the first line
1338             (.+\n)*                 # subsequent consecutive lines
1339             \n*                     # blanks
1340           )+
1341         )
1342         ''', re.M | re.X)
1343     _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M);
1344
1345     _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S)
1346     def _dedent_two_spaces_sub(self, match):
1347         return re.sub(r'(?m)^  ', '', match.group(1))
1348
1349     def _block_quote_sub(self, match):
1350         bq = match.group(1)
1351         bq = self._bq_one_level_re.sub('', bq)  # trim one level of quoting
1352         bq = self._ws_only_line_re.sub('', bq)  # trim whitespace-only lines
1353         bq = self._run_block_gamut(bq)          # recurse
1354
1355         bq = re.sub('(?m)^', '  ', bq)
1356         # These leading spaces screw with <pre> content, so we need to fix that:
1357         bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq)
1358
1359         return "<blockquote>\n%s\n</blockquote>\n\n" % bq
1360
1361     def _do_block_quotes(self, text):
1362         if '>' not in text:
1363             return text
1364         return self._block_quote_re.sub(self._block_quote_sub, text)
1365
1366     def _form_paragraphs(self, text):
1367         # Strip leading and trailing lines:
1368         text = text.strip('\n')
1369
1370         # Wrap <p> tags.
1371         grafs = re.split(r"\n{2,}", text)
1372         for i, graf in enumerate(grafs):
1373             if graf in self.html_blocks:
1374                 # Unhashify HTML blocks
1375                 grafs[i] = self.html_blocks[graf]
1376             else:
1377                 # Wrap <p> tags.
1378                 graf = self._run_span_gamut(graf)
1379                 grafs[i] = "<p>" + graf.lstrip(" \t") + "</p>"
1380
1381         return "\n\n".join(grafs)
1382
1383     def _add_footnotes(self, text):
1384         if self.footnotes:
1385             footer = [
1386                 '<div class="footnotes">',
1387                 '<hr' + self.empty_element_suffix,
1388                 '<ol>',
1389             ]
1390             for i, id in enumerate(self.footnote_ids):
1391                 if i != 0:
1392                     footer.append('')
1393                 footer.append('<li id="fn-%s">' % id)
1394                 footer.append(self._run_block_gamut(self.footnotes[id]))
1395                 backlink = ('<a href="#fnref-%s" '
1396                     'class="footnoteBackLink" '
1397                     'title="Jump back to footnote %d in the text.">'
1398                     '&#8617;</a>' % (id, i+1))
1399                 if footer[-1].endswith("</p>"):
1400                     footer[-1] = footer[-1][:-len("</p>")] \
1401                         + '&nbsp;' + backlink + "</p>"
1402                 else:
1403                     footer.append("\n<p>%s</p>" % backlink)
1404                 footer.append('</li>')
1405             footer.append('</ol>')
1406             footer.append('</div>')
1407             return text + '\n\n' + '\n'.join(footer)
1408         else:
1409             return text
1410
1411     # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1412     #   http://bumppo.net/projects/amputator/
1413     _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)')
1414     _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I)
1415     _naked_gt_re = re.compile(r'''(?<![a-z?!/'"-])>''', re.I)
1416
1417     def _encode_amps_and_angles(self, text):
1418         # Smart processing for ampersands and angle brackets that need
1419         # to be encoded.
1420         text = self._ampersand_re.sub('&amp;', text)
1421
1422         # Encode naked <'s
1423         text = self._naked_lt_re.sub('&lt;', text)
1424
1425         # Encode naked >'s
1426         # Note: Other markdown implementations (e.g. Markdown.pl, PHP
1427         # Markdown) don't do this.
1428         text = self._naked_gt_re.sub('&gt;', text)
1429         return text
1430
1431     def _encode_backslash_escapes(self, text):
1432         for ch, escape in g_escape_table.items():
1433             text = text.replace("\\"+ch, escape)
1434         return text
1435
1436     _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I)
1437     def _auto_link_sub(self, match):
1438         g1 = match.group(1)
1439         return '<a href="%s">%s</a>' % (g1, g1)
1440
1441     _auto_email_link_re = re.compile(r"""
1442           <
1443            (?:mailto:)?
1444           (
1445               [-.\w]+
1446               \@
1447               [-\w]+(\.[-\w]+)*\.[a-zA-Z]+
1448           )
1449           >
1450         """, re.I | re.X | re.U)
1451     def _auto_email_link_sub(self, match):
1452         return self._encode_email_address(
1453             self._unescape_special_chars(match.group(1)))
1454
1455     def _do_auto_links(self, text):
1456         text = self._auto_link_re.sub(self._auto_link_sub, text)
1457         text = self._auto_email_link_re.sub(self._auto_email_link_sub, text)
1458         return text
1459
1460     def _encode_email_address(self, addr):
1461         #  Input: an email address, e.g. "foo@example.com"
1462         #
1463         #  Output: the email address as a mailto link, with each character
1464         #      of the address encoded as either a decimal or hex entity, in
1465         #      the hopes of foiling most address harvesting spam bots. E.g.:
1466         #
1467         #    <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1468         #       x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1469         #       &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1470         #
1471         #  Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1472         #  mailing list: <http://tinyurl.com/yu7ue>
1473         chars = [_xml_encode_email_char_at_random(ch)
1474                  for ch in "mailto:" + addr]
1475         # Strip the mailto: from the visible part.
1476         addr = '<a href="%s">%s</a>' \
1477                % (''.join(chars), ''.join(chars[7:]))
1478         return addr
1479
1480     def _do_link_patterns(self, text):
1481         """Caveat emptor: there isn't much guarding against link
1482         patterns being formed inside other standard Markdown links, e.g.
1483         inside a [link def][like this].
1484
1485         Dev Notes: *Could* consider prefixing regexes with a negative
1486         lookbehind assertion to attempt to guard against this.
1487         """
1488         link_from_hash = {}
1489         for regex, repl in self.link_patterns:
1490             replacements = []
1491             for match in regex.finditer(text):
1492                 if hasattr(repl, "__call__"):
1493                     href = repl(match)
1494                 else:
1495                     href = match.expand(repl)
1496                 replacements.append((match.span(), href))
1497             for (start, end), href in reversed(replacements):
1498                 escaped_href = (
1499                     href.replace('"', '&quot;')  # b/c of attr quote
1500                         # To avoid markdown <em> and <strong>:
1501                         .replace('*', g_escape_table['*'])
1502                         .replace('_', g_escape_table['_']))
1503                 link = '<a href="%s">%s</a>' % (escaped_href, text[start:end])
1504                 hash = md5(link).hexdigest()
1505                 link_from_hash[hash] = link
1506                 text = text[:start] + hash + text[end:]
1507         for hash, link in link_from_hash.items():
1508             text = text.replace(hash, link)
1509         return text
1510
1511     def _unescape_special_chars(self, text):
1512         # Swap back in all the special characters we've hidden.
1513         for ch, hash in g_escape_table.items():
1514             text = text.replace(hash, ch)
1515         return text
1516
1517     def _outdent(self, text):
1518         # Remove one level of line-leading tabs or spaces
1519         return self._outdent_re.sub('', text)
1520
1521
1522 class MarkdownWithExtras(Markdown):
1523     """A markdowner class that enables most extras:
1524
1525     - footnotes
1526     - code-color (only has effect if 'pygments' Python module on path)
1527
1528     These are not included:
1529     - pyshell (specific to Python-related documenting)
1530     - code-friendly (because it *disables* part of the syntax)
1531     - link-patterns (because you need to specify some actual
1532       link-patterns anyway)
1533     """
1534     extras = ["footnotes", "code-color"]
1535
1536
1537 #---- internal support functions
1538
1539 # From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549
1540 def _curry(*args, **kwargs):
1541     function, args = args[0], args[1:]
1542     def result(*rest, **kwrest):
1543         combined = kwargs.copy()
1544         combined.update(kwrest)
1545         return function(*args + rest, **combined)
1546     return result
1547
1548 # Recipe: regex_from_encoded_pattern (1.0)
1549 def _regex_from_encoded_pattern(s):
1550     """'foo'    -> re.compile(re.escape('foo'))
1551        '/foo/'  -> re.compile('foo')
1552        '/foo/i' -> re.compile('foo', re.I)
1553     """
1554     if s.startswith('/') and s.rfind('/') != 0:
1555         # Parse it: /PATTERN/FLAGS
1556         idx = s.rfind('/')
1557         pattern, flags_str = s[1:idx], s[idx+1:]
1558         flag_from_char = {
1559             "i": re.IGNORECASE,
1560             "l": re.LOCALE,
1561             "s": re.DOTALL,
1562             "m": re.MULTILINE,
1563             "u": re.UNICODE,
1564         }
1565         flags = 0
1566         for char in flags_str:
1567             try:
1568                 flags |= flag_from_char[char]
1569             except KeyError:
1570                 raise ValueError("unsupported regex flag: '%s' in '%s' "
1571                                  "(must be one of '%s')"
1572                                  % (char, s, ''.join(flag_from_char.keys())))
1573         return re.compile(s[1:idx], flags)
1574     else: # not an encoded regex
1575         return re.compile(re.escape(s))
1576
1577 # Recipe: dedent (0.1.2)
1578 def _dedentlines(lines, tabsize=8, skip_first_line=False):
1579     """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
1580
1581         "lines" is a list of lines to dedent.
1582         "tabsize" is the tab width to use for indent width calculations.
1583         "skip_first_line" is a boolean indicating if the first line should
1584             be skipped for calculating the indent width and for dedenting.
1585             This is sometimes useful for docstrings and similar.
1586
1587     Same as dedent() except operates on a sequence of lines. Note: the
1588     lines list is modified **in-place**.
1589     """
1590     DEBUG = False
1591     if DEBUG:
1592         print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
1593               % (tabsize, skip_first_line)
1594     indents = []
1595     margin = None
1596     for i, line in enumerate(lines):
1597         if i == 0 and skip_first_line: continue
1598         indent = 0
1599         for ch in line:
1600             if ch == ' ':
1601                 indent += 1
1602             elif ch == '\t':
1603                 indent += tabsize - (indent % tabsize)
1604             elif ch in '\r\n':
1605                 continue # skip all-whitespace lines
1606             else:
1607                 break
1608         else:
1609             continue # skip all-whitespace lines
1610         if DEBUG: print "dedent: indent=%d: %r" % (indent, line)
1611         if margin is None:
1612             margin = indent
1613         else:
1614             margin = min(margin, indent)
1615     if DEBUG: print "dedent: margin=%r" % margin
1616
1617     if margin is not None and margin > 0:
1618         for i, line in enumerate(lines):
1619             if i == 0 and skip_first_line: continue
1620             removed = 0
1621             for j, ch in enumerate(line):
1622                 if ch == ' ':
1623                     removed += 1
1624                 elif ch == '\t':
1625                     removed += tabsize - (removed % tabsize)
1626                 elif ch in '\r\n':
1627                     if DEBUG: print "dedent: %r: EOL -> strip up to EOL" % line
1628                     lines[i] = lines[i][j:]
1629                     break
1630                 else:
1631                     raise ValueError("unexpected non-whitespace char %r in "
1632                                      "line %r while removing %d-space margin"
1633                                      % (ch, line, margin))
1634                 if DEBUG:
1635                     print "dedent: %r: %r -> removed %d/%d"\
1636                           % (line, ch, removed, margin)
1637                 if removed == margin:
1638                     lines[i] = lines[i][j+1:]
1639                     break
1640                 elif removed > margin:
1641                     lines[i] = ' '*(removed-margin) + lines[i][j+1:]
1642                     break
1643             else:
1644                 if removed:
1645                     lines[i] = lines[i][removed:]
1646     return lines
1647
1648 def _dedent(text, tabsize=8, skip_first_line=False):
1649     """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text
1650
1651         "text" is the text to dedent.
1652         "tabsize" is the tab width to use for indent width calculations.
1653         "skip_first_line" is a boolean indicating if the first line should
1654             be skipped for calculating the indent width and for dedenting.
1655             This is sometimes useful for docstrings and similar.
1656
1657     textwrap.dedent(s), but don't expand tabs to spaces
1658     """
1659     lines = text.splitlines(1)
1660     _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line)
1661     return ''.join(lines)
1662
1663
1664 class _memoized(object):
1665    """Decorator that caches a function's return value each time it is called.
1666    If called later with the same arguments, the cached value is returned, and
1667    not re-evaluated.
1668
1669    http://wiki.python.org/moin/PythonDecoratorLibrary
1670    """
1671    def __init__(self, func):
1672       self.func = func
1673       self.cache = {}
1674    def __call__(self, *args):
1675       try:
1676          return self.cache[args]
1677       except KeyError:
1678          self.cache[args] = value = self.func(*args)
1679          return value
1680       except TypeError:
1681          # uncachable -- for instance, passing a list as an argument.
1682          # Better to not cache than to blow up entirely.
1683          return self.func(*args)
1684    def __repr__(self):
1685       """Return the function's docstring."""
1686       return self.func.__doc__
1687
1688
1689 def _xml_oneliner_re_from_tab_width(tab_width):
1690     """Standalone XML processing instruction regex."""
1691     return re.compile(r"""
1692         (?:
1693             (?<=\n\n)       # Starting after a blank line
1694             |               # or
1695             \A\n?           # the beginning of the doc
1696         )
1697         (                           # save in $1
1698             [ ]{0,%d}
1699             (?:
1700                 <\?\w+\b\s+.*?\?>   # XML processing instruction
1701                 |
1702                 <\w+:\w+\b\s+.*?/>  # namespaced single tag
1703             )
1704             [ \t]*
1705             (?=\n{2,}|\Z)       # followed by a blank line or end of document
1706         )
1707         """ % (tab_width - 1), re.X)
1708 _xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width)
1709
1710 def _hr_tag_re_from_tab_width(tab_width):
1711      return re.compile(r"""
1712         (?:
1713             (?<=\n\n)       # Starting after a blank line
1714             |               # or
1715             \A\n?           # the beginning of the doc
1716         )
1717         (                       # save in \1
1718             [ ]{0,%d}
1719             <(hr)               # start tag = \2
1720             \b                  # word break
1721             ([^<>])*?           #
1722             /?>                 # the matching end tag
1723             [ \t]*
1724             (?=\n{2,}|\Z)       # followed by a blank line or end of document
1725         )
1726         """ % (tab_width - 1), re.X)
1727 _hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width)
1728
1729
1730 def _xml_encode_email_char_at_random(ch):
1731     r = random()
1732     # Roughly 10% raw, 45% hex, 45% dec.
1733     # '@' *must* be encoded. I [John Gruber] insist.
1734     if r > 0.9 and ch != "@":
1735         return ch
1736     elif r < 0.45:
1737         # The [1:] is to drop leading '0': 0x63 -> x63
1738         return '&#%s;' % hex(ord(ch))[1:]
1739     else:
1740         return '&#%s;' % ord(ch)
1741
1742 def _hash_text(text):
1743     return 'md5:'+md5(text.encode("utf-8")).hexdigest()
1744
1745
1746 #---- mainline
1747
1748 class _NoReflowFormatter(optparse.IndentedHelpFormatter):
1749     """An optparse formatter that does NOT reflow the description."""
1750     def format_description(self, description):
1751         return description or ""
1752
1753 def _test():
1754     import doctest
1755     doctest.testmod()
1756
1757 def main(argv=None):
1758     if argv is None:
1759         argv = sys.argv
1760     if not logging.root.handlers:
1761         logging.basicConfig()
1762
1763     usage = "usage: %prog [PATHS...]"
1764     version = "%prog "+__version__
1765     parser = optparse.OptionParser(prog="markdown2", usage=usage,
1766         version=version, description=cmdln_desc,
1767         formatter=_NoReflowFormatter())
1768     parser.add_option("-v", "--verbose", dest="log_level",
1769                       action="store_const", const=logging.DEBUG,
1770                       help="more verbose output")
1771     parser.add_option("--encoding",
1772                       help="specify encoding of text content")
1773     parser.add_option("--html4tags", action="store_true", default=False,
1774                       help="use HTML 4 style for empty element tags")
1775     parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode",
1776                       help="sanitize literal HTML: 'escape' escapes "
1777                            "HTML meta chars, 'replace' replaces with an "
1778                            "[HTML_REMOVED] note")
1779     parser.add_option("-x", "--extras", action="append",
1780                       help="Turn on specific extra features (not part of "
1781                            "the core Markdown spec). Supported values: "
1782                            "'code-friendly' disables _/__ for emphasis; "
1783                            "'code-color' adds code-block syntax coloring; "
1784                            "'link-patterns' adds auto-linking based on patterns; "
1785                            "'footnotes' adds the footnotes syntax;"
1786                            "'xml' passes one-liner processing instructions and namespaced XML tags;"
1787                            "'pyshell' to put unindented Python interactive shell sessions in a <code> block.")
1788     parser.add_option("--use-file-vars",
1789                       help="Look for and use Emacs-style 'markdown-extras' "
1790                            "file var to turn on extras. See "
1791                            "<http://code.google.com/p/python-markdown2/wiki/Extras>.")
1792     parser.add_option("--link-patterns-file",
1793                       help="path to a link pattern file")
1794     parser.add_option("--self-test", action="store_true",
1795                       help="run internal self-tests (some doctests)")
1796     parser.add_option("--compare", action="store_true",
1797                       help="run against Markdown.pl as well (for testing)")
1798     parser.set_defaults(log_level=logging.INFO, compare=False,
1799                         encoding="utf-8", safe_mode=None, use_file_vars=False)
1800     opts, paths = parser.parse_args()
1801     log.setLevel(opts.log_level)
1802
1803     if opts.self_test:
1804         return _test()
1805
1806     if opts.extras:
1807         extras = {}
1808         for s in opts.extras:
1809             splitter = re.compile("[,;: ]+")
1810             for e in splitter.split(s):
1811                 if '=' in e:
1812                     ename, earg = e.split('=', 1)
1813                     try:
1814                         earg = int(earg)
1815                     except ValueError:
1816                         pass
1817                 else:
1818                     ename, earg = e, None
1819                 extras[ename] = earg
1820     else:
1821         extras = None
1822
1823     if opts.link_patterns_file:
1824         link_patterns = []
1825         f = open(opts.link_patterns_file)
1826         try:
1827             for i, line in enumerate(f.readlines()):
1828                 if not line.strip(): continue
1829                 if line.lstrip().startswith("#"): continue
1830                 try:
1831                     pat, href = line.rstrip().rsplit(None, 1)
1832                 except ValueError:
1833                     raise MarkdownError("%s:%d: invalid link pattern line: %r"
1834                                         % (opts.link_patterns_file, i+1, line))
1835                 link_patterns.append(
1836                     (_regex_from_encoded_pattern(pat), href))
1837         finally:
1838             f.close()
1839     else:
1840         link_patterns = None
1841
1842     from os.path import join, dirname, abspath
1843     markdown_pl = join(dirname(dirname(abspath(__file__))), "test",
1844                        "Markdown.pl")
1845     for path in paths:
1846         if opts.compare:
1847             print "==== Markdown.pl ===="
1848             perl_cmd = 'perl %s "%s"' % (markdown_pl, path)
1849             o = os.popen(perl_cmd)
1850             perl_html = o.read()
1851             o.close()
1852             sys.stdout.write(perl_html)
1853             print "==== markdown2.py ===="
1854         html = markdown_path(path, encoding=opts.encoding,
1855                              html4tags=opts.html4tags,
1856                              safe_mode=opts.safe_mode,
1857                              extras=extras, link_patterns=link_patterns,
1858                              use_file_vars=opts.use_file_vars)
1859         sys.stdout.write(
1860             html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace'))
1861         if opts.compare:
1862             print "==== match? %r ====" % (perl_html == html)
1863
1864
1865 if __name__ == "__main__":
1866     sys.exit( main(sys.argv) )
1867