lib/markdown.py

   1 #!/usr/bin/env python
   2
   3 SPEED_TEST = 0
   4
   5 """
   6 ====================================================================
   7 IF YOU ARE LOOKING TO EXTEND MARKDOWN, SEE THE "FOOTNOTES" SECTION
   8 ====================================================================
   9
  10 Python-Markdown
  11 ===============
  12
  13 Converts Markdown to HTML.  Basic usage as a module:
  14
  15     import markdown
  16     html = markdown.markdown(your_text_string)
  17
  18 Started by [Manfred Stienstra](http://www.dwerg.net/).  Continued and
  19 maintained  by [Yuri Takhteyev](http://www.freewisdom.org).
  20
  21 Project website: http://www.freewisdom.org/projects/python-markdown
  22 Contact: yuri [at] freewisdom.org
  23
  24 License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
  25
  26 Version: 1.5 (May 15, 2006)
  27
  28 For changelog, see end of file
  29 """
  30
  31 import re, sys, os, random
  32
  33 # set debug level: 3 none, 2 critical, 1 informative, 0 all
  34 (VERBOSE, INFO, CRITICAL, NONE) = range(4)
  35
  36 MESSAGE_THRESHOLD = CRITICAL
  37
  38 def message(level, text) :
  39     if level >= MESSAGE_THRESHOLD :
  40         print text
  41
  42
  43 # --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
  44
  45 # all tabs will be expanded to up to this many spaces
  46 TAB_LENGTH = 4
  47 ENABLE_ATTRIBUTES = 1
  48 SMART_EMPHASIS = 1
  49
  50 # --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
  51
  52 FN_BACKLINK_TEXT = "zz1337820767766393qq"
  53 # a template for html placeholders
  54 HTML_PLACEHOLDER_PREFIX = "qaodmasdkwaspemas"
  55 HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%dajkqlsmdqpakldnzsdfls"
  56
  57 BLOCK_LEVEL_ELEMENTS = ['p', 'div', 'blockquote', 'pre', 'table',
  58                         'dl', 'ol', 'ul', 'script', 'noscript',
  59                         'form', 'fieldset', 'iframe', 'math', 'ins',
  60                         'del', 'hr', 'hr/']
  61
  62 def is_block_level (tag) :
  63     return ( (tag in BLOCK_LEVEL_ELEMENTS) or
  64              (tag[0] == 'h' and tag[1] in "0123456789") )
  65
  66 """
  67 ======================================================================
  68 ========================== NANODOM ===================================
  69 ======================================================================
  70
  71 The three classes below implement some of the most basic DOM
  72 methods.  I use this instead of minidom because I need a simpler
  73 functionality and do not want to require additional libraries.
  74
  75 Importantly, NanoDom does not do normalization, which is what we
  76 want. It also adds extra white space when converting DOM to string
  77 """
  78
  79
  80 class Document :
  81
  82     def appendChild(self, child) :
  83         self.documentElement = child
  84         child.parent = self
  85         self.entities = {}
  86
  87     def createElement(self, tag, textNode=None) :
  88         el = Element(tag)
  89         el.doc = self
  90         if textNode :
  91             el.appendChild(self.createTextNode(textNode))
  92         return el
  93
  94     def createTextNode(self, text) :
  95         node = TextNode(text)
  96         node.doc = self
  97         return node
  98
  99     def createEntityReference(self, entity):
 100         if entity not in self.entities:
 101             self.entities[entity] = EntityReference(entity)
 102         return self.entities[entity]
 103
 104     def toxml (self) :
 105         return self.documentElement.toxml()
 106
 107     def normalizeEntities(self, text) :
 108
 109         pairs = [ #("&", "&amp;"),
 110                   ("<", "&lt;"),
 111                   (">", "&gt;"),
 112                   ("\"", "&quot;")]
 113
 114         for old, new in pairs :
 115             text = text.replace(old, new)
 116         return text
 117
 118     def find(self, test) :
 119         return self.documentElement.find(test)
 120
 121     def unlink(self) :
 122         self.documentElement.unlink()
 123         self.documentElement = None
 124
 125
 126 class Element :
 127
 128     type = "element"
 129
 130     def __init__ (self, tag) :
 131
 132         self.nodeName = tag
 133         self.attributes = []
 134         self.attribute_values = {}
 135         self.childNodes = []
 136
 137     def unlink(self) :
 138         for child in self.childNodes :
 139             if child.type == "element" :
 140                 child.unlink()
 141         self.childNodes = None
 142
 143     def setAttribute(self, attr, value) :
 144         if not attr in self.attributes :
 145             self.attributes.append(attr)
 146
 147         self.attribute_values[attr] = value
 148
 149     def insertChild(self, position, child) :
 150         self.childNodes.insert(position, child)
 151         child.parent = self
 152
 153     def removeChild(self, child) :
 154         self.childNodes.remove(child)
 155
 156     def replaceChild(self, oldChild, newChild) :
 157         position = self.childNodes.index(oldChild)
 158         self.removeChild(oldChild)
 159         self.insertChild(position, newChild)
 160
 161     def appendChild(self, child) :
 162         self.childNodes.append(child)
 163         child.parent = self
 164
 165     def handleAttributes(self) :
 166         pass
 167
 168     def find(self, test, depth=0) :
 169         """ Returns a list of descendants that pass the test function """
 170         matched_nodes = []
 171         for child in self.childNodes :
 172             if test(child) :
 173                 matched_nodes.append(child)
 174             if child.type == "element" :
 175                 matched_nodes += child.find(test, depth+1)
 176         return matched_nodes
 177
 178     def toxml(self):
 179         if ENABLE_ATTRIBUTES :
 180             for child in self.childNodes:
 181                 child.handleAttributes()
 182         buffer = ""
 183         if self.nodeName in ['h1', 'h2', 'h3', 'h4'] :
 184             buffer += "\n"
 185         elif self.nodeName in ['li'] :
 186             buffer += "\n "
 187         buffer += "<" + self.nodeName
 188         for attr in self.attributes :
 189             value = self.attribute_values[attr]
 190             value = self.doc.normalizeEntities(value)
 191             buffer += ' %s="%s"' % (attr, value)
 192         if self.childNodes or self.nodeName in ['blockquote']:
 193             buffer += ">"
 194             for child in self.childNodes :
 195                 buffer += child.toxml()
 196             if self.nodeName == 'p' :
 197                 buffer += "\n"
 198             elif self.nodeName == 'li' :
 199                 buffer += "\n "
 200             buffer += "</%s>" % self.nodeName
 201         else :
 202             buffer += "/>"
 203         if self.nodeName in ['p', 'li', 'ul', 'ol',
 204                              'h1', 'h2', 'h3', 'h4'] :
 205             buffer += "\n"
 206
 207         return buffer
 208
 209
 210 class TextNode :
 211
 212     type = "text"
 213     attrRegExp = re.compile(r'\{@([^\}]*)=([^\}]*)}') # {@id=123}
 214
 215     def __init__ (self, text) :
 216         self.value = text
 217
 218     def attributeCallback(self, match) :
 219         self.parent.setAttribute(match.group(1), match.group(2))
 220
 221     def handleAttributes(self) :
 222         self.value = self.attrRegExp.sub(self.attributeCallback, self.value)
 223
 224     def toxml(self) :
 225         text = self.value
 226         if not text.startswith(HTML_PLACEHOLDER_PREFIX):
 227             if self.parent.nodeName == "p" :
 228                 text = text.replace("\n", "\n   ")
 229             elif (self.parent.nodeName == "li"
 230                   and self.parent.childNodes[0]==self):
 231                 text = "\n     " + text.replace("\n", "\n     ")
 232         text = self.doc.normalizeEntities(text)
 233         return text
 234
 235
 236 class EntityReference:
 237
 238     type = "entity_ref"
 239
 240     def __init__(self, entity):
 241         self.entity = entity
 242
 243     def handleAttributes(self):
 244         pass
 245
 246     def toxml(self):
 247         return "&" + self.entity + ";"
 248
 249
 250 """
 251 ======================================================================
 252 ========================== PRE-PROCESSORS ============================
 253 ======================================================================
 254
 255 Preprocessors munge source text before we start doing anything too
 256 complicated.
 257
 258 Each preprocessor implements a "run" method that takes a pointer to
 259 a list of lines of the document, modifies it as necessary and
 260 returns either the same pointer or a pointer to a new list.
 261 """
 262
 263 class HeaderPreprocessor :
 264
 265     """
 266        Replaces underlined headers with hashed headers to avoid
 267        the nead for lookahead later.
 268     """
 269
 270     def run (self, lines) :
 271
 272         for i in range(len(lines)) :
 273             if not lines[i] :
 274                 continue
 275
 276             if lines[i].startswith("#") :
 277                 lines.insert(i+1, "\n")
 278
 279             if (i+1 <= len(lines)
 280                   and lines[i+1]
 281                   and lines[i+1][0] in ['-', '=']) :
 282
 283                 underline = lines[i+1].strip()
 284
 285                 if underline == "="*len(underline) :
 286                     lines[i] = "# " + lines[i].strip()
 287                     lines[i+1] = ""
 288                 elif underline == "-"*len(underline) :
 289                     lines[i] = "## " + lines[i].strip()
 290                     lines[i+1] = ""
 291
 292         return lines
 293
 294 HEADER_PREPROCESSOR = HeaderPreprocessor()
 295
 296 class LinePreprocessor :
 297     """Deals with HR lines (needs to be done before processing lists)"""
 298
 299     def run (self, lines) :
 300         for i in range(len(lines)) :
 301             if self._isLine(lines[i]) :
 302                 lines[i] = "<hr />"
 303         return lines
 304
 305     def _isLine(self, block) :
 306         """Determines if a block should be replaced with an <HR>"""
 307         if block.startswith("    ") : return 0  # a code block
 308         text = "".join([x for x in block if not x.isspace()])
 309         if len(text) <= 2 :
 310             return 0
 311         for pattern in ['isline1', 'isline2', 'isline3'] :
 312             m = RE.regExp[pattern].match(text)
 313             if (m and m.group(1)) :
 314                 return 1
 315         else:
 316             return 0
 317
 318 LINE_PREPROCESSOR = LinePreprocessor()
 319
 320
 321 class LineBreaksPreprocessor :
 322     """Replaces double spaces at the end of the lines with <br/ >."""
 323
 324     def run (self, lines) :
 325         for i in range(len(lines)) :
 326             if (lines[i].endswith("  ")
 327                 and not RE.regExp['tabbed'].match(lines[i]) ):
 328                 lines[i] += "<br />"
 329         return lines
 330
 331 LINE_BREAKS_PREPROCESSOR = LineBreaksPreprocessor()
 332
 333
 334 class HtmlBlockPreprocessor :
 335     """Removes html blocks from self.lines"""
 336
 337     def run (self, lines) :
 338         new_blocks = []
 339         text = "\n".join(lines)
 340         for block in text.split("\n\n") :
 341             if block.startswith("\n") :
 342                 block = block[1:]
 343             if ( (block.startswith("<") and block.rstrip().endswith(">"))
 344                  and (block[1] in ["!", "?", "@", "%"]
 345                       or is_block_level( block[1:].replace(">", " ")
 346                                          .split()[0].lower()))) :
 347                 new_blocks.append(
 348                     self.stash.store(block.strip()))
 349             else :
 350                 new_blocks.append(block)
 351         return "\n\n".join(new_blocks).split("\n")
 352
 353 HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
 354
 355
 356 class ReferencePreprocessor :
 357
 358     def run (self, lines) :
 359         new_text = [];
 360         for line in lines:
 361             m = RE.regExp['reference-def'].match(line)
 362             if m:
 363                 id = m.group(2).strip().lower()
 364                 title = dequote(m.group(4).strip()) #.replace('"', "&quot;")
 365                 self.references[id] = (m.group(3), title)
 366             else:
 367                 new_text.append(line)
 368         return new_text #+ "\n"
 369
 370 REFERENCE_PREPROCESSOR = ReferencePreprocessor()
 371
 372 """
 373 ======================================================================
 374 ========================== INLINE PATTERNS ===========================
 375 ======================================================================
 376
 377 Inline patterns such as *emphasis* are handled by means of auxiliary
 378 objects, one per pattern.  Each pattern object uses a single regular
 379 expression and needs support the following methods:
 380
 381   pattern.getCompiledRegExp() - returns a regular expression
 382
 383   pattern.handleMatch(m, doc) - takes a match object and returns
 384                                 a NanoDom node (as a part of the provided
 385                                 doc) or None
 386
All of python markdown's built-in patterns subclass from BasePattern,
but you can add additional patterns that don't.

Also note that all the regular expressions used by inline must
capture the whole block.  For this reason, they all start with
'^(.*)' and end with '(.*)$'.  For the built-in expressions,
BasePattern takes care of adding the "^(.*)" and "(.*)$".
 394
 395 Finally, the order in which regular expressions are applied is very
 396 important - e.g. if we first replace http://.../ links with <a> tags
 397 and _then_ try to replace inline html, we would end up with a mess.
 398 So, we apply the expressions in the following order:
 399
 400        * escape and backticks have to go before everything else, so
 401          that we can preempt any markdown patterns by escaping them.
 402
 403        * then we handle auto-links (must be done before inline html)
 404
 405        * then we handle inline HTML.  At this point we will simply
 406          replace all inline HTML strings with a placeholder and add
 407          the actual HTML to a hash.
 408
 409        * then inline images (must be done before links)
 410
 411        * then bracketed links, first regular then reference-style
 412
 413        * finally we apply strong and emphasis
 414 """
 415
 416 NOBRACKET = r'[^\]\[]*'
 417 BRK = ( r'\[('
 418         + (NOBRACKET + r'(\['+NOBRACKET)*6
 419         + (NOBRACKET+ r'\])*'+NOBRACKET)*6
 420         + NOBRACKET + r')\]' )
 421
 422 BACKTICK_RE = r'\`([^\`]*)\`'                    # `e= m*c^2`
 423 DOUBLE_BACKTICK_RE =  r'\`\`(.*)\`\`'            # ``e=f("`")``
 424 ESCAPE_RE = r'\\(.)'                             # \<
 425 EMPHASIS_RE = r'\*([^\*]*)\*'                    # *emphasis*
 426 STRONG_RE = r'\*\*(.*)\*\*'                      # **strong**
 427 STRONG_EM_RE = r'\*\*\*([^_]*)\*\*\*'            # ***strong***
 428
 429 if SMART_EMPHASIS:
 430     EMPHASIS_2_RE = r'(?<!\S)_(\S[^_]*)_'        # _emphasis_
 431 else :
 432     EMPHASIS_2_RE = r'_([^_]*)_'                 # _emphasis_
 433
 434 STRONG_2_RE = r'__([^_]*)__'                     # __strong__
 435 STRONG_EM_2_RE = r'___([^_]*)___'                # ___strong___
 436
 437 LINK_RE = BRK + r'\s*\(([^\)]*)\)'               # [text](url)
 438 LINK_ANGLED_RE = BRK + r'\s*\(<([^\)]*)>\)'      # [text](<url>)
 439 IMAGE_LINK_RE = r'\!' + BRK + r'\s*\(([^\)]*)\)' # ![alttxt](http://x.com/)
 440 REFERENCE_RE = BRK+ r'\s*\[([^\]]*)\]'           # [Google][3]
 441 IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2]
 442 NOT_STRONG_RE = r'( \* )'                        # stand-alone * or _
 443 AUTOLINK_RE = r'<(http://[^>]*)>'                # <http://www.123.com>
 444 AUTOMAIL_RE = r'<([^> ]*@[^> ]*)>'               # <me@example.com>
 445 HTML_RE = r'(\<[^\>]*\>)'                        # <...>
 446 ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'                # &amp;
 447
 448 class BasePattern:
 449
 450     def __init__ (self, pattern) :
 451         self.pattern = pattern
 452         self.compiled_re = re.compile("^(.*)%s(.*)$" % pattern, re.DOTALL)
 453
 454     def getCompiledRegExp (self) :
 455         return self.compiled_re
 456
 457 class SimpleTextPattern (BasePattern) :
 458
 459     def handleMatch(self, m, doc) :
 460         return doc.createTextNode(m.group(2))
 461
 462 class SimpleTagPattern (BasePattern):
 463
 464     def __init__ (self, pattern, tag) :
 465         BasePattern.__init__(self, pattern)
 466         self.tag = tag
 467
 468     def handleMatch(self, m, doc) :
 469         el = doc.createElement(self.tag)
 470         el.appendChild(doc.createTextNode(m.group(2)))
 471         return el
 472
 473 class BacktickPattern (BasePattern):
 474
 475     def __init__ (self, pattern):
 476         BasePattern.__init__(self, pattern)
 477         self.tag = "code"
 478
 479     def handleMatch(self, m, doc) :
 480         el = doc.createElement(self.tag)
 481         text = m.group(2).strip()
 482         text = text.replace("&", "&amp;")
 483         el.appendChild(doc.createTextNode(text))
 484         return el
 485
 486
 487 class DoubleTagPattern (SimpleTagPattern) :
 488
 489     def handleMatch(self, m, doc) :
 490         tag1, tag2 = self.tag.split(",")
 491         el1 = doc.createElement(tag1)
 492         el2 = doc.createElement(tag2)
 493         el1.appendChild(el2)
 494         el2.appendChild(doc.createTextNode(m.group(2)))
 495         return el1
 496
 497
 498 class HtmlPattern (BasePattern):
 499
 500     def handleMatch (self, m, doc) :
 501         place_holder = self.stash.store(m.group(2))
 502         return doc.createTextNode(place_holder)
 503
 504
 505 class LinkPattern (BasePattern):
 506
 507     def handleMatch(self, m, doc) :
 508         el = doc.createElement('a')
 509         el.appendChild(doc.createTextNode(m.group(2)))
 510         parts = m.group(9).split()
 511         # We should now have [], [href], or [href, title]
 512         if parts :
 513             el.setAttribute('href', parts[0])
 514         else :
 515             el.setAttribute('href', "")
 516         if len(parts) > 1 :
 517             # we also got a title
 518             title = " ".join(parts[1:]).strip()
 519             title = dequote(title) #.replace('"', "&quot;")
 520             el.setAttribute('title', title)
 521         return el
 522
 523
 524 class ImagePattern (BasePattern):
 525
 526     def handleMatch(self, m, doc):
 527         el = doc.createElement('img')
 528         src_parts = m.group(9).split()
 529         el.setAttribute('src', src_parts[0])
 530         if len(src_parts) > 1 :
 531             el.setAttribute('title', dequote(" ".join(src_parts[1:])))
 532         if ENABLE_ATTRIBUTES :
 533             text = doc.createTextNode(m.group(2))
 534             el.appendChild(text)
 535             text.handleAttributes()
 536             truealt = text.value
 537             el.childNodes.remove(text)
 538         else:
 539             truealt = m.group(2)
 540         el.setAttribute('alt', truealt)
 541         return el
 542
 543 class ReferencePattern (BasePattern):
 544
 545     def handleMatch(self, m, doc):
 546         if m.group(9) :
 547             id = m.group(9).lower()
 548         else :
 549             # if we got something like "[Google][]"
 550             # we'll use "google" as the id
 551             id = m.group(2).lower()
 552         if not self.references.has_key(id) : # ignore undefined refs
 553             return None
 554         href, title = self.references[id]
 555         text = m.group(2)
 556         return self.makeTag(href, title, text, doc)
 557
 558     def makeTag(self, href, title, text, doc):
 559         el = doc.createElement('a')
 560         el.setAttribute('href', href)
 561         if title :
 562             el.setAttribute('title', title)
 563         el.appendChild(doc.createTextNode(text))
 564         return el
 565
 566
 567 class ImageReferencePattern (ReferencePattern):
 568
 569     def makeTag(self, href, title, text, doc):
 570         el = doc.createElement('img')
 571         el.setAttribute('src', href)
 572         if title :
 573             el.setAttribute('title', title)
 574         el.setAttribute('alt', text)
 575         return el
 576
 577
 578 class AutolinkPattern (BasePattern):
 579
 580     def handleMatch(self, m, doc):
 581         el = doc.createElement('a')
 582         el.setAttribute('href', m.group(2))
 583         el.appendChild(doc.createTextNode(m.group(2)))
 584         return el
 585
 586 class AutomailPattern (BasePattern):
 587
 588     def handleMatch(self, m, doc) :
 589         el = doc.createElement('a')
 590         email = m.group(2)
 591         if email.startswith("mailto:"):
 592             email = email[len("mailto:"):]
 593         for letter in email:
 594             entity = doc.createEntityReference("#%d" % ord(letter))
 595             el.appendChild(entity)
 596         mailto = "mailto:" + email
 597         mailto = "".join(['&#%d;' % ord(letter) for letter in mailto])
 598         el.setAttribute('href', mailto)
 599         return el
 600
 601 ESCAPE_PATTERN          = SimpleTextPattern(ESCAPE_RE)
 602 NOT_STRONG_PATTERN      = SimpleTextPattern(NOT_STRONG_RE)
 603
 604 BACKTICK_PATTERN        = BacktickPattern(BACKTICK_RE)
 605 DOUBLE_BACKTICK_PATTERN = BacktickPattern(DOUBLE_BACKTICK_RE)
 606 STRONG_PATTERN          = SimpleTagPattern(STRONG_RE, 'strong')
 607 STRONG_PATTERN_2        = SimpleTagPattern(STRONG_2_RE, 'strong')
 608 EMPHASIS_PATTERN        = SimpleTagPattern(EMPHASIS_RE, 'em')
 609 EMPHASIS_PATTERN_2      = SimpleTagPattern(EMPHASIS_2_RE, 'em')
 610
 611 STRONG_EM_PATTERN       = DoubleTagPattern(STRONG_EM_RE, 'strong,em')
 612 STRONG_EM_PATTERN_2     = DoubleTagPattern(STRONG_EM_2_RE, 'strong,em')
 613
 614 LINK_PATTERN            = LinkPattern(LINK_RE)
 615 LINK_ANGLED_PATTERN     = LinkPattern(LINK_ANGLED_RE)
 616 IMAGE_LINK_PATTERN      = ImagePattern(IMAGE_LINK_RE)
 617 IMAGE_REFERENCE_PATTERN = ImageReferencePattern(IMAGE_REFERENCE_RE)
 618 REFERENCE_PATTERN       = ReferencePattern(REFERENCE_RE)
 619
 620 HTML_PATTERN            = HtmlPattern(HTML_RE)
 621 ENTITY_PATTERN          = HtmlPattern(ENTITY_RE)
 622
 623 AUTOLINK_PATTERN        = AutolinkPattern(AUTOLINK_RE)
 624 AUTOMAIL_PATTERN        = AutomailPattern(AUTOMAIL_RE)
 625
 626
 627 """
 628 ======================================================================
 629 ========================== POST-PROCESSORS ===========================
 630 ======================================================================
 631
Markdown also allows post-processors, which are similar to
preprocessors in that they need to implement a "run" method.  Unlike
pre-processors, they take a NanoDom document as a parameter and work
with that.

There are currently no standard post-processors, but the footnote
extension below uses one.
 639 """
 640 """
 641 ======================================================================
 642 ========================== MISC AUXILIARY CLASSES ====================
 643 ======================================================================
 644 """
 645
 646 class HtmlStash :
 647     """This class is used for stashing HTML objects that we extract
 648         in the beginning and replace with place-holders."""
 649
 650     def __init__ (self) :
 651         self.html_counter = 0 # for counting inline html segments
 652         self.rawHtmlBlocks=[]
 653
 654     def store(self, html) :
 655         """Saves an HTML segment for later reinsertion.  Returns a
 656            placeholder string that needs to be inserted into the
 657            document.
 658
 659            @param html: an html segment
 660            @returns : a placeholder string """
 661         self.rawHtmlBlocks.append(html)
 662         placeholder = HTML_PLACEHOLDER % self.html_counter
 663         self.html_counter += 1
 664         return placeholder
 665
 666
 667 class BlockGuru :
 668
 669     def _findHead(self, lines, fn, allowBlank=0) :
 670
 671         """Functional magic to help determine boundaries of indented
 672            blocks.
 673
 674            @param lines: an array of strings
 675            @param fn: a function that returns a substring of a string
 676                       if the string matches the necessary criteria
 677            @param allowBlank: specifies whether it's ok to have blank
 678                       lines between matching functions
 679            @returns: a list of post processes items and the unused
 680                       remainder of the original list"""
 681
 682         items = []
 683         item = -1
 684
 685         i = 0 # to keep track of where we are
 686
 687         for line in lines :
 688
 689             if not line.strip() and not allowBlank:
 690                 return items, lines[i:]
 691
 692             if not line.strip() and allowBlank:
 693                 # If we see a blank line, this _might_ be the end
 694                 i += 1
 695
 696                 # Find the next non-blank line
 697                 for j in range(i, len(lines)) :
 698                     if lines[j].strip() :
 699                         next = lines[j]
 700                         break
 701                 else :
 702                     # There is no more text => this is the end
 703                     break
 704
 705                 # Check if the next non-blank line is still a part of the list
 706
 707                 part = fn(next)
 708
 709                 if part :
 710                     items.append("")
 711                     continue
 712                 else :
 713                     break # found end of the list
 714
 715             part = fn(line)
 716
 717             if part :
 718                 items.append(part)
 719                 i += 1
 720                 continue
 721             else :
 722                 return items, lines[i:]
 723         else :
 724             i += 1
 725
 726         return items, lines[i:]
 727
 728
 729     def detabbed_fn(self, line) :
 730         """ An auxiliary method to be passed to _findHead """
 731         m = RE.regExp['tabbed'].match(line)
 732         if m:
 733             return m.group(4)
 734         else :
 735             return None
 736
 737
 738     def detectTabbed(self, lines) :
 739
 740         return self._findHead(lines, self.detabbed_fn,
 741                               allowBlank = 1)
 742
 743
 744 def print_error(string):
 745     """Print an error string to stderr"""
 746     sys.stderr.write(string +'\n')
 747
 748
 749 def dequote(string) :
 750     """ Removes quotes from around a string """
 751     if ( ( string.startswith('"') and string.endswith('"'))
 752          or (string.startswith("'") and string.endswith("'")) ) :
 753         return string[1:-1]
 754     else :
 755         return string
 756
 757 """
 758 ======================================================================
 759 ========================== CORE MARKDOWN =============================
 760 ======================================================================
 761
 762 This stuff is ugly, so if you are thinking of extending the syntax,
 763 see first if you can do it via pre-processors, post-processors,
 764 inline patterns or a combination of the three.
 765 """
 766
 767 class CorePatterns :
 768     """This class is scheduled for removal as part of a refactoring
 769         effort."""
 770
 771     patterns = {
 772         'header':          r'(#*)([^#]*)(#*)', # # A title
 773         'reference-def' :  r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)',
 774                            # [Google]: http://www.google.com/
 775         'containsline':    r'([-]*)$|^([=]*)', # -----, =====, etc.
 776         'ol':              r'[ ]{0,3}[\d]*\.\s+(.*)', # 1. text
 777         'ul':              r'[ ]{0,3}[*+-]\s+(.*)', # "* text"
 778         'isline1':         r'(\**)', # ***
 779         'isline2':         r'(\-*)', # ---
 780         'isline3':         r'(\_*)', # ___
 781         'tabbed':          r'((\t)|(    ))(.*)', # an indented line
 782         'quoted' :         r'> ?(.*)', # a quoted block ("> ...")
 783     }
 784
 785     def __init__ (self) :
 786
 787         self.regExp = {}
 788         for key in self.patterns.keys() :
 789             self.regExp[key] = re.compile("^%s$" % self.patterns[key],
 790                                           re.DOTALL)
 791
 792         self.regExp['containsline'] = re.compile(r'^([-]*)$|^([=]*)$', re.M)
 793
 794 RE = CorePatterns()
 795
 796
 797 class Markdown:
 798     """ Markdown formatter class for creating an html document from
 799         Markdown text """
 800
 801
 802     def __init__(self, source=None):
 803         """Creates a new Markdown instance.
 804
 805            @param source: The text in Markdown format. """
 806
 807         if isinstance(source, unicode):
 808             source = source.encode('utf8')
 809         self.source = source
 810         self.blockGuru = BlockGuru()
 811         self.registeredExtensions = []
 812         self.stripTopLevelTags = 1
 813
 814         self.preprocessors = [ HEADER_PREPROCESSOR,
 815                                LINE_PREPROCESSOR,
 816                                HTML_BLOCK_PREPROCESSOR,
 817                                LINE_BREAKS_PREPROCESSOR,
 818                                # A footnote preprocessor will
 819                                # get inserted here
 820                                REFERENCE_PREPROCESSOR ]
 821
 822
 823         self.postprocessors = [] # a footnote postprocessor will get
 824                                  # inserted later
 825
 826         self.prePatterns = []
 827
 828
 829         self.inlinePatterns = [ DOUBLE_BACKTICK_PATTERN,
 830                                 BACKTICK_PATTERN,
 831                                 ESCAPE_PATTERN,
 832                                 IMAGE_LINK_PATTERN,
 833                                 IMAGE_REFERENCE_PATTERN,
 834                                 REFERENCE_PATTERN,
 835                                 LINK_ANGLED_PATTERN,
 836                                 LINK_PATTERN,
 837                                 AUTOLINK_PATTERN,
 838                                 AUTOMAIL_PATTERN,
 839                                 HTML_PATTERN,
 840                                 ENTITY_PATTERN,
 841                                 NOT_STRONG_PATTERN,
 842                                 STRONG_EM_PATTERN,
 843                                 STRONG_EM_PATTERN_2,
 844                                 STRONG_PATTERN,
 845                                 STRONG_PATTERN_2,
 846                                 EMPHASIS_PATTERN,
 847                                 EMPHASIS_PATTERN_2
 848                                 # The order of the handlers matters!!!
 849                                 ]
 850
 851         self.reset()
 852
 853     def registerExtension(self, extension) :
 854         self.registeredExtensions.append(extension)
 855
 856     def reset(self) :
 857         """Resets all state variables so that we can start
 858             with a new text."""
 859         self.references={}
 860         self.htmlStash = HtmlStash()
 861
 862         HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash
 863         REFERENCE_PREPROCESSOR.references = self.references
 864         HTML_PATTERN.stash = self.htmlStash
 865         ENTITY_PATTERN.stash = self.htmlStash
 866         REFERENCE_PATTERN.references = self.references
 867         IMAGE_REFERENCE_PATTERN.references = self.references
 868
 869         for extension in self.registeredExtensions :
 870             extension.reset()
 871
 872
 873     def _transform(self):
 874         """Transforms the Markdown text into a XHTML body document
 875
 876            @returns: A NanoDom Document """
 877
 878         # Setup the document
 879
 880         self.doc = Document()
 881         self.top_element = self.doc.createElement("span")
 882         self.top_element.appendChild(self.doc.createTextNode('\n'))
 883         self.top_element.setAttribute('class', 'markdown')
 884         self.doc.appendChild(self.top_element)
 885
 886         # Fixup the source text
 887         text = self.source.strip()
 888         text = text.replace("\r\n", "\n").replace("\r", "\n")
 889         text += "\n\n"
 890         text = text.expandtabs(TAB_LENGTH)
 891
 892         # Split into lines and run the preprocessors that will work with
 893         # self.lines
 894
 895         self.lines = text.split("\n")
 896
 897         # Run the pre-processors on the lines
 898         for prep in self.preprocessors :
 899             self.lines = prep.run(self.lines)
 900
 901         # Create a NanoDom tree from the lines and attach it to Document
 902
 903
 904         buffer = []
 905         for line in self.lines :
 906             if line.startswith("#") :
 907                 self._processSection(self.top_element, buffer)
 908                 buffer = [line]
 909             else :
 910                 buffer.append(line)
 911         self._processSection(self.top_element, buffer)
 912
 913         #self._processSection(self.top_element, self.lines)
 914
 915         # Not sure why I put this in but let's leave it for now.
 916         self.top_element.appendChild(self.doc.createTextNode('\n'))
 917
 918         # Run the post-processors
 919         for postprocessor in self.postprocessors :
 920             postprocessor.run(self.doc)
 921
 922         return self.doc
 923
 924
 925     def _processSection(self, parent_elem, lines,
 926                         inList = 0, looseList = 0) :
 927
 928         """Process a section of a source document, looking for high
 929            level structural elements like lists, block quotes, code
 930            segments, html blocks, etc.  Some those then get stripped
 931            of their high level markup (e.g. get unindented) and the
 932            lower-level markup is processed recursively.
 933
 934            @param parent_elem: A NanoDom element to which the content
 935                                will be added
 936            @param lines: a list of lines
 937            @param inList: a level
 938            @returns: None"""
 939
 940         if not lines :
 941             return
 942
 943         # Check if this section starts with a list, a blockquote or
 944         # a code block
 945
 946         processFn = { 'ul' :     self._processUList,
 947                       'ol' :     self._processOList,
 948                       'quoted' : self._processQuote,
 949                       'tabbed' : self._processCodeBlock }
 950
 951         for regexp in ['ul', 'ol', 'quoted', 'tabbed'] :
 952             m = RE.regExp[regexp].match(lines[0])
 953             if m :
 954                 processFn[regexp](parent_elem, lines, inList)
 955                 return
 956
 957         # We are NOT looking at one of the high-level structures like
 958         # lists or blockquotes.  So, it's just a regular paragraph
 959         # (though perhaps nested inside a list or something else).  If
 960         # we are NOT inside a list, we just need to look for a blank
 961         # line to find the end of the block.  If we ARE inside a
 962         # list, however, we need to consider that a sublist does not
 963         # need to be separated by a blank line.  Rather, the following
 964         # markup is legal:
 965         #
 966         # * The top level list item
 967         #
 968         #     Another paragraph of the list.  This is where we are now.
 969         #     * Underneath we might have a sublist.
 970         #
 971
 972         if inList :
 973
 974             start, theRest = self._linesUntil(lines, (lambda line:
 975                              RE.regExp['ul'].match(line)
 976                              or RE.regExp['ol'].match(line)
 977                                               or not line.strip()))
 978
 979             self._processSection(parent_elem, start,
 980                                  inList - 1, looseList = looseList)
 981             self._processSection(parent_elem, theRest,
 982                                  inList - 1, looseList = looseList)
 983
 984
 985         else : # Ok, so it's just a simple block
 986
 987             paragraph, theRest = self._linesUntil(lines, lambda line:
 988                                                  not line.strip())
 989
 990             if len(paragraph) and paragraph[0].startswith('#') :
 991                 m = RE.regExp['header'].match(paragraph[0])
 992                 if m :
 993                     level = len(m.group(1))
 994                     h = self.doc.createElement("h%d" % level)
 995                     parent_elem.appendChild(h)
 996                     for item in self._handleInlineWrapper2(m.group(2).strip()) :
 997                         h.appendChild(item)
 998                 else :
 999                     message(CRITICAL, "We've got a problem header!")
1000
1001             elif paragraph :
1002
1003                 list = self._handleInlineWrapper2("\n".join(paragraph))
1004
1005                 if ( parent_elem.nodeName == 'li'
1006                      and not (looseList or parent_elem.childNodes)):
1007
1008                     #and not parent_elem.childNodes) :
1009                     # If this is the first paragraph inside "li", don't
1010                     # put <p> around it - append the paragraph bits directly
1011                     # onto parent_elem
1012                     el = parent_elem
1013                 else :
1014                     # Otherwise make a "p" element
1015                     el = self.doc.createElement("p")
1016                     parent_elem.appendChild(el)
1017
1018                 for item in list :
1019                     el.appendChild(item)
1020
1021             if theRest :
1022                 theRest = theRest[1:]  # skip the first (blank) line
1023
1024             self._processSection(parent_elem, theRest, inList)
1025
1026
1027
1028     def _processUList(self, parent_elem, lines, inList) :
1029         self._processList(parent_elem, lines, inList,
1030                          listexpr='ul', tag = 'ul')
1031
1032     def _processOList(self, parent_elem, lines, inList) :
1033         self._processList(parent_elem, lines, inList,
1034                          listexpr='ol', tag = 'ol')
1035
1036
    def _processList(self, parent_elem, lines, inList, listexpr, tag) :
        """Given a list of document lines starting with a list item,
           finds the end of the list, breaks it up, and recursively
           processes each list item and the remainder of the text file.

           The lines are first collected into one sub-list of lines per
           list item; only afterwards are the "li" elements created and
           filled by _processSection().

           @param parent_elem: A dom element to which the content will be added
           @param lines: a list of lines
           @param inList: a level
           @param listexpr: 'ul' or 'ol' (not used inside this method)
           @param tag: name of the element to create for the list
           @returns: None"""

        ul = self.doc.createElement(tag)  # ul might actually be '<ol>'
        parent_elem.appendChild(ul)

        # Becomes true as soon as a blank line is found inside the list;
        # it is passed on to _processSection() for every item
        looseList = 0

        # Make a list of list items: items[n] holds the lines of the
        # n-th item, "item" is the index of the item being filled
        items = []
        item = -1

        i = 0  # a counter to keep track of where we are: the number
               # of lines consumed so far, so that lines[i:] is what
               # is left over once the list has ended

        for line in lines :

            loose = 0
            if not line.strip() :
                # If we see a blank line, this _might_ be the end of the list
                # (the blank line itself counts as consumed either way)
                i += 1
                loose = 1

                # Find the next non-blank line
                for j in range(i, len(lines)) :
                    if lines[j].strip() :
                        next = lines[j]
                        break
                else :
                    # There is no more text => end of the list
                    break

                # Check if the next non-blank line is still a part of the list
                if ( RE.regExp['ul'].match(next) or
                     RE.regExp['ol'].match(next) or
                     RE.regExp['tabbed'].match(next) ):
                    # get rid of any white space in the line
                    items[item].append(line.strip())
                    looseList = loose or looseList
                    continue
                else :
                    break # found end of the list

            # Now we need to detect list items (at the current level)
            # while also detabing child elements if necessary

            for expr in ['ul', 'ol', 'tabbed']:

                m = RE.regExp[expr].match(line)
                if m :
                    if expr in ['ul', 'ol'] :  # We are looking at a new item
                        # group(1) becomes the first line of the new
                        # item; a marker with an empty group(1) does
                        # not start an item
                        if m.group(1) :
                            items.append([m.group(1)])
                            item += 1
                    elif expr == 'tabbed' :  # This line needs to be detabbed
                        items[item].append(m.group(4)) #after the 'tab'

                    i += 1
                    break
            else :
                # None of the expressions matched
                items[item].append(line)  # Just regular continuation
                i += 1 # added on 2006.02.25
        else :
            # The loop ran out of lines without a break; i ends up past
            # the end of the list, so lines[i:] below is empty
            i += 1

        # Add the dom elements
        for item in items :
            li = self.doc.createElement("li")
            ul.appendChild(li)

            self._processSection(li, item, inList + 1, looseList = looseList)

        # Process the remaining part of the section

        self._processSection(parent_elem, lines[i:], inList)
1118
1119
1120     def _linesUntil(self, lines, condition) :
1121         """ A utility function to break a list of lines upon the
1122             first line that satisfied a condition.  The condition
1123             argument should be a predicate function.
1124             """
1125
1126         i = -1
1127         for line in lines :
1128             i += 1
1129             if condition(line) : break
1130         else :
1131             i += 1
1132         return lines[:i], lines[i:]
1133
1134     def _processQuote(self, parent_elem, lines, inList) :
1135         """Given a list of document lines starting with a quote finds
1136            the end of the quote, unindents it and recursively
1137            processes the body of the quote and the remainder of the
1138            text file.
1139
1140            @param parent_elem: DOM element to which the content will be added
1141            @param lines: a list of lines
1142            @param inList: a level
1143            @returns: None """
1144
1145         dequoted = []
1146         i = 0
1147         for line in lines :
1148             m = RE.regExp['quoted'].match(line)
1149             if m :
1150                 dequoted.append(m.group(1))
1151                 i += 1
1152             else :
1153                 break
1154         else :
1155             i += 1
1156
1157         blockquote = self.doc.createElement('blockquote')
1158         parent_elem.appendChild(blockquote)
1159
1160         self._processSection(blockquote, dequoted, inList)
1161         self._processSection(parent_elem, lines[i:], inList)
1162
1163
1164
1165
1166     def _processCodeBlock(self, parent_elem, lines, inList) :
1167         """Given a list of document lines starting with a code block
1168            finds the end of the block, puts it into the dom verbatim
1169            wrapped in ("<pre><code>") and recursively processes the
1170            the remainder of the text file.
1171
1172            @param parent_elem: DOM element to which the content will be added
1173            @param lines: a list of lines
1174            @param inList: a level
1175            @returns: None"""
1176
1177         detabbed, theRest = self.blockGuru.detectTabbed(lines)
1178
1179         pre = self.doc.createElement('pre')
1180         code = self.doc.createElement('code')
1181         parent_elem.appendChild(pre)
1182         pre.appendChild(code)
1183         text = "\n".join(detabbed).rstrip()+"\n"
1184         text = text.replace("&", "&amp;")
1185         code.appendChild(self.doc.createTextNode(text))
1186         self._processSection(parent_elem, theRest, inList)
1187
1188
1189     def _handleInlineWrapper2 (self, line) :
1190
1191
1192         parts = [line]
1193
1194         #if not(line):
1195         #    return [self.doc.createTextNode(' ')]
1196
1197         for pattern in self.inlinePatterns :
1198
1199             #print
1200             #print self.inlinePatterns.index(pattern)
1201
1202             i = 0
1203
1204             #print parts
1205             while i < len(parts) :
1206
1207                 x = parts[i]
1208                 #print i
1209                 if isinstance(x, (str, unicode)) :
1210                     result = self._applyPattern(x, pattern)
1211                     #print result
1212                     #print result
1213                     #print parts, i
1214                     if result :
1215                         i -= 1
1216                         parts.remove(x)
1217                         for y in result :
1218                             parts.insert(i+1,y)
1219
1220                 i += 1
1221
1222         for i in range(len(parts)) :
1223             x = parts[i]
1224             if isinstance(x, (str, unicode)) :
1225                 parts[i] = self.doc.createTextNode(x)
1226
1227         return parts
1228
1229
1230
1231     def _handleInlineWrapper (self, line) :
1232
1233         # A wrapper around _handleInline to avoid recursion
1234
1235         parts = [line]
1236
1237         i = 0
1238
1239         while i < len(parts) :
1240             x = parts[i]
1241             if isinstance(x, (str, unicode)) :
1242                 parts.remove(x)
1243                 result = self._handleInline(x)
1244                 for y in result :
1245                     parts.insert(i,y)
1246             else :
1247                 i += 1
1248
1249         return parts
1250
1251     def _handleInline(self,  line):
1252         """Transform a Markdown line with inline elements to an XHTML
1253         fragment.
1254
1255         This function uses auxiliary objects called inline patterns.
1256         See notes on inline patterns above.
1257
1258         @param item: A block of Markdown text
1259         @return: A list of NanoDom nodes """
1260
1261         if not(line):
1262             return [self.doc.createTextNode(' ')]
1263
1264         for pattern in self.inlinePatterns :
1265             list = self._applyPattern( line, pattern)
1266             if list: return list
1267
1268         return [self.doc.createTextNode(line)]
1269
1270     def _applyPattern(self, line, pattern) :
1271         """ Given a pattern name, this function checks if the line
1272         fits the pattern, creates the necessary elements, and returns
1273         back a list consisting of NanoDom elements and/or strings.
1274
1275         @param line: the text to be processed
1276         @param pattern: the pattern to be checked
1277
1278         @returns: the appropriate newly created NanoDom element if the
1279                   pattern matches, None otherwise.
1280         """
1281
1282         # match the line to pattern's pre-compiled reg exp.
1283         # if no match, move on.
1284
1285         m = pattern.getCompiledRegExp().match(line)
1286         if not m :
1287             return None
1288
1289         # if we got a match let the pattern make us a NanoDom node
1290         # if it doesn't, move on
1291         node = pattern.handleMatch(m, self.doc)
1292
1293         if node :
1294             # Those are in the reverse order!
1295             return ( m.groups()[-1], # the string to the left
1296                      node,           # the new node
1297                      m.group(1))     # the string to the right of the match
1298
1299         else :
1300             return None
1301
1302     def __str__(self):
1303         """Return the document in XHTML format.
1304
1305         @returns: A serialized XHTML body."""
1306         #try :
1307         doc = self._transform()
1308         xml = doc.toxml()
1309         #finally:
1310         #    doc.unlink()
1311
1312         # Let's stick in all the raw html pieces
1313
1314         for i in range(self.htmlStash.html_counter) :
1315             xml = xml.replace("<p>%s\n</p>" % (HTML_PLACEHOLDER % i),
1316                               self.htmlStash.rawHtmlBlocks[i] + "\n")
1317             xml = xml.replace(HTML_PLACEHOLDER % i,
1318                               self.htmlStash.rawHtmlBlocks[i])
1319
1320         xml = xml.replace(FN_BACKLINK_TEXT, "&#8617;")
1321
1322         # And return everything but the top level tag
1323
1324         if self.stripTopLevelTags :
1325             xml = xml.strip()[23:-7]
1326
1327         if isinstance(xml, unicode) :
1328             xml = xml.encode("utf8")
1329
1330         return xml
1331
1332
    # Alias: lets callers write md.toString() as well as str(md)
    toString = __str__
1334
1335
1336 """
1337 ========================= FOOTNOTES =================================
1338
1339 This section adds footnote handling to markdown.  It can be used as
1340 an example for extending python-markdown with relatively complex
1341 functionality.  While in this case the extension is included inside
1342 the module itself, it could just as easily be added from outside the
1343 module.  Note that all markdown classes above are ignorant about
1344 footnotes.  All footnote functionality is provided separately and
1345 then added to the markdown instance at the run time.
1346
1347 Footnote functionality is attached by calling extendMarkdown()
1348 method of FootnoteExtension.  The method also registers the
1349 extension to allow its state to be reset by a call to reset()
1350 method.
1351 """
1352
class FootnoteExtension :
    """Adds footnotes to a Markdown instance.

       The extension keeps the footnote texts (self.footnotes) and the
       numbers assigned to the footnotes used in the text
       (self.used_footnotes).  extendMarkdown() hooks a preprocessor,
       an inline pattern and a postprocessor into a Markdown instance;
       those fill in and read this state."""

    # A footnote definition line such as "[^1]: text", indented by at
    # most three spaces: group 2 is the id, group 3 the text
    DEF_RE = re.compile(r'(\ ?\ ?\ ?)\[\^([^\]]*)\]:\s*(.*)')
    # A footnote reference in the text
    SHORT_USE_RE = re.compile(r'\[\^([^\]]*)\]', re.M) # [^a]

    # If a text node of the document contains this marker, the
    # footnotes are put in its place instead of at the end
    FN_PLACE_MARKER = "///Footnotes Go Here///"

    def __init__ (self) :
        self.reset()

    def extendMarkdown(self, md) :
        """Attach footnote handling to a Markdown instance.

           @param md: the Markdown instance to extend
           @returns: None"""

        self.md = md

        # Stateless extensions do not need to be registered
        md.registerExtension(self)

        # Insert a preprocessor before ReferencePreprocessor
        index = md.preprocessors.index(REFERENCE_PREPROCESSOR)
        preprocessor = FootnotePreprocessor(self)
        preprocessor.md = md
        md.preprocessors.insert(index, preprocessor)

        # Insert an inline pattern before ImageReferencePattern
        FOOTNOTE_RE = r'\[\^([^\]]*)\]' # blah blah [^1] blah
        index = md.inlinePatterns.index(IMAGE_REFERENCE_PATTERN)
        md.inlinePatterns.insert(index, FootnotePattern(FOOTNOTE_RE, self))

        # Insert a post-processor that would actually add the footnote div
        postprocessor = FootnotePostprocessor(self)
        postprocessor.extension = self

        md.postprocessors.append(postprocessor)


    def reset(self) :
        """Forget all footnotes.  May be called by Markdown if a state
           reset is desired."""

        # A random suffix is added to the ids generated for footnotes
        # and their references
        self.footnote_suffix = "-" + str(int(random.random()*1000000000))
        self.used_footnotes={}   # footnote id -> number
        self.footnotes = {}      # footnote id -> footnote text

    def findFootnotesPlaceholder(self, doc) :
        """Look for the text node holding FN_PLACE_MARKER.

           @param doc: the document to search
           @returns: the first matching node, or None if there is none"""

        def findFootnotePlaceholderFn(node=None, indent=0):
            if node.type == 'text':
                if node.value.find(self.FN_PLACE_MARKER) > -1 :
                    return True

        fn_div_list = doc.find(findFootnotePlaceholderFn)
        if fn_div_list :
            return fn_div_list[0]


    def setFootnote(self, id, text) :
        """Store the text of the footnote with the given id."""
        self.footnotes[id] = text

    def makeFootnoteId(self, num) :
        """Return the element id of footnote number num."""
        return 'fn%d%s' % (num, self.footnote_suffix)

    def makeFootnoteRefId(self, num) :
        """Return the element id of the reference to footnote number num."""
        return 'fnr%d%s' % (num, self.footnote_suffix)

    def makeFootnotesDiv (self, doc) :
        """Creates the div with class='footnote' and populates it with
           the text of the footnotes, ordered by footnote number.

           Footnotes that are defined but never referenced have no
           number and are left out.

           @param doc: the document used to create the elements
           @returns: the footnote div as a dom element, or None if
                     there is no footnote to show """

        if not self.footnotes :
            return None

        # Pair every referenced footnote with its number.  An id that
        # was never recorded as used has no number to be listed under.
        numbered = [(self.used_footnotes[id], id)
                    for id in self.footnotes.keys()
                    if id in self.used_footnotes]
        if not numbered :
            return None
        numbered.sort()

        div = doc.createElement("div")
        div.setAttribute('class', 'footnote')
        hr = doc.createElement("hr")
        div.appendChild(hr)
        ol = doc.createElement("ol")
        div.appendChild(ol)

        for i, id in numbered :
            li = doc.createElement('li')
            li.setAttribute('id', self.makeFootnoteId(i))

            # The footnote text is itself Markdown
            self.md._processSection(li, self.footnotes[id].split("\n"))

            backlink = doc.createElement('a')
            backlink.setAttribute('href', '#' + self.makeFootnoteRefId(i))
            backlink.setAttribute('class', 'footnoteBackLink')
            backlink.setAttribute('title',
                                  'Jump back to footnote %d in the text' % i)
            backlink.appendChild(doc.createTextNode(FN_BACKLINK_TEXT))

            # The backlink goes at the end of the last child element
            # of the item, or at the end of the item itself if its last
            # child is a text node.  An item without children gets no
            # backlink.
            if li.childNodes :
                node = li.childNodes[-1]
                if node.type == "text" :
                    node = li
                node.appendChild(backlink)

            ol.appendChild(li)

        return div
1459
1460
class FootnotePreprocessor :
    """Preprocessor that takes the footnote definitions out of the
       text and numbers the footnote references left in it."""

    def __init__ (self, footnotes) :
        """@param footnotes: the FootnoteExtension whose state is filled in"""
        self.footnotes = footnotes

    def run(self, lines) :
        """Remove the footnote definitions from the text and record,
           in order of first appearance, which footnotes it refers to.

           @param lines: a list of lines of text
           @returns: the list of lines without the footnote definitions"""

        self.blockGuru = BlockGuru()
        lines = self._handleFootnoteDefinitions (lines)

        # Walk over all footnote marks in the text so that we know in
        # what order they are supposed to appear.  Nothing is
        # substituted - the text itself is left as it is.

        text = "\n".join(lines)
        for match in self.footnotes.SHORT_USE_RE.finditer(text) :
            self.recordFootnoteUse(match)

        return text.split("\n")


    def recordFootnoteUse(self, match) :
        """Give the footnote referred to by a match the next free
           number, unless it already has one.

           A footnote keeps the number of its first reference; further
           references to it must not renumber it, as that would lead
           to numbers being skipped or handed out twice.

           @param match: a match of SHORT_USE_RE
           @returns: None"""

        id = match.group(1).strip()
        used = self.footnotes.used_footnotes
        if id not in used :
            used[id] = len(used) + 1


    def _handleFootnoteDefinitions(self, lines) :
        """Recursively finds all footnote definitions in the lines
           and stores them with the extension.

           A definition consists of its first line plus the indented
           block detected right after it by detectTabbed().

            @param lines: a list of lines of text
            @returns: a list of lines in which every footnote
                      definition has been replaced by a single empty
                      line """

        i, id, footnote = self._findFootnoteDefinition(lines)

        if not id :
            return lines

        plain = lines[:i]

        detabbed, theRest = self.blockGuru.detectTabbed(lines[i+1:])

        self.footnotes.setFootnote(id,
                                   footnote + "\n"
                                   + "\n".join(detabbed))

        more_plain = self._handleFootnoteDefinitions(theRest)
        return plain + [""] + more_plain

    def _findFootnoteDefinition(self, lines) :
        """Finds the first line of a footnote definition.

            @param lines: a list of lines of text
            @returns: a tuple (index, id, text) for the first line
                      that matches DEF_RE, or (len(lines), None, None)
                      if no line does """

        for index, line in enumerate(lines) :
            m = self.footnotes.DEF_RE.match(line)
            if m :
                return index, m.group(2), m.group(3)
        return len(lines), None, None
1528
1529
class FootnotePattern (BasePattern) :
    """Inline pattern that turns a footnote reference such as [^1]
       into a superscript link to the footnote."""

    def __init__ (self, pattern, footnotes) :
        """@param pattern: the regular expression for a footnote reference
           @param footnotes: the FootnoteExtension holding the
                             footnote numbers"""

        BasePattern.__init__(self, pattern)
        self.footnotes = footnotes

    def handleMatch(self, m, doc) :
        """Build <sup id="..."><a href="#...">number</a></sup> for a
           footnote reference.

           @param m: the match; group 2 holds the footnote id
                     (presumably group 1 is the text in front of the
                     reference, added by BasePattern - confirm)
           @param doc: the document used to create the elements
           @returns: the "sup" element"""

        sup = doc.createElement('sup')
        a = doc.createElement('a')
        sup.appendChild(a)
        # The numbers are recorded under the stripped id (see
        # FootnotePreprocessor.recordFootnoteUse), so the id has to be
        # stripped here as well or "[^ 1 ]" would not be found
        id = m.group(2).strip()
        num = self.footnotes.used_footnotes[id]
        sup.setAttribute('id', self.footnotes.makeFootnoteRefId(num))
        a.setAttribute('href', '#' + self.footnotes.makeFootnoteId(num))
        a.appendChild(doc.createTextNode(str(num)))
        return sup
1547
class FootnotePostprocessor :
    """Postprocessor that adds the footnotes div to the document."""

    def __init__ (self, footnotes) :
        """@param footnotes: the FootnoteExtension providing the footnotes"""
        self.footnotes = footnotes
        # run() looks for the placeholder through self.extension.
        # Callers may still assign that attribute themselves, but it
        # must not be left undefined when they do not.
        self.extension = footnotes

    def run(self, doc) :
        """Put the footnotes div in place of the footnotes placeholder
           or, if the document has none, at the end of the document
           element.  Nothing is done if there is no footnotes div.

           @param doc: the document to modify
           @returns: None"""

        footnotesDiv = self.footnotes.makeFootnotesDiv(doc)
        if not footnotesDiv :
            return

        fnPlaceholder = self.extension.findFootnotesPlaceholder(doc)
        if fnPlaceholder :
            fnPlaceholder.parent.replaceChild(fnPlaceholder, footnotesDiv)
        else :
            doc.documentElement.appendChild(footnotesDiv)
1561
1562 # ====================================================================
1563
def markdown(text) :
    """Convert Markdown text to XHTML without footnote support.

       @param text: the Markdown source
       @returns: the result of Markdown(text).toString()"""
    message(VERBOSE, "in markdown.py, received text:\n%s" % text)
    converter = Markdown(text)
    return converter.toString()
1567
def markdownWithFootnotes(text):
    """Convert Markdown text to XHTML with footnote support.

       A Markdown instance is created without a text, extended with a
       FootnoteExtension and only then given the source.

       @param text: the Markdown source
       @returns: the serialized document, str(md)"""
    message(VERBOSE, "Running markdown with footnotes, "
            + "received text:\n%s" % text)
    md = Markdown()
    FootnoteExtension().extendMarkdown(md)
    md.source = text
    return str(md)
1577
def test_markdown(args):
    """Run cmd_line() with a series of canned command lines and print
       a heading before each of them.

       In each simulated command line, element 0 is the module name.
       The sample input file markdown-test.txt is written to the
       current directory for TEST 4 and reused by the later tests;
       TEST 7 names markdown-test.html as its output file.

       @param args: not used
       @returns: None"""

    before_sample = [
        ("\nTEST 1: no arguments on command line",
         []),
        ("\nTEST 2a: 1 argument on command line: a good option",
         ["-footnotes"]),
        ("\nTEST 2b: 1 argument on command line: a bad option",
         ["-foodnotes"]),
        ("\nTEST 3: 1 argument on command line: non-existent input file",
         ["junk.txt"]),
    ]
    for title, extra in before_sample :
        print(title)
        cmd_line(["markdown.py"] + extra)

    print("\nTEST 4: 1 argument on command line: existing input file")
    lines = """
Markdown text with[^1]:

2. **bold text**,
3. *italic text*.

Then more:

    beginning of code block;
    another line of code block.

    a second paragraph of code block.

more text to end our file.

[^1]: "italic" means emphasis.
"""
    fid = "markdown-test.txt"
    f1 = open(fid, 'w+')
    f1.write(lines)
    f1.close()
    cmd_line(["markdown.py",fid])

    fidout = "markdown-test.html"
    with_sample = [
        ("\nTEST 5: 2 arguments on command line: nofootnotes and input file",
         ["-nofootnotes", fid]),
        ("\nTEST 6: 2 arguments on command line: footnotes and input file",
         ["-footnotes", fid]),
        ("\nTEST 7: 3 arguments on command line: nofootnotes,inputfile, outputfile",
         ["-nofootnotes", fid, fidout]),
    ]
    for title, extra in with_sample :
        print(title)
        cmd_line(["markdown.py"] + extra)
1619
1620
def get_vars(args):
    """Sort the command-line arguments into option, input file and
       output file.

       Element 0 of args is the program name.  Of the arguments after
       it at most three are looked at:

         - three or more: option, input file, output file
         - two: option and input file if the first one starts with
           "-", otherwise input file and output file
         - one: an option if it starts with "-", otherwise the input
           file

       Nothing is validated here.

       @param args: the command-line arguments, e.g. sys.argv
       @returns: a tuple (option, inFile, outFile) in which every
                 missing part is None"""

    # The one-element tuple keeps a tuple of arguments from being
    # taken for the list of format values
    message(VERBOSE, "in get_vars(), args: %s" % (args,))

    option = inFile = outFile = None
    usable = args[1:4]

    if len(usable) == 3:
        option, inFile, outFile = usable
    elif len(usable) == 2:
        first, second = usable
        # startswith() also copes with an empty argument, for which
        # first[0] would raise an IndexError
        if first.startswith('-'):
            #then we have an option and inFile
            option, inFile = first, second
        else:
            #we have no option, so we must have inFile and outFile
            inFile, outFile = first, second
    elif len(usable) == 1:
        #we have only one usable arg: might be an option or a file
        first = usable[0]

        message(VERBOSE, "our single arg is: %s" % str(first))

        if first.startswith('-'):
            option = first
        else:
            inFile = first

    message(VERBOSE,
            "prior to validation, option: %s, inFile: %s, outFile: %s" %
            (str(option), str(inFile), str(outFile),))

    return option, inFile, outFile
1658
1659
# The help text shown for -help and whenever the option or the input
# file is rejected
USAGE = """
\nUsing markdown.py:

    python markdown.py [option] input_file_with_markdown.txt [output_file.html]

Options:

    -footnotes or -fn   : generate markdown with footnotes
    -test or -t         : run a self-test
    -help or -h         : print this message

"""

# The option names accepted by validate_option(), without the leading
# character.  'nofootnotes' and 'f' are accepted as well, although
# USAGE does not list them; cmd_line() has no special case for either,
# so they end up with the plain markdown() conversion.
VALID_OPTIONS = ['footnotes','nofootnotes', 'fn', 'test', 't', 'f',
                 'help', 'h']

# Short option names and the long names validate_option() returns
# for them
EXPANDED_OPTIONS =  { "fn" : "footnotes",
                      "t"  : "test",
                      "h"  : "help" }
1679
1680
def validate_option(option) :

    """ Check an option taken from the command line.

        The first character of the option is dropped; what is left
        has to be one of VALID_OPTIONS.  Short names are replaced by
        their long form from EXPANDED_OPTIONS.  For an option that is
        not understood a message and the usage text are printed.

        @param option: the option as typed, e.g. "-fn", or None
        @return: valid option string or None
    """

    if option is None :
        return None

    name = option[1:]
    if len(option) > 1 and name in VALID_OPTIONS :
        return EXPANDED_OPTIONS.get(name, name)

    message(CRITICAL,
            "\nSorry, I don't understand option %s" % option)
    message(CRITICAL, USAGE)
    return None
1702
1703
def validate_input_file(inFile) :
    """ Check that an input file was given and that it can be read.
        A message is printed if it was not given or cannot be read.

        @param inFile: the path of the input file, or None
        @return: valid input file path or None
    """

    if not inFile :
        message(CRITICAL,
                "\nI need an input filename.\n")
        message(CRITICAL, USAGE)
        return None

    if not os.access(inFile, os.R_OK) :
        message(CRITICAL, "Sorry, I can't find input file %s" % str(inFile))
        return None

    return inFile
1722
1723
1724
1725
def cmd_line(args):
    """Run markdown.py for one command line.

       The arguments are sorted out by get_vars() and validated.  The
       "help" option prints the usage text and the "test" option runs
       test_markdown().  Otherwise the input file is converted - with
       footnotes if the option is "footnotes" - and the result is
       printed or, if an output file was named, written to that file.
       Problems with the arguments are reported through message() and
       end the run.

       @param args: the command-line arguments; element 0 is the
                    program name
       @returns: None"""

    # The one-element tuple keeps a tuple of arguments from being
    # taken for the list of format values
    message(VERBOSE, "in cmd_line with args: %s" % (args,))

    option, inFile, outFile = get_vars(args)

    if option :
        option = validate_option(option)
        if not option : return

    if option == "help" :
        message(CRITICAL, USAGE)
        return
    elif option == "test" :
        test_markdown(None)
        return

    inFile = validate_input_file(inFile)
    if not inFile :
        return

    # Close the input file as soon as it has been read, even if
    # reading it fails
    source = open(inFile)
    try :
        text = source.read()
    finally :
        source.close()

    message(VERBOSE, "Validated command line parameters:" +
             "\n\toption: %s, \n\tinFile: %s, \n\toutFile: %s" % (
             str(option), str(inFile), str(outFile),))

    if option == "footnotes" :
        md_function = markdownWithFootnotes
    else :
        md_function = markdown

    if outFile is None:
        print(md_function(text))
    else:
        output = md_function(text)
        target = open(outFile, "w+")
        try :
            target.write(output)
        finally :
            target.close()

        if os.access(outFile, os.F_OK):
            message(INFO, "Successfully wrote %s" % outFile)
        else:
            message(INFO, "Failed to write %s" % outFile)
1770
1771
if __name__ == '__main__':
    # Script entry point: run Markdown from the command line.
    # For diagnostic output, lower MESSAGE_THRESHOLD near the top of
    # this file (e.g. to VERBOSE).

    # Set testing to 1 to exercise the command-line handling through
    # test_markdown() instead of doing a real run.
    testing = 0
    args = sys.argv

    if not testing:
        import time
        # Time stamps taken around the run; the commented-out lines
        # below are the hooks for repeating, profiling or reporting it.
        t0 = time.time()
        #for x in range(10) :
        cmd_line(args)
        #import profile
        #profile.run('cmd_line(args)', 'profile')
        t1 = time.time()
        #print "Time: %f - %f = %f" % (t1, t0, t1-t0)
    else:
        test_markdown(args)
1790
1791 """
1792 CHANGELOG
1793 =========
1794
May 15, 2006: Fixed bugs with lists, recursion on block-level elements,
run-in headers, spaces before headers, and unicode input (thanks to
Aaron Swartz). Sourceforge tracker #s: 1489313, 1489312, 1489311,
1488370, 1485178, 1485176. (v. 1.5)
1799
1800 Mar. 24, 2006: Switched to a not-so-recursive algorithm with
1801 _handleInline.  (Version 1.4)
1802
1803 Mar. 15, 2006: Replaced some instance variables with class variables
1804 (a patch from Stelios Xanthakis).  Chris Clark's new regexps that do
1805 not trigger midword underlining.
1806
1807 Feb. 28, 2006: Clean-up and command-line handling by Stewart
1808 Midwinter. (Version 1.3)
1809
1810 Feb. 24, 2006: Fixed a bug with the last line of the list appearing
1811 again as a separate paragraph.  Incorporated Chris Clark's "mailto"
1812 patch.  Added support for <br /> at the end of lines ending in two or
1813 more spaces.  Fixed a crashing bug when using ImageReferencePattern.
1814 Added several utility methods to Nanodom.  (Version 1.2)
1815
1816 Jan. 31, 2006: Added "hr" and "hr/" to BLOCK_LEVEL_ELEMENTS and
1817 changed <hr/> to <hr />.  (Thanks to Sergej Chodarev.)
1818
1819 Nov. 26, 2005: Fixed a bug with certain tabbed lines inside lists
1820 getting wrapped in <pre><code>.  (v. 1.1)
1821
1822 Nov. 19, 2005: Made "<!...", "<?...", etc. behave like block-level
1823 HTML tags.
1824
1825 Nov. 14, 2005: Added entity code and email autolink fix by Tiago
1826 Cogumbreiro.  Fixed some small issues with backticks to get 100%
1827 compliance with John's test suite.  (v. 1.0)
1828
1829 Nov. 7, 2005: Added an unlink method for documents to aid with memory
1830 collection (per Doug Sauder's suggestion).
1831
1832 Oct. 29, 2005: Restricted a set of html tags that get treated as
1833 block-level elements.
1834
1835 Sept. 18, 2005: Refactored the whole script to make it easier to
1836 customize it and made footnote functionality into an extension.
1837 (v. 0.9)
1838
1839 Sept. 5, 2005: Fixed a bug with multi-paragraph footnotes.  Added
1840 attribute support.
1841
1842 Sept. 1, 2005: Changed the way headers are handled to allow inline
1843 syntax in headers (e.g. links) and got the lists to use p-tags
1844 correctly (v. 0.8)
1845
1846 Aug. 29, 2005: Added flexible tabs, fixed a few small issues, added
1847 basic support for footnotes.  Got rid of xml.dom.minidom and added
1848 pretty-printing. (v. 0.7)
1849
1850 Aug. 13, 2005: Fixed a number of small bugs in order to conform to the
1851 test suite.  (v. 0.6)
1852
1853 Aug. 11, 2005: Added support for inline html and entities, inline
1854 images, autolinks, underscore emphasis. Cleaned up and refactored the
1855 code, added some more comments.
1856
1857 Feb. 19, 2005: Rewrote the handling of high-level elements to allow
1858 multi-line list items and all sorts of nesting.
1859
1860 Feb. 3, 2005: Reference-style links, single-line lists, backticks,
1861 escape, emphasis in the beginning of the paragraph.
1862
1863 Nov. 2004: Added links, blockquotes, html blocks to Manfred
1864 Stienstra's code
1865
1866 Apr. 2004: Manfred's version at http://www.dwerg.net/projects/markdown/
1867
1868 """
1869
1870
1871
1872
1873
1874