third_party/markdown/treeprocessors.py

   1 # markdown is released under the BSD license
   2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
   3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
   4 # Copyright 2004 Manfred Stienstra (the original version)
   5 #
   6 # All rights reserved.
   7 #
   8 # Redistribution and use in source and binary forms, with or without
   9 # modification, are permitted provided that the following conditions are met:
  10 #
  11 # *   Redistributions of source code must retain the above copyright
  12 #     notice, this list of conditions and the following disclaimer.
  13 # *   Redistributions in binary form must reproduce the above copyright
  14 #     notice, this list of conditions and the following disclaimer in the
  15 #     documentation and/or other materials provided with the distribution.
  16 # *   Neither the name of the <organization> nor the
  17 #     names of its contributors may be used to endorse or promote products
  18 #     derived from this software without specific prior written permission.
  19 #
  20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
  21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
  24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30 # POSSIBILITY OF SUCH DAMAGE.
  31
  32
  33 from __future__ import unicode_literals
  34 from __future__ import absolute_import
  35 from . import util
  36 from . import odict
  37 from . import inlinepatterns
  38
  39
  40 def build_treeprocessors(md_instance, **kwargs):
  41     """ Build the default treeprocessors for Markdown. """
  42     treeprocessors = odict.OrderedDict()
  43     treeprocessors["inline"] = InlineProcessor(md_instance)
  44     treeprocessors["prettify"] = PrettifyTreeprocessor(md_instance)
  45     return treeprocessors
  46
  47
  48 def isString(s):
  49     """ Check if it's string """
  50     if not isinstance(s, util.AtomicString):
  51         return isinstance(s, util.string_type)
  52     return False
  53
  54
  55 class Treeprocessor(util.Processor):
  56     """
  57     Treeprocessors are run on the ElementTree object before serialization.
  58
  59     Each Treeprocessor implements a "run" method that takes a pointer to an
  60     ElementTree, modifies it as necessary and returns an ElementTree
  61     object.
  62
  63     Treeprocessors must extend markdown.Treeprocessor.
  64
  65     """
  66     def run(self, root):
  67         """
  68         Subclasses of Treeprocessor should implement a `run` method, which
  69         takes a root ElementTree. This method can return another ElementTree
  70         object, and the existing root ElementTree will be replaced, or it can
  71         modify the current tree and return None.
  72         """
  73         pass
  74
  75
  76 class InlineProcessor(Treeprocessor):
  77     """
  78     A Treeprocessor that traverses a tree, applying inline patterns.
  79     """
  80
  81     def __init__(self, md):
  82         self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
  83         self.__placeholder_suffix = util.ETX
  84         self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
  85                                       + len(self.__placeholder_suffix)
  86         self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
  87         self.markdown = md
  88
  89     def __makePlaceholder(self, type):
  90         """ Generate a placeholder """
  91         id = "%04d" % len(self.stashed_nodes)
  92         hash = util.INLINE_PLACEHOLDER % id
  93         return hash, id
  94
  95     def __findPlaceholder(self, data, index):
  96         """
  97         Extract id from data string, start from index
  98
  99         Keyword arguments:
 100
 101         * data: string
 102         * index: index, from which we start search
 103
 104         Returns: placeholder id and string index, after the found placeholder.
 105
 106         """
 107         m = self.__placeholder_re.search(data, index)
 108         if m:
 109             return m.group(1), m.end()
 110         else:
 111             return None, index + 1
 112
 113     def __stashNode(self, node, type):
 114         """ Add node to stash """
 115         placeholder, id = self.__makePlaceholder(type)
 116         self.stashed_nodes[id] = node
 117         return placeholder
 118
 119     def __handleInline(self, data, patternIndex=0):
 120         """
 121         Process string with inline patterns and replace it
 122         with placeholders
 123
 124         Keyword arguments:
 125
 126         * data: A line of Markdown text
 127         * patternIndex: The index of the inlinePattern to start with
 128
 129         Returns: String with placeholders.
 130
 131         """
 132         if not isinstance(data, util.AtomicString):
 133             startIndex = 0
 134             while patternIndex < len(self.markdown.inlinePatterns):
 135                 data, matched, startIndex = self.__applyPattern(
 136                     self.markdown.inlinePatterns.value_for_index(patternIndex),
 137                     data, patternIndex, startIndex)
 138                 if not matched:
 139                     patternIndex += 1
 140         return data
 141
 142     def __processElementText(self, node, subnode, isText=True):
 143         """
 144         Process placeholders in Element.text or Element.tail
 145         of Elements popped from self.stashed_nodes.
 146
 147         Keywords arguments:
 148
 149         * node: parent node
 150         * subnode: processing node
 151         * isText: bool variable, True - it's text, False - it's tail
 152
 153         Returns: None
 154
 155         """
 156         if isText:
 157             text = subnode.text
 158             subnode.text = None
 159         else:
 160             text = subnode.tail
 161             subnode.tail = None
 162
 163         childResult = self.__processPlaceholders(text, subnode)
 164
 165         if not isText and node is not subnode:
 166             pos = node.getchildren().index(subnode)
 167             node.remove(subnode)
 168         else:
 169             pos = 0
 170
 171         childResult.reverse()
 172         for newChild in childResult:
 173             node.insert(pos, newChild)
 174
 175     def __processPlaceholders(self, data, parent):
 176         """
 177         Process string with placeholders and generate ElementTree tree.
 178
 179         Keyword arguments:
 180
 181         * data: string with placeholders instead of ElementTree elements.
 182         * parent: Element, which contains processing inline data
 183
 184         Returns: list with ElementTree elements with applied inline patterns.
 185
 186         """
 187         def linkText(text):
 188             if text:
 189                 if result:
 190                     if result[-1].tail:
 191                         result[-1].tail += text
 192                     else:
 193                         result[-1].tail = text
 194                 else:
 195                     if parent.text:
 196                         parent.text += text
 197                     else:
 198                         parent.text = text
 199         result = []
 200         strartIndex = 0
 201         while data:
 202             index = data.find(self.__placeholder_prefix, strartIndex)
 203             if index != -1:
 204                 id, phEndIndex = self.__findPlaceholder(data, index)
 205
 206                 if id in self.stashed_nodes:
 207                     node = self.stashed_nodes.get(id)
 208
 209                     if index > 0:
 210                         text = data[strartIndex:index]
 211                         linkText(text)
 212
 213                     if not isString(node): # it's Element
 214                         for child in [node] + node.getchildren():
 215                             if child.tail:
 216                                 if child.tail.strip():
 217                                     self.__processElementText(node, child,False)
 218                             if child.text:
 219                                 if child.text.strip():
 220                                     self.__processElementText(child, child)
 221                     else: # it's just a string
 222                         linkText(node)
 223                         strartIndex = phEndIndex
 224                         continue
 225
 226                     strartIndex = phEndIndex
 227                     result.append(node)
 228
 229                 else: # wrong placeholder
 230                     end = index + len(self.__placeholder_prefix)
 231                     linkText(data[strartIndex:end])
 232                     strartIndex = end
 233             else:
 234                 text = data[strartIndex:]
 235                 if isinstance(data, util.AtomicString):
 236                     # We don't want to loose the AtomicString
 237                     text = util.AtomicString(text)
 238                 linkText(text)
 239                 data = ""
 240
 241         return result
 242
 243     def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
 244         """
 245         Check if the line fits the pattern, create the necessary
 246         elements, add it to stashed_nodes.
 247
 248         Keyword arguments:
 249
 250         * data: the text to be processed
 251         * pattern: the pattern to be checked
 252         * patternIndex: index of current pattern
 253         * startIndex: string index, from which we start searching
 254
 255         Returns: String with placeholders instead of ElementTree elements.
 256
 257         """
 258         match = pattern.getCompiledRegExp().match(data[startIndex:])
 259         leftData = data[:startIndex]
 260
 261         if not match:
 262             return data, False, 0
 263
 264         node = pattern.handleMatch(match)
 265
 266         if node is None:
 267             return data, True, len(leftData)+match.span(len(match.groups()))[0]
 268
 269         if not isString(node):
 270             if not isinstance(node.text, util.AtomicString):
 271                 # We need to process current node too
 272                 for child in [node] + node.getchildren():
 273                     if not isString(node):
 274                         if child.text:
 275                             child.text = self.__handleInline(child.text,
 276                                                             patternIndex + 1)
 277                         if child.tail:
 278                             child.tail = self.__handleInline(child.tail,
 279                                                             patternIndex)
 280
 281         placeholder = self.__stashNode(node, pattern.type())
 282
 283         return "%s%s%s%s" % (leftData,
 284                              match.group(1),
 285                              placeholder, match.groups()[-1]), True, 0
 286
 287     def run(self, tree):
 288         """Apply inline patterns to a parsed Markdown tree.
 289
 290         Iterate over ElementTree, find elements with inline tag, apply inline
 291         patterns and append newly created Elements to tree.  If you don't
 292         want to process your data with inline paterns, instead of normal string,
 293         use subclass AtomicString:
 294
 295             node.text = markdown.AtomicString("This will not be processed.")
 296
 297         Arguments:
 298
 299         * tree: ElementTree object, representing Markdown tree.
 300
 301         Returns: ElementTree object with applied inline patterns.
 302
 303         """
 304         self.stashed_nodes = {}
 305
 306         stack = [tree]
 307
 308         while stack:
 309             currElement = stack.pop()
 310             insertQueue = []
 311             for child in currElement.getchildren():
 312                 if child.text and not isinstance(child.text, util.AtomicString):
 313                     text = child.text
 314                     child.text = None
 315                     lst = self.__processPlaceholders(self.__handleInline(
 316                                                     text), child)
 317                     stack += lst
 318                     insertQueue.append((child, lst))
 319                 if child.tail:
 320                     tail = self.__handleInline(child.tail)
 321                     dumby = util.etree.Element('d')
 322                     tailResult = self.__processPlaceholders(tail, dumby)
 323                     if dumby.text:
 324                         child.tail = dumby.text
 325                     else:
 326                         child.tail = None
 327                     pos = currElement.getchildren().index(child) + 1
 328                     tailResult.reverse()
 329                     for newChild in tailResult:
 330                         currElement.insert(pos, newChild)
 331                 if child.getchildren():
 332                     stack.append(child)
 333
 334             for element, lst in insertQueue:
 335                 if self.markdown.enable_attributes:
 336                     if element.text and isString(element.text):
 337                         element.text = \
 338                             inlinepatterns.handleAttributes(element.text,
 339                                                                     element)
 340                 i = 0
 341                 for newChild in lst:
 342                     if self.markdown.enable_attributes:
 343                         # Processing attributes
 344                         if newChild.tail and isString(newChild.tail):
 345                             newChild.tail = \
 346                                 inlinepatterns.handleAttributes(newChild.tail,
 347                                                                     element)
 348                         if newChild.text and isString(newChild.text):
 349                             newChild.text = \
 350                                 inlinepatterns.handleAttributes(newChild.text,
 351                                                                     newChild)
 352                     element.insert(i, newChild)
 353                     i += 1
 354         return tree
 355
 356
 357 class PrettifyTreeprocessor(Treeprocessor):
 358     """ Add linebreaks to the html document. """
 359
 360     def _prettifyETree(self, elem):
 361         """ Recursively add linebreaks to ElementTree children. """
 362
 363         i = "\n"
 364         if util.isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']:
 365             if (not elem.text or not elem.text.strip()) \
 366                     and len(elem) and util.isBlockLevel(elem[0].tag):
 367                 elem.text = i
 368             for e in elem:
 369                 if util.isBlockLevel(e.tag):
 370                     self._prettifyETree(e)
 371             if not elem.tail or not elem.tail.strip():
 372                 elem.tail = i
 373         if not elem.tail or not elem.tail.strip():
 374             elem.tail = i
 375
 376     def run(self, root):
 377         """ Add linebreaks to ElementTree root object. """
 378
 379         self._prettifyETree(root)
 380         # Do <br />'s seperately as they are often in the middle of
 381         # inline content and missed by _prettifyETree.
 382         brs = root.getiterator('br')
 383         for br in brs:
 384             if not br.tail or not br.tail.strip():
 385                 br.tail = '\n'
 386             else:
 387                 br.tail = '\n%s' % br.tail
 388         # Clean up extra empty lines at end of code blocks.
 389         pres = root.getiterator('pre')
 390         for pre in pres:
 391             if len(pre) and pre[0].tag == 'code':
 392                 pre[0].text = pre[0].text.rstrip() + '\n'