third_party/markdown/inlinepatterns.py

   1 # markdown is released under the BSD license
   2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
   3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
   4 # Copyright 2004 Manfred Stienstra (the original version)
   5 #
   6 # All rights reserved.
   7 #
   8 # Redistribution and use in source and binary forms, with or without
   9 # modification, are permitted provided that the following conditions are met:
  10 #
  11 # *   Redistributions of source code must retain the above copyright
  12 #     notice, this list of conditions and the following disclaimer.
  13 # *   Redistributions in binary form must reproduce the above copyright
  14 #     notice, this list of conditions and the following disclaimer in the
  15 #     documentation and/or other materials provided with the distribution.
  16 # *   Neither the name of the <organization> nor the
  17 #     names of its contributors may be used to endorse or promote products
  18 #     derived from this software without specific prior written permission.
  19 #
  20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
  21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
  24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30 # POSSIBILITY OF SUCH DAMAGE.
  31
  32
  33 """
  34 INLINE PATTERNS
  35 =============================================================================
  36
  37 Inline patterns such as *emphasis* are handled by means of auxiliary
  38 objects, one per pattern.  Pattern objects must be instances of classes
  39 that extend markdown.Pattern.  Each pattern object uses a single regular
  40 expression and needs to support the following methods:
  41
  42     pattern.getCompiledRegExp() # returns a regular expression
  43
  44     pattern.handleMatch(m) # takes a match object and returns
  45                            # an ElementTree element or just plain text
  46
  47 All of python markdown's built-in patterns subclass from Pattern,
  48 but you can add additional patterns that don't.
  49
  50 Also note that all the regular expressions used by inline must
  51 capture the whole block.  For this reason, they all start with
  52 '^(.*?)' and end with '(.*?)$'.  For the built-in expressions,
  53 Pattern takes care of adding the "^(.*?)" and "(.*?)$".
  54
  55 Finally, the order in which regular expressions are applied is very
  56 important - e.g. if we first replace http://.../ links with <a> tags
  57 and _then_ try to replace inline html, we would end up with a mess.
  58 So, we apply the expressions in the following order:
  59
  60 * escape and backticks have to go before everything else, so
  61   that we can preempt any markdown patterns by escaping them.
  62
  63 * then we handle auto-links (must be done before inline html)
  64
  65 * then we handle inline HTML.  At this point we will simply
  66   replace all inline HTML strings with a placeholder and add
  67   the actual HTML to a hash.
  68
  69 * then inline images (must be done before links)
  70
  71 * then bracketed links, first regular then reference-style
  72
  73 * finally we apply strong and emphasis
  74 """
  75
  76 from __future__ import absolute_import
  77 from __future__ import unicode_literals
  78 from . import util
  79 from . import odict
  80 import re
  81 try:
  82     from urllib.parse import urlparse, urlunparse
  83 except ImportError:
  84     from urlparse import urlparse, urlunparse
  85 try:
  86     from html import entities
  87 except ImportError:
  88     import htmlentitydefs as entities
  89
  90
  91 def build_inlinepatterns(md_instance, **kwargs):
  92     """ Build the default set of inline patterns for Markdown. """
  93     inlinePatterns = odict.OrderedDict()
  94     inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)
  95     inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)
  96     inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)
  97     inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)
  98     inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)
  99     inlinePatterns["image_reference"] = \
 100             ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)
 101     inlinePatterns["short_reference"] = \
 102             ReferencePattern(SHORT_REF_RE, md_instance)
 103     inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)
 104     inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)
 105     inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')
 106     if md_instance.safeMode != 'escape':
 107         inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)
 108     inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)
 109     inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)
 110     inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'strong,em')
 111     inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')
 112     inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')
 113     if md_instance.smart_emphasis:
 114         inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')
 115     else:
 116         inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')
 117     return inlinePatterns
 118
 119 """
 120 The actual regular expressions for patterns
 121 -----------------------------------------------------------------------------
 122 """
 123
 124 NOBRACKET = r'[^\]\[]*'
 125 BRK = ( r'\[('
 126         + (NOBRACKET + r'(\[')*6
 127         + (NOBRACKET+ r'\])*')*6
 128         + NOBRACKET + r')\]' )
 129 NOIMG = r'(?<!\!)'
 130
 131 BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
 132 ESCAPE_RE = r'\\(.)'                             # \<
 133 EMPHASIS_RE = r'(\*)([^\*]+)\2'                    # *emphasis*
 134 STRONG_RE = r'(\*{2}|_{2})(.+?)\2'                      # **strong**
 135 STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'            # ***strong***
 136 SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'  # _smart_emphasis_
 137 EMPHASIS_2_RE = r'(_)(.+?)\2'                 # _emphasis_
 138 LINK_RE = NOIMG + BRK + \
 139 r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
 140 # [text](url) or [text](<url>) or [text](url "title")
 141
 142 IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
 143 # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
 144 REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]'           # [Google][3]
 145 SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'                   # [Google]
 146 IMAGE_REFERENCE_RE = r'\!' + BRK + '\s?\[([^\]]*)\]' # ![alt text][2]
 147 NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                        # stand-alone * or _
 148 AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' # <http://www.123.com>
 149 AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'               # <me@example.com>
 150
 151 HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'               # <...>
 152 ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &amp;
 153 LINE_BREAK_RE = r'  \n'                     # two spaces at end of line
 154
 155
 156 def dequote(string):
 157     """Remove quotes from around a string."""
 158     if ( ( string.startswith('"') and string.endswith('"'))
 159          or (string.startswith("'") and string.endswith("'")) ):
 160         return string[1:-1]
 161     else:
 162         return string
 163
 164 ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
 165
 166 def handleAttributes(text, parent):
 167     """Set values of an element based on attribute definitions ({@id=123})."""
 168     def attributeCallback(match):
 169         parent.set(match.group(1), match.group(2).replace('\n', ' '))
 170     return ATTR_RE.sub(attributeCallback, text)
 171
 172
 173 """
 174 The pattern classes
 175 -----------------------------------------------------------------------------
 176 """
 177
 178 class Pattern(object):
 179     """Base class that inline patterns subclass. """
 180
 181     def __init__(self, pattern, markdown_instance=None):
 182         """
 183         Create an instant of an inline pattern.
 184
 185         Keyword arguments:
 186
 187         * pattern: A regular expression that matches a pattern
 188
 189         """
 190         self.pattern = pattern
 191         self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
 192                                       re.DOTALL | re.UNICODE)
 193
 194         # Api for Markdown to pass safe_mode into instance
 195         self.safe_mode = False
 196         if markdown_instance:
 197             self.markdown = markdown_instance
 198
 199     def getCompiledRegExp(self):
 200         """ Return a compiled regular expression. """
 201         return self.compiled_re
 202
 203     def handleMatch(self, m):
 204         """Return a ElementTree element from the given match.
 205
 206         Subclasses should override this method.
 207
 208         Keyword arguments:
 209
 210         * m: A re match object containing a match of the pattern.
 211
 212         """
 213         pass
 214
 215     def type(self):
 216         """ Return class name, to define pattern type """
 217         return self.__class__.__name__
 218
 219     def unescape(self, text):
 220         """ Return unescaped text given text with an inline placeholder. """
 221         try:
 222             stash = self.markdown.treeprocessors['inline'].stashed_nodes
 223         except KeyError:
 224             return text
 225         def itertext(el):
 226             ' Reimplement Element.itertext for older python versions '
 227             tag = el.tag
 228             if not isinstance(tag, util.string_type) and tag is not None:
 229                 return
 230             if el.text:
 231                 yield el.text
 232             for e in el:
 233                 for s in itertext(e):
 234                     yield s
 235                 if e.tail:
 236                     yield e.tail
 237         def get_stash(m):
 238             id = m.group(1)
 239             if id in stash:
 240                 value = stash.get(id)
 241                 if isinstance(value, util.string_type):
 242                     return value
 243                 else:
 244                     # An etree Element - return text content only
 245                     return ''.join(itertext(value))
 246         return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
 247
 248
 249 class SimpleTextPattern(Pattern):
 250     """ Return a simple text of group(2) of a Pattern. """
 251     def handleMatch(self, m):
 252         text = m.group(2)
 253         if text == util.INLINE_PLACEHOLDER_PREFIX:
 254             return None
 255         return text
 256
 257
 258 class EscapePattern(Pattern):
 259     """ Return an escaped character. """
 260
 261     def handleMatch(self, m):
 262         char = m.group(2)
 263         if char in self.markdown.ESCAPED_CHARS:
 264             return '%s%s%s' % (util.STX, ord(char), util.ETX)
 265         else:
 266             return '\\%s' % char
 267
 268
 269 class SimpleTagPattern(Pattern):
 270     """
 271     Return element of type `tag` with a text attribute of group(3)
 272     of a Pattern.
 273
 274     """
 275     def __init__ (self, pattern, tag):
 276         Pattern.__init__(self, pattern)
 277         self.tag = tag
 278
 279     def handleMatch(self, m):
 280         el = util.etree.Element(self.tag)
 281         el.text = m.group(3)
 282         return el
 283
 284
 285 class SubstituteTagPattern(SimpleTagPattern):
 286     """ Return an element of type `tag` with no children. """
 287     def handleMatch (self, m):
 288         return util.etree.Element(self.tag)
 289
 290
 291 class BacktickPattern(Pattern):
 292     """ Return a `<code>` element containing the matching text. """
 293     def __init__ (self, pattern):
 294         Pattern.__init__(self, pattern)
 295         self.tag = "code"
 296
 297     def handleMatch(self, m):
 298         el = util.etree.Element(self.tag)
 299         el.text = util.AtomicString(m.group(3).strip())
 300         return el
 301
 302
 303 class DoubleTagPattern(SimpleTagPattern):
 304     """Return a ElementTree element nested in tag2 nested in tag1.
 305
 306     Useful for strong emphasis etc.
 307
 308     """
 309     def handleMatch(self, m):
 310         tag1, tag2 = self.tag.split(",")
 311         el1 = util.etree.Element(tag1)
 312         el2 = util.etree.SubElement(el1, tag2)
 313         el2.text = m.group(3)
 314         return el1
 315
 316
 317 class HtmlPattern(Pattern):
 318     """ Store raw inline html and return a placeholder. """
 319     def handleMatch (self, m):
 320         rawhtml = self.unescape(m.group(2))
 321         place_holder = self.markdown.htmlStash.store(rawhtml)
 322         return place_holder
 323
 324     def unescape(self, text):
 325         """ Return unescaped text given text with an inline placeholder. """
 326         try:
 327             stash = self.markdown.treeprocessors['inline'].stashed_nodes
 328         except KeyError:
 329             return text
 330         def get_stash(m):
 331             id = m.group(1)
 332             value = stash.get(id)
 333             if value is not None:
 334                 try:
 335                     return self.markdown.serializer(value)
 336                 except:
 337                     return '\%s' % value
 338
 339         return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
 340
 341
 342 class LinkPattern(Pattern):
 343     """ Return a link element from the given match. """
 344     def handleMatch(self, m):
 345         el = util.etree.Element("a")
 346         el.text = m.group(2)
 347         title = m.group(13)
 348         href = m.group(9)
 349
 350         if href:
 351             if href[0] == "<":
 352                 href = href[1:-1]
 353             el.set("href", self.sanitize_url(self.unescape(href.strip())))
 354         else:
 355             el.set("href", "")
 356
 357         if title:
 358             title = dequote(self.unescape(title))
 359             el.set("title", title)
 360         return el
 361
 362     def sanitize_url(self, url):
 363         """
 364         Sanitize a url against xss attacks in "safe_mode".
 365
 366         Rather than specifically blacklisting `javascript:alert("XSS")` and all
 367         its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
 368         safe url formats. Most urls contain a network location, however some
 369         are known not to (i.e.: mailto links). Script urls do not contain a
 370         location. Additionally, for `javascript:...`, the scheme would be
 371         "javascript" but some aliases will appear to `urlparse()` to have no
 372         scheme. On top of that relative links (i.e.: "foo/bar.html") have no
 373         scheme. Therefore we must check "path", "parameters", "query" and
 374         "fragment" for any literal colons. We don't check "scheme" for colons
 375         because it *should* never have any and "netloc" must allow the form:
 376         `username:password@host:port`.
 377
 378         """
 379         url = url.replace(' ', '%20')
 380         if not self.markdown.safeMode:
 381             # Return immediately bipassing parsing.
 382             return url
 383
 384         try:
 385             scheme, netloc, path, params, query, fragment = url = urlparse(url)
 386         except ValueError:
 387             # Bad url - so bad it couldn't be parsed.
 388             return ''
 389
 390         locless_schemes = ['', 'mailto', 'news']
 391         allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
 392         if scheme not in allowed_schemes:
 393             # Not a known (allowed) scheme. Not safe.
 394             return ''
 395
 396         if netloc == '' and scheme not in locless_schemes:
 397             # This should not happen. Treat as suspect.
 398             return ''
 399
 400         for part in url[2:]:
 401             if ":" in part:
 402                 # A colon in "path", "parameters", "query" or "fragment" is suspect.
 403                 return ''
 404
 405         # Url passes all tests. Return url as-is.
 406         return urlunparse(url)
 407
 408 class ImagePattern(LinkPattern):
 409     """ Return a img element from the given match. """
 410     def handleMatch(self, m):
 411         el = util.etree.Element("img")
 412         src_parts = m.group(9).split()
 413         if src_parts:
 414             src = src_parts[0]
 415             if src[0] == "<" and src[-1] == ">":
 416                 src = src[1:-1]
 417             el.set('src', self.sanitize_url(self.unescape(src)))
 418         else:
 419             el.set('src', "")
 420         if len(src_parts) > 1:
 421             el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))
 422
 423         if self.markdown.enable_attributes:
 424             truealt = handleAttributes(m.group(2), el)
 425         else:
 426             truealt = m.group(2)
 427
 428         el.set('alt', self.unescape(truealt))
 429         return el
 430
 431 class ReferencePattern(LinkPattern):
 432     """ Match to a stored reference and return link element. """
 433
 434     NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)
 435
 436     def handleMatch(self, m):
 437         try:
 438             id = m.group(9).lower()
 439         except IndexError:
 440             id = None
 441         if not id:
 442             # if we got something like "[Google][]" or "[Goggle]"
 443             # we'll use "google" as the id
 444             id = m.group(2).lower()
 445
 446         # Clean up linebreaks in id
 447         id = self.NEWLINE_CLEANUP_RE.sub(' ', id)
 448         if not id in self.markdown.references: # ignore undefined refs
 449             return None
 450         href, title = self.markdown.references[id]
 451
 452         text = m.group(2)
 453         return self.makeTag(href, title, text)
 454
 455     def makeTag(self, href, title, text):
 456         el = util.etree.Element('a')
 457
 458         el.set('href', self.sanitize_url(href))
 459         if title:
 460             el.set('title', title)
 461
 462         el.text = text
 463         return el
 464
 465
 466 class ImageReferencePattern(ReferencePattern):
 467     """ Match to a stored reference and return img element. """
 468     def makeTag(self, href, title, text):
 469         el = util.etree.Element("img")
 470         el.set("src", self.sanitize_url(href))
 471         if title:
 472             el.set("title", title)
 473
 474         if self.markdown.enable_attributes:
 475             text = handleAttributes(text, el)
 476
 477         el.set("alt", self.unescape(text))
 478         return el
 479
 480
 481 class AutolinkPattern(Pattern):
 482     """ Return a link Element given an autolink (`<http://example/com>`). """
 483     def handleMatch(self, m):
 484         el = util.etree.Element("a")
 485         el.set('href', self.unescape(m.group(2)))
 486         el.text = util.AtomicString(m.group(2))
 487         return el
 488
 489 class AutomailPattern(Pattern):
 490     """
 491     Return a mailto link Element given an automail link (`<foo@example.com>`).
 492     """
 493     def handleMatch(self, m):
 494         el = util.etree.Element('a')
 495         email = self.unescape(m.group(2))
 496         if email.startswith("mailto:"):
 497             email = email[len("mailto:"):]
 498
 499         def codepoint2name(code):
 500             """Return entity definition by code, or the code if not defined."""
 501             entity = entities.codepoint2name.get(code)
 502             if entity:
 503                 return "%s%s;" % (util.AMP_SUBSTITUTE, entity)
 504             else:
 505                 return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
 506
 507         letters = [codepoint2name(ord(letter)) for letter in email]
 508         el.text = util.AtomicString(''.join(letters))
 509
 510         mailto = "mailto:" + email
 511         mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
 512                           ord(letter) for letter in mailto])
 513         el.set('href', mailto)
 514         return el
 515