Re-submission of https://codereview.chromium.org/1041213003/
[chromium-blink-merge.git] / third_party / markdown / inlinepatterns.py
blob1d6fce423b0a982092a634202259ce856bb58815
1 # markdown is released under the BSD license
2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
4 # Copyright 2004 Manfred Stienstra (the original version)
5 #
6 # All rights reserved.
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
16 # * Neither the name of the <organization> nor the
17 # names of its contributors may be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 # POSSIBILITY OF SUCH DAMAGE.
33 """
34 INLINE PATTERNS
35 =============================================================================
37 Inline patterns such as *emphasis* are handled by means of auxiliary
38 objects, one per pattern. Pattern objects must be instances of classes
39 that extend markdown.Pattern. Each pattern object uses a single regular
40 expression and needs to support the following methods:
42 pattern.getCompiledRegExp() # returns a regular expression
44 pattern.handleMatch(m) # takes a match object and returns
45 # an ElementTree element or just plain text
47 All of python markdown's built-in patterns subclass from Pattern,
48 but you can add additional patterns that don't.
50 Also note that all the regular expressions used by inline must
51 capture the whole block. For this reason, they all start with
52 '^(.*?)' and end with '(.*?)$'. For the built-in expressions,
53 Pattern takes care of adding the "^(.*?)" and "(.*?)$".
55 Finally, the order in which regular expressions are applied is very
56 important - e.g. if we first replace http://.../ links with <a> tags
57 and _then_ try to replace inline html, we would end up with a mess.
58 So, we apply the expressions in the following order:
60 * escape and backticks have to go before everything else, so
61 that we can preempt any markdown patterns by escaping them.
63 * then we handle auto-links (must be done before inline html)
65 * then we handle inline HTML. At this point we will simply
66 replace all inline HTML strings with a placeholder and add
67 the actual HTML to a hash.
69 * then inline images (must be done before links)
71 * then bracketed links, first regular then reference-style
73 * finally we apply strong and emphasis
74 """
76 from __future__ import absolute_import
77 from __future__ import unicode_literals
78 from . import util
79 from . import odict
80 import re
81 try:
82 from urllib.parse import urlparse, urlunparse
83 except ImportError:
84 from urlparse import urlparse, urlunparse
85 try:
86 from html import entities
87 except ImportError:
88 import htmlentitydefs as entities
def build_inlinepatterns(md_instance, **kwargs):
    """Assemble the default inline patterns for ``md_instance``.

    Returns an ``odict.OrderedDict`` mapping each pattern name to its
    pattern object, filled in the order listed below. The ``html`` entry
    is omitted when ``md_instance.safeMode`` equals ``'escape'``, and
    ``md_instance.smart_emphasis`` decides which underscore-emphasis
    expression backs the ``emphasis2`` entry. Extra keyword arguments are
    accepted but not used.
    """
    entries = [
        ("backtick", BacktickPattern(BACKTICK_RE)),
        ("escape", EscapePattern(ESCAPE_RE, md_instance)),
        ("reference", ReferencePattern(REFERENCE_RE, md_instance)),
        ("link", LinkPattern(LINK_RE, md_instance)),
        ("image_link", ImagePattern(IMAGE_LINK_RE, md_instance)),
        ("image_reference",
         ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)),
        ("short_reference", ReferencePattern(SHORT_REF_RE, md_instance)),
        ("autolink", AutolinkPattern(AUTOLINK_RE, md_instance)),
        ("automail", AutomailPattern(AUTOMAIL_RE, md_instance)),
        ("linebreak", SubstituteTagPattern(LINE_BREAK_RE, 'br')),
    ]

    # Raw inline HTML is only stashed when safe mode is not 'escape'.
    if md_instance.safeMode != 'escape':
        entries.append(("html", HtmlPattern(HTML_RE, md_instance)))

    entries.extend([
        ("entity", HtmlPattern(ENTITY_RE, md_instance)),
        ("not_strong", SimpleTextPattern(NOT_STRONG_RE)),
        ("strong_em", DoubleTagPattern(STRONG_EM_RE, 'strong,em')),
        ("strong", SimpleTagPattern(STRONG_RE, 'strong')),
        ("emphasis", SimpleTagPattern(EMPHASIS_RE, 'em')),
    ])

    underscore_re = (SMART_EMPHASIS_RE if md_instance.smart_emphasis
                     else EMPHASIS_2_RE)
    entries.append(("emphasis2", SimpleTagPattern(underscore_re, 'em')))

    inlinePatterns = odict.OrderedDict()
    for name, pattern in entries:
        inlinePatterns[name] = pattern
    return inlinePatterns
"""
The actual regular expressions for patterns
-----------------------------------------------------------------------------
"""
# NOTE: Pattern.__init__ wraps every expression below as
# "^(.*?)<expr>(.*?)$", so group 1 is always the text before the match.
# Backreferences (\2, \12) and the group numbers used by the handleMatch
# methods are therefore counted for that wrapped form.

NOBRACKET = r'[^\]\[]*'
# Bracketed text allowing up to six levels of nested [...] pairs.
BRK = ( r'\[('
        + (NOBRACKET + r'(\[')*6
        + (NOBRACKET+ r'\])*')*6
        + NOBRACKET + r')\]' )
# Negative lookbehind: the match must not be preceded by "!" (an image).
NOIMG = r'(?<!\!)'

BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)'                             # \<
EMPHASIS_RE = r'(\*)([^\*]+)\2'                    # *emphasis*
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'                      # **strong**
STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'            # ***strong***
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'  # _smart_emphasis_
EMPHASIS_2_RE = r'(_)(.+?)\2'                 # _emphasis_
# In the wrapped form: group 2 is the link text, 9 the url, 12 the quote
# character and 13 the title.
LINK_RE = NOIMG + BRK + \
r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
# [text](url) or [text](<url>) or [text](url "title")

IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
REFERENCE_RE = NOIMG + BRK+ r'\s?\[([^\]]*)\]'           # [Google][3]
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'                   # [Google]
# Raw string: the previous plain literal relied on the invalid escape
# sequences "\s" and "\[" being passed through unchanged.
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]' # ![alt text][2]
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                        # stand-alone * or _
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>' # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'               # <me@example.com>

HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'               # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &amp;
# A hard line break needs TWO spaces before the newline; a single
# trailing space must not produce a <br>.
LINE_BREAK_RE = r'  \n'                     # two spaces at end of line
def dequote(string):
    """Strip one pair of matching quotes wrapped around ``string``.

    If ``string`` both starts and ends with a double quote, or both starts
    and ends with a single quote, its first and last characters are
    dropped. Any other input is returned unchanged.
    """
    for quote in ('"', "'"):
        if string.startswith(quote) and string.endswith(quote):
            return string[1:-1]
    return string
# Raw string: the previous plain literal relied on the invalid escape
# sequences "\{" and "\}" being passed through unchanged.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}") # {@id=123}


def handleAttributes(text, parent):
    """Set values of an element based on attribute definitions ({@id=123}).

    Every ``{@name=value}`` definition found in ``text`` is applied to
    ``parent`` through ``parent.set(name, value)``, with newlines inside
    the value replaced by spaces.

    Returns ``text`` with all attribute definitions removed.
    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # Remove the definition from the text explicitly instead of
        # relying on how re.sub treats a callback that returns None.
        return ''
    return ATTR_RE.sub(attributeCallback, text)
"""
The pattern classes
-----------------------------------------------------------------------------
"""
class Pattern(object):
    """Base class for inline patterns."""

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional object stored as ``self.markdown``;
          the attribute is only set when a truthy value is given.

        """
        self.pattern = pattern
        # Wrap the expression so that one match spans the whole text:
        # group 1 is everything before the pattern, the last group is
        # everything after it.
        wrapped = "^(.*?)%s(.*?)$" % pattern
        self.compiled_re = re.compile(wrapped, re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """Return the compiled, wrapped regular expression."""
        return self.compiled_re

    def handleMatch(self, m):
        """Return an ElementTree element from the given match.

        Subclasses should override this method; this base implementation
        returns None.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        return None

    def type(self):
        """Return the class name, which identifies the pattern type."""
        return self.__class__.__name__

    def unescape(self, text):
        """Return ``text`` with inline placeholders replaced by stashed text.

        Each match of ``util.INLINE_PLACEHOLDER_RE`` is looked up in the
        ``stashed_nodes`` of the 'inline' treeprocessor. A stashed string
        is substituted as is; any other stashed value is treated as an
        element and replaced by its concatenated text content. ``text`` is
        returned unchanged when looking up the 'inline' treeprocessor
        raises KeyError.
        """
        try:
            stashed = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def text_of(node):
            ' Reimplement Element.itertext for older python versions '
            tag = node.tag
            if tag is not None and not isinstance(tag, util.string_type):
                return
            if node.text:
                yield node.text
            for child in node:
                for piece in text_of(child):
                    yield piece
                if child.tail:
                    yield child.tail

        def restore(match):
            key = match.group(1)
            if key not in stashed:
                # Unknown ids yield None, exactly as before.
                return None
            found = stashed.get(key)
            if isinstance(found, util.string_type):
                return found
            # An etree Element - return text content only
            return ''.join(text_of(found))

        return util.INLINE_PLACEHOLDER_RE.sub(restore, text)
class SimpleTextPattern(Pattern):
    """Pattern whose result is the plain text of group(2)."""

    def handleMatch(self, m):
        """Return group(2), or None when it equals the placeholder prefix."""
        matched = m.group(2)
        return None if matched == util.INLINE_PLACEHOLDER_PREFIX else matched
class EscapePattern(Pattern):
    """Pattern that handles a backslash-escaped character."""

    def handleMatch(self, m):
        """Return the encoded form of the escaped character in group(2).

        A character listed in ``self.markdown.ESCAPED_CHARS`` becomes its
        code point wrapped in ``util.STX`` / ``util.ETX``. Any other
        character is returned with its backslash restored.
        """
        escaped = m.group(2)
        if escaped not in self.markdown.ESCAPED_CHARS:
            return '\\%s' % escaped
        return '%s%s%s' % (util.STX, ord(escaped), util.ETX)
class SimpleTagPattern(Pattern):
    """
    Pattern producing an element of type `tag` whose text is group(3)
    of the match.

    """
    def __init__(self, pattern, tag):
        """Compile ``pattern`` and remember the element name ``tag``."""
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        """Return a new `tag` element with group(3) as its text."""
        node = util.etree.Element(self.tag)
        node.text = m.group(3)
        return node
class SubstituteTagPattern(SimpleTagPattern):
    """Pattern producing a bare element of type `tag` (no text, no children)."""

    def handleMatch(self, m):
        """Return a new, empty `tag` element; the match content is ignored."""
        empty = util.etree.Element(self.tag)
        return empty
class BacktickPattern(Pattern):
    """Pattern producing a `<code>` element from backtick-quoted text."""

    def __init__(self, pattern):
        """Compile ``pattern``; the element name is fixed to "code"."""
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        """Return a code element holding the stripped text of group(3).

        The text is wrapped in ``util.AtomicString``.
        """
        code = util.etree.Element(self.tag)
        stripped = m.group(3).strip()
        code.text = util.AtomicString(stripped)
        return code
class DoubleTagPattern(SimpleTagPattern):
    """Pattern producing two nested elements, e.g. for strong emphasis.

    ``self.tag`` must hold two element names separated by a comma: the
    outer one first, then the inner one.

    """
    def handleMatch(self, m):
        """Return the outer element; the inner one carries group(3) as text."""
        outer_name, inner_name = self.tag.split(",")
        outer = util.etree.Element(outer_name)
        inner = util.etree.SubElement(outer, inner_name)
        inner.text = m.group(3)
        return outer
class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """

    def handleMatch(self, m):
        """Stash the html matched in group(2) and return its placeholder.

        Inline placeholders inside the html are expanded with
        ``unescape`` first; the result is handed to
        ``self.markdown.htmlStash.store`` and that call's return value is
        returned.
        """
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Each match of ``util.INLINE_PLACEHOLDER_RE`` whose id is found in
        the 'inline' treeprocessor's ``stashed_nodes`` is replaced by
        ``self.markdown.serializer(value)``. If serializing raises, a
        backslash followed by the value's string form is used instead.
        ``text`` is returned unchanged when looking up the 'inline'
        treeprocessor raises KeyError.
        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def get_stash(m):
            id = m.group(1)
            value = stash.get(id)
            if value is not None:
                try:
                    return self.markdown.serializer(value)
                except Exception:
                    # Only ordinary errors are handled; a bare ``except``
                    # would also swallow KeyboardInterrupt and SystemExit.
                    # Raw string: literal backslash, then the value.
                    return r'\%s' % value
            # Ids missing from the stash fall through and return None.

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
class LinkPattern(Pattern):
    """Pattern producing an `<a>` element for an inline link."""

    def handleMatch(self, m):
        """Return an anchor built from the link text, url and title groups.

        group(2) is the link text, group(9) the url and group(13) the
        optional title. A url wrapped in angle brackets loses them, and
        the url is unescaped and sanitized before being stored. A missing
        url yields an empty ``href``.
        """
        link = util.etree.Element("a")
        link.text = m.group(2)
        title = m.group(13)
        target = m.group(9)

        if not target:
            link.set("href", "")
        else:
            if target[0] == "<":
                target = target[1:-1]
            link.set("href", self.sanitize_url(self.unescape(target.strip())))

        if title:
            link.set("title", dequote(self.unescape(title)))
        return link

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Known-safe url formats are whitelisted, instead of blacklisting
        `javascript:alert("XSS")` and all of its aliases (see
        <http://ha.ckers.org/xss.html>). Most urls contain a network
        location, but some are known not to (i.e.: mailto links), and
        script urls never do. For `javascript:...` the scheme would be
        "javascript", yet some aliases appear to `urlparse()` to have no
        scheme at all, just like relative links (i.e.: "foo/bar.html").
        Therefore "path", "parameters", "query" and "fragment" are checked
        for literal colons. "scheme" is not checked because it *should*
        never contain one, and "netloc" must allow the form
        `username:password@host:port`.

        Spaces are always replaced by "%20". Outside safe mode the url is
        then returned without further checks; in safe mode a rejected url
        is replaced by the empty string.

        """
        url = url.replace(' ', '%20')
        if not self.markdown.safeMode:
            # Return immediately, bypassing parsing.
            return url

        try:
            parsed = urlparse(url)
            scheme, netloc, path, params, query, fragment = parsed
        except ValueError:
            # Bad url - so bad it couldn't be parsed.
            return ''

        locless_schemes = ['', 'mailto', 'news']
        allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
        if scheme not in allowed_schemes:
            # Not a known (allowed) scheme. Not safe.
            return ''

        if netloc == '' and scheme not in locless_schemes:
            # This should not happen. Treat as suspect.
            return ''

        # A colon in "path", "parameters", "query" or "fragment" is suspect.
        if any(":" in part for part in parsed[2:]):
            return ''

        # Url passes all tests. Return url as-is.
        return urlunparse(parsed)
class ImagePattern(LinkPattern):
    """Pattern producing an `<img>` element for an inline image."""

    def handleMatch(self, m):
        """Return an img element built from the alt text and source groups.

        group(9) is split on whitespace: the first part is the source
        (angle brackets around it are removed) and any remaining parts,
        joined by single spaces and dequoted, form the title. group(2) is
        the alt text; when ``self.markdown.enable_attributes`` is set,
        ``{@name=value}`` definitions in it are applied to the element
        and removed from the text.
        """
        img = util.etree.Element("img")
        parts = m.group(9).split()

        if not parts:
            img.set('src', "")
        else:
            source = parts[0]
            if source[0] == "<" and source[-1] == ">":
                source = source[1:-1]
            img.set('src', self.sanitize_url(self.unescape(source)))

        if len(parts) > 1:
            img.set('title', dequote(self.unescape(" ".join(parts[1:]))))

        alt = m.group(2)
        if self.markdown.enable_attributes:
            alt = handleAttributes(alt, img)

        img.set('alt', self.unescape(alt))
        return img
class ReferencePattern(LinkPattern):
    """Pattern resolving a reference-style link against stored references."""

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        """Return the element for a defined reference, or None otherwise.

        The reference id is the lower-cased group(9). When that group does
        not exist or is empty (e.g. "[Google][]" or "[Google]"), the
        lower-cased link text in group(2) is used instead. Line breaks in
        the id are replaced by single spaces before the lookup in
        ``self.markdown.references``.
        """
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            ref_id = None
        if not ref_id:
            # if we got something like "[Google][]" or "[Google]"
            # we'll use "google" as the id
            ref_id = m.group(2).lower()

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        if ref_id not in self.markdown.references:  # ignore undefined refs
            return None
        href, title = self.markdown.references[ref_id]

        return self.makeTag(href, title, m.group(2))

    def makeTag(self, href, title, text):
        """Return an anchor with a sanitized ``href``, ``text`` and optional title."""
        link = util.etree.Element('a')

        link.set('href', self.sanitize_url(href))
        if title:
            link.set('title', title)

        link.text = text
        return link
class ImageReferencePattern(ReferencePattern):
    """Pattern resolving a reference-style image against stored references."""

    def makeTag(self, href, title, text):
        """Return an img element for the resolved reference.

        ``href`` is sanitized and stored as ``src``; ``title`` is set only
        when truthy. ``text`` becomes the ``alt`` attribute after inline
        placeholders are unescaped. When
        ``self.markdown.enable_attributes`` is set, ``{@name=value}``
        definitions in ``text`` are first applied to the element and
        removed from the text.
        """
        img = util.etree.Element("img")
        img.set("src", self.sanitize_url(href))
        if title:
            img.set("title", title)

        alt = text
        if self.markdown.enable_attributes:
            alt = handleAttributes(alt, img)

        img.set("alt", self.unescape(alt))
        return img
class AutolinkPattern(Pattern):
    """Pattern producing a link element for an autolink (`<http://example/com>`)."""

    def handleMatch(self, m):
        """Return an anchor whose href and text both come from group(2).

        The href is the unescaped url; the visible text is the url as
        matched, wrapped in ``util.AtomicString``.
        """
        address = m.group(2)
        link = util.etree.Element("a")
        link.set('href', self.unescape(address))
        link.text = util.AtomicString(address)
        return link
class AutomailPattern(Pattern):
    """
    Pattern producing a mailto link element for an automail link
    (`<foo@example.com>`).
    """
    def handleMatch(self, m):
        """Return an anchor whose address is written as character entities.

        The address is the unescaped group(2) with a leading "mailto:"
        removed. In the link text each character becomes a named entity
        when ``entities.codepoint2name`` has one, else a numeric entity.
        The href is "mailto:" plus the address, written entirely as
        numeric entities. Every entity starts with
        ``util.AMP_SUBSTITUTE``.
        """
        link = util.etree.Element('a')
        address = self.unescape(m.group(2))
        scheme = "mailto:"
        if address.startswith(scheme):
            address = address[len(scheme):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            name = entities.codepoint2name.get(code)
            if not name:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)
            return "%s%s;" % (util.AMP_SUBSTITUTE, name)

        encoded_text = ''.join(codepoint2name(ord(ch)) for ch in address)
        link.text = util.AtomicString(encoded_text)

        target = "mailto:" + address
        encoded_href = "".join(util.AMP_SUBSTITUTE + '#%d;' % ord(ch)
                               for ch in target)
        link.set('href', encoded_href)
        return link