1 # markdown is released under the BSD license
2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
4 # Copyright 2004 Manfred Stienstra (the original version)
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
16 # * Neither the name of the <organization> nor the
17 # names of its contributors may be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 # POSSIBILITY OF SUCH DAMAGE.
35 =============================================================================
Inline patterns such as *emphasis* are handled by means of auxiliary
objects, one per pattern.  Pattern objects must be instances of classes
that extend markdown.Pattern.  Each pattern object uses a single regular
expression and needs to support the following methods:
42 pattern.getCompiledRegExp() # returns a regular expression
44 pattern.handleMatch(m) # takes a match object and returns
45 # an ElementTree element or just plain text
47 All of python markdown's built-in patterns subclass from Pattern,
48 but you can add additional patterns that don't.
Also note that all the regular expressions used by inline must
capture the whole block.  For this reason, they all start with
'^(.*?)' and end with '(.*?)$'.  For the built-in expressions,
Pattern takes care of adding the "^(.*?)" and "(.*?)$".
55 Finally, the order in which regular expressions are applied is very
56 important - e.g. if we first replace http://.../ links with <a> tags
57 and _then_ try to replace inline html, we would end up with a mess.
58 So, we apply the expressions in the following order:
60 * escape and backticks have to go before everything else, so
61 that we can preempt any markdown patterns by escaping them.
63 * then we handle auto-links (must be done before inline html)
65 * then we handle inline HTML. At this point we will simply
66 replace all inline HTML strings with a placeholder and add
67 the actual HTML to a hash.
69 * then inline images (must be done before links)
71 * then bracketed links, first regular then reference-style
73 * finally we apply strong and emphasis
76 from __future__
import absolute_import
77 from __future__
import unicode_literals
82 from urllib
.parse
import urlparse
, urlunparse
84 from urlparse
import urlparse
, urlunparse
86 from html
import entities
88 import htmlentitydefs
as entities
def build_inlinepatterns(md_instance, **kwargs):
    """ Build the default set of inline patterns for Markdown.

    Keyword arguments:

    * md_instance: The Markdown instance the patterns are built for.  Its
      `safeMode` and `smart_emphasis` attributes decide which patterns
      get registered, and it is handed to every pattern that needs it.
    * kwargs: Accepted for interface compatibility; not used here.

    Returns an `odict.OrderedDict` mapping pattern names to pattern
    objects, in registration order.

    """
    inlinePatterns = odict.OrderedDict()
    inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE)
    inlinePatterns["escape"] = EscapePattern(ESCAPE_RE, md_instance)
    inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, md_instance)
    inlinePatterns["link"] = LinkPattern(LINK_RE, md_instance)
    inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, md_instance)
    inlinePatterns["image_reference"] = \
        ImageReferencePattern(IMAGE_REFERENCE_RE, md_instance)
    inlinePatterns["short_reference"] = \
        ReferencePattern(SHORT_REF_RE, md_instance)
    inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, md_instance)
    inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, md_instance)
    inlinePatterns["linebreak"] = SubstituteTagPattern(LINE_BREAK_RE, 'br')
    # NOTE(review): indentation was lost in the source this was restored
    # from; only the raw-html pattern is assumed to be guarded by the
    # safe-mode check, entities are registered unconditionally - confirm.
    if md_instance.safeMode != 'escape':
        inlinePatterns["html"] = HtmlPattern(HTML_RE, md_instance)
    inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, md_instance)
    inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE)
    inlinePatterns["strong_em"] = DoubleTagPattern(STRONG_EM_RE, 'strong,em')
    inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong')
    inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em')
    # Exactly one underscore-emphasis flavour is registered under the
    # "emphasis2" key; without the `else` the plain pattern would always
    # overwrite the smart one.
    if md_instance.smart_emphasis:
        inlinePatterns["emphasis2"] = SimpleTagPattern(SMART_EMPHASIS_RE, 'em')
    else:
        inlinePatterns["emphasis2"] = SimpleTagPattern(EMPHASIS_2_RE, 'em')
    return inlinePatterns
120 The actual regular expressions for patterns
121 -----------------------------------------------------------------------------
# Building blocks shared by the link/image/reference expressions.
#
# NOTE(review): the opening fragment of BRK and the whole NOIMG definition
# were missing from the source this block was restored from.  They are
# reconstructed from how they are used: BRK must open the group that its
# final r')\]' closes, and NOIMG must add no capturing group (the handlers
# read group(9)/group(13) and LINK_RE back-references \12).  Confirm
# against the upstream file.
NOBRACKET = r'[^\]\[]*'
BRK = (r'\[('
       + (NOBRACKET + r'(\[') * 6
       + (NOBRACKET + r'\])*') * 6
       + NOBRACKET + r')\]')
NOIMG = r'(?<!\!)'

BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'  # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)'                            # \<
EMPHASIS_RE = r'(\*)([^\*]+)\2'                 # *emphasis*
STRONG_RE = r'(\*{2}|_{2})(.+?)\2'              # **strong**
STRONG_EM_RE = r'(\*{3}|_{3})(.+?)\2'           # ***strong***
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'  # _smart_emphasis_
EMPHASIS_2_RE = r'(_)(.+?)\2'                   # _emphasis_
LINK_RE = NOIMG + BRK + \
    r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
# [text](url) or [text](<url>) or [text](url "title")

IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)'
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]'       # [Google][3]
SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]'                # [Google]
# Raw string: '\s' in a plain literal is an invalid escape sequence.
IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]'  # ![alt text][2]
NOT_STRONG_RE = r'((^| )(\*|_)( |$))'                 # stand-alone * or _
AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^>]*)>'  # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'                  # <me@example.com>

HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'       # <...>
ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'                     # &amp;
LINE_BREAK_RE = r'  \n'                               # two spaces at end of line
def dequote(string):
    """Remove quotes from around a string.

    A string that both starts and ends with a double quote, or both
    starts and ends with a single quote, is returned without its first
    and last character.  Any other string is returned unchanged.

    """
    if ((string.startswith('"') and string.endswith('"'))
            or (string.startswith("'") and string.endswith("'"))):
        return string[1:-1]
    return string
# Raw string so that '\{' and '\}' reach the regex engine untouched instead
# of being flagged as invalid string escapes.
ATTR_RE = re.compile(r"\{@([^\}]*)=([^\}]*)}")  # {@id=123}


def handleAttributes(text, parent):
    """Set values of an element based on attribute definitions ({@id=123}).

    Keyword arguments:

    * text: Text that may contain `{@name=value}` definitions.
    * parent: The element to set the attributes on (anything with a
      `set(name, value)` method).

    Returns `text` with every attribute definition removed.  Newlines
    inside a value are replaced by spaces before the value is set.

    """
    def attributeCallback(match):
        parent.set(match.group(1), match.group(2).replace('\n', ' '))
        # The definition itself is dropped from the text.  Return the empty
        # string explicitly rather than relying on re.sub() tolerating a
        # None result from the callback.
        return ''
    return ATTR_RE.sub(attributeCallback, text)
175 -----------------------------------------------------------------------------
class Pattern(object):
    """Base class that inline patterns subclass. """

    def __init__(self, pattern, markdown_instance=None):
        """
        Create an instance of an inline pattern.

        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * markdown_instance: Optional Markdown instance.  When given (and
          truthy) it is stored as `self.markdown`; otherwise the attribute
          is not set at all.

        """
        self.pattern = pattern
        # Wrap the expression so that a match always covers the whole
        # string: group(1) is the text before the pattern and the last
        # group the text after it.  The pattern's own groups therefore
        # start at group(2).
        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        # Api for Markdown to pass safe_mode into instance
        self.safe_mode = False
        if markdown_instance:
            self.markdown = markdown_instance

    def getCompiledRegExp(self):
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m):
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Keyword arguments:

        * m: A re match object containing a match of the pattern.

        """
        pass

    def type(self):
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Every inline placeholder found in `text` is replaced by the text
        of the node stashed under its id.  If no 'inline' treeprocessor
        is registered, `text` is returned unchanged.

        """
        # NOTE(review): the try/except around this lookup and several
        # lines of the two helpers below were missing from the source this
        # method was restored from - confirm against upstream.
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def itertext(el):
            ' Reimplement Element.itertext for older python versions '
            tag = el.tag
            if not isinstance(tag, util.string_type) and tag is not None:
                return
            if el.text:
                yield el.text
            for e in el:
                for s in itertext(e):
                    yield s
                if e.tail:
                    yield e.tail

        def get_stash(m):
            node_id = m.group(1)
            if node_id in stash:
                value = stash.get(node_id)
                if isinstance(value, util.string_type):
                    return value
                # An etree Element - return text content only
                return ''.join(itertext(value))
            # Placeholders with no stashed node are removed from the text.
            return ''
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
class SimpleTextPattern(Pattern):
    """ Return a simple text of group(2) of a Pattern. """
    def handleMatch(self, m):
        text = m.group(2)
        # NOTE(review): the statements following this test were missing
        # from the source this class was restored from.  A match that is
        # just the placeholder prefix is assumed to be rejected (None) -
        # confirm against upstream.
        if text == util.INLINE_PLACEHOLDER_PREFIX:
            return None
        return text
class EscapePattern(Pattern):
    """ Return an escaped character. """

    def handleMatch(self, m):
        """ Return the escaped character of group(2) as a placeholder.

        A character listed in the Markdown instance's `ESCAPED_CHARS` is
        returned as its code point wrapped in `util.STX` / `util.ETX`.

        """
        char = m.group(2)
        if char in self.markdown.ESCAPED_CHARS:
            return '%s%s%s' % (util.STX, ord(char), util.ETX)
        # NOTE(review): the branch for characters that are not escapable
        # was missing from the source this class was restored from; it is
        # assumed to report "no result" - confirm against upstream.
        return None
class SimpleTagPattern(Pattern):
    """
    Return element of type `tag` with a text attribute of group(3)
    of a Pattern.

    """
    def __init__(self, pattern, tag):
        """
        Keyword arguments:

        * pattern: A regular expression that matches a pattern
        * tag: Name of the element created by `handleMatch`

        """
        Pattern.__init__(self, pattern)
        self.tag = tag

    def handleMatch(self, m):
        el = util.etree.Element(self.tag)
        el.text = m.group(3)
        return el
class SubstituteTagPattern(SimpleTagPattern):
    """ Return an element of type `tag` with no children. """
    def handleMatch(self, m):
        # The matched text is discarded; only an empty element of the
        # configured tag is produced.
        return util.etree.Element(self.tag)
class BacktickPattern(Pattern):
    """ Return a `<code>` element containing the matching text. """
    def __init__(self, pattern):
        Pattern.__init__(self, pattern)
        self.tag = "code"

    def handleMatch(self, m):
        el = util.etree.Element(self.tag)
        # group(3) is the text between the backticks; it is stripped and
        # wrapped in util.AtomicString.
        el.text = util.AtomicString(m.group(3).strip())
        return el
class DoubleTagPattern(SimpleTagPattern):
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc.

    `tag` must be two comma separated tag names ("tag1,tag2").  The
    returned element is of type tag1; its only child is of type tag2 and
    carries the text of group(3).

    """
    def handleMatch(self, m):
        tag1, tag2 = self.tag.split(",")
        el1 = util.etree.Element(tag1)
        el2 = util.etree.SubElement(el1, tag2)
        el2.text = m.group(3)
        return el1
class HtmlPattern(Pattern):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m):
        rawhtml = self.unescape(m.group(2))
        place_holder = self.markdown.htmlStash.store(rawhtml)
        return place_holder

    def unescape(self, text):
        """ Return unescaped text given text with an inline placeholder.

        Unlike `Pattern.unescape`, stashed nodes are passed through the
        Markdown instance's serializer rather than reduced to their text.

        """
        try:
            stash = self.markdown.treeprocessors['inline'].stashed_nodes
        except KeyError:
            return text

        def get_stash(m):
            node_id = m.group(1)
            value = stash.get(node_id)
            if value is not None:
                try:
                    return self.markdown.serializer(value)
                except Exception:
                    # NOTE(review): this fallback was missing from the
                    # source this class was restored from; values the
                    # serializer rejects are assumed to be emitted
                    # backslash-prefixed - confirm against upstream.
                    return r'\%s' % value
            # Placeholders with no stashed node are removed from the text.
            return ''

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)
class LinkPattern(Pattern):
    """ Return a link element from the given match. """
    def handleMatch(self, m):
        el = util.etree.Element("a")
        # NOTE(review): the lines reading the match groups were missing
        # from the source this method was restored from.  The group
        # numbers follow the structure of LINK_RE (text in group 2, url in
        # group 9, title in group 13) - confirm against upstream.
        el.text = m.group(2)
        title = m.group(13)
        href = m.group(9)

        if href:
            if href[0] == "<":
                # [text](<url>) form: drop the angle brackets
                href = href[1:-1]
            el.set("href", self.sanitize_url(self.unescape(href.strip())))
        else:
            el.set("href", "")

        if title:
            title = dequote(self.unescape(title))
            el.set("title", title)
        return el

    def sanitize_url(self, url):
        """
        Sanitize a url against xss attacks in "safe_mode".

        Rather than specifically blacklisting `javascript:alert("XSS")` and all
        its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
        safe url formats. Most urls contain a network location, however some
        are known not to (i.e.: mailto links). Script urls do not contain a
        location. Additionally, for `javascript:...`, the scheme would be
        "javascript" but some aliases will appear to `urlparse()` to have no
        scheme. On top of that relative links (i.e.: "foo/bar.html") have no
        scheme. Therefore we must check "path", "parameters", "query" and
        "fragment" for any literal colons. We don't check "scheme" for colons
        because it *should* never have any and "netloc" must allow the form:
        `username:password@host:port`.

        Spaces in the url are always replaced by "%20".  Outside of safe
        mode the url is otherwise returned untouched.

        """
        url = url.replace(' ', '%20')
        if not self.markdown.safeMode:
            # Return immediately bypassing parsing.
            return url

        # NOTE(review): the return statements of the rejecting branches
        # below were missing from the source this method was restored
        # from; rejected urls are assumed to become the empty string -
        # confirm against upstream.
        try:
            parts = urlparse(url)
        except ValueError:
            # Bad url - so bad it couldn't be parsed.
            return ''
        scheme, netloc = parts[0], parts[1]

        locless_schemes = ['', 'mailto', 'news']
        allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
        if scheme not in allowed_schemes:
            # Not a known (allowed) scheme. Not safe.
            return ''

        if netloc == '' and scheme not in locless_schemes:
            # This should not happen. Treat as suspect.
            return ''

        if any(":" in part for part in parts[2:]):
            # A colon in "path", "parameters", "query" or "fragment" is suspect.
            return ''

        # Url passes all tests. Return url as-is.
        return urlunparse(parts)
class ImagePattern(LinkPattern):
    """ Return a img element from the given match. """
    def handleMatch(self, m):
        el = util.etree.Element("img")
        # group(9) holds everything between the parentheses: the source,
        # optionally followed by a whitespace separated title.
        src_parts = m.group(9).split()
        # NOTE(review): the handling of an empty source and the branch
        # taken when attributes are disabled were missing from the source
        # this method was restored from - confirm against upstream.
        if src_parts:
            src = src_parts[0]
            if src[0] == "<" and src[-1] == ">":
                # ![alt](<url>) form: drop the angle brackets
                src = src[1:-1]
            el.set('src', self.sanitize_url(self.unescape(src)))
        else:
            el.set('src', "")
        if len(src_parts) > 1:
            el.set('title', dequote(self.unescape(" ".join(src_parts[1:]))))

        if self.markdown.enable_attributes:
            # Moves {@name=value} definitions from the alt text onto `el`.
            truealt = handleAttributes(m.group(2), el)
        else:
            truealt = m.group(2)

        el.set('alt', self.unescape(truealt))
        return el
class ReferencePattern(LinkPattern):
    """ Match to a stored reference and return link element. """

    NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE)

    def handleMatch(self, m):
        """ Return the element for the reference, or None if undefined. """
        # NOTE(review): the exception handling around group(9) and the
        # return for undefined references were missing from the source
        # this method was restored from - confirm against upstream.
        try:
            ref_id = m.group(9).lower()
        except IndexError:
            # Expressions without a separate id group (e.g. "[Google]")
            ref_id = None
        if not ref_id:
            # if we got something like "[Google][]" or "[Goggle]"
            # we'll use "google" as the id
            ref_id = m.group(2).lower()

        # Clean up linebreaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        if ref_id not in self.markdown.references:  # ignore undefined refs
            return None
        href, title = self.markdown.references[ref_id]

        text = m.group(2)
        return self.makeTag(href, title, text)

    def makeTag(self, href, title, text):
        """ Return an `a` element for the given href, title and text. """
        el = util.etree.Element('a')

        el.set('href', self.sanitize_url(href))
        if title:
            el.set('title', title)

        el.text = text
        return el
class ImageReferencePattern(ReferencePattern):
    """ Match to a stored reference and return img element. """
    def makeTag(self, href, title, text):
        """ Return an `img` element; `text` becomes the alt attribute. """
        el = util.etree.Element("img")
        el.set("src", self.sanitize_url(href))
        if title:
            el.set("title", title)

        if self.markdown.enable_attributes:
            # Moves {@name=value} definitions from the alt text onto `el`.
            text = handleAttributes(text, el)

        el.set("alt", self.unescape(text))
        return el
class AutolinkPattern(Pattern):
    """ Return a link Element given an autolink (`<http://example/com>`). """
    def handleMatch(self, m):
        el = util.etree.Element("a")
        el.set('href', self.unescape(m.group(2)))
        # The url doubles as the link text, wrapped in util.AtomicString.
        el.text = util.AtomicString(m.group(2))
        return el
class AutomailPattern(Pattern):
    """
    Return a mailto link Element given an automail link (`<foo@example.com>`).
    """
    def handleMatch(self, m):
        el = util.etree.Element('a')
        email = self.unescape(m.group(2))
        if email.startswith("mailto:"):
            email = email[len("mailto:"):]

        def codepoint2name(code):
            """Return entity definition by code, or the code if not defined."""
            entity = entities.codepoint2name.get(code)
            if entity:
                return "%s%s;" % (util.AMP_SUBSTITUTE, entity)
            else:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

        # The visible address is written as one character reference per
        # letter (named where an entity exists, numeric otherwise).
        letters = [codepoint2name(ord(letter)) for letter in email]
        el.text = util.AtomicString(''.join(letters))

        # The href uses numeric character references throughout, including
        # for the "mailto:" prefix.
        mailto = "mailto:" + email
        mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' %
                          ord(letter) for letter in mailto])
        el.set('href', mailto)
        return el