1 # markdown is released under the BSD license
2 # Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
3 # Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
4 # Copyright 2004 Manfred Stienstra (the original version)
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions are met:
11 # * Redistributions of source code must retain the above copyright
12 # notice, this list of conditions and the following disclaimer.
13 # * Redistributions in binary form must reproduce the above copyright
14 # notice, this list of conditions and the following disclaimer in the
15 # documentation and/or other materials provided with the distribution.
16 # * Neither the name of the <organization> nor the
17 # names of its contributors may be used to endorse or promote products
18 # derived from this software without specific prior written permission.
20 # THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
21 # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22 # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 # DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
24 # BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 # POSSIBILITY OF SUCH DAMAGE.
37 Python Markdown converts Markdown to HTML and can be used as a library or
38 called from the command line.
40 ## Basic usage as a module:
43 html = markdown.markdown(your_text_string)
45 See <http://packages.python.org/Markdown/> for more
46 information and instructions on how to extend the functionality of
47 Python Markdown. Read that before you try modifying this file.
49 ## Authors and License
51 Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and
52 maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan
53 Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
55 Contact: markdown@freewisdom.org
57 Copyright 2007-2013 The Python Markdown Project (v. 1.7 and later)
58 Copyright 200? Django Software Foundation (OrderedDict implementation)
59 Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
60 Copyright 2004 Manfred Stienstra (the original version)
62 License: BSD (see LICENSE for details).
65 from __future__
import absolute_import
66 from __future__
import unicode_literals
67 from .__version
__ import version
, version_info
73 from .preprocessors
import build_preprocessors
74 from .blockprocessors
import build_block_parser
75 from .treeprocessors
import build_treeprocessors
76 from .inlinepatterns
import build_inlinepatterns
77 from .postprocessors
import build_postprocessors
78 from .extensions
import Extension
79 from .serializers
import to_html_string
, to_xhtml_string
# Public names exported by a star-import of this package.
__all__ = ['Markdown', 'markdown', 'markdownFromFile']

# Package-wide logger, registered under the fixed name 'MARKDOWN'.
logger = logging.getLogger('MARKDOWN')
class Markdown(object):
    """Convert Markdown to HTML.

    An instance owns the processing pipeline assembled by `build_parser()`
    (preprocessors, block parser, inline patterns, treeprocessors and
    postprocessors), the list of registered extensions and the serializer
    selected by `set_output_format()`. Use `convert()` for strings and
    `convertFile()` for files or streams.
    """

    doc_tag = "div"     # Element used to wrap document - later removed

    # Options copied onto the instance by __init__ unless overridden by a
    # keyword argument of the same name.
    option_defaults = {
        'html_replacement_text' : '[HTML_REMOVED]',
        'tab_length'            : 4,
        'enable_attributes'     : True,
        'smart_emphasis'        : True,
        'lazy_ol'               : True,
    }

    # Accepted `output_format` names mapped to their serializer callables.
    output_formats = {
        'html'  : to_html_string,
        'html4' : to_html_string,
        'html5' : to_html_string,
        'xhtml' : to_xhtml_string,
        'xhtml1': to_xhtml_string,
        'xhtml5': to_xhtml_string,
    }

    ESCAPED_CHARS = ['\\', '`', '*', '_', '{', '}', '[', ']',
                     '(', ')', '>', '#', '+', '-', '.', '!']

    def __init__(self, *args, **kwargs):
        """
        Creates a new Markdown instance.

        Keyword arguments:

        * extensions: A list of extensions.
           If they are of type string, the module mdx_name.py will be loaded.
           If they are a subclass of markdown.Extension, they will be used
           as-is.
        * extension_configs: Configuration settings for extensions.
        * output_format: Format of output. Supported formats are:
            * "xhtml1": Outputs XHTML 1.x. Default.
            * "xhtml5": Outputs XHTML style tags of HTML 5
            * "xhtml": Outputs latest supported version of XHTML (currently XHTML 1.1).
            * "html4": Outputs HTML 4
            * "html5": Outputs HTML style tags of HTML 5
            * "html": Outputs latest supported version of HTML (currently HTML 4).
            Note that it is suggested that the more specific formats ("xhtml1"
            and "html4") be used as "xhtml" or "html" may change in the future
            if it makes sense at that time.
        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
        * html_replacement_text: Text used when safe_mode is set to "replace".
        * tab_length: Length of tabs in the source. Default: 4
        * enable_attributes: Enable the conversion of attributes. Default: True
        * smart_emphasis: Treat `_connected_words_` intelligently Default: True
        * lazy_ol: Ignore number of first item of ordered lists. Default: True

        """

        # For backward compatibility, map old positional args onto keyword
        # names. zip() stops at the shorter sequence, so any additional
        # positional args are ignored; explicit keywords always win.
        pos = ['extensions', 'extension_configs', 'safe_mode', 'output_format']
        for name, arg in zip(pos, args):
            if name not in kwargs:
                kwargs[name] = arg

        # Loop through kwargs and assign defaults
        for option, default in self.option_defaults.items():
            setattr(self, option, kwargs.get(option, default))

        self.safeMode = kwargs.get('safe_mode', False)
        if self.safeMode and 'enable_attributes' not in kwargs:
            # Disable attributes in safeMode when not explicitly set
            self.enable_attributes = False

        self.registeredExtensions = []
        self.docType = ""
        self.stripTopLevelTags = True

        # The pipeline must exist before extensions are registered.
        # NOTE(review): ordering restored from the embedded line numbering;
        # confirm against the unmangled file.
        self.build_parser()

        self.references = {}
        self.htmlStash = util.HtmlStash()
        self.set_output_format(kwargs.get('output_format', 'xhtml1'))
        self.registerExtensions(extensions=kwargs.get('extensions', []),
                                configs=kwargs.get('extension_configs', {}))
        self.reset()

    def build_parser(self):
        """ Build the parser from the various parts. Returns self. """
        self.preprocessors = build_preprocessors(self)
        self.parser = build_block_parser(self)
        self.inlinePatterns = build_inlinepatterns(self)
        self.treeprocessors = build_treeprocessors(self)
        self.postprocessors = build_postprocessors(self)
        return self

    def registerExtensions(self, extensions, configs):
        """
        Register extensions with this instance of Markdown.

        Keyword arguments:

        * extensions: A list of extensions, which can either
           be strings or objects.  See the docstring on Markdown.
        * configs: A dictionary mapping module names to config options.

        Entries that are `None` are skipped. Returns self.

        """
        for ext in extensions:
            if isinstance(ext, util.string_type):
                # Strings name an extension module; build the object first.
                ext = self.build_extension(ext, configs.get(ext, []))
            if isinstance(ext, Extension):
                ext.extendMarkdown(self, globals())
            elif ext is not None:
                # NOTE(review): the raise statement itself is not visible in
                # this extraction; TypeError assumed -- confirm.
                raise TypeError(
                    'Extension "%s.%s" must be of type: "markdown.Extension"'
                    % (ext.__class__.__module__, ext.__class__.__name__))

        return self

    def build_extension(self, ext_name, configs=None):
        """Build extension by name, then return the module.

        The extension name may contain arguments as part of the string in the
        following format: "extname(key1=value1,key2=value2)"

        * ext_name: Extension name, optionally followed by "(...)" arguments.
        * configs: Iterable of (key, value) pairs or a mapping; copied, never
          mutated. Arguments embedded in `ext_name` override these.

        Returns whatever the module's `makeExtension()` returns. Raises
        ImportError when neither the new-style nor the old-style module can
        be imported, and AttributeError when `makeExtension()` fails with one.

        """

        # Parse extensions config params (ignore the order)
        configs = dict(configs) if configs is not None else {}
        pos = ext_name.find("(") # find the first "("
        if pos > 0:
            ext_args = ext_name[pos+1:-1]
            ext_name = ext_name[:pos]
            for pair in ext_args.split(","):
                if not pair.strip():
                    # Tolerate "ext()" and stray commas.
                    continue
                # Split on the first "=" only so values may contain "=".
                key, value = pair.split("=", 1)
                configs[key.strip()] = value.strip()

        # Setup the module name
        module_name = ext_name
        if '.' not in ext_name:
            module_name = '.'.join(['third_party.markdown.extensions', ext_name])

        # Try loading the extension first from one place, then another
        try: # New style (markdown.extensions.<extension>)
            module = __import__(module_name, {}, {}, [module_name.rpartition('.')[0]])
        except ImportError:
            module_name_old_style = '_'.join(['mdx', ext_name])
            try: # Old style (mdx_<extension>)
                module = __import__(module_name_old_style)
            except ImportError as e:
                message = "Failed loading extension '%s' from '%s' or '%s'" \
                    % (ext_name, module_name, module_name_old_style)
                e.args = (message,) + e.args[1:]
                raise

        # If the module is loaded successfully, we expect it to define a
        # function called makeExtension()
        try:
            return module.makeExtension(configs.items())
        except AttributeError as e:
            message = e.args[0]
            message = "Failed to initiate extension " \
                      "'%s': %s" % (ext_name, message)
            e.args = (message,) + e.args[1:]
            raise

    def registerExtension(self, extension):
        """ This gets called by the extension. Returns self. """
        self.registeredExtensions.append(extension)
        return self

    def reset(self):
        """
        Resets all state variables so that we can start with a new text.

        Clears the HTML stash and the references, then calls `reset()` on
        every registered extension that defines it. Returns self.
        """
        self.htmlStash.reset()
        self.references.clear()

        for extension in self.registeredExtensions:
            if hasattr(extension, 'reset'):
                extension.reset()

        return self

    def set_output_format(self, format):
        """ Set the output format for the class instance.

        `format` is matched case-insensitively against `output_formats`.
        Raises KeyError, with a message listing the valid names, when the
        format is unknown. Returns self.
        """
        self.output_format = format.lower()
        try:
            self.serializer = self.output_formats[self.output_format]
        except KeyError as e:
            valid_formats = sorted(self.output_formats.keys())
            message = 'Invalid Output Format: "%s". Use one of %s.' \
                       % (self.output_format,
                          '"' + '", "'.join(valid_formats) + '"')
            e.args = (message,) + e.args[1:]
            raise
        return self

    def convert(self, source):
        """
        Convert markdown to serialized XHTML or HTML.

        Keyword arguments:

        * source: Source text as a Unicode string.

        Markdown processing takes place in five steps:

        1. A bunch of "preprocessors" munge the input text.
        2. BlockParser() parses the high-level structural elements of the
           pre-processed text into an ElementTree.
        3. A bunch of "treeprocessors" are run against the ElementTree. One
           such treeprocessor runs InlinePatterns against the ElementTree,
           detecting inline markup.
        4. Some post-processors are run against the text after the ElementTree
           has been serialized into text.
        5. The output is written to a string.

        Returns the stripped output string; blank input yields ''. Raises
        ValueError when the wrapping top-level tag cannot be stripped.

        """

        # Fixup the source text
        if not source.strip():
            return ''  # a blank unicode string

        try:
            source = util.text_type(source)
        except UnicodeDecodeError as e:
            # Customise error message while maintaining original traceback
            e.reason += '. -- Note: Markdown only accepts unicode input!'
            raise

        # Split into lines and run the line preprocessors.
        self.lines = source.split("\n")
        for prep in self.preprocessors.values():
            self.lines = prep.run(self.lines)

        # Parse the high-level elements.
        root = self.parser.parseDocument(self.lines).getroot()

        # Run the tree-processors
        for treeprocessor in self.treeprocessors.values():
            newRoot = treeprocessor.run(root)
            # Compare against None explicitly: an Element with no children
            # is falsy, so a bare truth test would discard a valid new root.
            if newRoot is not None:
                root = newRoot

        # Serialize _properly_.  Strip top-level tags.
        output = self.serializer(root)
        if self.stripTopLevelTags:
            try:
                start = output.index('<%s>'%self.doc_tag)+len(self.doc_tag)+2
                end = output.rindex('</%s>'%self.doc_tag)
                output = output[start:end].strip()
            except ValueError:
                if output.strip().endswith('<%s />'%self.doc_tag):
                    # We have an empty document
                    output = ''
                else:
                    # We have a serious problem
                    raise ValueError('Markdown failed to strip top-level tags. Document=%r' % output.strip())

        # Run the text post-processors
        for pp in self.postprocessors.values():
            output = pp.run(output)

        return output.strip()

    def convertFile(self, input=None, output=None, encoding=None):
        """Converts a markdown file and writes the resulting HTML.

        Decodes the file using the provided encoding (defaults to utf-8),
        passes the file content to markdown, and outputs the html to either
        the provided stream or the file with provided name, using the same
        encoding as the source file. The 'xmlcharrefreplace' error handler is
        used when encoding the output.

        **Note:** This is the only place that decoding and encoding of unicode
        takes place in Python-Markdown.  (All other code is unicode-in /
        unicode-out.)

        Keyword arguments:

        * input: File object or path. Reads from stdin if `None`.
        * output: File object or path. Writes to stdout if `None`.
        * encoding: Encoding of input and output files. Defaults to utf-8.

        Returns self.

        """

        encoding = encoding or "utf-8"

        # Read the source
        if input:
            if isinstance(input, util.string_type):
                input_file = codecs.open(input, mode="r", encoding=encoding)
            else:
                input_file = codecs.getreader(encoding)(input)
            try:
                text = input_file.read()
            finally:
                # Release the reader even when decoding fails.
                input_file.close()
        else:
            text = sys.stdin.read()
            if not isinstance(text, util.text_type):
                text = text.decode(encoding)

        text = text.lstrip('\ufeff') # remove the byte-order mark

        # Convert
        html = self.convert(text)

        # Write to file or stdout
        if output:
            if isinstance(output, util.string_type):
                output_file = codecs.open(output, "w",
                                          encoding=encoding,
                                          errors="xmlcharrefreplace")
                try:
                    output_file.write(html)
                finally:
                    # We opened this file, so we always close it.
                    output_file.close()
            else:
                writer = codecs.getwriter(encoding)
                output_file = writer(output, errors="xmlcharrefreplace")
                output_file.write(html)
                # Don't close here. User may want to write more.
        else:
            # Encode manually and write bytes to stdout.
            html = html.encode(encoding, "xmlcharrefreplace")
            try:
                # Write bytes directly to buffer (Python 3).
                sys.stdout.buffer.write(html)
            except AttributeError:
                # Probably Python 2, which works with bytes by default.
                sys.stdout.write(html)

        return self
422 =============================================================================
These are the two functions we really mean to export: markdown() and
markdownFromFile().
def markdown(text, *args, **kwargs):
    """Convert a markdown string to HTML and return HTML as a unicode string.

    Convenience wrapper around the `Markdown` class for the most basic use
    case: it creates a fresh `Markdown` instance from the given arguments
    and converts `text` with it.

    Keyword arguments:

    * text: Markdown formatted text as Unicode or ASCII string.
    * Any arguments accepted by the Markdown class.

    Returns: An HTML document as a string.

    """
    return Markdown(*args, **kwargs).convert(text)
def markdownFromFile(*args, **kwargs):
    """Read markdown code from a file and write it to a file or a stream.

    Convenience wrapper that creates a `Markdown` instance and calls its
    `convertFile` method instead of `convert`.

    Keyword arguments:

    * input: a file name or readable object.
    * output: a file name or writable object.
    * encoding: Encoding of input and output.
    * Any arguments accepted by the Markdown class.

    """
    # For backward compatibility, fold positional args into kwargs. zip()
    # drops surplus positionals, and setdefault() keeps explicit keywords.
    legacy_names = ('input', 'output', 'extensions', 'encoding')
    for key, value in zip(legacy_names, args):
        kwargs.setdefault(key, value)

    converter = Markdown(**kwargs)
    source = kwargs.get('input', None)
    target = kwargs.get('output', None)
    charset = kwargs.get('encoding', None)
    converter.convertFile(source, target, charset)