# markdown is released under the BSD license
# Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
# Copyright 2004 Manfred Stienstra (the original version)
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the <organization> nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE PYTHON MARKDOWN PROJECT ''AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL ANY CONTRIBUTORS TO THE PYTHON MARKDOWN PROJECT
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""
PRE-PROCESSORS
=============================================================================

Preprocessors work on source text before we start doing anything too
complicated.
"""
from __future__ import absolute_import
from __future__ import unicode_literals

import re

from . import odict
from . import util
def build_preprocessors(md_instance, **kwargs):
    """ Build the default set of preprocessors used by Markdown.

    Arguments:

    * md_instance: the Markdown instance the preprocessors are bound to.
      Its `safeMode` attribute decides whether raw HTML blocks are handled.
    * kwargs: accepted for interface compatibility; not used here.

    Returns an `odict.OrderedDict` mapping preprocessor names to instances,
    in the order they were registered.

    """
    preprocessors = odict.OrderedDict()
    preprocessors['normalize_whitespace'] = NormalizeWhitespace(md_instance)
    # The raw HTML block preprocessor is left out when safeMode is 'escape'.
    if md_instance.safeMode != 'escape':
        preprocessors["html_block"] = HtmlBlockPreprocessor(md_instance)
    preprocessors["reference"] = ReferencePreprocessor(md_instance)
    return preprocessors
class Preprocessor(util.Processor):
    """
    Preprocessors are run after the text is broken into lines.

    Each preprocessor implements a "run" method that takes a reference to a
    list of lines of the document, modifies it as necessary and returns
    either the same list or a new list.

    Preprocessors must extend markdown.Preprocessor.

    """
    def run(self, lines):
        """
        Each subclass of Preprocessor should override the `run` method, which
        takes the document as a list of strings split by newlines and returns
        the (possibly modified) list of lines.

        This base implementation does nothing and returns None.

        """
        pass
class NormalizeWhitespace(Preprocessor):
    """ Normalize whitespace for consistent parsing. """

    def run(self, lines):
        """
        Return `lines` re-split after normalizing the joined document.

        The lines are joined with newlines, cleaned up as one string and
        split on newlines again, so the returned list may have a different
        length than the input.

        """
        source = '\n'.join(lines)
        # Remove any occurrence of the util.STX / util.ETX marker strings
        # from the input text.
        source = source.replace(util.STX, "").replace(util.ETX, "")
        # Unify line endings to "\n" and make the text end in blank lines.
        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
        # Replace tabs by spaces using the instance's configured tab length.
        source = source.expandtabs(self.markdown.tab_length)
        # Empty out lines that consist solely of spaces.
        source = re.sub(r'(?<=\n) +\n', '\n', source)
        return source.split('\n')
class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval.

    The document is processed as blank-line separated blocks.  Blocks that
    look like raw HTML are handed to `self.markdown.htmlStash.store()` and
    whatever that call returns is emitted in their place; all other blocks
    are passed through unchanged.

    """

    # Templates for the closing tag, tried in order; "%s" is the left tag.
    right_tag_patterns = ["</%s>", "%s>"]
    attrs_pattern = r"""
        \s+(?P<attr>[^>"'/= ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
        |                                                         # OR
        \s+(?P<attr1>[^>"'/= ]+)=(?P<value1>[^> ]+)               # attr=value
        |                                                         # OR
        \s+(?P<attr2>[^>"'/= ]+)                                  # attr
        """
    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
    # When true, the content of a tag carrying a `markdown` attribute is left
    # in the text (only the opening and closing tags are stashed).
    markdown_in_raw = False

    # Matches a `markdown` attribute (with optional value) in an opening tag.
    _markdown_attr_re = re.compile(r'\smarkdown(=[\'"]?[^> ]*[\'"]?)?')

    def _get_left_tag(self, block):
        """Return `(tag, length_of_opening_tag, attrs_dict)` for `block`."""
        m = self.left_tag_re.match(block)
        if m:
            # NOTE(review): the tag is returned as written here, whereas the
            # fallback branch below lower-cases it -- confirm this is intended.
            tag = m.group('tag')
            raw_attrs = m.group('attrs')
            attrs = {}
            if raw_attrs:
                for ma in self.attrs_re.finditer(raw_attrs):
                    if ma.group('attr'):
                        # attr="value" (a missing/empty value becomes "")
                        attrs[ma.group('attr').strip()] = \
                            ma.group('value') or ""
                    elif ma.group('attr1'):
                        # attr=value
                        attrs[ma.group('attr1').strip()] = \
                            ma.group('value1') or ""
                    elif ma.group('attr2'):
                        # bare attr
                        attrs[ma.group('attr2').strip()] = ""
            return tag, len(m.group(0)), attrs
        else:
            # Fallback: everything between "<" and the first ">".
            tag = block[1:].split(">", 1)[0].lower()
            return tag, len(tag)+2, {}

    def _recursive_tagfind(self, ltag, rtag, start_index, block):
        """
        Return the index just past the `rtag` that closes the current tag,
        skipping over nested `ltag`/`rtag` pairs, or -1 if there is none.

        """
        while True:
            i = block.find(rtag, start_index)
            if i == -1:
                return -1
            j = block.find(ltag, start_index)
            # if no ltag, or rtag found before another ltag, return index
            if (j > i or j == -1):
                return i + len(rtag)
            # another ltag found before rtag, use end of ltag as starting
            # point and search again
            j = block.find('>', j)
            start_index = self._recursive_tagfind(ltag, rtag, j + 1, block)
            if start_index == -1:
                # HTML potentially malformed- ltag has no corresponding
                # rtag
                return -1

    def _get_right_tag(self, left_tag, left_index, block):
        """Return `(right_tag, index_just_past_it)` for `block`."""
        for p in self.right_tag_patterns:
            tag = p % left_tag
            i = self._recursive_tagfind("<%s" % left_tag, tag, left_index, block)
            if i > 2:
                return tag.lstrip("<").rstrip(">"), i
        # No closing tag found: report the tail of the block instead.
        return block.rstrip()[-left_index:-1].lower(), len(block)

    def _equal_tags(self, left_tag, right_tag):
        """Return True if `right_tag` is the closing tag for `left_tag`."""
        if left_tag[:1] in ('?', '@', '%'):  # handle PHP, etc.
            return True
        if ("/" + left_tag) == right_tag:
            return True
        # comment blocks use "--" for both ends
        return left_tag == "--" and right_tag == "--"

    def _is_oneliner(self, tag):
        """Return True for tags that never have a closing tag."""
        return (tag in ['hr', 'hr/'])

    def _stash_items(self, items, attrs, left_index, right_tag, new_blocks):
        """
        Stash the HTML collected in `items` and append the result to
        `new_blocks`.

        When `markdown_in_raw` is set and the opening tag has a `markdown`
        attribute, only the opening tag (minus that attribute) and the
        closing tag are stashed; the content in between is appended as
        normal text.  Otherwise all items are stashed as one block.

        """
        stash = self.markdown.htmlStash
        if self.markdown_in_raw and 'markdown' in attrs:
            start = self._markdown_attr_re.sub('', items[0][:left_index])
            items[0] = items[0][left_index:]
            end = items[-1][-len(right_tag)-2:]
            items[-1] = items[-1][:-len(right_tag)-2]
            new_blocks.append(stash.store(start))
            new_blocks.extend(items)
            new_blocks.append(stash.store(end))
        else:
            new_blocks.append(stash.store('\n\n'.join(items)))

    def run(self, lines):
        """
        Replace raw HTML blocks in `lines` by the values returned from
        `self.markdown.htmlStash.store()` and return the new list of lines.

        """
        text = "\n".join(lines)
        new_blocks = []
        text = text.split("\n\n")
        items = []          # blocks of an HTML element still waiting to close
        left_tag = ''
        right_tag = ''
        left_index = 0
        attrs = {}
        in_tag = False  # flag: inside a multi-block HTML element

        while text:
            block = text[0]
            if block.startswith("\n"):
                block = block[1:]
            text = text[1:]

            if block.startswith("\n"):
                block = block[1:]

            if not in_tag:
                if block.startswith("<") and len(block.strip()) > 1:

                    if block[1:4] == "!--":
                        # is a comment block
                        left_tag, left_index, attrs = "--", 2, {}
                    else:
                        left_tag, left_index, attrs = self._get_left_tag(block)
                    right_tag, data_index = self._get_right_tag(left_tag,
                                                                left_index,
                                                                block)
                    # keep checking conditions below and maybe just append

                    if data_index < len(block) \
                            and (util.isBlockLevel(left_tag)
                                 or left_tag == '--'):
                        # text follows the closing tag: process it separately
                        text.insert(0, block[data_index:])
                        block = block[:data_index]

                    if not (util.isBlockLevel(left_tag)
                            or block[1] in ["!", "?", "@", "%"]):
                        new_blocks.append(block)
                        continue

                    if self._is_oneliner(left_tag):
                        new_blocks.append(block.strip())
                        continue

                    if block.rstrip().endswith(">") \
                            and self._equal_tags(left_tag, right_tag):
                        if self.markdown_in_raw and 'markdown' in attrs:
                            start = self._markdown_attr_re.sub(
                                '', block[:left_index])
                            end = block[-len(right_tag)-2:]
                            block = block[left_index:-len(right_tag)-2]
                            new_blocks.append(
                                self.markdown.htmlStash.store(start))
                            new_blocks.append(block)
                            new_blocks.append(
                                self.markdown.htmlStash.store(end))
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))
                        continue
                    else:
                        # if is block level tag and is not complete
                        # NOTE(review): `and` binds tighter than `or`, so a
                        # block level tag always opens a multi-block element
                        # here, whether or not the block ends in ">".
                        if util.isBlockLevel(left_tag) or left_tag == "--" \
                                and not block.rstrip().endswith(">"):
                            items.append(block.strip())
                            in_tag = True
                        else:
                            new_blocks.append(
                                self.markdown.htmlStash.store(block.strip()))

                        continue

                new_blocks.append(block)

            else:
                items.append(block)

                right_tag, data_index = self._get_right_tag(left_tag, 0, block)

                if self._equal_tags(left_tag, right_tag):
                    # if find closing tag

                    if data_index < len(block):
                        # we have more text after right_tag
                        items[-1] = block[:data_index]
                        text.insert(0, block[data_index:])

                    in_tag = False
                    self._stash_items(items, attrs, left_index, right_tag,
                                      new_blocks)
                    items = []

        if items:
            # The document ended inside an HTML element: stash what we have.
            self._stash_items(items, attrs, left_index, right_tag, new_blocks)
            new_blocks.append('\n')

        new_text = "\n\n".join(new_blocks)
        return new_text.split("\n")
class ReferencePreprocessor(Preprocessor):
    """ Remove reference definitions from text and store for later use. """

    # Optional title: "title", 'title' or (title), padded by spaces.
    TITLE = r'[ ]*(\"(.*)\"|\'(.*)\'|\((.*)\))[ ]*'
    # Reference definition: up to three spaces, "[id]:", the link and an
    # optional title on the same line.
    RE = re.compile(r'^[ ]{0,3}\[([^\]]*)\]:\s*([^ ]*)[ ]*(%s)?$' % TITLE, re.DOTALL)
    # A line holding nothing but a title.
    TITLE_RE = re.compile(r'^%s$' % TITLE)

    def run(self, lines):
        """
        Store every reference definition found in `lines` in
        `self.markdown.references` as `id -> (link, title)` and return the
        remaining lines.

        The id is stripped and lower-cased; surrounding angle brackets are
        removed from the link.  If a definition has no title on its own line,
        the following line is consumed as the title when it consists of a
        title only.  The input list is not modified.

        """
        new_text = []
        total = len(lines)
        index = 0
        while index < total:
            line = lines[index]
            index += 1
            m = self.RE.match(line)
            if not m:
                new_text.append(line)
                continue
            ref_id = m.group(1).strip().lower()
            link = m.group(2).lstrip('<').rstrip('>')
            title = m.group(5) or m.group(6) or m.group(7)
            # Check next line for title (if there is a next line at all).
            if not title and index < total:
                tm = self.TITLE_RE.match(lines[index])
                if tm:
                    index += 1
                    title = tm.group(2) or tm.group(3) or tm.group(4)
            self.markdown.references[ref_id] = (link, title)

        return new_text