Recursion work
[xmlmerge.git] / xmlmerge.py
blobdf2e55c6a5e832fcfc9f9ed2dbe2122dd33a0ea5
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2008,2009 Felix Rabe <public@felixrabe.net>

# This file is part of XML Merge.

# XML Merge is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# XML Merge is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with XML Merge. If not, see <http://www.gnu.org/licenses/>.

# Developed (i.e. tested) using Python 2.6.4 and lxml 2.2.2.
"""
The purpose of XML Merge is to preprocess any kind of XML file with great
flexibility.

XML Merge performs (among other things) recursive XML file inclusion and
XML element and attribute modification.

XML Merge is a Python module. It is normally invoked as a program from the
command line, but can equally well be used from within another Python
program or module.
"""
## IMPORTS AND CONSTANTS
39 import copy
40 import itertools
41 import optparse
42 import os
43 import re
44 import sys
46 import lxml.etree as ET
# Namespace mapping, prefix -> namespace URI (can be directly used for lxml
# nsmap arguments):
xmns = dict(
    xm="urn:felixrabe:xmlns:xmlmerge:preprocess",
    xmt="urn:felixrabe:xmlns:xmlmerge:inctrace",
)
## COMMAND LINE OPTION PARSING
class OptionParser(optparse.OptionParser):
    """
    optparse.OptionParser preloaded with the XML Merge command line options.

    Verbosity ends up as a single integer in options.verbose:

        --quiet    -> options.verbose == 1   # only show error messages
                   -> options.verbose == 2   # no verbosity option given
        --verbose  -> options.verbose == 3   # show debugging messages
    """

    def __init__(self, *a, **kw):
        optparse.OptionParser.__init__(self, *a, **kw)

        # (option strings, add_option keyword arguments), listed in the
        # order in which they are registered (and shown by --help):
        option_table = (
            (("-i", "--input"),
             dict(help="(REQUIRED) input XML file")),
            (("-o", "--output"),
             dict(help="output XML file (.out.xml if not given)")),
            (("-s", "--xml-schema"),
             dict(help="XML Schema (.xsd) to validate output "
                       "against")),
            (("-r", "--reference"),
             dict(help="reference XML file to compare output "
                       "against")),
            (("-d", "--html-diff"),
             dict(action="store_true",
                  help="only with -r; if output and reference "
                       "differ, produce a HTML file showing the "
                       "differences")),
            (("-t", "--trace-includes"),
             dict(action="store_true",
                  help="add tracing information to included "
                       "XML fragments")),
            (("-v", "--verbose"),
             dict(action="store_const", dest="verbose", const=3,
                  help="show debugging messages")),
            (("-q", "--quiet"),
             dict(action="store_const", dest="verbose", const=1,
                  help="only show error messages")),
        )
        for option_strings, settings in option_table:
            self.add_option(*option_strings, **settings)

        # Neither -v nor -q given -> normal verbosity:
        self.set_defaults(verbose=2)
def parse_command_line(argv):
    """
    parse_command_line(argv) -> optparse.Values

    Parse argv and return an optparse.Values object containing the options.

    This function performs all the necessary checks and conversions to make
    sure all necessary options are given, and that all options are
    available in a normalized format:

    - argv[0] (the program name) is skipped.
    - Positional arguments are rejected and -i/--input is required; on
      violation option_parser.error() is called, which terminates the
      program via SystemExit.
    - If -o/--output is missing, it is derived from the input filename.
    - The input, output and reference filenames are made absolute.

    It also tries to create the containing directory for the output file if
    it does not exist already.
    """
    # Parse options using OptionParser:
    option_parser = OptionParser()
    options, args = option_parser.parse_args(argv[1:])

    # Make sure only options, and no other arguments, are passed on the
    # command line. This is an explicit check rather than an assert, since
    # asserts are stripped when Python runs with -O:
    if args or options.input is None:
        option_parser.error("Error: invalid argument list")

    # If the output option has been omitted, build the output filename from
    # the input filename, resulting in the file extension ".out.xml":
    if options.output is None:
        if options.input.lower().endswith(".xml"):
            options.output = options.input[:-4] + ".out.xml"
        else:
            options.output = options.input + ".out.xml"

    # Convert all filename options to normalized absolutized pathnames:
    for n in ("input", "output", "reference"):
        value = getattr(options, n)
        if value is None:
            continue  # e.g. if "-r" was not given
        setattr(options, n, os.path.abspath(value))

    # When --verbose, print all filename options:
    if options.verbose >= 3:
        print("Input: %s" % options.input)
        print("Output: %s" % options.output)
        print("Reference: %s" % options.reference)

    # Make sure there is a directory where the output XML file should go.
    # This is best effort only: the directory usually exists already, and
    # any real problem surfaces later when the output file is written.
    try:
        os.makedirs(os.path.dirname(options.output))
    except OSError:
        pass  # fail later if there still is no output directory now

    return options
## XML PROCESSING AND COMPARISON
def read_input_file(input_filename):
    """
    read_input_file(input_filename) -> ET._Element

    Parse the XML file at input_filename and hand back the root element of
    the resulting element tree.
    """
    xml_tree = ET.parse(input_filename)
    return xml_tree.getroot()
def postprocess_xml(output_xml):
    """
    postprocess_xml(output_xml) -> ET._Element

    Strip unnecessary namespace declarations and whitespace. The result is
    a modified copy of output_xml; the element passed in may itself be
    modified by this call (it is re-parented along the way).
    """
    # Drop unused namespace declarations by attaching the element to a
    # throw-away parent that declares the XML Merge namespaces and
    # detaching it again:
    # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
    scratch_parent = ET.Element("NS_ROOT", nsmap=xmns)
    scratch_parent.append(output_xml)
    scratch_parent.remove(output_xml)

    # Without the copy, getroottree() of every element would still report
    # the temporary tree that contains the (now empty) NS_ROOT element.
    # That is simply how lxml works, not a hack.
    detached_root = copy.copy(output_xml)
    output_xml = ET.ElementTree(detached_root).getroot()

    # Whitespace-only text would defeat pretty-printing, so clear it:
    for node in output_xml.iter():
        # ... after the element:
        if node.tail and not node.tail.strip():
            node.tail = None
        # ... before the first child (only for elements having children):
        if len(node) and node.text and not node.text.strip():
            node.text = None

    return output_xml
def write_output_file(output_xml, output_filename):
    """
    Serialize the tree that output_xml belongs to into output_filename,
    pretty-printed, UTF-8 encoded and with an XML declaration.
    """
    serialization = dict(pretty_print=True,
                         xml_declaration=True,
                         encoding="utf-8")
    output_xml.getroottree().write(output_filename, **serialization)
def read_xml_schema_file(xml_schema_filename):
    """
    read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema

    Parse the XML Schema (.xsd) file and build the XML Schema object from
    the parsed tree.
    """
    return ET.XMLSchema(ET.parse(xml_schema_filename))
def match_against_schema(options, output_xml, xml_schema):
    """
    match_against_schema(options, output_xml, xml_schema) -> bool

    Validate the tree that output_xml belongs to against xml_schema and
    return the validation result (True on match, False otherwise).

    Unless --quiet was given (options.verbose < 2), the outcome is printed;
    on mismatch the last entry of the schema's error log is printed too.
    """
    is_valid = xml_schema.validate(output_xml.getroottree())

    # Quiet mode: report through the return value only.
    if options.verbose < 2:
        return is_valid

    if is_valid:
        print("Output matches XML Schema.")
    else:
        print("Output invalid according to XML Schema.")
        print(xml_schema.error_log.last_error)
    return is_valid
def match_against_reference(options, output_xml):
    """
    match_against_reference(options, output_xml) -> bool

    Compare the output string (read from file options.output) to the
    reference string (read from options.reference). If they are not the
    same (bytewise), and if options.html_diff is True, create an HTML file
    showing the differences. That file is named after the output file with
    ".diff.html" appended.

    The output_xml argument is not used; the comparison is done on the
    files as they exist on disk.

    The result is True if output and reference are the same (bytewise),
    otherwise the result is False.
    """
    reference_filename = options.reference
    output_filename = options.output
    do_html_diff = options.html_diff

    # Read both files in binary mode, closing them deterministically:
    with open(reference_filename, "rb") as reference_file:
        reference_str = reference_file.read()
    with open(output_filename, "rb") as output_file:
        output_str = output_file.read()

    is_valid = (reference_str == output_str)
    if options.verbose >= 2:
        if is_valid:
            print("Output matches reference.")
        elif not do_html_diff:
            # With -d, the mismatch message is printed below instead.
            print("Output and reference differ.")
    if do_html_diff and not is_valid:
        html_filename = "%s.diff.html" % output_filename
        if options.verbose >= 2:
            print("Output and reference differ - " +
                  "generating '%s'..." % html_filename)
        create_reference_diff_html(html_filename, reference_str,
                                   output_str)
    return is_valid
def create_reference_diff_html(html_filename, reference_str, output_str):
    """
    Create an HTML file (created at html_filename) showing the differences
    between the reference string and the output string side-by-side.

    The left column is titled "Reference", the right column "Output";
    long lines are wrapped at 75 columns.
    """
    import difflib  # imported here: only needed when a diff is produced

    reference_lines = reference_str.splitlines()
    output_lines = output_str.splitlines()

    html_diff = difflib.HtmlDiff(wrapcolumn=75)
    html_str = html_diff.make_file(reference_lines, output_lines,
                                   "Reference", "Output")

    # Use a context manager so the file is flushed and closed right away:
    with open(html_filename, "w") as html_file:
        html_file.write(html_str)
## XML PREPROCESS CLASS
class XMLPreprocess(object):
    """
    Callable preprocessor for XML Merge element trees.

    Use:

    >>> proc = XMLPreprocess()
    >>> proc(input_xml)  # input_xml is modified in-place, returns None

    Elements in the "xm" namespace (see xmns) are dispatched to the method
    named "_xm_" + lowercased tag name, and are removed from the tree
    afterwards. All other elements are recursed into.
    """

    def __init__(self):
        super(XMLPreprocess, self).__init__()
        # Stack of Python namespaces (dicts); the last entry is the
        # innermost scope. The bottom entry is the default namespace.
        self._namespace_stack = [{}]

    def __call__(self, xml_element, namespace=None,
                 trace_includes=False, xml_filename=None):
        """
        XMLPreprocess()(...)

        Preprocess the input XML Element, xml_element. The element tree of
        xml_element will be modified in-place. The return value is always
        None.

        The namespace given should be a dict that can be used as a Python
        namespace. This namespace will be used in XML attribute
        substitution. It is only active for the duration of this call; if
        it is omitted, the innermost namespace currently active is used.

        If trace_includes is True, the output will contain tags that
        surround included sections of the file. The xml_filename argument
        is then required. (Both values are stored on the instance and
        passed along when recursing.)

        Inclusion will recursively call this method (__call__) for
        preprocessing the included file and for recursive inclusion.

        Raises Exception if xml_element is in the "xm" namespace but no
        matching _xm_* method exists.
        """
        entered_scope = namespace is not None
        if entered_scope:
            self._namespace_stack.append(namespace)
        try:
            self.namespace = self._namespace_stack[-1]
            self.trace_includes = trace_includes
            self.xml_filename = xml_filename

            ns = "{%s}" % xmns["xm"]
            len_ns = len(ns)

            # Evaluate Python expressions in the attributes of xml_element:
            for attr_name, attr_value in xml_element.items():  # attr map
                v = self._eval_substitution(attr_value, self.namespace)
                xml_element.set(attr_name, v)

            # If xml_element has xmns["xm"] as its namespace, proceed with
            # the appropriate method of this class:
            if xml_element.nsmap.get(xml_element.prefix) == xmns["xm"]:
                tag = xml_element.tag[len_ns:]  # tag without namespace
                method = "_xm_" + tag.lower()  # tolerate any case
                if not hasattr(self, method):
                    raise Exception("cannot process <xm:%s/>" % tag)
                getattr(self, method)(xml_element)  # call the method
                # NOTE(review): for an xm: element without a parent (the
                # document root) getparent() returns None and this line
                # raises AttributeError - confirm whether that can occur.
                xml_element.getparent().remove(xml_element)

            # If not, recurse:
            else:
                self._recurse_into(xml_element)
        finally:
            # Leave the scope opened above, so that variables assigned
            # inside it (e.g. within <xm:block>) do not leak to elements
            # processed afterwards, and the stack does not keep growing.
            if entered_scope:
                self._namespace_stack.pop()
                self.namespace = self._namespace_stack[-1]

        return None

    def _recurse_into(self, xml_element, namespace=None):
        """
        Preprocess every direct child element of xml_element, optionally
        within the given namespace (see __call__).
        """
        # xpath() returns a list, so children may safely be removed from
        # the tree while iterating:
        for xml_sub_element in xml_element.xpath("*"):
            self(xml_sub_element, namespace,
                 self.trace_includes, self.xml_filename)

    # Matches the shortest possible "{...}" group:
    _eval_substitution_regex = re.compile(r"\{(.*?)\}")

    def _eval_substitution(self, attr_value, namespace):
        """
        Evaluate Python expressions within strings.

        Internal method to perform substitution of Python expressions
        within attribute values, {x} -> str(eval(x)). Example:

        >>> self._eval_substitution("3 + 5 = {3 + 5} in Python", {})
        '3 + 5 = 8 in Python'

        Multiple Python expressions in one string are supported as well.

        The expressions are evaluated with namespace serving as both the
        global and the local namespace.

        WARNING: this uses eval(), so only trusted input must be
        processed.
        """
        new_a_value = []  # faster than always concatenating strings
        last_index = 0
        for match in self._eval_substitution_regex.finditer(attr_value):
            new_a_value.append(attr_value[last_index:match.start()])
            result = str(eval(match.group(1), namespace, namespace))
            new_a_value.append(result)
            last_index = match.end()
        new_a_value.append(attr_value[last_index:])
        return "".join(new_a_value)

    def _xm_addelements(self, xml_element):
        """
        Add subelements to, before, or after the element selected by XPath
        (@to, @before or @after).

        Exactly one of the three attributes must be given. Only this check
        is implemented so far; no elements are added yet.
        """
        to = xml_element.get("to")
        before = xml_element.get("before")
        after = xml_element.get("after")
        assert sum((to is None, before is None, after is None)) == 2
        select = to or before or after  # not used yet (see docstring)

    def _xm_block(self, xml_element):
        """
        Create a scope to contain visibility of newly assigned Python
        variables. This works the same way that Python itself scopes
        variables, i.e. by creating a shallow copy of the Python namespace.
        E.g. assignments to list items will be visible to outside scopes!
        """
        self._recurse_into(xml_element, self.namespace.copy())

    def _xm_comment(self, xml_element):
        """
        A comment that is removed by XML Merge.
        """
        pass  # that's it - __call__ removes the element

    def _xm_include(self, xml_element):
        """
        Include from the specified file (@file) the elements selected by
        XPath (@select).

        Not implemented yet.
        """

    def _xm_loop(self, xml_element):
        """
        Loop over a range of integer values.

        The first attribute is evaluated as the loop counter. Example:

            i="range(5, 9)" => iterates with i being 5, 6, 7, 8

        The expression is evaluated within the current preprocessing
        namespace, so variables assigned earlier (<xm:var/>) can be used.

        The loop body is not implemented yet.

        WARNING: The loop counter attribute, as well as all substitutions
        in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
        (wholly or partially) be evaluated as Python expressions using
        eval().
        """
        # Get the loop counter name and list:
        ns = self.namespace
        loop_counter_name = xml_element.keys()[0]
        loop_counter_list = eval(xml_element.get(loop_counter_name), ns, ns)

        # Loop:
        addnext_to_node = xml_element  # for new elements
        for loop_counter_value in loop_counter_list:
            pass

    def _xm_pythoncode(self, xml_element):
        """
        Execute Python code.

        Not implemented yet.
        """

    def _xm_removeattribute(self, xml_element):
        """
        Remove the attribute (@name) from the element selected by XPath
        (@select).

        Not implemented yet.
        """

    def _xm_removeelements(self, xml_element):
        """
        Remove (zero or more) elements selected by XPath (@select).

        Not implemented yet.
        """

    def _xm_setattribute(self, xml_element):
        """
        Assign the value (@value) to the attribute (@name) of the element
        selected by XPath (@select).

        Not implemented yet.
        """

    def _xm_var(self, xml_element):
        """
        Set a variable.

        Every attribute of xml_element is treated as an assignment: the
        attribute value is evaluated as a Python expression (eval()) within
        the current namespace, and the result is stored in that namespace
        under the attribute name.
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():  # attr map
            ns[attr_name] = eval(attr_value, ns, ns)
## MAIN FUNCTION
def main(argv):
    """
    main(argv) -> int

    Turn the input file into the output file as requested by the command
    line options in argv, then run the requested validations.

    After the XML Merge Manual, this is the first piece of the code a new
    developer will read. Keep this code as simple as possible if you change
    it in any way.

    Exit status codes returned, or raised using SystemExit, by main or the
    functions it calls:

    - 0 on success, if all requested validations (-s, -r) match.
    - On invalid options, parse_command_line() terminates through
      optparse's error(), which raises SystemExit (optparse documents exit
      status 2 for this).
    - On mismatch, a bitmap: bit 1 (value 2) is set if the XML Schema (-s)
      does not match, bit 2 (value 4) if the reference (-r) does not match.
      With N matching functions, all requested and all failing, the result
      is (2 ** N - 1) * 2.
    """
    options = parse_command_line(argv)

    # Input file => preprocessing => output file:
    document = read_input_file(options.input)
    preprocess = XMLPreprocess()
    preprocess(document, trace_includes=options.trace_includes,
               xml_filename=options.input)
    document = postprocess_xml(document)
    write_output_file(document, options.output)

    # A flag stays True unless its check was requested and failed.

    # -s: validate the output against the XML Schema file:
    schema_ok = True
    if options.xml_schema is not None:
        schema_ok = match_against_schema(
            options, document, read_xml_schema_file(options.xml_schema))

    # -r: compare the output to the reference file:
    reference_ok = True
    if options.reference is not None:
        reference_ok = match_against_reference(options, document)

    # Mismatch bitmap: 2 for the schema, 4 for the reference:
    return (int(not schema_ok) << 1) | (int(not reference_ok) << 2)
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)