WIP Before changing xm:Loop's loop counter to use the form i="range(a, b)"
[xmlmerge.git] / xmlmerge.py
blob2bc82698ad356fa1b52cf4f816bc564b2a4664ed
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # Copyright 2008,2009 Felix Rabe <public@felixrabe.net>
7 # This file is part of XML Merge.
9 # XML Merge is free software: you can redistribute it and/or modify it
10 # under the terms of the GNU Lesser General Public License as published by
11 # the Free Software Foundation, either version 3 of the License, or (at
12 # your option) any later version.
14 # XML Merge is distributed in the hope that it will be useful, but
15 # WITHOUT ANY WARRANTY; without even the implied warranty of
16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 # GNU Lesser General Public License for more details.
19 # You should have received a copy of the GNU Lesser General Public License
20 # along with XML Merge. If not, see <http://www.gnu.org/licenses/>.
23 # Developed (i.e. tested) using Python 2.6.3 and lxml 2.2.2.
25 """
26 The purpose of XML Merge is to preprocess any kind of XML file with great
27 flexibility.
29 XML Merge performs (among other things) recursive XML file inclusion and
30 XML element and attribute modification.
32 XML Merge is a Python module. It is normally invoked as a program from the
33 command line, but can equally well be used from within another Python
34 program or module.
35 """
37 ## IMPORTS AND CONSTANTS
39 import copy
40 import optparse
41 import os
42 import re
43 import sys
45 import lxml.etree as ET
# Prefix -> namespace URI table for the XML Merge namespaces.  The dict can
# be handed unchanged to lxml wherever an nsmap / namespaces argument is
# expected:
xmns = dict(
    xm="urn:felixrabe:xmlns:xmlmerge:preprocess",
    xmt="urn:felixrabe:xmlns:xmlmerge:inctrace"
)
52 ## COMMAND LINE OPTION PARSING
class OptionParser(optparse.OptionParser):
    """
    optparse.OptionParser that already knows all XML Merge options.

    The verbosity level ends up in the 'verbose' option:

        --quiet    ->  verbose == 1  # only show error messages
        (neither)  ->  verbose == 2  # no verbosity option given
        --verbose  ->  verbose == 3  # show debugging messages
    """

    def __init__(self, *args, **kwargs):
        # optparse.OptionParser is called explicitly (not via super()) just
        # like before; all arguments are passed through untouched.
        optparse.OptionParser.__init__(self, *args, **kwargs)

        # Options taking a value (optparse's default "store" action):
        value_options = (
            ("-i", "--input",
             "(REQUIRED) input XML file"),
            ("-o", "--output",
             "output XML file (.out.xml if not given)"),
            ("-s", "--xml-schema",
             "XML Schema (.xsd) to validate output against"),
            ("-r", "--reference",
             "reference XML file to compare output against"),
        )
        for short_flag, long_flag, description in value_options:
            self.add_option(short_flag, long_flag, help=description)

        # Boolean switches:
        switch_options = (
            ("-d", "--html-diff",
             "only with -r; if output and reference differ, produce a "
             "HTML file showing the differences"),
            ("-t", "--trace-includes",
             "add tracing information to included XML fragments"),
        )
        for short_flag, long_flag, description in switch_options:
            self.add_option(short_flag, long_flag, action="store_true",
                            help=description)

        # Both verbosity switches write a constant into the same 'verbose'
        # destination; 2 is used when neither of them is given:
        verbosity_options = (
            ("-v", "--verbose", 3, "show debugging messages"),
            ("-q", "--quiet", 1, "only show error messages"),
        )
        for short_flag, long_flag, level, description in verbosity_options:
            self.add_option(short_flag, long_flag, action="store_const",
                            dest="verbose", const=level, help=description)
        self.set_defaults(verbose=2)
def parse_command_line(argv):
    """
    parse_command_line(argv) -> optparse.Values

    Parse argv and return an optparse.Values object containing the options.

    This function performs all the necessary checks and conversions to make
    sure all necessary options are given, and that all options are
    available in a normalized format.

    It also tries to create the containing directory for the output file if
    it does not exist already.

    argv is the complete argument vector; argv[0] (the program name) is
    skipped.  If positional arguments are present or the input option is
    missing, an error message and the option help are printed and
    SystemExit(1) is raised.
    """
    # Parse options using OptionParser:
    option_parser = OptionParser()
    options, args = option_parser.parse_args(argv[1:])

    # Make sure only options, and no other arguments, are passed on the
    # command line.  This is an explicit check rather than an assert, so
    # that it also works when Python runs with -O:
    if args or options.input is None:
        print("Error: invalid argument list")
        print("")
        option_parser.print_help()
        raise SystemExit(1)

    # If the output option has been omitted, build the output filename from
    # the input filename, resulting in the file extension ".out.xml":
    if options.output is None:
        if options.input.lower().endswith(".xml"):
            options.output = options.input[:-4] + ".out.xml"
        else:
            options.output = options.input + ".out.xml"

    # Convert all filename options to normalized absolutized pathnames:
    for name in ("input", "output", "reference"):
        filename = getattr(options, name)
        if filename is None:
            continue  # e.g. if "-r" was not given
        setattr(options, name, os.path.abspath(filename))

    # When --verbose, print all filename options:
    if options.verbose >= 3:
        print("Input: %s" % options.input)
        print("Output: %s" % options.output)
        print("Reference: %s" % options.reference)

    # Make sure there is a directory where the output XML file should go.
    # Only filesystem errors are ignored here (directory exists already,
    # no permission, ...); writing the output file fails later if there
    # still is no output directory:
    try:
        os.makedirs(os.path.dirname(options.output))
    except OSError:
        pass

    return options
145 ## XML PROCESSING AND COMPARISON
def read_input_file(input_filename):
    """
    read_input_file(input_filename) -> ET._Element

    Parse the XML file named input_filename and hand back the root element
    of the resulting element tree.
    """
    input_tree = ET.parse(input_filename)
    return input_tree.getroot()
def postprocess_xml(output_xml):
    """
    postprocess_xml(output_xml) -> ET._Element

    Strip namespace declarations that are not needed and whitespace that
    would get in the way of pretty-printing.  The result is a modified
    copy of output_xml; the argument itself may be modified by the call.
    """
    # Drop unused namespace declarations by moving the element into, and
    # straight out of, a temporary root that declares the XML Merge
    # namespaces:
    # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
    temporary_root = ET.Element("NS_ROOT", nsmap=xmns)
    temporary_root.append(output_xml)
    temporary_root.remove(output_xml)

    # Without this copy, getroottree() of every element would still report
    # the temporary tree around the (now empty) NS_ROOT element.  That is
    # simply how lxml works, not a hack.
    cleaned_xml = ET.ElementTree(copy.copy(output_xml)).getroot()

    # Whitespace-only text would keep pretty-printing from working:
    for node in cleaned_xml.iter():
        has_children = len(node) > 0
        if has_children and node.text and not node.text.strip():
            node.text = None
        if node.tail and not node.tail.strip():
            node.tail = None

    return cleaned_xml
def write_output_file(output_xml, output_filename):
    """
    Serialize the tree that output_xml belongs to into the file named
    output_filename (pretty-printed, with XML declaration, UTF-8 encoded).
    """
    serialization = dict(pretty_print=True, xml_declaration=True,
                         encoding="utf-8")
    output_xml.getroottree().write(output_filename, **serialization)
def read_xml_schema_file(xml_schema_filename):
    """
    read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema

    Parse the XML Schema file named xml_schema_filename and build the
    XML Schema object from it.
    """
    return ET.XMLSchema(ET.parse(xml_schema_filename))
def match_against_schema(options, output_xml, xml_schema):
    """
    match_against_schema(options, output_xml, xml_schema) -> bool

    Validate the tree of the output XML Element against the XML Schema.

    Returns True if the output matches the XML Schema and False if it does
    not.  Unless --quiet was given (options.verbose < 2), the verdict is
    printed, followed by the schema's last error on a mismatch.
    """
    is_valid = xml_schema.validate(output_xml.getroottree())

    # Nothing to report in quiet mode:
    if options.verbose < 2:
        return is_valid

    if is_valid:
        print("Output matches XML Schema.")
        return is_valid

    print("Output invalid according to XML Schema.")
    print(xml_schema.error_log.last_error)
    return is_valid
def match_against_reference(options, output_xml):
    """
    match_against_reference(options, output_xml) -> bool

    Compare the output string (read from file options.output) to the
    reference string (read from options.reference).  If they are not the
    same (bytewise), and if options.html_diff is True, create an HTML file
    showing the differences (named like the output file, with
    ".diff.html" appended).

    The result is True if output and reference are the same (bytewise),
    otherwise the result is False.

    output_xml is not used; the comparison works on the files only.
    Unless --quiet was given (options.verbose < 2), the outcome is printed.
    """
    reference_filename = options.reference
    output_filename = options.output
    do_html_diff = options.html_diff

    # Read both files in binary mode and close them again right away:
    with open(reference_filename, "rb") as reference_file:
        reference_str = reference_file.read()
    with open(output_filename, "rb") as output_file:
        output_str = output_file.read()

    is_valid = (reference_str == output_str)
    if options.verbose >= 2:
        if is_valid:
            print("Output matches reference.")
        elif not do_html_diff:
            # With --html-diff, the mismatch is reported further below.
            print("Output and reference differ.")
    if do_html_diff and not is_valid:
        html_filename = "%s.diff.html" % output_filename
        if options.verbose >= 2:
            print("Output and reference differ - " +
                  "generating '%s'..." % html_filename)
        create_reference_diff_html(html_filename, reference_str,
                                   output_str)
    return is_valid
def create_reference_diff_html(html_filename, reference_str, output_str):
    """
    Create an HTML file (created at html_filename) showing the differences
    between the reference string and the output string side-by-side.

    The reference ends up in the column titled "Reference", the output in
    the column titled "Output".  Long lines are wrapped at column 75.
    """
    reference_lines = reference_str.splitlines()
    output_lines = output_str.splitlines()

    # difflib is only needed when a diff is actually requested:
    import difflib
    html_diff = difflib.HtmlDiff(wrapcolumn=75)
    html_str = html_diff.make_file(reference_lines, output_lines,
                                   "Reference", "Output")

    # The 'with' block guarantees that the file is flushed and closed:
    with open(html_filename, "w") as html_file:
        html_file.write(html_str)
269 ## XML PREPROCESS CLASS
class XMLPreprocess(object):
    """
    Preprocessor for the elements of the XML Merge ("xm") namespace.

    Use:

    >>> proc = XMLPreprocess()
    >>> output_xml = proc(input_xml)  # input_xml may change
    >>> output_xml = proc(input_xml, input_filename, trace_includes)

    Every element of the "xm" namespace is handled by the method of the
    same name (xm:Loop by Loop(), xm:Include by Include(), ...) and is
    removed from the tree afterwards.

    NOTE: This class is work in progress.  Include, RemoveElements,
    SetAttribute, RemoveAttribute and PythonCode have no implementation
    yet, and Loop and AddElements only validate / evaluate their
    attributes without producing any output.
    """

    # Finds "{...}" placeholders in attribute values; group 1 is the
    # (shortest possible) text between the braces.
    _attr_substitution_regex = re.compile(r"\{(.*?)\}")

    def __call__(self, xml_element, xml_filename=None,
                 trace_includes=False):
        """
        XMLPreprocess()(input_xml) -> ET._Element
        XMLPreprocess()(input_xml, filename, True) -> ET._Element  # traced

        Preprocess the input XML Element to produce an output XML Element.
        The argument may be modified, and is also what gets returned.

        xml_filename is passed on to Include().  trace_includes is meant
        to make the output contain tags that surround included sections
        of the file; it is accepted but not used by this method yet.

        Raises AttributeError for an element of the "xm" namespace for
        which there is no public handler method.
        """
        ns = "{%s}" % xmns["xm"]
        len_ns = len(ns)

        # xpath() returns a list, so removing elements while looping over
        # the results is safe.

        # Process Loop elements:
        for el in xml_element.xpath(".//xm:Loop", namespaces=xmns):
            self.Loop(el)
            el.getparent().remove(el)

        # Process Include elements:
        for el in xml_element.xpath(".//xm:Include", namespaces=xmns):
            self.Include(el, xml_filename)
            el.getparent().remove(el)

        # Process any other elements from the XMLMerge namespace:
        for el in xml_element.xpath(".//xm:*", namespaces=xmns):
            tag = el.tag[len_ns:]
            handler = getattr(self, tag, None)
            # Names starting with "_" are internals, not element handlers:
            if handler is None or tag.startswith("_"):
                raise AttributeError("unknown XML Merge element: %r"
                                     % el.tag)
            handler(el)
            el.getparent().remove(el)

        return xml_element

    def _loop_values(self, loop_spec):
        """
        Return the sequence of values described by the value of a loop
        counter attribute.

            "5..8"         =>  5, 6, 7, 8  (both bounds included)
            "range(5, 9)"  =>  5, 6, 7, 8  (any Python expression)

        WARNING: The bounds or the whole expression are evaluated using
        eval().
        """
        # NOTE(security): eval() on document content - only use with
        # trusted input.  The expressions are evaluated in a namespace of
        # their own so that they cannot see the internals of this module.
        if ".." in loop_spec:
            lower_text, upper_text = loop_spec.split("..", 1)
            lower_bound = eval(lower_text.strip(), {})
            upper_bound = eval(upper_text.strip(), {})
            return range(lower_bound, upper_bound + 1)
        return eval(loop_spec.strip(), {})

    def Loop(self, loop_element):
        """
        Loop over a range of integer values.

        The first attribute besides 'format' is evaluated as the loop
        counter.  Examples:

            i="5..8"         =>  iterates with i being 5, 6, 7, 8
            i="range(5, 9)"  =>  iterates with i being 5, 6, 7, 8

        All further attributes are variables.  They are evaluated anew for
        every value of the loop counter, in document order, and can refer
        to the loop counter and to the variables before them.

        The 'format' attribute is skipped; it has no effect yet.

        NOTE: Work in progress.  The "{...}" placeholders in the
        attributes of the subelements are looked up, but not substituted
        yet, and no elements are generated.

        WARNING: All attributes (XPath "@*"), as well as all substitutions
        in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
        (wholly or partially) be evaluated as Python expressions using
        eval().

        Raises ValueError if there is no loop counter attribute.
        """
        # Get the loop counter (name and value of the first attribute
        # besides 'format') and all variables.  The variables are kept in
        # a list, not a dict, to preserve the order of the attributes:
        loop_counter = None
        loop_spec = None
        variables = []
        for name, value in loop_element.attrib.items():
            if name == "format":
                continue  # TODO: 'format' is not used yet
            if loop_counter is None:  # first attribute besides 'format'
                loop_counter = name
                loop_spec = value
                continue
            variables.append((name, value))
        if loop_counter is None:
            raise ValueError("xm:Loop needs a loop counter attribute")

        # Loop:
        subst_re = self._attr_substitution_regex
        for loop_value in self._loop_values(loop_spec):

            # Build a Python namespace and eval() all variables:
            # NOTE(security): eval() on document content, see warning.
            namespace = {loop_counter: loop_value}
            for name, value in variables:
                namespace[name] = eval(value, namespace, namespace)

            # Loop over the subelements and their attributes:
            for sub_element in loop_element.xpath("descendant::*"):
                for name, value in sub_element.attrib.items():
                    for match in subst_re.finditer(value):
                        # TODO: evaluate match.group(1) in 'namespace' and
                        # substitute the result - not implemented yet.
                        pass

    def Include(self, el, xml_filename):
        """
        Include from the specified file (@file) the elements selected by
        XPath (@select).

        NOTE: Not implemented yet; currently does nothing.
        """

    def AddElements(self, el):
        """
        Add subelements to, before, or after the element selected by XPath
        (@to, @before or @after).

        Exactly one of the three attributes has to be given, otherwise
        ValueError is raised.

        NOTE: Not implemented yet beyond checking the attributes.
        """
        to = el.attrib.get("to")
        before = el.attrib.get("before")
        after = el.attrib.get("after")
        given = [xpath for xpath in (to, before, after)
                 if xpath is not None]
        if len(given) != 1:
            raise ValueError("xm:AddElements needs exactly one of the "
                             "attributes 'to', 'before' and 'after'")
        select = given[0]  # TODO: select and modify the target element

    def RemoveElements(self, el):
        """
        Remove elements selected by XPath (@select).

        NOTE: Not implemented yet; currently does nothing.
        """

    def SetAttribute(self, el):
        """
        Assign the value (@value) to the attribute (@name) of the element
        selected by XPath (@select).

        NOTE: Not implemented yet; currently does nothing.
        """

    def RemoveAttribute(self, el):
        """
        Remove the attribute (@name) from the element selected by XPath
        (@select).

        NOTE: Not implemented yet; currently does nothing.
        """

    def PythonCode(self, el):
        """
        Execute Python code.

        NOTE: Not implemented yet; currently does nothing.
        """
405 ## MAIN FUNCTION
def main(argv):
    """
    main(argv) -> int

    Turn the input into the output as requested by the command line
    options.

    After the XML Merge Manual, this is the first piece of the code a new
    developer will read.  Keep this code as simple as possible if you change
    it in any way.

    These are all possible exit status codes returned or raised by main or
    the functions it calls:

        - On success, and if all requested validations (-s, -r) match:
            return 0
        - On error, e.g. wrong options (see parse_command_line()):
            return 1
        - On mismatch (either XML Schema (-s) or reference (-r)):
            return mismatch_bitmap  # see end of main()
        - To aid understanding the bitmap: If N matching functions are
          provided, and all are requested and all fail to match the output
          file:
            return (2 ** N - 1) * 2  # mismatch_bitmap
    """
    options = parse_command_line(argv)

    # Read, preprocess, postprocess and write:
    input_xml = read_input_file(options.input)
    preprocess = XMLPreprocess()
    preprocessed_xml = preprocess(input_xml, options.input,
                                  options.trace_includes)
    output_xml = postprocess_xml(preprocessed_xml)
    write_output_file(output_xml, options.output)

    # -s: validate the output against the XML Schema file.  The flag
    # only becomes False if the check was requested and failed:
    schema_ok = True
    if options.xml_schema is not None:
        schema_ok = match_against_schema(
            options, output_xml, read_xml_schema_file(options.xml_schema))

    # -r: compare the output to the reference file, same convention:
    reference_ok = True
    if options.reference is not None:
        reference_ok = match_against_reference(options, output_xml)

    # One bit per kind of mismatch:
    mismatch_bitmap = 0
    if not schema_ok:
        mismatch_bitmap |= 2
    if not reference_ok:
        mismatch_bitmap |= 4
    return mismatch_bitmap
# Only run when executed as a program, not when imported as a module; the
# value returned by main() becomes the exit status of the process.
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)