Version 2.0.1
[xmlmerge.git] / xmlmerge.py
blob2b7cea82be54456333d1116003eddbf3cab7dbbd
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # XML Merge 2.0.1
6 # Copyright 2008,2009 Felix Rabe <public@felixrabe.net>
9 # This file is part of XML Merge.
11 # XML Merge is free software: you can redistribute it and/or modify it
12 # under the terms of the GNU Lesser General Public License as published by
13 # the Free Software Foundation, either version 3 of the License, or (at
14 # your option) any later version.
16 # XML Merge is distributed in the hope that it will be useful, but
17 # WITHOUT ANY WARRANTY; without even the implied warranty of
18 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 # GNU Lesser General Public License for more details.
21 # You should have received a copy of the GNU Lesser General Public License
22 # along with XML Merge. If not, see <http://www.gnu.org/licenses/>.
25 # Developed (i.e. tested) mainly on Win32 using Python 2.6.4 + lxml 2.2.2,
26 # and to a lesser extent also with Python 2.5.4 + lxml 2.1.1.
28 """
29 The purpose of XML Merge is to preprocess any kind of XML file with great
30 flexibility.
32 XML Merge performs (among other things) recursive XML file inclusion and
33 XML element and attribute modification.
35 XML Merge is a Python module. It is normally invoked as a program from the
36 command line, but can equally well be used from within another Python
37 program or module.
38 """
# Version of XML Merge as a tuple of integers:
__version_info__ = (2, 0, 1)
# Dotted version string built from the first two entries of the tuple:
__version__ = "%d.%d" % __version_info__[:2]
43 ## IMPORTS AND CONSTANTS
45 import copy
46 import itertools
47 import optparse
48 import os
49 import re
50 import sys
51 import textwrap
53 import lxml.etree as ET
# Mapping of namespace prefix -> namespace URI used by XML Merge.  The dict
# can be handed as-is to lxml wherever an nsmap argument is expected:
xmns = dict(
    xm="urn:felixrabe:xmlns:xmlmerge:preprocess",
    xmt="urn:felixrabe:xmlns:xmlmerge:inctrace",
)
60 ## COMMAND LINE OPTION PARSING
class OptionParser(optparse.OptionParser):
    """
    Option parser that comes preloaded with the XML Merge options.

    Levels of verbosity, as stored in the 'verbose' option:

        --quiet      -> verbose == 1    # only show error messages
        (neither)    -> verbose == 2    # no verbosity option given
        --verbose    -> verbose == 3    # show debugging messages
    """

    def __init__(self, *a, **kw):
        optparse.OptionParser.__init__(self, *a, **kw)
        add = self.add_option

        # File name options:
        add("-i", "--input",
            help="(REQUIRED) input XML file")
        add("-o", "--output",
            help="output XML file (.out.xml if not given)")
        add("-s", "--xml-schema",
            help="XML Schema (.xsd) to validate output against")
        add("-r", "--reference",
            help="reference XML file to compare output against")

        # Flags:
        add("-d", "--html-diff", action="store_true",
            help="only with -r; if output and reference "
                 "differ, produce a HTML file showing the "
                 "differences")
        add("-t", "--trace-includes", action="store_true",
            help="add tracing information to included "
                 "XML fragments")

        # Both verbosity switches write to the same destination, so the
        # one given last on the command line wins:
        add("-v", "--verbose", action="store_const",
            dest="verbose", const=3,
            help="show debugging messages")
        add("-q", "--quiet", action="store_const",
            dest="verbose", const=1,
            help="only show error messages")
        self.set_defaults(verbose=2)

    def error(self, *a, **kw):
        """
        Print the complete help text, then hand over to the error() method
        of optparse.OptionParser with unchanged arguments.
        """
        self.print_help()
        return optparse.OptionParser.error(self, *a, **kw)
def parse_command_line(argv):
    """
    parse_command_line(argv) -> optparse.Values

    Parse argv (argv[0] is skipped) and return an optparse.Values object
    containing the options.

    The input option is required and no positional arguments are allowed;
    otherwise the error() method of the option parser is called.  If the
    output option is missing, the output filename is derived from the
    input filename.  The input, output and reference filenames are
    converted to absolute pathnames.

    It also tries to create the containing directory for the output file if
    it does not exist already.
    """
    # Parse options using OptionParser:
    option_parser = OptionParser()
    options, args = option_parser.parse_args(argv[1:])

    # Make sure only options, and no other arguments, are passed on the
    # command line.  This is an explicit check rather than an assertion,
    # because assertions are not executed when Python runs with -O:
    if args or options.input is None:
        option_parser.error("Error: invalid argument list")

    # If the output option has been omitted, build the output filename from
    # the input filename, resulting in the file extension ".out.xml":
    if options.output is None:
        stem = options.input
        if stem.lower().endswith(".xml"):
            stem = stem[:-4]
        options.output = stem + ".out.xml"

    # Convert all filename options to normalized absolutized pathnames:
    for name in ("input", "output", "reference"):
        value = getattr(options, name)
        if value is None:
            continue  # e.g. if "-r" was not given
        setattr(options, name, os.path.abspath(value))

    # When --verbose, print all filename options:
    if options.verbose >= 3:
        print("Input: %s" % options.input)
        print("Output: %s" % options.output)
        print("Reference: %s" % options.reference)

    # Make sure there is a directory where the output XML file should go.
    # This is best effort only: the directory usually exists already, and
    # writing the output fails later if it still does not exist.
    try:
        os.makedirs(os.path.dirname(options.output))
    except OSError:
        pass

    return options
154 ## XML PROCESSING AND COMPARISON
def read_input_file(input_filename):
    """
    read_input_file(input_filename) -> ET._Element

    Parse the XML file named input_filename and return the root element
    of the resulting element tree.
    """
    input_xmltree = ET.parse(input_filename)
    return input_xmltree.getroot()
def postprocess_xml(output_xml):
    """
    postprocess_xml(output_xml) -> ET._Element

    Strip unnecessary namespace declarations and whitespace-only text.
    The cleaned-up tree is a copy that gets returned; the element passed
    in may be modified along the way.
    """
    # Get rid of unused namespace declarations by attaching the element
    # to, and detaching it again from, a temporary root element that
    # declares the XML Merge namespaces:
    # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
    scratch_root = ET.Element("NS_ROOT", nsmap=xmns)
    scratch_root.append(output_xml)
    scratch_root.remove(output_xml)
    # The copy below is required: without it, getroottree() of each
    # element would report the temporary tree containing the (now empty)
    # NS_ROOT element.  That is simply how lxml works.
    cleaned_xml = ET.ElementTree(copy.copy(output_xml)).getroot()

    # Whitespace-only text and tail content is dropped so that
    # pretty-printing works:
    for node in cleaned_xml.iter():
        for slot in ("text", "tail"):
            content = getattr(node, slot)
            if content and not content.strip():
                setattr(node, slot, None)

    return cleaned_xml
def write_output_file(output_xml, output_filename):
    """
    Serialize the tree that output_xml belongs to into the file named
    output_filename: pretty-printed, UTF-8 encoded, with XML declaration.
    """
    write_options = dict(pretty_print=True,
                         xml_declaration=True,
                         encoding="utf-8")
    output_xml.getroottree().write(output_filename, **write_options)
def read_xml_schema_file(xml_schema_filename):
    """
    read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema

    Parse the XML Schema file named xml_schema_filename and build the
    XML Schema object from it.
    """
    return ET.XMLSchema(ET.parse(xml_schema_filename))
def match_against_schema(options, output_xml, xml_schema):
    """
    match_against_schema(options, output_xml, xml_schema) -> bool

    Validate the tree that output_xml belongs to against xml_schema and
    return the validation result.

    Unless options.verbose is below 2, the outcome is also printed; on a
    mismatch, this includes the last entry of the schema's error log.
    """
    is_valid = xml_schema.validate(output_xml.getroottree())

    if options.verbose < 2:
        return is_valid  # quiet: report through the result only

    if is_valid:
        print("Output matches XML Schema.")
        return is_valid

    print("Output invalid according to XML Schema.")
    print(xml_schema.error_log.last_error)
    return is_valid
def _read_file_bytes(filename):
    """Return the complete content of the file, read in binary mode."""
    stream = open(filename, "rb")
    try:
        return stream.read()
    finally:
        stream.close()  # do not rely on garbage collection to close it


def match_against_reference(options, output_xml):
    """
    match_against_reference(options, output_xml) -> bool

    Compare the output string (read from file options.output) to the
    reference string (read from options.reference). If they are not the
    same (bytewise), and if options.html_diff is True, create an HTML file
    showing the differences; its name is the output filename with
    ".diff.html" appended.

    The output_xml argument is not used; the comparison works on the
    files only.

    Progress messages are printed unless options.verbose is below 2.

    The result is True if output and reference are the same (bytewise),
    otherwise the result is False.
    """
    reference_str = _read_file_bytes(options.reference)
    output_str = _read_file_bytes(options.output)
    is_valid = (reference_str == output_str)
    talkative = options.verbose >= 2

    if is_valid:
        if talkative:
            print("Output matches reference.")
        return is_valid

    if not options.html_diff:
        if talkative:
            print("Output and reference differ.")
        return is_valid

    html_filename = "%s.diff.html" % options.output
    if talkative:
        print("Output and reference differ - generating '%s'..."
              % html_filename)
    create_reference_diff_html(html_filename, reference_str, output_str)
    return is_valid
def create_reference_diff_html(html_filename, reference_str, output_str):
    """
    Create an HTML file (created at html_filename) showing the differences
    between the reference string and the output string side-by-side.

    Both strings are split into lines; the columns of the resulting table
    are titled "Reference" and "Output" and wrap at 75 characters.
    """
    import difflib  # only needed here, hence not imported at module level

    reference_lines = reference_str.splitlines()
    output_lines = output_str.splitlines()

    html_diff = difflib.HtmlDiff(wrapcolumn=75)
    html_str = html_diff.make_file(reference_lines, output_lines,
                                   "Reference", "Output")

    # Close the file explicitly so the HTML is completely on disk when
    # this function returns:
    html_file = open(html_filename, "w")
    try:
        html_file.write(html_str)
    finally:
        html_file.close()
278 ## XML ERROR REPORTING
def print_xml_error(xml_element, code=None):
    """
    Write a description of where an XML processing error occurred to
    sys.stderr.

    The report names the URL of the document that xml_element belongs to,
    the source line of xml_element and its XPath within the document.  If
    code (a string) is given, it is shown as the offending Python code /
    expression, with continuation lines indented like the first one.
    """
    tree = xml_element.getroottree()
    xpath = tree.getpath(xml_element)

    report = ["*** XML ERROR ***",
              "File URL: %s" % (tree.docinfo.URL,),
              "Line: %s  XPath: %s" % (xml_element.sourceline, xpath)]
    if code is not None:
        report.append("Offending Python code / expression:")
        report.append(" %s" % code.replace("\n", "\n "))

    # A single write() call is used instead of the 'print >>' statement,
    # which is not valid syntax in Python 3:
    sys.stderr.write("\n".join(report) + "\n")
291 ## XML PREPROCESS CLASS
class XMLPreprocess(object):
    """
    Preprocessor that expands XML Merge processing elements in place.

    Use:

    >>> proc = XMLPreprocess()
    >>> proc(input_xml, xml_filename=input_filename)  # changes input_xml

    Elements in the xmns["xm"] namespace are handled by the method named
    "_xm_" + lowercased tag name and are then removed from the tree.  All
    other elements are recursed into.  The attribute values of every
    visited element undergo '{expression}' substitution first.

    WARNING: Attribute values and the text of <xm:PythonCode/> elements
    are run through eval() / exec.  Only process XML files that come from
    a trusted source.
    """

    def __init__(self, initial_namespace=None):
        """
        Set up the preprocessor with initial_namespace (a dict) as the
        Python namespace used for evaluating expressions.  If it is
        omitted, every instance starts with an empty dict of its own.
        """
        super(XMLPreprocess, self).__init__()
        # Not a '{}' default argument: that single dict would be shared
        # (and modified) by all instances created without a namespace.
        if initial_namespace is None:
            initial_namespace = {}
        self._namespace_stack = [initial_namespace]
        # Provide the attributes that __call__() maintains right away, so
        # they can be read before the first call:
        self.namespace = initial_namespace
        self.trace_includes = False
        self.xml_filename = None

    def __call__(self, xml_element, namespace=None,
                 trace_includes=False, xml_filename=None):
        """
        XMLPreprocess()(...)

        Preprocess the input XML Element, xml_element. The element tree of
        xml_element will be modified in-place.  The result is always None.

        The namespace given should be a dict that can be used as a Python
        namespace. It is pushed on the namespace stack and used in XML
        attribute substitution.  If it is omitted, the namespace on top of
        the stack stays active.

        trace_includes and xml_filename are stored on the instance and
        handed on to recursive calls and to the processing of included
        files.  xml_filename is needed by <xm:Include/> to locate included
        files relative to the including file.

        Processing tags will recursively call this method (__call__) for
        preprocessing the included file and for recursive inclusion.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        self.namespace = self._namespace_stack[-1]
        self.trace_includes = trace_includes
        self.xml_filename = xml_filename

        ns = "{%s}" % xmns["xm"]
        len_ns = len(ns)

        # Evaluate Python expressions in the attributes of xml_element:
        for attr_name, attr_value in xml_element.items():  # attr map
            v = self._eval_substitution(attr_value, xml_element)
            xml_element.set(attr_name, v)

        # If xml_element has xmns["xm"] as its namespace, proceed with the
        # appropriate method of this class:
        if xml_element.nsmap.get(xml_element.prefix) == xmns["xm"]:
            tag = xml_element.tag[len_ns:]  # just the tag without namespc
            method = "_xm_" + tag.lower()   # tolerate any case
            if not hasattr(self, method):
                raise Exception("cannot process <xm:%s/>" % tag)
            getattr(self, method)(xml_element)  # call the method
            # Preserve tail text by handing it over to whatever precedes
            # the processing element, which is about to be removed:
            tail = xml_element.tail
            if tail is not None:
                prev = xml_element.getprevious()
                parent = xml_element.getparent()
                if prev is not None:
                    prev.tail = (prev.tail or "") + tail
                else:
                    parent.text = (parent.text or "") + tail
            xml_element.getparent().remove(xml_element)

        # If not, recurse:
        else:
            self._recurse_into(xml_element)

        return None

    def _recurse_into(self, xml_element, namespace=None):
        """
        Preprocess all child elements of xml_element.

        If namespace is given, it is the active namespace on the stack
        while the children are processed, and it is popped afterwards.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        for xml_sub_element in xml_element.xpath("*"):
            self(xml_sub_element, None,
                 self.trace_includes, self.xml_filename)
        if namespace is not None:
            self._namespace_stack.pop()
        self.namespace = self._namespace_stack[-1]

    def _report_error(self, xml_element, code):
        """
        Describe on sys.stderr which element and which Python code failed,
        followed by an empty line.  The caller re-raises the exception.
        """
        print_xml_error(xml_element, code=code)
        sys.stderr.write("\n")

    _eval_substitution_regex = re.compile(r"\{(.*?)\}")

    def _eval_substitution(self, string, xml_element=None):
        """
        Evaluate Python expressions within strings.

        Internal method to perform substitution of Python expressions
        within attribute values, {x} -> str(eval(x)). Example:

        >>> self._eval_substitution("3 + 5 = {3 + 5} in Python")
        '3 + 5 = 8 in Python'

        Multiple Python expressions in one string are supported as well.

        The expressions are evaluated in the active namespace.  If an
        expression fails and xml_element is given, the error location is
        reported on sys.stderr before the exception is re-raised.
        """
        new_str = []  # faster than continuously concatenating strings
        last_index = 0
        for match in self._eval_substitution_regex.finditer(string):
            new_str.append(string[last_index:match.start()])
            expression = match.group(1)
            try:
                result = str(eval(expression, self.namespace))
            except Exception:
                if xml_element is not None:
                    self._report_error(xml_element, expression)
                raise
            new_str.append(result)
            last_index = match.end()
        new_str.append(string[last_index:])
        return "".join(new_str)

    def _xm_addelements(self, xml_element):
        """
        Add subelements to, before, or after the element selected by XPath
        (@to, @before or @after).

        Exactly one of (@to, @before, @after) must be specified. And the
        XPath expression must return exactly one element. These conditions
        are checked by assertions and will raise an exception if not met.
        """
        to = xml_element.get("to")
        before = xml_element.get("before")
        after = xml_element.get("after")

        assert sum((to is None, before is None, after is None)) == 2, \
            "exactly one of @to, @before, @after must be given"
        select = to or before or after

        selected_context_nodes = xml_element.xpath(select)
        assert len(selected_context_nodes) == 1, \
            "XPath expression must select exactly one element"

        context_node = selected_context_nodes[0]
        replace_context_node = False

        if to is not None:
            f = "append"
        if before is not None:
            f = "addprevious"
        if after is not None:
            f = "addnext"
            # Each added element becomes the next insertion point, which
            # keeps the added elements in their original order:
            replace_context_node = True

        for xml_sub_element in xml_element:
            getattr(context_node, f)(xml_sub_element)
            if replace_context_node:
                context_node = xml_sub_element

    def _xm_block(self, xml_element):
        """
        Create a scope to contain visibility of newly assigned Python
        variables. This works the same way that Python itself scopes
        variables, i.e. by creating a shallow copy of the Python namespace.
        E.g. assignments to list items will be visible to outside scopes!
        """
        self._recurse_into(xml_element, self.namespace.copy())
        # Move the processed children out of the block.  Adding them in
        # reversed order, each directly after the block element, restores
        # their original order:
        for xml_sub_node in xml_element[::-1]:  # get children reversed
            xml_element.addnext(xml_sub_node)

    def _xm_comment(self, xml_element):
        """
        A comment that is removed by XML Merge.
        """
        pass  # that's it

    def _xm_defaultvar(self, xml_element):
        """
        Set (zero or more) variables in the active Python namespace, if not
        already set.

        Each attribute name is a variable name; the attribute value is
        evaluated as a Python expression to obtain the variable's value.
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():  # attr map
            if attr_name not in ns:
                try:
                    ns[attr_name] = eval(attr_value, ns)
                except Exception:
                    self._report_error(xml_element, attr_value)
                    raise

    def _xm_include(self, xml_element):
        """
        Include from the specified file (@file) the elements selected by
        XPath (@select) after preprocessing said file.

        The @file attribute is the only required attribute.  It is
        interpreted relative to the directory of the including file.  If
        @select is omitted, no elements are included, but the file is
        still preprocessed.

        Items can be imported from the included (and preprocessed) file's
        Python namespace into the current file's namespace using the
        @import attribute, which may either be a comma-separated list of
        identifiers, or '*' to import the complete namespace.

        Remaining attributes will be treated as variable assignments and
        put in the Python namespace used for processing the included file.
        Their values are evaluated as Python expressions in the current
        namespace.
        """
        attrib = xml_element.attrib
        file_ = attrib.pop("file", None)
        select = attrib.pop("select", None)
        import_ = attrib.pop("import", None)
        assert file_ is not None, "<xm:Include/> requires @file"
        remaining_attribs = dict(attrib.items())

        # Load the to-be-included file:
        p = os.path

        xml_input_dirname = p.dirname(self.xml_filename)
        xml_incl_filename = p.join(xml_input_dirname, file_)
        xml_incl_filename = p.normpath(xml_incl_filename)
        # Always use '/' for normalized tracing information:
        xml_incl_filename = xml_incl_filename.replace("\\", "/")

        xml_incl = ET.parse(xml_incl_filename).getroot()

        # Build the initial namespace from a copy of the current namespace
        # plus the remaining attributes of the <xm:Include/> element:
        current_ns = self.namespace
        initial_namespace = current_ns.copy()
        for attr_name, attr_value in remaining_attribs.items():  # attr map
            try:
                initial_namespace[attr_name] = eval(attr_value, current_ns)
            except Exception:
                self._report_error(xml_element, attr_value)
                raise

        # Preprocess the to-be-included file:
        proc = XMLPreprocess(initial_namespace=initial_namespace)
        proc(xml_incl, trace_includes=self.trace_includes,
             xml_filename=xml_incl_filename)

        # Select elements to include:
        included_elements = []
        if select is not None:
            included_elements = xml_incl.xpath(select)

        # Include the elements, each one after the previously added one:
        context_node = xml_element
        for inc_elem in included_elements:
            context_node.addnext(inc_elem)
            context_node = inc_elem

        # Import from included namespace:
        imported_namespace = {}
        if import_ is not None:
            import_ = [x.strip() for x in import_.split(",")]
            if "*" in import_:  # import all
                imported_namespace = proc.namespace
            else:
                ns = proc.namespace
                imported_namespace = dict((x, ns[x]) for x in import_)
        self.namespace.update(imported_namespace)

    def _xm_loop(self, xml_element):
        """
        Loop over a range of integer values.

        The first attribute is evaluated as the loop counter. Example:

            i="range(5, 9)"  =>  iterates with i being 5, 6, 7, 8

        For every value, a copy of the loop element is preprocessed with
        the loop counter set in the active namespace, and the text and
        children of that copy are placed after the loop element.

        WARNING: The loop counter attribute, as well as all substitutions
        in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
        (wholly or partially) be evaluated as Python expressions using
        eval().
        """
        # Get the loop counter name and list:
        loop_counter_name = xml_element.keys()[0]
        loop_counter_expr = xml_element.get(loop_counter_name)
        try:
            loop_counter_list = eval(loop_counter_expr, self.namespace)
        except Exception:
            self._report_error(xml_element, loop_counter_expr)
            raise

        # Loop:
        context_node = xml_element  # for new elements
        for loop_counter_value in loop_counter_list:
            self.namespace[loop_counter_name] = loop_counter_value
            tailtext = xml_element.tail
            xml_element.tail = None  # xml_element regarded as document
            # xml_element_copy = copy.copy(xml_element)  # CRASH
            # The following line is the workaround for the preceding one:
            xml_element_copy = ET.XML(ET.tostring(xml_element))
            xml_element.addnext(xml_element_copy)  # temporarily
            xml_element.tail = xml_element_copy.tail = tailtext
            self._recurse_into(xml_element_copy)
            xml_element_copy.getparent().remove(xml_element_copy)
            if xml_element_copy.text is not None:
                if context_node.tail is None:
                    context_node.tail = u""
                context_node.tail += xml_element_copy.text
            for xml_sub_node in xml_element_copy[:]:
                context_node.addnext(xml_sub_node)
                context_node = xml_sub_node

    def _xm_pythoncode(self, xml_element):
        """
        Execute Python code in the current namespace.

        The code is the text of xml_element with common leading
        whitespace removed.  An element without text executes nothing.

        'self' and 'xml_element' are supplied temporarily. They are added
        to the current namespace before the code is executed, and removed
        again afterwards, also if the code raises an exception.
        """
        code = textwrap.dedent(xml_element.text or "").strip()
        ns = self.namespace
        ns["self"] = self
        ns["xml_element"] = xml_element
        try:
            try:
                # Call-like form that is valid in Python 2 and Python 3:
                exec(code, ns)
            except Exception:
                self._report_error(xml_element, code)
                raise
        finally:
            # pop() instead of del: the code may have removed the names
            ns.pop("self", None)
            ns.pop("xml_element", None)

    def _xm_removeattributes(self, xml_element):
        """
        Remove the attributes (@name) from the (zero or more) elements
        selected by XPath (@from or @select).

        It is not considered an error if an attribute cannot be found on a
        selected element.
        """
        attr_name = xml_element.get("name")
        select_xpath = xml_element.get("from") or xml_element.get("select")
        for xml_element_selected in xml_element.xpath(select_xpath):
            # Can't find another way to remove an attribute than by using
            # 'attrib':
            attrib = xml_element_selected.attrib
            if attr_name in attrib:
                del attrib[attr_name]

    def _xm_removeelements(self, xml_element):
        """
        Remove (zero or more) elements selected by XPath (@select).
        """
        select = xml_element.get("select")
        assert select is not None, "<xm:RemoveElements/> requires @select"
        elements = xml_element.xpath(select)
        for el in elements:
            el.getparent().remove(el)

    def _xm_setattribute(self, xml_element):
        """
        Assign the value (@value) to the attribute (@name) of the elements
        selected by XPath (@of or @select).

        Example:
            <Object index="0x1234"/>
            <xm:SetAttribute of="../Object" name="otherattr" value="hello"/>

        Leads to:
            <Object index="0x1234" otherattr="hello"/>
        """
        select = xml_element.get("select", xml_element.get("of"))
        name = xml_element.get("name")
        value = xml_element.get("value")
        assert sum((select is None, name is None, value is None)) == 0, \
            "<xm:SetAttribute/> requires @of (or @select), @name and @value"
        elements = xml_element.xpath(select)
        for el in elements:
            el.set(name, value)

    def _xm_text(self, xml_element):
        """
        Perform '{}' substitution on text.

        The substituted text is put in front of the tail text of
        xml_element, from where it ends up in the document when
        xml_element is removed.
        """
        text = xml_element.text
        if text is None:
            return
        tail = self._eval_substitution(text, xml_element)
        tail += xml_element.tail or ""
        xml_element.tail = tail

    def _xm_var(self, xml_element):
        """
        Set (zero or more) variables in the active Python namespace.

        Each attribute name is a variable name; the attribute value is
        evaluated as a Python expression to obtain the variable's value.
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():  # attr map
            try:
                ns[attr_name] = eval(attr_value, ns)
            except Exception:
                self._report_error(xml_element, attr_value)
                raise
677 ## MAIN FUNCTION
def main(argv, **kargs):
    """
    main(argv, **kargs) -> int

    Run XML Merge as requested by the command line options in argv, and
    return the exit status.  These keyword arguments (**kargs) are
    recognized:

    initial_namespace
        Gets passed on as the initial Python namespace to XMLPreprocess().

    After the XML Merge Manual, this is the first piece of the code a new
    developer will read. Keep this code as simple as possible if you change
    it in any way.

    Result:
      - 0 if the output was written, and all requested validations
        (-s, -r) match.
      - Otherwise a bitmap of the failed validations: bit 1 (value 2) is
        set on XML Schema (-s) mismatch, bit 2 (value 4) is set on
        reference (-r) mismatch.

    Wrong command line options are reported through the error() method of
    the option parser (see parse_command_line()) and do not lead to a
    result.
    """
    # Parse command line to get options:
    options = parse_command_line(argv)

    # Input file => preprocessing => output file:
    xml = read_input_file(options.input)
    preprocess = XMLPreprocess(**kargs)
    preprocess(xml, trace_includes=options.trace_includes,
               xml_filename=options.input)
    xml = postprocess_xml(xml)
    write_output_file(xml, options.output)

    # A validation that was not requested counts as matching:
    mismatch_bitmap = 0

    # If -s: Compare output to XML Schema file:
    if options.xml_schema is not None:
        xml_schema = read_xml_schema_file(options.xml_schema)
        if not match_against_schema(options, xml, xml_schema):
            mismatch_bitmap |= 2

    # If -r: Compare output to reference:
    if options.reference is not None:
        if not match_against_reference(options, xml):
            mismatch_bitmap |= 4

    return mismatch_bitmap
# Run as a program: the result of main() becomes the exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)