Complete <xm:Include/> implementation
[xmlmerge.git] / xmlmerge.py
blob377165204770b8d9fbc4ecf2da75b745a0a507bc
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # XML Merge 2.0-pre
6 # Copyright 2008,2009 Felix Rabe <public@felixrabe.net>
9 # This file is part of XML Merge.
11 # XML Merge is free software: you can redistribute it and/or modify it
12 # under the terms of the GNU Lesser General Public License as published by
13 # the Free Software Foundation, either version 3 of the License, or (at
14 # your option) any later version.
16 # XML Merge is distributed in the hope that it will be useful, but
17 # WITHOUT ANY WARRANTY; without even the implied warranty of
18 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 # GNU Lesser General Public License for more details.
21 # You should have received a copy of the GNU Lesser General Public License
22 # along with XML Merge. If not, see <http://www.gnu.org/licenses/>.
25 # Developed (i.e. tested) using Python 2.6.4 and lxml 2.2.2.
27 """
28 The purpose of XML Merge is to preprocess any kind of XML file with great
29 flexibility.
31 XML Merge performs (among other things) recursive XML file inclusion and
32 XML element and attribute modification.
34 XML Merge is a Python module. It is normally invoked as a program from the
35 command line, but can equally well be used from within another Python
36 program or module.
37 """
# Version as a tuple (major, minor, micro, tag) and as a "major.minor" string
# built from the first two tuple items.
__version_info__ = (2, 0, -1, 'git')
__version__ = ".".join(str(n) for n in __version_info__[:2])
42 ## IMPORTS AND CONSTANTS
44 import copy
45 import itertools
46 import optparse
47 import os
48 import re
49 import sys
50 import textwrap
52 import lxml.etree as ET
# Namespace mapping (can be directly used for lxml nsmap arguments):
xmns = {
    "xm": "urn:felixrabe:xmlns:xmlmerge:preprocess",
    "xmt": "urn:felixrabe:xmlns:xmlmerge:inctrace",
}
59 ## COMMAND LINE OPTION PARSING
class OptionParser(optparse.OptionParser):
    """
    Command line option parser that knows all XML Merge options.

    Levels of verbosity (stored in the 'verbose' option):
      --quiet    ->  verbose == 1   # only show error messages
                 ->  verbose == 2   # no verbosity option given
      --verbose  ->  verbose == 3   # show debugging messages
    """

    def __init__(self, *a, **kw):
        """
        Create the parser and register all options. All arguments are
        passed on unchanged to optparse.OptionParser.__init__().
        """
        optparse.OptionParser.__init__(self, *a, **kw)
        self.add_option("-i", "--input",
                        help=("(REQUIRED) input XML file"))
        self.add_option("-o", "--output",
                        help=("output XML file (.out.xml if not given)"))
        self.add_option("-s", "--xml-schema",
                        help=("XML Schema (.xsd) to validate output " +
                              "against"))
        self.add_option("-r", "--reference",
                        help=("reference XML file to compare output " +
                              "against"))
        self.add_option("-d", "--html-diff", action="store_true",
                        help=("only with -r; if output and reference " +
                              "differ, produce a HTML file showing the " +
                              "differences"))
        self.add_option("-t", "--trace-includes", action="store_true",
                        help=("add tracing information to included " +
                              "XML fragments"))
        # -v and -q share one destination, so the last one given wins:
        self.add_option("-v", "--verbose", action="store_const",
                        dest="verbose", const=3,
                        help=("show debugging messages"))
        self.add_option("-q", "--quiet", action="store_const",
                        dest="verbose", const=1,
                        help=("only show error messages"))
        self.set_defaults(verbose=2)

    def error(self, *a, **kw):
        """
        Print the complete help text, then delegate to
        optparse.OptionParser.error() with the same arguments.
        """
        self.print_help()
        return optparse.OptionParser.error(self, *a, **kw)
def parse_command_line(argv):
    """
    parse_command_line(argv) -> optparse.Values

    Parse argv and return an optparse.Values object containing the options.

    This function performs all the necessary checks and conversions to make
    sure all necessary options are given, and that all options are
    available in a normalized format.

    It also tries to create the containing directory for the output file if
    it does not exist already.

    argv is the complete argument list including the program name;
    argv[0] is skipped. Invalid argument lists are reported through
    OptionParser.error().
    """
    # Parse options using OptionParser:
    option_parser = OptionParser()
    options, args = option_parser.parse_args(argv[1:])

    # Make sure only options, and no other arguments, are passed on the
    # command line, and that the required input option is present. This is
    # an explicit check (not an assert) so it still works under "python -O":
    if args or options.input is None:
        option_parser.error("Error: invalid argument list")

    # If the output option has been omitted, build the output filename from
    # the input filename, resulting in the file extension ".out.xml":
    if options.output is None:
        if options.input.lower().endswith(".xml"):
            options.output = options.input[:-4] + ".out.xml"
        else:
            options.output = options.input + ".out.xml"

    # Convert all filename options to normalized absolutized pathnames:
    for n in ("input", "output", "reference"):
        value = getattr(options, n)
        if value is None:
            continue  # e.g. if "-r" was not given
        setattr(options, n, os.path.abspath(value))

    # When --verbose, print all filename options:
    if options.verbose >= 3:
        print("Input: %s" % options.input)
        print("Output: %s" % options.output)
        print("Reference: %s" % options.reference)

    # Make sure there is a directory where the output XML file should go:
    try:
        os.makedirs(os.path.dirname(options.output))
    except OSError:
        # Typically the directory exists already. Any other problem makes
        # writing the output file fail later on.
        pass

    return options
153 ## XML PROCESSING AND COMPARISON
def read_input_file(input_filename):
    """
    read_input_file(input_filename) -> ET._Element

    Read the input file, and return the corresponding XML Element object,
    the element tree root.
    """
    return ET.parse(input_filename).getroot()
def postprocess_xml(output_xml):
    """
    postprocess_xml(output_xml) -> ET._Element

    Remove unnecessary namespace declarations and whitespace. Returns a
    modified copy of output_xml. The argument may be modified by calling
    this function.
    """
    # Remove unused namespace declarations by temporarily hanging the tree
    # below an element that declares the XML Merge namespaces:
    # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
    ns_root = ET.Element("NS_ROOT", nsmap=xmns)
    ns_root.append(output_xml)
    ns_root.remove(output_xml)
    # If you don't perform this copy, each output_xml element's
    # getroottree() will report the temporary tree containing the empty
    # NS_ROOT element. This is not a hack, this is about how lxml works.
    output_xml = ET.ElementTree(copy.copy(output_xml)).getroot()

    # Make pretty-printing work by removing whitespace-only text and tail:
    for el in output_xml.iter():
        if el.text and not el.text.strip():
            el.text = None
        if el.tail and not el.tail.strip():
            el.tail = None

    return output_xml
def write_output_file(output_xml, output_filename):
    """
    Write the output XML Element to the specified output filename.

    The tree that output_xml belongs to is written pretty-printed, with an
    XML declaration, encoded as UTF-8.
    """
    output_xmltree = output_xml.getroottree()
    output_xmltree.write(output_filename, pretty_print=True,
                         xml_declaration=True, encoding="utf-8")
def read_xml_schema_file(xml_schema_filename):
    """
    read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema

    Read the XML Schema file, and return the corresponding XML Schema
    object.
    """
    xml_schema_xmltree = ET.parse(xml_schema_filename)
    return ET.XMLSchema(xml_schema_xmltree)
def match_against_schema(options, output_xml, xml_schema):
    """
    match_against_schema(options, output_xml, xml_schema) -> bool

    Validate output against XML Schema.

    The result is True if the output XML Element (tree) matches the XML
    Schema, otherwise the result is False.

    Unless --quiet was given (options.verbose < 2), the outcome is printed;
    on a mismatch the last entry of the schema's error log is printed too.
    """
    is_valid = xml_schema.validate(output_xml.getroottree())
    if options.verbose >= 2:
        if is_valid:
            print("Output matches XML Schema.")
        else:
            print("Output invalid according to XML Schema.")
            print(xml_schema.error_log.last_error)
    return is_valid
def match_against_reference(options, output_xml):
    """
    match_against_reference(options, output_xml) -> bool

    Compare the output string (read from file options.output) to the
    reference string (read from options.reference). If they are not the
    same (bytewise), and if options.html_diff is True, create an HTML file
    showing the differences.

    The result is True if output and reference are the same (bytewise),
    otherwise the result is False.

    The output_xml argument is not used; the comparison is done on the
    files only.
    """
    reference_filename = options.reference
    output_filename = options.output
    do_html_diff = options.html_diff

    # Read both files as bytes; 'with' closes the handles right away:
    with open(reference_filename, "rb") as reference_file:
        reference_str = reference_file.read()
    with open(output_filename, "rb") as output_file:
        output_str = output_file.read()

    is_valid = (reference_str == output_str)
    if options.verbose >= 2:
        if is_valid:
            print("Output matches reference.")
        elif not do_html_diff:
            print("Output and reference differ.")
    if do_html_diff and not is_valid:
        html_filename = "%s.diff.html" % output_filename
        if options.verbose >= 2:
            print("Output and reference differ - " +
                  "generating '%s'..." % html_filename)
        create_reference_diff_html(html_filename, reference_str,
                                   output_str)
    return is_valid
def create_reference_diff_html(html_filename, reference_str, output_str):
    """
    Create an HTML file (created at html_filename) showing the differences
    between the reference string and the output string side-by-side.

    reference_str and output_str may be text or byte strings. Byte strings
    that are not the native str type (Python 3) are decoded as UTF-8, with
    undecodable bytes replaced, because difflib works on text there.
    """
    import difflib

    def to_lines(string):
        # On Python 2, bytes is str, so nothing is decoded there.
        if isinstance(string, bytes) and not isinstance(string, str):
            string = string.decode("utf-8", "replace")
        return string.splitlines()

    reference_lines = to_lines(reference_str)
    output_lines = to_lines(output_str)

    html_diff = difflib.HtmlDiff(wrapcolumn=75)
    html_str = html_diff.make_file(reference_lines, output_lines,
                                   "Reference", "Output")

    # Always write bytes, so the result does not depend on the locale:
    if not isinstance(html_str, bytes):
        html_str = html_str.encode("utf-8")
    with open(html_filename, "wb") as html_file:
        html_file.write(html_str)
277 ## XML PREPROCESS CLASS
class XMLPreprocess(object):
    """
    Preprocessor that expands XML Merge processing elements in place.

    Use:

        proc = XMLPreprocess()
        proc(input_xml, trace_includes=False, xml_filename="input.xml")
        # input_xml has been modified in place; the call returns None

    Elements in the xmns["xm"] namespace are handled by the method named
    "_xm_" + the lowercased tag name and are removed from the tree
    afterwards. All other elements are recursed into.

    NOTE: attribute values, loop expressions and <xm:PythonCode/> contents
    are run through eval() / exec. Only process trusted input files.
    """

    def __init__(self, initial_namespace=None):
        """
        initial_namespace is the dict used as the outermost Python
        namespace. If omitted, every instance gets a fresh empty dict (a
        shared mutable default would leak variables between instances).
        """
        super(XMLPreprocess, self).__init__()
        if initial_namespace is None:
            initial_namespace = {}
        self._namespace_stack = [initial_namespace]
        # Defaults, so the attributes exist before the first __call__():
        self.namespace = self._namespace_stack[-1]
        self.trace_includes = False
        self.xml_filename = None

    def __call__(self, xml_element, namespace=None,
                 trace_includes=False, xml_filename=None):
        """
        XMLPreprocess()(...)

        Preprocess the input XML Element, xml_element. The element tree of
        xml_element will be modified in-place. Returns None.

        The namespace given should be a dict that can be used as a Python
        namespace. This namespace will be used in XML attribute
        substitution. If given, it is pushed onto the namespace stack.

        trace_includes and xml_filename are stored on the instance and
        handed on to recursive calls and to included files. xml_filename is
        used by <xm:Include/> to resolve @file relative to the directory of
        the current file.

        Processing tags will recursively call this method (__call__) for
        preprocessing the included file and for recursive inclusion.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        self.namespace = self._namespace_stack[-1]
        self.trace_includes = trace_includes
        self.xml_filename = xml_filename

        ns = "{%s}" % xmns["xm"]

        # Evaluate Python expressions in the attributes of xml_element:
        for attr_name, attr_value in xml_element.items():
            xml_element.set(attr_name, self._eval_substitution(attr_value))

        # Anything outside the xmns["xm"] namespace is just recursed into:
        if xml_element.nsmap.get(xml_element.prefix) != xmns["xm"]:
            self._recurse_into(xml_element)
            return None

        # xml_element is a processing element; call the matching method:
        tag = xml_element.tag[len(ns):]  # just the tag without namespace
        method = "_xm_" + tag.lower()    # tolerate any case
        if not hasattr(self, method):
            raise Exception("cannot process <xm:%s/>" % tag)
        getattr(self, method)(xml_element)

        # Preserve tail text by handing it to the previous sibling, or to
        # the parent's text if there is no previous sibling:
        tail = xml_element.tail
        if tail is not None:
            prev = xml_element.getprevious()
            parent = xml_element.getparent()
            if prev is not None:
                prev.tail = (prev.tail or "") + tail
            else:
                parent.text = (parent.text or "") + tail
        xml_element.getparent().remove(xml_element)

        return None

    def _recurse_into(self, xml_element, namespace=None):
        """
        Preprocess all child elements of xml_element.

        If namespace is given, it is pushed onto the namespace stack for
        the duration of the recursion and popped again afterwards.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        for xml_sub_element in xml_element.xpath("*"):
            self(xml_sub_element, None,
                 self.trace_includes, self.xml_filename)
        if namespace is not None:
            self._namespace_stack.pop()
            self.namespace = self._namespace_stack[-1]

    _eval_substitution_regex = re.compile(r"\{(.*?)\}")

    def _eval_substitution(self, string):
        """
        Evaluate Python expressions within strings.

        Internal method to perform substitution of Python expressions
        within attribute values, {x} -> str(eval(x)). Example:

        "3 + 5 = {3 + 5} in Python"  ->  "3 + 5 = 8 in Python"

        Multiple Python expressions in one string are supported as well.
        Expressions are evaluated in the current namespace (self.namespace).
        """
        pieces = []  # joined once at the end
        last_index = 0
        for match in self._eval_substitution_regex.finditer(string):
            pieces.append(string[last_index:match.start()])
            # SECURITY: eval() of file content; trusted input only.
            pieces.append(str(eval(match.group(1), self.namespace)))
            last_index = match.end()
        pieces.append(string[last_index:])
        return "".join(pieces)

    def _xm_addelements(self, xml_element):
        """
        Add subelements to, before, or after the element selected by XPath
        (@to, @before or @after).

        Exactly one of (@to, @before, @after) must be specified. And the
        XPath expression must return exactly one element. An AssertionError
        is raised if these conditions are not met (raised explicitly, so
        the checks are not lost under "python -O").
        """
        to = xml_element.get("to")
        before = xml_element.get("before")
        after = xml_element.get("after")

        given = [v for v in (to, before, after) if v is not None]
        if len(given) != 1:
            raise AssertionError(
                "exactly one of @to, @before, @after must be specified")
        select = to or before or after

        selected_context_nodes = xml_element.xpath(select)
        if len(selected_context_nodes) != 1:
            raise AssertionError(
                "XPath %r must select exactly one element" % (select,))
        context_node = selected_context_nodes[0]

        # With @after, each inserted element becomes the new context node,
        # which keeps the subelements in document order:
        advance_context_node = False
        if to is not None:
            insert_method_name = "append"
        elif before is not None:
            insert_method_name = "addprevious"
        else:
            insert_method_name = "addnext"
            advance_context_node = True

        # Iterate over a snapshot, since the children are moved away:
        for xml_sub_element in list(xml_element):
            getattr(context_node, insert_method_name)(xml_sub_element)
            if advance_context_node:
                context_node = xml_sub_element

    def _xm_block(self, xml_element):
        """
        Create a scope to contain visibility of newly assigned Python
        variables. This works the same way that Python itself scopes
        variables, i.e. by creating a shallow copy of the Python namespace.
        E.g. assignments to list items will be visible to outside scopes!
        """
        self._recurse_into(xml_element, self.namespace.copy())
        # Move the children out right after the block element; reversed
        # iteration with addnext() keeps them in their original order:
        for xml_sub_node in xml_element[::-1]:
            xml_element.addnext(xml_sub_node)

    def _xm_comment(self, xml_element):
        """
        A comment that is removed by XML Merge.
        """
        pass  # the element is removed by __call__(); nothing else to do

    def _xm_defaultvar(self, xml_element):
        """
        Set (zero or more) variables in the active Python namespace, if not
        already set. Each attribute value is evaluated with eval().
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():
            if attr_name not in ns:
                ns[attr_name] = eval(attr_value, ns)

    def _xm_include(self, xml_element):
        """
        Include from the specified file (@file) the elements selected by
        XPath (@select) after preprocessing said file.

        The @file attribute is the only required attribute. It is resolved
        relative to the directory of the current file (self.xml_filename).
        Without @select, no elements are included.

        Items can be imported from the included (and preprocessed) file's
        Python namespace into the current file's namespace using the
        @import attribute, which may either be a comma-separated list of
        identifiers, or '*' to import the complete namespace.

        Remaining attributes will be treated as variable assignments and
        put in the Python namespace used for processing the included file.
        """
        attrib = xml_element.attrib
        file_ = attrib.pop("file", None)
        select = attrib.pop("select", None)
        import_ = attrib.pop("import", None)
        assert file_ is not None
        remaining_attribs = dict(attrib.items())

        # Load the to-be-included file:
        p = os.path
        xml_input_dirname = p.dirname(self.xml_filename)
        xml_incl_filename = p.join(xml_input_dirname, file_)
        xml_incl_filename = p.normpath(xml_incl_filename)
        # Always use '/' for normalized tracing information:
        xml_incl_filename = xml_incl_filename.replace("\\", "/")

        xml_incl = ET.parse(xml_incl_filename).getroot()

        # Build the initial namespace from remaining attributes:
        initial_namespace = {}
        ns = self.namespace
        for attr_name, attr_value in remaining_attribs.items():
            initial_namespace[attr_name] = eval(attr_value, ns)

        # Preprocess the to-be-included file with its own processor:
        proc = XMLPreprocess(initial_namespace=initial_namespace)
        proc(xml_incl, trace_includes=self.trace_includes,
             xml_filename=xml_incl_filename)

        # Select elements to include:
        included_elements = []
        if select is not None:
            included_elements = xml_incl.xpath(select)

        # Include the elements, one after the other, behind xml_element:
        context_node = xml_element
        for inc_elem in included_elements:
            context_node.addnext(inc_elem)
            context_node = inc_elem

        # Import from included namespace:
        imported_namespace = {}
        if import_ is not None:
            import_ = [x.strip() for x in import_.split(",")]
            if "*" in import_:  # import all
                imported_namespace = proc.namespace
            else:
                ns = proc.namespace
                imported_namespace = dict((x, ns[x]) for x in import_)
        self.namespace.update(imported_namespace)

    def _xm_loop(self, xml_element):
        """
        Loop over a range of integer values.

        The first attribute is evaluated as the loop counter. Example:

        i="range(5, 9)" => iterates with i being 5, 6, 7, 8

        WARNING: The loop counter attribute, as well as all substitutions
        in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
        (wholly or partially) be evaluated as Python expressions using
        eval().
        """
        # Get the loop counter name and list:
        loop_counter_name = xml_element.keys()[0]
        loop_counter_list = eval(xml_element.get(loop_counter_name),
                                 self.namespace)

        # Loop:
        context_node = xml_element  # new elements are added after this
        for loop_counter_value in loop_counter_list:
            self.namespace[loop_counter_name] = loop_counter_value
            # The loop body is copied through a serialization round trip;
            # the original code notes that copy.copy(xml_element) crashed.
            tailtext = xml_element.tail
            xml_element.tail = None  # xml_element regarded as document
            xml_element_copy = ET.XML(ET.tostring(xml_element))
            xml_element.tail = xml_element_copy.tail = tailtext
            self._recurse_into(xml_element_copy)
            # Leading text of the loop body goes to the context node's tail:
            if xml_element_copy.text is not None:
                if context_node.tail is None:
                    context_node.tail = u""
                context_node.tail += xml_element_copy.text
            for xml_sub_node in xml_element_copy[:]:
                context_node.addnext(xml_sub_node)
                context_node = xml_sub_node

    def _xm_pythoncode(self, xml_element):
        """
        Execute Python code in the current namespace.

        'self' and 'xml_element' are supplied temporarily. They are added
        to the current namespace before the code is executed, and removed
        again afterwards, even if the code raises an exception. An element
        without text executes nothing.
        """
        code = textwrap.dedent(xml_element.text or "").strip()
        ns = self.namespace
        ns["self"] = self
        ns["xml_element"] = xml_element
        try:
            # SECURITY: exec of file content; trusted input only.
            # The call form of exec works on Python 2 and Python 3.
            exec(code, ns)
        finally:
            ns.pop("self", None)
            ns.pop("xml_element", None)

    def _xm_removeattributes(self, xml_element):
        """
        Remove the attributes (@name) from the (zero or more) elements
        selected by XPath (@from or @select).

        It is not considered an error if an attribute cannot be found on a
        selected element.
        """
        attr_name = xml_element.get("name")
        select_xpath = xml_element.get("from") or xml_element.get("select")
        for xml_element_selected in xml_element.xpath(select_xpath):
            # Can't find another way to remove an attribute than by using
            # 'attrib':
            attrib = xml_element_selected.attrib
            if attr_name in attrib:
                del attrib[attr_name]

    def _xm_removeelements(self, xml_element):
        """
        Intended to remove (zero or more) elements selected by XPath
        (@select).

        Not implemented yet: currently the element is accepted and has no
        effect.
        """
        pass  # TODO

    def _xm_setattribute(self, xml_element):
        """
        Intended to assign the value (@value) to the attribute (@name) of
        the element selected by XPath (@of or @select).

        Example:
            <Object index="0x1234"/>
            <xm:SetAttribute of="../Object" name="otherattr" value="hello"/>

        Should lead to:
            <Object index="0x1234" otherattr="hello"/>

        Not implemented yet: currently the element is accepted and has no
        effect.
        """
        pass  # TODO

    def _xm_text(self, xml_element):
        """
        Perform '{}' substitution on text.

        The substituted text is put in front of the element's tail, which
        __call__() then preserves when it removes the element.
        """
        text = xml_element.text
        if text is None:
            return
        xml_element.tail = (self._eval_substitution(text) +
                            (xml_element.tail or ""))

    def _xm_var(self, xml_element):
        """
        Set (zero or more) variables in the active Python namespace. Each
        attribute value is evaluated with eval().
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():
            ns[attr_name] = eval(attr_value, ns)
617 ## MAIN FUNCTION
def main(argv, **kargs):
    """
    main(argv, **kargs) -> int

    Process input to produce output according to the command line options
    (given in argv). These keyword arguments (**kargs) are recognized:

    initial_namespace
        Gets passed on as the initial Python namespace to XMLPreprocess().

    After the XML Merge Manual, this is the first piece of the code a new
    developer will read. Keep this code as simple as possible if you change
    it in any way.

    These are all possible exit status codes returned or raised (using
    SystemExit) by main or the functions it calls:
      - On success, and if all requested validations (-s, -r) match:
            return 0
      - On wrong options (see parse_command_line()): the problem is
        reported through optparse's error(), which raises SystemExit
        instead of returning (optparse documents exit status 2, which
        is the same value as a schema mismatch).
      - On mismatch (either XML Schema (-s) or reference (-r)):
            return mismatch_bitmap  # see end of main()
      - To aid understanding the bitmap: If N matching functions are
        provided, and all are requested and all fail to match the output
        file:
            return (2 ** N - 1) * 2  # mismatch_bitmap
    """
    # Parse command line to get options:
    options = parse_command_line(argv)

    # Input file => preprocessing => output file:
    xml = read_input_file(options.input)
    proc = XMLPreprocess(**kargs)
    proc(xml, trace_includes=options.trace_includes,
         xml_filename=options.input)
    xml = postprocess_xml(xml)
    write_output_file(xml, options.output)

    # If -s: Compare output to XML Schema file:
    matches_schema = True  # False means: match requested and negative
    if options.xml_schema is not None:
        xml_schema = read_xml_schema_file(options.xml_schema)
        matches_schema = match_against_schema(options, xml, xml_schema)

    # If -r: Compare output to reference:
    matches_reference = True  # False means: match requested and negative
    if options.reference is not None:
        matches_reference = match_against_reference(options, xml)

    # Calculate and return the mismatch bitmap:
    mismatch_bitmap = 0
    mismatch_bitmap |= int(not matches_schema) << 1  # 2 on mismatch
    mismatch_bitmap |= int(not matches_reference) << 2  # 4 on mismatch
    return mismatch_bitmap
# When run as a program, use main()'s return value as the exit status:
if __name__ == "__main__":
    sys.exit(main(sys.argv))