Implement <xm:AddElements/>
[xmlmerge.git] / xmlmerge.py
blob1aec5d19d51096fde004bd60d442ffa37a485c30
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # XML Merge 2.0-pre
6 # Copyright 2008,2009 Felix Rabe <public@felixrabe.net>
9 # This file is part of XML Merge.
11 # XML Merge is free software: you can redistribute it and/or modify it
12 # under the terms of the GNU Lesser General Public License as published by
13 # the Free Software Foundation, either version 3 of the License, or (at
14 # your option) any later version.
16 # XML Merge is distributed in the hope that it will be useful, but
17 # WITHOUT ANY WARRANTY; without even the implied warranty of
18 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 # GNU Lesser General Public License for more details.
21 # You should have received a copy of the GNU Lesser General Public License
22 # along with XML Merge. If not, see <http://www.gnu.org/licenses/>.
25 # Developed (i.e. tested) using Python 2.6.4 and lxml 2.2.2.
27 # TODO: What if an attribute should include the '{' or '}' chars?
29 """
30 The purpose of XML Merge is to preprocess any kind of XML file with great
31 flexibility.
33 XML Merge performs (among other things) recursive XML file inclusion and
34 XML element and attribute modification.
36 XML Merge is a Python module. It is normally invoked as a program from the
37 command line, but can equally well be used from within another Python
38 program or module.
39 """
41 __version_info__ = (2, 0, -1, 'git')
42 __version__ = ".".join(str(n) for n in __version_info__[:2])
44 ## IMPORTS AND CONSTANTS
46 import copy
47 import itertools
48 import optparse
49 import os
50 import re
51 import sys
52 import textwrap
54 import lxml.etree as ET
# Prefix -> namespace URI table used by XML Merge (the dict can be passed
# directly as an lxml nsmap argument):
xmns = dict(
    xm="urn:felixrabe:xmlns:xmlmerge:preprocess",
    xmt="urn:felixrabe:xmlns:xmlmerge:inctrace",
)
61 ## COMMAND LINE OPTION PARSING
class OptionParser(optparse.OptionParser):
    """
    optparse.OptionParser that registers the XML Merge command line options
    on construction and prints the help text before reporting an error.
    """

    def __init__(self, *a, **kw):
        optparse.OptionParser.__init__(self, *a, **kw)

        # (flags, add_option keyword arguments), registered in this order.
        option_table = [
            (("-i", "--input"),
             dict(help="(REQUIRED) input XML file")),
            (("-o", "--output"),
             dict(help="output XML file (.out.xml if not given)")),
            (("-s", "--xml-schema"),
             dict(help="XML Schema (.xsd) to validate output against")),
            (("-r", "--reference"),
             dict(help="reference XML file to compare output against")),
            (("-d", "--html-diff"),
             dict(action="store_true",
                  help="only with -r; if output and reference differ, "
                       "produce a HTML file showing the differences")),
            (("-t", "--trace-includes"),
             dict(action="store_true",
                  help="add tracing information to included XML fragments")),
            (("-v", "--verbose"),
             dict(action="store_const", dest="verbose", const=3,
                  help="show debugging messages")),
            (("-q", "--quiet"),
             dict(action="store_const", dest="verbose", const=1,
                  help="only show error messages")),
        ]
        for flags, settings in option_table:
            self.add_option(*flags, **settings)

        # Verbosity levels stored in the "verbose" option:
        #   --quiet    -> 1   only show error messages
        #   (neither)  -> 2   default set here
        #   --verbose  -> 3   show debugging messages
        self.set_defaults(verbose=2)

    def error(self, *a, **kw):
        """
        Print the full help text, then defer to optparse's own error
        handling with the unchanged arguments.
        """
        self.print_help()
        # Explicit base-class call: optparse classes are old-style on
        # Python 2, so super() is not an option here.
        return optparse.OptionParser.error(self, *a, **kw)
def parse_command_line(argv):
    """
    parse_command_line(argv) -> optparse.Values

    Parse argv and return an optparse.Values object containing the options.

    This function performs all the necessary checks and conversions to make
    sure all necessary options are given, and that all options are
    available in a normalized format.

    It also tries to create the containing directory for the output file if
    it does not exist already.

    argv is expected to include the program name as its first item; only
    argv[1:] is parsed. Invalid command lines are reported through the
    option parser's error() method, which does not return normally.
    """
    # Parse options using OptionParser:
    option_parser = OptionParser()
    options, args = option_parser.parse_args(argv[1:])

    # Make sure only options, and no other arguments, are passed on the
    # command line, and that the required input file was given. This is an
    # explicit test (not an 'assert') so it still runs under "python -O".
    if args or options.input is None:
        option_parser.error("Error: invalid argument list")

    # If the output option has been omitted, build the output filename from
    # the input filename, resulting in the file extension ".out.xml":
    if options.output is None:
        if options.input.lower().endswith(".xml"):
            options.output = options.input[:-4] + ".out.xml"
        else:
            options.output = options.input + ".out.xml"

    # Convert all filename options to normalized absolutized pathnames:
    for n in ("input", "output", "reference"):
        value = getattr(options, n)
        if value is None:
            continue  # e.g. if "-r" was not given
        setattr(options, n, os.path.abspath(value))

    # When --verbose, print all filename options:
    if options.verbose >= 3:
        print("Input: %s" % options.input)
        print("Output: %s" % options.output)
        print("Reference: %s" % options.reference)

    # Make sure there is a directory where the output XML file should go.
    # Best effort only: an already existing directory (or any other OS
    # level failure) is ignored here, and writing the output fails later if
    # there still is no output directory.
    try:
        os.makedirs(os.path.dirname(options.output))
    except OSError:
        pass

    return options
155 ## XML PROCESSING AND COMPARISON
def read_input_file(input_filename):
    """
    read_input_file(input_filename) -> ET._Element

    Parse the XML file at input_filename and hand back the root element of
    the resulting element tree.
    """
    parsed_tree = ET.parse(input_filename)
    return parsed_tree.getroot()
def postprocess_xml(output_xml):
    """
    postprocess_xml(output_xml) -> ET._Element

    Strip namespace declarations that are no longer needed and drop
    whitespace-only text so that pretty-printing works. The result is a
    modified copy of output_xml; the object passed in may itself be
    modified by this call.
    """
    # Remove unused namespace declarations by attaching the element to a
    # temporary parent that declares the XML Merge namespaces, and
    # detaching it again:
    # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
    scratch_parent = ET.Element("NS_ROOT", nsmap=xmns)
    scratch_parent.append(output_xml)
    scratch_parent.remove(output_xml)

    # Without this copy, getroottree() of every element would report the
    # temporary tree holding the (now empty) NS_ROOT element. That is how
    # lxml works, not a hack.
    detached_copy = copy.copy(output_xml)
    cleaned_root = ET.ElementTree(detached_copy).getroot()

    # Whitespace-only text and tail strings get in the way of
    # pretty-printing, so reset them to None:
    for node in cleaned_root.iter():
        for slot in ("text", "tail"):
            content = getattr(node, slot)
            if content and not content.strip():
                setattr(node, slot, None)

    return cleaned_root
def write_output_file(output_xml, output_filename):
    """
    Serialize the tree that output_xml belongs to into output_filename,
    pretty-printed, with an XML declaration, encoded as UTF-8.
    """
    serialization = dict(pretty_print=True, xml_declaration=True,
                         encoding="utf-8")
    output_xml.getroottree().write(output_filename, **serialization)
def read_xml_schema_file(xml_schema_filename):
    """
    read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema

    Parse the XML Schema file and wrap the parsed tree in an ET.XMLSchema
    object, which is returned.
    """
    return ET.XMLSchema(ET.parse(xml_schema_filename))
def match_against_schema(options, output_xml, xml_schema):
    """
    match_against_schema(options, output_xml, xml_schema) -> bool

    Validate output against XML Schema.

    The tree that output_xml belongs to is handed to xml_schema.validate().
    The result is True if the output XML Element (tree) matches the XML
    Schema, otherwise the result is False.

    With options.verbose >= 2 the outcome is printed; on a mismatch the
    last entry of the schema's error log is printed as well.
    """
    is_valid = xml_schema.validate(output_xml.getroottree())
    if options.verbose >= 2:
        # Single-argument print(...) calls behave the same on Python 2 and
        # Python 3.
        if is_valid:
            print("Output matches XML Schema.")
        else:
            print("Output invalid according to XML Schema.")
            print(xml_schema.error_log.last_error)
    return is_valid
def match_against_reference(options, output_xml):
    """
    match_against_reference(options, output_xml) -> bool

    Compare the output string (read from file options.output) to the
    reference string (read from options.reference). If they are not the
    same (bytewise), and if options.html_diff is True, create an HTML file
    showing the differences, named after the output file with
    ".diff.html" appended.

    The result is True if output and reference are the same (bytewise),
    otherwise the result is False.

    The output_xml argument is not used; the comparison works on the files.
    """
    reference_filename = options.reference
    output_filename = options.output
    do_html_diff = options.html_diff

    # Read both files in binary mode; the 'with' blocks make sure the file
    # handles are closed again right away.
    with open(reference_filename, "rb") as reference_file:
        reference_str = reference_file.read()
    with open(output_filename, "rb") as output_file:
        output_str = output_file.read()

    is_valid = (reference_str == output_str)
    if options.verbose >= 2:
        if is_valid:
            print("Output matches reference.")
        elif not do_html_diff:
            print("Output and reference differ.")
    if do_html_diff and not is_valid:
        html_filename = "%s.diff.html" % output_filename
        if options.verbose >= 2:
            print("Output and reference differ - " +
                  "generating '%s'..." % html_filename)
        create_reference_diff_html(html_filename, reference_str,
                                   output_str)
    return is_valid
def create_reference_diff_html(html_filename, reference_str, output_str):
    """
    Create an HTML file (created at html_filename) showing the differences
    between the reference string and the output string side-by-side.

    Both strings may be text or (on Python 3) bytes; bytes are decoded as
    UTF-8 with undecodable bytes replaced, because difflib needs text.
    """
    import difflib

    def as_text(data):
        # On Python 2 'bytes' is 'str', so nothing is decoded there; on
        # Python 3 the contents of files read in binary mode end up here.
        if isinstance(data, bytes) and not isinstance(data, str):
            return data.decode("utf-8", "replace")
        return data

    reference_lines = as_text(reference_str).splitlines()
    output_lines = as_text(output_str).splitlines()

    html_diff = difflib.HtmlDiff(wrapcolumn=75)
    html_str = html_diff.make_file(reference_lines, output_lines,
                                   "Reference", "Output")
    # 'with' closes (and thereby flushes) the file deterministically.
    with open(html_filename, "w") as html_file:
        html_file.write(html_str)
279 ## XML PREPROCESS CLASS
class XMLPreprocess(object):
    """
    Callable preprocessor that expands the <xm:.../> processing elements
    of an lxml element tree in-place.

    Use:

    >>> proc = XMLPreprocess()
    >>> proc(input_xml)  # input_xml is modified in-place, returns None
    """

    def __init__(self, initial_namespace=None):
        """
        initial_namespace, if given, is the dict used as the bottom of the
        Python namespace stack. Without it every instance gets its own
        fresh dict (a shared '{}' default would leak variables from one
        instance into the next).
        """
        super(XMLPreprocess, self).__init__()
        if initial_namespace is None:
            initial_namespace = {}
        self._namespace_stack = [initial_namespace]

    def __call__(self, xml_element, namespace=None,
                 trace_includes=False, xml_filename=None):
        """
        XMLPreprocess()(...)

        Preprocess the input XML Element, xml_element. The element tree of
        xml_element will be modified in-place.

        The namespace given should be a dict that can be used as a Python
        namespace. This namespace will be used in XML attribute
        substitution. If given, it is pushed onto the namespace stack.

        If trace_includes is True, the output will contain tags that
        surround included sections of the file. The xml_filename argument
        is then required.

        Processing tags will recursively call this method (__call__) for
        preprocessing the included file and for recursive inclusion.

        Raises Exception if xml_element is in the XML Merge namespace but
        no matching _xm_* method exists. Returns None.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        self.namespace = self._namespace_stack[-1]
        self.trace_includes = trace_includes
        self.xml_filename = xml_filename

        ns = "{%s}" % xmns["xm"]
        len_ns = len(ns)

        # Evaluate Python expressions in the attributes of xml_element:
        for attr_name, attr_value in xml_element.items():  # attr map
            v = self._eval_substitution(attr_value)
            xml_element.set(attr_name, v)

        # If xml_element has xmns["xm"] as its namespace, proceed with the
        # appropriate method of this class:
        if xml_element.nsmap.get(xml_element.prefix) == xmns["xm"]:
            tag = xml_element.tag[len_ns:]  # just the tag without namespc
            method = "_xm_" + tag.lower()  # tolerate any case
            if not hasattr(self, method):
                raise Exception("cannot process <xm:%s/>" % tag)
            getattr(self, method)(xml_element)  # call the method
            # Preserve tail text: the processing element itself is removed
            # below, so its tail is handed to the previous sibling or, if
            # there is none, appended to the parent's text.
            tail = xml_element.tail
            if tail is not None:
                prev = xml_element.getprevious()
                parent = xml_element.getparent()
                if prev is not None:
                    prev.tail = (prev.tail or "") + tail
                else:
                    parent.text = (parent.text or "") + tail
            xml_element.getparent().remove(xml_element)

        # If not, recurse:
        else:
            self._recurse_into(xml_element)

        return None

    def _recurse_into(self, xml_element, namespace=None):
        """
        Preprocess every child element of xml_element.

        If namespace is given, it is pushed onto the namespace stack for
        the duration of the recursion and popped again afterwards, also
        when processing a child raises an exception.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        try:
            for xml_sub_element in xml_element.xpath("*"):
                self(xml_sub_element, None,
                     self.trace_includes, self.xml_filename)
        finally:
            if namespace is not None:
                self._namespace_stack.pop()
                self.namespace = self._namespace_stack[-1]

    _eval_substitution_regex = re.compile(r"\{(.*?)\}")

    def _eval_substitution(self, string):
        """
        Evaluate Python expressions within strings.

        Internal method to perform substitution of Python expressions
        within attribute values, {x} -> str(eval(x)). Example:

        >>> self._eval_substitution("3 + 5 = {3 + 5} in Python")
        '3 + 5 = 8 in Python'

        Multiple Python expressions in one string are supported as well.

        NOTE(security): the expressions are passed to eval() with the
        active namespace; only process XML from trusted sources.
        """
        new_str = []  # faster than always concatenating strings
        last_index = 0
        for match in self._eval_substitution_regex.finditer(string):
            new_str.append(string[last_index:match.start()])
            expression = match.group(1)
            result = str(eval(expression, self.namespace))
            new_str.append(result)
            last_index = match.end()
        new_str.append(string[last_index:])
        return "".join(new_str)

    def _xm_addelements(self, xml_element):
        """
        Add subelements to, before, or after the element selected by XPath
        (@to, @before or @after).

        Exactly one of (@to, @before, @after) must be specified. And the
        XPath expression must return exactly one element. An
        AssertionError is raised if these conditions are not met (raised
        explicitly, so the checks are also active under "python -O").
        """
        to = xml_element.get("to")
        before = xml_element.get("before")
        after = xml_element.get("after")

        if sum((to is None, before is None, after is None)) != 2:
            raise AssertionError(
                "exactly one of @to, @before, @after must be specified")
        select = to or before or after

        selected_context_elements = xml_element.xpath(select)
        if len(selected_context_elements) != 1:
            raise AssertionError(
                "XPath %r must select exactly one element" % (select,))

        context_element = selected_context_elements[0]
        replace_context_element = False

        if to is not None:
            f = "append"
        if before is not None:
            f = "addprevious"
        if after is not None:
            f = "addnext"
            # Each added element becomes the new anchor, so that the
            # subelements keep their original order after the target.
            replace_context_element = True

        # Iterate over a snapshot of the children: they are moved out of
        # xml_element while looping.
        for xml_sub_element in xml_element[:]:
            getattr(context_element, f)(xml_sub_element)
            if replace_context_element:
                context_element = xml_sub_element

    def _xm_block(self, xml_element):
        """
        Create a scope to contain visibility of newly assigned Python
        variables. This works the same way that Python itself scopes
        variables, i.e. by creating a shallow copy of the Python namespace.
        E.g. assignments to list items will be visible to outside scopes!
        """
        self._recurse_into(xml_element, self.namespace.copy())
        # Move the processed children right behind the block element;
        # adding them in reverse keeps their original order.
        for xml_sub_node in xml_element[::-1]:  # get children reversed
            xml_element.addnext(xml_sub_node)

    def _xm_comment(self, xml_element):
        """
        A comment that is removed by XML Merge.
        """
        pass  # that's it

    def _xm_include(self, xml_element):
        """
        Include from the specified file (@file) the elements selected by
        XPath (@select).

        Not implemented yet; currently does nothing.
        """
        pass  # TODO

    def _xm_loop(self, xml_element):
        """
        Loop over a range of integer values.

        The first attribute is evaluated as the loop counter. Example:

            i="range(5, 9)"  =>  iterates with i being 5, 6, 7, 8

        WARNING: The loop counter attribute, as well as all substitutions
        in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
        (wholly or partially) be evaluated as Python expressions using
        eval().
        """
        # Get the loop counter name and list:
        loop_counter_name = xml_element.keys()[0]
        loop_counter_list = eval(xml_element.get(loop_counter_name),
                                 self.namespace)

        # Loop:
        addnext_to_node = xml_element  # for new elements
        for loop_counter_value in loop_counter_list:
            self.namespace[loop_counter_name] = loop_counter_value
            # The copy is made by a serialize/parse round trip; the
            # original code notes that copy.copy(xml_element) crashed here.
            tailtext = xml_element.tail
            xml_element.tail = None  # xml_element regarded as document
            xml_element_copy = ET.XML(ET.tostring(xml_element))
            xml_element.tail = xml_element_copy.tail = tailtext
            self._recurse_into(xml_element_copy)
            if xml_element_copy.text is not None:
                if addnext_to_node.tail is None:
                    addnext_to_node.tail = u""
                addnext_to_node.tail += xml_element_copy.text
            for xml_sub_node in xml_element_copy[:]:
                addnext_to_node.addnext(xml_sub_node)
                addnext_to_node = xml_sub_node

    def _xm_pythoncode(self, xml_element):
        """
        Execute Python code in the current namespace.

        'self' and 'xml_element' are supplied temporarily. They are added
        to the current namespace before the code is executed, and removed
        again afterwards, also if the code raises an exception. An element
        without text executes nothing.

        NOTE(security): the element text is executed verbatim; only
        process XML from trusted sources.
        """
        code = textwrap.dedent(xml_element.text or "").strip()
        ns = self.namespace  # add to and clean up the very same dict
        ns["self"] = self
        ns["xml_element"] = xml_element
        try:
            # Function-call form of exec: valid on Python 2 and Python 3.
            exec(code, ns)
        finally:
            ns.pop("self", None)
            ns.pop("xml_element", None)

    def _xm_removeattributes(self, xml_element):
        """
        Remove the attributes (@name) from the (zero or more) elements
        selected by XPath (@from or @select).

        It is not considered an error if an attribute cannot be found on a
        selected element.
        """
        attr_name = xml_element.get("name")
        select_xpath = xml_element.get("from") or xml_element.get("select")
        for xml_element_selected in xml_element.xpath(select_xpath):
            # Can't find another way to remove an attribute than by using
            # 'attrib':
            attrib = xml_element_selected.attrib
            if attr_name in attrib:
                del attrib[attr_name]

    def _xm_removeelements(self, xml_element):
        """
        Remove (zero or more) elements selected by XPath (@select).

        Not implemented yet; currently does nothing.
        """
        pass  # TODO

    def _xm_setattribute(self, xml_element):
        """
        Assign the value (@value) to the attribute (@name) of the element
        selected by XPath (@of or @select).

        Example:
            <Object index="0x1234"/>
            <xm:SetAttribute of="../Object" name="otherattr" value="hello"/>

        Leads to:
            <Object index="0x1234" otherattr="hello"/>

        Not implemented yet; currently does nothing.
        """
        pass  # TODO

    def _xm_text(self, xml_element):
        """
        Perform '{}' substitution on text.

        The substituted text is prepended to the element's tail, so it
        stays in the document once the element itself is removed.
        """
        text = xml_element.text
        if text is None:
            return
        tail = self._eval_substitution(text) + (xml_element.tail or "")
        xml_element.tail = tail

    def _xm_var(self, xml_element):
        """
        Set (zero or more) variables in the active Python namespace.

        Every attribute name becomes a variable whose value is the
        attribute value evaluated as a Python expression with eval().
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():  # attr map
            ns[attr_name] = eval(attr_value, ns, ns)
550 ## MAIN FUNCTION
def main(argv, **kargs):
    """
    main(argv, **kargs) -> int

    Run the complete XML Merge pipeline for the command line given in argv:
    parse the options, read the input file, preprocess it, postprocess it,
    write the output file and run the requested validations (-s, -r).

    These keyword arguments (**kargs) are recognized:

    initial_namespace
        Gets passed on as the initial Python namespace to XMLPreprocess().

    After the XML Merge Manual, this is the first piece of the code a new
    developer will read. Keep this code as simple as possible if you change
    it in any way.

    Return value (intended as the process exit status):
      - 0 if all requested validations match (or none was requested).
      - Otherwise a mismatch bitmap: 2 is set when the output does not
        match the XML Schema (-s), 4 is set when it differs from the
        reference (-r). With N validations that all fail, the value is
        (2 ** N - 1) * 2.

    Invalid command line options do not return from here:
    parse_command_line() reports them through the option parser's error()
    method, which ends the program via SystemExit (optparse documents exit
    status 2 for error()).
    """
    # Parse command line to get options:
    options = parse_command_line(argv)

    # Input file => preprocessing => output file:
    xml = read_input_file(options.input)
    preprocess = XMLPreprocess(**kargs)
    preprocess(xml, trace_includes=options.trace_includes,
               xml_filename=options.input)
    xml = postprocess_xml(xml)
    write_output_file(xml, options.output)

    # If -s: Compare output to XML Schema file. A validation that was not
    # requested never counts as failed.
    schema_failed = False
    if options.xml_schema is not None:
        xml_schema = read_xml_schema_file(options.xml_schema)
        schema_failed = not match_against_schema(options, xml, xml_schema)

    # If -r: Compare output to reference:
    reference_failed = False
    if options.reference is not None:
        reference_failed = not match_against_reference(options, xml)

    # Build the mismatch bitmap: 2 for the schema, 4 for the reference.
    return (int(schema_failed) << 1) | (int(reference_failed) << 2)
# Script entry point: run main() on the process arguments and use its
# return value as the exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)