xmlmerge.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 # XML Merge 2.0-pre
   5
   6 # Copyright 2008,2009  Felix Rabe  <public@felixrabe.net>
   7
   8
   9 # This file is part of XML Merge.
  10
  11 # XML Merge is free software: you can redistribute it and/or modify it
  12 # under the terms of the GNU Lesser General Public License as published by
  13 # the Free Software Foundation, either version 3 of the License, or (at
  14 # your option) any later version.
  15
  16 # XML Merge is distributed in the hope that it will be useful, but
  17 # WITHOUT ANY WARRANTY; without even the implied warranty of
  18 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 # GNU Lesser General Public License for more details.
  20
  21 # You should have received a copy of the GNU Lesser General Public License
  22 # along with XML Merge.  If not, see <http://www.gnu.org/licenses/>.
  23
  24
  25 # Developed (i.e. tested) using Python 2.6.4 and lxml 2.2.2.
  26
  27 # TODO: What if an attribute should include the '{' or '}' chars?
  28
  29 """
  30 The purpose of XML Merge is to preprocess any kind of XML file with great
  31 flexibility.
  32
  33 XML Merge performs (among other things) recursive XML file inclusion and
  34 XML element and attribute modification.
  35
  36 XML Merge is a Python module. It is normally invoked as a program from the
  37 command line, but can equally well be used from within another Python
  38 program or module.
  39 """
  40
# Version as a tuple; presumably (major, minor, micro, tag) -- TODO confirm
# the meaning of the -1 and 'git' fields.
__version_info__ = (2, 0, -1, 'git')
# Dotted version string built from the first two tuple fields only.
__version__ = ".".join(str(n) for n in __version_info__[:2])
  43
  44 ## IMPORTS AND CONSTANTS
  45
  46 import copy
  47 import itertools
  48 import optparse
  49 import os
  50 import re
  51 import sys
  52 import textwrap
  53
  54 import lxml.etree as ET
  55
# Namespace mapping (can be directly used for lxml nsmap arguments).
# "xm" is the namespace of the processing elements dispatched by
# XMLPreprocess.  "xmt" is presumably intended for include tracing; in the
# code visible here it is only used as part of this mapping.
xmns = {"xm":   "urn:felixrabe:xmlns:xmlmerge:preprocess",
        "xmt":  "urn:felixrabe:xmlns:xmlmerge:inctrace"}
  59
  60
  61 ## COMMAND LINE OPTION PARSING
  62
  63 class OptionParser(optparse.OptionParser):
  64
  65     def __init__(self, *a, **kw):
  66         optparse.OptionParser.__init__(self, *a, **kw)
  67         self.add_option("-i", "--input",
  68                         help=("(REQUIRED) input XML file"))
  69         self.add_option("-o", "--output",
  70                         help=("output XML file (.out.xml if not given)"))
  71         self.add_option("-s", "--xml-schema",
  72                         help=("XML Schema (.xsd) to validate output " +
  73                               "against"))
  74         self.add_option("-r", "--reference",
  75                         help=("reference XML file to compare output " +
  76                               "against"))
  77         self.add_option("-d", "--html-diff", action="store_true",
  78                         help=("only with -r; if output and reference " +
  79                               "differ, produce a HTML file showing the " +
  80                               "differences"))
  81         self.add_option("-t", "--trace-includes", action="store_true",
  82                         help=("add tracing information to included " +
  83                               "XML fragments"))
  84         self.add_option("-v", "--verbose", action="store_const",
  85                         dest="verbose", const=3,
  86                         help=("show debugging messages"))
  87         self.add_option("-q", "--quiet", action="store_const",
  88                         dest="verbose", const=1,
  89                         help=("only show error messages"))
  90         self.set_defaults(verbose=2)
  91
  92         # Explanation: levels of verbosity
  93         # --quiet   -> self.verbose == 1  # only show error messages
  94         #           -> self.verbose == 2  # no verbosity option given
  95         # --verbose -> self.verbose == 3  # show debugging messages
  96
  97     def error(self, *a, **kw):
  98         self.print_help()
  99         return optparse.OptionParser.error(self, *a, **kw)
 100
 101
 102 def parse_command_line(argv):
 103     """
 104     parse_command_line(argv) -> optparse.Values
 105
 106     Parse argv and return an optparse.Values object containing the options.
 107
 108     This function performs all the necessary checks and conversions to make
 109     sure all necessary options are given, and that all options are
 110     available in a normalized format.
 111
 112     It also tries to create the containing directory for the output file if
 113     it does not exist already.
 114     """
 115     # Parse options using OptionParser:
 116     option_parser = OptionParser()
 117     options, args = option_parser.parse_args(argv[1:])
 118
 119     # Make sure only options, and no other arguments, are passed on the
 120     # command line:
 121     try:
 122         assert args == []
 123         assert options.input is not None
 124     except:
 125         option_parser.error("Error: invalid argument list")
 126
 127     # If the output option has been omitted, build the output filename from
 128     # the input filename, resulting in the file extension ".out.xml":
 129     if options.output is None:
 130         if options.input.lower().endswith(".xml"):
 131             options.output = options.input[:-4] + ".out.xml"
 132         else:
 133             options.output = options.input      + ".out.xml"
 134
 135     # Convert all filename options to normalized absolutized pathnames:
 136     for n in "input output reference".split():
 137         if getattr(options, n) is None: continue  # if "-r" was not given
 138         setattr(options, n, os.path.abspath(getattr(options, n)))
 139
 140     # When --verbose, print all filename options:
 141     if options.verbose >= 3:
 142         print "Input:     %s" % options.input
 143         print "Output:    %s" % options.output
 144         print "Reference: %s" % options.reference
 145
 146     # Make sure there is a directory where the output XML file should go:
 147     try:
 148         os.makedirs(os.path.dirname(options.output))
 149     except:
 150         pass  # fail later if there still is no output directory now
 151
 152     return options
 153
 154
 155 ## XML PROCESSING AND COMPARISON
 156
 157 def read_input_file(input_filename):
 158     """
 159     read_input_file(input_filename) -> ET._Element
 160
 161     Read the input file, and return the corresponding XML Element object,
 162     the element tree root.
 163     """
 164     input_xml = ET.parse(input_filename).getroot()
 165     return input_xml
 166
 167 def postprocess_xml(output_xml):
 168     """
 169     postprocess_xml(output_xml) -> ET._Element
 170
 171     Remove unnecessary namespace declarations and whitespace. Returns a
 172     modified copy of output_xml. The argument may be modified by calling
 173     this function.
 174     """
 175     # Remove unused namespace declarations:
 176     # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
 177     ns_root = ET.Element("NS_ROOT", nsmap=xmns)
 178     ns_root.append(output_xml)
 179     ns_root.remove(output_xml)
 180     # If you don't perform this copy, each output_xml element's
 181     # getroottree() will report the temporary tree containing the empty
 182     # NS_ROOT element. This is not a hack, this is about how lxml works.
 183     output_xml = ET.ElementTree(copy.copy(output_xml)).getroot()
 184
 185     # Make pretty-printing work by removing unnecessary whitespace:
 186     for el in output_xml.iter():
 187         if el.text and not el.text.strip():
 188             el.text = None
 189         if el.tail and not el.tail.strip():
 190             el.tail = None
 191
 192     return output_xml
 193
 194 def write_output_file(output_xml, output_filename):
 195     """
 196     Write the output XML Element to the specified output filename.
 197     """
 198     output_xmltree = output_xml.getroottree()
 199     output_xmltree.write(output_filename, pretty_print=True,
 200                          xml_declaration=True, encoding="utf-8")
 201
 202 def read_xml_schema_file(xml_schema_filename):
 203     """
 204     read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema
 205
 206     Read the XML Schema file, and return the corresponding XML Schema
 207     object.
 208     """
 209     xml_schema_xmltree = ET.parse(xml_schema_filename)
 210     xml_schema = ET.XMLSchema(xml_schema_xmltree)
 211     return xml_schema
 212
 213 def match_against_schema(options, output_xml, xml_schema):
 214     """
 215     match_against_schema(options, output_xml, xml_schema) -> bool
 216
 217     Validate output against XML Schema.
 218
 219     The result is True if the output XML Element (tree) matches the XML
 220     Schema, otherwise the result is False.
 221     """
 222     is_valid = xml_schema.validate(output_xml.getroottree())
 223     if options.verbose >= 2:
 224         if is_valid:
 225             print "Output matches XML Schema."
 226         else:
 227             print "Output invalid according to XML Schema."
 228             print xml_schema.error_log.last_error
 229     return is_valid
 230
 231 def match_against_reference(options, output_xml):
 232     """
 233     match_against_reference(options, output_xml) -> bool
 234
 235     Compare the output string (read from file options.output) to the
 236     reference string (read from options.reference). If they are not the
 237     same (bytewise), and if options.html_diff is True, create an HTML file
 238     showing the differences.
 239
 240     The result is True if output and reference are the same (bytewise),
 241     otherwise the result is False.
 242     """
 243     reference_filename = options.reference
 244     output_filename = options.output
 245     do_html_diff = options.html_diff
 246
 247     reference_str = file(reference_filename, "rb").read()
 248     output_str = file(output_filename, "rb").read()
 249     is_valid = (reference_str == output_str)
 250     if options.verbose >= 2:
 251         if is_valid:
 252             print "Output matches reference."
 253         elif not do_html_diff:
 254             print "Output and reference differ."
 255     if do_html_diff and not is_valid:
 256         html_filename = "%s.diff.html" % output_filename
 257         if options.verbose >= 2:
 258             print ("Output and reference differ - " +
 259                    "generating '%s'..." % html_filename)
 260         create_reference_diff_html(html_filename, reference_str,
 261                                    output_str)
 262     return is_valid
 263
 264 def create_reference_diff_html(html_filename, reference_str, output_str):
 265     """
 266     Create an HTML file (created at html_filename) showing the differrences
 267     between the reference string and the output string side-by-side.
 268     """
 269     reference_lines = reference_str.splitlines()
 270     output_lines    = output_str   .splitlines()
 271
 272     import difflib
 273     html_diff = difflib.HtmlDiff(wrapcolumn=75)
 274     html_str = html_diff.make_file(reference_lines, output_lines,
 275                                    "Reference",     "Output")
 276     file(html_filename, "w").write(html_str)
 277
 278
 279 ## XML PREPROCESS CLASS
 280
 281 class XMLPreprocess(object):
 282     """
 283     Use:
 284
 285     >>> proc = XMLPreprocess()
 286     >>> output_xml = proc(options, input_xml)  # input_xml may change
 287     """
 288
 289     def __init__(self, initial_namespace={}):
 290         super(XMLPreprocess, self).__init__()
 291         self._namespace_stack = [initial_namespace]
 292
 293     def __call__(self, xml_element, namespace=None,
 294                  trace_includes=False, xml_filename=None):
 295         """
 296         XMLPreprocess()(...)
 297
 298         Preprocess the input XML Element, xml_element. The element tree of
 299         xml_element will be modified in-place.
 300
 301         The namespace given should be a dict that can be used as a Python
 302         namespace. This namespace will be used in XML attribute
 303         substitution.
 304
 305         If trace_includes is True, the output will contain tags that
 306         surround included sections of the file. The xml_filename argument
 307         is then required.
 308
 309         Processing tags will recursively call this method (__call__) for
 310         preprocessing the included file and for recursive inclusion.
 311         """
 312         if namespace is not None:
 313             self._namespace_stack.append(namespace)
 314         self.namespace = self._namespace_stack[-1]
 315         self.trace_includes = trace_includes
 316         self.xml_filename = xml_filename
 317
 318         ns = "{%s}" % xmns["xm"]
 319         len_ns = len(ns)
 320
 321         # Evaluate Python expressions in the attributes of xml_element:
 322         for attr_name, attr_value in xml_element.items():  # attr map
 323             v = self._eval_substitution(attr_value)
 324             xml_element.set(attr_name, v)
 325
 326         # If xml_element has xmns["xm"] as its namespace, proceed with the
 327         # appropriate method of this class:
 328         if xml_element.nsmap.get(xml_element.prefix) == xmns["xm"]:
 329             tag = xml_element.tag[len_ns:]  # just the tag without namespc
 330             method = "_xm_" + tag.lower()  # tolerate any case
 331             if not hasattr(self, method):
 332                 raise Exception, "cannot process <xm:%s/>" % tag
 333             getattr(self, method)(xml_element)  # call the method
 334             # Preserve tail text:
 335             tail = xml_element.tail
 336             if tail is not None:
 337                 prev = xml_element.getprevious()
 338                 parent = xml_element.getparent()
 339                 if prev is not None:
 340                     prev.tail = (prev.tail or "") + tail
 341                 else:
 342                     parent.text = (parent.text or "") + tail
 343             xml_element.getparent().remove(xml_element)
 344
 345         # If not, recurse:
 346         else:
 347             self._recurse_into(xml_element)
 348
 349         return None
 350
 351     def _recurse_into(self, xml_element, namespace=None):
 352         if namespace is not None:
 353             self._namespace_stack.append(namespace)
 354         for xml_sub_element in xml_element.xpath("*"):
 355             self(xml_sub_element, None,
 356                  self.trace_includes, self.xml_filename)
 357         if namespace is not None:
 358             self._namespace_stack.pop()
 359             self.namespace = self._namespace_stack[-1]
 360
 361     _eval_substitution_regex = re.compile(r"\{(.*?)\}")
 362
 363     def _eval_substitution(self, string):
 364         """
 365         Evaluate Python expressions within strings.
 366
 367         Internal method to perform substitution of Python expressions
 368         within attribute values, {x} -> str(eval(x)).  Example:
 369
 370         >>> self._eval_substitution("3 + 5 = {3 + 5} in Python")
 371         '3 + 5 = 8 in Python'
 372
 373         Multiple Python expressions in one string are supported as well.
 374         """
 375         new_str = []  # faster than always concatenating strings
 376         last_index = 0
 377         for match in self._eval_substitution_regex.finditer(string):
 378             new_str.append(string[last_index:match.start()])
 379             expression = match.group(1)
 380             result = str(eval(expression, self.namespace))
 381             new_str.append(result)
 382             last_index = match.end()
 383         new_str.append(string[last_index:])
 384         return "".join(new_str)
 385
 386     def _xm_addelements(self, xml_element):
 387         """
 388         Add subelements to, before, or after the element selected by XPath
 389         (@to, @before or @after).
 390
 391         Exactly one of (@to, @before, @after) must be specified.  And the
 392         XPath expression must return exactly one element.  These conditions
 393         are checked by assertions and will raise an exception if not met.
 394         """
 395         to = xml_element.get("to")
 396         before = xml_element.get("before")
 397         after = xml_element.get("after")
 398
 399         assert sum((to is None, before is None, after is None)) == 2
 400         select = to or before or after
 401
 402         selected_context_elements = xml_element.xpath(select)
 403         assert len(selected_context_elements) == 1
 404
 405         context_element = selected_context_elements[0]
 406         replace_context_element = False
 407
 408         if to is not None:
 409             f = "append"
 410         if before is not None:
 411             f = "addprevious"
 412         if after is not None:
 413             f = "addnext"
 414             replace_context_element = True
 415
 416         for xml_sub_element in xml_element:
 417             getattr(context_element, f)(xml_sub_element)
 418             if replace_context_element:
 419                 context_element = xml_sub_element
 420
 421     def _xm_block(self, xml_element):
 422         """
 423         Create a scope to contain visibility of newly assigned Python
 424         variables.  This works the same way that Python itself scopes
 425         variables, i.e. by creating a shallow copy of the Python namespace.
 426         E.g. assignments to list items will be visible to outside scopes!
 427         """
 428         self._recurse_into(xml_element, self.namespace.copy())
 429         for xml_sub_node in xml_element[::-1]:  # get children reversed
 430             xml_element.addnext(xml_sub_node)
 431
 432     def _xm_comment(self, xml_element):
 433         """
 434         A comment that is removed by XML Merge.
 435         """
 436         pass  # that's it
 437
 438     def _xm_include(self, xml_element):
 439         """
 440         Include from the specified file (@file) the elements selected by
 441         XPath (@select).
 442         """
 443         pass  # TODO
 444
 445     def _xm_loop(self, xml_element):
 446         """
 447         Loop over a range of integer values.
 448
 449         The first attribute is evaluated as the loop counter.  Example:
 450
 451             i="range(5, 9)"  =>  iterates with i being 5, 6, 7, 8
 452
 453         WARNING: The loop counter attribute, as well as all substitutions
 454         in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
 455         (wholly or partially) be evaluated as Python expressions using
 456         eval().
 457         """
 458         # Get the loop counter name and list:
 459         loop_counter_name = xml_element.keys()[0]
 460         loop_counter_list = eval(xml_element.get(loop_counter_name),
 461                                  self.namespace)
 462
 463         # Loop:
 464         addnext_to_node = xml_element  # for new elements
 465         for loop_counter_value in loop_counter_list:
 466             self.namespace[loop_counter_name] = loop_counter_value
 467             # xml_element_copy = copy.copy(xml_element)  # CRASH
 468             tailtext = xml_element.tail
 469             xml_element.tail = None  # xml_element regarded as document
 470             xml_element_copy = ET.XML(ET.tostring(xml_element))
 471             xml_element.tail = xml_element_copy.tail = tailtext
 472             self._recurse_into(xml_element_copy)
 473             if xml_element_copy.text is not None:
 474                 if addnext_to_node.tail is None:
 475                     addnext_to_node.tail = u""
 476                 addnext_to_node.tail += xml_element_copy.text
 477             for xml_sub_node in xml_element_copy[:]:
 478                 addnext_to_node.addnext(xml_sub_node)
 479                 addnext_to_node = xml_sub_node
 480
 481     def _xm_pythoncode(self, xml_element):
 482         """
 483         Execute Python code in the current namespace.
 484
 485         'self' and 'xml_element' are supplied temporarily. They are added
 486         to the current namespace before the 'exec' statement, and removed
 487         again afterwards.
 488         """
 489         code = textwrap.dedent(xml_element.text).strip()
 490         self.namespace["self"] = self
 491         self.namespace["xml_element"] = xml_element
 492         exec code in self.namespace
 493         del self.namespace["self"], self.namespace["xml_element"]
 494
 495     def _xm_removeattributes(self, xml_element):
 496         """
 497         Remove the attributes (@name) from the (zero or more) elements
 498         selected by XPath (@from or @select).
 499
 500         It is not considered an error if an attribute cannot be found on a
 501         selected element.
 502         """
 503         attr_name = xml_element.get("name")
 504         select_xpath = xml_element.get("from") or xml_element.get("select")
 505         for xml_element_selected in xml_element.xpath(select_xpath):
 506             # Can't find another way to remove an attribute than by using
 507             # 'attrib':
 508             attrib = xml_element_selected.attrib
 509             if attr_name in attrib:
 510                 del xml_element_selected.attrib[attr_name]
 511
 512     def _xm_removeelements(self, xml_element):
 513         """
 514         Remove (zero or more) elements selected by XPath (@select).
 515         """
 516         pass  # TODO
 517
 518     def _xm_setattribute(self, xml_element):
 519         """
 520         Assign the value (@value) to the attribute (@name) of the element
 521         selected by XPath (@of or @select).
 522
 523         Example:
 524             <Object index="0x1234"/>
 525             <xm:SetAttribute of="../Object" name="otherattr" value="hallo"/>
 526
 527         Leads to:
 528             <Object index="0x1234" otherattr="hello"/>
 529         """
 530         pass  # TODO
 531
 532     def _xm_text(self, xml_element):
 533         """
 534         Perform '{}' substitution on text.
 535         """
 536         text = xml_element.text
 537         if text is None: return
 538         tail = self._eval_substitution(text) + (xml_element.tail or "")
 539         xml_element.tail = tail
 540
 541     def _xm_var(self, xml_element):
 542         """
 543         Set (zero or more) variables in the active Python namespace.
 544         """
 545         ns = self.namespace
 546         for attr_name, attr_value in xml_element.items():  # attr map
 547             ns[attr_name] = eval(attr_value, ns, ns)
 548
 549
 550 ## MAIN FUNCTION
 551
 552 def main(argv, **kargs):
 553     """
 554     main(argv, **kargs) -> int
 555
 556     Process input to produce output according to the command line options
 557     (given in argv).  These keyword arguments (**kargs) are recognized:
 558
 559     initial_namespace
 560       Gets passed on as the initial Python namespace to XMLPreprocess().
 561
 562     After the XML Merge Manual, this is the first piece of the code a new
 563     developer will read. Keep this code as simple as possible if you change
 564     it in any way.
 565
 566     These are all possible exit status codes returned or raised (using
 567     SystemExit) by main or the functions it calls:
 568         - On success, and if all requested validations (-s, -r) match:
 569             return 0
 570         - On error, e.g. wrong options (see parse_command_line()):
 571             return 1
 572         - On mismatch (either XML Schema (-s) or reference (-r)):
 573             return mismatch_bitmap  # see end of main()
 574         - To aid understanding the bitmap: If N matching functions are
 575           provided, and all are requested and all fail to match the output
 576           file:
 577             return (2 ** N - 1) * 2  # mismatch_bitmap
 578     """
 579     # Parse command line to get options:
 580     options = parse_command_line(argv)
 581
 582     # Input file => preprocessing => output file:
 583     xml = read_input_file(options.input)
 584     proc = XMLPreprocess(**kargs)
 585     proc(xml, trace_includes=options.trace_includes,
 586          xml_filename=options.input)
 587     xml = postprocess_xml(xml)
 588     write_output_file(xml, options.output)
 589
 590     # If -s: Compare output to XML Schema file:
 591     matches_schema = True  # False means: match requested and negative
 592     if options.xml_schema is not None:
 593         xml_schema = read_xml_schema_file(options.xml_schema)
 594         matches_schema = match_against_schema(options, xml, xml_schema)
 595
 596     # If -r: Compare output to reference:
 597     matches_reference = True  # False means: match requested and negative
 598     if options.reference is not None:
 599         matches_reference = match_against_reference(options, xml)
 600
 601     # Calculate and return the mismatch bitmap:
 602     mismatch_bitmap = 0
 603     mismatch_bitmap |= int(not matches_schema)    << 1  # 2 on mismatch
 604     mismatch_bitmap |= int(not matches_reference) << 2  # 4 on mismatch
 605     return mismatch_bitmap
 606
 607
# When run as a script, use the return value of main() as the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))