# -*- coding: utf-8 -*-

# Copyright 2008,2009  Felix Rabe  <public@felixrabe.net>

# This file is part of XML Merge.

# XML Merge is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# XML Merge is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with XML Merge.  If not, see <http://www.gnu.org/licenses/>.


# Developed (i.e. tested) using Python 2.6.3 and lxml 2.2.2.
"""
The purpose of XML Merge is to preprocess any kind of XML file with great
flexibility.

XML Merge performs (among other things) recursive XML file inclusion and
XML element and attribute modification.

XML Merge is a Python module. It is normally invoked as a program from the
command line, but can equally well be used from within another Python
program or module.
"""
## IMPORTS AND CONSTANTS
import copy
import difflib
import itertools
import optparse
import os
import re
import sys

import lxml.etree as ET
# Namespace mapping (can be directly used for lxml nsmap arguments):
#   "xm"  - elements that drive the preprocessing (Loop, Include, ...)
#   "xmt" - include-tracing namespace
xmns = {
    "xm": "urn:felixrabe:xmlns:xmlmerge:preprocess",
    "xmt": "urn:felixrabe:xmlns:xmlmerge:inctrace",
}
## COMMAND LINE OPTION PARSING
class OptionParser(optparse.OptionParser):
    """
    Command line option parser for XML Merge.

    A thin optparse.OptionParser subclass whose constructor registers every
    XML Merge option, so callers only need to instantiate it and call
    parse_args().

    Levels of verbosity (stored in options.verbose):
        --quiet    -> options.verbose == 1  # only show error messages
                   -> options.verbose == 2  # no verbosity option given
        --verbose  -> options.verbose == 3  # show debugging messages
    """

    def __init__(self, *a, **kw):
        """
        Create the parser; all arguments are forwarded unchanged to
        optparse.OptionParser.__init__().
        """
        optparse.OptionParser.__init__(self, *a, **kw)
        self.add_option("-i", "--input",
                        help=("(REQUIRED) input XML file"))
        self.add_option("-o", "--output",
                        help=("output XML file (.out.xml if not given)"))
        # NOTE(review): the last fragment of the next four help strings was
        # not visible in the reviewed source and has been reconstructed
        # from context - confirm the wording.
        self.add_option("-s", "--xml-schema",
                        help=("XML Schema (.xsd) to validate output " +
                              "against"))
        self.add_option("-r", "--reference",
                        help=("reference XML file to compare output " +
                              "against"))
        self.add_option("-d", "--html-diff", action="store_true",
                        help=("only with -r; if output and reference " +
                              "differ, produce a HTML file showing the " +
                              "differences"))
        self.add_option("-t", "--trace-includes", action="store_true",
                        help=("add tracing information to included " +
                              "XML fragments"))
        # -v and -q share one destination so that the last one given wins:
        self.add_option("-v", "--verbose", action="store_const",
                        dest="verbose", const=3,
                        help=("show debugging messages"))
        self.add_option("-q", "--quiet", action="store_const",
                        dest="verbose", const=1,
                        help=("only show error messages"))
        self.set_defaults(verbose=2)
def parse_command_line(argv):
    """
    parse_command_line(argv) -> optparse.Values

    Parse argv and return an optparse.Values object containing the options.

    This function performs all the necessary checks and conversions to make
    sure all necessary options are given, and that all options are
    available in a normalized format.

    It also tries to create the containing directory for the output file if
    it does not exist already.

    argv is the full argument vector including the program name at index 0
    (i.e. sys.argv).  On an invalid argument list the help text is printed
    and the process exits with a non-zero status.
    """
    # Parse options using OptionParser:
    option_parser = OptionParser()
    options, args = option_parser.parse_args(argv[1:])

    # Make sure only options, and no other arguments, are passed on the
    # command line, and that the required input option is present.  This is
    # an explicit check rather than an assert so that it still works when
    # Python runs with -O.
    if args or options.input is None:
        print("Error: invalid argument list")
        print("")
        option_parser.print_help()
        # NOTE(review): the exact exit status was not visible in the
        # reviewed source.  1 is used because main() builds its mismatch
        # bitmap from bit 1 upwards, leaving bit 0 free for errors.
        sys.exit(1)

    # If the output option has been omitted, build the output filename from
    # the input filename, resulting in the file extension ".out.xml":
    if options.output is None:
        if options.input.lower().endswith(".xml"):
            options.output = options.input[:-4] + ".out.xml"
        else:
            options.output = options.input + ".out.xml"

    # Convert all filename options to normalized absolutized pathnames:
    for n in ("input", "output", "reference"):
        value = getattr(options, n)
        if value is None:
            continue  # if "-r" was not given
        setattr(options, n, os.path.abspath(value))

    # When --verbose, print all filename options:
    if options.verbose >= 3:
        print("Input: %s" % options.input)
        print("Output: %s" % options.output)
        print("Reference: %s" % options.reference)

    # Make sure there is a directory where the output XML file should go:
    try:
        os.makedirs(os.path.dirname(options.output))
    except OSError:
        pass  # fail later if there still is no output directory now

    return options
## XML PROCESSING AND COMPARISON
def read_input_file(input_filename):
    """
    read_input_file(input_filename) -> ET._Element

    Read the input file, and return the corresponding XML Element object,
    the element tree root.
    """
    input_xml = ET.parse(input_filename).getroot()
    return input_xml
def postprocess_xml(output_xml):
    """
    postprocess_xml(output_xml) -> ET._Element

    Remove unnecessary namespace declarations and whitespace.  Returns a
    modified copy of output_xml.  The argument may be modified by calling
    this function.
    """
    # Remove unused namespace declarations:
    # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
    # Temporarily parenting the element under a root that already declares
    # the XML Merge namespaces, then detaching it again, is the technique
    # described in the post linked above.
    ns_root = ET.Element("NS_ROOT", nsmap=xmns)
    ns_root.append(output_xml)
    ns_root.remove(output_xml)
    # If you don't perform this copy, each output_xml element's
    # getroottree() will report the temporary tree containing the empty
    # NS_ROOT element.  This is not a hack, this is about how lxml works.
    output_xml = ET.ElementTree(copy.copy(output_xml)).getroot()

    # Make pretty-printing work by removing unnecessary whitespace:
    for el in output_xml.iter():
        # Whitespace-only text before the first child of a parent element:
        if len(el) and el.text and not el.text.strip():
            el.text = None
        # Whitespace-only text following an element:
        if el.tail and not el.tail.strip():
            el.tail = None

    return output_xml
def write_output_file(output_xml, output_filename):
    """
    Write the output XML Element to the specified output filename.

    The tree that output_xml belongs to is serialized pretty-printed, with
    an XML declaration, encoded as UTF-8.
    """
    output_xmltree = output_xml.getroottree()
    output_xmltree.write(output_filename, pretty_print=True,
                         xml_declaration=True, encoding="utf-8")
def read_xml_schema_file(xml_schema_filename):
    """
    read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema

    Read the XML Schema file, and return the corresponding XML Schema
    object.
    """
    xml_schema_xmltree = ET.parse(xml_schema_filename)
    xml_schema = ET.XMLSchema(xml_schema_xmltree)
    return xml_schema
def match_against_schema(options, output_xml, xml_schema):
    """
    match_against_schema(options, output_xml, xml_schema) -> bool

    Validate output against XML Schema.

    The result is True if the output XML Element (tree) matches the XML
    Schema, otherwise the result is False.

    Unless options.verbose is below 2 (--quiet), the outcome is reported on
    standard output; on a mismatch the schema's last logged error is
    printed as well.
    """
    is_valid = xml_schema.validate(output_xml.getroottree())
    if options.verbose >= 2:
        if is_valid:
            print("Output matches XML Schema.")
        else:
            print("Output invalid according to XML Schema.")
            print(xml_schema.error_log.last_error)
    return is_valid
def match_against_reference(options, output_xml):
    """
    match_against_reference(options, output_xml) -> bool

    Compare the output string (read from file options.output) to the
    reference string (read from options.reference).  If they are not the
    same (bytewise), and if options.html_diff is True, create an HTML file
    showing the differences.

    The result is True if output and reference are the same (bytewise),
    otherwise the result is False.

    output_xml is accepted for signature parity with
    match_against_schema() but is not used: the comparison is done on the
    files as written to disk.
    """
    reference_filename = options.reference
    output_filename = options.output
    do_html_diff = options.html_diff

    # Read both files as raw bytes; "with" guarantees the handles are
    # closed again (the Python-2-only file() builtin left them open).
    with open(reference_filename, "rb") as reference_file:
        reference_str = reference_file.read()
    with open(output_filename, "rb") as output_file:
        output_str = output_file.read()

    is_valid = (reference_str == output_str)
    if options.verbose >= 2:
        if is_valid:
            print("Output matches reference.")
        elif not do_html_diff:
            # With -d the mismatch is reported below, together with the
            # name of the generated HTML file.
            print("Output and reference differ.")
    if do_html_diff and not is_valid:
        html_filename = "%s.diff.html" % output_filename
        if options.verbose >= 2:
            print("Output and reference differ - " +
                  "generating '%s'..." % html_filename)
        create_reference_diff_html(html_filename, reference_str,
                                   output_str)
    return is_valid
def create_reference_diff_html(html_filename, reference_str, output_str):
    """
    Create an HTML file (created at html_filename) showing the differences
    between the reference string and the output string side-by-side.

    reference_str and output_str are the complete file contents.  On
    Python 3 they may be given as bytes (as read by
    match_against_reference()); they are then decoded as UTF-8 first,
    because difflib only works on text.
    """
    # On Python 2 "bytes" is "str", so nothing is decoded there and the
    # original behaviour is kept.
    if not isinstance(reference_str, str):
        reference_str = reference_str.decode("utf-8", "replace")
    if not isinstance(output_str, str):
        output_str = output_str.decode("utf-8", "replace")

    reference_lines = reference_str.splitlines()
    output_lines = output_str.splitlines()

    # Wrap long lines so both columns stay readable side by side:
    html_diff = difflib.HtmlDiff(wrapcolumn=75)
    html_str = html_diff.make_file(reference_lines, output_lines,
                                   "Reference", "Output")
    # "with" closes (and thereby flushes) the file deterministically:
    with open(html_filename, "w") as html_file:
        html_file.write(html_str)
## XML PREPROCESS CLASS
class XMLPreprocess(object):
    """
    Callable preprocessor for XML Merge documents.

    Use:

    >>> proc = XMLPreprocess()
    >>> output_xml = proc(input_xml, input_filename)  # input_xml may change

    Every element in the XML Merge ("xm") namespace is handled by the
    method of this class that carries the element's local name (Loop,
    Include, AddElements, ...), and is removed from the tree afterwards.
    """

    def __call__(self, xml_element, xml_filename=None,
                 trace_includes=False):
        """
        XMLPreprocess()(input_xml) -> ET._Element
        XMLPreprocess()(input_xml, filename, True) -> ET._Element  # traced

        Preprocess the input XML Element to produce an output XML Element.
        The argument may be modified.

        If trace_includes is True, the output will contain tags that
        surround included sections of the file.
        NOTE(review): trace_includes is not consulted by the code visible
        in this class - confirm where tracing is implemented.

        Inclusion will recursively call this method (__call__) for
        preprocessing the included file and for recursive inclusion.
        """
        # lxml spells qualified tags "{namespace-uri}localname"; the prefix
        # length is needed to cut the local name out of el.tag below.
        ns = "{%s}" % xmns["xm"]
        len_ns = len(ns)

        # Process Loop elements:
        for el in xml_element.xpath(".//xm:Loop", namespaces=xmns):
            self.Loop(el)
            el.getparent().remove(el)

        # Process Include elements:
        for el in xml_element.xpath(".//xm:Include", namespaces=xmns):
            self.Include(el, xml_filename)
            el.getparent().remove(el)

        # Process any other elements from the XMLMerge namespace by
        # dispatching on the local tag name:
        for el in xml_element.xpath(".//xm:*", namespaces=xmns):
            tag = el.tag[len_ns:]
            getattr(self, tag)(el)
            el.getparent().remove(el)

        return xml_element

    # Matches the shortest "{...}" groups; group 1 is the expression text.
    _eval_substitution_regex = re.compile(r"\{(.*?)\}")

    def _eval_substitution(self, attr_value, namespace):
        """
        Evaluate Python expressions within strings.

        Internal method to perform substitution of Python expressions
        within attribute values, {x} -> str(eval(x)).  Example:

        >>> self._eval_substitution("3 + 5 = {3 + 5} in Python", {})
        '3 + 5 = 8 in Python'

        Multiple Python expressions in one string are supported as well.

        namespace is the dict used as both globals and locals for eval().

        WARNING: the expressions are passed to eval() unchanged, so only
        trusted input must be processed.
        """
        def evaluate(match):
            # NOTE(security): eval() of text taken from the XML document.
            return str(eval(match.group(1), namespace, namespace))

        # A callable replacement is inserted literally (no backslash
        # processing), exactly like the manual slice-and-join it replaces.
        return self._eval_substitution_regex.sub(evaluate, attr_value)

    def Var(self, var_element):
        """
        Handle an xm:Var element.

        NOTE(review): the body of this method was not visible in the
        reviewed source; it is kept as a no-op placeholder - confirm.
        """
        pass

    def Loop(self, loop_element):
        """
        Loop over a range of integer values.

        The first attribute is evaluated as the loop counter.  Example:

            i="range(5, 9)"  =>  iterates with i being 5, 6, 7, 8

        For every value, copies of all children of the Loop element are
        inserted after the Loop element, in document order.

        WARNING: The loop counter attribute, as well as all substitutions
        in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
        (wholly or partially) be evaluated as Python expressions using
        eval().
        """
        # Get the loop counter:
        loop_counter_name = loop_element.keys()[0]
        # NOTE(security): eval() of text taken from the XML document.
        loop_counter_list = eval(loop_element.get(loop_counter_name))

        # Loop:
        addnext_to_node = loop_element  # for new elements
        for loop_counter_value in loop_counter_list:
            namespace = {loop_counter_name: loop_counter_value}

            # Create a copy of the direct descendants:
            child_copies = []  # for the current Loop iteration
            for child in loop_element:  # includes non-elements by choice
                child_copy = copy.copy(child)
                child_copies.append(child_copy)
                # Chain the copies one after another so that document
                # order is preserved:
                addnext_to_node.addnext(child_copy)
                addnext_to_node = child_copy

            # Perform {x} -> str(eval(x)) substitution in all attributes of
            # the copies and all their descendants.  "descendant-or-self"
            # is required here: ".//*" evaluated on a copy would skip the
            # copy's own attributes, contradicting the ".//@*" contract
            # documented above.
            for child_copy in child_copies:
                for sub_elem in child_copy.xpath("descendant-or-self::*"):
                    for attr_name, attr_value in sub_elem.items():
                        # Perform attribute substitution:
                        v = self._eval_substitution(attr_value, namespace)
                        sub_elem.set(attr_name, v)

    def Include(self, el, xml_filename):
        """
        Include from the specified file (@file) the elements selected by
        XPath (@select).

        NOTE(review): no implementation is visible in the reviewed source
        beyond this description.
        """

    def AddElements(self, el):
        """
        Add subelements to, before, or after the element selected by XPath
        (@to, @before or @after).

        Exactly one of the three attributes must be given.
        NOTE(review): only the attribute handling is visible in the
        reviewed source; the selected XPath is computed but not yet used.
        """
        to = el.get("to")
        before = el.get("before")
        after = el.get("after")
        # Exactly two of the three must be absent:
        assert sum((to is None, before is None, after is None)) == 2
        select = to or before or after

    def RemoveElements(self, el):
        """
        Remove elements selected by XPath (@select).

        NOTE(review): no implementation is visible in the reviewed source
        beyond this description.
        """

    def SetAttribute(self, el):
        """
        Assign the value (@value) to the attribute (@name) of the element
        selected by XPath (@select).

        NOTE(review): no implementation is visible in the reviewed source
        beyond this description.
        """

    def RemoveAttribute(self, el):
        """
        Remove the attribute (@name) from the element selected by XPath
        (@select).

        NOTE(review): no implementation is visible in the reviewed source
        beyond this description.
        """

    def PythonCode(self, el):
        """
        Handle an xm:PythonCode element.

        NOTE(review): neither the documentation nor the body of this method
        was visible in the reviewed source; it is kept as a no-op
        placeholder - confirm.
        """
def main(argv):
    """
    Process input to produce output according to the command line options.

    After the XML Merge Manual, this is the first piece of the code a new
    developer will read.  Keep this code as simple as possible if you
    change it in any way.

    These are all possible exit status codes returned or raised by main or
    the functions it calls:
        - On success, and if all requested validations (-s, -r) match:
            return 0
        - On error, e.g. wrong options (see parse_command_line()):
            the process exits with a non-zero status that does not use
            the bitmap bits below
        - On mismatch (either XML Schema (-s) or reference (-r)):
            return mismatch_bitmap  # see end of main()
        - To aid understanding the bitmap: If N matching functions are
          provided, and all are requested and all fail to match the output
          file:
            return (2 ** N - 1) * 2  # mismatch_bitmap

    argv is the full argument vector including the program name
    (i.e. sys.argv).
    """
    # Parse command line to get options:
    options = parse_command_line(argv)

    # Input file => preprocessing => output file:
    input_xml = read_input_file(options.input)
    proc = XMLPreprocess()
    output_xml = proc(input_xml, options.input, options.trace_includes)
    output_xml = postprocess_xml(output_xml)
    write_output_file(output_xml, options.output)

    # If -s: Compare output to XML Schema file:
    matches_schema = True  # False means: match requested and negative
    if options.xml_schema is not None:
        xml_schema = read_xml_schema_file(options.xml_schema)
        matches_schema = match_against_schema(options, output_xml,
                                              xml_schema)

    # If -r: Compare output to reference:
    matches_reference = True  # False means: match requested and negative
    if options.reference is not None:
        matches_reference = match_against_reference(options, output_xml)

    # Calculate and return the mismatch bitmap (bit 0 is deliberately left
    # unused by the matching functions):
    mismatch_bitmap = 0
    mismatch_bitmap |= int(not matches_schema) << 1     # 2 on mismatch
    mismatch_bitmap |= int(not matches_reference) << 2  # 4 on mismatch
    return mismatch_bitmap
# Script entry point: main()'s return value becomes the process exit
# status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))