2 # -*- coding: utf-8 -*-
6 # Copyright 2008,2009 Felix Rabe <public@felixrabe.net>
9 # This file is part of XML Merge.
11 # XML Merge is free software: you can redistribute it and/or modify it
12 # under the terms of the GNU Lesser General Public License as published by
13 # the Free Software Foundation, either version 3 of the License, or (at
14 # your option) any later version.
16 # XML Merge is distributed in the hope that it will be useful, but
17 # WITHOUT ANY WARRANTY; without even the implied warranty of
18 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 # GNU Lesser General Public License for more details.
21 # You should have received a copy of the GNU Lesser General Public License
22 # along with XML Merge. If not, see <http://www.gnu.org/licenses/>.
25 # Developed (i.e. tested) using Python 2.6.4 and lxml 2.2.2.
27 # TODO: What if an attribute should include the '{' or '}' chars?
30 The purpose of XML Merge is to preprocess any kind of XML file with great
33 XML Merge performs (among other things) recursive XML file inclusion and
34 XML element and attribute modification.
36 XML Merge is a Python module. It is normally invoked as a program from the
37 command line, but can equally well be used from within another Python
# Full version as (major, minor, micro, tag).  Only the first two fields
# (major and minor) end up in the dotted version string.
__version_info__ = (2, 0, -1, 'git')
__version__ = ".".join(map(str, __version_info__[:2]))
44 ## IMPORTS AND CONSTANTS
54 import lxml
.etree
as ET
# Namespace mapping, prefix -> namespace URI (can be directly used for lxml
# nsmap arguments):
xmns = {
    "xm": "urn:felixrabe:xmlns:xmlmerge:preprocess",
    "xmt": "urn:felixrabe:xmlns:xmlmerge:inctrace",
}
61 ## COMMAND LINE OPTION PARSING
class OptionParser(optparse.OptionParser):
    """
    optparse parser that already knows every XML Merge command line option.

    Levels of verbosity, as stored in the parsed "verbose" value:
        --quiet    ->  verbose == 1  # only show error messages
                   ->  verbose == 2  # no verbosity option given
        --verbose  ->  verbose == 3  # show debugging messages
    """

    def __init__(self, *a, **kw):
        optparse.OptionParser.__init__(self, *a, **kw)

        # (short flag, long flag, add_option() keywords), listed in the
        # order in which the options are registered.
        option_specs = (
            ("-i", "--input", dict(
                help=("(REQUIRED) input XML file"))),
            ("-o", "--output", dict(
                help=("output XML file (.out.xml if not given)"))),
            ("-s", "--xml-schema", dict(
                help=("XML Schema (.xsd) to validate output " +
                      "against"))),
            ("-r", "--reference", dict(
                help=("reference XML file to compare output " +
                      "against"))),
            ("-d", "--html-diff", dict(
                action="store_true",
                help=("only with -r; if output and reference " +
                      "differ, produce a HTML file showing the " +
                      "differences"))),
            ("-t", "--trace-includes", dict(
                action="store_true",
                help=("add tracing information to included " +
                      "XML fragments"))),
            # -v and -q write to the same destination, "verbose".
            ("-v", "--verbose", dict(
                action="store_const", dest="verbose", const=3,
                help=("show debugging messages"))),
            ("-q", "--quiet", dict(
                action="store_const", dest="verbose", const=1,
                help=("only show error messages"))),
        )
        for short_flag, long_flag, keywords in option_specs:
            self.add_option(short_flag, long_flag, **keywords)

        # Neither -v nor -q given: normal verbosity.
        self.set_defaults(verbose=2)
def parse_command_line(argv):
    """
    parse_command_line(argv) -> optparse.Values

    Parse argv and return an optparse.Values object containing the options.

    argv is a full argument vector: argv[0] (the program name) is skipped.

    This function performs the necessary checks and conversions:

    - positional arguments are rejected and --input is required; on
      violation option_parser.error() is called, which reports the problem
      and terminates the program via SystemExit;
    - a missing --output is derived from the input filename, resulting in
      the file extension ".out.xml";
    - the input, output and reference filenames are converted to
      normalized absolute pathnames.

    It also tries to create the containing directory for the output file if
    it does not exist already.
    """
    # Parse options using OptionParser:
    option_parser = OptionParser()
    options, args = option_parser.parse_args(argv[1:])

    # Make sure only options, and no other arguments, are passed on the
    # command line.  This is an explicit check (not an assert statement) so
    # that it is still performed when Python runs with -O.
    if args or options.input is None:
        option_parser.error("Error: invalid argument list")

    # If the output option has been omitted, build the output filename from
    # the input filename, resulting in the file extension ".out.xml":
    if options.output is None:
        if options.input.lower().endswith(".xml"):
            options.output = options.input[:-4] + ".out.xml"
        else:
            options.output = options.input + ".out.xml"

    # Convert all filename options to normalized absolutized pathnames:
    for name in ("input", "output", "reference"):
        value = getattr(options, name)
        if value is None:
            continue  # if "-r" was not given
        setattr(options, name, os.path.abspath(value))

    # When --verbose, print all filename options:
    if options.verbose >= 3:
        print("Input: %s" % options.input)
        print("Output: %s" % options.output)
        print("Reference: %s" % options.reference)

    # Make sure there is a directory where the output XML file should go:
    try:
        os.makedirs(os.path.dirname(options.output))
    except OSError:
        # Typically the directory exists already.  Anything else is not
        # fatal here: fail later if there still is no output directory.
        pass

    return options
151 ## XML PROCESSING AND COMPARISON
def read_input_file(input_filename):
    """
    read_input_file(input_filename) -> ET._Element

    Parse the XML file named input_filename and hand back the root element
    of the resulting element tree.
    """
    xml_tree = ET.parse(input_filename)
    return xml_tree.getroot()
def postprocess_xml(output_xml):
    """
    postprocess_xml(output_xml) -> ET._Element

    Strip namespace declarations that are no longer needed, and whitespace
    that would get in the way of pretty-printing.  The result is a modified
    copy of output_xml; output_xml itself may be modified by this call too.
    """
    # Remove unused namespace declarations by moving the element below a
    # temporary root that declares the XML Merge namespaces, and detaching
    # it again:
    # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
    temporary_root = ET.Element("NS_ROOT", nsmap=xmns)
    temporary_root.append(output_xml)
    temporary_root.remove(output_xml)

    # The copy is required: without it, getroottree() of every element
    # below output_xml would still report the temporary tree that holds the
    # (now empty) NS_ROOT element.  This is how lxml works, not a hack.
    detached_copy = copy.copy(output_xml)
    result_root = ET.ElementTree(detached_copy).getroot()

    # Make pretty-printing work: drop text and tail values that consist of
    # nothing but whitespace.
    for element in result_root.iter():
        for slot in ("text", "tail"):
            content = getattr(element, slot)
            if content and not content.strip():
                setattr(element, slot, None)

    return result_root
def write_output_file(output_xml, output_filename):
    """
    Serialize the element tree that output_xml belongs to into the file
    named output_filename: pretty-printed, UTF-8 encoded, and with an XML
    declaration.
    """
    serialization = dict(pretty_print=True,
                         xml_declaration=True,
                         encoding="utf-8")
    output_xml.getroottree().write(output_filename, **serialization)
def read_xml_schema_file(xml_schema_filename):
    """
    read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema

    Parse the XML Schema file named xml_schema_filename and build the
    corresponding XML Schema object from it.
    """
    return ET.XMLSchema(ET.parse(xml_schema_filename))
def match_against_schema(options, output_xml, xml_schema):
    """
    match_against_schema(options, output_xml, xml_schema) -> bool

    Validate output against XML Schema.

    The element tree that output_xml belongs to is validated with
    xml_schema.validate().  Unless --quiet was given (options.verbose >= 2),
    the outcome is printed; on a mismatch the last entry of the schema's
    error log is printed as well.

    The result is True if the output XML Element (tree) matches the XML
    Schema, otherwise the result is False.
    """
    is_valid = xml_schema.validate(output_xml.getroottree())
    if options.verbose >= 2:
        # print(...) with a single argument behaves the same on Python 2
        # and Python 3.
        if is_valid:
            print("Output matches XML Schema.")
        else:
            print("Output invalid according to XML Schema.")
            print(xml_schema.error_log.last_error)
    return is_valid
def match_against_reference(options, output_xml):
    """
    match_against_reference(options, output_xml) -> bool

    Compare the output string (read from file options.output) to the
    reference string (read from options.reference). If they are not the
    same (bytewise), and if options.html_diff is True, create an HTML file
    showing the differences, named "<options.output>.diff.html".

    The output_xml argument is not used; the comparison works on the files.

    The result is True if output and reference are the same (bytewise),
    otherwise the result is False.
    """
    reference_filename = options.reference
    output_filename = options.output
    do_html_diff = options.html_diff

    # Read both files as raw bytes.  The with statement closes the handles
    # right away, and open() exists on Python 2 and Python 3 alike.
    with open(reference_filename, "rb") as reference_file:
        reference_str = reference_file.read()
    with open(output_filename, "rb") as output_file:
        output_str = output_file.read()

    is_valid = (reference_str == output_str)
    if options.verbose >= 2:
        if is_valid:
            print("Output matches reference.")
        elif not do_html_diff:
            # With -d, the message is printed below together with the name
            # of the generated HTML file.
            print("Output and reference differ.")
    if do_html_diff and not is_valid:
        html_filename = "%s.diff.html" % output_filename
        if options.verbose >= 2:
            print("Output and reference differ - " +
                  "generating '%s'..." % html_filename)
        create_reference_diff_html(html_filename, reference_str,
                                   output_str)
    return is_valid
def create_reference_diff_html(html_filename, reference_str, output_str):
    """
    Create an HTML file (created at html_filename) showing the differences
    between the reference string and the output string side-by-side.

    reference_str and output_str may be text or byte strings.  On Python 3,
    byte strings are decoded as UTF-8 (undecodable bytes are replaced)
    because difflib compares text; the HTML is then written as UTF-8.  On
    Python 2, byte strings are compared and written out unchanged.
    """
    # Imported locally so that this function does not depend on the
    # module-level import block.
    import difflib
    import io

    def as_text(data):
        # On Python 2, bytes is str, so nothing is ever decoded there.
        if isinstance(data, bytes) and not isinstance(data, str):
            return data.decode("utf-8", "replace")
        return data

    reference_lines = as_text(reference_str).splitlines()
    output_lines = as_text(output_str).splitlines()

    html_diff = difflib.HtmlDiff(wrapcolumn=75)
    html_str = html_diff.make_file(reference_lines, output_lines,
                                   "Reference", "Output")

    # Always close the file handle, also when writing fails.
    if isinstance(html_str, bytes):
        # Python 2 byte string: write it exactly as difflib produced it.
        with open(html_filename, "w") as html_file:
            html_file.write(html_str)
    else:
        with io.open(html_filename, "w", encoding="utf-8") as html_file:
            html_file.write(html_str)
275 ## XML PREPROCESS CLASS
class XMLPreprocess(object):
    """
    Callable that executes the XML Merge processing elements of an XML
    element tree in place.

    Use:

    >>> proc = XMLPreprocess()
    >>> proc(input_xml)  # input_xml is modified in place

    Elements in the xmns["xm"] namespace are handed to the method named
    "_xm_" + lowercased tag name and are then removed from the tree; all
    other elements are recursed into.

    WARNING: attribute values ("{...}" substitutions, <xm:Loop>, <xm:Var>)
    and the text of <xm:PythonCode> are passed to eval() / exec.  Only
    process input files that you trust.
    """

    def __init__(self):
        super(XMLPreprocess, self).__init__()
        # Stack of Python namespaces (dicts); the last entry is the active
        # one.
        self._namespace_stack = [{}]

    def __call__(self, xml_element, namespace=None,
                 trace_includes=False, xml_filename=None):
        """
        XMLPreprocess()(...)

        Preprocess the input XML Element, xml_element. The element tree of
        xml_element will be modified in-place.

        The namespace given should be a dict that can be used as a Python
        namespace. This namespace will be used in XML attribute
        substitutions.  If given, it is pushed onto the namespace stack.

        trace_includes and xml_filename are stored on the instance (as
        self.trace_includes and self.xml_filename) and are passed on
        unchanged to the recursive calls.

        Processing tags will recursively call this method (__call__) for
        recursive processing of subelements.

        Raises Exception if xml_element is in the xmns["xm"] namespace but
        this class has no matching _xm_* method.  Returns None.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        self.namespace = self._namespace_stack[-1]
        self.trace_includes = trace_includes
        self.xml_filename = xml_filename

        # Length of the "{namespace-uri}" prefix of qualified tag names:
        len_ns = len("{%s}" % xmns["xm"])

        # Evaluate Python expressions in the attributes of xml_element:
        for attr_name, attr_value in xml_element.items():  # attr map
            xml_element.set(attr_name,
                            self._eval_substitution(attr_value))

        # Anything that is not in the xmns["xm"] namespace is plain
        # content: just recurse.
        if xml_element.nsmap.get(xml_element.prefix) != xmns["xm"]:
            self._recurse_into(xml_element)
            return None

        # xml_element has xmns["xm"] as its namespace: proceed with the
        # appropriate method of this class.
        tag = xml_element.tag[len_ns:]  # just the tag without namespace
        method = "_xm_" + tag.lower()  # tolerate any case
        if not hasattr(self, method):
            raise Exception("cannot process <xm:%s/>" % tag)
        getattr(self, method)(xml_element)  # call the method

        # Preserve tail text: it is attached to whatever precedes the
        # processing element, which is about to be removed.
        tail = xml_element.tail
        if tail is not None:
            prev = xml_element.getprevious()
            parent = xml_element.getparent()
            if prev is not None:
                prev.tail = (prev.tail or "") + tail
            else:
                parent.text = (parent.text or "") + tail
        xml_element.getparent().remove(xml_element)
        return None

    def _recurse_into(self, xml_element, namespace=None):
        """
        Preprocess all direct subelements of xml_element.

        If namespace is given, it is the active Python namespace while the
        subelements are processed.  Afterwards it is dropped again and the
        namespace that was on top of the stack before becomes the active
        one, also when processing a subelement raises.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        try:
            # xpath("*") returns a list, so elements that get added or
            # removed during processing do not disturb the iteration.
            for xml_sub_element in xml_element.xpath("*"):
                self(xml_sub_element, None,
                     self.trace_includes, self.xml_filename)
        finally:
            if namespace is not None:
                self._namespace_stack.pop()
                # Re-activate the enclosing namespace.  (Assigning the
                # popped dict here instead would keep the inner scope
                # active until the next __call__.)
                self.namespace = self._namespace_stack[-1]

    _eval_substitution_regex = re.compile(r"\{(.*?)\}")

    def _eval_substitution(self, string):
        """
        Evaluate Python expressions within strings.

        Internal method to perform substitution of Python expressions
        within attribute values, {x} -> str(eval(x)).  Example:

        >>> self._eval_substitution("3 + 5 = {3 + 5} in Python")
        '3 + 5 = 8 in Python'

        Multiple Python expressions in one string are supported as well.
        The expressions are evaluated in the active namespace,
        self.namespace.
        """
        def evaluate(match):
            # NOTE: eval() of file content - trusted input only.
            return str(eval(match.group(1), self.namespace))

        return self._eval_substitution_regex.sub(evaluate, string)

    def _xm_addelements(self, xml_element):
        """
        Add subelements to, before, or after the element selected by XPath
        (@to, @before or @after).

        Exactly one of the three attributes has to be given, otherwise an
        AssertionError is raised.

        NOTE(review): only the attribute check is present in this method;
        no elements are added yet.
        """
        to = xml_element.get("to")
        before = xml_element.get("before")
        after = xml_element.get("after")
        assert sum((to is None, before is None, after is None)) == 2, \
            "exactly one of @to, @before and @after is required"
        select = to or before or after  # not used yet, see NOTE above

    def _xm_block(self, xml_element):
        """
        Create a scope to contain visibility of newly assigned Python
        variables. This works the same way that Python itself scopes
        variables, i.e. by creating a shallow copy of the Python namespace.
        E.g. assignments to list items will be visible to outside scopes!
        """
        self._recurse_into(xml_element, self.namespace.copy())
        # Move the children out of the block, right behind it.  addnext()
        # inserts directly after xml_element, so going through the
        # children in reverse keeps their original order.
        for xml_sub_node in xml_element[::-1]:
            xml_element.addnext(xml_sub_node)

    def _xm_comment(self, xml_element):
        """
        A comment that is removed by XML Merge.
        """
        pass  # the removal itself is done by __call__

    def _xm_include(self, xml_element):
        """
        Include from the specified file (@file) the elements selected by
        XPath.

        NOTE(review): this method has no implementation yet.
        """

    def _xm_loop(self, xml_element):
        """
        Loop over a range of integer values.

        The first attribute is evaluated as the loop counter.  Example:

            i="range(5, 9)"  =>  iterates with i being 5, 6, 7, 8

        For every value, a processed copy of the subelements is inserted
        after the loop element.

        WARNING: The loop counter attribute, as well as all substitutions
        in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
        (wholly or partially) be evaluated as Python expressions using
        eval().
        """
        # Get the loop counter name and list:
        loop_counter_name = xml_element.keys()[0]
        loop_counter_list = eval(xml_element.get(loop_counter_name),
                                 self.namespace)

        # A fresh copy of the loop element is needed per iteration.
        # copy.copy(xml_element) crashed here, hence the round trip
        # through a string.  The tail is left out: only the children of
        # the copy are used, and a non-blank tail would make the
        # serialized string unparsable.
        serialized = ET.tostring(xml_element, with_tail=False)

        # Loop:
        addnext_to_node = xml_element  # for new elements
        for loop_counter_value in loop_counter_list:
            self.namespace[loop_counter_name] = loop_counter_value
            xml_element_copy = ET.XML(serialized)
            self._recurse_into(xml_element_copy)
            for xml_sub_node in xml_element_copy[:]:
                addnext_to_node.addnext(xml_sub_node)
                addnext_to_node = xml_sub_node

    def _xm_pythoncode(self, xml_element):
        """
        Execute Python code in the current namespace.

        'self' and 'xml_element' are supplied temporarily. They are added
        to the current namespace before the code is executed, and removed
        again afterwards, also when the code raises.
        """
        code = textwrap.dedent(xml_element.text).strip()
        namespace = self.namespace
        namespace["self"] = self
        namespace["xml_element"] = xml_element
        try:
            # exec(code, namespace) is valid on Python 2 and Python 3.
            # NOTE: executes file content - trusted input only.
            exec(code, namespace)
        finally:
            namespace.pop("self", None)
            namespace.pop("xml_element", None)

    def _xm_removeattributes(self, xml_element):
        """
        Remove the attributes (@name) from the (zero or more) elements
        selected by XPath (@from or @select).

        It is not considered an error if an attribute cannot be found on a
        selected element.
        """
        attr_name = xml_element.get("name")
        select_xpath = xml_element.get("from") or xml_element.get("select")
        for xml_element_selected in xml_element.xpath(select_xpath):
            # Can't find another way to remove an attribute than by using
            # 'attrib':
            attrib = xml_element_selected.attrib
            if attr_name in attrib:
                del attrib[attr_name]

    def _xm_removeelements(self, xml_element):
        """
        Remove (zero or more) elements selected by XPath (@select).

        NOTE(review): this method has no implementation yet.
        """

    def _xm_setattribute(self, xml_element):
        """
        Assign the value (@value) to the attribute (@name) of the element
        selected by XPath (@of or @select).

        Example:
            <Object index="0x1234"/>
            <xm:SetAttribute of="../Object" name="otherattr" value="hello"/>

        Leads to:
            <Object index="0x1234" otherattr="hello"/>

        NOTE(review): this method has no implementation yet.
        """

    def _xm_text(self, xml_element):
        """
        Perform '{}' substitution on text.

        The substituted text is put in front of the tail of xml_element;
        __call__ then carries the tail over into the surrounding content
        when it removes the element.
        """
        text = xml_element.text
        if text is None:
            return
        xml_element.tail = (self._eval_substitution(text) +
                            (xml_element.tail or ""))

    def _xm_var(self, xml_element):
        """
        Set (zero or more) variables in the active Python namespace.

        Every attribute is one assignment: the attribute name is the
        variable name, the attribute value is evaluated as a Python
        expression in the active namespace.
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():  # attr map
            # NOTE: eval() of file content - trusted input only.
            ns[attr_name] = eval(attr_value, ns, ns)
def main(argv):
    """
    main(argv) -> int

    Process input to produce output according to the command line options.

    After the XML Merge Manual, this is the first piece of the code a new
    developer will read. Keep this code as simple as possible if you change
    it.

    The result is the exit status for the program:

    - 0 on success, i.e. if all requested validations (-s, -r) match;
    - otherwise a mismatch bitmap: bit 1 (value 2) is set if the output
      does not match the XML Schema (-s), bit 2 (value 4) is set if the
      output does not match the reference (-r).  With N matching
      functions that are all requested and all fail, this amounts to
      (2 ** N - 1) * 2.

    On wrong options, parse_command_line() does not return but terminates
    the program through SystemExit.
    """
    # Parse command line to get options:
    options = parse_command_line(argv)

    # Input file => preprocessing => output file:
    document = read_input_file(options.input)
    preprocess = XMLPreprocess()
    preprocess(document,
               trace_includes=options.trace_includes,
               xml_filename=options.input)
    document = postprocess_xml(document)
    write_output_file(document, options.output)

    # If -s: Compare output to XML Schema file.
    # (False means: match requested and negative.)
    schema_ok = True
    if options.xml_schema is not None:
        schema = read_xml_schema_file(options.xml_schema)
        schema_ok = match_against_schema(options, document, schema)

    # If -r: Compare output to reference.
    # (False means: match requested and negative.)
    reference_ok = True
    if options.reference is not None:
        reference_ok = match_against_reference(options, document)

    # Calculate and return the mismatch bitmap:
    exit_status = 0
    if not schema_ok:
        exit_status |= 1 << 1  # 2 on mismatch
    if not reference_ok:
        exit_status |= 1 << 2  # 4 on mismatch
    return exit_status
if __name__ == "__main__":
    # Run as a program: the result of main() becomes the exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)