# -*- coding: utf-8 -*-

# Copyright 2008,2009  Felix Rabe  <public@felixrabe.net>

# This file is part of XML Merge.

# XML Merge is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.

# XML Merge is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public License
# along with XML Merge.  If not, see <http://www.gnu.org/licenses/>.

# Developed (i.e. tested) using Python 2.6.4 and lxml 2.2.2.
"""
The purpose of XML Merge is to preprocess any kind of XML file with great
flexibility.

XML Merge performs (among other things) recursive XML file inclusion and
XML element and attribute modification.

XML Merge is a Python module. It is normally invoked as a program from the
command line, but can equally well be used from within another Python
program or module.
"""
# Version tuple; only the first two items (major, minor) are used to build
# the public version string below.
__version_info__ = (2, 0, -1, 'git')
# "major.minor" string derived from __version_info__.
__version__ = ".".join(str(n) for n in __version_info__[:2])
## IMPORTS AND CONSTANTS
import copy
import optparse
import os
import re
import sys
import textwrap

import lxml.etree as ET
# Namespace mapping (can be directly used for lxml nsmap arguments):
#   "xm"  - namespace of the preprocessing tags handled by XMLPreprocess
#   "xmt" - namespace reserved for include-tracing information
xmns = {"xm": "urn:felixrabe:xmlns:xmlmerge:preprocess",
        "xmt": "urn:felixrabe:xmlns:xmlmerge:inctrace"}
## COMMAND LINE OPTION PARSING
class OptionParser(optparse.OptionParser):
    """
    Command line option parser preconfigured with the XML Merge options.

    All positional and keyword arguments are passed on unchanged to
    optparse.OptionParser.
    """

    def __init__(self, *a, **kw):
        """
        Create the parser and register all XML Merge options.
        """
        optparse.OptionParser.__init__(self, *a, **kw)
        self.add_option("-i", "--input",
                        help=("(REQUIRED) input XML file"))
        self.add_option("-o", "--output",
                        help=("output XML file (.out.xml if not given)"))
        self.add_option("-s", "--xml-schema",
                        help=("XML Schema (.xsd) to validate output " +
                              "against"))
        self.add_option("-r", "--reference",
                        help=("reference XML file to compare output " +
                              "against"))
        self.add_option("-d", "--html-diff", action="store_true",
                        help=("only with -r; if output and reference " +
                              "differ, produce a HTML file showing the " +
                              "differences"))
        self.add_option("-t", "--trace-includes", action="store_true",
                        help=("add tracing information to included " +
                              "XML fragments"))
        self.add_option("-v", "--verbose", action="store_const",
                        dest="verbose", const=3,
                        help=("show debugging messages"))
        self.add_option("-q", "--quiet", action="store_const",
                        dest="verbose", const=1,
                        help=("only show error messages"))
        self.set_defaults(verbose=2)

        # Explanation: levels of verbosity
        # --quiet   -> self.verbose == 1  # only show error messages
        #           -> self.verbose == 2  # no verbosity option given
        # --verbose -> self.verbose == 3  # show debugging messages

    def error(self, *a, **kw):
        """
        Print the full help text, then delegate to optparse's error().

        optparse.OptionParser.error() prints the message to stderr and
        terminates the program, so this method normally does not return.
        """
        self.print_help()
        return optparse.OptionParser.error(self, *a, **kw)
def parse_command_line(argv):
    """
    parse_command_line(argv) -> optparse.Values

    Parse argv and return an optparse.Values object containing the options.

    This function performs all the necessary checks and conversions to make
    sure all necessary options are given, and that all options are
    available in a normalized format.

    It also tries to create the containing directory for the output file if
    it does not exist already.

    argv is the complete argument vector; argv[0] (the program name) is
    skipped.  On an invalid argument list the option parser's error()
    method is called, which terminates the program.
    """
    # Parse options using OptionParser:
    option_parser = OptionParser()
    options, args = option_parser.parse_args(argv[1:])

    # Make sure only options, and no other arguments, are passed on the
    # command line, and that the required input option is present.  This is
    # an explicit check (not an assert) so it also works under "python -O".
    if args or options.input is None:
        option_parser.error("Error: invalid argument list")

    # If the output option has been omitted, build the output filename from
    # the input filename, resulting in the file extension ".out.xml":
    if options.output is None:
        if options.input.lower().endswith(".xml"):
            options.output = options.input[:-4] + ".out.xml"
        else:
            options.output = options.input + ".out.xml"

    # Convert all filename options to normalized absolutized pathnames:
    for n in "input output reference".split():
        if getattr(options, n) is None:
            continue  # if "-r" was not given
        setattr(options, n, os.path.abspath(getattr(options, n)))

    # When --verbose, print all filename options:
    if options.verbose >= 3:
        print("Input: %s" % options.input)
        print("Output: %s" % options.output)
        print("Reference: %s" % options.reference)

    # Make sure there is a directory where the output XML file should go:
    try:
        os.makedirs(os.path.dirname(options.output))
    except OSError:
        pass  # fail later if there still is no output directory now

    return options
## XML PROCESSING AND COMPARISON
def read_input_file(input_filename):
    """
    read_input_file(input_filename) -> ET._Element

    Read the input file, and return the corresponding XML Element object,
    the element tree root.
    """
    input_xml = ET.parse(input_filename).getroot()
    return input_xml
def postprocess_xml(output_xml):
    """
    postprocess_xml(output_xml) -> ET._Element

    Remove unnecessary namespace declarations and whitespace.  Returns a
    modified copy of output_xml.  The argument may be modified by calling
    this function.
    """
    # Remove unused namespace declarations:
    # (http://codespeak.net/pipermail/lxml-dev/2009-September/004888.html)
    ns_root = ET.Element("NS_ROOT", nsmap=xmns)
    ns_root.append(output_xml)
    ns_root.remove(output_xml)
    # If you don't perform this copy, each output_xml element's
    # getroottree() will report the temporary tree containing the empty
    # NS_ROOT element.  This is not a hack, this is about how lxml works.
    output_xml = ET.ElementTree(copy.copy(output_xml)).getroot()

    # Make pretty-printing work by removing unnecessary whitespace
    # (text and tail strings that consist of whitespace only):
    for el in output_xml.iter():
        if el.text and not el.text.strip():
            el.text = None
        if el.tail and not el.tail.strip():
            el.tail = None

    return output_xml
def write_output_file(output_xml, output_filename):
    """
    Write the output XML Element to the specified output filename.

    The element's root tree is written pretty-printed, with an XML
    declaration, encoded as UTF-8.
    """
    output_xmltree = output_xml.getroottree()
    output_xmltree.write(output_filename, pretty_print=True,
                         xml_declaration=True, encoding="utf-8")
def read_xml_schema_file(xml_schema_filename):
    """
    read_xml_schema_file(xml_schema_filename) -> ET.XMLSchema

    Read the XML Schema file, and return the corresponding XML Schema
    object.
    """
    xml_schema_xmltree = ET.parse(xml_schema_filename)
    xml_schema = ET.XMLSchema(xml_schema_xmltree)
    return xml_schema
def match_against_schema(options, output_xml, xml_schema):
    """
    match_against_schema(options, output_xml, xml_schema) -> bool

    Validate output against XML Schema.

    The result is True if the output XML Element (tree) matches the XML
    Schema, otherwise the result is False.

    Unless --quiet was given (options.verbose < 2), the outcome is printed,
    together with the schema's last error on mismatch.
    """
    is_valid = xml_schema.validate(output_xml.getroottree())
    if options.verbose >= 2:
        if is_valid:
            print("Output matches XML Schema.")
        else:
            print("Output invalid according to XML Schema.")
            print(xml_schema.error_log.last_error)
    return is_valid
def match_against_reference(options, output_xml):
    """
    match_against_reference(options, output_xml) -> bool

    Compare the output string (read from file options.output) to the
    reference string (read from options.reference).  If they are not the
    same (bytewise), and if options.html_diff is True, create an HTML file
    showing the differences.

    The result is True if output and reference are the same (bytewise),
    otherwise the result is False.

    The output_xml argument is not used; the comparison is done on the
    files on disk.
    """
    reference_filename = options.reference
    output_filename = options.output
    do_html_diff = options.html_diff

    # Read both files as raw bytes; the context managers make sure the file
    # handles are closed again right away.
    with open(reference_filename, "rb") as reference_file:
        reference_str = reference_file.read()
    with open(output_filename, "rb") as output_file:
        output_str = output_file.read()

    is_valid = (reference_str == output_str)
    if options.verbose >= 2:
        if is_valid:
            print("Output matches reference.")
        elif not do_html_diff:
            print("Output and reference differ.")
    if do_html_diff and not is_valid:
        html_filename = "%s.diff.html" % output_filename
        if options.verbose >= 2:
            print("Output and reference differ - " +
                  "generating '%s'..." % html_filename)
        create_reference_diff_html(html_filename, reference_str,
                                   output_str)
    return is_valid
def create_reference_diff_html(html_filename, reference_str, output_str):
    """
    Create an HTML file (created at html_filename) showing the differences
    between the reference string and the output string side-by-side.

    reference_str and output_str may be text or raw bytes; bytes are
    decoded as UTF-8 (undecodable bytes are replaced) where the string type
    requires it.
    """
    import difflib  # only needed when a diff is actually requested

    def as_text(value):
        # On Python 3, difflib needs text, not bytes.  On Python 2 the
        # byte string type is 'str' itself, so nothing is converted there.
        if isinstance(value, bytes) and not isinstance(value, str):
            return value.decode("utf-8", "replace")
        return value

    reference_lines = as_text(reference_str).splitlines()
    output_lines = as_text(output_str).splitlines()

    html_diff = difflib.HtmlDiff(wrapcolumn=75)
    html_str = html_diff.make_file(reference_lines, output_lines,
                                   "Reference", "Output")
    # Use a context manager so the file is flushed and closed immediately:
    with open(html_filename, "w") as html_file:
        html_file.write(html_str)
## XML PREPROCESS CLASS
class XMLPreprocess(object):
    """
    Preprocess an XML element tree by executing its "xm:" processing tags.

    Use:

    >>> proc = XMLPreprocess()
    >>> proc(input_xml, xml_filename=filename)  # input_xml is modified

    Each processing tag <xm:Name/> is handled by the method _xm_name() of
    this class (the tag name is matched case-insensitively).

    SECURITY NOTE: attribute values and <xm:PythonCode> contents are passed
    to eval() / exec.  Only process XML files from trusted sources.
    """

    def __init__(self, initial_namespace=None):
        """
        Create a preprocessor.

        initial_namespace is the dict used as the initial Python namespace
        for expression evaluation.  If omitted, a fresh empty dict is
        created for every instance (a shared default dict would leak
        variables between unrelated instances).
        """
        super(XMLPreprocess, self).__init__()
        if initial_namespace is None:
            initial_namespace = {}
        self._namespace_stack = [initial_namespace]

    def __call__(self, xml_element, namespace=None,
                 trace_includes=False, xml_filename=None):
        """
        XMLPreprocess()(...)

        Preprocess the input XML Element, xml_element. The element tree of
        xml_element will be modified in-place.

        The namespace given should be a dict that can be used as a Python
        namespace. This namespace will be used in XML attribute
        substitution.  If given, it is pushed onto the namespace stack.

        If trace_includes is True, the output will contain tags that
        surround included sections of the file. The xml_filename argument
        is then required.  (Both values are stored on the instance and
        handed on to included files.)

        Processing tags will recursively call this method (__call__) for
        preprocessing the included file and for recursive inclusion.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        self.namespace = self._namespace_stack[-1]
        self.trace_includes = trace_includes
        self.xml_filename = xml_filename

        ns = "{%s}" % xmns["xm"]
        len_ns = len(ns)

        # Evaluate Python expressions in the attributes of xml_element:
        for attr_name, attr_value in xml_element.items():  # attr map
            v = self._eval_substitution(attr_value)
            xml_element.set(attr_name, v)

        # If xml_element has xmns["xm"] as its namespace, proceed with the
        # appropriate method of this class:
        if xml_element.nsmap.get(xml_element.prefix) == xmns["xm"]:
            tag = xml_element.tag[len_ns:]  # just the tag without namespace
            method = "_xm_" + tag.lower()  # tolerate any case
            if not hasattr(self, method):
                raise Exception("cannot process <xm:%s/>" % tag)
            getattr(self, method)(xml_element)  # call the method
            # Preserve tail text: the processing tag itself is removed, so
            # its tail is attached to the previous sibling, or to the
            # parent's text if there is no previous sibling.
            tail = xml_element.tail
            if tail is not None:
                prev = xml_element.getprevious()
                parent = xml_element.getparent()
                if prev is not None:
                    prev.tail = (prev.tail or "") + tail
                else:
                    parent.text = (parent.text or "") + tail
            xml_element.getparent().remove(xml_element)

        # If not, recurse:
        else:
            self._recurse_into(xml_element)

        return None

    def _recurse_into(self, xml_element, namespace=None):
        """
        Preprocess all child elements of xml_element.

        If namespace is given, it is pushed onto the namespace stack for
        the duration of the recursion and popped again afterwards.
        """
        if namespace is not None:
            self._namespace_stack.append(namespace)
        # xpath("*") returns a list, so elements added or removed while
        # processing do not disturb this iteration.
        for xml_sub_element in xml_element.xpath("*"):
            self(xml_sub_element, None,
                 self.trace_includes, self.xml_filename)
        if namespace is not None:
            self._namespace_stack.pop()
            self.namespace = self._namespace_stack[-1]

    # Matches the shortest "{...}" span; group 1 is the expression inside.
    _eval_substitution_regex = re.compile(r"\{(.*?)\}")

    def _eval_substitution(self, string):
        """
        Evaluate Python expressions within strings.

        Internal method to perform substitution of Python expressions
        within attribute values, {x} -> str(eval(x)).  Example:

        >>> self._eval_substitution("3 + 5 = {3 + 5} in Python")
        '3 + 5 = 8 in Python'

        Multiple Python expressions in one string are supported as well.
        Expressions are evaluated in self.namespace.
        """
        new_str = []  # faster than always concatenating strings
        last_index = 0
        for match in self._eval_substitution_regex.finditer(string):
            new_str.append(string[last_index:match.start()])
            expression = match.group(1)
            result = str(eval(expression, self.namespace))
            new_str.append(result)
            last_index = match.end()
        new_str.append(string[last_index:])
        return "".join(new_str)

    def _xm_addelements(self, xml_element):
        """
        Add subelements to, before, or after the element selected by XPath
        (@to, @before or @after).

        Exactly one of (@to, @before, @after) must be specified.  And the
        XPath expression must return exactly one element.  These conditions
        are checked by assertions and will raise an exception if not met.
        """
        to = xml_element.get("to")
        before = xml_element.get("before")
        after = xml_element.get("after")

        assert sum((to is None, before is None, after is None)) == 2
        select = to or before or after

        selected_context_nodes = xml_element.xpath(select)
        assert len(selected_context_nodes) == 1

        context_node = selected_context_nodes[0]
        replace_context_node = False

        # Choose the lxml method that inserts at the requested position:
        if to is not None:
            f = "append"
        if before is not None:
            f = "addprevious"
        if after is not None:
            f = "addnext"
            # Each new element goes after the one added last, which keeps
            # the subelements in document order.
            replace_context_node = True

        for xml_sub_element in xml_element:
            getattr(context_node, f)(xml_sub_element)
            if replace_context_node:
                context_node = xml_sub_element

    def _xm_block(self, xml_element):
        """
        Create a scope to contain visibility of newly assigned Python
        variables.  This works the same way that Python itself scopes
        variables, i.e. by creating a shallow copy of the Python namespace.
        E.g. assignments to list items will be visible to outside scopes!
        """
        self._recurse_into(xml_element, self.namespace.copy())
        # Move the processed children out of the block, right behind it.
        # Adding them in reverse keeps their original order.
        for xml_sub_node in xml_element[::-1]:  # get children reversed
            xml_element.addnext(xml_sub_node)

    def _xm_comment(self, xml_element):
        """
        A comment that is removed by XML Merge.

        Nothing to do here: __call__ removes every processing tag after its
        method has run.
        """
        pass

    def _xm_defaultvar(self, xml_element):
        """
        Set (zero or more) variables in the active Python namespace, if not
        already set.

        Each attribute name is a variable name; its value is evaluated as a
        Python expression.
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():  # attr map
            if attr_name not in ns:
                ns[attr_name] = eval(attr_value, ns)

    def _xm_include(self, xml_element):
        """
        Include from the specified file (@file) the elements selected by
        XPath (@select) after preprocessing said file.

        The @file attribute is the only required attribute.  It is resolved
        relative to the directory of the file currently being processed
        (or used as given if no current filename is known).

        Items can be imported from the included (and preprocessed) file's
        Python namespace into the current file's namespace using the
        @import attribute, which may either be a comma-separated list of
        identifiers, or '*' to import the complete namespace.

        Remaining attributes will be treated as variable assignments and
        put in the Python namespace used for processing the included file.
        """
        attrib = xml_element.attrib
        file_ = attrib.pop("file", None)
        select = attrib.pop("select", None)
        import_ = attrib.pop("import", None)
        assert file_ is not None
        remaining_attribs = dict(attrib.items())

        # Load the to-be-included file:
        p = os.path
        xml_input_dirname = p.dirname(self.xml_filename or "")
        xml_incl_filename = p.join(xml_input_dirname, file_)
        xml_incl_filename = p.normpath(xml_incl_filename)
        # Always use '/' for normalized tracing information:
        xml_incl_filename = xml_incl_filename.replace("\\", "/")

        xml_incl = ET.parse(xml_incl_filename).getroot()

        # Build the initial namespace from remaining attributes:
        initial_namespace = {}
        ns = self.namespace
        for attr_name, attr_value in remaining_attribs.items():  # attr map
            initial_namespace[attr_name] = eval(attr_value, ns)

        # Preprocess the to-be-included file:
        proc = XMLPreprocess(initial_namespace=initial_namespace)
        proc(xml_incl, trace_includes=self.trace_includes,
             xml_filename=xml_incl_filename)

        # Select elements to include:
        included_elements = []
        if select is not None:
            included_elements = xml_incl.xpath(select)

        # Include the elements (each one after the previously added one):
        context_node = xml_element
        for inc_elem in included_elements:
            context_node.addnext(inc_elem)
            context_node = inc_elem

        # Import from included namespace:
        imported_namespace = {}
        if import_ is not None:
            import_ = [x.strip() for x in import_.split(",")]
            if "*" in import_:  # import all
                imported_namespace = proc.namespace
            else:
                ns = proc.namespace
                imported_namespace = dict((x, ns[x]) for x in import_)
        self.namespace.update(imported_namespace)

    def _xm_loop(self, xml_element):
        """
        Loop over a range of integer values.

        The first attribute is evaluated as the loop counter.  Example:

            i="range(5, 9)"  =>  iterates with i being 5, 6, 7, 8

        WARNING: The loop counter attribute, as well as all substitutions
        in subelement attributes (XPath ".//@*": "...{foo_bar}...") will
        (wholly or partially) be evaluated as Python expressions using
        eval().
        """
        # Get the loop counter name and list:
        loop_counter_name = xml_element.keys()[0]
        loop_counter_list = eval(xml_element.get(loop_counter_name),
                                 self.namespace)

        # Loop:
        context_node = xml_element  # for new elements
        for loop_counter_value in loop_counter_list:
            self.namespace[loop_counter_name] = loop_counter_value
            # copy.copy(xml_element) crashed here, so the element is
            # duplicated by serializing and re-parsing it instead.
            tailtext = xml_element.tail
            xml_element.tail = None  # xml_element regarded as document
            xml_element_copy = ET.XML(ET.tostring(xml_element))
            xml_element.tail = xml_element_copy.tail = tailtext
            self._recurse_into(xml_element_copy)
            # The loop body's leading text goes behind the last inserted
            # node:
            if xml_element_copy.text is not None:
                if context_node.tail is None:
                    context_node.tail = ""
                context_node.tail += xml_element_copy.text
            for xml_sub_node in xml_element_copy[:]:
                context_node.addnext(xml_sub_node)
                context_node = xml_sub_node

    def _xm_pythoncode(self, xml_element):
        """
        Execute Python code in the current namespace.

        'self' and 'xml_element' are supplied temporarily.  They are added
        to the current namespace before the code is executed, and removed
        again afterwards (also if the code raises an exception).
        """
        code = textwrap.dedent(xml_element.text or "").strip()
        namespace = self.namespace
        namespace["self"] = self
        namespace["xml_element"] = xml_element
        try:
            # The call form of exec is valid on both Python 2 and 3.
            exec(code, namespace)
        finally:
            namespace.pop("self", None)
            namespace.pop("xml_element", None)

    def _xm_removeattributes(self, xml_element):
        """
        Remove the attributes (@name) from the (zero or more) elements
        selected by XPath (@from or @select).

        It is not considered an error if an attribute cannot be found on a
        selected element.
        """
        attr_name = xml_element.get("name")
        select_xpath = xml_element.get("from") or xml_element.get("select")
        for xml_element_selected in xml_element.xpath(select_xpath):
            # Can't find another way to remove an attribute than by using
            # 'attrib':
            attrib = xml_element_selected.attrib
            if attr_name in attrib:
                del attrib[attr_name]

    def _xm_removeelements(self, xml_element):
        """
        Remove (zero or more) elements selected by XPath (@select).

        NOTE(review): no implementation is visible for this tag; it is kept
        as a no-op placeholder -- confirm against upstream.
        """
        pass

    def _xm_setattribute(self, xml_element):
        """
        Assign the value (@value) to the attribute (@name) of the element
        selected by XPath (@of or @select).

        Example:
            <Object index="0x1234"/>
            <xm:SetAttribute of="../Object" name="otherattr" value="hello"/>

        Leads to:
            <Object index="0x1234" otherattr="hello"/>

        NOTE(review): no implementation is visible for this tag; it is kept
        as a no-op placeholder -- confirm against upstream.
        """
        pass

    def _xm_text(self, xml_element):
        """
        Perform '{}' substitution on text.

        The substituted text is put in front of the element's tail, so it
        stays in the document when the processing tag is removed.
        """
        text = xml_element.text
        if text is None:
            return
        tail = self._eval_substitution(text) + (xml_element.tail or "")
        xml_element.tail = tail

    def _xm_var(self, xml_element):
        """
        Set (zero or more) variables in the active Python namespace.

        Each attribute name is a variable name; its value is evaluated as a
        Python expression.
        """
        ns = self.namespace
        for attr_name, attr_value in xml_element.items():  # attr map
            ns[attr_name] = eval(attr_value, ns)
def main(argv, **kargs):
    """
    main(argv, **kargs) -> int

    Process input to produce output according to the command line options
    (given in argv).  These keyword arguments (**kargs) are recognized:

        initial_namespace
          Gets passed on as the initial Python namespace to XMLPreprocess().

    After the XML Merge Manual, this is the first piece of the code a new
    developer will read.  Keep this code as simple as possible if you change
    it in any way.

    These are all possible exit status codes returned or raised (using
    SystemExit) by main or the functions it calls:
        - On success, and if all requested validations (-s, -r) match:
            return 0
        - On error, e.g. wrong options (see parse_command_line()):
            SystemExit is raised by the option parser
        - On mismatch (either XML Schema (-s) or reference (-r)):
            return mismatch_bitmap  # see end of main()
        - To aid understanding the bitmap: If N matching functions are
          provided, and all are requested and all fail to match the output
          file:
            return (2 ** N - 1) * 2  # mismatch_bitmap
    """
    # Parse command line to get options:
    options = parse_command_line(argv)

    # Input file => preprocessing => output file:
    xml = read_input_file(options.input)
    proc = XMLPreprocess(**kargs)
    proc(xml, trace_includes=options.trace_includes,
         xml_filename=options.input)
    xml = postprocess_xml(xml)
    write_output_file(xml, options.output)

    # If -s: Compare output to XML Schema file:
    matches_schema = True  # False means: match requested and negative
    if options.xml_schema is not None:
        xml_schema = read_xml_schema_file(options.xml_schema)
        matches_schema = match_against_schema(options, xml, xml_schema)

    # If -r: Compare output to reference:
    matches_reference = True  # False means: match requested and negative
    if options.reference is not None:
        matches_reference = match_against_reference(options, xml)

    # Calculate and return the mismatch bitmap (bit 0 is left unused):
    mismatch_bitmap = 0
    mismatch_bitmap |= int(not matches_schema) << 1  # 2 on mismatch
    mismatch_bitmap |= int(not matches_reference) << 2  # 4 on mismatch
    return mismatch_bitmap
# Run as a program: the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))