tools/metrics/common/pretty_print_xml.py

   1 # Copyright 2014 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
"""Utility for pretty-printing XML files.

XmlStyle.PrettyPrintNode is used to format both histograms.xml and
actions.xml.
   9 """
  10
  11 import logging
  12 import textwrap
  13 import xml.dom.minidom
  14
# Maximum width, in columns, of a line of pretty-printed output. Used both as
# the text-wrapping width and as the limit when laying out tag attributes.
WRAP_COLUMN = 80
  16
  17
  18 class Error(Exception):
  19   pass
  20
  21
  22 def LastLineLength(s):
  23   """Returns the length of the last line in s.
  24
  25   Args:
  26     s: A multi-line string, including newlines.
  27
  28   Returns:
  29     The length of the last line in s, in characters.
  30   """
  31   if s.rfind('\n') == -1: return len(s)
  32   return len(s) - s.rfind('\n') - len('\n')
  33
  34
  35 def XmlEscape(s):
  36   """XML-escapes the given string, replacing magic characters (&<>") with their
  37   escaped equivalents."""
  38   s = s.replace("&", "&amp;").replace("<", "&lt;")
  39   s = s.replace("\"", "&quot;").replace(">", "&gt;")
  40   return s
  41
  42
  43 class XmlStyle(object):
  44   """A class that stores all style specification for an output xml file."""
  45
  46   def __init__(self, attribute_order, tags_that_have_extra_newline,
  47                tags_that_dont_indent, tags_that_allow_single_line):
  48     # List of tag names for top-level nodes whose children are not indented.
  49     self.attribute_order = attribute_order
  50     self.tags_that_have_extra_newline = tags_that_have_extra_newline
  51     self.tags_that_dont_indent = tags_that_dont_indent
  52     self.tags_that_allow_single_line = tags_that_allow_single_line
  53
  54   def PrettyPrintNode(self, node, indent=0):
  55     """Pretty-prints the given XML node at the given indent level.
  56
  57     Args:
  58       node: The minidom node to pretty-print.
  59       indent: The current indent level.
  60
  61     Returns:
  62       The pretty-printed string (including embedded newlines).
  63
  64     Raises:
  65       Error if the XML has unknown tags or attributes.
  66     """
  67     # Handle the top-level document node.
  68     if node.nodeType == xml.dom.minidom.Node.DOCUMENT_NODE:
  69       return '\n'.join([self.PrettyPrintNode(n) for n in node.childNodes])
  70
  71     # Handle text nodes.
  72     if node.nodeType == xml.dom.minidom.Node.TEXT_NODE:
  73       # Wrap each paragraph in the text to fit in the 80 column limit.
  74       wrapper = textwrap.TextWrapper()
  75       wrapper.initial_indent = ' ' * indent
  76       wrapper.subsequent_indent = ' ' * indent
  77       wrapper.break_on_hyphens = False
  78       wrapper.break_long_words = False
  79       wrapper.width = WRAP_COLUMN
  80       text = XmlEscape(node.data)
  81       # Remove any common indent.
  82       text = textwrap.dedent(text.strip('\n'))
  83       lines = text.split('\n')
  84       # Split the text into paragraphs at blank line boundaries.
  85       paragraphs = [[]]
  86       for l in lines:
  87         if len(l.strip()) == 0 and len(paragraphs[-1]) > 0:
  88           paragraphs.append([])
  89         else:
  90           paragraphs[-1].append(l)
  91       # Remove trailing empty paragraph if present.
  92       if len(paragraphs) > 0 and len(paragraphs[-1]) == 0:
  93         paragraphs = paragraphs[:-1]
  94       # Wrap each paragraph and separate with two newlines.
  95       return '\n\n'.join([wrapper.fill('\n'.join(p)) for p in paragraphs])
  96
  97     # Handle element nodes.
  98     if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE:
  99       newlines_after_open, newlines_before_close, newlines_after_close = (
 100           self.tags_that_have_extra_newline.get(node.tagName, (1, 1, 0)))
 101       # Open the tag.
 102       s = ' ' * indent + '<' + node.tagName
 103
 104       # Calculate how much space to allow for the '>' or '/>'.
 105       closing_chars = 1
 106       if not node.childNodes:
 107         closing_chars = 2
 108
 109       # Pretty-print the attributes.
 110       attributes = node.attributes.keys()
 111       if attributes:
 112         # Reorder the attributes.
 113         if node.tagName not in self.attribute_order:
 114           unrecognized_attributes = attributes
 115         else:
 116           unrecognized_attributes = (
 117               [a for a in attributes
 118                if a not in self.attribute_order[node.tagName]])
 119           attributes = [a for a in self.attribute_order[node.tagName]
 120                         if a in attributes]
 121
 122         for a in unrecognized_attributes:
 123           logging.error(
 124               'Unrecognized attribute "%s" in tag "%s"' % (a, node.tagName))
 125         if unrecognized_attributes:
 126           raise Error()
 127
 128         for a in attributes:
 129           value = XmlEscape(node.attributes[a].value)
 130           # Replace sequences of whitespace with single spaces.
 131           words = value.split()
 132           a_str = ' %s="%s"' % (a, ' '.join(words))
 133           # Start a new line if the attribute will make this line too long.
 134           if LastLineLength(s) + len(a_str) + closing_chars > WRAP_COLUMN:
 135             s += '\n' + ' ' * (indent + 3)
 136           # Output everything up to the first quote.
 137           s += ' %s="' % (a)
 138           value_indent_level = LastLineLength(s)
 139           # Output one word at a time, splitting to the next line where
 140           # necessary.
 141           column = value_indent_level
 142           for i, word in enumerate(words):
 143             # This is slightly too conservative since not every word will be
 144             # followed by the closing characters...
 145             if i > 0 and (column + len(word) + 1 + closing_chars > WRAP_COLUMN):
 146               s = s.rstrip()  # remove any trailing whitespace
 147               s += '\n' + ' ' * value_indent_level
 148               column = value_indent_level
 149             s += word + ' '
 150             column += len(word) + 1
 151           s = s.rstrip()  # remove any trailing whitespace
 152           s += '"'
 153         s = s.rstrip()  # remove any trailing whitespace
 154
 155       # Pretty-print the child nodes.
 156       if node.childNodes:
 157         s += '>'
 158         # Calculate the new indent level for child nodes.
 159         new_indent = indent
 160         if node.tagName not in self.tags_that_dont_indent:
 161           new_indent += 2
 162         child_nodes = node.childNodes
 163
 164         # Recursively pretty-print the child nodes.
 165         child_nodes = [self.PrettyPrintNode(n, indent=new_indent)
 166                        for n in child_nodes]
 167         child_nodes = [c for c in child_nodes if len(c.strip()) > 0]
 168
 169         # Determine whether we can fit the entire node on a single line.
 170         close_tag = '</%s>' % node.tagName
 171         space_left = WRAP_COLUMN - LastLineLength(s) - len(close_tag)
 172         if (node.tagName in self.tags_that_allow_single_line and
 173             len(child_nodes) == 1 and
 174             len(child_nodes[0].strip()) <= space_left):
 175           s += child_nodes[0].strip()
 176         else:
 177           s += '\n' * newlines_after_open + '\n'.join(child_nodes)
 178           s += '\n' * newlines_before_close + ' ' * indent
 179         s += close_tag
 180       else:
 181         s += '/>'
 182       s += '\n' * newlines_after_close
 183       return s
 184
 185     # Handle comment nodes.
 186     if node.nodeType == xml.dom.minidom.Node.COMMENT_NODE:
 187       return '<!--%s-->\n' % node.data
 188
 189     # Ignore other node types. This could be a processing instruction
 190     # (<? ... ?>) or cdata section (<![CDATA[...]]!>), neither of which are
 191     # legal in the histograms XML at present.
 192     logging.error('Ignoring unrecognized node data: %s' % node.toxml())
 193     raise Error()