tools/metrics/histograms/pretty_print.py

   1 # Copyright 2013 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 """Pretty-prints the histograms.xml file, alphabetizing tags, wrapping text
   6 at 80 chars, enforcing standard attribute ordering, and standardizing
   7 indentation.
   8
   9 This is quite a bit more complicated than just calling tree.toprettyxml();
  10 we need additional customization, like special attribute ordering in tags
  11 and wrapping text nodes, so we implement our own full custom XML pretty-printer.
  12 """
  13
  14 from __future__ import with_statement
  15
  16 import diffutil
  17 import json
  18 import logging
  19 import shutil
  20 import sys
  21 import textwrap
  22 import xml.dom.minidom
  23
  24
  25 WRAP_COLUMN = 80
  26
  27 # Desired order for tag attributes; attributes listed here will appear first,
  28 # and in the same order as in these lists.
  29 # { tag_name: [attribute_name, ...] }
  30 ATTRIBUTE_ORDER = {
  31   'enum': ['name', 'type'],
  32   'histogram': ['name', 'enum', 'units'],
  33   'int': ['value', 'label'],
  34   'fieldtrial': ['name', 'separator', 'ordering'],
  35   'group': ['name', 'label'],
  36   'affected-histogram': ['name'],
  37   'with-group': ['name'],
  38 }
  39
  40 # Tag names for top-level nodes whose children we don't want to indent.
  41 TAGS_THAT_DONT_INDENT = [
  42   'histogram-configuration',
  43   'histograms',
  44   'fieldtrials',
  45   'enums'
  46 ]
  47
  48 # Extra vertical spacing rules for special tag names.
  49 # {tag_name: (newlines_after_open, newlines_before_close, newlines_after_close)}
  50 TAGS_THAT_HAVE_EXTRA_NEWLINE = {
  51   'histogram-configuration': (2, 1, 1),
  52   'histograms': (2, 1, 1),
  53   'fieldtrials': (2, 1, 1),
  54   'enums': (2, 1, 1),
  55   'histogram': (1, 1, 1),
  56   'enum': (1, 1, 1),
  57   'fieldtrial': (1, 1, 1),
  58 }
  59
  60 # Tags that we allow to be squished into a single line for brevity.
  61 TAGS_THAT_ALLOW_SINGLE_LINE = [
  62   'summary',
  63   'int',
  64 ]
  65
  66 # Tags whose children we want to alphabetize. The key is the parent tag name,
  67 # and the value is a pair of the tag name of the children we want to sort,
  68 # and a key function that maps each child node to the desired sort key.
  69 ALPHABETIZATION_RULES = {
  70   'histograms': ('histogram', lambda n: n.attributes['name'].value.lower()),
  71   'enums': ('enum', lambda n: n.attributes['name'].value.lower()),
  72   'enum': ('int', lambda n: int(n.attributes['value'].value)),
  73   'fieldtrials': ('fieldtrial', lambda n: n.attributes['name'].value.lower()),
  74   'fieldtrial': ('affected-histogram',
  75                  lambda n: n.attributes['name'].value.lower()),
  76 }
  77
  78
  79 class Error(Exception):
  80   pass
  81
  82
  83 def LastLineLength(s):
  84   """Returns the length of the last line in s.
  85
  86   Args:
  87     s: A multi-line string, including newlines.
  88
  89   Returns:
  90     The length of the last line in s, in characters.
  91   """
  92   if s.rfind('\n') == -1: return len(s)
  93   return len(s) - s.rfind('\n') - len('\n')
  94
  95
  96 def XmlEscape(s):
  97   """XML-escapes the given string, replacing magic characters (&<>") with their
  98   escaped equivalents."""
  99   s = s.replace("&", "&amp;").replace("<", "&lt;")
 100   s = s.replace("\"", "&quot;").replace(">", "&gt;")
 101   return s
 102
 103
 104 def PrettyPrintNode(node, indent=0):
 105   """Pretty-prints the given XML node at the given indent level.
 106
 107   Args:
 108     node: The minidom node to pretty-print.
 109     indent: The current indent level.
 110
 111   Returns:
 112     The pretty-printed string (including embedded newlines).
 113
 114   Raises:
 115     Error if the XML has unknown tags or attributes.
 116   """
 117   # Handle the top-level document node.
 118   if node.nodeType == xml.dom.minidom.Node.DOCUMENT_NODE:
 119     return '\n'.join([PrettyPrintNode(n) for n in node.childNodes])
 120
 121   # Handle text nodes.
 122   if node.nodeType == xml.dom.minidom.Node.TEXT_NODE:
 123     # Wrap each paragraph in the text to fit in the 80 column limit.
 124     wrapper = textwrap.TextWrapper()
 125     wrapper.initial_indent = ' ' * indent
 126     wrapper.subsequent_indent = ' ' * indent
 127     wrapper.break_on_hyphens = False
 128     wrapper.break_long_words = False
 129     wrapper.width = WRAP_COLUMN
 130     text = XmlEscape(node.data)
 131     # Remove any common indent.
 132     text = textwrap.dedent(text.strip('\n'))
 133     lines = text.split('\n')
 134     # Split the text into paragraphs at blank line boundaries.
 135     paragraphs = [[]]
 136     for l in lines:
 137       if len(l.strip()) == 0 and len(paragraphs[-1]) > 0:
 138         paragraphs.append([])
 139       else:
 140         paragraphs[-1].append(l)
 141     # Remove trailing empty paragraph if present.
 142     if len(paragraphs) > 0 and len(paragraphs[-1]) == 0:
 143       paragraphs = paragraphs[:-1]
 144     # Wrap each paragraph and separate with two newlines.
 145     return '\n\n'.join([wrapper.fill('\n'.join(p)) for p in paragraphs])
 146
 147   # Handle element nodes.
 148   if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE:
 149     newlines_after_open, newlines_before_close, newlines_after_close = (
 150       TAGS_THAT_HAVE_EXTRA_NEWLINE.get(node.tagName, (1, 1, 0)))
 151     # Open the tag.
 152     s = ' ' * indent + '<' + node.tagName
 153
 154     # Calculate how much space to allow for the '>' or '/>'.
 155     closing_chars = 1
 156     if not node.childNodes:
 157       closing_chars = 2
 158
 159     # Pretty-print the attributes.
 160     attributes = node.attributes.keys()
 161     if attributes:
 162       # Reorder the attributes.
 163       if not node.tagName in ATTRIBUTE_ORDER:
 164         unrecognized_attributes = attributes;
 165       else:
 166         unrecognized_attributes = (
 167           [a for a in attributes if not a in ATTRIBUTE_ORDER[node.tagName]])
 168         attributes = (
 169           [a for a in ATTRIBUTE_ORDER[node.tagName] if a in attributes])
 170
 171       for a in unrecognized_attributes:
 172         logging.error(
 173             'Unrecognized attribute "%s" in tag "%s"' % (a, node.tagName))
 174       if unrecognized_attributes:
 175         raise Error()
 176
 177       for a in attributes:
 178         value = XmlEscape(node.attributes[a].value)
 179         # Replace sequences of whitespace with single spaces.
 180         words = value.split()
 181         a_str = ' %s="%s"' % (a, ' '.join(words))
 182         # Start a new line if the attribute will make this line too long.
 183         if LastLineLength(s) + len(a_str) + closing_chars > WRAP_COLUMN:
 184           s += '\n' + ' ' * (indent + 3)
 185         # Output everything up to the first quote.
 186         s += ' %s="' % (a)
 187         value_indent_level = LastLineLength(s)
 188         # Output one word at a time, splitting to the next line where necessary.
 189         column = value_indent_level
 190         for i, word in enumerate(words):
 191           # This is slightly too conservative since not every word will be
 192           # followed by the closing characters...
 193           if i > 0 and (column + len(word) + 1 + closing_chars > WRAP_COLUMN):
 194             s = s.rstrip()  # remove any trailing whitespace
 195             s += '\n' + ' ' * value_indent_level
 196             column = value_indent_level
 197           s += word + ' '
 198           column += len(word) + 1
 199         s = s.rstrip()  # remove any trailing whitespace
 200         s += '"'
 201       s = s.rstrip()  # remove any trailing whitespace
 202
 203     # Pretty-print the child nodes.
 204     if node.childNodes:
 205       s += '>'
 206       # Calculate the new indent level for child nodes.
 207       new_indent = indent
 208       if node.tagName not in TAGS_THAT_DONT_INDENT:
 209         new_indent += 2
 210       child_nodes = node.childNodes
 211
 212       # Recursively pretty-print the child nodes.
 213       child_nodes = [PrettyPrintNode(n, indent=new_indent) for n in child_nodes]
 214       child_nodes = [c for c in child_nodes if len(c.strip()) > 0]
 215
 216       # Determine whether we can fit the entire node on a single line.
 217       close_tag = '</%s>' % node.tagName
 218       space_left = WRAP_COLUMN - LastLineLength(s) - len(close_tag)
 219       if (node.tagName in TAGS_THAT_ALLOW_SINGLE_LINE and
 220           len(child_nodes) == 1 and len(child_nodes[0].strip()) <= space_left):
 221         s += child_nodes[0].strip()
 222       else:
 223         s += '\n' * newlines_after_open + '\n'.join(child_nodes)
 224         s += '\n' * newlines_before_close + ' ' * indent
 225       s += close_tag
 226     else:
 227       s += '/>'
 228     s += '\n' * newlines_after_close
 229     return s
 230
 231   # Handle comment nodes.
 232   if node.nodeType == xml.dom.minidom.Node.COMMENT_NODE:
 233     return '<!--%s-->\n' % node.data
 234
 235   # Ignore other node types. This could be a processing instruction (<? ... ?>)
 236   # or cdata section (<![CDATA[...]]!>), neither of which are legal in the
 237   # histograms XML at present.
 238   logging.error('Ignoring unrecognized node data: %s' % node.toxml())
 239   raise Error()
 240
 241
def unsafeAppendChild(parent, child):
  """Append child to parent's list of children, ignoring the possibility that it
  is already in another node's childNodes list.  Requires that the previous
  parent of child is discarded (to avoid non-tree DOM graphs).
  This can provide a significant speedup as O(n^2) operations are removed (in
  particular, each child insertion avoids the need to traverse the old parent's
  entire list of children).

  Args:
    parent: The minidom node that receives the child.
    child: The minidom node to append. It is NOT removed from the childNodes
        list of the node it previously belonged to.
  """
  # With no parentNode set, appendChild() has no old parent to remove the
  # child from, which is the expensive step this function exists to skip.
  child.parentNode = None
  parent.appendChild(child)
  # Record the new parent explicitly rather than relying on appendChild().
  child.parentNode = parent
 251   child.parentNode = parent
 252
 253
 254 def TransformByAlphabetizing(node):
 255   """Transform the given XML by alphabetizing specific node types according to
 256   the rules in ALPHABETIZATION_RULES.
 257
 258   Args:
 259     node: The minidom node to transform.
 260
 261   Returns:
 262     The minidom node, with children appropriately alphabetized. Note that the
 263     transformation is done in-place, i.e. the original minidom tree is modified
 264     directly.
 265   """
 266   if node.nodeType != xml.dom.minidom.Node.ELEMENT_NODE:
 267     for c in node.childNodes: TransformByAlphabetizing(c)
 268     return node
 269
 270   # Element node with a tag name that we alphabetize the children of?
 271   if node.tagName in ALPHABETIZATION_RULES:
 272     subtag, key_function = ALPHABETIZATION_RULES[node.tagName]
 273     # Remove the subnodes to be alphabetized.
 274     clone = node.cloneNode(False)
 275     subnodes = []
 276     for c in node.childNodes:
 277       if (c.nodeType == xml.dom.minidom.Node.ELEMENT_NODE and
 278           c.tagName == subtag):
 279         subnodes.append(c)
 280         continue
 281       unsafeAppendChild(clone, c)
 282     # Sort the subnodes.
 283     subnodes.sort(key=key_function)
 284     # Readd the subnodes, transforming each recursively.
 285     for c in subnodes:
 286       unsafeAppendChild(clone, TransformByAlphabetizing(c))
 287     node = clone
 288     return node
 289
 290   # Recursively handle other element nodes and other node types.
 291   for c in node.childNodes: TransformByAlphabetizing(c)
 292   return node
 293
 294
 295 def PrettyPrint(raw_xml):
 296   """Pretty-print the given XML.
 297
 298   Args:
 299     xml: The contents of the histograms XML file, as a string.
 300
 301   Returns:
 302     The pretty-printed version.
 303   """
 304   tree = xml.dom.minidom.parseString(raw_xml)
 305   tree = TransformByAlphabetizing(tree)
 306   return PrettyPrintNode(tree)
 307
 308
 309 def main():
 310   logging.basicConfig(level=logging.INFO)
 311
 312   presubmit = ('--presubmit' in sys.argv)
 313
 314   logging.info('Loading histograms.xml...')
 315   with open('histograms.xml', 'rb') as f:
 316     xml = f.read()
 317
 318   # Check there are no CR ('\r') characters in the file.
 319   if '\r' in xml:
 320     logging.info('DOS-style line endings (CR characters) detected - these are '
 321                  'not allowed. Please run dos2unix histograms.xml')
 322     sys.exit(1)
 323
 324   logging.info('Pretty-printing...')
 325   try:
 326     pretty = PrettyPrint(xml)
 327   except Error:
 328     logging.error('Aborting parsing due to fatal errors.')
 329     sys.exit(1)
 330
 331   if xml == pretty:
 332     logging.info('histograms.xml is correctly pretty-printed.')
 333     sys.exit(0)
 334   if presubmit:
 335     logging.info('histograms.xml is not formatted correctly; run '
 336                  'pretty_print.py to fix.')
 337     sys.exit(1)
 338   if not diffutil.PromptUserToAcceptDiff(
 339       xml, pretty,
 340       'Is the prettified version acceptable?'):
 341     logging.error('Aborting')
 342     return
 343
 344   logging.info('Creating backup file histograms.before.pretty-print.xml')
 345   shutil.move('histograms.xml', 'histograms.before.pretty-print.xml')
 346
 347   logging.info('Writing new histograms.xml file')
 348   with open('histograms.xml', 'wb') as f:
 349     f.write(pretty)
 350
 351
 352 if __name__ == '__main__':
 353   main()