tools/metrics/histograms/pretty_print.py

   1 #!/usr/bin/env python
   2 # Copyright 2013 The Chromium Authors. All rights reserved.
   3 # Use of this source code is governed by a BSD-style license that can be
   4 # found in the LICENSE file.
   5
   6 """Pretty-prints the histograms.xml file, alphabetizing tags, wrapping text
   7 at 80 chars, enforcing standard attribute ordering, and standardizing
   8 indentation.
   9
  10 This is quite a bit more complicated than just calling tree.toprettyxml();
  11 we need additional customization, like special attribute ordering in tags
  12 and wrapping text nodes, so we implement our own full custom XML pretty-printer.
  13 """
  14
  15 from __future__ import with_statement
  16
  17 import diffutil
  18 import json
  19 import logging
  20 import os
  21 import shutil
  22 import sys
  23 import textwrap
  24 import xml.dom.minidom
  25
  26 sys.path.insert(1, os.path.join(sys.path[0], '..', '..', 'python'))
  27 from google import path_utils
  28
  29 WRAP_COLUMN = 80
  30
  31 # Desired order for tag attributes; attributes listed here will appear first,
  32 # and in the same order as in these lists.
  33 # { tag_name: [attribute_name, ...] }
  34 ATTRIBUTE_ORDER = {
  35   'enum': ['name', 'type'],
  36   'histogram': ['name', 'enum', 'units'],
  37   'int': ['value', 'label'],
  38   'fieldtrial': ['name', 'separator', 'ordering'],
  39   'group': ['name', 'label'],
  40   'affected-histogram': ['name'],
  41   'with-group': ['name'],
  42 }
  43
  44 # Tag names for top-level nodes whose children we don't want to indent.
  45 TAGS_THAT_DONT_INDENT = [
  46   'histogram-configuration',
  47   'histograms',
  48   'fieldtrials',
  49   'enums'
  50 ]
  51
  52 # Extra vertical spacing rules for special tag names.
  53 # {tag_name: (newlines_after_open, newlines_before_close, newlines_after_close)}
  54 TAGS_THAT_HAVE_EXTRA_NEWLINE = {
  55   'histogram-configuration': (2, 1, 1),
  56   'histograms': (2, 1, 1),
  57   'fieldtrials': (2, 1, 1),
  58   'enums': (2, 1, 1),
  59   'histogram': (1, 1, 1),
  60   'enum': (1, 1, 1),
  61   'fieldtrial': (1, 1, 1),
  62 }
  63
  64 # Tags that we allow to be squished into a single line for brevity.
  65 TAGS_THAT_ALLOW_SINGLE_LINE = [
  66   'summary',
  67   'int',
  68 ]
  69
  70 # Tags whose children we want to alphabetize. The key is the parent tag name,
  71 # and the value is a pair of the tag name of the children we want to sort,
  72 # and a key function that maps each child node to the desired sort key.
  73 ALPHABETIZATION_RULES = {
  74   'histograms': ('histogram', lambda n: n.attributes['name'].value.lower()),
  75   'enums': ('enum', lambda n: n.attributes['name'].value.lower()),
  76   'enum': ('int', lambda n: int(n.attributes['value'].value)),
  77   'fieldtrials': ('fieldtrial', lambda n: n.attributes['name'].value.lower()),
  78   'fieldtrial': ('affected-histogram',
  79                  lambda n: n.attributes['name'].value.lower()),
  80 }
  81
  82
  83 class Error(Exception):
  84   pass
  85
  86
  87 def LastLineLength(s):
  88   """Returns the length of the last line in s.
  89
  90   Args:
  91     s: A multi-line string, including newlines.
  92
  93   Returns:
  94     The length of the last line in s, in characters.
  95   """
  96   if s.rfind('\n') == -1: return len(s)
  97   return len(s) - s.rfind('\n') - len('\n')
  98
  99
 100 def XmlEscape(s):
 101   """XML-escapes the given string, replacing magic characters (&<>") with their
 102   escaped equivalents."""
 103   s = s.replace("&", "&amp;").replace("<", "&lt;")
 104   s = s.replace("\"", "&quot;").replace(">", "&gt;")
 105   return s
 106
 107
 108 def PrettyPrintNode(node, indent=0):
 109   """Pretty-prints the given XML node at the given indent level.
 110
 111   Args:
 112     node: The minidom node to pretty-print.
 113     indent: The current indent level.
 114
 115   Returns:
 116     The pretty-printed string (including embedded newlines).
 117
 118   Raises:
 119     Error if the XML has unknown tags or attributes.
 120   """
 121   # Handle the top-level document node.
 122   if node.nodeType == xml.dom.minidom.Node.DOCUMENT_NODE:
 123     return '\n'.join([PrettyPrintNode(n) for n in node.childNodes])
 124
 125   # Handle text nodes.
 126   if node.nodeType == xml.dom.minidom.Node.TEXT_NODE:
 127     # Wrap each paragraph in the text to fit in the 80 column limit.
 128     wrapper = textwrap.TextWrapper()
 129     wrapper.initial_indent = ' ' * indent
 130     wrapper.subsequent_indent = ' ' * indent
 131     wrapper.break_on_hyphens = False
 132     wrapper.break_long_words = False
 133     wrapper.width = WRAP_COLUMN
 134     text = XmlEscape(node.data)
 135     # Remove any common indent.
 136     text = textwrap.dedent(text.strip('\n'))
 137     lines = text.split('\n')
 138     # Split the text into paragraphs at blank line boundaries.
 139     paragraphs = [[]]
 140     for l in lines:
 141       if len(l.strip()) == 0 and len(paragraphs[-1]) > 0:
 142         paragraphs.append([])
 143       else:
 144         paragraphs[-1].append(l)
 145     # Remove trailing empty paragraph if present.
 146     if len(paragraphs) > 0 and len(paragraphs[-1]) == 0:
 147       paragraphs = paragraphs[:-1]
 148     # Wrap each paragraph and separate with two newlines.
 149     return '\n\n'.join([wrapper.fill('\n'.join(p)) for p in paragraphs])
 150
 151   # Handle element nodes.
 152   if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE:
 153     newlines_after_open, newlines_before_close, newlines_after_close = (
 154       TAGS_THAT_HAVE_EXTRA_NEWLINE.get(node.tagName, (1, 1, 0)))
 155     # Open the tag.
 156     s = ' ' * indent + '<' + node.tagName
 157
 158     # Calculate how much space to allow for the '>' or '/>'.
 159     closing_chars = 1
 160     if not node.childNodes:
 161       closing_chars = 2
 162
 163     # Pretty-print the attributes.
 164     attributes = node.attributes.keys()
 165     if attributes:
 166       # Reorder the attributes.
 167       if not node.tagName in ATTRIBUTE_ORDER:
 168         unrecognized_attributes = attributes;
 169       else:
 170         unrecognized_attributes = (
 171           [a for a in attributes if not a in ATTRIBUTE_ORDER[node.tagName]])
 172         attributes = (
 173           [a for a in ATTRIBUTE_ORDER[node.tagName] if a in attributes])
 174
 175       for a in unrecognized_attributes:
 176         logging.error(
 177             'Unrecognized attribute "%s" in tag "%s"' % (a, node.tagName))
 178       if unrecognized_attributes:
 179         raise Error()
 180
 181       for a in attributes:
 182         value = XmlEscape(node.attributes[a].value)
 183         # Replace sequences of whitespace with single spaces.
 184         words = value.split()
 185         a_str = ' %s="%s"' % (a, ' '.join(words))
 186         # Start a new line if the attribute will make this line too long.
 187         if LastLineLength(s) + len(a_str) + closing_chars > WRAP_COLUMN:
 188           s += '\n' + ' ' * (indent + 3)
 189         # Output everything up to the first quote.
 190         s += ' %s="' % (a)
 191         value_indent_level = LastLineLength(s)
 192         # Output one word at a time, splitting to the next line where necessary.
 193         column = value_indent_level
 194         for i, word in enumerate(words):
 195           # This is slightly too conservative since not every word will be
 196           # followed by the closing characters...
 197           if i > 0 and (column + len(word) + 1 + closing_chars > WRAP_COLUMN):
 198             s = s.rstrip()  # remove any trailing whitespace
 199             s += '\n' + ' ' * value_indent_level
 200             column = value_indent_level
 201           s += word + ' '
 202           column += len(word) + 1
 203         s = s.rstrip()  # remove any trailing whitespace
 204         s += '"'
 205       s = s.rstrip()  # remove any trailing whitespace
 206
 207     # Pretty-print the child nodes.
 208     if node.childNodes:
 209       s += '>'
 210       # Calculate the new indent level for child nodes.
 211       new_indent = indent
 212       if node.tagName not in TAGS_THAT_DONT_INDENT:
 213         new_indent += 2
 214       child_nodes = node.childNodes
 215
 216       # Recursively pretty-print the child nodes.
 217       child_nodes = [PrettyPrintNode(n, indent=new_indent) for n in child_nodes]
 218       child_nodes = [c for c in child_nodes if len(c.strip()) > 0]
 219
 220       # Determine whether we can fit the entire node on a single line.
 221       close_tag = '</%s>' % node.tagName
 222       space_left = WRAP_COLUMN - LastLineLength(s) - len(close_tag)
 223       if (node.tagName in TAGS_THAT_ALLOW_SINGLE_LINE and
 224           len(child_nodes) == 1 and len(child_nodes[0].strip()) <= space_left):
 225         s += child_nodes[0].strip()
 226       else:
 227         s += '\n' * newlines_after_open + '\n'.join(child_nodes)
 228         s += '\n' * newlines_before_close + ' ' * indent
 229       s += close_tag
 230     else:
 231       s += '/>'
 232     s += '\n' * newlines_after_close
 233     return s
 234
 235   # Handle comment nodes.
 236   if node.nodeType == xml.dom.minidom.Node.COMMENT_NODE:
 237     return '<!--%s-->\n' % node.data
 238
 239   # Ignore other node types. This could be a processing instruction (<? ... ?>)
 240   # or cdata section (<![CDATA[...]]!>), neither of which are legal in the
 241   # histograms XML at present.
 242   logging.error('Ignoring unrecognized node data: %s' % node.toxml())
 243   raise Error()
 244
 245
def unsafeAppendChild(parent, child):
  """Append child to parent's list of children, ignoring the possibility that it
  is already in another node's childNodes list.  Requires that the previous
  parent of child is discarded (to avoid non-tree DOM graphs).
  This can provide a significant speedup as O(n^2) operations are removed (in
  particular, each child insertion avoids the need to traverse the old parent's
  entire list of children).

  Args:
    parent: The node to append to.
    child: The node to append.  It is NOT removed from the childNodes list of
        its previous parent, if it had one.
  """
  # Clear the parent link first so that appendChild() does not try to detach
  # the child from its old parent.
  child.parentNode = None
  parent.appendChild(child)
  child.parentNode = parent
 256
 257
 258 def TransformByAlphabetizing(node):
 259   """Transform the given XML by alphabetizing specific node types according to
 260   the rules in ALPHABETIZATION_RULES.
 261
 262   Args:
 263     node: The minidom node to transform.
 264
 265   Returns:
 266     The minidom node, with children appropriately alphabetized. Note that the
 267     transformation is done in-place, i.e. the original minidom tree is modified
 268     directly.
 269   """
 270   if node.nodeType != xml.dom.minidom.Node.ELEMENT_NODE:
 271     for c in node.childNodes: TransformByAlphabetizing(c)
 272     return node
 273
 274   # Element node with a tag name that we alphabetize the children of?
 275   if node.tagName in ALPHABETIZATION_RULES:
 276     # Put subnodes in a list of node,key pairs to allow for custom sorting.
 277     subtag, key_function = ALPHABETIZATION_RULES[node.tagName]
 278     subnodes = []
 279     last_key = -1
 280     for c in node.childNodes:
 281       if (c.nodeType == xml.dom.minidom.Node.ELEMENT_NODE and
 282           c.tagName == subtag):
 283         last_key = key_function(c)
 284       # Subnodes that we don't want to rearrange use the last node's key,
 285       # so they stay in the same relative position.
 286       subnodes.append( (c, last_key) )
 287
 288     # Sort the subnode list.
 289     subnodes.sort(key=lambda pair: pair[1])
 290
 291     # Re-add the subnodes, transforming each recursively.
 292     while node.firstChild:
 293       node.removeChild(node.firstChild)
 294     for (c, _) in subnodes:
 295       unsafeAppendChild(node, TransformByAlphabetizing(c))
 296     return node
 297
 298   # Recursively handle other element nodes and other node types.
 299   for c in node.childNodes: TransformByAlphabetizing(c)
 300   return node
 301
 302
 303 def PrettyPrint(raw_xml):
 304   """Pretty-print the given XML.
 305
 306   Args:
 307     xml: The contents of the histograms XML file, as a string.
 308
 309   Returns:
 310     The pretty-printed version.
 311   """
 312   tree = xml.dom.minidom.parseString(raw_xml)
 313   tree = TransformByAlphabetizing(tree)
 314   return PrettyPrintNode(tree)
 315
 316
 317 def main():
 318   logging.basicConfig(level=logging.INFO)
 319
 320   presubmit = ('--presubmit' in sys.argv)
 321
 322   histograms_filename = 'histograms.xml'
 323   histograms_backup_filename = 'histograms.before.pretty-print.xml'
 324
 325   script_dir = path_utils.ScriptDir()
 326
 327   histograms_pathname = os.path.join(script_dir, histograms_filename)
 328   histograms_backup_pathname = os.path.join(script_dir,
 329                                             histograms_backup_filename)
 330
 331   logging.info('Loading %s...' % histograms_filename)
 332   with open(histograms_pathname, 'rb') as f:
 333     xml = f.read()
 334
 335   # Check there are no CR ('\r') characters in the file.
 336   if '\r' in xml:
 337     logging.info('DOS-style line endings (CR characters) detected - these are '
 338                  'not allowed. Please run dos2unix %s' % histograms_filename)
 339     sys.exit(1)
 340
 341   logging.info('Pretty-printing...')
 342   try:
 343     pretty = PrettyPrint(xml)
 344   except Error:
 345     logging.error('Aborting parsing due to fatal errors.')
 346     sys.exit(1)
 347
 348   if xml == pretty:
 349     logging.info('%s is correctly pretty-printed.' % histograms_filename)
 350     sys.exit(0)
 351   if presubmit:
 352     logging.info('%s is not formatted correctly; run pretty_print.py to fix.' %
 353                  histograms_filename)
 354     sys.exit(1)
 355   if not diffutil.PromptUserToAcceptDiff(
 356       xml, pretty,
 357       'Is the prettified version acceptable?'):
 358     logging.error('Aborting')
 359     return
 360
 361   logging.info('Creating backup file %s' % histograms_backup_filename)
 362   shutil.move(histograms_pathname, histograms_backup_pathname)
 363
 364   logging.info('Writing new %s file' % histograms_filename)
 365   with open(histograms_pathname, 'wb') as f:
 366     f.write(pretty)
 367
 368
# Run the pretty-printer only when executed as a script, not when imported.
if __name__ == '__main__':
  main()