tools/metrics/histograms/extract_histograms.py

   1 # Copyright 2013 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 """Extract histogram names from the description XML file.
   6
   7 For more information on the format of the XML file, which is self-documenting,
   8 see histograms.xml; however, here is a simple example to get you started. The
   9 XML below will generate the following five histograms:
  10
  11     HistogramTime
  12     HistogramEnum
  13     HistogramEnum_Chrome
  14     HistogramEnum_IE
  15     HistogramEnum_Firefox
  16
  17 <histogram-configuration>
  18
  19 <histograms>
  20
  21 <histogram name="HistogramTime" units="milliseconds">
  22   <summary>A brief description.</summary>
  23   <details>This is a more thorough description of this histogram.</details>
  24 </histogram>
  25
  26 <histogram name="HistogramEnum" enum="MyEnumType">
  27   <summary>This histogram sports an enum value type.</summary>
  28 </histogram>
  29
  30 </histograms>
  31
  32 <enums>
  33
  34 <enum name="MyEnumType" type="int">
  35   <summary>This is an example enum type, where the values mean little.</summary>
  36   <int value="1" label="FIRST_VALUE">This is the first value.</int>
  37   <int value="2" label="SECOND_VALUE">This is the second value.</int>
  38 </enum>
  39
  40 </enums>
  41
  42 <fieldtrials>
  43
  44 <fieldtrial name="BrowserType">
  45   <group name="Chrome"/>
  46   <group name="IE"/>
  47   <group name="Firefox"/>
  48   <affected-histogram name="HistogramEnum"/>
  49 </fieldtrial>
  50
  51 </fieldtrials>
  52
  53 </histogram-configuration>
  54
  55 """
  56
  57 import copy
  58 import logging
  59 import xml.dom.minidom
  60
  61
# Maximum number of times a field trial whose affected histograms are not yet
# all known is put back on the reprocessing queue before it is reported as
# missing a dependency.
MAX_FIELDTRIAL_DEPENDENCY_DEPTH = 5
  63
  64
  65 class Error(Exception):
  66   pass
  67
  68
  69 def _JoinChildNodes(tag):
  70   """Join child nodes into a single text.
  71
  72   Applicable to leafs like 'summary' and 'detail'.
  73
  74   Args:
  75     tag: parent node
  76
  77   Returns:
  78     a string with concatenated nodes' text representation.
  79   """
  80   return ''.join(c.toxml() for c in tag.childNodes).strip()
  81
  82
  83 def _NormalizeString(s):
  84   """Normalizes a string (possibly of multiple lines) by replacing each
  85   whitespace sequence with a single space.
  86
  87   Args:
  88     s: The string to normalize, e.g. '  \n a  b c\n d  '
  89
  90   Returns:
  91     The normalized string, e.g. 'a b c d'
  92   """
  93   return ' '.join(s.split())
  94
  95
  96 def _NormalizeAllAttributeValues(node):
  97   """Recursively normalizes all tag attribute values in the given tree.
  98
  99   Args:
 100     node: The minidom node to be normalized.
 101
 102   Returns:
 103     The normalized minidom node.
 104   """
 105   if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE:
 106     for a in node.attributes.keys():
 107       node.attributes[a].value = _NormalizeString(node.attributes[a].value)
 108
 109   for c in node.childNodes: _NormalizeAllAttributeValues(c)
 110   return node
 111
 112
 113 def _ExpandHistogramNameWithFieldTrial(group_name, histogram_name, fieldtrial):
 114   """Creates a new histogram name based on the field trial group.
 115
 116   Args:
 117     group_name: The name of the field trial group. May be empty.
 118     histogram_name: The name of the histogram. May be of the form
 119       Group.BaseName or BaseName
 120     field_trial: The FieldTrial XML element.
 121
 122   Returns:
 123     A string with the expanded histogram name.
 124
 125   Raises:
 126     Error if the expansion can't be done.
 127   """
 128   if fieldtrial.hasAttribute('separator'):
 129     separator = fieldtrial.getAttribute('separator')
 130   else:
 131     separator = '_'
 132
 133   if fieldtrial.hasAttribute('ordering'):
 134     ordering = fieldtrial.getAttribute('ordering')
 135   else:
 136     ordering = 'suffix'
 137   if ordering not in ['prefix', 'suffix']:
 138     logging.error('ordering needs to be prefix or suffix, value is %s' %
 139                   ordering)
 140     raise Error()
 141
 142   if not group_name:
 143     return histogram_name
 144
 145   if ordering == 'suffix':
 146     return histogram_name + separator + group_name
 147
 148   # For prefixes, the group_name is inserted between the "cluster" and the
 149   # "remainder", e.g. Foo.BarHist expanded with gamma becomes Foo.gamma_BarHist.
 150   sections = histogram_name.split('.')
 151   if len(sections) <= 1:
 152     logging.error(
 153       'Prefix Field Trial expansions require histogram names which include a '
 154       'dot separator. Histogram name is %s, and Field Trial is %s' %
 155       (histogram_name, fieldtrial.getAttribute('name')))
 156     raise Error()
 157
 158   cluster = sections[0] + '.'
 159   remainder = '.'.join(sections[1:])
 160   return cluster + group_name + separator + remainder
 161
 162
 163 def _ExtractEnumsFromXmlTree(tree):
 164   """Extract all <enum> nodes in the tree into a dictionary."""
 165
 166   enums = {}
 167   have_errors = False
 168
 169   last_name = None
 170   for enum in tree.getElementsByTagName("enum"):
 171     if enum.getAttribute('type') != 'int':
 172       logging.error('Unknown enum type %s' % enum.getAttribute('type'))
 173       have_errors = True
 174       continue
 175
 176     name = enum.getAttribute('name')
 177     if last_name is not None and name.lower() < last_name.lower():
 178       logging.error('Enums %s and %s are not in alphabetical order'
 179                     % (last_name, name))
 180       have_errors = True
 181     last_name = name
 182
 183     if name in enums:
 184       logging.error('Duplicate enum %s' % name)
 185       have_errors = True
 186       continue
 187
 188     last_int_value = None
 189     enum_dict = {}
 190     enum_dict['name'] = name
 191     enum_dict['values'] = {}
 192
 193     for int_tag in enum.getElementsByTagName("int"):
 194       value_dict = {}
 195       int_value = int(int_tag.getAttribute('value'))
 196       if last_int_value is not None and int_value < last_int_value:
 197         logging.error('Enum %s int values %d and %d are not in numerical order'
 198                       % (name, last_int_value, int_value))
 199         have_errors = True
 200       last_int_value = int_value
 201       if int_value in enum_dict['values']:
 202         logging.error('Duplicate enum value %d for enum %s' % (int_value, name))
 203         have_errors = True
 204         continue
 205       value_dict['label'] = int_tag.getAttribute('label')
 206       value_dict['summary'] = _JoinChildNodes(int_tag)
 207       enum_dict['values'][int_value] = value_dict
 208
 209     summary_nodes = enum.getElementsByTagName("summary")
 210     if len(summary_nodes) > 0:
 211       enum_dict['summary'] = _NormalizeString(_JoinChildNodes(summary_nodes[0]))
 212
 213     enums[name] = enum_dict
 214
 215   return enums, have_errors
 216
 217
 218 def _ExtractHistogramsFromXmlTree(tree, enums):
 219   """Extract all <histogram> nodes in the tree into a dictionary."""
 220
 221   # Process the histograms. The descriptions can include HTML tags.
 222   histograms = {}
 223   have_errors = False
 224   last_name = None
 225   for histogram in tree.getElementsByTagName("histogram"):
 226     name = histogram.getAttribute('name')
 227     if last_name is not None and name.lower() < last_name.lower():
 228       logging.error('Histograms %s and %s are not in alphabetical order'
 229                     % (last_name, name))
 230       have_errors = True
 231     last_name = name
 232     if name in histograms:
 233       logging.error('Duplicate histogram definition %s' % name)
 234       have_errors = True
 235       continue
 236     histograms[name] = histogram_entry = {}
 237
 238     # Find <summary> tag.
 239     summary_nodes = histogram.getElementsByTagName("summary")
 240     if len(summary_nodes) > 0:
 241       histogram_entry['summary'] = _NormalizeString(
 242           _JoinChildNodes(summary_nodes[0]))
 243     else:
 244       histogram_entry['summary'] = 'TBD'
 245
 246     # Find <obsolete> tag.
 247     obsolete_nodes = histogram.getElementsByTagName("obsolete")
 248     if len(obsolete_nodes) > 0:
 249       reason = _JoinChildNodes(obsolete_nodes[0])
 250       histogram_entry['obsolete'] = reason
 251
 252     # Handle units.
 253     if histogram.hasAttribute('units'):
 254       histogram_entry['units'] = histogram.getAttribute('units')
 255
 256     # Find <details> tag.
 257     details_nodes = histogram.getElementsByTagName("details")
 258     if len(details_nodes) > 0:
 259       histogram_entry['details'] = _NormalizeString(
 260           _JoinChildNodes(details_nodes[0]))
 261
 262     # Handle enum types.
 263     if histogram.hasAttribute('enum'):
 264       enum_name = histogram.getAttribute('enum')
 265       if not enum_name in enums:
 266         logging.error('Unknown enum %s in histogram %s' % (enum_name, name))
 267         have_errors = True
 268       else:
 269         histogram_entry['enum'] = enums[enum_name]
 270
 271   return histograms, have_errors
 272
 273
 274 def _UpdateHistogramsWithFieldTrialInformation(tree, histograms):
 275   """Process field trials' tags and combine with affected histograms.
 276
 277   The histograms dictionary will be updated in-place by adding new histograms
 278   created by combining histograms themselves with field trials targetting these
 279   histograms.
 280
 281   Args:
 282     tree: XML dom tree.
 283     histograms: a dictinary of histograms previously extracted from the tree;
 284
 285   Returns:
 286     True if any errors were found.
 287   """
 288   have_errors = False
 289
 290   # Verify order of fieldtrial fields first.
 291   last_name = None
 292   for fieldtrial in tree.getElementsByTagName("fieldtrial"):
 293     name = fieldtrial.getAttribute('name')
 294     if last_name is not None and name.lower() < last_name.lower():
 295       logging.error('Field trials %s and %s are not in alphabetical order'
 296                     % (last_name, name))
 297       have_errors = True
 298     last_name = name
 299
 300   # Field trials can depend on other field trials, so we need to be careful.
 301   # Make a temporary copy of the list of field trials to use as a queue.
 302   # Field trials whose dependencies have not yet been processed will get
 303   # relegated to the back of the queue to be processed later.
 304   reprocess_queue = []
 305   def GenerateFieldTrials():
 306     for f in tree.getElementsByTagName("fieldtrial"): yield 0, f
 307     for r, f in reprocess_queue: yield r, f
 308
 309   for reprocess_count, fieldtrial in GenerateFieldTrials():
 310     # Check dependencies first
 311     dependencies_valid = True
 312     affected_histograms = fieldtrial.getElementsByTagName('affected-histogram')
 313     for affected_histogram in affected_histograms:
 314       histogram_name = affected_histogram.getAttribute('name')
 315       if not histogram_name in histograms:
 316         # Base histogram is missing
 317         dependencies_valid = False
 318         missing_dependency = histogram_name
 319         break
 320     if not dependencies_valid:
 321       if reprocess_count < MAX_FIELDTRIAL_DEPENDENCY_DEPTH:
 322         reprocess_queue.append( (reprocess_count + 1, fieldtrial) )
 323         continue
 324       else:
 325         logging.error('Field trial %s is missing its dependency %s'
 326                       % (fieldtrial.getAttribute('name'),
 327                          missing_dependency))
 328         have_errors = True
 329         continue
 330
 331     name = fieldtrial.getAttribute('name')
 332     groups = fieldtrial.getElementsByTagName('group')
 333     group_labels = {}
 334     for group in groups:
 335       group_labels[group.getAttribute('name')] = group.getAttribute('label')
 336
 337     last_histogram_name = None
 338     for affected_histogram in affected_histograms:
 339       histogram_name = affected_histogram.getAttribute('name')
 340       if (last_histogram_name is not None
 341           and histogram_name.lower() < last_histogram_name.lower()):
 342         logging.error('Affected histograms %s and %s of field trial %s are not '
 343                       'in alphabetical order'
 344                       % (last_histogram_name, histogram_name, name))
 345         have_errors = True
 346       last_histogram_name = histogram_name
 347       base_description = histograms[histogram_name]
 348       with_groups = affected_histogram.getElementsByTagName('with-group')
 349       if len(with_groups) > 0:
 350         histogram_groups = with_groups
 351       else:
 352         histogram_groups = groups
 353       for group in histogram_groups:
 354         group_name = group.getAttribute('name')
 355         try:
 356           new_histogram_name = _ExpandHistogramNameWithFieldTrial(
 357             group_name, histogram_name, fieldtrial)
 358           if new_histogram_name != histogram_name:
 359             histograms[new_histogram_name] = copy.deepcopy(
 360               histograms[histogram_name])
 361
 362           group_label = group_labels.get(group_name, '')
 363
 364           if not 'fieldtrial_groups' in histograms[new_histogram_name]:
 365             histograms[new_histogram_name]['fieldtrial_groups'] = []
 366           histograms[new_histogram_name]['fieldtrial_groups'].append(group_name)
 367
 368           if not 'fieldtrial_names' in histograms[new_histogram_name]:
 369             histograms[new_histogram_name]['fieldtrial_names'] = []
 370           histograms[new_histogram_name]['fieldtrial_names'].append(name)
 371
 372           if not 'fieldtrial_labels' in histograms[new_histogram_name]:
 373             histograms[new_histogram_name]['fieldtrial_labels'] = []
 374           histograms[new_histogram_name]['fieldtrial_labels'].append(
 375             group_label)
 376
 377         except Error:
 378           have_errors = True
 379
 380   return have_errors
 381
 382
 383 def ExtractHistogramsFromFile(file_handle):
 384   """Compute the histogram names and descriptions from the XML representation.
 385
 386   Args:
 387     file_handle: A file or file-like with XML content.
 388
 389   Returns:
 390     a tuple of (histograms, status) where histograms is a dictionary mapping
 391     histogram names to dictionaries containing histogram descriptions and status
 392     is a boolean indicating if errros were encoutered in processing.
 393   """
 394   tree = xml.dom.minidom.parse(file_handle)
 395   _NormalizeAllAttributeValues(tree)
 396
 397   enums, enum_errors = _ExtractEnumsFromXmlTree(tree)
 398   histograms, histogram_errors = _ExtractHistogramsFromXmlTree(tree, enums)
 399   update_errors = _UpdateHistogramsWithFieldTrialInformation(tree, histograms)
 400
 401   return histograms, enum_errors or histogram_errors or update_errors
 402
 403
 404 def ExtractHistograms(filename):
 405   """Load histogram definitions from a disk file.
 406   Args:
 407     filename: a file path to load data from.
 408
 409   Raises:
 410     Error if the file is not well-formatted.
 411   """
 412   with open(filename, 'r') as f:
 413     histograms, had_errors = ExtractHistogramsFromFile(f)
 414     if had_errors:
 415       logging.error('Error parsing %s' % filename)
 416       raise Error()
 417     return histograms
 418
 419
 420 def ExtractNames(histograms):
 421   return sorted(histograms.keys())