tools/metrics/histograms/extract_histograms.py

   1 # Copyright 2013 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 """Extract histogram names from the description XML file.
   6
   7 For more information on the format of the XML file, which is self-documenting,
   8 see histograms.xml; however, here is a simple example to get you started. The
   9 XML below will generate the following five histograms:
  10
  11     HistogramTime
  12     HistogramEnum
  13     HistogramEnum_Chrome
  14     HistogramEnum_IE
  15     HistogramEnum_Firefox
  16
  17 <histogram-configuration>
  18
  19 <histograms>
  20
  21 <histogram name="HistogramTime" units="milliseconds">
  22   <summary>A brief description.</summary>
  23   <details>This is a more thorough description of this histogram.</details>
  24 </histogram>
  25
  26 <histogram name="HistogramEnum" enum="MyEnumType">
  27   <summary>This histogram sports an enum value type.</summary>
  28 </histogram>
  29
  30 </histograms>
  31
  32 <enums>
  33
  34 <enum name="MyEnumType" type="int">
  35   <summary>This is an example enum type, where the values mean little.</summary>
  36   <int value="1" label="FIRST_VALUE">This is the first value.</int>
  37   <int value="2" label="SECOND_VALUE">This is the second value.</int>
  38 </enum>
  39
  40 </enums>
  41
  42 <histogram_suffixes_list>
  43
  44 <histogram_suffixes name="BrowserType">
  45   <suffix name="Chrome"/>
  46   <suffix name="IE"/>
  47   <suffix name="Firefox"/>
  48   <affected-histogram name="HistogramEnum"/>
  49 </histogram_suffixes>
  50
  51 </histogram_suffixes_list>
  52
  53 </histogram-configuration>
  54
  55 """
  56
  57 import copy
  58 import logging
  59 import xml.dom.minidom
  60
  61 OWNER_FIELD_PLACEHOLDER = (
  62     'Please list the metric\'s owners. Add more owner tags as needed.')
  63
  64 MAX_HISTOGRAM_SUFFIX_DEPENDENCY_DEPTH = 5
  65
  66
  67 class Error(Exception):
  68   pass
  69
  70
  71 def _JoinChildNodes(tag):
  72   """Join child nodes into a single text.
  73
  74   Applicable to leafs like 'summary' and 'detail'.
  75
  76   Args:
  77     tag: parent node
  78
  79   Returns:
  80     a string with concatenated nodes' text representation.
  81   """
  82   return ''.join(c.toxml() for c in tag.childNodes).strip()
  83
  84
  85 def _NormalizeString(s):
  86   """Replaces all whitespace sequences with a single space.
  87
  88   The function properly handles multi-line strings.
  89
  90   Args:
  91     s: The string to normalize, ('  \\n a  b c\\n d  ').
  92
  93   Returns:
  94     The normalized string (a b c d).
  95   """
  96   return ' '.join(s.split())
  97
  98
  99 def _NormalizeAllAttributeValues(node):
 100   """Recursively normalizes all tag attribute values in the given tree.
 101
 102   Args:
 103     node: The minidom node to be normalized.
 104
 105   Returns:
 106     The normalized minidom node.
 107   """
 108   if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE:
 109     for a in node.attributes.keys():
 110       node.attributes[a].value = _NormalizeString(node.attributes[a].value)
 111
 112   for c in node.childNodes:
 113     _NormalizeAllAttributeValues(c)
 114   return node
 115
 116
 117 def _ExpandHistogramNameWithSuffixes(suffix_name, histogram_name,
 118                                      histogram_suffixes_node):
 119   """Creates a new histogram name based on a histogram suffix.
 120
 121   Args:
 122     suffix_name: The suffix string to apply to the histogram name. May be empty.
 123     histogram_name: The name of the histogram. May be of the form
 124       Group.BaseName or BaseName.
 125     histogram_suffixes_node: The histogram_suffixes XML node.
 126
 127   Returns:
 128     A string with the expanded histogram name.
 129
 130   Raises:
 131     Error: if the expansion can't be done.
 132   """
 133   if histogram_suffixes_node.hasAttribute('separator'):
 134     separator = histogram_suffixes_node.getAttribute('separator')
 135   else:
 136     separator = '_'
 137
 138   if histogram_suffixes_node.hasAttribute('ordering'):
 139     ordering = histogram_suffixes_node.getAttribute('ordering')
 140   else:
 141     ordering = 'suffix'
 142   if ordering not in ['prefix', 'suffix']:
 143     logging.error('ordering needs to be prefix or suffix, value is %s',
 144                   ordering)
 145     raise Error()
 146
 147   if not suffix_name:
 148     return histogram_name
 149
 150   if ordering == 'suffix':
 151     return histogram_name + separator + suffix_name
 152
 153   # For prefixes, the suffix_name is inserted between the "cluster" and the
 154   # "remainder", e.g. Foo.BarHist expanded with gamma becomes Foo.gamma_BarHist.
 155   sections = histogram_name.split('.')
 156   if len(sections) <= 1:
 157     logging.error(
 158         'Prefix Field Trial expansions require histogram names which include a '
 159         'dot separator. Histogram name is %s, and Field Trial is %s',
 160         histogram_name, histogram_suffixes_node.getAttribute('name'))
 161     raise Error()
 162
 163   cluster = sections[0] + '.'
 164   remainder = '.'.join(sections[1:])
 165   return cluster + suffix_name + separator + remainder
 166
 167
 168 def _ExtractEnumsFromXmlTree(tree):
 169   """Extract all <enum> nodes in the tree into a dictionary."""
 170
 171   enums = {}
 172   have_errors = False
 173
 174   last_name = None
 175   for enum in tree.getElementsByTagName('enum'):
 176     if enum.getAttribute('type') != 'int':
 177       logging.error('Unknown enum type %s', enum.getAttribute('type'))
 178       have_errors = True
 179       continue
 180
 181     name = enum.getAttribute('name')
 182     if last_name is not None and name.lower() < last_name.lower():
 183       logging.error('Enums %s and %s are not in alphabetical order',
 184                     last_name, name)
 185       have_errors = True
 186     last_name = name
 187
 188     if name in enums:
 189       logging.error('Duplicate enum %s', name)
 190       have_errors = True
 191       continue
 192
 193     last_int_value = None
 194     enum_dict = {}
 195     enum_dict['name'] = name
 196     enum_dict['values'] = {}
 197
 198     for int_tag in enum.getElementsByTagName('int'):
 199       value_dict = {}
 200       int_value = int(int_tag.getAttribute('value'))
 201       if last_int_value is not None and int_value < last_int_value:
 202         logging.error('Enum %s int values %d and %d are not in numerical order',
 203                       name, last_int_value, int_value)
 204         have_errors = True
 205       last_int_value = int_value
 206       if int_value in enum_dict['values']:
 207         logging.error('Duplicate enum value %d for enum %s', int_value, name)
 208         have_errors = True
 209         continue
 210       value_dict['label'] = int_tag.getAttribute('label')
 211       value_dict['summary'] = _JoinChildNodes(int_tag)
 212       enum_dict['values'][int_value] = value_dict
 213
 214     summary_nodes = enum.getElementsByTagName('summary')
 215     if summary_nodes:
 216       enum_dict['summary'] = _NormalizeString(_JoinChildNodes(summary_nodes[0]))
 217
 218     enums[name] = enum_dict
 219
 220   return enums, have_errors
 221
 222
 223 def _ExtractOwners(xml_node):
 224   """Extract all owners into a list from owner tag under |xml_node|."""
 225   owners = []
 226   for owner_node in xml_node.getElementsByTagName('owner'):
 227     owner_entry = _NormalizeString(_JoinChildNodes(owner_node))
 228     if OWNER_FIELD_PLACEHOLDER not in owner_entry:
 229       owners.append(owner_entry)
 230   return owners
 231
 232
 233 def _ExtractHistogramsFromXmlTree(tree, enums):
 234   """Extract all <histogram> nodes in the tree into a dictionary."""
 235
 236   # Process the histograms. The descriptions can include HTML tags.
 237   histograms = {}
 238   have_errors = False
 239   last_name = None
 240   for histogram in tree.getElementsByTagName('histogram'):
 241     name = histogram.getAttribute('name')
 242     if last_name is not None and name.lower() < last_name.lower():
 243       logging.error('Histograms %s and %s are not in alphabetical order',
 244                     last_name, name)
 245       have_errors = True
 246     last_name = name
 247     if name in histograms:
 248       logging.error('Duplicate histogram definition %s', name)
 249       have_errors = True
 250       continue
 251     histograms[name] = histogram_entry = {}
 252
 253     # Find <owner> tag.
 254     owners = _ExtractOwners(histogram)
 255     if owners:
 256       histogram_entry['owners'] = owners
 257
 258     # Find <summary> tag.
 259     summary_nodes = histogram.getElementsByTagName('summary')
 260     if summary_nodes:
 261       histogram_entry['summary'] = _NormalizeString(
 262           _JoinChildNodes(summary_nodes[0]))
 263     else:
 264       histogram_entry['summary'] = 'TBD'
 265
 266     # Find <obsolete> tag.
 267     obsolete_nodes = histogram.getElementsByTagName('obsolete')
 268     if obsolete_nodes:
 269       reason = _JoinChildNodes(obsolete_nodes[0])
 270       histogram_entry['obsolete'] = reason
 271
 272     # Handle units.
 273     if histogram.hasAttribute('units'):
 274       histogram_entry['units'] = histogram.getAttribute('units')
 275
 276     # Find <details> tag.
 277     details_nodes = histogram.getElementsByTagName('details')
 278     if details_nodes:
 279       histogram_entry['details'] = _NormalizeString(
 280           _JoinChildNodes(details_nodes[0]))
 281
 282     # Handle enum types.
 283     if histogram.hasAttribute('enum'):
 284       enum_name = histogram.getAttribute('enum')
 285       if enum_name not in enums:
 286         logging.error('Unknown enum %s in histogram %s', enum_name, name)
 287         have_errors = True
 288       else:
 289         histogram_entry['enum'] = enums[enum_name]
 290
 291   return histograms, have_errors
 292
 293
 294 # Finds an <obsolete> node amongst |node|'s immediate children and returns its
 295 # content as a string. Returns None if no such node exists.
 296 def _GetObsoleteReason(node):
 297   for child in node.childNodes:
 298     if child.localName == 'obsolete':
 299       # There can be at most 1 obsolete element per node.
 300       return _JoinChildNodes(child)
 301   return None
 302
 303
 304 def _UpdateHistogramsWithSuffixes(tree, histograms):
 305   """Process <histogram_suffixes> tags and combine with affected histograms.
 306
 307   The histograms dictionary will be updated in-place by adding new histograms
 308   created by combining histograms themselves with histogram_suffixes targeting
 309   these histograms.
 310
 311   Args:
 312     tree: XML dom tree.
 313     histograms: a dictionary of histograms previously extracted from the tree;
 314
 315   Returns:
 316     True if any errors were found.
 317   """
 318   have_errors = False
 319
 320   histogram_suffix_tag = 'histogram_suffixes'
 321   suffix_tag = 'suffix'
 322   with_tag = 'with-suffix'
 323
 324   # Verify order of histogram_suffixes fields first.
 325   last_name = None
 326   for histogram_suffixes in tree.getElementsByTagName(histogram_suffix_tag):
 327     name = histogram_suffixes.getAttribute('name')
 328     if last_name is not None and name.lower() < last_name.lower():
 329       logging.error('histogram_suffixes %s and %s are not in alphabetical '
 330                     'order', last_name, name)
 331       have_errors = True
 332     last_name = name
 333
 334   # histogram_suffixes can depend on other histogram_suffixes, so we need to be
 335   # careful. Make a temporary copy of the list of histogram_suffixes to use as a
 336   # queue. histogram_suffixes whose dependencies have not yet been processed
 337   # will get relegated to the back of the queue to be processed later.
 338   reprocess_queue = []
 339   def GenerateHistogramSuffixes():
 340     for f in tree.getElementsByTagName(histogram_suffix_tag):
 341       yield 0, f
 342     for r, f in reprocess_queue:
 343       yield r, f
 344
 345   for reprocess_count, histogram_suffixes in GenerateHistogramSuffixes():
 346     # Check dependencies first
 347     dependencies_valid = True
 348     affected_histograms = histogram_suffixes.getElementsByTagName(
 349         'affected-histogram')
 350     for affected_histogram in affected_histograms:
 351       histogram_name = affected_histogram.getAttribute('name')
 352       if histogram_name not in histograms:
 353         # Base histogram is missing
 354         dependencies_valid = False
 355         missing_dependency = histogram_name
 356         break
 357     if not dependencies_valid:
 358       if reprocess_count < MAX_HISTOGRAM_SUFFIX_DEPENDENCY_DEPTH:
 359         reprocess_queue.append((reprocess_count + 1, histogram_suffixes))
 360         continue
 361       else:
 362         logging.error('histogram_suffixes %s is missing its dependency %s',
 363                       histogram_suffixes.getAttribute('name'),
 364                       missing_dependency)
 365         have_errors = True
 366         continue
 367
 368     # If the suffix group has an obsolete tag, all suffixes it generates inherit
 369     # its reason.
 370     group_obsolete_reason = _GetObsoleteReason(histogram_suffixes)
 371
 372     name = histogram_suffixes.getAttribute('name')
 373     suffix_nodes = histogram_suffixes.getElementsByTagName(suffix_tag)
 374     suffix_labels = {}
 375     for suffix in suffix_nodes:
 376       suffix_labels[suffix.getAttribute('name')] = suffix.getAttribute('label')
 377     # Find owners list under current histogram_suffixes tag.
 378     owners = _ExtractOwners(histogram_suffixes)
 379
 380     last_histogram_name = None
 381     for affected_histogram in affected_histograms:
 382       histogram_name = affected_histogram.getAttribute('name')
 383       if (last_histogram_name is not None
 384           and histogram_name.lower() < last_histogram_name.lower()):
 385         logging.error('Affected histograms %s and %s of histogram_suffixes %s '
 386                       'are not in alphabetical order',
 387                       last_histogram_name, histogram_name, name)
 388         have_errors = True
 389       last_histogram_name = histogram_name
 390       with_suffixes = affected_histogram.getElementsByTagName(with_tag)
 391       if with_suffixes:
 392         suffixes_to_add = with_suffixes
 393       else:
 394         suffixes_to_add = suffix_nodes
 395       for suffix in suffixes_to_add:
 396         suffix_name = suffix.getAttribute('name')
 397         try:
 398           new_histogram_name = _ExpandHistogramNameWithSuffixes(
 399               suffix_name, histogram_name, histogram_suffixes)
 400           if new_histogram_name != histogram_name:
 401             histograms[new_histogram_name] = copy.deepcopy(
 402                 histograms[histogram_name])
 403
 404           suffix_label = suffix_labels.get(suffix_name, '')
 405
 406           # TODO(yiyaoliu): Rename these to be consistent with the new naming.
 407           # It is kept unchanged for now to be it's used by dashboards.
 408           if 'fieldtrial_groups' not in histograms[new_histogram_name]:
 409             histograms[new_histogram_name]['fieldtrial_groups'] = []
 410           histograms[new_histogram_name]['fieldtrial_groups'].append(
 411               suffix_name)
 412
 413           if 'fieldtrial_names' not in histograms[new_histogram_name]:
 414             histograms[new_histogram_name]['fieldtrial_names'] = []
 415           histograms[new_histogram_name]['fieldtrial_names'].append(name)
 416
 417           if 'fieldtrial_labels' not in histograms[new_histogram_name]:
 418             histograms[new_histogram_name]['fieldtrial_labels'] = []
 419           histograms[new_histogram_name]['fieldtrial_labels'].append(
 420               suffix_label)
 421
 422           # If no owners are added for this histogram-suffixes, it inherits the
 423           # owners of its parents.
 424           if owners:
 425             histograms[new_histogram_name]['owners'] = owners
 426
 427           # If a suffix has an obsolete node, it's marked as obsolete for the
 428           # specified reason, overwriting its group's obsoletion reason if the
 429           # group itself was obsolete as well.
 430           obsolete_reason = _GetObsoleteReason(suffix)
 431           if not obsolete_reason:
 432             obsolete_reason = group_obsolete_reason
 433
 434           # If the suffix has an obsolete tag, all histograms it generates
 435           # inherit it.
 436           if obsolete_reason:
 437             histograms[new_histogram_name]['obsolete'] = obsolete_reason
 438
 439         except Error:
 440           have_errors = True
 441
 442   return have_errors
 443
 444
 445 def ExtractHistogramsFromFile(file_handle):
 446   """Compute the histogram names and descriptions from the XML representation.
 447
 448   Args:
 449     file_handle: A file or file-like with XML content.
 450
 451   Returns:
 452     a tuple of (histograms, status) where histograms is a dictionary mapping
 453     histogram names to dictionaries containing histogram descriptions and status
 454     is a boolean indicating if errros were encoutered in processing.
 455   """
 456   tree = xml.dom.minidom.parse(file_handle)
 457   _NormalizeAllAttributeValues(tree)
 458
 459   enums, enum_errors = _ExtractEnumsFromXmlTree(tree)
 460   histograms, histogram_errors = _ExtractHistogramsFromXmlTree(tree, enums)
 461   update_errors = _UpdateHistogramsWithSuffixes(tree, histograms)
 462
 463   return histograms, enum_errors or histogram_errors or update_errors
 464
 465
 466 def ExtractHistograms(filename):
 467   """Load histogram definitions from a disk file.
 468
 469   Args:
 470     filename: a file path to load data from.
 471
 472   Returns:
 473     a dictionary of histogram descriptions.
 474
 475   Raises:
 476     Error: if the file is not well-formatted.
 477   """
 478   with open(filename, 'r') as f:
 479     histograms, had_errors = ExtractHistogramsFromFile(f)
 480     if had_errors:
 481       logging.error('Error parsing %s', filename)
 482       raise Error()
 483     return histograms
 484
 485
 486 def ExtractNames(histograms):
 487   return sorted(histograms.keys())