Source-file scanning utilities for determining code authorship (copyright scanner).
[chromium-blink-merge.git] / android_webview / tools / copyright_scanner.py
blob be21231ec2ff41a5937ecdb42c0a45e72f853461
1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 """Utilities for scanning source files to determine code authorship.
6 """
8 import itertools
def ForwardSlashesToOsPathSeps(input_api, path):
  """Rewrites a '/'-separated relative path using host OS path separators.

  Used when paths come from outside (e.g. the whitelist file) and use UNIX
  path separators. Only works for relative paths!

  Args:
    input_api: InputAPI, as in presubmit scripts.
    path: The '/'-separated path to convert.

  Returns:
    The path joined with OS-specific separators.
  """
  components = path.split('/')
  return input_api.os_path.join(*components)
def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.
  Automatically leaves out only source code files and excludes third_party
  directories.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip.

  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  excluded_dirs_list = [d for d in excluded_dirs_list if not 'third_party' in d]
  # Using a common pattern for third-parties makes the ignore regexp shorter
  excluded_dirs_list.append('third_party')

  path_join = input_api.os_path.join
  EXTRA_EXCLUDED_DIRS = [
      # VCS dirs
      path_join('.git'),
      path_join('.svn'),
      # Build output
      path_join('out', 'Debug'),
      path_join('out', 'Release'),
      # 'Copyright' appears in license agreements
      path_join('chrome', 'app', 'resources'),
      # Quickoffice js files from internal src used on buildbots.
      # crbug.com/350472.
      path_join('chrome', 'browser', 'resources', 'chromeos', 'quickoffice'),
      # This is a test output directory
      path_join('chrome', 'tools', 'test', 'reference_build'),
      # blink style copy right headers.
      path_join('content', 'shell', 'renderer', 'test_runner'),
      # blink style copy right headers.
      path_join('content', 'shell', 'tools', 'plugin'),
      # This is tests directory, doesn't exist in the snapshot
      path_join('content', 'test', 'data'),
      # This is a tests directory that doesn't exist in the shipped product.
      path_join('gin', 'test'),
      # This is a test output directory
      path_join('data', 'dom_perf'),
      # This is a tests directory that doesn't exist in the shipped product.
      path_join('tools', 'perf', 'page_sets'),
      path_join('tools', 'perf', 'page_sets', 'tough_animation_cases'),
      # Histogram tools, doesn't exist in the snapshot
      path_join('tools', 'histograms'),
      # Swarming tools, doesn't exist in the snapshot
      path_join('tools', 'swarming_client'),
      # ARM sysroot, doesn't exist in the snapshot
      path_join('chrome', 'installer', 'linux', 'debian_wheezy_arm-sysroot'),
      # Old location (TODO(sbc): Remove this once it no longer exists on any
      # bots)
      path_join('arm-sysroot'),
      # Data is not part of open source chromium, but are included on some bots.
      path_join('data'),
      # This is not part of open source chromium, but are included on some
      # bots.
      path_join('skia', 'tools', 'clusterfuzz-data'),
  ]
  excluded_dirs_list.extend(EXTRA_EXCLUDED_DIRS)

  # Surround the directory names with OS path separators, so a blacklist
  # entry can only match a complete path component (e.g. '/out/' does not
  # match 'layout'). This means candidates passed to IsBlacklistedDir must
  # carry a leading separator for their first component to be matchable.
  dirs_blacklist = [path_join('.', d, '')[1:] for d in excluded_dirs_list if d]
  def IsBlacklistedDir(d):
    for item in dirs_blacklist:
      if item in d:
        return True
    return False

  files_whitelist_re = input_api.re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      '|tex|mli?)$')
  files = []

  base_path_len = len(root_dir)
  for path in start_paths_list:
    full_path = path_join(root_dir, path)
    if input_api.os_path.isfile(full_path):
      if files_whitelist_re.search(path) and \
          not IsBlacklistedDir(full_path[base_path_len:]):  # Keep '/' prefix.
        files.append(path)
    else:
      for dirpath, dirnames, filenames in input_api.os_walk(full_path):
        # Remove excluded subdirs for faster scanning. As in the branch
        # above, keep the leading separator (and add a trailing one) so the
        # surrounded blacklist entries can match the first path component;
        # without it, pruning never triggered.
        for item in dirnames[:]:
          if IsBlacklistedDir(
              path_join(dirpath, item, '')[base_path_len:]):
            dirnames.remove(item)
        for filename in filenames:
          filepath = \
              path_join(dirpath, filename)[base_path_len + 1:]
          # Blacklist check uses the '/'-prefixed form for the same reason.
          if files_whitelist_re.search(filepath) and \
              not IsBlacklistedDir(
                  path_join(dirpath, filename)[base_path_len:]):
            files.append(filepath)
  return files
class _GeneratedFilesDetector(object):
  """Decides, from a file's first lines, whether the file is generated.

  The GENERATED_FILE / NO_COPYRIGHT markers are used by FindCopyrights as
  the single entry in a file's copyrights list.
  """
  GENERATED_FILE = 'GENERATED FILE'
  NO_COPYRIGHT = '*No copyright*'

  def __init__(self, input_api):
    # Matches Python triple-quoted strings so docstrings that merely talk
    # about generated files can be stripped before scanning.
    self.python_multiline_string_double_re = \
        input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
    self.python_multiline_string_single_re = \
        input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
    # Raw strings: non-raw, '\W'/'\s'/'\w' are invalid string escapes
    # (deprecated in Python 3) even though they happened to pass through.
    self.automatically_generated_re = input_api.re.compile(
        r'(All changes made in this file will be lost'
        r'|DO NOT (EDIT|delete this file)'
        r'|Generated (at|automatically|data)'
        r'|Automatically generated'
        r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

  def IsGeneratedFile(self, header):
    """Returns True if |header| looks like the head of a generated file."""
    header = header.upper()
    if '"""' in header:
      header = self.python_multiline_string_double_re.sub('', header)
    if "'''" in header:
      header = self.python_multiline_string_single_re.sub('', header)
    # First do simple strings lookup to save time.
    if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
      return True
    if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
        'GENERATED' in header:
      # Return an actual bool; the original leaked a Match object / None.
      return self.automatically_generated_re.search(header) is not None
    return False
class _CopyrightsScanner(object):
  """Extracts copyright notices from source lines, one line at a time."""

  @staticmethod
  def StaticInit(input_api):
    # Class-level patterns shared by all scanner instances; compiled once.
    # Matches C-style string literals, to erase them before scanning.
    _CopyrightsScanner._c_comment_re = \
        input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
    # 'copyright', 'copr.', the UTF-8 encoded (c) sign, or '(c)'.
    _CopyrightsScanner._copyright_indicator = \
        r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
    _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
        r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
        r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
    # Words that mean the line only talks about copyright ("copyright
    # information", "copyright notice") rather than stating authorship.
    _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
        r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)

  def __init__(self, input_api):
    # '(c)' is treated as a list item only if '(a)'/'(b)' occurred at most
    # this many lines above it.
    self.max_line_numbers_proximity = 3
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100
    self.re = input_api.re

  def _CloseLineNumbers(self, a, b):
    # True iff |a| is at |b| or up to max_line_numbers_proximity lines below.
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    """Scans one line for a copyright notice.

    Args:
      line_number: 1-based line number, used for list-item proximity checks.
      line: The line contents.

    Returns:
      The prettified authorship string, or None if the line has none.
    """
    if '"' in line:
      line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record '(a)' and '(b)' last occurences in C++ comments.
    # This is to filter out '(c)' used as a list item inside C++ comments.
    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as _copyright_indicator regexp.
    if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
        and not '\xc2\xa9' in upcase_line:
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      if c_item_index > cpp_comment_idx and \
          self._CloseLineNumbers(line_number,
                                 self.last_b_item_line_number) and \
          self._CloseLineNumbers(self.last_b_item_line_number,
                                 self.last_a_item_line_number):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if m and \
        not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
      copyr = m.group(0)
      # Prettify the authorship string.
      # Strip trailing punctuation/whitespace. The original pattern ended
      # with a stray '/' (a Perl s/.../.../ leftover): '$' followed by a
      # literal '/' can never match, so this sub was a no-op.
      copyr = self.re.sub(r'([,.])?\s*$', '', copyr)
      copyr = self.re.sub(
          _CopyrightsScanner._copyright_indicator, '', copyr, \
          flags=self.re.IGNORECASE)
      copyr = self.re.sub(r'^\s+', '', copyr)
      copyr = self.re.sub(r'\s{2,}', ' ', copyr)
      copyr = self.re.sub(r'\\@', '@', copyr)
    return copyr
def FindCopyrights(input_api, root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of copyrights associated with each of the files given.
    If the certain file is generated, the corresponding list consists of a
    single entry -- the 'GENERATED_FILE' string. If the file has no
    copyright info, the corresponding list contains the 'NO_COPYRIGHT'
    string.
  """
  generated_files_detector = _GeneratedFilesDetector(input_api)
  _CopyrightsScanner.StaticInit(input_api)
  copyrights = []
  for file_name in files_to_scan:
    header = []
    file_copyrights = []
    scanner = _CopyrightsScanner(input_api)
    contents = input_api.ReadFile(
        input_api.os_path.join(root_dir, file_name), 'r')
    for linenum, l in enumerate(contents.split('\n'), start=1):
      # Only the first 25 lines are considered for generated-file detection.
      if linenum <= 25:
        header.append(l)
      c = scanner.MatchLine(linenum, l)
      if c:
        file_copyrights.append(c)
    if generated_files_detector.IsGeneratedFile('\n'.join(header)):
      copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
    elif file_copyrights:
      copyrights.append(file_copyrights)
    else:
      copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
  return copyrights
def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = input_api.re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      'All rights reserved.*)$')
  # Builtin zip (unlike the Python-2-only itertools.izip) works everywhere.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \
        cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:
      continue
    for c in cs:
      if not allowed_copyrights_re.match(c):
        offending_files.append(input_api.os_path.normpath(f))
        break
  return offending_files
def _GetWhitelistFileName(input_api):
  # Path of the third-party whitelist file, relative to the repository root.
  whitelist_components = (
      'android_webview', 'tools', 'third_party_files_whitelist.txt')
  return input_api.os_path.join(*whitelist_components)
def _ProcessWhitelistedFilesList(input_api, lines):
  # A whitelist entry is the leading non-space, non-'#' token of a line;
  # lines that start with whitespace or '#' contribute nothing.
  whitelisted_files = []
  for line in lines:
    entry = input_api.re.match(r'([^#\s]+)', line)
    if not entry:
      continue
    whitelisted_files.append(
        ForwardSlashesToOsPathSeps(input_api, entry.group(1)))
  return whitelisted_files
def LoadWhitelistedFilesList(input_api):
  """Loads and parses the 3rd party code whitelist file.

  Args:
    input_api: InputAPI of presubmit scripts.

  Returns:
    The list of whitelisted file names.
  """
  full_file_name = input_api.os_path.join(
      input_api.change.RepositoryRoot(), _GetWhitelistFileName(input_api))
  file_data = input_api.ReadFile(full_file_name, 'rb')
  return _ProcessWhitelistedFilesList(input_api, file_data.splitlines())
def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
  """Compares whitelist contents with the results of file scanning.

  Args:
    input_api: InputAPI of presubmit scripts.
    whitelisted_files: Whitelisted files list.
    offending_files: Files that contain 3rd party code.

  Returns:
    A triplet of "unknown", "missing", and "stale" file lists.
    "Unknown" are files that contain 3rd party code but are not whitelisted.
    "Missing" are files that are whitelisted but don't really exist.
    "Stale" are files that are whitelisted unnecessarily.
  """
  unknown = set(offending_files) - set(whitelisted_files)
  missing = [f for f in whitelisted_files if not input_api.os_path.isfile(f)]
  stale = set(whitelisted_files) - set(offending_files) - set(missing)
  return (list(unknown), missing, list(stale))
def _GetDeletedContents(affected_file):
  """Returns a list of all deleted lines.

  AffectedFile class from presubmit_support is lacking this functionality.

  Args:
    affected_file: AffectedFile instance whose SCM diff is examined.

  Returns:
    The deleted lines' contents, without the leading '-'.
  """
  deleted_lines = []
  for line in affected_file.GenerateScmDiff().splitlines():
    # '-' marks a deleted line; '--' would be the '--- a/...' diff header.
    if line.startswith('-') and not line.startswith('--'):
      deleted_lines.append(line[1:])
  return deleted_lines
def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
  # An empty 'known third-party' dirs list is passed to FindFiles. Since
  # this is a patch for the Chromium's src tree, it must contain properly
  # licensed Chromium code. Any third-party code must be put into a
  # directory named 'third_party', and such dirs are automatically excluded
  # by FindFiles.
  scan_candidates = FindFiles(
      input_api, input_api.change.RepositoryRoot(), files_to_check, [])
  violations = FindCopyrightViolations(
      input_api, input_api.change.RepositoryRoot(), scan_candidates)
  return AnalyzeScanResults(input_api, whitelisted_files, violations)
def ScanAtPresubmit(input_api, output_api):
  """Invoked at change presubmit time. Verifies that updated non third-party
  code doesn't contain external copyrighted code.

  Args:
    input_api: InputAPI of presubmit scripts.
    output_api: OutputAPI of presubmit scripts.

  Returns:
    A list of presubmit errors/warnings describing whitelist problems.
  """
  files_to_check = set([])
  deleted_files = set([])
  whitelist_contents_changed = False
  for f in input_api.AffectedFiles():
    if f.LocalPath() == _GetWhitelistFileName(input_api):
      whitelist_contents_changed = True
      # Entries deleted from the whitelist are re-checked below, so stale
      # whitelisting is caught even when the file itself is untouched.
      deleted_files |= set(_ProcessWhitelistedFilesList(
          input_api, _GetDeletedContents(f)))
      continue
    if f.Action() != 'D':
      files_to_check.add(f.LocalPath())
    else:
      deleted_files.add(f.LocalPath())
  whitelisted_files = set(LoadWhitelistedFilesList(input_api))
  if not whitelist_contents_changed:
    # Restrict the whitelist to the files touched by this change.
    whitelisted_files &= files_to_check | deleted_files
  else:
    # Need to re-check the entire contents of the whitelist file.
    # Also add files removed from the whitelist. If the file has indeed been
    # deleted, the scanner will not complain.
    files_to_check |= whitelisted_files | deleted_files

  (unknown_files, missing_files, stale_files) = _DoScanAtPresubmit(
      input_api, list(whitelisted_files), list(files_to_check))
  results = []
  if unknown_files:
    results.append(output_api.PresubmitError(
        'The following files contain a third-party license but are not in ' \
        'a listed third-party directory and are not whitelisted. You must ' \
        'add the following files to the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(unknown_files)))
  if missing_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted in %s, ' \
        'but do not exist or not files:' % _GetWhitelistFileName(input_api),
        sorted(missing_files)))
  if stale_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted unnecessarily. You must ' \
        'remove the following files from the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(stale_files)))
  return results