# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
5 """Utilities for scanning source files to determine code authorship.
11 def FindFiles(input_api
, root_dir
, start_paths_list
, excluded_dirs_list
):
12 """Similar to UNIX utility find(1), searches for files in the directories.
13 Automatically leaves out only source code files and excludes third_party
16 input_api: InputAPI, as in presubmit scripts.
17 root_dir: The root directory, to which all other paths are relative.
18 start_paths_list: The list of paths to start search from. Each path can
19 be a file or a directory.
20 excluded_dirs_list: The list of directories to skip.
22 The list of source code files found, relative to |root_dir|.
  excluded_dirs_list = [d for d in excluded_dirs_list if not 'third_party' in d]
  # Using a common pattern for third-party dirs makes the ignore regexp
  # shorter.
  excluded_dirs_list.append('third_party')
  EXTRA_EXCLUDED_DIRS = [
      # 'Copyright' appears in license agreements
      'chrome/app/resources',
      # Quickoffice js files from internal src used on buildbots.
      'chrome/browser/resources/chromeos/quickoffice',
      # This is a test output directory
      'chrome/tools/test/reference_build',
      # Blink-style copyright headers.
      'content/shell/renderer/test_runner',
      # Blink-style copyright headers.
      'content/shell/tools/plugin',
      # This is a tests directory that doesn't exist in the snapshot.
      # This is a tests directory that doesn't exist in the shipped product.
      # This is a test output directory.
      # This is a tests directory that doesn't exist in the shipped product.
      'tools/perf/page_sets',
      'tools/perf/page_sets/tough_animation_cases',
      # Histogram tools, don't exist in the snapshot.
      # Swarming tools, don't exist in the snapshot.
      'tools/swarming_client',
      # ARM sysroot, doesn't exist in the snapshot.
      'chrome/installer/linux/debian_wheezy_arm-sysroot',
      # Old location (TODO(sbc): Remove this once it no longer exists on any
      # bots).
      # This data is not part of open source Chromium, but is included on
      # some bots.
      # This is not part of open source Chromium, but is included on some
      # bots.
      'skia/tools/clusterfuzz-data'
  ]
  excluded_dirs_list.extend(EXTRA_EXCLUDED_DIRS)
  dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]
  def IsBlacklistedDir(d):
    for item in dirs_blacklist:
      if item in d:
        return True
    return False

  files_whitelist_re = input_api.re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      ')$')
  files = []

  base_path_len = len(root_dir)
  for path in start_paths_list:
    full_path = input_api.os_path.join(root_dir, path)
    if input_api.os_path.isfile(full_path):
      if files_whitelist_re.search(path) and \
          not IsBlacklistedDir(full_path[base_path_len:]):  # Keep '/' prefix.
        files.append(path)
    else:
      for dirpath, dirnames, filenames in input_api.os_walk(full_path):
        # Remove excluded subdirs for faster scanning.
        for item in dirnames[:]:
          if IsBlacklistedDir(
              input_api.os_path.join(dirpath, item)[base_path_len + 1:]):
            dirnames.remove(item)
        for filename in filenames:
          filepath = \
              input_api.os_path.join(dirpath, filename)[base_path_len + 1:]
          if files_whitelist_re.search(filepath) and \
              not IsBlacklistedDir(filepath):
            files.append(filepath)
  return files
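
# A minimal usage sketch (illustrative only; the start paths and the
# 'input_api' object are assumptions, not part of this module). From a
# presubmit check one might call FindFiles roughly like this:
#
#   src_root = input_api.change.RepositoryRoot()
#   files = FindFiles(input_api, src_root, ['android_webview'], [])
#   # |files| now holds source file paths relative to |src_root|, with
#   # third_party and the EXTRA_EXCLUDED_DIRS entries pruned.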


class _GeneratedFilesDetector(object):
  GENERATED_FILE = 'GENERATED FILE'
  NO_COPYRIGHT = '*No copyright*'

  def __init__(self, input_api):
    self.python_multiline_string_double_re = \
      input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
    self.python_multiline_string_single_re = \
      input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
    self.automatically_generated_re = input_api.re.compile(
        r'(All changes made in this file will be lost'
        '|DO NOT (EDIT|delete this file)'
        '|Generated (at|automatically|data)'
        '|Automatically generated'
        '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

  def IsGeneratedFile(self, header):
    header = header.upper()
    header = self.python_multiline_string_double_re.sub('', header)
    header = self.python_multiline_string_single_re.sub('', header)
    # First do a simple strings lookup to save time.
    if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
      return True
    if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
        'GENERATED' in header:
      return self.automatically_generated_re.search(header)
    return False
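
# Illustrative sketch (hypothetical header strings, not from the original
# code): a header such as
#   '# This file is automatically generated by foo.py. DO NOT EDIT.'
# is expected to make IsGeneratedFile() return a truthy value, while a plain
# Chromium copyright header is not.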


class _CopyrightsScanner(object):
  @staticmethod
  def StaticInit(input_api):
    _CopyrightsScanner._c_comment_re = \
        input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
    _CopyrightsScanner._copyright_indicator = \
        r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
    _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
        r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
        r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
    _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
        r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)

  def __init__(self, input_api):
    self.max_line_numbers_proximity = 3
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100
    self.re = input_api.re

  def _CloseLineNumbers(self, a, b):
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record '(a)' and '(b)' last occurrences in C++ comments.
    # This is to filter out '(c)' used as a list item inside C++ comments.
    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as the _copyright_indicator regexp.
    if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
        and not '\xc2\xa9' in upcase_line:
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      if c_item_index > cpp_comment_idx and \
          self._CloseLineNumbers(line_number,
                                 self.last_b_item_line_number) and \
          self._CloseLineNumbers(self.last_b_item_line_number,
                                 self.last_a_item_line_number):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if m and \
        not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
      copyr = m.group(0)
      # Prettify the authorship string.
      copyr = self.re.sub(r'([,.])?\s*$', '', copyr)
      copyr = self.re.sub(
          _CopyrightsScanner._copyright_indicator, '', copyr, \
          flags=self.re.IGNORECASE)
      copyr = self.re.sub(r'^\s+', '', copyr)
      copyr = self.re.sub(r'\s{2,}', ' ', copyr)
      copyr = self.re.sub(r'\\@', '@', copyr)
    return copyr
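
# Illustrative sketch of MatchLine() behavior (hypothetical inputs; the exact
# output string depends on the regexps above):
#
#   scanner = _CopyrightsScanner(input_api)
#   scanner.MatchLine(1, '// Copyright 2014 The Chromium Authors.')
#   # -> roughly '2014 The Chromium Authors', i.e. the authorship text with
#   #    the 'Copyright' indicator and trailing punctuation stripped.
#   scanner.MatchLine(10, '// (a) first item')   # -> None, records item (a)
#   scanner.MatchLine(11, '// (b) second item')  # -> None, records item (b)
#   scanner.MatchLine(12, '// (c) third item')   # -> None, treated as a list
#                                                #    item rather than '(c)'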


def FindCopyrights(input_api, root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of copyrights associated with each of the files given.
    If a file is generated, the corresponding list consists of a single
    entry -- the 'GENERATED_FILE' string. If a file has no copyright info,
    the corresponding list contains the 'NO_COPYRIGHT' string.
  """
  generated_files_detector = _GeneratedFilesDetector(input_api)
  _CopyrightsScanner.StaticInit(input_api)
  copyrights = []
  for file_name in files_to_scan:
    linenum = 0
    header = []
    file_copyrights = []
    scanner = _CopyrightsScanner(input_api)
    contents = input_api.ReadFile(
        input_api.os_path.join(root_dir, file_name), 'r')
    for l in contents.split('\n'):
      linenum += 1
      if linenum <= 25:
        # Only the first lines of the file are considered its header.
        header.append(l)
      c = scanner.MatchLine(linenum, l)
      if c:
        file_copyrights.append(c)
    if generated_files_detector.IsGeneratedFile('\n'.join(header)):
      copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
    elif file_copyrights:
      copyrights.append(file_copyrights)
    else:
      copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
  return copyrights
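
# Illustrative sketch of the returned structure (hypothetical file names and
# copyright strings):
#
#   FindCopyrights(input_api, root, ['a.cc', 'gen.cc', 'empty.cc'])
#   # -> [['2014 The Chromium Authors. All rights reserved'],
#   #     ['GENERATED FILE'],
#   #     ['*No copyright*']]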


def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = input_api.re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      'All rights reserved.*)$')
  for f, cs in itertools.izip(files_to_scan, copyrights):
    if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \
        cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:
      continue
    for c in cs:
      if not allowed_copyrights_re.match(c):
        offending_files.append(input_api.os_path.normpath(f))
        break
  return offending_files
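
# Illustrative sketch (hypothetical copyright strings): the allowed pattern
# above accepts entries such as
#   '2014 The Chromium Authors. All rights reserved'
#   '2012-2014 The Chromium Authors. All rights reserved'
# whereas an entry like '2013 Some Other Company' would mark its file as
# offending.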


def _GetWhitelistFileName(input_api):
  return input_api.os_path.join(
      'android_webview', 'tools', 'third_party_files_whitelist.txt')


def _ProcessWhitelistedFilesList(input_api, lines):
  whitelisted_files = []
  for line in lines:
    match = input_api.re.match(r'([^#\s]+)', line)
    if match:
      whitelisted_files.append(match.group(1))
  return whitelisted_files


def LoadWhitelistedFilesList(input_api):
  """Loads and parses the 3rd party code whitelist file.

  input_api: InputAPI of presubmit scripts.

  Returns:
    The list of whitelisted file names.
  """
  full_file_name = input_api.os_path.join(
      input_api.change.RepositoryRoot(), _GetWhitelistFileName(input_api))
  file_data = input_api.ReadFile(full_file_name, 'rb')
  return _ProcessWhitelistedFilesList(input_api, file_data.splitlines())
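
# A sketch of the whitelist file format, as implied by the parsing regexp in
# _ProcessWhitelistedFilesList (the example entries are hypothetical):
#
#   # Full-line comments and blank lines yield no entries.
#   android_webview/some/file.cc  Optional note about why it is whitelisted.
#   android_webview/other/file.h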


def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
  """Compares whitelist contents with the results of file scanning.

  input_api: InputAPI of presubmit scripts.
  whitelisted_files: Whitelisted files list.
  offending_files: Files that contain 3rd party code.

  Returns:
    A triplet of "unknown", "missing", and "stale" file lists.
    "Unknown" are files that contain 3rd party code but are not whitelisted.
    "Missing" are files that are whitelisted but don't really exist.
    "Stale" are files that are whitelisted unnecessarily.
  """
  unknown = set(offending_files) - set(whitelisted_files)
  missing = [f for f in whitelisted_files if not input_api.os_path.isfile(f)]
  stale = set(whitelisted_files) - set(offending_files) - set(missing)
  return (list(unknown), missing, list(stale))
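
# A minimal sketch of the set arithmetic above (hypothetical paths, assuming
# 'gone.cc' is absent from disk while the other files exist):
#
#   whitelisted_files = ['a.cc', 'b.cc', 'gone.cc']
#   offending_files   = ['a.cc', 'new.cc']
#   # unknown -> ['new.cc']   (offending but not whitelisted)
#   # missing -> ['gone.cc']  (whitelisted but not present on disk)
#   # stale   -> ['b.cc']     (whitelisted, present, no longer offending)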


def _GetDeletedContents(affected_file):
  """Returns a list of all deleted lines.

  The AffectedFile class from presubmit_support is lacking this functionality.
  """
  deleted_lines = []
  for line in affected_file.GenerateScmDiff().splitlines():
    if line.startswith('-') and not line.startswith('--'):
      deleted_lines.append(line[1:])
  return deleted_lines


def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
  # We pass an empty 'known third-party dirs' list here. Since this is a patch
  # for Chromium's src tree, it must contain properly licensed Chromium code.
  # Any third-party code must be put into a directory named 'third_party',
  # and such dirs are automatically excluded by FindFiles.
  files_to_scan = FindFiles(
      input_api, input_api.change.RepositoryRoot(), files_to_check, [])
  offending_files = FindCopyrightViolations(
      input_api, input_api.change.RepositoryRoot(), files_to_scan)
  return AnalyzeScanResults(
      input_api, whitelisted_files, offending_files)


def ScanAtPresubmit(input_api, output_api):
  """Invoked at change presubmit time. Verifies that updated non-third-party
  code doesn't contain external copyrighted code.

  input_api: InputAPI of presubmit scripts.
  output_api: OutputAPI of presubmit scripts.
  """
  files_to_check = set([])
  deleted_files = set([])
  whitelist_contents_changed = False
  for f in input_api.AffectedFiles():
    if f.LocalPath() == _GetWhitelistFileName(input_api):
      whitelist_contents_changed = True
      deleted_files |= set(_ProcessWhitelistedFilesList(
          input_api, _GetDeletedContents(f)))
      continue
    if f.Action() != 'D':
      files_to_check.add(f.LocalPath())
    else:
      deleted_files.add(f.LocalPath())
  whitelisted_files = set(LoadWhitelistedFilesList(input_api))
  if not whitelist_contents_changed:
    whitelisted_files &= files_to_check | deleted_files
  else:
    # Need to re-check the entire contents of the whitelist file.
    # Also add files removed from the whitelist. If a file has indeed been
    # deleted, the scanner will not complain.
    files_to_check |= whitelisted_files | deleted_files

  (unknown_files, missing_files, stale_files) = _DoScanAtPresubmit(
      input_api, list(whitelisted_files), list(files_to_check))
  results = []
  if unknown_files:
    results.append(output_api.PresubmitError(
        'The following files contain a third-party license but are not in ' \
        'a listed third-party directory and are not whitelisted. You must ' \
        'add the following files to the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(unknown_files)))
  if missing_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted in %s, but do not exist or ' \
        'are not files:' % _GetWhitelistFileName(input_api),
        sorted(missing_files)))
  if stale_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted unnecessarily. You must ' \
        'remove the following files from the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(stale_files)))
  return results
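
# A minimal PRESUBMIT.py hook-up sketch (assumed integration, not part of this
# module): a presubmit script would typically just forward to ScanAtPresubmit
# and return its results, e.g.
#
#   def CheckChangeOnUpload(input_api, output_api):
#     return ScanAtPresubmit(input_api, output_api)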