Roll src/third_party/WebKit 3aea697:d9c6159 (svn 201973:201974)
[chromium-blink-merge.git] / tools / copyright_scanner / copyright_scanner.py
blob439603a2132fa0b0acab37cd81e2fcc599c4df52
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utilities for scanning source files to determine code authorship.
"""
8 import itertools
def ForwardSlashesToOsPathSeps(input_api, path):
  """Rewrites a '/'-separated path using the OS-specific path separator.

  Meant for paths that come from outside and use UNIX path separators.
  Only works for relative paths!

  Args:
    input_api: InputAPI, as in presubmit scripts.
    path: The path to convert.
  Returns:
    Converted path.
  """
  components = path.split('/')
  return input_api.os_path.join(*components)
def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.

  Only source code files (selected by file extension) are returned, and
  third_party directories are always excluded.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
      It must not end with a path separator: its length is used to cut the
      root prefix off the paths that are being examined.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip. The list passed by
      the caller is not modified.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  os_path = input_api.os_path
  path_join = os_path.join

  # Work on a copy so that the caller's list stays intact.
  excluded_dirs = [d for d in excluded_dirs_list if 'third_party' not in d]
  # Using a common pattern for third-parties makes the ignore list shorter.
  excluded_dirs.append('third_party')
  excluded_dirs.extend([
      # VCS dirs
      path_join('.git'),
      path_join('.svn'),
      # Build output
      path_join('out', 'Debug'),
      path_join('out', 'Release'),
      # 'Copyright' appears in license agreements
      path_join('chrome', 'app', 'resources'),
      # Quickoffice js files from internal src used on buildbots.
      # crbug.com/350472.
      path_join('chrome', 'browser', 'resources', 'chromeos', 'quickoffice'),
      # This is a test output directory
      path_join('chrome', 'tools', 'test', 'reference_build'),
      # blink style copyright headers.
      path_join('content', 'shell', 'renderer', 'test_runner'),
      # blink style copyright headers.
      path_join('content', 'shell', 'tools', 'plugin'),
      # This is tests directory, doesn't exist in the snapshot
      path_join('content', 'test', 'data'),
      # This is a tests directory that doesn't exist in the shipped product.
      path_join('gin', 'test'),
      # This is a test output directory
      path_join('data', 'dom_perf'),
      # This is a tests directory that doesn't exist in the shipped product.
      path_join('tools', 'perf', 'page_sets'),
      path_join('tools', 'perf', 'page_sets', 'tough_animation_cases'),
      # Histogram tools, doesn't exist in the snapshot
      path_join('tools', 'histograms'),
      # Swarming tools, doesn't exist in the snapshot
      path_join('tools', 'swarming_client'),
      # ARM sysroot, doesn't exist in the snapshot
      path_join('build', 'linux', 'debian_wheezy_arm-sysroot'),
      # Old location (TODO(sbc): Remove this once it no longer exists on any
      # bots)
      path_join('chrome', 'installer', 'linux', 'debian_wheezy_arm-sysroot'),
      # Data is not part of open source chromium, but are included on some
      # bots.
      path_join('data'),
      # This is not part of open source chromium, but are included on some
      # bots.
      path_join('skia', 'tools', 'clusterfuzz-data'),
      # Not shipped, only relates to Chrome for Android, but not to WebView
      path_join('clank'),
  ])

  # Surround the directory names with OS path separators, so each pattern
  # looks like '/dir/' and only matches complete path components.
  dirs_blacklist = [path_join('.', d, '')[1:] for d in excluded_dirs if d]

  def IsBlacklistedDir(rel_path):
    # |rel_path| must start with a path separator (and, for a directory, end
    # with one), otherwise a top-level excluded directory cannot match.
    return any(item in rel_path for item in dirs_blacklist)

  files_whitelist_re = input_api.re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      r'|tex|mli?)$')
  files = []

  base_path_len = len(root_dir)
  for path in start_paths_list:
    full_path = path_join(root_dir, path)
    if os_path.isfile(full_path):
      # Keep the separator prefix for the blacklist check.
      if (files_whitelist_re.search(path) and
          not IsBlacklistedDir(full_path[base_path_len:])):
        files.append(path)
      continue
    for dirpath, dirnames, filenames in input_api.os_walk(full_path):
      # Remove excluded subdirs in place for faster scanning. The trailing ''
      # component appends a separator, so that the directory itself (and not
      # only its children) matches a '/dir/' pattern.
      dirnames[:] = [
          d for d in dirnames
          if not IsBlacklistedDir(path_join(dirpath, d, '')[base_path_len:])]
      for filename in filenames:
        # Keep the separator prefix for the blacklist check, but drop it from
        # the reported path, which is relative to |root_dir|.
        prefixed_path = path_join(dirpath, filename)[base_path_len:]
        filepath = prefixed_path[1:]
        if (files_whitelist_re.search(filepath) and
            not IsBlacklistedDir(prefixed_path)):
          files.append(filepath)
  return files
121 class _GeneratedFilesDetector(object):
122 GENERATED_FILE = 'GENERATED FILE'
123 NO_COPYRIGHT = '*No copyright*'
125 def __init__(self, input_api):
126 self.python_multiline_string_double_re = \
127 input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
128 self.python_multiline_string_single_re = \
129 input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
130 self.automatically_generated_re = input_api.re.compile(
131 r'(All changes made in this file will be lost'
132 '|DO NOT (EDIT|delete this file)'
133 '|Generated (at|automatically|data)'
134 '|Automatically generated'
135 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)
137 def IsGeneratedFile(self, header):
138 header = header.upper()
139 if '"""' in header:
140 header = self.python_multiline_string_double_re.sub('', header)
141 if "'''" in header:
142 header = self.python_multiline_string_single_re.sub('', header)
143 # First do simple strings lookup to save time.
144 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
145 return True
146 if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
147 'GENERATED' in header:
148 return self.automatically_generated_re.search(header)
149 return False
152 class _CopyrightsScanner(object):
153 @staticmethod
154 def StaticInit(input_api):
155 _CopyrightsScanner._c_comment_re = \
156 input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
157 _CopyrightsScanner._copyright_indicator = \
158 r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
159 _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
160 r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
161 r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
162 _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
163 r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)
165 def __init__(self, input_api):
166 self.max_line_numbers_proximity = 3
167 self.last_a_item_line_number = -200
168 self.last_b_item_line_number = -100
169 self.re = input_api.re
171 def _CloseLineNumbers(self, a, b):
172 return 0 <= a - b <= self.max_line_numbers_proximity
174 def MatchLine(self, line_number, line):
175 if '"' in line:
176 line = _CopyrightsScanner._c_comment_re.sub('', line)
177 upcase_line = line.upper()
178 # Record '(a)' and '(b)' last occurences in C++ comments.
179 # This is to filter out '(c)' used as a list item inside C++ comments.
180 # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
181 cpp_comment_idx = upcase_line.find('//')
182 if cpp_comment_idx != -1:
183 if upcase_line.find('(A)') > cpp_comment_idx:
184 self.last_a_item_line_number = line_number
185 if upcase_line.find('(B)') > cpp_comment_idx:
186 self.last_b_item_line_number = line_number
187 # Fast bailout, uses the same patterns as _copyright_indicator regexp.
188 if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
189 and not '\xc2\xa9' in upcase_line:
190 c_item_index = upcase_line.find('(C)')
191 if c_item_index == -1:
192 return None
193 if c_item_index > cpp_comment_idx and \
194 self._CloseLineNumbers(line_number,
195 self.last_b_item_line_number) and \
196 self._CloseLineNumbers(self.last_b_item_line_number,
197 self.last_a_item_line_number):
198 return None
199 copyr = None
200 m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
201 if m and \
202 not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
203 copyr = m.group(0)
204 # Prettify the authorship string.
205 copyr = self.re.sub(r'([,.])?\s*$/', '', copyr)
206 copyr = self.re.sub(
207 _CopyrightsScanner._copyright_indicator, '', copyr, \
208 flags=self.re.IGNORECASE)
209 copyr = self.re.sub(r'^\s+', '', copyr)
210 copyr = self.re.sub(r'\s{2,}', ' ', copyr)
211 copyr = self.re.sub(r'\\@', '@', copyr)
212 return copyr
def FindCopyrights(input_api, root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of copyrights associated with each of the files given.
    If a file is generated, the corresponding list consists of a single
    entry -- the _GeneratedFilesDetector.GENERATED_FILE string. If the file
    has no copyright info, the corresponding list consists of the
    _GeneratedFilesDetector.NO_COPYRIGHT string.
  """
  header_lines_count = 25
  generated_files_detector = _GeneratedFilesDetector(input_api)
  _CopyrightsScanner.StaticInit(input_api)
  copyrights = []
  for file_name in files_to_scan:
    # The scanner keeps per-file state, so every file gets its own one.
    scanner = _CopyrightsScanner(input_api)
    full_name = input_api.os_path.join(root_dir, file_name)
    lines = input_api.ReadFile(full_name, 'r').split('\n')
    file_copyrights = []
    for line_number, line in enumerate(lines, 1):
      found = scanner.MatchLine(line_number, line)
      if found:
        file_copyrights.append(found)
    # Only the beginning of the file is checked for "generated" notices.
    header = '\n'.join(lines[:header_lines_count])
    if generated_files_detector.IsGeneratedFile(header):
      file_copyrights = [_GeneratedFilesDetector.GENERATED_FILE]
    elif not file_copyrights:
      file_copyrights = [_GeneratedFilesDetector.NO_COPYRIGHT]
    copyrights.append(file_copyrights)
  return copyrights
def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Generated files and files with no copyright information are never reported.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  allowed_copyrights_re = input_api.re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      r'All rights reserved.*)$')
  special_markers = (_GeneratedFilesDetector.GENERATED_FILE,
                     _GeneratedFilesDetector.NO_COPYRIGHT)
  offending_files = []
  # The builtin zip is used instead of itertools.izip, which only exists in
  # Python 2.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] in special_markers:
      continue
    if any(not allowed_copyrights_re.match(c) for c in cs):
      offending_files.append(input_api.os_path.normpath(f))
  return offending_files
278 def _GetWhitelistFileName(input_api):
279 return input_api.os_path.join(
280 'tools', 'copyright_scanner', 'third_party_files_whitelist.txt')
def _ProcessWhitelistedFilesList(input_api, lines):
  """Extracts file names from the lines of the whitelist file.

  The name is the leading run of characters that are neither whitespace nor
  '#'. Lines that do not start with such a run are skipped.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    lines: The lines of the whitelist file.
  Returns:
    The list of file names, using OS-specific path separators.
  """
  entry_re = input_api.re.compile(r'([^#\s]+)')
  entries = (entry_re.match(line) for line in lines)
  return [ForwardSlashesToOsPathSeps(input_api, entry.group(1))
          for entry in entries if entry]
def LoadWhitelistedFilesList(input_api):
  """Loads and parses the 3rd party code whitelist file.

  Args:
    input_api: InputAPI of presubmit scripts.
  Returns:
    The list of files.
  """
  repository_root = input_api.change.RepositoryRoot()
  whitelist_path = input_api.os_path.join(
      repository_root, _GetWhitelistFileName(input_api))
  whitelist_lines = input_api.ReadFile(whitelist_path, 'rb').splitlines()
  return _ProcessWhitelistedFilesList(input_api, whitelist_lines)
def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
  """Compares whitelist contents with the results of file scanning.

  Args:
    input_api: InputAPI of presubmit scripts.
    whitelisted_files: Whitelisted files list.
    offending_files: Files that contain 3rd party code.
  Returns:
    A triplet of "unknown", "missing", and "stale" file lists.
    "Unknown" are files that contain 3rd party code but are not whitelisted.
    "Missing" are files that are whitelisted but do not really exist.
    "Stale" are files that are whitelisted unnecessarily.
  """
  os_path = input_api.os_path
  whitelisted = set(whitelisted_files)
  offending = set(offending_files)

  # Keeps the order of the whitelist.
  missing = []
  for name in whitelisted_files:
    full_name = os_path.join(input_api.change.RepositoryRoot(), name)
    if not os_path.isfile(full_name):
      missing.append(name)

  unknown = offending - whitelisted
  stale = whitelisted - offending - set(missing)
  return (list(unknown), missing, list(stale))
322 def _GetDeletedContents(affected_file):
323 """Returns a list of all deleted lines.
324 AffectedFile class from presubmit_support is lacking this functionality.
326 deleted_lines = []
327 for line in affected_file.GenerateScmDiff().splitlines():
328 if line.startswith('-') and not line.startswith('--'):
329 deleted_lines.append(line[1:])
330 return deleted_lines
def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
  """Scans the given files and compares the results with the whitelist.

  Args:
    input_api: InputAPI of presubmit scripts.
    whitelisted_files: Whitelisted files list.
    files_to_check: The paths to scan, relative to the repository root.
  Returns:
    The triplet returned by AnalyzeScanResults.
  """
  repository_root = input_api.change.RepositoryRoot()
  # We pass empty 'known third-party' dirs list here. Since this is a patch
  # for the Chromium's src tree, it must contain properly licensed Chromium
  # code. Any third-party code must be put into a directory named
  # 'third_party', and such dirs are automatically excluded by FindFiles.
  known_third_party_dirs = []
  files_to_scan = FindFiles(
      input_api, repository_root, files_to_check, known_third_party_dirs)
  offending_files = FindCopyrightViolations(
      input_api, repository_root, files_to_scan)
  return AnalyzeScanResults(input_api, whitelisted_files, offending_files)
def ScanAtPresubmit(input_api, output_api):
  """Invoked at change presubmit time. Verifies that updated non third-party
  code doesn't contain external copyrighted code.

  Args:
    input_api: InputAPI of presubmit scripts.
    output_api: OutputAPI of presubmit scripts.
  Returns:
    The list of presubmit results: an error for files with third-party
    copyrights that are not whitelisted, and warnings for whitelist entries
    that are missing or unnecessary.
  """
  whitelist_file_name = _GetWhitelistFileName(input_api)
  files_to_check = set()
  deleted_files = set()
  whitelist_contents_changed = False
  for affected_file in input_api.AffectedFiles():
    local_path = affected_file.LocalPath()
    if local_path == whitelist_file_name:
      # The whitelist itself is not scanned; entries removed from it are
      # treated like deleted files.
      whitelist_contents_changed = True
      deleted_files.update(_ProcessWhitelistedFilesList(
          input_api, _GetDeletedContents(affected_file)))
    elif affected_file.Action() == 'D':
      deleted_files.add(local_path)
    else:
      files_to_check.add(local_path)

  whitelisted_files = set(LoadWhitelistedFilesList(input_api))
  if whitelist_contents_changed:
    # Need to re-check the entire contents of the whitelist file.
    # Also add files removed from the whitelist. If the file has indeed been
    # deleted, the scanner will not complain.
    files_to_check |= whitelisted_files | deleted_files
  else:
    whitelisted_files &= files_to_check | deleted_files

  unknown_files, missing_files, stale_files = _DoScanAtPresubmit(
      input_api, list(whitelisted_files), list(files_to_check))

  results = []
  if unknown_files:
    unknown_message = (
        'The following files contain a third-party license but are not in '
        'a listed third-party directory and are not whitelisted. You must '
        'add the following files to the whitelist file %s\n'
        '(Note that if the code you are adding does not actually contain '
        'any third-party code, it may contain the word "copyright", which '
        'should be masked out, e.g. by writing it as "copy-right"):'
    ) % whitelist_file_name
    results.append(
        output_api.PresubmitError(unknown_message, sorted(unknown_files)))
  if missing_files:
    missing_message = (
        'The following files are whitelisted in %s, '
        'but do not exist or not files:'
    ) % whitelist_file_name
    results.append(output_api.PresubmitPromptWarning(
        missing_message, sorted(missing_files)))
  if stale_files:
    stale_message = (
        'The following files are whitelisted unnecessarily. You must '
        'remove the following files from the whitelist file '
        '%s:'
    ) % whitelist_file_name
    results.append(output_api.PresubmitPromptWarning(
        stale_message, sorted(stale_files)))
  return results