Source-file scanning utilities for determining code authorship (copyright scanner).
[chromium-blink-merge.git] / android_webview / tools / copyright_scanner.py
blob be21231ec2ff41a5937ecdb42c0a45e72f853461
1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 """Utilities for scanning source files to determine code authorship.
6 """
8 import itertools
def ForwardSlashesToOsPathSeps(input_api, path):
  """Rewrites a '/'-separated relative path using host OS path separators.

  Used when paths come from outside (e.g. the whitelist file) and use UNIX
  path separators. Only works for relative paths!

  Args:
    input_api: InputAPI, as in presubmit scripts.
    path: The '/'-separated path to convert.

  Returns:
    The path joined with OS-specific separators.
  """
  components = path.split('/')
  return input_api.os_path.join(*components)
def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.
  Automatically leaves out only source code files and excludes third_party
  directories.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip.

  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  excluded_dirs_list = [d for d in excluded_dirs_list if not 'third_party' in d]
  # Using a common pattern for third-parties makes the ignore regexp shorter
  excluded_dirs_list.append('third_party')

  path_join = input_api.os_path.join
  EXTRA_EXCLUDED_DIRS = [
      # VCS dirs
      path_join('.git'),
      path_join('.svn'),
      # Build output
      path_join('out', 'Debug'),
      path_join('out', 'Release'),
      # 'Copyright' appears in license agreements
      path_join('chrome', 'app', 'resources'),
      # Quickoffice js files from internal src used on buildbots.
      # crbug.com/350472.
      path_join('chrome', 'browser', 'resources', 'chromeos', 'quickoffice'),
      # This is a test output directory
      path_join('chrome', 'tools', 'test', 'reference_build'),
      # blink style copy right headers.
      path_join('content', 'shell', 'renderer', 'test_runner'),
      # blink style copy right headers.
      path_join('content', 'shell', 'tools', 'plugin'),
      # This is tests directory, doesn't exist in the snapshot
      path_join('content', 'test', 'data'),
      # This is a tests directory that doesn't exist in the shipped product.
      path_join('gin', 'test'),
      # This is a test output directory
      path_join('data', 'dom_perf'),
      # This is a tests directory that doesn't exist in the shipped product.
      path_join('tools', 'perf', 'page_sets'),
      path_join('tools', 'perf', 'page_sets', 'tough_animation_cases'),
      # Histogram tools, doesn't exist in the snapshot
      path_join('tools', 'histograms'),
      # Swarming tools, doesn't exist in the snapshot
      path_join('tools', 'swarming_client'),
      # ARM sysroot, doesn't exist in the snapshot
      path_join('chrome', 'installer', 'linux', 'debian_wheezy_arm-sysroot'),
      # Old location (TODO(sbc): Remove this once it no longer exists on any
      # bots)
      path_join('arm-sysroot'),
      # Data is not part of open source chromium, but are included on some bots.
      path_join('data'),
      # This is not part of open source chromium, but are included on some
      # bots.
      path_join('skia', 'tools', 'clusterfuzz-data'),
  ]
  excluded_dirs_list.extend(EXTRA_EXCLUDED_DIRS)

  # Surround the directory names with OS path separators, so a blacklist
  # entry can only match a complete path component (e.g. '/out/' does not
  # match 'layout'). This means candidates passed to IsBlacklistedDir must
  # carry a leading separator for their first component to be matchable.
  dirs_blacklist = [path_join('.', d, '')[1:] for d in excluded_dirs_list if d]
  def IsBlacklistedDir(d):
    for item in dirs_blacklist:
      if item in d:
        return True
    return False

  files_whitelist_re = input_api.re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      '|tex|mli?)$')
  files = []

  base_path_len = len(root_dir)
  for path in start_paths_list:
    full_path = path_join(root_dir, path)
    if input_api.os_path.isfile(full_path):
      if files_whitelist_re.search(path) and \
          not IsBlacklistedDir(full_path[base_path_len:]):  # Keep '/' prefix.
        files.append(path)
    else:
      for dirpath, dirnames, filenames in input_api.os_walk(full_path):
        # Remove excluded subdirs for faster scanning. As in the branch
        # above, keep the leading separator (and add a trailing one) so the
        # surrounded blacklist entries can match the first path component;
        # without it, pruning never triggered.
        for item in dirnames[:]:
          if IsBlacklistedDir(
              path_join(dirpath, item, '')[base_path_len:]):
            dirnames.remove(item)
        for filename in filenames:
          filepath = \
              path_join(dirpath, filename)[base_path_len + 1:]
          # Blacklist check uses the '/'-prefixed form for the same reason.
          if files_whitelist_re.search(filepath) and \
              not IsBlacklistedDir(
                  path_join(dirpath, filename)[base_path_len:]):
            files.append(filepath)
  return files
class _GeneratedFilesDetector(object):
  """Decides, from a file's first lines, whether the file is generated.

  The GENERATED_FILE / NO_COPYRIGHT markers are used by FindCopyrights as
  the single entry in a file's copyrights list.
  """
  GENERATED_FILE = 'GENERATED FILE'
  NO_COPYRIGHT = '*No copyright*'

  def __init__(self, input_api):
    # Matches Python triple-quoted strings so docstrings that merely talk
    # about generated files can be stripped before scanning.
    self.python_multiline_string_double_re = \
        input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
    self.python_multiline_string_single_re = \
        input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
    # Raw strings: non-raw, '\W'/'\s'/'\w' are invalid string escapes
    # (deprecated in Python 3) even though they happened to pass through.
    self.automatically_generated_re = input_api.re.compile(
        r'(All changes made in this file will be lost'
        r'|DO NOT (EDIT|delete this file)'
        r'|Generated (at|automatically|data)'
        r'|Automatically generated'
        r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

  def IsGeneratedFile(self, header):
    """Returns True if |header| looks like the head of a generated file."""
    header = header.upper()
    if '"""' in header:
      header = self.python_multiline_string_double_re.sub('', header)
    if "'''" in header:
      header = self.python_multiline_string_single_re.sub('', header)
    # First do simple strings lookup to save time.
    if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
      return True
    if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
        'GENERATED' in header:
      # Return an actual bool; the original leaked a Match object / None.
      return self.automatically_generated_re.search(header) is not None
    return False
class _CopyrightsScanner(object):
  """Extracts copyright notices from source lines, one line at a time."""

  @staticmethod
  def StaticInit(input_api):
    # Class-level patterns shared by all scanner instances; compiled once.
    # Matches C-style string literals, to erase them before scanning.
    _CopyrightsScanner._c_comment_re = \
        input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
    # 'copyright', 'copr.', the UTF-8 encoded (c) sign, or '(c)'.
    _CopyrightsScanner._copyright_indicator = \
        r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
    _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
        r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
        r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
    # Words that mean the line only talks about copyright ("copyright
    # information", "copyright notice") rather than stating authorship.
    _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
        r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)

  def __init__(self, input_api):
    # '(c)' is treated as a list item only if '(a)'/'(b)' occurred at most
    # this many lines above it.
    self.max_line_numbers_proximity = 3
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100
    self.re = input_api.re

  def _CloseLineNumbers(self, a, b):
    # True iff |a| is at |b| or up to max_line_numbers_proximity lines below.
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    """Scans one line for a copyright notice.

    Args:
      line_number: 1-based line number, used for list-item proximity checks.
      line: The line contents.

    Returns:
      The prettified authorship string, or None if the line has none.
    """
    if '"' in line:
      line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record '(a)' and '(b)' last occurences in C++ comments.
    # This is to filter out '(c)' used as a list item inside C++ comments.
    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as _copyright_indicator regexp.
    if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
        and not '\xc2\xa9' in upcase_line:
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      if c_item_index > cpp_comment_idx and \
          self._CloseLineNumbers(line_number,
                                 self.last_b_item_line_number) and \
          self._CloseLineNumbers(self.last_b_item_line_number,
                                 self.last_a_item_line_number):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if m and \
        not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
      copyr = m.group(0)
      # Prettify the authorship string.
      # Strip trailing punctuation/whitespace. The original pattern ended
      # with a stray '/' (a Perl s/.../.../ leftover): '$' followed by a
      # literal '/' can never match, so this sub was a no-op.
      copyr = self.re.sub(r'([,.])?\s*$', '', copyr)
      copyr = self.re.sub(
          _CopyrightsScanner._copyright_indicator, '', copyr, \
          flags=self.re.IGNORECASE)
      copyr = self.re.sub(r'^\s+', '', copyr)
      copyr = self.re.sub(r'\s{2,}', ' ', copyr)
      copyr = self.re.sub(r'\\@', '@', copyr)
    return copyr
def FindCopyrights(input_api, root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of copyrights associated with each of the files given.
    If the certain file is generated, the corresponding list consists of a
    single entry -- the 'GENERATED_FILE' string. If the file has no
    copyright info, the corresponding list contains the 'NO_COPYRIGHT'
    string.
  """
  generated_files_detector = _GeneratedFilesDetector(input_api)
  _CopyrightsScanner.StaticInit(input_api)
  copyrights = []
  for file_name in files_to_scan:
    header = []
    file_copyrights = []
    scanner = _CopyrightsScanner(input_api)
    contents = input_api.ReadFile(
        input_api.os_path.join(root_dir, file_name), 'r')
    for linenum, l in enumerate(contents.split('\n'), start=1):
      # Only the first 25 lines are considered for generated-file detection.
      if linenum <= 25:
        header.append(l)
      c = scanner.MatchLine(linenum, l)
      if c:
        file_copyrights.append(c)
    if generated_files_detector.IsGeneratedFile('\n'.join(header)):
      copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
    elif file_copyrights:
      copyrights.append(file_copyrights)
    else:
      copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
  return copyrights
def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = input_api.re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      'All rights reserved.*)$')
  # Builtin zip (unlike the Python-2-only itertools.izip) works everywhere.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \
        cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:
      continue
    for c in cs:
      if not allowed_copyrights_re.match(c):
        offending_files.append(input_api.os_path.normpath(f))
        break
  return offending_files
def _GetWhitelistFileName(input_api):
  # Path of the third-party whitelist file, relative to the repository root.
  whitelist_components = (
      'android_webview', 'tools', 'third_party_files_whitelist.txt')
  return input_api.os_path.join(*whitelist_components)
def _ProcessWhitelistedFilesList(input_api, lines):
  # A whitelist entry is the leading non-space, non-'#' token of a line;
  # lines that start with whitespace or '#' contribute nothing.
  whitelisted_files = []
  for line in lines:
    entry = input_api.re.match(r'([^#\s]+)', line)
    if not entry:
      continue
    whitelisted_files.append(
        ForwardSlashesToOsPathSeps(input_api, entry.group(1)))
  return whitelisted_files
def LoadWhitelistedFilesList(input_api):
  """Loads and parses the 3rd party code whitelist file.

  Args:
    input_api: InputAPI of presubmit scripts.

  Returns:
    The list of whitelisted file names.
  """
  full_file_name = input_api.os_path.join(
      input_api.change.RepositoryRoot(), _GetWhitelistFileName(input_api))
  file_data = input_api.ReadFile(full_file_name, 'rb')
  return _ProcessWhitelistedFilesList(input_api, file_data.splitlines())
def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
  """Compares whitelist contents with the results of file scanning.

  Args:
    input_api: InputAPI of presubmit scripts.
    whitelisted_files: Whitelisted files list.
    offending_files: Files that contain 3rd party code.

  Returns:
    A triplet of "unknown", "missing", and "stale" file lists.
    "Unknown" are files that contain 3rd party code but are not whitelisted.
    "Missing" are files that are whitelisted but don't really exist.
    "Stale" are files that are whitelisted unnecessarily.
  """
  unknown = set(offending_files) - set(whitelisted_files)
  missing = [f for f in whitelisted_files if not input_api.os_path.isfile(f)]
  stale = set(whitelisted_files) - set(offending_files) - set(missing)
  return (list(unknown), missing, list(stale))
def _GetDeletedContents(affected_file):
  """Returns a list of all deleted lines.

  AffectedFile class from presubmit_support is lacking this functionality.

  Args:
    affected_file: AffectedFile instance whose SCM diff is examined.

  Returns:
    The deleted lines' contents, without the leading '-'.
  """
  deleted_lines = []
  for line in affected_file.GenerateScmDiff().splitlines():
    # '-' marks a deleted line; '--' would be the '--- a/...' diff header.
    if line.startswith('-') and not line.startswith('--'):
      deleted_lines.append(line[1:])
  return deleted_lines
def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
  # An empty 'known third-party' dirs list is passed to FindFiles. Since
  # this is a patch for the Chromium's src tree, it must contain properly
  # licensed Chromium code. Any third-party code must be put into a
  # directory named 'third_party', and such dirs are automatically excluded
  # by FindFiles.
  scan_candidates = FindFiles(
      input_api, input_api.change.RepositoryRoot(), files_to_check, [])
  violations = FindCopyrightViolations(
      input_api, input_api.change.RepositoryRoot(), scan_candidates)
  return AnalyzeScanResults(input_api, whitelisted_files, violations)
def ScanAtPresubmit(input_api, output_api):
  """Invoked at change presubmit time. Verifies that updated non third-party
  code doesn't contain external copyrighted code.

  Args:
    input_api: InputAPI of presubmit scripts.
    output_api: OutputAPI of presubmit scripts.

  Returns:
    A list of presubmit errors/warnings describing whitelist problems.
  """
  files_to_check = set([])
  deleted_files = set([])
  whitelist_contents_changed = False
  for f in input_api.AffectedFiles():
    if f.LocalPath() == _GetWhitelistFileName(input_api):
      whitelist_contents_changed = True
      # Entries deleted from the whitelist are re-checked below, so stale
      # whitelisting is caught even when the file itself is untouched.
      deleted_files |= set(_ProcessWhitelistedFilesList(
          input_api, _GetDeletedContents(f)))
      continue
    if f.Action() != 'D':
      files_to_check.add(f.LocalPath())
    else:
      deleted_files.add(f.LocalPath())
  whitelisted_files = set(LoadWhitelistedFilesList(input_api))
  if not whitelist_contents_changed:
    # Restrict the whitelist to the files touched by this change.
    whitelisted_files &= files_to_check | deleted_files
  else:
    # Need to re-check the entire contents of the whitelist file.
    # Also add files removed from the whitelist. If the file has indeed been
    # deleted, the scanner will not complain.
    files_to_check |= whitelisted_files | deleted_files

  (unknown_files, missing_files, stale_files) = _DoScanAtPresubmit(
      input_api, list(whitelisted_files), list(files_to_check))
  results = []
  if unknown_files:
    results.append(output_api.PresubmitError(
        'The following files contain a third-party license but are not in ' \
        'a listed third-party directory and are not whitelisted. You must ' \
        'add the following files to the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(unknown_files)))
  if missing_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted in %s, ' \
        'but do not exist or not files:' % _GetWhitelistFileName(input_api),
        sorted(missing_files)))
  if stale_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted unnecessarily. You must ' \
        'remove the following files from the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(stale_files)))
  return results