1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 """Utilities for scanning source files to determine code authorship.
def ForwardSlashesToOsPathSeps(input_api, path):
  """Rewrites a '/'-separated path using the OS-specific path separator.

  Meant for paths that come from outside (written with UNIX separators).
  Only works for relative paths!

  Args:
    input_api: InputAPI, as in presubmit scripts.
    path: The path to convert.
  Returns:
    The '/'-delimited components of |path| re-joined with
    input_api.os_path.join.
  """
  components = path.split('/')
  return input_api.os_path.join(*components)
def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.

  Keeps only files whose extension is in the source-code whitelist and skips
  third_party directories as well as a built-in list of non-source
  directories.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip. The list passed in
      is not modified.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  path_join = input_api.os_path.join

  # Using one common 'third_party' entry instead of every caller-supplied
  # third-party directory keeps the blacklist shorter.
  excluded_dirs = [d for d in excluded_dirs_list if 'third_party' not in d]
  excluded_dirs.append('third_party')

  # NOTE(review): the entries marked "confirm" below and the '|tex|mli?)$'
  # tail of the extension regexp were not readable in the reviewed copy of
  # this file; verify them against the checked-in version.
  EXTRA_EXCLUDED_DIRS = [
    # VCS dirs
    path_join('.git'),  # confirm
    path_join('.svn'),  # confirm
    # Build output
    path_join('out', 'Debug'),
    path_join('out', 'Release'),
    # 'Copyright' appears in license agreements
    path_join('chrome', 'app', 'resources'),
    # Quickoffice js files from internal src used on buildbots.
    path_join('chrome', 'browser', 'resources', 'chromeos', 'quickoffice'),
    # This is a test output directory
    path_join('chrome', 'tools', 'test', 'reference_build'),
    # blink style copyright headers.
    path_join('content', 'shell', 'renderer', 'test_runner'),
    # blink style copyright headers.
    path_join('content', 'shell', 'tools', 'plugin'),
    # This is tests directory, doesn't exist in the snapshot
    path_join('content', 'test', 'data'),
    # This is a tests directory that doesn't exist in the shipped product.
    path_join('gin', 'test'),
    # This is a test output directory
    path_join('data', 'dom_perf'),
    # This is a tests directory that doesn't exist in the shipped product.
    path_join('tools', 'perf', 'page_sets'),
    path_join('tools', 'perf', 'page_sets', 'tough_animation_cases'),
    # Histogram tools, doesn't exist in the snapshot
    path_join('tools', 'histograms'),
    # Swarming tools, doesn't exist in the snapshot
    path_join('tools', 'swarming_client'),
    # ARM sysroot, doesn't exist in the snapshot
    path_join('chrome', 'installer', 'linux', 'debian_wheezy_arm-sysroot'),
    # Old location (TODO(sbc): Remove this once it no longer exists on any bots)
    path_join('arm-sysroot'),
    # Data is not part of open source chromium, but are included on some bots.
    path_join('data'),  # confirm
    # This is not part of open source chromium, but are included on some bots.
    path_join('skia', 'tools', 'clusterfuzz-data'),
  ]
  excluded_dirs.extend(EXTRA_EXCLUDED_DIRS)

  # Surround the directory names with OS path separators: './name/' minus the
  # leading '.' gives '<sep>name<sep>'.
  dirs_blacklist = [path_join('.', d, '')[1:] for d in excluded_dirs if d]

  def IsBlacklistedDir(d):
    """Returns True if |d| contains any separator-wrapped excluded dir."""
    return any(item in d for item in dirs_blacklist)

  files_whitelist_re = input_api.re.compile(
    r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
    r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
    r'|tex|mli?)$')
  files = []

  base_path_len = len(root_dir)
  for path in start_paths_list:
    full_path = path_join(root_dir, path)
    if input_api.os_path.isfile(full_path):
      # The slice starts at base_path_len (not + 1) to keep the separator
      # prefix, so a blacklist entry can match the first path component.
      # Assumes root_dir has no trailing separator -- TODO confirm.
      if (files_whitelist_re.search(path) and
          not IsBlacklistedDir(full_path[base_path_len:])):
        files.append(path)
      continue
    for dirpath, dirnames, filenames in input_api.os_walk(full_path):
      # Remove excluded subdirs for faster scanning. Iterate over a copy
      # because |dirnames| is edited in place to steer the walk.
      for item in dirnames[:]:
        if IsBlacklistedDir(path_join(dirpath, item)[base_path_len + 1:]):
          dirnames.remove(item)
      for filename in filenames:
        filepath = path_join(dirpath, filename)[base_path_len + 1:]
        if (files_whitelist_re.search(filepath) and
            not IsBlacklistedDir(filepath)):
          files.append(filepath)
  return files
119 class _GeneratedFilesDetector(object):
120 GENERATED_FILE
= 'GENERATED FILE'
121 NO_COPYRIGHT
= '*No copyright*'
123 def __init__(self
, input_api
):
124 self
.python_multiline_string_double_re
= \
125 input_api
.re
.compile(r
'"""[^"]*(?:"""|$)', flags
=input_api
.re
.MULTILINE
)
126 self
.python_multiline_string_single_re
= \
127 input_api
.re
.compile(r
"'''[^']*(?:'''|$)", flags
=input_api
.re
.MULTILINE
)
128 self
.automatically_generated_re
= input_api
.re
.compile(
129 r
'(All changes made in this file will be lost'
130 '|DO NOT (EDIT|delete this file)'
131 '|Generated (at|automatically|data)'
132 '|Automatically generated'
133 '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags
=input_api
.re
.IGNORECASE
)
135 def IsGeneratedFile(self
, header
):
136 header
= header
.upper()
138 header
= self
.python_multiline_string_double_re
.sub('', header
)
140 header
= self
.python_multiline_string_single_re
.sub('', header
)
141 # First do simple strings lookup to save time.
142 if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header
:
144 if 'DO NOT EDIT' in header
or 'DO NOT DELETE' in header
or \
145 'GENERATED' in header
:
146 return self
.automatically_generated_re
.search(header
)
150 class _CopyrightsScanner(object):
152 def StaticInit(input_api
):
153 _CopyrightsScanner
._c
_comment
_re
= \
154 input_api
.re
.compile(r
'''"[^"\\]*(?:\\.[^"\\]*)*"''')
155 _CopyrightsScanner
._copyright
_indicator
= \
156 r
'(?:copyright|copr\.|\xc2\xa9|\(c\))'
157 _CopyrightsScanner
._full
_copyright
_indicator
_re
= input_api
.re
.compile(
158 r
'(?:\W|^)' + _CopyrightsScanner
._copyright
_indicator
+ \
159 r
'(?::\s*|\s+)(\w.*)$', input_api
.re
.IGNORECASE
)
160 _CopyrightsScanner
._copyright
_disindicator
_re
= input_api
.re
.compile(
161 r
'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api
.re
.IGNORECASE
)
163 def __init__(self
, input_api
):
164 self
.max_line_numbers_proximity
= 3
165 self
.last_a_item_line_number
= -200
166 self
.last_b_item_line_number
= -100
167 self
.re
= input_api
.re
169 def _CloseLineNumbers(self
, a
, b
):
170 return 0 <= a
- b
<= self
.max_line_numbers_proximity
172 def MatchLine(self
, line_number
, line
):
174 line
= _CopyrightsScanner
._c
_comment
_re
.sub('', line
)
175 upcase_line
= line
.upper()
176 # Record '(a)' and '(b)' last occurences in C++ comments.
177 # This is to filter out '(c)' used as a list item inside C++ comments.
178 # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
179 cpp_comment_idx
= upcase_line
.find('//')
180 if cpp_comment_idx
!= -1:
181 if upcase_line
.find('(A)') > cpp_comment_idx
:
182 self
.last_a_item_line_number
= line_number
183 if upcase_line
.find('(B)') > cpp_comment_idx
:
184 self
.last_b_item_line_number
= line_number
185 # Fast bailout, uses the same patterns as _copyright_indicator regexp.
186 if not 'COPYRIGHT' in upcase_line
and not 'COPR.' in upcase_line \
187 and not '\xc2\xa9' in upcase_line
:
188 c_item_index
= upcase_line
.find('(C)')
189 if c_item_index
== -1:
191 if c_item_index
> cpp_comment_idx
and \
192 self
._CloseLineNumbers
(line_number
,
193 self
.last_b_item_line_number
) and \
194 self
._CloseLineNumbers
(self
.last_b_item_line_number
,
195 self
.last_a_item_line_number
):
198 m
= _CopyrightsScanner
._full
_copyright
_indicator
_re
.search(line
)
200 not _CopyrightsScanner
._copyright
_disindicator
_re
.match(m
.group(1)):
202 # Prettify the authorship string.
203 copyr
= self
.re
.sub(r
'([,.])?\s*$/', '', copyr
)
205 _CopyrightsScanner
._copyright
_indicator
, '', copyr
, \
206 flags
=self
.re
.IGNORECASE
)
207 copyr
= self
.re
.sub(r
'^\s+', '', copyr
)
208 copyr
= self
.re
.sub(r
'\s{2,}', ' ', copyr
)
209 copyr
= self
.re
.sub(r
'\\@', '@', copyr
)
def FindCopyrights(input_api, root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    A list with one entry per file in |files_to_scan|, in the same order.
    Each entry is the list of copyright strings found in that file. For a
    generated file the entry is [_GeneratedFilesDetector.GENERATED_FILE];
    for a file without copyright info it is
    [_GeneratedFilesDetector.NO_COPYRIGHT].
  """
  generated_files_detector = _GeneratedFilesDetector(input_api)
  _CopyrightsScanner.StaticInit(input_api)
  # NOTE(review): the number of leading lines treated as the file header was
  # not readable in the reviewed copy of this file; 25 is assumed -- confirm.
  header_line_count = 25
  copyrights = []
  for file_name in files_to_scan:
    # The scanner is stateful, so every file gets its own instance.
    scanner = _CopyrightsScanner(input_api)
    contents = input_api.ReadFile(
        input_api.os_path.join(root_dir, file_name), 'r')
    lines = contents.split('\n')
    file_copyrights = []
    for linenum, text in enumerate(lines, 1):
      found = scanner.MatchLine(linenum, text)
      if found:
        file_copyrights.append(found)
    header = '\n'.join(lines[:header_line_count])
    # A generated-file marker takes precedence over any copyrights found.
    if generated_files_detector.IsGeneratedFile(header):
      copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
    elif file_copyrights:
      copyrights.append(file_copyrights)
    else:
      copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
  return copyrights
def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of file names that contain non-Chromium copyrights, each passed
    through input_api.os_path.normpath.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  allowed_copyrights_re = input_api.re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      r'All rights reserved.*)$')
  # Marker entries that mean "there is nothing to check for this file".
  not_applicable = (_GeneratedFilesDetector.GENERATED_FILE,
                    _GeneratedFilesDetector.NO_COPYRIGHT)
  offending_files = []
  # The builtin zip works on both Python 2 and 3, unlike itertools.izip.
  for f, cs in zip(files_to_scan, copyrights):
    if not cs or cs[0] in not_applicable:
      continue
    # One disallowed copyright is enough to flag the file (once).
    if any(not allowed_copyrights_re.match(c) for c in cs):
      offending_files.append(input_api.os_path.normpath(f))
  return offending_files
276 def _GetWhitelistFileName(input_api
):
277 return input_api
.os_path
.join(
278 'android_webview', 'tools', 'third_party_files_whitelist.txt')
def _ProcessWhitelistedFilesList(input_api, lines):
  """Extracts file paths from the lines of a whitelist file.

  From each line, the leading run of characters that are neither '#' nor
  whitespace is taken as a path. Lines that start with '#' or whitespace,
  and empty lines, yield nothing.

  Args:
    input_api: InputAPI of presubmit scripts.
    lines: The lines of the whitelist file.
  Returns:
    The list of paths, converted to OS-specific path separators.
  """
  entry_re = input_api.re.compile(r'([^#\s]+)')
  matches = (entry_re.match(text) for text in lines)
  return [ForwardSlashesToOsPathSeps(input_api, m.group(1))
          for m in matches if m]
def LoadWhitelistedFilesList(input_api):
  """Loads and parses the 3rd party code whitelist file.

  Args:
    input_api: InputAPI of presubmit scripts.
  Returns:
    The list of whitelisted file paths.
  """
  repository_root = input_api.change.RepositoryRoot()
  full_file_name = input_api.os_path.join(
      repository_root, _GetWhitelistFileName(input_api))
  whitelist_lines = input_api.ReadFile(full_file_name, 'rb').splitlines()
  return _ProcessWhitelistedFilesList(input_api, whitelist_lines)
def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
  """Compares whitelist contents with the results of file scanning.

  Args:
    input_api: InputAPI of presubmit scripts.
    whitelisted_files: Whitelisted files list.
    offending_files: Files that contain 3rd party code.
  Returns:
    A triplet of "unknown", "missing", and "stale" file lists.
    "Unknown" are files that contain 3rd party code but are not whitelisted.
    "Missing" are files that are whitelisted but do not exist as files.
    "Stale" are files that are whitelisted unnecessarily.
  """
  whitelisted = set(whitelisted_files)
  offending = set(offending_files)
  is_file = input_api.os_path.isfile
  # Built from the list, so "missing" keeps the whitelist order.
  missing = [name for name in whitelisted_files if not is_file(name)]
  unknown = offending - whitelisted
  # A missing file is reported as missing only, never also as stale.
  stale = whitelisted - offending - set(missing)
  return (list(unknown), missing, list(stale))
319 def _GetDeletedContents(affected_file
):
320 """Returns a list of all deleted lines.
321 AffectedFile class from presubmit_support is lacking this functionality.
324 for line
in affected_file
.GenerateScmDiff().splitlines():
325 if line
.startswith('-') and not line
.startswith('--'):
326 deleted_lines
.append(line
[1:])
def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
  """Scans the given files and compares the outcome with the whitelist.

  Args:
    input_api: InputAPI of presubmit scripts.
    whitelisted_files: Whitelisted files list.
    files_to_check: Paths (relative to the repository root) to scan.
  Returns:
    The triplet produced by AnalyzeScanResults.
  """
  repository_root = input_api.change.RepositoryRoot()
  # We pass empty 'known third-party' dirs list here. Since this is a patch
  # for the Chromium's src tree, it must contain properly licensed Chromium
  # code. Any third-party code must be put into a directory named 'third_party',
  # and such dirs are automatically excluded by FindFiles.
  no_known_third_party_dirs = []
  files_to_scan = FindFiles(
      input_api, repository_root, files_to_check, no_known_third_party_dirs)
  offending_files = FindCopyrightViolations(
      input_api, repository_root, files_to_scan)
  return AnalyzeScanResults(input_api, whitelisted_files, offending_files)
def ScanAtPresubmit(input_api, output_api):
  """Invoked at change presubmit time. Verifies that updated non third-party
  code doesn't contain external copyrighted code.

  Args:
    input_api: InputAPI of presubmit scripts.
    output_api: OutputAPI of presubmit scripts.
  Returns:
    A list of results built with output_api; empty if nothing was found.
  """
  whitelist_name = _GetWhitelistFileName(input_api)
  files_to_check = set()
  deleted_files = set()
  whitelist_contents_changed = False
  for f in input_api.AffectedFiles():
    local_path = f.LocalPath()
    if local_path == whitelist_name:
      # The whitelist itself is not scanned; entries removed from it are
      # handled like deleted files.
      whitelist_contents_changed = True
      deleted_files.update(_ProcessWhitelistedFilesList(
          input_api, _GetDeletedContents(f)))
    elif f.Action() == 'D':
      deleted_files.add(local_path)
    else:
      files_to_check.add(local_path)

  whitelisted_files = set(LoadWhitelistedFilesList(input_api))
  if whitelist_contents_changed:
    # Need to re-check the entire contents of the whitelist file.
    # Also add files removed from the whitelist. If the file has indeed been
    # deleted, the scanner will not complain.
    files_to_check |= whitelisted_files | deleted_files
  else:
    # Only whitelist entries touched by this change are relevant.
    whitelisted_files.intersection_update(files_to_check | deleted_files)

  (unknown_files, missing_files, stale_files) = _DoScanAtPresubmit(
      input_api, list(whitelisted_files), list(files_to_check))

  results = []
  if unknown_files:
    results.append(output_api.PresubmitError(
        'The following files contain a third-party license but are not in ' \
        'a listed third-party directory and are not whitelisted. You must ' \
        'add the following files to the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(unknown_files)))
  if missing_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted in %s, ' \
        'but do not exist or not files:' % _GetWhitelistFileName(input_api),
        sorted(missing_files)))
  if stale_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted unnecessarily. You must ' \
        'remove the following files from the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(stale_files)))
  return results