# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
5 """Utilities for scanning source files to determine code authorship.
11 def FindFiles(input_api
, root_dir
, start_paths_list
, excluded_dirs_list
):
12 """Similar to UNIX utility find(1), searches for files in the directories.
13 Automatically leaves out only source code files and excludes third_party
16 input_api: InputAPI, as in presubmit scripts.
17 root_dir: The root directory, to which all other paths are relative.
18 start_paths_list: The list of paths to start search from. Each path can
19 be a file or a directory.
20 excluded_dirs_list: The list of directories to skip.
22 The list of source code files found, relative to |root_dir|.
  excluded_dirs_list = [d for d in excluded_dirs_list if not 'third_party' in d]
  # Using a common pattern for third-party dirs makes the ignore regexp
  # shorter.
  excluded_dirs_list.append('third_party')
  EXTRA_EXCLUDED_DIRS = [
      # 'Copyright' appears in license agreements
      'chrome/app/resources',
      # Quickoffice js files from internal src used on buildbots.
      'chrome/browser/resources/chromeos/quickoffice',
      # This is a test output directory
      'chrome/tools/test/reference_build',
      # Blink-style copyright headers.
      'content/shell/renderer/test_runner',
      # Blink-style copyright headers.
      'content/shell/tools/plugin',
      # This is a tests directory that doesn't exist in the snapshot.
      # This is a tests directory that doesn't exist in the shipped product.
      # This is a test output directory.
      # This is a tests directory that doesn't exist in the shipped product.
      'tools/perf/page_sets',
      'tools/perf/page_sets/tough_animation_cases',
      # Histogram tools, don't exist in the snapshot.
      # Swarming tools, don't exist in the snapshot.
      'tools/swarming_client',
      # ARM sysroot, doesn't exist in the snapshot.
      'chrome/installer/linux/debian_wheezy_arm-sysroot',
      # Old location (TODO(sbc): Remove this once it no longer exists on any
      # bots).
      # This data is not part of open source Chromium, but is included on
      # some bots.
      # This is not part of open source Chromium, but is included on some
      # bots.
      'skia/tools/clusterfuzz-data'
  ]
  excluded_dirs_list.extend(EXTRA_EXCLUDED_DIRS)
  dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]
  def IsBlacklistedDir(d):
    for item in dirs_blacklist:
      if item in d:
        return True
    return False

  files_whitelist_re = input_api.re.compile(
      r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
      '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
      ')$')
  files = []

  base_path_len = len(root_dir)
  for path in start_paths_list:
    full_path = input_api.os_path.join(root_dir, path)
    if input_api.os_path.isfile(full_path):
      if files_whitelist_re.search(path) and \
          not IsBlacklistedDir(full_path[base_path_len:]):  # Keep '/' prefix.
        files.append(path)
    else:
      for dirpath, dirnames, filenames in input_api.os_walk(full_path):
        # Remove excluded subdirs for faster scanning.
        for item in dirnames[:]:
          if IsBlacklistedDir(
              input_api.os_path.join(dirpath, item)[base_path_len + 1:]):
            dirnames.remove(item)
        for filename in filenames:
          filepath = \
              input_api.os_path.join(dirpath, filename)[base_path_len + 1:]
          if files_whitelist_re.search(filepath) and \
              not IsBlacklistedDir(filepath):
            files.append(filepath)
  return files
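
# A minimal usage sketch (illustrative only; the start paths and the
# 'input_api' object are assumptions, not part of this module). From a
# presubmit check one might call FindFiles roughly like this:
#
#   src_root = input_api.change.RepositoryRoot()
#   files = FindFiles(input_api, src_root, ['android_webview'], [])
#   # |files| now holds source file paths relative to |src_root|, with
#   # third_party and the EXTRA_EXCLUDED_DIRS entries pruned.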


class _GeneratedFilesDetector(object):
  GENERATED_FILE = 'GENERATED FILE'
  NO_COPYRIGHT = '*No copyright*'

  def __init__(self, input_api):
    self.python_multiline_string_double_re = \
      input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
    self.python_multiline_string_single_re = \
      input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
    self.automatically_generated_re = input_api.re.compile(
        r'(All changes made in this file will be lost'
        '|DO NOT (EDIT|delete this file)'
        '|Generated (at|automatically|data)'
        '|Automatically generated'
        '|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

  def IsGeneratedFile(self, header):
    header = header.upper()
    header = self.python_multiline_string_double_re.sub('', header)
    header = self.python_multiline_string_single_re.sub('', header)
    # First do a simple strings lookup to save time.
    if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
      return True
    if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
        'GENERATED' in header:
      return self.automatically_generated_re.search(header)
    return False
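
# Illustrative sketch (hypothetical header strings, not from the original
# code): a header such as
#   '# This file is automatically generated by foo.py. DO NOT EDIT.'
# is expected to make IsGeneratedFile() return a truthy value, while a plain
# Chromium copyright header is not.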


class _CopyrightsScanner(object):
  @staticmethod
  def StaticInit(input_api):
    _CopyrightsScanner._c_comment_re = \
        input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
    _CopyrightsScanner._copyright_indicator = \
        r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
    _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
        r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
        r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
    _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
        r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)

  def __init__(self, input_api):
    self.max_line_numbers_proximity = 3
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100
    self.re = input_api.re

  def _CloseLineNumbers(self, a, b):
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record '(a)' and '(b)' last occurrences in C++ comments.
    # This is to filter out '(c)' used as a list item inside C++ comments.
    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as the _copyright_indicator regexp.
    if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
        and not '\xc2\xa9' in upcase_line:
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      if c_item_index > cpp_comment_idx and \
          self._CloseLineNumbers(line_number,
                                 self.last_b_item_line_number) and \
          self._CloseLineNumbers(self.last_b_item_line_number,
                                 self.last_a_item_line_number):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if m and \
        not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
      copyr = m.group(0)
      # Prettify the authorship string.
      copyr = self.re.sub(r'([,.])?\s*$', '', copyr)
      copyr = self.re.sub(
          _CopyrightsScanner._copyright_indicator, '', copyr, \
          flags=self.re.IGNORECASE)
      copyr = self.re.sub(r'^\s+', '', copyr)
      copyr = self.re.sub(r'\s{2,}', ' ', copyr)
      copyr = self.re.sub(r'\\@', '@', copyr)
    return copyr
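
# Illustrative sketch of MatchLine() behavior (hypothetical inputs; the exact
# output string depends on the regexps above):
#
#   scanner = _CopyrightsScanner(input_api)
#   scanner.MatchLine(1, '// Copyright 2014 The Chromium Authors.')
#   # -> roughly '2014 The Chromium Authors', i.e. the authorship text with
#   #    the 'Copyright' indicator and trailing punctuation stripped.
#   scanner.MatchLine(10, '// (a) first item')   # -> None, records item (a)
#   scanner.MatchLine(11, '// (b) second item')  # -> None, records item (b)
#   scanner.MatchLine(12, '// (c) third item')   # -> None, treated as a list
#                                                #    item rather than '(c)'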


def FindCopyrights(input_api, root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of copyrights associated with each of the files given.
    If a file is generated, the corresponding list consists of a single
    entry -- the 'GENERATED_FILE' string. If a file has no copyright info,
    the corresponding list contains the 'NO_COPYRIGHT' string.
  """
  generated_files_detector = _GeneratedFilesDetector(input_api)
  _CopyrightsScanner.StaticInit(input_api)
  copyrights = []
  for file_name in files_to_scan:
    linenum = 0
    header = []
    file_copyrights = []
    scanner = _CopyrightsScanner(input_api)
    contents = input_api.ReadFile(
        input_api.os_path.join(root_dir, file_name), 'r')
    for l in contents.split('\n'):
      linenum += 1
      if linenum <= 25:
        # Only the first lines of the file are considered its header.
        header.append(l)
      c = scanner.MatchLine(linenum, l)
      if c:
        file_copyrights.append(c)
    if generated_files_detector.IsGeneratedFile('\n'.join(header)):
      copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
    elif file_copyrights:
      copyrights.append(file_copyrights)
    else:
      copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
  return copyrights
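
# Illustrative sketch of the returned structure (hypothetical file names and
# copyright strings):
#
#   FindCopyrights(input_api, root, ['a.cc', 'gen.cc', 'empty.cc'])
#   # -> [['2014 The Chromium Authors. All rights reserved'],
#   #     ['GENERATED FILE'],
#   #     ['*No copyright*']]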


def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = input_api.re.compile(
      r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
      'All rights reserved.*)$')
  for f, cs in itertools.izip(files_to_scan, copyrights):
    if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \
        cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:
      continue
    for c in cs:
      if not allowed_copyrights_re.match(c):
        offending_files.append(input_api.os_path.normpath(f))
        break
  return offending_files
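
# Illustrative sketch (hypothetical copyright strings): the allowed pattern
# above accepts entries such as
#   '2014 The Chromium Authors. All rights reserved'
#   '2012-2014 The Chromium Authors. All rights reserved'
# whereas an entry like '2013 Some Other Company' would mark its file as
# offending.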


def _GetWhitelistFileName(input_api):
  return input_api.os_path.join(
      'android_webview', 'tools', 'third_party_files_whitelist.txt')


def _ProcessWhitelistedFilesList(input_api, lines):
  whitelisted_files = []
  for line in lines:
    match = input_api.re.match(r'([^#\s]+)', line)
    if match:
      whitelisted_files.append(match.group(1))
  return whitelisted_files


def LoadWhitelistedFilesList(input_api):
  """Loads and parses the 3rd party code whitelist file.

  input_api: InputAPI of presubmit scripts.

  Returns:
    The list of whitelisted file names.
  """
  full_file_name = input_api.os_path.join(
      input_api.change.RepositoryRoot(), _GetWhitelistFileName(input_api))
  file_data = input_api.ReadFile(full_file_name, 'rb')
  return _ProcessWhitelistedFilesList(input_api, file_data.splitlines())
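
# A sketch of the whitelist file format, as implied by the parsing regexp in
# _ProcessWhitelistedFilesList (the example entries are hypothetical):
#
#   # Full-line comments and blank lines yield no entries.
#   android_webview/some/file.cc  Optional note about why it is whitelisted.
#   android_webview/other/file.h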


def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
  """Compares whitelist contents with the results of file scanning.

  input_api: InputAPI of presubmit scripts.
  whitelisted_files: Whitelisted files list.
  offending_files: Files that contain 3rd party code.

  Returns:
    A triplet of "unknown", "missing", and "stale" file lists.
    "Unknown" are files that contain 3rd party code but are not whitelisted.
    "Missing" are files that are whitelisted but don't really exist.
    "Stale" are files that are whitelisted unnecessarily.
  """
  unknown = set(offending_files) - set(whitelisted_files)
  missing = [f for f in whitelisted_files if not input_api.os_path.isfile(f)]
  stale = set(whitelisted_files) - set(offending_files) - set(missing)
  return (list(unknown), missing, list(stale))
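
# A minimal sketch of the set arithmetic above (hypothetical paths, assuming
# 'gone.cc' is absent from disk while the other files exist):
#
#   whitelisted_files = ['a.cc', 'b.cc', 'gone.cc']
#   offending_files   = ['a.cc', 'new.cc']
#   # unknown -> ['new.cc']   (offending but not whitelisted)
#   # missing -> ['gone.cc']  (whitelisted but not present on disk)
#   # stale   -> ['b.cc']     (whitelisted, present, no longer offending)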


def _GetDeletedContents(affected_file):
  """Returns a list of all deleted lines.

  The AffectedFile class from presubmit_support is lacking this functionality.
  """
  deleted_lines = []
  for line in affected_file.GenerateScmDiff().splitlines():
    if line.startswith('-') and not line.startswith('--'):
      deleted_lines.append(line[1:])
  return deleted_lines


def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
  # We pass an empty 'known third-party dirs' list here. Since this is a patch
  # for Chromium's src tree, it must contain properly licensed Chromium code.
  # Any third-party code must be put into a directory named 'third_party',
  # and such dirs are automatically excluded by FindFiles.
  files_to_scan = FindFiles(
      input_api, input_api.change.RepositoryRoot(), files_to_check, [])
  offending_files = FindCopyrightViolations(
      input_api, input_api.change.RepositoryRoot(), files_to_scan)
  return AnalyzeScanResults(
      input_api, whitelisted_files, offending_files)


def ScanAtPresubmit(input_api, output_api):
  """Invoked at change presubmit time. Verifies that updated non-third-party
  code doesn't contain external copyrighted code.

  input_api: InputAPI of presubmit scripts.
  output_api: OutputAPI of presubmit scripts.
  """
  files_to_check = set([])
  deleted_files = set([])
  whitelist_contents_changed = False
  for f in input_api.AffectedFiles():
    if f.LocalPath() == _GetWhitelistFileName(input_api):
      whitelist_contents_changed = True
      deleted_files |= set(_ProcessWhitelistedFilesList(
          input_api, _GetDeletedContents(f)))
      continue
    if f.Action() != 'D':
      files_to_check.add(f.LocalPath())
    else:
      deleted_files.add(f.LocalPath())
  whitelisted_files = set(LoadWhitelistedFilesList(input_api))
  if not whitelist_contents_changed:
    whitelisted_files &= files_to_check | deleted_files
  else:
    # Need to re-check the entire contents of the whitelist file.
    # Also add files removed from the whitelist. If a file has indeed been
    # deleted, the scanner will not complain.
    files_to_check |= whitelisted_files | deleted_files

  (unknown_files, missing_files, stale_files) = _DoScanAtPresubmit(
      input_api, list(whitelisted_files), list(files_to_check))
  results = []
  if unknown_files:
    results.append(output_api.PresubmitError(
        'The following files contain a third-party license but are not in ' \
        'a listed third-party directory and are not whitelisted. You must ' \
        'add the following files to the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(unknown_files)))
  if missing_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted in %s, but do not exist or ' \
        'are not files:' % _GetWhitelistFileName(input_api),
        sorted(missing_files)))
  if stale_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted unnecessarily. You must ' \
        'remove the following files from the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(stale_files)))
  return results
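
# A minimal PRESUBMIT.py hook-up sketch (assumed integration, not part of this
# module): a presubmit script would typically just forward to ScanAtPresubmit
# and return its results, e.g.
#
#   def CheckChangeOnUpload(input_api, output_api):
#     return ScanAtPresubmit(input_api, output_api)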