Added tag v0-5-20090608 for changeset 19e9fdde919e
[Melange.git] / scripts / munge.py
blob169220ee6b10f930ab1ac9aa933c6b8134e92dbe
1 #!/usr/bin/python2.5
3 # Copyright 2008 the Melange authors.
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
9 # http://www.apache.org/licenses/LICENSE-2.0
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
17 # __doc__ string is slightly unconventional because it is used as usage text
18 """%prog [OPTIONS] [FIND_REGEX] [REPLACE_FORMAT]
20 Script to list, search, and modify files using Python regex patterns.
22 OPTIONS: optional command-line flags; see %prog --help
24 FIND_REGEX: an optional valid Python regular expression pattern;
25 if supplied, only files containing at least one match will be processed;
26 matching file paths will be printed; if supplied, REPLACE_FORMAT will be
27 used to convert the match groups into formatted output.
29 REPLACE_FORMAT: an optional valid Python format string;
30 FIND_REGEX must be supplied first if REPLACE_FORMAT is supplied;
31 positional arguments will be replaced with ordered groups from
32 FIND_REGEX matches, and named arguments will be replaced with named
33 groups from FIND_REGEX matches."""
35 __authors__ = [
36 '"Todd Larsen" <tlarsen@google.com>',
40 import dircache
41 import errno
42 import os
43 import optparse
44 import re
45 import sre_constants
46 import sys
class Error(Exception):
  """Root exception type for every failure reported by this module.

  Within this file it is raised with two positional arguments: an errno-style
  integer code followed by a message string.
  """
def compileRegex(pattern):
  """Compiles a Python regex pattern into a regex object.

  Args:
    pattern: valid Python regex pattern string, or an already-compiled
      regex object (in which case this function is a no-op)

  Returns:
    regex object compiled from pattern

  Raises:
    Error (with code errno.EINVAL) if pattern could not be compiled.
  """
  try:
    return re.compile(pattern)
  except re.error:
    # sys.exc_info() is used rather than "except ..., error" (Python 2 only)
    # or "except ... as error" (Python 2.6+ only) because it is the one
    # spelling accepted by both Python 2.5 and Python 3.
    error = sys.exc_info()[1]
    msg = 're.compile: %s\n%s' % (error.args[0], pattern)
    raise Error(errno.EINVAL, msg)
def findAll(text_to_search, pattern):
  """Returns all matches of a regex in a string.

  Args:
    text_to_search: string in which to find matches
    pattern: Python regex pattern (or already-compiled regex object)
      indicating which matches to retrieve

  Returns:
    a (possibly empty) list of the full text of each non-overlapping match,
    as strings, in the order the matches occur in text_to_search

  Raises:
    Error if pattern could not be compiled (see compileRegex()).
  """
  regex = compileRegex(pattern)

  # finditer() walks the matches directly; no substituted copy of the text
  # is built just to be thrown away
  return [match.group() for match in regex.finditer(text_to_search)]
def getFileContents(file_path):
  """Reads the contents of a file as a single string, then closes the file.

  Args:
    file_path: path to the file to read its contents into a string

  Returns:
    a single string containing the entire contents of the file

  Raises:
    IOError or OSError if the file cannot be opened or read.
  """
  file_to_read = open(file_path)

  try:
    return file_to_read.read()
  finally:
    # close the handle even when read() raises, so it is never leaked
    file_to_read.close()
def findAllInFile(file_path, pattern, *ignored_args, **ignored_kwargs):
  """Action that collects every match of a pattern in one file.

  Args:
    file_path: path of the file whose contents are searched
    pattern: see findAll()
    *ignored_args: extra positional (command-line) arguments; accepted so
      that all action callables share a signature, but not used here
    **ignored_kwargs: extra keyword (command-line) options; accepted so
      that all action callables share a signature, but not used here

  Returns:
    two-tuple of a boolean that is True if at least one match was found,
    and the (possibly empty) list of matched strings (used as the printable
    output of the action)
  """
  file_text = getFileContents(file_path)
  matches = findAll(file_text, pattern)
  return bool(matches), matches
def replaceAll(original, pattern, format):
  """Substitutes formatted text for all matches in a string.

  Args:
    original: original string in which to find and replace matches
    pattern: Python regex pattern (or already-compiled regex object)
      indicating which matches to replace
    format: Python format string specifying how to format the
      replacement text; how this format string is interpreted depends
      on the contents of the pattern; if the pattern contains:
        named groups: format is expected to contain named format specifiers
        unnamed groups: format is expected to contain exactly the same
          number of unnamed format specifiers as the number of groups in
          pattern
        no groups: format is expected to contain a single format specifier
          (in which case the entire match is supplied to it), or no format
          specifier at all (in which case the "format" string simply
          replaces the match with no substitutions from the match itself)

  Returns:
    two-tuple of the text with all matches replaced as specified by
    pattern and format, and a list of the original matches, each followed
    by its replacement

  Raises:
    Error if pattern could not be compiled (see compileRegex()).
  """
  matches_and_replacements = []

  def _replaceWithFormat(match):
    """Formats one match, trying named groups, then groups, then the match."""
    formatted_match = None

    if match.groupdict():
      try:
        formatted_match = format % match.groupdict()
      except TypeError:
        pass  # format does not fit the named groups; try the next form

    # "is None" rather than "not formatted_match": an empty string is a
    # legitimate replacement (for example '%s' applied to an empty group)
    # and must not fall through to the next, unrelated form
    if (formatted_match is None) and match.groups():
      try:
        formatted_match = format % match.groups()
      except TypeError:
        pass  # wrong number of specifiers for the groups; try the next form

    if formatted_match is None:
      try:
        formatted_match = format % match.group()
      except TypeError:
        # no usable format specifier at all: format is literal replacement
        formatted_match = format

    matches_and_replacements.append(match.group())
    matches_and_replacements.append(formatted_match)
    return formatted_match

  replaced = compileRegex(pattern).sub(_replaceWithFormat, original)

  return replaced, matches_and_replacements
def writeAltFileIfExt(path, ext, contents):
  """Writes a file if path and additional extension are supplied.

  If path or ext are not supplied, no file is written.

  Args:
    path: path of file to be written, to which ext will be appended
    ext: additional file extension that will be appended to path; a single
      leading '.' is optional and is not duplicated in the resulting path
    contents: contents of file to be written, as a string

  Raises:
    IOError or OSError if the file cannot be opened or written.
  """
  if (not path) or (not ext):
    return

  if ext.startswith('.'):
    ext = ext[1:]

  alt_path = '%s.%s' % (path, ext)
  alt_file = open(alt_path, 'w')

  try:
    alt_file.write(contents)
  finally:
    # close the handle even when write() raises, so it is never leaked
    alt_file.close()
def replaceAllInFile(file_path, pattern, format,
                     new_ext=None, backup_ext=None,
                     overwrite_files=False,
                     *ignored_args, **ignored_kwargs):
  """Substitutes formatted text for all matches in a file.

  Args:
    file_path: path of file to manipulate
    pattern, format: see replaceAll()
    new_ext: if supplied and any match was found, the replaced text is
      written to file_path with this extension appended
    backup_ext: if supplied and any match was found, the original text is
      written to file_path with this extension appended
    overwrite_files: if True and the replaced text differs from the
      original, file_path itself is overwritten with the replaced text
    *ignored_args: other positional arguments which are ignored
      command-line arguments not used by this action callable
    **ignored_kwargs: other keyword arguments which are ignored
      command-line options not used by this action callable

  Returns:
    two-tuple of boolean indicating if any match was found and a
    list of printable output text lines containing pairs of original
    pattern matches each followed by the formatted replacement

  Raises:
    Error if pattern could not be compiled; IOError or OSError if a file
    cannot be read or written.
  """
  original = getFileContents(file_path)

  replaced, matches_and_replacements = replaceAll(
      original, pattern, format)

  if not matches_and_replacements:
    return False, matches_and_replacements

  writeAltFileIfExt(file_path, new_ext, replaced)

  # NOTE(review): the backup is written whenever a match is found, even when
  # overwrite_files is False; the --backup help text says "if ... file would
  # be overwritten" -- confirm which of the two is intended.
  # The backup is written before the original is overwritten below.
  writeAltFileIfExt(file_path, backup_ext, original)

  if overwrite_files and (replaced != original):
    replaced_file = open(file_path, 'w')

    try:
      replaced_file.write(replaced)
    finally:
      # close the handle even when write() raises, so it is never leaked
      replaced_file.close()

  return True, matches_and_replacements
def listFile(*ignored_args, **ignored_kwargs):
  """Action callable that always reports a match and produces no output.

  Every argument is accepted and ignored.  File selection is done purely on
  file names, which the caller has already checked.

  Returns:
    the two-tuple (True, [])
  """
  no_output = []
  return True, no_output
def _errorFromEnvironmentError(error):
  """Converts an IOError or OSError into this module's Error exception."""
  code = error.errno
  if code is None:
    code = errno.EIO  # exception carried no errno; report a generic I/O error

  detail = error.strerror
  if detail is None:
    detail = str(error)

  return Error(code, '%s: %s' % (error.__class__.__name__, detail))


def applyActionToFiles(action, action_args,
                       start_path='', abs_path=False, files_pattern='',
                       recurse_dirs=False, dirs_pattern='',
                       follow_symlinks=False, quiet_output=False,
                       hide_paths=False, hide_text=False, **action_options):
  """Applies a callable action to files, based on options and arguments.

  Directories are visited one level at a time, starting with start_path;
  the entries of each directory are processed in sorted order.

  Args:
    action: callable that expects a file path argument, positional arguments
      (action_args), and keyword options from the command-line options dict;
      and returns a "matched" boolean and a list of output strings
    action_args: list of positional arguments, if any; passed to action
      callable unchanged
    start_path: required path of initial directory to visit
    abs_path: optional boolean indicating to use absolute paths
    files_pattern: required Python regex (object or pattern) which selects
      which files to pass to the action callable
    recurse_dirs: boolean indicating if subdirectories should be traversed
    dirs_pattern: Python regex (object or pattern) which selects which
      subdirectories to traverse if recurse_dirs is True
    follow_symlinks: boolean indicating if symlinks should be traversed
    quiet_output: optional boolean indicating if output should be suppressed
    hide_paths: optional boolean indicating to omit file paths from output
    hide_text: optional boolean indicating to omit find/replace text from
      output
    **action_options: remaining keyword arguments that are passed unchanged
      to the action callable

  Returns:
    two-tuple containing an exit code (0 if at least one file matched,
    errno.ENOENT otherwise) and a (possibly empty) list of output strings

  Raises:
    Error exception if problems occur (file I/O, invalid regex, etc.).
  """
  exit_code = errno.ENOENT
  output = []

  start_path = os.path.expandvars(os.path.expanduser(start_path))

  if abs_path:
    start_path = os.path.abspath(start_path)

  files_regex = compileRegex(files_pattern)

  # dirs_pattern is only compiled (and so only validated) when it is needed
  dirs_regex = None
  if recurse_dirs:
    dirs_regex = compileRegex(dirs_pattern)

  paths = [start_path]

  while paths:
    sub_paths = []  # subdirectories found at this level, visited next pass

    for path in paths:
      try:
        # os.listdir() always returns a fresh list reflecting the current
        # directory contents
        items = sorted(os.listdir(path))
      except (IOError, OSError):
        # sys.exc_info() is the exception-capture spelling accepted by both
        # Python 2.5 and Python 3
        raise _errorFromEnvironmentError(sys.exc_info()[1])

      for item in items:
        item_path = os.path.join(path, item)

        if os.path.islink(item_path) and (not follow_symlinks):
          continue  # do not follow symlinks (ignore them)

        if os.path.isdir(item_path):
          if recurse_dirs and dirs_regex.match(item):
            sub_paths.append(item_path)
          continue  # a directory is never passed to the action

        if not (os.path.isfile(item_path) and files_regex.match(item)):
          continue

        try:
          matched, found_output = action(item_path, *action_args,
                                         **action_options)
        except (IOError, OSError):
          raise _errorFromEnvironmentError(sys.exc_info()[1])

        if matched:
          exit_code = 0  # at least one matched file has now been found

          if (not quiet_output) and (not hide_paths):
            output.append(item_path)

          if (not quiet_output) and (not hide_text):
            output.extend(found_output)

    paths = sub_paths

  return exit_code, output
class _ErrorOptionParser(optparse.OptionParser):
  """optparse.OptionParser variant whose error() raises instead of exiting.
  """

  def error(self, msg):
    """Reports a parsing problem by raising Error, not by calling sys.exit().

    Args:
      msg: text describing the problem

    Raises:
      Error, always, carrying errno.EINVAL and msg.
    """
    invalid_argument = errno.EINVAL
    raise Error(invalid_argument, msg)
def _buildParser():
  """Returns a custom OptionParser for parsing command-line arguments.

  The module __doc__ string is used as the usage text.  Options are added
  in four groups: file selection, directory traversal, output control, and
  replacement behavior.

  Returns:
    an _ErrorOptionParser populated with all of the script's options
  """
  parser = _ErrorOptionParser(__doc__)

  filter_group = optparse.OptionGroup(
      parser, 'File Options',
      'Options used to select which files to process.')

  filter_group.add_option(
      '-f', '--files', dest='files_pattern',
      # Raw string, so that "\." is a regex escape and not an invalid string
      # escape.  The extensions share one group so that "$" applies to every
      # alternative: a file is skipped only when its name *ends* with one of
      # the listed extensions.
      default=r'^(?!.*\.(?:pyc|ico|gif|png|jpg)$)',
      metavar='FILES_REGEX',
      help=('Python regex pattern (*not* a glob!) defining files to process'
            ' in each directory [default: %default]'))

  filter_group.add_option(
      '-F', '--follow', dest='follow_symlinks', default=False,
      action='store_true',
      help=('follow file and subdirectory symlinks (possibly *DANGEROUS*)'
            ' [default: %default]'))

  parser.add_option_group(filter_group)

  dir_group = optparse.OptionGroup(
      parser, 'Directory Options',
      'Options used to indicate which directories to traverse.')

  dir_group.add_option(
      '-s', '--start', dest='start_path', default=os.curdir, metavar='PATH',
      help='directory in which to start processing files [default: %default]')

  dir_group.add_option(
      '-R', '--recursive', dest='recurse_dirs', default=False,
      action='store_true',
      help='recurse into subdirectories [default: %default]')

  dir_group.add_option(
      '-d', '--dirs', dest='dirs_pattern', default='^[^.].*$',
      metavar='SUBDIRS_REGEX',
      help=('Python regex pattern (*not* a glob!) defining subdirectories to'
            ' recurse into (if --recursive) [default: %default]'))

  parser.add_option_group(dir_group)

  output_group = optparse.OptionGroup(
      parser, 'Output Options',
      'Options used to control program output.')

  output_group.add_option(
      '-a', '--abspath', dest='abs_path', default=False, action='store_true',
      help=('output absolute paths instead of relative paths'
            ' [default: %default]'))

  # --nopaths and --notext have no short form
  output_group.add_option(
      '--nopaths', dest='hide_paths', default=False, action='store_true',
      help=('suppress printing of file path names for successfully matched'
            ' files to stdout [default: %default]'))

  output_group.add_option(
      '--notext', dest='hide_text', default=False, action='store_true',
      help=('suppress find/replace text output to stdout (but still print'
            ' paths if not --nopaths, and still perform replacements if'
            ' specified) [default: %default]'))

  output_group.add_option(
      '-q', '--quiet', dest='quiet_output', default=False, action='store_true',
      help=('suppress *all* printed output to stdout (but still perform'
            ' replacements if specified) [default: %default]'))

  parser.add_option_group(output_group)

  replace_group = optparse.OptionGroup(
      parser, 'Replace Options',
      'Options applied when matches in files are replaced with substitutions.'
      ' (Only possible if REPLACE_FORMAT is supplied.)')

  replace_group.add_option(
      '-o', '--overwrite', dest='overwrite_files', default=False,
      action='store_true',
      help=('overwrite original files with formatted text substituted for'
            ' matches [default: %default]'))

  replace_group.add_option(
      '-b', '--backup', dest='backup_ext', default='', metavar='EXTENSION',
      help=('if supplied, and file would be overwritten, backup original'
            ' file with the supplied extension [default is no backups of'
            ' overwritten files are kept]'))

  replace_group.add_option(
      '-n', '--new', dest='new_ext', default='', metavar='EXTENSION',
      help=('if supplied, and file has matches and is altered by'
            ' substitutions, create a new file with the supplied extension'
            ' [default is no new file is created]'))

  parser.add_option_group(replace_group)

  return parser
def _parseArgs(cmd_line_args):
  """Parses the command line and picks the action the arguments call for.

  Args:
    cmd_line_args: command-line arguments, excluding the argv[0] program name

  Returns:
    four-tuple of action callable, supplied command-line options (including
    those defined by defaults in the command-line parser) as a dict,
    remaining positional command-line arguments, and the parser itself

  Raises:
    Error if problems occurred during command-line argument parsing, or if
    more than two positional arguments were supplied.
  """
  parser = _buildParser()
  options, args = parser.parse_args(args=cmd_line_args)

  arg_count = len(args)

  if arg_count > 2:
    raise Error(errno.EINVAL,'too many (%d) arguments supplied:\n%s' % (
        arg_count, ' '.join(args)))

  if arg_count == 2:
    # FIND_REGEX and REPLACE_FORMAT: select by file and subdirectory name
    # patterns, then find and replace within the file contents
    action = replaceAllInFile
  elif arg_count == 1:
    # FIND_REGEX only: select by file and subdirectory name patterns, then
    # by file contents
    action = findAllInFile
  else:
    # no positional arguments: select by file and subdirectory name
    # patterns alone
    action = listFile

  return action, vars(options), args, parser
def _main(argv):
  """Wrapper that catches exceptions, prints output, and returns exit status.

  Normal program output is printed to stdout.  Error output (including
  exception text) is printed to stderr.

  Args:
    argv: script arguments, usually sys.argv; argv[0] is expected to be the
      program name

  Returns:
    exit code suitable for sys.exit(): the code returned by
    applyActionToFiles(), or the code carried by a caught Error
  """
  options = {}  # empty options, used if _parseArgs() fails
  parser = None

  try:
    action, options, args, parser = _parseArgs(argv[1:])
    exit_code, output = applyActionToFiles(action, args, **options)

    if output:
      # write() instead of the print statement, which is Python 2 only
      sys.stdout.write('\n'.join(output) + '\n')

  except Error:
    # sys.exc_info() is the exception-capture spelling accepted by both
    # Python 2.5 and Python 3
    error = sys.exc_info()[1]
    exit_code = error.args[0]

    if not options.get('quiet_output'):
      sys.stderr.write('\nERROR: (%s: %s) %s\n\n' % (
          exit_code, os.strerror(exit_code), error.args[1]))

      # parser is still None if the failure happened inside _parseArgs()
      if parser:
        sys.stderr.write(parser.get_usage() + '\n')

  return exit_code
# run only when executed as a script, not when imported as a module
if __name__ == '__main__':
  exit_status = _main(sys.argv)
  sys.exit(exit_status)