Fix an amazing number of typos & malformed sentences reported by Detlef
[python/dscho.git] / Lib / string.py
blob92158ee9d3776d2947d0205a42691069b37ef47b
# module 'string' -- A collection of string operations

# Warning: most of the code you see here isn't normally used nowadays.
# At the end of this file most functions are replaced by built-in
# functions imported from built-in module "strop".

"""Common string manipulations.

Public module variables:

whitespace -- a string containing all characters considered whitespace
lowercase -- a string containing all characters considered lowercase letters
uppercase -- a string containing all characters considered uppercase letters
letters -- a string containing all characters considered letters
digits -- a string containing all characters considered decimal digits
hexdigits -- a string containing all characters considered hexadecimal digits
octdigits -- a string containing all characters considered octal digits

"""
# Some strings for ctype-style character classification.
# The derived classes are built from the base ones so they cannot drift apart.
whitespace = ' \t\n\r\v\f'
lowercase = 'abcdefghijklmnopqrstuvwxyz'
uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
letters = lowercase + uppercase
octdigits = digits[:8]                                # '0' through '7'
hexdigits = digits + lowercase[:6] + uppercase[:6]    # digits, a-f, A-F
# Case conversion helpers
# _idmap is the identity table: the character at index n is chr(n).
_idmap = ''
for i in range(256):
    _idmap = _idmap + chr(i)
del i   # don't leave the loop counter behind as a module attribute

# Each table below is _idmap with one alphabet range overwritten, so that
# table[ord(c)] yields the converted character.
_lower = _idmap[:ord('A')] + lowercase + _idmap[ord('Z')+1:]
_upper = _idmap[:ord('a')] + uppercase + _idmap[ord('z')+1:]
# Start from _upper (a-z already mapped to A-Z) and map A-Z down to a-z.
_swapcase = _upper[:ord('A')] + lowercase + _upper[ord('Z')+1:]
# Backward compatible names for exceptions: every old name is simply
# another name for ValueError.
index_error = atoi_error = atof_error = atol_error = ValueError
# Convert UPPER CASE letters to lower case
def lower(s):
    """lower(s) -> string

    Return a copy of the string s converted to lowercase.

    """
    converted = ''
    for ch in s:
        # _lower maps each character code to its lowercase character.
        converted = converted + _lower[ord(ch)]
    return converted
# Convert lower case letters to UPPER CASE
def upper(s):
    """upper(s) -> string

    Return a copy of the string s converted to uppercase.

    """
    converted = ''
    for ch in s:
        # _upper maps each character code to its uppercase character.
        converted = converted + _upper[ord(ch)]
    return converted
# Swap lower case letters and UPPER CASE
def swapcase(s):
    """swapcase(s) -> string

    Return a copy of the string s with upper case characters
    converted to lowercase and vice versa.

    """
    swapped = ''
    for ch in s:
        # _swapcase maps each character code to the opposite-case character.
        swapped = swapped + _swapcase[ord(ch)]
    return swapped
# Strip leading and trailing whitespace
def strip(s):
    """strip(s) -> string

    Return a copy of the string s with leading and trailing
    whitespace removed.

    """
    start = 0
    end = len(s)
    # Advance past leading whitespace ...
    while start < end and s[start] in whitespace:
        start = start + 1
    # ... then back up over trailing whitespace, never crossing start.
    while start < end and s[end-1] in whitespace:
        end = end - 1
    return s[start:end]
# Strip leading whitespace
def lstrip(s):
    """lstrip(s) -> string

    Return a copy of the string s with leading whitespace removed.

    """
    length = len(s)
    start = 0
    while start < length and s[start] in whitespace:
        start = start + 1
    return s[start:]
# Strip trailing whitespace
def rstrip(s):
    """rstrip(s) -> string

    Return a copy of the string s with trailing whitespace
    removed.

    """
    end = len(s)
    while end > 0 and s[end-1] in whitespace:
        end = end - 1
    return s[:end]
# Split a string into a list of whitespace-separated words
# NB: split(s) is NOT the same as splitfields(s, ' ')!
def split(s, sep=None, maxsplit=0):
    """split(str [,sep [,maxsplit]]) -> list of strings

    Return a list of the words in the string s, using sep as the
    delimiter string.  If maxsplit is nonzero, at most maxsplit
    splits are done; the rest of the string, with its leading
    whitespace removed, becomes the last word.  If sep is not
    specified, any whitespace string is a separator.  Maxsplit
    defaults to 0.

    (split and splitfields are synonymous)

    """
    if sep is not None:
        return splitfields(s, sep, maxsplit)
    words = []
    length = len(s)
    limit = maxsplit
    if limit <= 0:
        # No limit requested: a string cannot hold more words than characters.
        limit = length
    pos = 0
    while pos < length:
        # Skip the whitespace run in front of the next word.
        while pos < length and s[pos] in whitespace:
            pos = pos + 1
        if pos == length:
            break
        if len(words) >= limit:
            # Split budget used up: the remainder is a single word.
            words.append(s[pos:])
            break
        stop = pos
        while stop < length and s[stop] not in whitespace:
            stop = stop + 1
        words.append(s[pos:stop])
        pos = stop
    return words
# Split a string into fields separated by a given string
# NB: splitfields(s, ' ') is NOT the same as split(s)!
# splitfields(s, '') returns [s] (in analogy with split() in nawk)
def splitfields(s, sep=None, maxsplit=0):
    """splitfields(str [,sep [,maxsplit]]) -> list of strings

    Return a list of the fields in the string s, using sep as the
    delimiter string.  If maxsplit is nonzero, at most maxsplit
    splits are done and the rest of the string becomes the last
    field.  If sep is not specified, any whitespace string is a
    separator.  An empty sep yields [s].  Maxsplit defaults to 0.

    (split and splitfields are synonymous)

    """
    if sep is None:
        return split(s, None, maxsplit)
    seplen = len(sep)
    if seplen == 0:
        return [s]
    total = len(s)
    limit = maxsplit
    if limit <= 0:
        # No limit requested: there can never be more splits than characters.
        limit = total
    fields = []
    start = scan = 0
    while scan + seplen <= total:
        if s[scan:scan+seplen] != sep:
            scan = scan + 1
            continue
        # Separator found: emit the field before it and jump past it.
        fields.append(s[start:scan])
        start = scan = scan + seplen
        if len(fields) >= limit:
            break
    # Whatever follows the last separator (possibly '') is the final field.
    fields.append(s[start:])
    return fields
# Join words with spaces between them
def join(words, sep = ' '):
    """join(list [,sep]) -> string

    Return a string composed of the words in list, with
    intervening occurrences of sep.  Sep defaults to a single
    space.

    (joinfields and join are synonymous)

    """
    # Pure alias: all the work is done by joinfields().
    return joinfields(words, sep)
# Join fields with optional separator
def joinfields(words, sep = ' '):
    """joinfields(list [,sep]) -> string

    Return a string composed of the words in list, with
    intervening occurrences of sep.  The default separator is a
    single space.

    (joinfields and join are synonymous)

    """
    skip = len(sep)
    joined = ''
    for word in words:
        # Put sep in front of every word, including the first ...
        joined = joined + (sep + word)
    # ... and drop that one surplus leading separator at the end.
    return joined[skip:]
# Find substring, raise exception if not found
def index(s, sub, i = 0, last=None):
    """index(s, sub [,start [,end]]) -> int

    Return the lowest index in s where substring sub is found,
    such that sub is contained within s[start:end].  Optional
    arguments start and end are interpreted as in slice notation.

    Raise ValueError if not found.

    """
    if last is None:
        last = len(s)
    position = find(s, sub, i, last)
    if position >= 0:
        return position
    raise ValueError('substring not found in string.index')
# Find last substring, raise exception if not found
def rindex(s, sub, i = 0, last=None):
    """rindex(s, sub [,start [,end]]) -> int

    Return the highest index in s where substring sub is found,
    such that sub is contained within s[start:end].  Optional
    arguments start and end are interpreted as in slice notation.

    Raise ValueError if not found.

    """
    if last is None: last = len(s)
    res = rfind(s, sub, i, last)
    if res < 0:
        # The message names this function, not index().
        raise ValueError('substring not found in string.rindex')
    return res
# Count non-overlapping occurrences of substring
def count(s, sub, i = 0, last=None):
    """count(s, sub[, start[,end]]) -> int

    Return the number of non-overlapping occurrences of substring
    sub in string s[start:end].  Optional arguments start and end
    are interpreted as in slice notation.  An empty sub matches at
    every position from start to end inclusive; the result is never
    negative.

    """
    Slen = len(s)  # cache this value, for speed
    if last is None:
        last = Slen
    elif last < 0:
        last = max(0, last + Slen)
    elif last > Slen:
        last = Slen
    if i < 0: i = max(0, i + Slen)
    n = len(sub)
    # m is one past the last index at which sub still fits before last.
    m = last + 1 - n
    if n == 0:
        # Clamp: when start lies beyond end+1 the difference is negative,
        # but a count must not be.
        return max(0, m - i)
    r = 0
    while i < m:
        if sub == s[i:i+n]:
            r = r+1
            i = i+n     # jump past the match so occurrences don't overlap
        else:
            i = i+1
    return r
# Find substring, return -1 if not found
def find(s, sub, i = 0, last=None):
    """find(s, sub [,start [,end]]) -> int

    Return the lowest index in s where substring sub is found,
    such that sub is contained within s[start:end].  Optional
    arguments start and end are interpreted as in slice notation.

    Return -1 on failure.

    """
    size = len(s)
    # Normalise end and start the way slice notation would.
    if last is None or last > size:
        last = size
    elif last < 0:
        last = max(0, last + size)
    if i < 0:
        i = max(0, i + size)
    width = len(sub)
    # One past the last position at which sub still fits before last.
    limit = last + 1 - width
    pos = i
    while pos < limit:
        if s[pos:pos+width] == sub:
            return pos
        pos = pos + 1
    return -1
# Find last substring, return -1 if not found
def rfind(s, sub, i = 0, last=None):
    """rfind(s, sub [,start [,end]]) -> int

    Return the highest index in s where substring sub is found,
    such that sub is contained within s[start:end].  Optional
    arguments start and end are interpreted as in slice notation.

    Return -1 on failure.

    """
    size = len(s)
    # Normalise end and start the way slice notation would.
    if last is None or last > size:
        last = size
    elif last < 0:
        last = max(0, last + size)
    if i < 0:
        i = max(0, i + size)
    width = len(sub)
    # Walk down from the last position where sub still fits; the first
    # match met this way is the highest one.
    pos = last - width
    while pos >= i:
        if s[pos:pos+width] == sub:
            return pos
        pos = pos - 1
    return -1
# "Safe" environment for eval()
_safe_env = {"__builtins__": {}}

# Convert string to float
_re = None      # None: not tried yet; 0: re unavailable; otherwise the re module
def atof(str):
    """atof(s) -> float

    Return the floating point number represented by the string s.
    Surrounding whitespace and one leading sign are accepted.

    Raise ValueError if s is not a floating point literal.

    """
    global _re
    if _re is None:
        # Don't fail if re doesn't exist -- just skip the syntax check
        try:
            import re
        except ImportError:
            _re = 0
        else:
            _re = re
    sign = ''
    s = strip(str)
    if s and s[0] in '+-':
        sign = s[0]
        s = s[1:]
    if not s:
        raise ValueError('non-float argument to string.atof')
    # Drop redundant leading zeros so eval() sees a plain decimal literal.
    while s[0] == '0' and len(s) > 1 and s[1] in digits: s = s[1:]
    # Require at least one digit in the mantissa; otherwise a string such
    # as 'e5' would pass the check and be evaluated as a name.
    if _re and not _re.match(
            r'([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][-+]?[0-9]+)?$', s):
        raise ValueError('non-float argument to string.atof')
    # NOTE(review): eval() is only as safe as the syntax check above; when
    # the re module is unavailable the text is evaluated unchecked (with
    # empty builtins).  Do not rely on this for untrusted input.
    try:
        return float(eval(sign + s, _safe_env))
    except (SyntaxError, NameError):
        # NameError arises when unchecked text parses as an identifier.
        raise ValueError('non-float argument to string.atof')
# Convert string to integer
def atoi(str, base=10):
    """atoi(s [,base]) -> int

    Return the integer represented by the string s in the given
    base, which defaults to 10.  The string s must consist of one
    or more digits, possibly preceded by a sign.  If base is 0, it
    is chosen from the leading characters of s, 0 for octal, 0x or
    0X for hexadecimal.  If base is 16, a preceding 0x or 0X is
    accepted.

    This Python implementation handles base 10 only and raises
    ValueError for any other base.

    """
    if base != 10:
        # We only get here if strop doesn't define atoi()
        raise ValueError("this string.atoi doesn't support base != 10")
    text = strip(str)
    sign = ''
    if text[:1] in ('+', '-'):
        sign = text[0]
        text = text[1:]
    if not text:
        raise ValueError('non-integer argument to string.atoi')
    # Drop leading zeros (keeping a lone '0') before handing over to eval().
    while len(text) > 1 and text[0] == '0':
        text = text[1:]
    for ch in text:
        if ch not in digits:
            raise ValueError('non-integer argument to string.atoi')
    # Only decimal digits and an optional sign reach eval().
    return eval(sign + text, _safe_env)
# Convert string to long integer
def atol(str, base=10):
    """atol(s [,base]) -> long

    Return the long integer represented by the string s in the
    given base, which defaults to 10.  The string s must consist
    of one or more digits, possibly preceded by a sign.  If base
    is 0, it is chosen from the leading characters of s, 0 for
    octal, 0x or 0X for hexadecimal.  If base is 16, a preceding
    0x or 0X is accepted.  A trailing L or l is not accepted,
    unless base is 0.

    This Python implementation handles base 10 only and raises
    ValueError for any other base.

    """
    if base != 10:
        # We only get here if strop doesn't define atol()
        raise ValueError("this string.atol doesn't support base != 10")
    text = strip(str)
    sign = ''
    if text[:1] in ('+', '-'):
        sign = text[0]
        text = text[1:]
    if not text:
        raise ValueError('non-integer argument to string.atol')
    # Drop leading zeros (keeping a lone '0') before handing over to eval().
    while len(text) > 1 and text[0] == '0':
        text = text[1:]
    for ch in text:
        if ch not in digits:
            raise ValueError('non-integer argument to string.atol')
    # The appended 'L' makes eval() read the digits as a long literal.
    return eval(sign + text + 'L', _safe_env)
# Left-justify a string
def ljust(s, width):
    """ljust(s, width) -> string

    Return a left-justified version of s, in a field of the
    specified width, padded with spaces as needed.  The string is
    never truncated.

    """
    pad = width - len(s)
    if pad > 0:
        return s + ' '*pad
    # Already at least as wide as the field: hand back s itself.
    return s
# Right-justify a string
def rjust(s, width):
    """rjust(s, width) -> string

    Return a right-justified version of s, in a field of the
    specified width, padded with spaces as needed.  The string is
    never truncated.

    """
    pad = width - len(s)
    if pad > 0:
        return ' '*pad + s
    # Already at least as wide as the field: hand back s itself.
    return s
# Center a string
def center(s, width):
    """center(s, width) -> string

    Return a centered version of s, in a field of the specified
    width, padded with spaces as needed.  The string is never
    truncated.

    """
    pad = width - len(s)
    if pad <= 0:
        return s
    left, odd = divmod(pad, 2)
    if odd and width % 2:
        # This ensures that center(center(s, i), j) = center(s, j)
        left = left + 1
    right = pad - left
    return ' '*left + s + ' '*right
# Zero-fill a number, e.g., (12, 3) --> '012' and (-3, 3) --> '-03'
# Decadent feature: the argument may be a string or a number
# (Use of this is deprecated; it should be a string as with ljust c.s.)
def zfill(x, width):
    """zfill(x, width) -> string

    Pad a numeric string x with zeros on the left, to fill a field
    of the specified width.  A leading sign stays in front of the
    zeros.  The string x is never truncated.  An empty string is
    padded to width zeros.

    """
    if type(x) == type(''): s = x
    else: s = repr(x)       # same result as the old backquote syntax
    n = len(s)
    if n >= width: return s
    sign = ''
    # s[:1] instead of s[0]: an empty string must not raise IndexError.
    if s[:1] in ('-', '+'):
        sign, s = s[0], s[1:]
    return sign + '0'*(width-n) + s
# Expand tabs in a string.
# Doesn't take non-printing chars into account, but does understand \n.
def expandtabs(s, tabsize=8):
    """expandtabs(s [,tabsize]) -> string

    Return a copy of the string s with all tab characters replaced
    by the appropriate number of spaces, depending on the current
    column, and the tabsize (default 8).

    """
    expanded = ''
    column = 0      # characters emitted since the last newline
    for ch in s:
        if ch == '\t':
            # Pad up to the next multiple of tabsize.
            fill = ' '*(tabsize - column%tabsize)
            expanded = expanded + fill
            column = column + len(fill)
        elif ch == '\n':
            expanded = expanded + ch
            column = 0
        else:
            expanded = expanded + ch
            column = column + 1
    return expanded
# Character translation through look-up table.
def translate(s, table, deletions=""):
    """translate(s,table [,deletechars]) -> string

    Return a copy of the string s, where all characters occurring
    in the optional argument deletechars are removed, and the
    remaining characters have been mapped through the given
    translation table, which must be a string of length 256.

    Raise TypeError if table is not a string of length 256.

    """
    if type(table) != type('') or len(table) != 256:
        raise TypeError("translation table must be 256 characters long")
    translated = ""
    for ch in s:
        if ch in deletions:
            continue    # deleted characters are dropped, not translated
        translated = translated + table[ord(ch)]
    return translated
# Capitalize a string, e.g. "aBc dEf" -> "Abc def".
def capitalize(s):
    """capitalize(s) -> string

    Return a copy of the string s with its first character
    converted to uppercase and all other characters converted
    to lowercase.

    """
    # Slices rather than indexing, so an empty string is handled too.
    head = s[:1]
    tail = s[1:]
    return upper(head) + lower(tail)
# Capitalize the words in a string, e.g. " aBc  dEf " -> "Abc Def".
# See also regsub.capwords().
def capwords(s, sep=None):
    """capwords(s, [sep]) -> string

    Split the argument into words using split, capitalize each
    word using capitalize, and join the capitalized words using
    join.  When sep is not given, this replaces runs of whitespace
    characters by a single space.

    """
    words = split(s, sep)
    capitalized = map(capitalize, words)
    # Without an explicit separator the words are rejoined with one space.
    glue = sep or ' '
    return join(capitalized, glue)
# Construct a translation string
_idmapL = None  # list form of _idmap, built on first use and then reused
def maketrans(fromstr, tostr):
    """maketrans(frm, to) -> string

    Return a translation table (a string 256 characters long)
    suitable for use in string.translate.  The strings frm and to
    must be of the same length.

    Raise ValueError if the lengths differ.

    """
    global _idmapL
    if len(fromstr) != len(tostr):
        raise ValueError("maketrans arguments must have same length")
    if not _idmapL:
        _idmapL = list(_idmap)
    # Work on a copy so the cached identity list stays pristine.
    table = _idmapL[:]
    for k in range(len(tostr)):
        table[ord(fromstr[k])] = tostr[k]
    return joinfields(table, "")
# Substring replacement (global)
def replace(str, old, new, maxsplit=0):
    """replace (str, old, new[, maxsplit]) -> string

    Return a copy of string str with all occurrences of substring
    old replaced by new.  If the optional argument maxsplit is
    given, only the first maxsplit occurrences are replaced.

    """
    # Cut the string at each occurrence of old, then glue the pieces
    # back together with new in between.
    pieces = splitfields(str, old, maxsplit)
    return joinfields(pieces, new)
# Try importing optional built-in module "strop" -- if it exists,
# it redefines some string operations that are 100-1000 times faster.
# It also defines values for whitespace, lowercase and uppercase
# that match <ctype.h>'s definitions.

try:
    from strop import *
except ImportError:
    pass # Use the original, slow versions
else:
    # Rebuild letters from the values strop just supplied.
    letters = lowercase + uppercase