Lib/string.py

   1 # module 'string' -- A collection of string operations
   2
   3 # Warning: most of the code you see here isn't normally used nowadays.
   4 # At the end of this file most functions are replaced by built-in
   5 # functions imported from built-in module "strop".
   6
   7 """Common string manipulations.
   8
   9 Public module variables:
  10
  11 whitespace -- a string containing all characters considered whitespace
  12 lowercase -- a string containing all characters considered lowercase letters
  13 uppercase -- a string containing all characters considered uppercase letters
  14 letters -- a string containing all characters considered letters
  15 digits -- a string containing all characters considered decimal digits
  16 hexdigits -- a string containing all characters considered hexadecimal digits
  17 octdigits -- a string containing all characters considered octal digits
  18
  19 """
  20
  21 # Some strings for ctype-style character classification
  22 whitespace = ' \t\n\r\v\f'
  23 lowercase = 'abcdefghijklmnopqrstuvwxyz'
  24 uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
  25 letters = lowercase + uppercase
  26 digits = '0123456789'
  27 hexdigits = digits + 'abcdef' + 'ABCDEF'
  28 octdigits = '01234567'
  29
  30 # Case conversion helpers
  31 _idmap = ''
  32 for i in range(256): _idmap = _idmap + chr(i)
  33 _lower = _idmap[:ord('A')] + lowercase + _idmap[ord('Z')+1:]
  34 _upper = _idmap[:ord('a')] + uppercase + _idmap[ord('z')+1:]
  35 _swapcase = _upper[:ord('A')] + lowercase + _upper[ord('Z')+1:]
  36 del i
  37
  38 # Backward compatible names for exceptions
  39 index_error = ValueError
  40 atoi_error = ValueError
  41 atof_error = ValueError
  42 atol_error = ValueError
  43
  44 # convert UPPER CASE letters to lower case
  45 def lower(s):
  46         """lower(s) -> string
  47
  48         Return a copy of the string s converted to lowercase.
  49
  50         """
  51         res = ''
  52         for c in s:
  53                 res = res + _lower[ord(c)]
  54         return res
  55
  56 # Convert lower case letters to UPPER CASE
  57 def upper(s):
  58         """upper(s) -> string
  59
  60         Return a copy of the string s converted to uppercase.
  61
  62         """
  63         res = ''
  64         for c in s:
  65                 res = res + _upper[ord(c)]
  66         return res
  67
  68 # Swap lower case letters and UPPER CASE
  69 def swapcase(s):
  70         """swapcase(s) -> string
  71
  72         Return a copy of the string s with upper case characters
  73         converted to lowercase and vice versa.
  74
  75         """
  76         res = ''
  77         for c in s:
  78                 res = res + _swapcase[ord(c)]
  79         return res
  80
  81 # Strip leading and trailing tabs and spaces
  82 def strip(s):
  83         """strip(s) -> string
  84
  85         Return a copy of the string s with leading and trailing
  86         whitespace removed.
  87
  88         """
  89         i, j = 0, len(s)
  90         while i < j and s[i] in whitespace: i = i+1
  91         while i < j and s[j-1] in whitespace: j = j-1
  92         return s[i:j]
  93
  94 # Strip leading tabs and spaces
  95 def lstrip(s):
  96         """lstrip(s) -> string
  97
  98         Return a copy of the string s with leading whitespace removed.
  99
 100         """
 101         i, j = 0, len(s)
 102         while i < j and s[i] in whitespace: i = i+1
 103         return s[i:j]
 104
 105 # Strip trailing tabs and spaces
 106 def rstrip(s):
 107         """rstrip(s) -> string
 108
 109         Return a copy of the string s with trailing whitespace
 110         removed.
 111
 112         """
 113         i, j = 0, len(s)
 114         while i < j and s[j-1] in whitespace: j = j-1
 115         return s[i:j]
 116
 117
 118 # Split a string into a list of space/tab-separated words
 119 # NB: split(s) is NOT the same as splitfields(s, ' ')!
 120 def split(s, sep=None, maxsplit=0):
 121         """split(str [,sep [,maxsplit]]) -> list of strings
 122
 123         Return a list of the words in the string s, using sep as the
 124         delimiter string.  If maxsplit is nonzero, splits into at most
 125         maxsplit words If sep is not specified, any whitespace string
 126         is a separator.  Maxsplit defaults to 0.
 127
 128         (split and splitfields are synonymous)
 129
 130         """
 131         if sep is not None: return splitfields(s, sep, maxsplit)
 132         res = []
 133         i, n = 0, len(s)
 134         if maxsplit <= 0: maxsplit = n
 135         count = 0
 136         while i < n:
 137                 while i < n and s[i] in whitespace: i = i+1
 138                 if i == n: break
 139                 if count >= maxsplit:
 140                     res.append(s[i:])
 141                     break
 142                 j = i
 143                 while j < n and s[j] not in whitespace: j = j+1
 144                 count = count + 1
 145                 res.append(s[i:j])
 146                 i = j
 147         return res
 148
 149 # Split a list into fields separated by a given string
 150 # NB: splitfields(s, ' ') is NOT the same as split(s)!
 151 # splitfields(s, '') returns [s] (in analogy with split() in nawk)
 152 def splitfields(s, sep=None, maxsplit=0):
 153         """splitfields(str [,sep [,maxsplit]]) -> list of strings
 154
 155         Return a list of the words in the string s, using sep as the
 156         delimiter string.  If maxsplit is nonzero, splits into at most
 157         maxsplit words If sep is not specified, any whitespace string
 158         is a separator.  Maxsplit defaults to 0.
 159
 160         (split and splitfields are synonymous)
 161
 162         """
 163         if sep is None: return split(s, None, maxsplit)
 164         res = []
 165         nsep = len(sep)
 166         if nsep == 0:
 167                 return [s]
 168         ns = len(s)
 169         if maxsplit <= 0: maxsplit = ns
 170         i = j = 0
 171         count = 0
 172         while j+nsep <= ns:
 173                 if s[j:j+nsep] == sep:
 174                         count = count + 1
 175                         res.append(s[i:j])
 176                         i = j = j + nsep
 177                         if count >= maxsplit: break
 178                 else:
 179                         j = j + 1
 180         res.append(s[i:])
 181         return res
 182
 183 # Join words with spaces between them
 184 def join(words, sep = ' '):
 185         """join(list [,sep]) -> string
 186
 187         Return a string composed of the words in list, with
 188         intervening occurences of sep.  Sep defaults to a single
 189         space.
 190
 191         (joinfields and join are synonymous)
 192
 193         """
 194         return joinfields(words, sep)
 195
 196 # Join fields with optional separator
 197 def joinfields(words, sep = ' '):
 198         """joinfields(list [,sep]) -> string
 199
 200         Return a string composed of the words in list, with
 201         intervening occurences of sep.  The default separator is a
 202         single space.
 203
 204         (joinfields and join are synonymous)
 205
 206         """
 207         res = ''
 208         for w in words:
 209                 res = res + (sep + w)
 210         return res[len(sep):]
 211
 212 # Find substring, raise exception if not found
 213 def index(s, sub, i = 0, last=None):
 214         """index(s, sub [,start [,end]]) -> int
 215
 216         Return the lowest index in s where substring sub is found,
 217         such that sub is contained within s[start,end].  Optional
 218         arguments start and end are interpreted as in slice notation.
 219
 220         Raise ValueError if not found.
 221
 222         """
 223         if last is None: last = len(s)
 224         res = find(s, sub, i, last)
 225         if res < 0:
 226                 raise ValueError, 'substring not found in string.index'
 227         return res
 228
 229 # Find last substring, raise exception if not found
 230 def rindex(s, sub, i = 0, last=None):
 231         """rindex(s, sub [,start [,end]]) -> int
 232
 233         Return the highest index in s where substring sub is found,
 234         such that sub is contained within s[start,end].  Optional
 235         arguments start and end are interpreted as in slice notation.
 236
 237         Raise ValueError if not found.
 238
 239         """
 240         if last is None: last = len(s)
 241         res = rfind(s, sub, i, last)
 242         if res < 0:
 243                 raise ValueError, 'substring not found in string.index'
 244         return res
 245
 246 # Count non-overlapping occurrences of substring
 247 def count(s, sub, i = 0, last=None):
 248         """count(s, sub[, start[,end]]) -> int
 249
 250         Return the number of occurrences of substring sub in string
 251         s[start:end].  Optional arguments start and end are
 252         interpreted as in slice notation.
 253
 254         """
 255         Slen = len(s)  # cache this value, for speed
 256         if last is None:
 257                 last = Slen
 258         elif last < 0:
 259                 last = max(0, last + Slen)
 260         elif last > Slen:
 261                 last = Slen
 262         if i < 0: i = max(0, i + Slen)
 263         n = len(sub)
 264         m = last + 1 - n
 265         if n == 0: return m-i
 266         r = 0
 267         while i < m:
 268                 if sub == s[i:i+n]:
 269                         r = r+1
 270                         i = i+n
 271                 else:
 272                         i = i+1
 273         return r
 274
 275 # Find substring, return -1 if not found
 276 def find(s, sub, i = 0, last=None):
 277         """find(s, sub [,start [,end]]) -> in
 278
 279         Return the lowest index in s where substring sub is found,
 280         such that sub is contained within s[start,end].  Optional
 281         arguments start and end are interpreted as in slice notation.
 282
 283         Return -1 on failure.
 284
 285         """
 286         Slen = len(s)  # cache this value, for speed
 287         if last is None:
 288                 last = Slen
 289         elif last < 0:
 290                 last = max(0, last + Slen)
 291         elif last > Slen:
 292                 last = Slen
 293         if i < 0: i = max(0, i + Slen)
 294         n = len(sub)
 295         m = last + 1 - n
 296         while i < m:
 297                 if sub == s[i:i+n]: return i
 298                 i = i+1
 299         return -1
 300
 301 # Find last substring, return -1 if not found
 302 def rfind(s, sub, i = 0, last=None):
 303         """rfind(s, sub [,start [,end]]) -> int
 304
 305         Return the highest index in s where substring sub is found,
 306         such that sub is contained within s[start,end].  Optional
 307         arguments start and end are interpreted as in slice notation.
 308
 309         Return -1 on failure.
 310
 311         """
 312         Slen = len(s)  # cache this value, for speed
 313         if last is None:
 314                 last = Slen
 315         elif last < 0:
 316                 last = max(0, last + Slen)
 317         elif last > Slen:
 318                 last = Slen
 319         if i < 0: i = max(0, i + Slen)
 320         n = len(sub)
 321         m = last + 1 - n
 322         r = -1
 323         while i < m:
 324                 if sub == s[i:i+n]: r = i
 325                 i = i+1
 326         return r
 327
 328 # "Safe" environment for eval()
 329 _safe_env = {"__builtins__": {}}
 330
 331 # Convert string to float
 332 _re = None
 333 def atof(str):
 334         """atof(s) -> float
 335
 336         Return the floating point number represented by the string s.
 337
 338         """
 339         global _re
 340         if _re is None:
 341                 # Don't fail if re doesn't exist -- just skip the syntax check
 342                 try:
 343                         import re
 344                 except ImportError:
 345                         _re = 0
 346                 else:
 347                         _re = re
 348         sign = ''
 349         s = strip(str)
 350         if s and s[0] in '+-':
 351                 sign = s[0]
 352                 s = s[1:]
 353         if not s:
 354                 raise ValueError, 'non-float argument to string.atof'
 355         while s[0] == '0' and len(s) > 1 and s[1] in digits: s = s[1:]
 356         if _re and not _re.match('[0-9]*(\.[0-9]*)?([eE][-+]?[0-9]+)?$', s):
 357                 raise ValueError, 'non-float argument to string.atof'
 358         try:
 359                 return float(eval(sign + s, _safe_env))
 360         except SyntaxError:
 361                 raise ValueError, 'non-float argument to string.atof'
 362
 363 # Convert string to integer
 364 def atoi(str, base=10):
 365         """atoi(s [,base]) -> int
 366
 367         Return the integer represented by the string s in the given
 368         base, which defaults to 10.  The string s must consist of one
 369         or more digits, possibly preceded by a sign.  If base is 0, it
 370         is chosen from the leading characters of s, 0 for octal, 0x or
 371         0X for hexadecimal.  If base is 16, a preceding 0x or 0X is
 372         accepted.
 373
 374         """
 375         if base != 10:
 376                 # We only get here if strop doesn't define atoi()
 377                 raise ValueError, "this string.atoi doesn't support base != 10"
 378         sign = ''
 379         s = strip(str)
 380         if s and s[0] in '+-':
 381                 sign = s[0]
 382                 s = s[1:]
 383         if not s:
 384                 raise ValueError, 'non-integer argument to string.atoi'
 385         while s[0] == '0' and len(s) > 1: s = s[1:]
 386         for c in s:
 387                 if c not in digits:
 388                         raise ValueError, 'non-integer argument to string.atoi'
 389         return eval(sign + s, _safe_env)
 390
 391 # Convert string to long integer
 392 def atol(str, base=10):
 393         """atol(s [,base]) -> long
 394
 395         Return the long integer represented by the string s in the
 396         given base, which defaults to 10.  The string s must consist
 397         of one or more digits, possibly preceded by a sign.  If base
 398         is 0, it is chosen from the leading characters of s, 0 for
 399         octal, 0x or 0X for hexadecimal.  If base is 16, a preceding
 400         0x or 0X is accepted.  A trailing L or l is not accepted,
 401         unless base is 0.
 402
 403         """
 404         if base != 10:
 405                 # We only get here if strop doesn't define atol()
 406                 raise ValueError, "this string.atol doesn't support base != 10"
 407         sign = ''
 408         s = strip(str)
 409         if s and s[0] in '+-':
 410                 sign = s[0]
 411                 s = s[1:]
 412         if not s:
 413                 raise ValueError, 'non-integer argument to string.atol'
 414         while s[0] == '0' and len(s) > 1: s = s[1:]
 415         for c in s:
 416                 if c not in digits:
 417                         raise ValueError, 'non-integer argument to string.atol'
 418         return eval(sign + s + 'L', _safe_env)
 419
 420 # Left-justify a string
 421 def ljust(s, width):
 422         """ljust(s, width) -> string
 423
 424         Return a left-justified version of s, in a field of the
 425         specified width, padded with spaces as needed.  The string is
 426         never truncated.
 427
 428         """
 429         n = width - len(s)
 430         if n <= 0: return s
 431         return s + ' '*n
 432
 433 # Right-justify a string
 434 def rjust(s, width):
 435         """rjust(s, width) -> string
 436
 437         Return a right-justified version of s, in a field of the
 438         specified width, padded with spaces as needed.  The string is
 439         never truncated.
 440
 441         """
 442         n = width - len(s)
 443         if n <= 0: return s
 444         return ' '*n + s
 445
 446 # Center a string
 447 def center(s, width):
 448         """center(s, width) -> string
 449
 450         Return a center version of s, in a field of the specified
 451         width. padded with spaces as needed.  The string is never
 452         truncated.
 453
 454         """
 455         n = width - len(s)
 456         if n <= 0: return s
 457         half = n/2
 458         if n%2 and width%2:
 459                 # This ensures that center(center(s, i), j) = center(s, j)
 460                 half = half+1
 461         return ' '*half +  s + ' '*(n-half)
 462
 463 # Zero-fill a number, e.g., (12, 3) --> '012' and (-3, 3) --> '-03'
 464 # Decadent feature: the argument may be a string or a number
 465 # (Use of this is deprecated; it should be a string as with ljust c.s.)
 466 def zfill(x, width):
 467         """zfill(x, width) -> string
 468
 469         Pad a numeric string x with zeros on the left, to fill a field
 470         of the specified width.  The string x is never truncated.
 471
 472         """
 473         if type(x) == type(''): s = x
 474         else: s = `x`
 475         n = len(s)
 476         if n >= width: return s
 477         sign = ''
 478         if s[0] in ('-', '+'):
 479                 sign, s = s[0], s[1:]
 480         return sign + '0'*(width-n) + s
 481
 482 # Expand tabs in a string.
 483 # Doesn't take non-printing chars into account, but does understand \n.
 484 def expandtabs(s, tabsize=8):
 485         """expandtabs(s [,tabsize]) -> string
 486
 487         Return a copy of the string s with all tab characters replaced
 488         by the appropriate number of spaces, depending on the current
 489         column, and the tabsize (default 8).
 490
 491         """
 492         res = line = ''
 493         for c in s:
 494                 if c == '\t':
 495                         c = ' '*(tabsize - len(line)%tabsize)
 496                 line = line + c
 497                 if c == '\n':
 498                         res = res + line
 499                         line = ''
 500         return res + line
 501
 502 # Character translation through look-up table.
 503 def translate(s, table, deletions=""):
 504         """translate(s,table [,deletechars]) -> string
 505
 506         Return a copy of the string s, where all characters occurring
 507         in the optional argument deletechars are removed, and the
 508         remaining characters have been mapped through the given
 509         translation table, which must be a string of length 256.
 510
 511         """
 512         if type(table) != type('') or len(table) != 256:
 513                 raise TypeError, \
 514                       "translation table must be 256 characters long"
 515         res = ""
 516         for c in s:
 517                 if c not in deletions:
 518                         res = res + table[ord(c)]
 519         return res
 520
 521 # Capitalize a string, e.g. "aBc  dEf" -> "Abc  def".
 522 def capitalize(s):
 523         """capitalize(s) -> string
 524
 525         Return a copy of the string s with only its first character
 526         capitalized.
 527
 528         """
 529         return upper(s[:1]) + lower(s[1:])
 530
 531 # Capitalize the words in a string, e.g. " aBc  dEf " -> "Abc Def".
 532 # See also regsub.capwords().
 533 def capwords(s, sep=None):
 534         """capwords(s, [sep]) -> string
 535
 536         Split the argument into words using split, capitalize each
 537         word using capitalize, and join the capitalized words using
 538         join. Note that this replaces runs of whitespace characters by
 539         a single space.
 540
 541         """
 542         return join(map(capitalize, split(s, sep)), sep or ' ')
 543
 544 # Construct a translation string
 545 _idmapL = None
 546 def maketrans(fromstr, tostr):
 547         """maketrans(frm, to) -> string
 548
 549         Return a translation table (a string of 256 bytes long)
 550         suitable for use in string.translate.  The strings frm and to
 551         must be of the same length.
 552
 553         """
 554         if len(fromstr) != len(tostr):
 555                 raise ValueError, "maketrans arguments must have same length"
 556         global _idmapL
 557         if not _idmapL:
 558                 _idmapL = map(None, _idmap)
 559         L = _idmapL[:]
 560         fromstr = map(ord, fromstr)
 561         for i in range(len(fromstr)):
 562                 L[fromstr[i]] = tostr[i]
 563         return joinfields(L, "")
 564
 565 # Substring replacement (global)
 566 def replace(str, old, new, maxsplit=0):
 567         """replace (str, old, new[, maxsplit]) -> string
 568
 569         Return a copy of string str with all occurrences of substring
 570         old replaced by new. If the optional argument maxsplit is
 571         given, only the first maxsplit occurrences are replaced.
 572
 573         """
 574         return joinfields(splitfields(str, old, maxsplit), new)
 575
 576
 577 # Try importing optional built-in module "strop" -- if it exists,
 578 # it redefines some string operations that are 100-1000 times faster.
 579 # It also defines values for whitespace, lowercase and uppercase
 580 # that match <ctype.h>'s definitions.
 581
try:
        # Star-import so that every name strop provides replaces the
        # pure-Python definition of the same name given earlier in this file.
        from strop import *
        # lowercase and uppercase may just have been rebound by the
        # import, so rebuild letters from their current values.
        letters = lowercase + uppercase
except ImportError:
        pass # Use the original, slow versions