2 """
3 csv.py - read/write/investigate CSV files
4 """
6 import re
7 from _csv import Error, __version__, writer, reader, register_dialect, \
8 unregister_dialect, get_dialect, list_dialects, \
9 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
10 __doc__
12 try:
13 from cStringIO import StringIO
14 except ImportError:
15 from StringIO import StringIO
17 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
18 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
19 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]

class Dialect:
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True

        errors = self._validate()
        if errors != []:
            raise Error, "Dialect did not validate: %s" % ", ".join(errors)

    def _validate(self):
        errors = []
        if not self._valid:
            errors.append("can't directly instantiate Dialect class")

        if self.delimiter is None:
            errors.append("delimiter character not set")
        elif (not isinstance(self.delimiter, str) or
              len(self.delimiter) > 1):
            errors.append("delimiter must be one-character string")

        if self.quotechar is None:
            if self.quoting != QUOTE_NONE:
                errors.append("quotechar not set")
        elif (not isinstance(self.quotechar, str) or
              len(self.quotechar) > 1):
            errors.append("quotechar must be one-character string")

        if self.lineterminator is None:
            errors.append("lineterminator not set")
        elif not isinstance(self.lineterminator, str):
            errors.append("lineterminator must be a string")

        if self.doublequote not in (True, False):
            errors.append("doublequote parameter must be True or False")

        if self.skipinitialspace not in (True, False):
            errors.append("skipinitialspace parameter must be True or False")

        if self.quoting is None:
            errors.append("quoting parameter not set")

        if self.quoting is QUOTE_NONE:
            if (not isinstance(self.escapechar, (unicode, str)) or
                len(self.escapechar) > 1):
                errors.append("escapechar must be a one-character string or unicode object")

        return errors

class excel(Dialect):
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)

class excel_tab(excel):
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
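
# Illustrative sketch, not part of the original module: a minimal example of
# defining and registering a custom dialect and round-tripping a row through
# the basic writer/reader.  The "example-semicolon" name, the _example_semicolon
# class and the helper function below are hypothetical, for demonstration only.
class _example_semicolon(Dialect):
    delimiter = ';'
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL

def _example_dialect_usage():
    register_dialect("example-semicolon", _example_semicolon)
    buf = StringIO()
    w = writer(buf, "example-semicolon")
    w.writerow(["a", "b;c", "d"])     # the embedded ';' forces quoting
    # buf.getvalue() -> 'a;"b;c";d\r\n'
    return [row for row in reader(StringIO(buf.getvalue()), "example-semicolon")]
    # -> [['a', 'b;c', 'd']]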

class DictReader:
    def __init__(self, f, fieldnames, restkey=None, restval=None,
                 dialect="excel", *args):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args)

    def __iter__(self):
        return self

    def next(self):
        row = self.reader.next()
        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d
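
# Illustrative sketch, not part of the original module: reading rows as
# dictionaries keyed by the supplied field names.  The sample data and the
# helper function are hypothetical, for demonstration only.
def _example_dictreader_usage():
    sample = "Tom,32,Boston\r\nSue,28,Chicago\r\n"
    rdr = DictReader(StringIO(sample), ["name", "age", "city"])
    return [row for row in rdr]
    # -> [{'name': 'Tom', 'age': '32', 'city': 'Boston'},
    #     {'name': 'Sue', 'age': '28', 'city': 'Chicago'}]
    # (shown in field order; actual dict key ordering may vary)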

class DictWriter:
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError, \
                  ("extrasaction (%s) must be 'raise' or 'ignore'" %
                   extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args)

    def _dict_to_list(self, rowdict):
        if self.extrasaction == "raise":
            for k in rowdict.keys():
                if k not in self.fieldnames:
                    raise ValueError, "dict contains fields not in fieldnames"
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
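
# Illustrative sketch, not part of the original module: writing dictionaries
# in a fixed column order, with missing keys filled in from restval.  The
# helper function is hypothetical, for demonstration only.
def _example_dictwriter_usage():
    buf = StringIO()
    w = DictWriter(buf, ["name", "age"], restval="?")
    w.writerow({"name": "Tom", "age": 32})
    w.writerow({"name": "Sue"})          # missing 'age' is written as "?"
    return buf.getvalue()
    # -> 'Tom,32\r\nSue,?\r\n'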

class Sniffer:
    """
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    """
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']

    def sniff(self, sample):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample)
        if delimiter is None:
            delimiter, skipinitialspace = self._guess_delimiter(sample)

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect

    def _guess_quote_and_delimiter(self, data):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """
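
        # Illustrative note (not in the original source): in a sample like
        #     ,'some text','more text'
        # each pattern below captures the quote character and, where present,
        # the delimiter surrounding a quoted field; whichever quote and
        # delimiter are captured most often win the vote further down.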

        matches = []
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.S | re.M)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key:
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)

    def _guess_delimiter(self, data):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.

          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter

        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """
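
        # Illustrative note (not in the original source): for the sample
        #     "a,b,c\nd,e,f\ng,h,i\n"
        # ',' occurs exactly twice on all three lines, so its meta-frequency
        # table is {2: 3} and its mode is (2, 3); 3 of 3 rows meet the
        # expected frequency, so ',' is selected.  A letter such as 'b'
        # appears on only one line, its mode ends up with an expected
        # frequency of 0, and it is filtered out by the "v[0] > 0" test below.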

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.strip().count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if (v[1]/total) >= consistency:
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # finally, just return the first damn character in the list
        delim = delims.keys()[0]
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)

    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.
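
        # Illustrative note (not in the original source): for a sample like
        #     "name,age\nTom,32\nSue,28\n"
        # the second column parses as an integer in every data row but not in
        # the first row, so that column votes "header" and has_header()
        # returns True.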

        def seval(item):
            """
            Strips parens from item prior to calling eval in an
            attempt to make it safer
            """
            return eval(item.replace('(', '').replace(')', ''))

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():
                try:
                    try:
                        # is it a built-in type (besides string)?
                        thisType = type(seval(row[col]))
                    except OverflowError:
                        # a long int?
                        thisType = type(seval(row[col] + 'L'))
                        thisType = type(0) # treat long ints as int
                except:
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    eval("%s(%s)" % (colType.__name__, header[col]))
                except:
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
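
# Illustrative sketch, not part of the original module: sniffing a sample and
# using the resulting dialect to read it.  The sample data and the helper
# function are hypothetical, for demonstration only.
def _example_sniffer_usage():
    sample = '"name";"age"\r\n"Tom";"32"\r\n"Sue";"28"\r\n'
    sniffer = Sniffer()
    dialect = sniffer.sniff(sample)       # dialect.delimiter == ';'
    rows = [row for row in reader(StringIO(sample), dialect)]
    # rows -> [['name', 'age'], ['Tom', '32'], ['Sue', '28']]
    return dialect, sniffer.has_header(sample), rows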