Lib/csv.py

   1
   2 """
   3 csv.py - read/write/investigate CSV files
   4 """
   5
   6 import re
   7 from _csv import Error, __version__, writer, reader, register_dialect, \
   8                  unregister_dialect, get_dialect, list_dialects, \
   9                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
  10                  __doc__
  11
  12 try:
  13     from cStringIO import StringIO
  14 except ImportError:
  15     from StringIO import StringIO
  16
  17 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
  18             "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
  19             "register_dialect", "get_dialect", "list_dialects", "Sniffer",
  20             "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
  21
  22 class Dialect:
  23     _name = ""
  24     _valid = False
  25     # placeholders
  26     delimiter = None
  27     quotechar = None
  28     escapechar = None
  29     doublequote = None
  30     skipinitialspace = None
  31     lineterminator = None
  32     quoting = None
  33
  34     def __init__(self):
  35         if self.__class__ != Dialect:
  36             self._valid = True
  37         errors = self._validate()
  38         if errors != []:
  39             raise Error, "Dialect did not validate: %s" % ", ".join(errors)
  40
  41     def _validate(self):
  42         errors = []
  43         if not self._valid:
  44             errors.append("can't directly instantiate Dialect class")
  45
  46         if self.delimiter is None:
  47             errors.append("delimiter character not set")
  48         elif (not isinstance(self.delimiter, str) or
  49               len(self.delimiter) > 1):
  50             errors.append("delimiter must be one-character string")
  51
  52         if self.quotechar is None:
  53             if self.quoting != QUOTE_NONE:
  54                 errors.append("quotechar not set")
  55         elif (not isinstance(self.quotechar, str) or
  56               len(self.quotechar) > 1):
  57             errors.append("quotechar must be one-character string")
  58
  59         if self.lineterminator is None:
  60             errors.append("lineterminator not set")
  61         elif not isinstance(self.lineterminator, str):
  62             errors.append("lineterminator must be a string")
  63
  64         if self.doublequote not in (True, False):
  65             errors.append("doublequote parameter must be True or False")
  66
  67         if self.skipinitialspace not in (True, False):
  68             errors.append("skipinitialspace parameter must be True or False")
  69
  70         if self.quoting is None:
  71             errors.append("quoting parameter not set")
  72
  73         if self.quoting is QUOTE_NONE:
  74             if (not isinstance(self.escapechar, (unicode, str)) or
  75                 len(self.escapechar) > 1):
  76                 errors.append("escapechar must be a one-character string or unicode object")
  77
  78         return errors
  79
  80 class excel(Dialect):
  81     delimiter = ','
  82     quotechar = '"'
  83     doublequote = True
  84     skipinitialspace = False
  85     lineterminator = '\r\n'
  86     quoting = QUOTE_MINIMAL
  87 register_dialect("excel", excel)
  88
  89 class excel_tab(excel):
  90     delimiter = '\t'
  91 register_dialect("excel-tab", excel_tab)
  92
  93
  94 class DictReader:
  95     def __init__(self, f, fieldnames, restkey=None, restval=None,
  96                  dialect="excel", *args):
  97         self.fieldnames = fieldnames    # list of keys for the dict
  98         self.restkey = restkey          # key to catch long rows
  99         self.restval = restval          # default value for short rows
 100         self.reader = reader(f, dialect, *args)
 101
 102     def __iter__(self):
 103         return self
 104
 105     def next(self):
 106         row = self.reader.next()
 107         # unlike the basic reader, we prefer not to return blanks,
 108         # because we will typically wind up with a dict full of None
 109         # values
 110         while row == []:
 111             row = self.reader.next()
 112         d = dict(zip(self.fieldnames, row))
 113         lf = len(self.fieldnames)
 114         lr = len(row)
 115         if lf < lr:
 116             d[self.restkey] = row[lf:]
 117         elif lf > lr:
 118             for key in self.fieldnames[lr:]:
 119                 d[key] = self.restval
 120         return d
 121
 122
 123 class DictWriter:
 124     def __init__(self, f, fieldnames, restval="", extrasaction="raise",
 125                  dialect="excel", *args):
 126         self.fieldnames = fieldnames    # list of keys for the dict
 127         self.restval = restval          # for writing short dicts
 128         if extrasaction.lower() not in ("raise", "ignore"):
 129             raise ValueError, \
 130                   ("extrasaction (%s) must be 'raise' or 'ignore'" %
 131                    extrasaction)
 132         self.extrasaction = extrasaction
 133         self.writer = writer(f, dialect, *args)
 134
 135     def _dict_to_list(self, rowdict):
 136         if self.extrasaction == "raise":
 137             for k in rowdict.keys():
 138                 if k not in self.fieldnames:
 139                     raise ValueError, "dict contains fields not in fieldnames"
 140         return [rowdict.get(key, self.restval) for key in self.fieldnames]
 141
 142     def writerow(self, rowdict):
 143         return self.writer.writerow(self._dict_to_list(rowdict))
 144
 145     def writerows(self, rowdicts):
 146         rows = []
 147         for rowdict in rowdicts:
 148             rows.append(self._dict_to_list(rowdict))
 149         return self.writer.writerows(rows)
 150
 151 # Guard Sniffer's type checking against builds that exclude complex()
 152 try:
 153     complex
 154 except NameError:
 155     complex = float
 156
 157 class Sniffer:
 158     '''
 159     "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
 160     Returns a Dialect object.
 161     '''
 162     def __init__(self):
 163         # in case there is more than one possible delimiter
 164         self.preferred = [',', '\t', ';', ' ', ':']
 165
 166
 167     def sniff(self, sample, delimiters=None):
 168         """
 169         Returns a dialect (or None) corresponding to the sample
 170         """
 171
 172         quotechar, delimiter, skipinitialspace = \
 173                    self._guess_quote_and_delimiter(sample, delimiters)
 174         if delimiter is None:
 175             delimiter, skipinitialspace = self._guess_delimiter(sample,
 176                                                                 delimiters)
 177
 178         class dialect(Dialect):
 179             _name = "sniffed"
 180             lineterminator = '\r\n'
 181             quoting = QUOTE_MINIMAL
 182             # escapechar = ''
 183             doublequote = False
 184
 185         dialect.delimiter = delimiter
 186         # _csv.reader won't accept a quotechar of ''
 187         dialect.quotechar = quotechar or '"'
 188         dialect.skipinitialspace = skipinitialspace
 189
 190         return dialect
 191
 192
 193     def _guess_quote_and_delimiter(self, data, delimiters):
 194         """
 195         Looks for text enclosed between two identical quotes
 196         (the probable quotechar) which are preceded and followed
 197         by the same character (the probable delimiter).
 198         For example:
 199                          ,'some text',
 200         The quote with the most wins, same with the delimiter.
 201         If there is no quotechar the delimiter can't be determined
 202         this way.
 203         """
 204
 205         matches = []
 206         for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
 207                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
 208                       '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
 209                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
 210             regexp = re.compile(restr, re.S | re.M)
 211             matches = regexp.findall(data)
 212             if matches:
 213                 break
 214
 215         if not matches:
 216             return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
 217
 218         quotes = {}
 219         delims = {}
 220         spaces = 0
 221         for m in matches:
 222             n = regexp.groupindex['quote'] - 1
 223             key = m[n]
 224             if key:
 225                 quotes[key] = quotes.get(key, 0) + 1
 226             try:
 227                 n = regexp.groupindex['delim'] - 1
 228                 key = m[n]
 229             except KeyError:
 230                 continue
 231             if key and (delimiters is None or key in delimiters):
 232                 delims[key] = delims.get(key, 0) + 1
 233             try:
 234                 n = regexp.groupindex['space'] - 1
 235             except KeyError:
 236                 continue
 237             if m[n]:
 238                 spaces += 1
 239
 240         quotechar = reduce(lambda a, b, quotes = quotes:
 241                            (quotes[a] > quotes[b]) and a or b, quotes.keys())
 242
 243         if delims:
 244             delim = reduce(lambda a, b, delims = delims:
 245                            (delims[a] > delims[b]) and a or b, delims.keys())
 246             skipinitialspace = delims[delim] == spaces
 247             if delim == '\n': # most likely a file with a single column
 248                 delim = ''
 249         else:
 250             # there is *no* delimiter, it's a single column of quoted data
 251             delim = ''
 252             skipinitialspace = 0
 253
 254         return (quotechar, delim, skipinitialspace)
 255
 256
 257     def _guess_delimiter(self, data, delimiters):
 258         """
 259         The delimiter /should/ occur the same number of times on
 260         each row. However, due to malformed data, it may not. We don't want
 261         an all or nothing approach, so we allow for small variations in this
 262         number.
 263           1) build a table of the frequency of each character on every line.
 264           2) build a table of freqencies of this frequency (meta-frequency?),
 265              e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
 266              7 times in 2 rows'
 267           3) use the mode of the meta-frequency to determine the /expected/
 268              frequency for that character
 269           4) find out how often the character actually meets that goal
 270           5) the character that best meets its goal is the delimiter
 271         For performance reasons, the data is evaluated in chunks, so it can
 272         try and evaluate the smallest portion of the data possible, evaluating
 273         additional chunks as necessary.
 274         """
 275
 276         data = filter(None, data.split('\n'))
 277
 278         ascii = [chr(c) for c in range(127)] # 7-bit ASCII
 279
 280         # build frequency tables
 281         chunkLength = min(10, len(data))
 282         iteration = 0
 283         charFrequency = {}
 284         modes = {}
 285         delims = {}
 286         start, end = 0, min(chunkLength, len(data))
 287         while start < len(data):
 288             iteration += 1
 289             for line in data[start:end]:
 290                 for char in ascii:
 291                     metaFrequency = charFrequency.get(char, {})
 292                     # must count even if frequency is 0
 293                     freq = line.strip().count(char)
 294                     # value is the mode
 295                     metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
 296                     charFrequency[char] = metaFrequency
 297
 298             for char in charFrequency.keys():
 299                 items = charFrequency[char].items()
 300                 if len(items) == 1 and items[0][0] == 0:
 301                     continue
 302                 # get the mode of the frequencies
 303                 if len(items) > 1:
 304                     modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
 305                                          items)
 306                     # adjust the mode - subtract the sum of all
 307                     # other frequencies
 308                     items.remove(modes[char])
 309                     modes[char] = (modes[char][0], modes[char][1]
 310                                    - reduce(lambda a, b: (0, a[1] + b[1]),
 311                                             items)[1])
 312                 else:
 313                     modes[char] = items[0]
 314
 315             # build a list of possible delimiters
 316             modeList = modes.items()
 317             total = float(chunkLength * iteration)
 318             # (rows of consistent data) / (number of rows) = 100%
 319             consistency = 1.0
 320             # minimum consistency threshold
 321             threshold = 0.9
 322             while len(delims) == 0 and consistency >= threshold:
 323                 for k, v in modeList:
 324                     if v[0] > 0 and v[1] > 0:
 325                         if ((v[1]/total) >= consistency and
 326                             (delimiters is None or k in delimiters)):
 327                             delims[k] = v
 328                 consistency -= 0.01
 329
 330             if len(delims) == 1:
 331                 delim = delims.keys()[0]
 332                 skipinitialspace = (data[0].count(delim) ==
 333                                     data[0].count("%c " % delim))
 334                 return (delim, skipinitialspace)
 335
 336             # analyze another chunkLength lines
 337             start = end
 338             end += chunkLength
 339
 340         if not delims:
 341             return ('', 0)
 342
 343         # if there's more than one, fall back to a 'preferred' list
 344         if len(delims) > 1:
 345             for d in self.preferred:
 346                 if d in delims.keys():
 347                     skipinitialspace = (data[0].count(d) ==
 348                                         data[0].count("%c " % d))
 349                     return (d, skipinitialspace)
 350
 351         # finally, just return the first damn character in the list
 352         delim = delims.keys()[0]
 353         skipinitialspace = (data[0].count(delim) ==
 354                             data[0].count("%c " % delim))
 355         return (delim, skipinitialspace)
 356
 357
 358     def has_header(self, sample):
 359         # Creates a dictionary of types of data in each column. If any
 360         # column is of a single type (say, integers), *except* for the first
 361         # row, then the first row is presumed to be labels. If the type
 362         # can't be determined, it is assumed to be a string in which case
 363         # the length of the string is the determining factor: if all of the
 364         # rows except for the first are the same length, it's a header.
 365         # Finally, a 'vote' is taken at the end for each column, adding or
 366         # subtracting from the likelihood of the first row being a header.
 367
 368         rdr = reader(StringIO(sample), self.sniff(sample))
 369
 370         header = rdr.next() # assume first row is header
 371
 372         columns = len(header)
 373         columnTypes = {}
 374         for i in range(columns): columnTypes[i] = None
 375
 376         checked = 0
 377         for row in rdr:
 378             # arbitrary number of rows to check, to keep it sane
 379             if checked > 20:
 380                 break
 381             checked += 1
 382
 383             if len(row) != columns:
 384                 continue # skip rows that have irregular number of columns
 385
 386             for col in columnTypes.keys():
 387
 388                 for thisType in [int, long, float, complex]:
 389                     try:
 390                         thisType(row[col])
 391                         break
 392                     except (ValueError, OverflowError):
 393                         pass
 394                 else:
 395                     # fallback to length of string
 396                     thisType = len(row[col])
 397
 398                 # treat longs as ints
 399                 if thisType == long:
 400                     thisType = int
 401
 402                 if thisType != columnTypes[col]:
 403                     if columnTypes[col] is None: # add new column type
 404                         columnTypes[col] = thisType
 405                     else:
 406                         # type is inconsistent, remove column from
 407                         # consideration
 408                         del columnTypes[col]
 409
 410         # finally, compare results against first row and "vote"
 411         # on whether it's a header
 412         hasHeader = 0
 413         for col, colType in columnTypes.items():
 414             if type(colType) == type(0): # it's a length
 415                 if len(header[col]) != colType:
 416                     hasHeader += 1
 417                 else:
 418                     hasHeader -= 1
 419             else: # attempt typecast
 420                 try:
 421                     colType(header[col])
 422                 except (ValueError, TypeError):
 423                     hasHeader += 1
 424                 else:
 425                     hasHeader -= 1
 426
 427         return hasHeader > 0