# StatsDb._getstoredfileid could return None. This was a problem in methods which
# [translate_toolkit.git] / misc / csv.py
# blob 0e3071f42a34315df13688b044c4e7920d0cf00f
1 # -*- coding: utf-8 -*-
3 """
4 csv.py - read/write/investigate CSV files
5 """
7 import re
8 from _csv import Error, __version__, writer, reader, register_dialect, \
9 unregister_dialect, get_dialect, list_dialects, \
10 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
13 try:
14 from cStringIO import StringIO
15 except ImportError:
16 from StringIO import StringIO
# Public names re-exported by this module (mirrors the stdlib csv module;
# Dialect, DictReader, DictWriter and Sniffer are implemented locally,
# the rest come straight from _csv).
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
class Dialect:
    """
    Describe the properties of a concrete CSV dialect.

    Subclasses set the class attributes below; instantiating a subclass
    validates them and raises Error listing every problem found.
    Instantiating Dialect itself always fails validation.
    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True

        errors = self._validate()
        if errors != []:
            # parenthesized raise: valid on both Python 2 and 3
            # (the previous `raise Error, msg` form is Python 2 only)
            raise Error("Dialect did not validate: %s" % ", ".join(errors))

    def _validate(self):
        """Return a list of human-readable problems (empty when valid)."""
        errors = []
        if not self._valid:
            errors.append("can't directly instantiate Dialect class")

        if self.delimiter is None:
            errors.append("delimiter character not set")
        elif (not isinstance(self.delimiter, str) or
              len(self.delimiter) > 1):
            errors.append("delimiter must be one-character string")

        if self.quotechar is None:
            if self.quoting != QUOTE_NONE:
                errors.append("quotechar not set")
        elif (not isinstance(self.quotechar, str) or
              len(self.quotechar) > 1):
            errors.append("quotechar must be one-character string")

        if self.lineterminator is None:
            errors.append("lineterminator not set")
        elif not isinstance(self.lineterminator, str):
            errors.append("lineterminator must be a string")

        if self.doublequote not in (True, False):
            errors.append("doublequote parameter must be True or False")

        if self.skipinitialspace not in (True, False):
            errors.append("skipinitialspace parameter must be True or False")

        if self.quoting is None:
            errors.append("quoting parameter not set")

        if self.quoting is QUOTE_NONE:
            # NOTE(review): `unicode` is Python 2 only, consistent with the
            # rest of this module; this branch would NameError on Python 3.
            if (not isinstance(self.escapechar, (unicode, str)) or
                len(self.escapechar) > 1):
                errors.append("escapechar must be a one-character string or unicode object")

        return errors
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
class DictReader:
    """
    Read CSV rows as dictionaries keyed by *fieldnames*.

    Extra fields in a long row are collected in a list under *restkey*;
    fields missing from a short row are filled with *restval*.
    Blank rows are skipped rather than returned.
    """

    def __init__(self, f, fieldnames, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)

    def __iter__(self):
        return self

    def next(self):
        # use the next() builtin so this works whether the underlying
        # reader exposes .next() (Python 2) or .__next__() (Python 3)
        row = next(self.reader)
        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    # Python 3 iterator-protocol alias; .next() is kept for Python 2 callers.
    __next__ = next
class DictWriter:
    """
    Write dictionaries as CSV rows in the order given by *fieldnames*.

    Keys missing from a row dict are written as *restval*; keys not in
    *fieldnames* either raise ValueError or are silently dropped,
    depending on *extrasaction* ("raise" or "ignore").
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # parenthesized raise: valid on both Python 2 and 3
            # (the previous `raise ValueError, msg` form is Python 2 only)
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        """Order rowdict's values by self.fieldnames, applying restval."""
        if self.extrasaction == "raise":
            for k in rowdict.keys():
                if k not in self.fieldnames:
                    raise ValueError("dict contains fields not in fieldnames")
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

# Sniffer.has_header also probes values with long(); guard it the same way
# for runtimes where long no longer exists (e.g. Python 3), falling back to
# int. This is a no-op wherever long is a builtin.
try:
    long
except NameError:
    long = int
class Sniffer:
    """
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    """

    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample

        sample     -- text of sample CSV data
        delimiters -- optional string of candidate delimiter characters
        """
        # quote-based detection is the most reliable; fall back to character
        # frequency analysis when the sample contains no quoted fields
        quotechar, delimiter, skipinitialspace = \
            self._guess_quote_and_delimiter(sample, delimiters)
        if delimiter is None:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        # NOTE: returns the dialect *class* with attributes set on it (not
        # an instance); reader()/writer() accept it either way.
        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """
        matches = []
        # NOTE(review): the doubled '>' in the third pattern below looks like
        # a typo inherited from the old stdlib csv.py -- confirm against
        # upstream before changing, since fixing it alters sniffing results.
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.S | re.M)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # most frequently seen quote character wins
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """
        # drop blank lines; a list comprehension (rather than filter()) keeps
        # the result a real list so len() and slicing below work on both
        # Python 2 and 3
        data = [line for line in data.split('\n') if line]

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.strip().count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # materialize items so indexing and remove() below are legal
                # on dict-view implementations as well as Python 2 lists
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # finally, just return the first damn character in the list
        delim = list(delims.keys())[0]
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        """
        Guess whether the first row of *sample* is a header row.
        """
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # iterate a snapshot of the keys: entries may be deleted below
            for col in list(columnTypes.keys()):

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0