"""
csv.py - read/write/investigate CSV files
"""
import re
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
# Public API of the csv module.
__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
           "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
           "unregister_dialect", "__version__", "DictReader", "DictWriter"]
25 """Describe an Excel dialect.
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
39 skipinitialspace
= None
44 if self
.__class
__ != Dialect
:
52 # We do this for compatibility with py2.3
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
72 def __init__(self
, f
, fieldnames
=None, restkey
=None, restval
=None,
73 dialect
="excel", *args
, **kwds
):
74 self
.fieldnames
= fieldnames
# list of keys for the dict
75 self
.restkey
= restkey
# key to catch long rows
76 self
.restval
= restval
# default value for short rows
77 self
.reader
= reader(f
, dialect
, *args
, **kwds
)
83 row
= self
.reader
.next()
84 if self
.fieldnames
is None:
86 row
= self
.reader
.next()
88 # unlike the basic reader, we prefer not to return blanks,
89 # because we will typically wind up with a dict full of None
92 row
= self
.reader
.next()
93 d
= dict(zip(self
.fieldnames
, row
))
94 lf
= len(self
.fieldnames
)
97 d
[self
.restkey
] = row
[lf
:]
99 for key
in self
.fieldnames
[lr
:]:
100 d
[key
] = self
.restval
105 def __init__(self
, f
, fieldnames
, restval
="", extrasaction
="raise",
106 dialect
="excel", *args
, **kwds
):
107 self
.fieldnames
= fieldnames
# list of keys for the dict
108 self
.restval
= restval
# for writing short dicts
109 if extrasaction
.lower() not in ("raise", "ignore"):
111 ("extrasaction (%s) must be 'raise' or 'ignore'" %
113 self
.extrasaction
= extrasaction
114 self
.writer
= writer(f
, dialect
, *args
, **kwds
)
116 def _dict_to_list(self
, rowdict
):
117 if self
.extrasaction
== "raise":
118 for k
in rowdict
.keys():
119 if k
not in self
.fieldnames
:
120 raise ValueError, "dict contains fields not in fieldnames"
121 return [rowdict
.get(key
, self
.restval
) for key
in self
.fieldnames
]
123 def writerow(self
, rowdict
):
124 return self
.writer
.writerow(self
._dict
_to
_list
(rowdict
))
126 def writerows(self
, rowdicts
):
128 for rowdict
in rowdicts
:
129 rows
.append(self
._dict
_to
_list
(rowdict
))
130 return self
.writer
.writerows(rows
)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float


class Sniffer:
    """
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    """
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']
148 def sniff(self
, sample
, delimiters
=None):
150 Returns a dialect (or None) corresponding to the sample
153 quotechar
, delimiter
, skipinitialspace
= \
154 self
._guess
_quote
_and
_delimiter
(sample
, delimiters
)
156 delimiter
, skipinitialspace
= self
._guess
_delimiter
(sample
,
160 raise Error
, "Could not determine delimiter"
162 class dialect(Dialect
):
164 lineterminator
= '\r\n'
165 quoting
= QUOTE_MINIMAL
169 dialect
.delimiter
= delimiter
170 # _csv.reader won't accept a quotechar of ''
171 dialect
.quotechar
= quotechar
or '"'
172 dialect
.skipinitialspace
= skipinitialspace
177 def _guess_quote_and_delimiter(self
, data
, delimiters
):
179 Looks for text enclosed between two identical quotes
180 (the probable quotechar) which are preceded and followed
181 by the same character (the probable delimiter).
184 The quote with the most wins, same with the delimiter.
185 If there is no quotechar the delimiter can't be determined
190 for restr
in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
191 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
192 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
193 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
194 regexp
= re
.compile(restr
, re
.DOTALL | re
.MULTILINE
)
195 matches
= regexp
.findall(data
)
200 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
206 n
= regexp
.groupindex
['quote'] - 1
209 quotes
[key
] = quotes
.get(key
, 0) + 1
211 n
= regexp
.groupindex
['delim'] - 1
215 if key
and (delimiters
is None or key
in delimiters
):
216 delims
[key
] = delims
.get(key
, 0) + 1
218 n
= regexp
.groupindex
['space'] - 1
224 quotechar
= reduce(lambda a
, b
, quotes
= quotes
:
225 (quotes
[a
] > quotes
[b
]) and a
or b
, quotes
.keys())
228 delim
= reduce(lambda a
, b
, delims
= delims
:
229 (delims
[a
] > delims
[b
]) and a
or b
, delims
.keys())
230 skipinitialspace
= delims
[delim
] == spaces
231 if delim
== '\n': # most likely a file with a single column
234 # there is *no* delimiter, it's a single column of quoted data
238 return (quotechar
, delim
, skipinitialspace
)
241 def _guess_delimiter(self
, data
, delimiters
):
243 The delimiter /should/ occur the same number of times on
244 each row. However, due to malformed data, it may not. We don't want
245 an all or nothing approach, so we allow for small variations in this
247 1) build a table of the frequency of each character on every line.
248 2) build a table of freqencies of this frequency (meta-frequency?),
249 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
251 3) use the mode of the meta-frequency to determine the /expected/
252 frequency for that character
253 4) find out how often the character actually meets that goal
254 5) the character that best meets its goal is the delimiter
255 For performance reasons, the data is evaluated in chunks, so it can
256 try and evaluate the smallest portion of the data possible, evaluating
257 additional chunks as necessary.
260 data
= filter(None, data
.split('\n'))
262 ascii
= [chr(c
) for c
in range(127)] # 7-bit ASCII
264 # build frequency tables
265 chunkLength
= min(10, len(data
))
270 start
, end
= 0, min(chunkLength
, len(data
))
271 while start
< len(data
):
273 for line
in data
[start
:end
]:
275 metaFrequency
= charFrequency
.get(char
, {})
276 # must count even if frequency is 0
277 freq
= line
.count(char
)
279 metaFrequency
[freq
] = metaFrequency
.get(freq
, 0) + 1
280 charFrequency
[char
] = metaFrequency
282 for char
in charFrequency
.keys():
283 items
= charFrequency
[char
].items()
284 if len(items
) == 1 and items
[0][0] == 0:
286 # get the mode of the frequencies
288 modes
[char
] = reduce(lambda a
, b
: a
[1] > b
[1] and a
or b
,
290 # adjust the mode - subtract the sum of all
292 items
.remove(modes
[char
])
293 modes
[char
] = (modes
[char
][0], modes
[char
][1]
294 - reduce(lambda a
, b
: (0, a
[1] + b
[1]),
297 modes
[char
] = items
[0]
299 # build a list of possible delimiters
300 modeList
= modes
.items()
301 total
= float(chunkLength
* iteration
)
302 # (rows of consistent data) / (number of rows) = 100%
304 # minimum consistency threshold
306 while len(delims
) == 0 and consistency
>= threshold
:
307 for k
, v
in modeList
:
308 if v
[0] > 0 and v
[1] > 0:
309 if ((v
[1]/total
) >= consistency
and
310 (delimiters
is None or k
in delimiters
)):
315 delim
= delims
.keys()[0]
316 skipinitialspace
= (data
[0].count(delim
) ==
317 data
[0].count("%c " % delim
))
318 return (delim
, skipinitialspace
)
320 # analyze another chunkLength lines
327 # if there's more than one, fall back to a 'preferred' list
329 for d
in self
.preferred
:
330 if d
in delims
.keys():
331 skipinitialspace
= (data
[0].count(d
) ==
332 data
[0].count("%c " % d
))
333 return (d
, skipinitialspace
)
335 # nothing else indicates a preference, pick the character that
337 items
= [(v
,k
) for (k
,v
) in delims
.items()]
341 skipinitialspace
= (data
[0].count(delim
) ==
342 data
[0].count("%c " % delim
))
343 return (delim
, skipinitialspace
)
346 def has_header(self
, sample
):
347 # Creates a dictionary of types of data in each column. If any
348 # column is of a single type (say, integers), *except* for the first
349 # row, then the first row is presumed to be labels. If the type
350 # can't be determined, it is assumed to be a string in which case
351 # the length of the string is the determining factor: if all of the
352 # rows except for the first are the same length, it's a header.
353 # Finally, a 'vote' is taken at the end for each column, adding or
354 # subtracting from the likelihood of the first row being a header.
356 rdr
= reader(StringIO(sample
), self
.sniff(sample
))
358 header
= rdr
.next() # assume first row is header
360 columns
= len(header
)
362 for i
in range(columns
): columnTypes
[i
] = None
366 # arbitrary number of rows to check, to keep it sane
371 if len(row
) != columns
:
372 continue # skip rows that have irregular number of columns
374 for col
in columnTypes
.keys():
376 for thisType
in [int, long, float, complex]:
380 except (ValueError, OverflowError):
383 # fallback to length of string
384 thisType
= len(row
[col
])
386 # treat longs as ints
390 if thisType
!= columnTypes
[col
]:
391 if columnTypes
[col
] is None: # add new column type
392 columnTypes
[col
] = thisType
394 # type is inconsistent, remove column from
398 # finally, compare results against first row and "vote"
399 # on whether it's a header
401 for col
, colType
in columnTypes
.items():
402 if type(colType
) == type(0): # it's a length
403 if len(header
[col
]) != colType
:
407 else: # attempt typecast
410 except (ValueError, TypeError):