"""
csv.py - read/write/investigate CSV files
"""
import re
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
from _csv import Dialect as _Dialect

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
# Public API of the csv module.
__all__ = ["QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
           "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
           "register_dialect", "get_dialect", "list_dialects", "Sniffer",
           "unregister_dialect", "__version__", "DictReader", "DictWriter"]
25 """Describe an Excel dialect.
27 This must be subclassed (see csv.excel). Valid attributes are:
28 delimiter, quotechar, escapechar, doublequote, skipinitialspace,
29 lineterminator, quoting.
39 skipinitialspace
= None
44 if self
.__class
__ != Dialect
:
52 # We do this for compatibility with py2.3
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
72 def __init__(self
, f
, fieldnames
=None, restkey
=None, restval
=None,
73 dialect
="excel", *args
, **kwds
):
74 self
.fieldnames
= fieldnames
# list of keys for the dict
75 self
.restkey
= restkey
# key to catch long rows
76 self
.restval
= restval
# default value for short rows
77 self
.reader
= reader(f
, dialect
, *args
, **kwds
)
83 row
= self
.reader
.next()
84 if self
.fieldnames
is None:
86 row
= self
.reader
.next()
88 # unlike the basic reader, we prefer not to return blanks,
89 # because we will typically wind up with a dict full of None
92 row
= self
.reader
.next()
93 d
= dict(zip(self
.fieldnames
, row
))
94 lf
= len(self
.fieldnames
)
97 d
[self
.restkey
] = row
[lf
:]
99 for key
in self
.fieldnames
[lr
:]:
100 d
[key
] = self
.restval
105 def __init__(self
, f
, fieldnames
, restval
="", extrasaction
="raise",
106 dialect
="excel", *args
, **kwds
):
107 self
.fieldnames
= fieldnames
# list of keys for the dict
108 self
.restval
= restval
# for writing short dicts
109 if extrasaction
.lower() not in ("raise", "ignore"):
111 ("extrasaction (%s) must be 'raise' or 'ignore'" %
113 self
.extrasaction
= extrasaction
114 self
.writer
= writer(f
, dialect
, *args
, **kwds
)
116 def _dict_to_list(self
, rowdict
):
117 if self
.extrasaction
== "raise":
118 for k
in rowdict
.keys():
119 if k
not in self
.fieldnames
:
120 raise ValueError, "dict contains fields not in fieldnames"
121 return [rowdict
.get(key
, self
.restval
) for key
in self
.fieldnames
]
123 def writerow(self
, rowdict
):
124 return self
.writer
.writerow(self
._dict
_to
_list
(rowdict
))
126 def writerows(self
, rowdicts
):
128 for rowdict
in rowdicts
:
129 rows
.append(self
._dict
_to
_list
(rowdict
))
130 return self
.writer
.writerows(rows
)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float


class Sniffer:
    """
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    """
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']
148 def sniff(self
, sample
, delimiters
=None):
150 Returns a dialect (or None) corresponding to the sample
153 quotechar
, delimiter
, skipinitialspace
= \
154 self
._guess
_quote
_and
_delimiter
(sample
, delimiters
)
156 delimiter
, skipinitialspace
= self
._guess
_delimiter
(sample
,
160 raise Error
, "Could not determine delimiter"
162 class dialect(Dialect
):
164 lineterminator
= '\r\n'
165 quoting
= QUOTE_MINIMAL
169 dialect
.delimiter
= delimiter
170 # _csv.reader won't accept a quotechar of ''
171 dialect
.quotechar
= quotechar
or '"'
172 dialect
.skipinitialspace
= skipinitialspace
177 def _guess_quote_and_delimiter(self
, data
, delimiters
):
179 Looks for text enclosed between two identical quotes
180 (the probable quotechar) which are preceded and followed
181 by the same character (the probable delimiter).
184 The quote with the most wins, same with the delimiter.
185 If there is no quotechar the delimiter can't be determined
190 for restr
in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
191 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
192 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
193 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
194 regexp
= re
.compile(restr
, re
.DOTALL | re
.MULTILINE
)
195 matches
= regexp
.findall(data
)
200 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
206 n
= regexp
.groupindex
['quote'] - 1
209 quotes
[key
] = quotes
.get(key
, 0) + 1
211 n
= regexp
.groupindex
['delim'] - 1
215 if key
and (delimiters
is None or key
in delimiters
):
216 delims
[key
] = delims
.get(key
, 0) + 1
218 n
= regexp
.groupindex
['space'] - 1
224 quotechar
= reduce(lambda a
, b
, quotes
= quotes
:
225 (quotes
[a
] > quotes
[b
]) and a
or b
, quotes
.keys())
228 delim
= reduce(lambda a
, b
, delims
= delims
:
229 (delims
[a
] > delims
[b
]) and a
or b
, delims
.keys())
230 skipinitialspace
= delims
[delim
] == spaces
231 if delim
== '\n': # most likely a file with a single column
234 # there is *no* delimiter, it's a single column of quoted data
238 return (quotechar
, delim
, skipinitialspace
)
241 def _guess_delimiter(self
, data
, delimiters
):
243 The delimiter /should/ occur the same number of times on
244 each row. However, due to malformed data, it may not. We don't want
245 an all or nothing approach, so we allow for small variations in this
247 1) build a table of the frequency of each character on every line.
248 2) build a table of freqencies of this frequency (meta-frequency?),
249 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
251 3) use the mode of the meta-frequency to determine the /expected/
252 frequency for that character
253 4) find out how often the character actually meets that goal
254 5) the character that best meets its goal is the delimiter
255 For performance reasons, the data is evaluated in chunks, so it can
256 try and evaluate the smallest portion of the data possible, evaluating
257 additional chunks as necessary.
260 data
= filter(None, data
.split('\n'))
262 ascii
= [chr(c
) for c
in range(127)] # 7-bit ASCII
264 # build frequency tables
265 chunkLength
= min(10, len(data
))
270 start
, end
= 0, min(chunkLength
, len(data
))
271 while start
< len(data
):
273 for line
in data
[start
:end
]:
275 metaFrequency
= charFrequency
.get(char
, {})
276 # must count even if frequency is 0
277 freq
= line
.count(char
)
279 metaFrequency
[freq
] = metaFrequency
.get(freq
, 0) + 1
280 charFrequency
[char
] = metaFrequency
282 for char
in charFrequency
.keys():
283 items
= charFrequency
[char
].items()
284 if len(items
) == 1 and items
[0][0] == 0:
286 # get the mode of the frequencies
288 modes
[char
] = reduce(lambda a
, b
: a
[1] > b
[1] and a
or b
,
290 # adjust the mode - subtract the sum of all
292 items
.remove(modes
[char
])
293 modes
[char
] = (modes
[char
][0], modes
[char
][1]
294 - reduce(lambda a
, b
: (0, a
[1] + b
[1]),
297 modes
[char
] = items
[0]
299 # build a list of possible delimiters
300 modeList
= modes
.items()
301 total
= float(chunkLength
* iteration
)
302 # (rows of consistent data) / (number of rows) = 100%
304 # minimum consistency threshold
306 while len(delims
) == 0 and consistency
>= threshold
:
307 for k
, v
in modeList
:
308 if v
[0] > 0 and v
[1] > 0:
309 if ((v
[1]/total
) >= consistency
and
310 (delimiters
is None or k
in delimiters
)):
315 delim
= delims
.keys()[0]
316 skipinitialspace
= (data
[0].count(delim
) ==
317 data
[0].count("%c " % delim
))
318 return (delim
, skipinitialspace
)
320 # analyze another chunkLength lines
327 # if there's more than one, fall back to a 'preferred' list
329 for d
in self
.preferred
:
330 if d
in delims
.keys():
331 skipinitialspace
= (data
[0].count(d
) ==
332 data
[0].count("%c " % d
))
333 return (d
, skipinitialspace
)
335 # nothing else indicates a preference, pick the character that
337 items
= [(v
,k
) for (k
,v
) in delims
.items()]
341 skipinitialspace
= (data
[0].count(delim
) ==
342 data
[0].count("%c " % delim
))
343 return (delim
, skipinitialspace
)
346 def has_header(self
, sample
):
347 # Creates a dictionary of types of data in each column. If any
348 # column is of a single type (say, integers), *except* for the first
349 # row, then the first row is presumed to be labels. If the type
350 # can't be determined, it is assumed to be a string in which case
351 # the length of the string is the determining factor: if all of the
352 # rows except for the first are the same length, it's a header.
353 # Finally, a 'vote' is taken at the end for each column, adding or
354 # subtracting from the likelihood of the first row being a header.
356 rdr
= reader(StringIO(sample
), self
.sniff(sample
))
358 header
= rdr
.next() # assume first row is header
360 columns
= len(header
)
362 for i
in range(columns
): columnTypes
[i
] = None
366 # arbitrary number of rows to check, to keep it sane
371 if len(row
) != columns
:
372 continue # skip rows that have irregular number of columns
374 for col
in columnTypes
.keys():
376 for thisType
in [int, long, float, complex]:
380 except (ValueError, OverflowError):
383 # fallback to length of string
384 thisType
= len(row
[col
])
386 # treat longs as ints
390 if thisType
!= columnTypes
[col
]:
391 if columnTypes
[col
] is None: # add new column type
392 columnTypes
[col
] = thisType
394 # type is inconsistent, remove column from
398 # finally, compare results against first row and "vote"
399 # on whether it's a header
401 for col
, colType
in columnTypes
.items():
402 if type(colType
) == type(0): # it's a length
403 if len(header
[col
]) != colType
:
407 else: # attempt typecast
410 except (ValueError, TypeError):