3 csv.py - read/write/investigate CSV files
7 from _csv
import Error
, __version__
, writer
, reader
, register_dialect
, \
8 unregister_dialect
, get_dialect
, list_dialects
, \
9 QUOTE_MINIMAL
, QUOTE_ALL
, QUOTE_NONNUMERIC
, QUOTE_NONE
, \
13 from cStringIO
import StringIO
15 from StringIO
import StringIO
17 __all__
= [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
18 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
19 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
20 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
30 skipinitialspace
= None
35 if self
.__class
__ != Dialect
:
37 errors
= self
._validate
()
39 raise Error
, "Dialect did not validate: %s" % ", ".join(errors
)
44 errors
.append("can't directly instantiate Dialect class")
46 if self
.delimiter
is None:
47 errors
.append("delimiter character not set")
48 elif (not isinstance(self
.delimiter
, str) or
49 len(self
.delimiter
) > 1):
50 errors
.append("delimiter must be one-character string")
52 if self
.quotechar
is None:
53 if self
.quoting
!= QUOTE_NONE
:
54 errors
.append("quotechar not set")
55 elif (not isinstance(self
.quotechar
, str) or
56 len(self
.quotechar
) > 1):
57 errors
.append("quotechar must be one-character string")
59 if self
.lineterminator
is None:
60 errors
.append("lineterminator not set")
61 elif not isinstance(self
.lineterminator
, str):
62 errors
.append("lineterminator must be a string")
64 if self
.doublequote
not in (True, False):
65 errors
.append("doublequote parameter must be True or False")
67 if self
.skipinitialspace
not in (True, False):
68 errors
.append("skipinitialspace parameter must be True or False")
70 if self
.quoting
is None:
71 errors
.append("quoting parameter not set")
73 if self
.quoting
is QUOTE_NONE
:
74 if (not isinstance(self
.escapechar
, (unicode, str)) or
75 len(self
.escapechar
) > 1):
76 errors
.append("escapechar must be a one-character string or unicode object")
84 skipinitialspace
= False
85 lineterminator
= '\r\n'
86 quoting
= QUOTE_MINIMAL
87 register_dialect("excel", excel
)
89 class excel_tab(excel
):
91 register_dialect("excel-tab", excel_tab
)
95 def __init__(self
, f
, fieldnames
, restkey
=None, restval
=None,
96 dialect
="excel", *args
):
97 self
.fieldnames
= fieldnames
# list of keys for the dict
98 self
.restkey
= restkey
# key to catch long rows
99 self
.restval
= restval
# default value for short rows
100 self
.reader
= reader(f
, dialect
, *args
)
106 row
= self
.reader
.next()
107 # unlike the basic reader, we prefer not to return blanks,
108 # because we will typically wind up with a dict full of None
111 row
= self
.reader
.next()
112 d
= dict(zip(self
.fieldnames
, row
))
113 lf
= len(self
.fieldnames
)
116 d
[self
.restkey
] = row
[lf
:]
118 for key
in self
.fieldnames
[lr
:]:
119 d
[key
] = self
.restval
124 def __init__(self
, f
, fieldnames
, restval
="", extrasaction
="raise",
125 dialect
="excel", *args
):
126 self
.fieldnames
= fieldnames
# list of keys for the dict
127 self
.restval
= restval
# for writing short dicts
128 if extrasaction
.lower() not in ("raise", "ignore"):
130 ("extrasaction (%s) must be 'raise' or 'ignore'" %
132 self
.extrasaction
= extrasaction
133 self
.writer
= writer(f
, dialect
, *args
)
135 def _dict_to_list(self
, rowdict
):
136 if self
.extrasaction
== "raise":
137 for k
in rowdict
.keys():
138 if k
not in self
.fieldnames
:
139 raise ValueError, "dict contains fields not in fieldnames"
140 return [rowdict
.get(key
, self
.restval
) for key
in self
.fieldnames
]
142 def writerow(self
, rowdict
):
143 return self
.writer
.writerow(self
._dict
_to
_list
(rowdict
))
145 def writerows(self
, rowdicts
):
147 for rowdict
in rowdicts
:
148 rows
.append(self
._dict
_to
_list
(rowdict
))
149 return self
.writer
.writerows(rows
)
154 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
155 Returns a Dialect object.
158 # in case there is more than one possible delimiter
159 self
.preferred
= [',', '\t', ';', ' ', ':']
162 def sniff(self
, sample
):
164 Returns a dialect (or None) corresponding to the sample
167 quotechar
, delimiter
, skipinitialspace
= \
168 self
._guess
_quote
_and
_delimiter
(sample
)
169 if delimiter
is None:
170 delimiter
, skipinitialspace
= self
._guess
_delimiter
(sample
)
172 class dialect(Dialect
):
174 lineterminator
= '\r\n'
175 quoting
= QUOTE_MINIMAL
179 dialect
.delimiter
= delimiter
180 # _csv.reader won't accept a quotechar of ''
181 dialect
.quotechar
= quotechar
or '"'
182 dialect
.skipinitialspace
= skipinitialspace
187 def _guess_quote_and_delimiter(self
, data
):
189 Looks for text enclosed between two identical quotes
190 (the probable quotechar) which are preceded and followed
191 by the same character (the probable delimiter).
194 The quote with the most wins, same with the delimiter.
195 If there is no quotechar the delimiter can't be determined
200 for restr
in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
201 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
202 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
203 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
204 regexp
= re
.compile(restr
, re
.S | re
.M
)
205 matches
= regexp
.findall(data
)
210 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
216 n
= regexp
.groupindex
['quote'] - 1
219 quotes
[key
] = quotes
.get(key
, 0) + 1
221 n
= regexp
.groupindex
['delim'] - 1
226 delims
[key
] = delims
.get(key
, 0) + 1
228 n
= regexp
.groupindex
['space'] - 1
234 quotechar
= reduce(lambda a
, b
, quotes
= quotes
:
235 (quotes
[a
] > quotes
[b
]) and a
or b
, quotes
.keys())
238 delim
= reduce(lambda a
, b
, delims
= delims
:
239 (delims
[a
] > delims
[b
]) and a
or b
, delims
.keys())
240 skipinitialspace
= delims
[delim
] == spaces
241 if delim
== '\n': # most likely a file with a single column
244 # there is *no* delimiter, it's a single column of quoted data
248 return (quotechar
, delim
, skipinitialspace
)
251 def _guess_delimiter(self
, data
):
253 The delimiter /should/ occur the same number of times on
254 each row. However, due to malformed data, it may not. We don't want
255 an all or nothing approach, so we allow for small variations in this
257 1) build a table of the frequency of each character on every line.
258 2) build a table of freqencies of this frequency (meta-frequency?),
259 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
261 3) use the mode of the meta-frequency to determine the /expected/
262 frequency for that character
263 4) find out how often the character actually meets that goal
264 5) the character that best meets its goal is the delimiter
265 For performance reasons, the data is evaluated in chunks, so it can
266 try and evaluate the smallest portion of the data possible, evaluating
267 additional chunks as necessary.
270 data
= filter(None, data
.split('\n'))
272 ascii
= [chr(c
) for c
in range(127)] # 7-bit ASCII
274 # build frequency tables
275 chunkLength
= min(10, len(data
))
280 start
, end
= 0, min(chunkLength
, len(data
))
281 while start
< len(data
):
283 for line
in data
[start
:end
]:
285 metaFrequency
= charFrequency
.get(char
, {})
286 # must count even if frequency is 0
287 freq
= line
.strip().count(char
)
289 metaFrequency
[freq
] = metaFrequency
.get(freq
, 0) + 1
290 charFrequency
[char
] = metaFrequency
292 for char
in charFrequency
.keys():
293 items
= charFrequency
[char
].items()
294 if len(items
) == 1 and items
[0][0] == 0:
296 # get the mode of the frequencies
298 modes
[char
] = reduce(lambda a
, b
: a
[1] > b
[1] and a
or b
,
300 # adjust the mode - subtract the sum of all
302 items
.remove(modes
[char
])
303 modes
[char
] = (modes
[char
][0], modes
[char
][1]
304 - reduce(lambda a
, b
: (0, a
[1] + b
[1]),
307 modes
[char
] = items
[0]
309 # build a list of possible delimiters
310 modeList
= modes
.items()
311 total
= float(chunkLength
* iteration
)
312 # (rows of consistent data) / (number of rows) = 100%
314 # minimum consistency threshold
316 while len(delims
) == 0 and consistency
>= threshold
:
317 for k
, v
in modeList
:
318 if v
[0] > 0 and v
[1] > 0:
319 if (v
[1]/total
) >= consistency
:
324 delim
= delims
.keys()[0]
325 skipinitialspace
= (data
[0].count(delim
) ==
326 data
[0].count("%c " % delim
))
327 return (delim
, skipinitialspace
)
329 # analyze another chunkLength lines
336 # if there's more than one, fall back to a 'preferred' list
338 for d
in self
.preferred
:
339 if d
in delims
.keys():
340 skipinitialspace
= (data
[0].count(d
) ==
341 data
[0].count("%c " % d
))
342 return (d
, skipinitialspace
)
344 # finally, just return the first damn character in the list
345 delim
= delims
.keys()[0]
346 skipinitialspace
= (data
[0].count(delim
) ==
347 data
[0].count("%c " % delim
))
348 return (delim
, skipinitialspace
)
351 def has_header(self
, sample
):
352 # Creates a dictionary of types of data in each column. If any
353 # column is of a single type (say, integers), *except* for the first
354 # row, then the first row is presumed to be labels. If the type
355 # can't be determined, it is assumed to be a string in which case
356 # the length of the string is the determining factor: if all of the
357 # rows except for the first are the same length, it's a header.
358 # Finally, a 'vote' is taken at the end for each column, adding or
359 # subtracting from the likelihood of the first row being a header.
363 Strips parens from item prior to calling eval in an
364 attempt to make it safer
366 return eval(item
.replace('(', '').replace(')', ''))
368 rdr
= reader(StringIO(sample
), self
.sniff(sample
))
370 header
= rdr
.next() # assume first row is header
372 columns
= len(header
)
374 for i
in range(columns
): columnTypes
[i
] = None
378 # arbitrary number of rows to check, to keep it sane
383 if len(row
) != columns
:
384 continue # skip rows that have irregular number of columns
386 for col
in columnTypes
.keys():
389 # is it a built-in type (besides string)?
390 thisType
= type(seval(row
[col
]))
391 except OverflowError:
393 thisType
= type(seval(row
[col
] + 'L'))
394 thisType
= type(0) # treat long ints as int
396 # fallback to length of string
397 thisType
= len(row
[col
])
399 if thisType
!= columnTypes
[col
]:
400 if columnTypes
[col
] is None: # add new column type
401 columnTypes
[col
] = thisType
403 # type is inconsistent, remove column from
407 # finally, compare results against first row and "vote"
408 # on whether it's a header
410 for col
, colType
in columnTypes
.items():
411 if type(colType
) == type(0): # it's a length
412 if len(header
[col
]) != colType
:
416 else: # attempt typecast
418 eval("%s(%s)" % (colType
.__name
__, header
[col
]))