1 # -*- coding: utf-8 -*-
4 csv.py - read/write/investigate CSV files
8 from _csv
import Error
, __version__
, writer
, reader
, register_dialect
, \
9 unregister_dialect
, get_dialect
, list_dialects
, \
10 QUOTE_MINIMAL
, QUOTE_ALL
, QUOTE_NONNUMERIC
, QUOTE_NONE
, \
14 from cStringIO
import StringIO
16 from StringIO
import StringIO
18 __all__
= [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
19 "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
20 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
21 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
31 skipinitialspace
= None
36 if self
.__class
__ != Dialect
:
38 errors
= self
._validate
()
40 raise Error
, "Dialect did not validate: %s" % ", ".join(errors
)
45 errors
.append("can't directly instantiate Dialect class")
47 if self
.delimiter
is None:
48 errors
.append("delimiter character not set")
49 elif (not isinstance(self
.delimiter
, str) or
50 len(self
.delimiter
) > 1):
51 errors
.append("delimiter must be one-character string")
53 if self
.quotechar
is None:
54 if self
.quoting
!= QUOTE_NONE
:
55 errors
.append("quotechar not set")
56 elif (not isinstance(self
.quotechar
, str) or
57 len(self
.quotechar
) > 1):
58 errors
.append("quotechar must be one-character string")
60 if self
.lineterminator
is None:
61 errors
.append("lineterminator not set")
62 elif not isinstance(self
.lineterminator
, str):
63 errors
.append("lineterminator must be a string")
65 if self
.doublequote
not in (True, False):
66 errors
.append("doublequote parameter must be True or False")
68 if self
.skipinitialspace
not in (True, False):
69 errors
.append("skipinitialspace parameter must be True or False")
71 if self
.quoting
is None:
72 errors
.append("quoting parameter not set")
74 if self
.quoting
is QUOTE_NONE
:
75 if (not isinstance(self
.escapechar
, (unicode, str)) or
76 len(self
.escapechar
) > 1):
77 errors
.append("escapechar must be a one-character string or unicode object")
85 skipinitialspace
= False
86 lineterminator
= '\r\n'
87 quoting
= QUOTE_MINIMAL
88 register_dialect("excel", excel
)
90 class excel_tab(excel
):
92 register_dialect("excel-tab", excel_tab
)
96 def __init__(self
, f
, fieldnames
, restkey
=None, restval
=None,
97 dialect
="excel", *args
, **kwds
):
98 self
.fieldnames
= fieldnames
# list of keys for the dict
99 self
.restkey
= restkey
# key to catch long rows
100 self
.restval
= restval
# default value for short rows
101 self
.reader
= reader(f
, dialect
, *args
, **kwds
)
107 row
= self
.reader
.next()
108 # unlike the basic reader, we prefer not to return blanks,
109 # because we will typically wind up with a dict full of None
112 row
= self
.reader
.next()
113 d
= dict(zip(self
.fieldnames
, row
))
114 lf
= len(self
.fieldnames
)
117 d
[self
.restkey
] = row
[lf
:]
119 for key
in self
.fieldnames
[lr
:]:
120 d
[key
] = self
.restval
125 def __init__(self
, f
, fieldnames
, restval
="", extrasaction
="raise",
126 dialect
="excel", *args
, **kwds
):
127 self
.fieldnames
= fieldnames
# list of keys for the dict
128 self
.restval
= restval
# for writing short dicts
129 if extrasaction
.lower() not in ("raise", "ignore"):
131 ("extrasaction (%s) must be 'raise' or 'ignore'" %
133 self
.extrasaction
= extrasaction
134 self
.writer
= writer(f
, dialect
, *args
, **kwds
)
136 def _dict_to_list(self
, rowdict
):
137 if self
.extrasaction
== "raise":
138 for k
in rowdict
.keys():
139 if k
not in self
.fieldnames
:
140 raise ValueError, "dict contains fields not in fieldnames"
141 return [rowdict
.get(key
, self
.restval
) for key
in self
.fieldnames
]
143 def writerow(self
, rowdict
):
144 return self
.writer
.writerow(self
._dict
_to
_list
(rowdict
))
146 def writerows(self
, rowdicts
):
148 for rowdict
in rowdicts
:
149 rows
.append(self
._dict
_to
_list
(rowdict
))
150 return self
.writer
.writerows(rows
)
152 # Guard Sniffer's type checking against builds that exclude complex()
160 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
161 Returns a Dialect object.
164 # in case there is more than one possible delimiter
165 self
.preferred
= [',', '\t', ';', ' ', ':']
168 def sniff(self
, sample
, delimiters
=None):
170 Returns a dialect (or None) corresponding to the sample
173 quotechar
, delimiter
, skipinitialspace
= \
174 self
._guess
_quote
_and
_delimiter
(sample
, delimiters
)
175 if delimiter
is None:
176 delimiter
, skipinitialspace
= self
._guess
_delimiter
(sample
,
179 class dialect(Dialect
):
181 lineterminator
= '\r\n'
182 quoting
= QUOTE_MINIMAL
186 dialect
.delimiter
= delimiter
187 # _csv.reader won't accept a quotechar of ''
188 dialect
.quotechar
= quotechar
or '"'
189 dialect
.skipinitialspace
= skipinitialspace
194 def _guess_quote_and_delimiter(self
, data
, delimiters
):
196 Looks for text enclosed between two identical quotes
197 (the probable quotechar) which are preceded and followed
198 by the same character (the probable delimiter).
201 The quote with the most wins, same with the delimiter.
202 If there is no quotechar the delimiter can't be determined
207 for restr
in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
208 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
209 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
210 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
211 regexp
= re
.compile(restr
, re
.S | re
.M
)
212 matches
= regexp
.findall(data
)
217 return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
223 n
= regexp
.groupindex
['quote'] - 1
226 quotes
[key
] = quotes
.get(key
, 0) + 1
228 n
= regexp
.groupindex
['delim'] - 1
232 if key
and (delimiters
is None or key
in delimiters
):
233 delims
[key
] = delims
.get(key
, 0) + 1
235 n
= regexp
.groupindex
['space'] - 1
241 quotechar
= reduce(lambda a
, b
, quotes
= quotes
:
242 (quotes
[a
] > quotes
[b
]) and a
or b
, quotes
.keys())
245 delim
= reduce(lambda a
, b
, delims
= delims
:
246 (delims
[a
] > delims
[b
]) and a
or b
, delims
.keys())
247 skipinitialspace
= delims
[delim
] == spaces
248 if delim
== '\n': # most likely a file with a single column
251 # there is *no* delimiter, it's a single column of quoted data
255 return (quotechar
, delim
, skipinitialspace
)
258 def _guess_delimiter(self
, data
, delimiters
):
260 The delimiter /should/ occur the same number of times on
261 each row. However, due to malformed data, it may not. We don't want
262 an all or nothing approach, so we allow for small variations in this
264 1) build a table of the frequency of each character on every line.
265 2) build a table of freqencies of this frequency (meta-frequency?),
266 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
268 3) use the mode of the meta-frequency to determine the /expected/
269 frequency for that character
270 4) find out how often the character actually meets that goal
271 5) the character that best meets its goal is the delimiter
272 For performance reasons, the data is evaluated in chunks, so it can
273 try and evaluate the smallest portion of the data possible, evaluating
274 additional chunks as necessary.
277 data
= filter(None, data
.split('\n'))
279 ascii
= [chr(c
) for c
in range(127)] # 7-bit ASCII
281 # build frequency tables
282 chunkLength
= min(10, len(data
))
287 start
, end
= 0, min(chunkLength
, len(data
))
288 while start
< len(data
):
290 for line
in data
[start
:end
]:
292 metaFrequency
= charFrequency
.get(char
, {})
293 # must count even if frequency is 0
294 freq
= line
.strip().count(char
)
296 metaFrequency
[freq
] = metaFrequency
.get(freq
, 0) + 1
297 charFrequency
[char
] = metaFrequency
299 for char
in charFrequency
.keys():
300 items
= charFrequency
[char
].items()
301 if len(items
) == 1 and items
[0][0] == 0:
303 # get the mode of the frequencies
305 modes
[char
] = reduce(lambda a
, b
: a
[1] > b
[1] and a
or b
,
307 # adjust the mode - subtract the sum of all
309 items
.remove(modes
[char
])
310 modes
[char
] = (modes
[char
][0], modes
[char
][1]
311 - reduce(lambda a
, b
: (0, a
[1] + b
[1]),
314 modes
[char
] = items
[0]
316 # build a list of possible delimiters
317 modeList
= modes
.items()
318 total
= float(chunkLength
* iteration
)
319 # (rows of consistent data) / (number of rows) = 100%
321 # minimum consistency threshold
323 while len(delims
) == 0 and consistency
>= threshold
:
324 for k
, v
in modeList
:
325 if v
[0] > 0 and v
[1] > 0:
326 if ((v
[1]/total
) >= consistency
and
327 (delimiters
is None or k
in delimiters
)):
332 delim
= delims
.keys()[0]
333 skipinitialspace
= (data
[0].count(delim
) ==
334 data
[0].count("%c " % delim
))
335 return (delim
, skipinitialspace
)
337 # analyze another chunkLength lines
344 # if there's more than one, fall back to a 'preferred' list
346 for d
in self
.preferred
:
347 if d
in delims
.keys():
348 skipinitialspace
= (data
[0].count(d
) ==
349 data
[0].count("%c " % d
))
350 return (d
, skipinitialspace
)
352 # finally, just return the first damn character in the list
353 delim
= delims
.keys()[0]
354 skipinitialspace
= (data
[0].count(delim
) ==
355 data
[0].count("%c " % delim
))
356 return (delim
, skipinitialspace
)
359 def has_header(self
, sample
):
360 # Creates a dictionary of types of data in each column. If any
361 # column is of a single type (say, integers), *except* for the first
362 # row, then the first row is presumed to be labels. If the type
363 # can't be determined, it is assumed to be a string in which case
364 # the length of the string is the determining factor: if all of the
365 # rows except for the first are the same length, it's a header.
366 # Finally, a 'vote' is taken at the end for each column, adding or
367 # subtracting from the likelihood of the first row being a header.
369 rdr
= reader(StringIO(sample
), self
.sniff(sample
))
371 header
= rdr
.next() # assume first row is header
373 columns
= len(header
)
375 for i
in range(columns
): columnTypes
[i
] = None
379 # arbitrary number of rows to check, to keep it sane
384 if len(row
) != columns
:
385 continue # skip rows that have irregular number of columns
387 for col
in columnTypes
.keys():
389 for thisType
in [int, long, float, complex]:
393 except (ValueError, OverflowError):
396 # fallback to length of string
397 thisType
= len(row
[col
])
399 # treat longs as ints
403 if thisType
!= columnTypes
[col
]:
404 if columnTypes
[col
] is None: # add new column type
405 columnTypes
[col
] = thisType
407 # type is inconsistent, remove column from
411 # finally, compare results against first row and "vote"
412 # on whether it's a header
414 for col
, colType
in columnTypes
.items():
415 if type(colType
) == type(0): # it's a length
416 if len(header
[col
]) != colType
:
420 else: # attempt typecast
423 except (ValueError, TypeError):