# StatsDb._getstoredfileid could return None. This was a problem in methods which
# [translate_toolkit.git] / misc / csv.py
# blob 0e3071f42a34315df13688b044c4e7920d0cf00f
1 # -*- coding: utf-8 -*-
3 """
4 csv.py - read/write/investigate CSV files
5 """
7 import re
8 from _csv import Error, __version__, writer, reader, register_dialect, \
9 unregister_dialect, get_dialect, list_dialects, \
10 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
11 __doc__
13 try:
14 from cStringIO import StringIO
15 except ImportError:
16 from StringIO import StringIO
# Public names re-exported by this module (mirrors the stdlib csv module;
# Dialect, DictReader, DictWriter and Sniffer are implemented locally,
# the rest come straight from _csv).
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
class Dialect:
    """
    Describe the properties of a concrete CSV dialect.

    Subclasses set the class attributes below; instantiating a subclass
    validates them and raises Error listing every problem found.
    Instantiating Dialect itself always fails validation.
    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True

        errors = self._validate()
        if errors != []:
            # parenthesized raise: valid on both Python 2 and 3
            # (the previous `raise Error, msg` form is Python 2 only)
            raise Error("Dialect did not validate: %s" % ", ".join(errors))

    def _validate(self):
        """Return a list of human-readable problems (empty when valid)."""
        errors = []
        if not self._valid:
            errors.append("can't directly instantiate Dialect class")

        if self.delimiter is None:
            errors.append("delimiter character not set")
        elif (not isinstance(self.delimiter, str) or
              len(self.delimiter) > 1):
            errors.append("delimiter must be one-character string")

        if self.quotechar is None:
            if self.quoting != QUOTE_NONE:
                errors.append("quotechar not set")
        elif (not isinstance(self.quotechar, str) or
              len(self.quotechar) > 1):
            errors.append("quotechar must be one-character string")

        if self.lineterminator is None:
            errors.append("lineterminator not set")
        elif not isinstance(self.lineterminator, str):
            errors.append("lineterminator must be a string")

        if self.doublequote not in (True, False):
            errors.append("doublequote parameter must be True or False")

        if self.skipinitialspace not in (True, False):
            errors.append("skipinitialspace parameter must be True or False")

        if self.quoting is None:
            errors.append("quoting parameter not set")

        if self.quoting is QUOTE_NONE:
            # NOTE(review): `unicode` is Python 2 only, consistent with the
            # rest of this module; this branch would NameError on Python 3.
            if (not isinstance(self.escapechar, (unicode, str)) or
                len(self.escapechar) > 1):
                errors.append("escapechar must be a one-character string or unicode object")

        return errors
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
class DictReader:
    """
    Read CSV rows as dictionaries keyed by *fieldnames*.

    Extra fields in a long row are collected in a list under *restkey*;
    fields missing from a short row are filled with *restval*.
    Blank rows are skipped rather than returned.
    """

    def __init__(self, f, fieldnames, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)

    def __iter__(self):
        return self

    def next(self):
        # use the next() builtin so this works whether the underlying
        # reader exposes .next() (Python 2) or .__next__() (Python 3)
        row = next(self.reader)
        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    # Python 3 iterator-protocol alias; .next() is kept for Python 2 callers.
    __next__ = next
class DictWriter:
    """
    Write dictionaries as CSV rows in the order given by *fieldnames*.

    Keys missing from a row dict are written as *restval*; keys not in
    *fieldnames* either raise ValueError or are silently dropped,
    depending on *extrasaction* ("raise" or "ignore").
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # parenthesized raise: valid on both Python 2 and 3
            # (the previous `raise ValueError, msg` form is Python 2 only)
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        """Order rowdict's values by self.fieldnames, applying restval."""
        if self.extrasaction == "raise":
            for k in rowdict.keys():
                if k not in self.fieldnames:
                    raise ValueError("dict contains fields not in fieldnames")
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float

# Sniffer.has_header also probes values with long(); guard it the same way
# for runtimes where long no longer exists (e.g. Python 3), falling back to
# int. This is a no-op wherever long is a builtin.
try:
    long
except NameError:
    long = int
class Sniffer:
    """
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    """

    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample

        sample     -- text of sample CSV data
        delimiters -- optional string of candidate delimiter characters
        """
        # quote-based detection is the most reliable; fall back to character
        # frequency analysis when the sample contains no quoted fields
        quotechar, delimiter, skipinitialspace = \
            self._guess_quote_and_delimiter(sample, delimiters)
        if delimiter is None:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        # NOTE: returns the dialect *class* with attributes set on it (not
        # an instance); reader()/writer() accept it either way.
        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """
        matches = []
        # NOTE(review): the doubled '>' in the third pattern below looks like
        # a typo inherited from the old stdlib csv.py -- confirm against
        # upstream before changing, since fixing it alters sniffing results.
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.S | re.M)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # most frequently seen quote character wins
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """
        # drop blank lines; a list comprehension (rather than filter()) keeps
        # the result a real list so len() and slicing below work on both
        # Python 2 and 3
        data = [line for line in data.split('\n') if line]

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.strip().count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # materialize items so indexing and remove() below are legal
                # on dict-view implementations as well as Python 2 lists
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims:
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # finally, just return the first damn character in the list
        delim = list(delims.keys())[0]
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        """
        Guess whether the first row of *sample* is a header row.
        """
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # iterate a snapshot of the keys: entries may be deleted below
            for col in list(columnTypes.keys()):

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0