Doctest skeleton added.
[colly.git] / colly / core.py
blob7b53ef5f1d16470a5fe28b8de82f628ff0f71539
1 import sys
2 import string
3 import logging
4 import csv
5 import re
6 from collections import namedtuple
8 from colly.exceptions import CsvImportError
10 import simplejson as json
class Collate(object):
    ''' Csv parsing and formatting.

        Reads a CSV file once at construction time and keeps every row in
        memory as a namedtuple, keyed by primary key.  If the headings
        contain a column called "pk" its value is used as the key;
        otherwise rows are keyed 1, 2, 3... in file order.
    '''

    _rows = {}  # pk -> row namedtuple; replaced per instance via `column`
    headings = ""  # list of column names once __init__ has run
    cols = 0  # number of columns in CSV file

    def __init__(self, csv_file, **options):
        ''' Parse @csv_file and index its rows.

            csv_file -- path of the CSV file to read.
            headings -- optional keyword: list of column names.  When
                        given, the first row of the file is treated as
                        data; when omitted, the first row supplies the
                        headings.

            Raises CsvImportError if the file is empty, if more headings
            are given than the CSV has columns, or if the headings cannot
            be used as namedtuple field names.
        '''
        with self._open_csv(csv_file) as f:
            raw = csv.reader(f)

            # Determine given columns, check against CSV.
            try:
                first_row = next(raw)  # consumes the first row
            except StopIteration:
                raise CsvImportError("CSV file is empty: %s" % csv_file)

            if 'headings' in options:
                headings = options['headings']
                f.seek(0)  # first row is data: rewind so it is read again
            else:
                headings = first_row
                logging.info('Interpretting headings as row #1')

            self.headings = self.pad(first_row, headings)

            # OK, columns should be in order, ready to generate a map of
            # the @csv_file.
            try:
                Head = namedtuple("row", ",".join(self.headings))
            except ValueError as err:
                raise CsvImportError(err)

            # Perhaps not immediately obvious, index contains the full
            # dataset.  The "pk" (primary key) should be unique, or if
            # not given it will be incremented.
            index, pk = {}, 0
            for row in map(Head._make, raw):
                if hasattr(row, 'pk'):
                    index[row.pk] = row
                else:
                    pk += 1
                    index[pk] = row

        self.column = index  # saves all rows to _rows (write-once)

    @staticmethod
    def _open_csv(path):
        ''' Open @path in the mode the csv module expects on this Python.
        '''
        if sys.version_info[0] >= 3:
            # Python 3: text mode, newline translation left to csv.
            return open(path, "r", newline="")
        return open(path, "rb")  # Python 2: csv wants a binary file

    # Make properties use validation & write-once only

    @property
    def column(self):
        ''' The set of primary keys of the stored rows.
        '''
        return set(self._rows)

    @column.setter
    def column(self, v):
        # Write-once: ignored when rows have already been stored.
        if not self._rows:
            self._rows = v

    # Utils

    def pad(self, sample, headings):
        ''' Padding rows (e.g. csv headings), trys to avoid annoying
            namedtuple with empties e.g.
                ",,film,director,"
            resolves to:
                "A,B,film,director,E"

            sample   -- a row from the CSV; only its length is used.
            headings -- the proposed column names (may be None).

            Returns a new list with one heading per column of @sample;
            the list passed in is left unmodified.  Raises CsvImportError
            when @headings has more entries than @sample.
        '''
        headings = list(headings or [])  # copy: never edit the caller's list
        if len(sample) < len(headings):
            raise CsvImportError("Given headings exceeded those in CSV")

        for n in range(len(sample)):
            if n >= len(headings):
                # Fewer headings than columns: generate the missing ones.
                headings.append(alphabet(n))
            elif re.match(r'^\s|,|$', headings[n]):
                # Empty heading, or one starting with whitespace or a comma.
                headings[n] = alphabet(n)

        logging.info("CSV columns given headings: %s", headings)
        return headings

    def get_row(self, pk):
        ''' Return the row stored under primary key @pk (KeyError if absent).
        '''
        return self._rows[pk]

    # Rehashing/ formatting

    def as_json(self):
        ''' Return all rows as a JSON object mapping pk -> {heading: value}.
        '''
        # Turn each row (namedtuple) into a dict so it can be dumped.
        m = dict((pk, self._rows[pk]._asdict()) for pk in self.column)
        return json.dumps(m)
''' Helpers
'''
def alphabet(n):
    ''' Return a column label for zero-based index 'n'.

        Indexes 0-25 map to 'A'-'Z'.  From 26 onwards the alphabet repeats
        with the pass number appended, starting at 2 (A2, B2, C2 ...) eg:

        >>> assert (alphabet(0), alphabet(25)) == ('A', 'Z')
        >>> assert alphabet(26) == 'A2'
        >>> assert alphabet(42) == 'Q2'
        >>> assert alphabet(52) == 'A3'
    '''
    # ascii_uppercase is always exactly A-Z and exists on Python 2 and 3;
    # string.uppercase varies with locale and was removed in Python 3.
    letters = string.ascii_uppercase

    if n < 26:
        return letters[n]  # NB a negative n uses ordinary negative indexing

    # Exact integer arithmetic: how many full alphabets have passed, and
    # the position within the current one.
    cycle, step = divmod(int(n), 26)

    return "%s%s" % (
        letters[step],
        cycle + 1  #: number of iterations of alphabet, start at '2'
    )
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod
    testmod()