colly/core.py

   1 import sys
   2 import string
   3 import logging
   4 import csv
   5 import re
   6 from collections import namedtuple
   7
   8 from colly.exceptions import CsvImportError
   9
  10 import simplejson as json
  11
  12 class Collate(object):
  13
  14     _rows = {}
  15     headings = ""
  16     cols = 0 # number of columns in CSV file
  17
  18     ''' Csv parsing and formatting.
  19     '''
  20     def __init__(self, csv_file, **options):
  21         with open(csv_file, "rb") as f:
  22             raw = csv.reader(f)
  23
  24             ''' Determine given columns, check against CSV.
  25             '''
  26             first_row = raw.next() # discards first row
  27
  28             if 'headings' in options:
  29                 headings = options['headings']
  30                 f.seek(0) # reset iter to first row ^
  31             else:
  32                 headings = first_row
  33                 logging.info('Interpretting headings as row #1')
  34
  35             self.headings = self.pad(first_row, headings)
  36
  37             ''' OK, columns should be in order, ready to generate a map of the
  38                 @csv_file.
  39             '''
  40             index, pk = {}, 0 #: set empty vars
  41
  42             try:
  43                 Head = namedtuple("row", ",".join(self.headings), verbose=False)
  44             except ValueError, err:
  45                 raise CsvImportError(err)
  46
  47             for row in map(Head._make, raw):
  48                 ''' Perhaps not immediately obvious, index contains the full
  49                     dataset. The "pk" (primary key) should be unique, or if
  50                     not given it will be incremented.
  51                 '''
  52                 if hasattr(row, 'pk'):
  53                     index[row.pk] = row
  54                 else:
  55                     pk += 1
  56                     index[pk] = row
  57         self.column = index #: saves to all to _rows, and the pk column as a set.
  58
  59     ''' Make properties use validation & write-once only
  60     '''
  61     @property
  62     def column(self): return False
  63
  64     @column.setter
  65     def column(self, v):
  66         if not self._rows:
  67             self._rows = v
  68
  69     @column.getter
  70     def column(self): return set(self._rows) #: NB this will rtn the pk column
  71
  72     ''' Utils
  73     '''
  74     def pad(self, sample, headings):
  75         ''' Padding rows (e.g. csv headings), trys to avoid annoying namedtuple
  76             with empties e.g.
  77                 ",,film,director,"
  78             resolves to:
  79                 "A,B,film,director,E"
  80         '''
  81         headings = headings or []
  82         if len(sample) < len(headings):
  83             raise CsvImportError("Given headings exceeded those in CSV")
  84
  85         for n, col in enumerate(sample):
  86             auto_heading = alphabet(n)
  87             try:
  88                 if re.match(r'^\s|,|$', headings[n]):
  89                     headings[n] = auto_heading
  90             except IndexError:
  91                 headings.append(auto_heading)
  92
  93         logging.info("CSV columns given headings: %s" % (headings))
  94         return headings
  95
  96     def get_row(self, pk):
  97         return self._rows[pk]
  98
  99     ''' Rehashing/ formatting
 100     '''
 101     def as_json(self):
 102         m = {}
 103         for i in self.column:
 104             m[i] = self._rows[i]._asdict() #: turn _rows (namedtuple) into dict => dump.
 105         return json.dumps(m)
 106
 107 ''' Helpers
 108 '''
 109 def alphabet(n):
 110     ''' returns letter of alphabet at 'n', then increments A2, B2, C2 eg:
 111     >>> assert (alphabet(0), alphabet(25)) == ('A', 'Z')
 112     >>> assert alphabet(26) == 'A2'
 113     >>> assert alphabet(42) == 'Q2'
 114     >>> assert alphabet(52) == 'A3'
 115     '''
 116     if n < 26:
 117         return string.uppercase[n]
 118
 119     diff = float(n) / 26
 120     step = int(round((diff % 1) * 26))
 121
 122     return "%s%s" % (
 123       string.uppercase[step],
 124       str(int(diff+1)) #: number of iterations of alphabet, start at '2'
 125     )
 126
 127 if __name__ == "__main__":
 128     import doctest
 129     doctest.testmod()