Can now create all needed tables.
[jblite.git] / jblite / jmdict_proto2.py
blob0ccf810899176791fdabaead1ce8bda69bf1a812
1 from __future__ import print_function
2 from __future__ import with_statement
4 import os, sys, re, sqlite3
5 from cStringIO import StringIO
6 from xml.etree.cElementTree import ElementTree
7 from helpers import gzread
9 import gettext
10 #t = gettext.translation("jblite")
11 #_ = t.ugettext
12 gettext.install("jblite")
class Database(object):

    """SQLite-backed store built from a JMdict source file.

    Connects to the SQLite database at ``filename``.  If
    ``init_from_file`` is given, every known table is dropped and
    recreated, and the JMdict file is then read and processed.
    """

    # Matches one entity declaration of the internal DTD, e.g.
    #     <!ENTITY v5k-s "Godan verb - Iku/Yuku special class">
    # Group 1 is the entity name and group 3 its expansion.  The name
    # class includes digits, since names such as "v1" would otherwise
    # be skipped.  Group 2 captures the opening quote so that the same
    # quote character is required to close the value.
    _ENTITY_REGEX = re.compile(
        r"""<!ENTITY\s+([A-Za-z0-9_.:-]+)\s+(["'])(.*?)\2\s*>""")

    def __init__(self, filename, init_from_file=None):
        """Opens the database and optionally (re)builds it.

        filename -- path handed to sqlite3.connect().
        init_from_file -- optional JMdict source file; when not None
            the tables are reset and the file is processed.
        """
        self.conn = sqlite3.connect(filename)
        self.cursor = self.conn.cursor()
        if init_from_file is not None:
            self._reset_database()
            self._init_from_file(init_from_file)

    def search(self, query, pref_lang=None):
        """Not implemented yet; always raises NotImplementedError."""
        raise NotImplementedError()

    def _reset_database(self):
        """Drops every known table and creates it again, empty."""
        other_tables = {
            "entry": EntryTable,     # key->int ID
            "r_ele": REleTable,      # key-value plus nokanji flag
            "audit": AuditTable,     # key->(update_date, update_details)
            "lsource": LSourceTable, # key -> lang, type=full/part, wasei=t/f
            "gloss": GlossTable,     # key -> lang, g_gend, value, pri flag
            "links": LinksTable,     # key -> tag, desc, uri
            "bibl": BiblTable,       # key -> tag, txt
            #"etym",                 # not used yet
            "entities": EntityTable, # Info from JMdict XML entities
            }
        kv_tables = [  # key-value tables (id -> text blob)
            "k_ele",
            "ke_pri",
            "re_restr",
            "re_pri",
            "stagk",
            "stagr",
            "xref",
            "ant",
            "s_inf",
            "example",
            "pri",
            ]
        kv_entity_tables = [  # key-value tables where val == entity
            "ke_inf",
            "re_inf",
            "dial",
            "field",
            "misc",
            "pos",
            ]

        # Create mappings of table name to class.  A copy is used so the
        # literal above is not mutated.
        class_mappings = dict(other_tables)
        class_mappings.update((tbl, KeyValueTable) for tbl in kv_tables)
        class_mappings.update(
            (tbl, KeyEntityTable) for tbl in kv_entity_tables)

        # Drop any existing tables.  Table names come only from the
        # hard-coded lists above, so string interpolation is safe here.
        for tbl in class_mappings:
            self.cursor.execute("DROP TABLE IF EXISTS %s" % tbl)

        # Create all tables (and their indices) in the DB.
        for tbl, cls in class_mappings.items():
            cls(self.cursor, tbl).create()

    def _init_from_file(self, jmdict_src):
        """Reads the JMdict file and processes its parsed contents."""
        raw_data = gzread(jmdict_src)
        entities = self._get_entities(raw_data)

        infile = StringIO(raw_data)
        try:
            etree = ElementTree(file=infile)
        finally:
            # Release the buffer even if parsing fails.
            infile.close()

        self._process_etree(etree, entities)

    def _get_entities(self, xml_data):
        """Gets the ENTITY definitions from JMdict.

        Finds the built-in DTD and extracts all ENTITY definitions.

        Returns a dict mapping "&name;" to the entity's expansion text.
        Raises Exception (via _get_dtd) if no internal DTD is found.
        """
        # Only the DTD is scanned; entity declarations cannot appear
        # outside it, and the rest of the document can be very large.
        dtd = self._get_dtd(xml_data)
        entities = {}
        for match in self._ENTITY_REGEX.finditer(dtd):
            key = "&%s;" % match.group(1)  # Convert to &entity; format
            entities[key] = match.group(3)
        return entities

    def _get_dtd(self, xml_data):
        """Gets the DTD from JMdict.

        Returns the text from "<!DOCTYPE" up to and including the first
        "]>" that follows it.  Raises Exception if either marker is
        missing.
        """
        # This works for JMdict (as it is at the time of writing), but is
        # not a general solution.
        start_index = xml_data.find("<!DOCTYPE")
        if start_index == -1:
            raise Exception("Could not find start of internal DTD")
        # Search from the DOCTYPE onwards so that a "]>" appearing earlier
        # in the document cannot produce an end before the start.
        end_index = xml_data.find("]>", start_index)
        if end_index == -1:
            raise Exception("Could not find end of internal DTD")
        end_index += 2
        return xml_data[start_index:end_index]

    def _process_etree(self, etree, entities):
        """Prototype stub: prints the first ten <entry> elements.

        The entities argument is accepted but not used yet.
        """
        for i, elem in enumerate(etree.getiterator("entry")):
            if i >= 10:
                break
            print(i, elem)
136 Creating entries:
138 1. Need objects from DB
139 2. Need to parse XML and:
140 1. Directly create DB
141 2. Create entry objects, and create DB from them.
142 - Strength: allows opportunity for modifying database in the future.
143 - Weakness: Need two ways to create objects: from DB and from
144 XML. This complicates things...
class Table(object):

    """Base class for tables.

    Child classes supply SQL templates as class attributes.  Each
    template contains "%s" placeholders that are filled with the table
    name passed to the constructor: once in create_query and
    insert_query, twice in every entry of index_queries.
    """

    # These queries must be specified in child classes.
    create_query = None
    insert_query = None
    index_queries = []

    def __init__(self, cursor, table_name):
        """Stores the DB cursor and the name of the table to manage."""
        self.cursor = cursor
        # Not read by any method of this class.
        self.__next_id = 1
        self.table_name = table_name

    def create(self):
        """Creates table, plus indices if supplied in class definition."""
        self.cursor.execute(self._get_create_query())
        for query in self._get_index_queries():
            self.cursor.execute(query)

    def insert(self, *args):
        """Executes the insert query with args bound as its parameters."""
        self.cursor.execute(self._get_insert_query(), args)

    def _checked_table_name(self):
        """Returns the table name; raises ValueError if it is None."""
        if self.table_name is None:
            raise ValueError(
                "table_name must be specified in class definition")
        return self.table_name

    def _get_create_query(self):
        """Returns create_query with the table name merged in."""
        return self.create_query % self._checked_table_name()

    def _get_insert_query(self):
        """Returns insert_query with the table name merged in."""
        return self.insert_query % self._checked_table_name()

    def _get_index_queries(self):
        """Returns the index queries with the table name merged in.

        An empty list is returned when no index queries are defined.
        Both lists and tuples of queries are accepted.
        """
        name = self._checked_table_name()
        if not isinstance(self.index_queries, (list, tuple)):
            return []
        # Each query needs to have the table name merged in two places.
        return [query % (name, name) for query in self.index_queries]
class EntryTable(Table):

    """Table with an integer primary key and an indexed ent_seq column."""

    create_query = "CREATE TABLE %s (id INTEGER PRIMARY KEY, ent_seq INTEGER)"
    # The id column is passed as NULL on insert.
    insert_query = "INSERT INTO %s VALUES (NULL, ?)"
    index_queries = ["CREATE INDEX %s_seq ON %s (ent_seq)"]
class KeyValueTable(Table):

    """General key/value table for one-many relations.

    Rows consist of an id, an indexed integer fk and a text value.
    """

    create_query = (
        "CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER, value TEXT)")
    # The id column is passed as NULL; fk and value are bound parameters.
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
    index_queries = ["CREATE INDEX %s_fk ON %s (fk)"]
class KeyEntityTable(KeyValueTable):

    """Just like a KeyValueTable, but with 'entity' instead of 'value'.

    Only the create query is overridden: the third column is an integer
    named entity.  The insert and index queries are inherited.
    """

    create_query = (
        "CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER, entity INTEGER)")
class REleTable(Table):

    """Key/value style table with an additional integer nokanji column."""

    create_query = (
        "CREATE TABLE %s "
        "(id INTEGER PRIMARY KEY, fk INTEGER, value TEXT, nokanji INTEGER)")
    # The id column is passed as NULL; fk, value, nokanji are bound.
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
    index_queries = ["CREATE INDEX %s_fk ON %s (fk)"]
class AuditTable(Table):

    """Table of update_date / update_details text pairs keyed by fk."""

    create_query = (
        "CREATE TABLE %s "
        "(id INTEGER PRIMARY KEY, fk INTEGER, "
        "update_date TEXT, update_details TEXT)")
    # The id column is passed as NULL; the other three are bound.
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
    index_queries = ["CREATE INDEX %s_fk ON %s (fk)"]
class LSourceTable(Table):

    """Represents the <lsource> element from JMdict.

    Important changes:
    ls_type=full/part => partial=1/0

    NOTE(review): the column name suggests part => 1 and full => 0;
    confirm the mapping when the insertion code is written.
    """

    create_query = (
        "CREATE TABLE %s "
        "(id INTEGER PRIMARY KEY, fk INTEGER, "
        "lang TEXT, partial INTEGER, wasei INTEGER)")
    # The id column is passed as NULL; fk, lang, partial, wasei are bound.
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
    index_queries = ["CREATE INDEX %s_fk ON %s (fk)"]
class GlossTable(Table):

    """Table with lang, g_gend, value and pri columns keyed by fk.

    Indices are created on fk, lang and value.
    """

    create_query = (
        "CREATE TABLE %s "
        "(id INTEGER PRIMARY KEY, fk INTEGER, "
        "lang TEXT, g_gend TEXT, value TEXT, pri INTEGER)")
    # The id column is passed as NULL; the other five are bound.
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        "CREATE INDEX %s_lang ON %s (lang)",
        "CREATE INDEX %s_value ON %s (value)",
        ]
class LinksTable(Table):

    """Table with tag, desc and uri text columns keyed by fk."""

    create_query = (
        "CREATE TABLE %s "
        "(id INTEGER PRIMARY KEY, fk INTEGER, "
        "tag TEXT, desc TEXT, uri TEXT)")
    # The id column is passed as NULL; fk, tag, desc, uri are bound.
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
    index_queries = ["CREATE INDEX %s_fk ON %s (fk)"]
class BiblTable(Table):

    """Table with tag and txt text columns keyed by fk."""

    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER,"
                    " tag TEXT, txt TEXT)")
    # The table has four columns (id, fk, tag, txt), so the insert must
    # supply exactly four values: NULL for the id plus three bound
    # parameters.  Supplying five values makes SQLite reject the insert.
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]
class EntityTable(Table):

    """Table of entity / expansion text pairs, indexed on entity."""

    # Class-level default; Table.__init__ assigns the instance attribute
    # of the same name from its table_name argument.
    table_name = "entities"
    create_query = (
        "CREATE TABLE %s "
        "(id INTEGER PRIMARY KEY, entity TEXT, expansion TEXT)")
    # The id column is passed as NULL; entity and expansion are bound.
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
    index_queries = ["CREATE INDEX %s_entity ON %s (entity)"]
def main():
    """Command-line entry point: builds a database from a JMdict file.

    Expects two command-line arguments: the SQLite database filename
    and the JMdict source file used to initialize it.

    Returns 0 on success, or 1 if too few arguments were given.
    """
    if len(sys.argv) < 3:
        print(_("Please specify a database file and a JMdict source file"),
              file=sys.stderr)
        # Stop here; continuing would index past the end of sys.argv.
        return 1
    Database(sys.argv[1], init_from_file=sys.argv[2])
    return 0


if __name__ == "__main__":
    sys.exit(main())