Added timings to various parts of the import.
[jblite.git] / jblite / jmdict_proto2.py
blobd6873c484844ff011c5f2f192a0a51eb0a59736e
1 from __future__ import print_function
2 from __future__ import with_statement
4 import os, sys, re, sqlite3, time
5 from cStringIO import StringIO
6 from xml.etree.cElementTree import ElementTree
7 from helpers import gzread
9 import gettext
10 #t = gettext.translation("jblite")
11 #_ = t.ugettext
12 gettext.install("jblite")
16 def do_time(fn, *args, **kwargs):
17 """Wraps a function call and prints the result."""
18 start = time.time()
19 result = fn(*args, **kwargs)
20 end = time.time()
21 print("do_time: Fn=%s, Time=%f" % (repr(fn), end-start))
22 return result
25 class Database(object):
27 """Top level object for SQLite 3-based JMdict database."""
29 def __init__(self, filename, init_from_file=None):
30 self.conn = sqlite3.connect(filename)
31 self.cursor = self.conn.cursor()
32 self.tables = self._create_tables()
33 if init_from_file is not None:
34 raw_data = do_time(gzread, init_from_file)
36 entities = do_time(self._get_entities, raw_data)
37 infile = StringIO(raw_data)
38 etree = do_time(ElementTree, file=infile)
39 infile.close()
41 do_time(self._create_new_tables)
42 do_time(self._populate_database, etree, entities)
44 def search(self, query, pref_lang=None):
45 raise NotImplementedError()
47 def _create_tables(self):
48 """Creates table objects.
50 Returns a dictionary of table name to table object.
52 """
53 class_mappings = {
54 "entry": EntryTable, # key->int ID
55 "r_ele": REleTable, # key-value plus nokanji flag
56 "sense": SenseTable, # one-many group mapping for sense info
57 "audit": AuditTable, # key->(update_date, update_details)
58 "lsource": LSourceTable, # key -> lang, type=full/part, wasei=t/f
59 "gloss": GlossTable, # key -> lang, g_gend, value, pri flag
60 "links": LinksTable, # key -> tag, desc, uri
61 "bibl": BiblTable, # key -> tag, txt
62 "entity": EntityTable, # Info from JMdict XML entities
65 # Set up key/value and key/entity tables
66 kv_tables = [ # key-value tables (id -> text blob)
67 "k_ele",
68 "ke_pri",
69 "re_restr",
70 "re_pri",
71 "etym",
72 "stagk",
73 "stagr",
74 "xref", # (#PCDATA)* - why the *?
75 "ant", # (#PCDATA)* - why the *?
76 "s_inf",
77 "example",
78 "pri",
80 kv_entity_tables = [ # key-value tables where val == entity
81 "ke_inf",
82 "re_inf",
83 "dial",
84 "field",
85 "misc",
86 "pos",
88 for tbl in kv_tables:
89 class_mappings[tbl] = KeyValueTable
90 for tbl in kv_entity_tables:
91 class_mappings[tbl] = KeyEntityTable
93 # Create all table objects
94 table_mappings = {}
95 for tbl, cls in class_mappings.iteritems():
96 table_mappings[tbl] = cls(self.cursor, tbl)
98 return table_mappings
100 def _create_new_tables(self):
101 """(Re)creates the database tables."""
102 for tbl, tbl_obj in self.tables.iteritems():
103 self.cursor.execute("DROP TABLE IF EXISTS %s" % tbl)
104 tbl_obj.create()
106 def _populate_database(self, etree, entities):
107 """Imports XML data into SQLite database.
109 table_d: table to table_object dictionary
110 etree: ElementTree object for JMdict
111 entities: entity name to description dictionary
114 # NOTE: this is waaay too long. Should be broken up somehow.
115 # For now this will work though...
117 # Populate entities table and get integer keys
118 # NOTE: we'll be mapping from *expanded* entities to ints.
119 entity_int_d = {}
120 tbl = self.tables['entity']
121 for entity, expansion in entities.iteritems():
122 i = tbl.insert(entity, expansion)
123 entity_int_d[expansion] = i
125 self.conn.commit()
127 # Iterate through each entry
128 print("========================================")
129 for i, entry in enumerate(etree.findall("entry")):
130 if i >= 200:
131 break
133 # entry table
134 ent_seq = entry.find("ent_seq")
135 entry_id = self.tables["entry"].insert(int(ent_seq.text))
137 for k_ele in entry.findall("k_ele"):
138 # k_ele
139 value = k_ele.find("keb").text
140 k_ele_id = self.tables["k_ele"].insert(entry_id, value)
142 # ke_inf
143 for ke_inf in k_ele.findall("ke_inf"):
144 value = ke_inf.text.strip()
145 entity_id = entity_int_d[value]
146 self.tables["ke_inf"].insert(k_ele_id, entity_id)
148 # ke_pri
149 for ke_pri in k_ele.findall("ke_pri"):
150 value = ke_pri.text
151 self.tables["ke_pri"].insert(k_ele_id, value)
153 for r_ele in entry.findall("r_ele"):
154 # r_ele
155 value = r_ele.find("reb").text
156 # For nokanji: currently it's an empty tag, so
157 # treating it as true/false.
158 nokanji = 1 if r_ele.find("nokanji") is not None else 0
159 r_ele_id = self.tables["r_ele"].insert(entry_id, value, nokanji)
161 # re_restr
162 for re_restr in r_ele.findall("re_restr"):
163 value = re_restr.text
164 self.tables["re_restr"].insert(r_ele_id, value)
166 # re_inf
167 for re_inf in r_ele.findall("re_inf"):
168 value = re_inf.text.strip()
169 entity_id = entity_int_d[value]
170 self.tables["re_inf"].insert(r_ele_id, entity_id)
172 # re_pri
173 for re_pri in r_ele.findall("re_pri"):
174 value = re_pri.text
175 self.tables["re_pri"].insert(r_ele_id, value)
177 # info
178 # (Although children of an info node, since there's only
179 # one per entry, let's connect directly to the entry.)
180 info = entry.find("info")
181 if info is not None:
182 for links in info.findall("links"):
183 link_tag = links.find("link_tag")
184 link_desc = links.find("link_desc")
185 link_uri = links.find("link_uri")
186 self.tables["links"].insert(entry_id, link_tag, link_desc,
187 link_uri)
188 for bibl in info.findall("bibl"):
189 bib_tag = links.find("bib_tag")
190 bib_txt = links.find("bib_txt")
191 bib_tag = bib_tag.text if bib_tag is not None else ""
192 bib_txt = bib_txt.text if bib_txt is not None else ""
193 self.tables["bibl"].insert(entry_id, bib_tag, bib_txt)
194 for etym in info.findall("etym"):
195 self.tables["etym"].insert(entry_id, etym.text)
196 for audit in info.findall("audit"):
197 upd_date = audit.find("upd_date")
198 upd_detl = audit.find("upd_detl")
199 self.tables["audit"].insert(entry_id, upd_dte, upd_detl)
201 # sense
202 key_entity_tables = ["pos", "field", "misc", "dial"]
203 key_value_tables = ["stagk", "stagr", "xref", "ant", "s_inf", "example"]
205 for sense in entry.findall("sense"):
206 # Each sense gets its own ID, for grouping purposes
207 sense_id = self.tables["sense"].insert(entry_id)
209 for elem_name in key_value_tables:
210 for element in sense.findall(elem_name):
211 self.tables[elem_name].insert(sense_id, element.text)
213 for elem_name in key_entity_tables:
214 for element in sense.findall(elem_name):
215 entity_id = entity_int_d[element.text.strip()]
216 self.tables[elem_name].insert(sense_id, entity_id)
218 for lsource in sense.findall("lsource"):
219 lang = lsource.get("xml:lang", "eng")
220 ls_type = lsource.get("ls_type") # implied "full" if absent, "part" otherwise
221 ls_wasei = lsource.get("ls_wasei") # usually "y"... just a flag.
223 partial = 1 if ls_type is not None else 0
224 if ls_wasei is None:
225 wasei = 0
226 elif ls_wasei.text == "y":
227 wasei = 1
228 else:
229 raise ValueError(
230 'Only known valid ls_wasei attribute value '
231 'is "y", found:', ls_wasei.text)
233 self.tables["lsource"].insert(sense_id,
234 lang, partial, wasei)
235 for gloss in sense.findall("gloss"):
236 lang = gloss.get("xml:lang", "eng")
237 g_gend = gloss.get("g_gend")
238 pri_list = gloss.getchildren()
239 if len(pri_list) > 1:
240 gloss_id = self.tables['gloss'].insert(
241 sense_id, lang, g_gend, gloss.text, 1)
242 for pri in pri_list:
243 self.tables['pri'].insert(gloss_id, pri.text)
244 else:
245 self.tables['gloss'].insert(sense_id, lang, g_gend,
246 gloss.text, 0)
248 ########################################
250 self.conn.commit()
251 print("========================================")
253 def _get_entities(self, xml_data):
254 """Gets the ENTITY definitions from JMdict.
256 Finds the built-in DTD and extracts all ENTITY definitions.
259 dtd = self._get_dtd(xml_data)
260 # do some logic to find all entities...
261 entities = {}
262 regex = '<!ENTITY[ ]+([a-zA-Z0-9-]+)[ ]+"(.*?)">'
263 for match in re.finditer(regex, xml_data):
264 key, value = match.groups()[0:2]
265 entities[key] = value
266 return entities
268 def _get_dtd(self, xml_data):
269 """Gets the DTD from JMdict."""
270 # This works for JMdict (as it is at the time of writing), but is
271 # not a general solution.
272 start_index = xml_data.find("<!DOCTYPE")
273 if start_index == -1:
274 raise Exception("Could not find start of internal DTD")
275 end_index = xml_data.find("]>")
276 if end_index == -1:
277 raise Exception("Could not find end ofinternal DTD")
278 end_index += 2
279 dtd = xml_data[start_index:end_index]
280 return dtd
283 class Table(object):
285 """Base class for tables."""
287 # These queries must be specified in child classes.
288 create_query = None
289 insert_query = None
291 # Index queries are not required. If specified, they are assumed
292 # to take the format:
294 # CREATE INDEX %s_XXX ON %s (YYY)",
296 # where %s is a placeholder for the table name, and XXX/YYY are
297 # replaced as desired.
298 index_queries = []
300 def __init__(self, cursor, table_name):
301 self.cursor = cursor
302 self.__next_id = 1
303 self.table_name = table_name
305 def create(self):
306 """Creates table, plus indices if supplied in class definition."""
307 query = self._get_create_query()
308 print(query)
309 self.cursor.execute(self._get_create_query())
310 index_queries = self._get_index_queries()
311 for query in index_queries:
312 print(query)
313 self.cursor.execute(query)
315 def insert(self, *args):
316 """Runs an insert with the specified arguments.
318 Returns the row id of the insert. (cursor.lastrowid)
321 query = self._get_insert_query()
323 try:
324 uni_args = u"(%s)" % u", ".join(
325 [unicode(o) for o in args])
326 print(query, uni_args)
327 except UnicodeEncodeError:
328 print("(UnicodeEncodeError)", query, args)
330 self.cursor.execute(query, args)
331 print("INSERT result:", self.cursor.lastrowid)
332 return self.cursor.lastrowid
334 def _get_create_query(self):
335 if self.table_name is None:
336 raise ValueError(
337 "table_name must be specified in class definition")
338 return self.create_query % self.table_name
340 def _get_insert_query(self):
341 if self.table_name is None:
342 raise ValueError(
343 "table_name must be specified in class definition")
344 return self.insert_query % self.table_name
346 def _get_index_queries(self):
347 if self.table_name is None:
348 raise ValueError(
349 "table_name must be specified in class definition")
350 if (not (isinstance(self.index_queries, list))
351 or (len(self.index_queries) == 0)):
352 return []
353 else:
354 # Each query needs to have the table name merged in two
355 # places.
356 queries = [q % (self.table_name, self.table_name)
357 for q in self.index_queries]
358 return queries
361 class EntryTable(Table):
362 create_query = ("CREATE TABLE %s "
363 "(id INTEGER PRIMARY KEY, ent_seq INTEGER)")
364 insert_query = "INSERT INTO %s VALUES (NULL, ?)"
365 index_queries = [
366 "CREATE INDEX %s_seq ON %s (ent_seq)",
370 class KeyValueTable(Table):
371 """General key/value table for one-many relations."""
372 create_query = ("CREATE TABLE %s "
373 "(id INTEGER PRIMARY KEY, fk INTEGER, value TEXT)")
374 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
375 index_queries = [
376 "CREATE INDEX %s_fk ON %s (fk)",
380 class KeyEntityTable(KeyValueTable):
381 """Just like a KeyValueTable, but with 'entity' instead of 'value'."""
382 create_query = ("CREATE TABLE %s "
383 "(id INTEGER PRIMARY KEY, fk INTEGER, entity INTEGER)")
386 class REleTable(Table):
387 create_query = ("CREATE TABLE %s "
388 "(id INTEGER PRIMARY KEY, fk INTEGER,"
389 " value TEXT, nokanji INTEGER)")
390 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
391 index_queries = [
392 "CREATE INDEX %s_fk ON %s (fk)",
396 class SenseTable(Table):
397 """Corresponds to <sense> tag. Functions as group for glosses, etc."""
398 create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
399 insert_query = "INSERT INTO %s VALUES (NULL, ?)"
400 index_queries = [
401 "CREATE INDEX %s_fk ON %s (fk)",
405 class AuditTable(Table):
406 create_query = ("CREATE TABLE %s "
407 "(id INTEGER PRIMARY KEY, fk INTEGER,"
408 " update_date TEXT, update_details TEXT)")
409 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
410 index_queries = [
411 "CREATE INDEX %s_fk ON %s (fk)",
415 class LSourceTable(Table):
416 """Represents the <lsource> element from JMdict.
418 Important changes:
419 ls_type=full/part => partial=1/0
420 ls_wasei=y/null => wasei=1/0
423 create_query = ("CREATE TABLE %s "
424 "(id INTEGER PRIMARY KEY, fk INTEGER,"
425 " lang TEXT, partial INTEGER, wasei INTEGER)")
426 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
427 index_queries = [
428 "CREATE INDEX %s_fk ON %s (fk)",
432 class GlossTable(Table):
433 create_query = ("CREATE TABLE %s "
434 "(id INTEGER PRIMARY KEY, fk INTEGER,"
435 " lang TEXT, g_gend TEXT, value TEXT, pri INTEGER)")
436 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
437 index_queries = [
438 "CREATE INDEX %s_fk ON %s (fk)",
439 "CREATE INDEX %s_lang ON %s (lang)",
440 "CREATE INDEX %s_value ON %s (value)",
444 class LinksTable(Table):
445 create_query = ("CREATE TABLE %s "
446 "(id INTEGER PRIMARY KEY, fk INTEGER,"
447 " tag TEXT, desc TEXT, uri TEXT)")
448 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
449 index_queries = [
450 "CREATE INDEX %s_fk ON %s (fk)",
454 class BiblTable(Table):
455 create_query = ("CREATE TABLE %s "
456 "(id INTEGER PRIMARY KEY, fk INTEGER,"
457 " tag TEXT, txt TEXT)")
458 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
459 index_queries = [
460 "CREATE INDEX %s_fk ON %s (fk)",
464 class EntityTable(Table):
465 create_query = ("CREATE TABLE %s "
466 "(id INTEGER PRIMARY KEY, entity TEXT, expansion TEXT)")
467 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
470 ######################################################################
472 def main():
473 if len(sys.argv) < 3:
474 print(_("Please specify"), file=sys.stderr)
475 db = Database(sys.argv[1], init_from_file=sys.argv[2])
477 if __name__ == "__main__":
478 main()