Removed entry limiter. Importer now seems to work from beginning to end.
[jblite.git] / jblite / jmdict_proto2.py
blob7d9aba2c7437122a5147767fe791f60da87c44ca
1 from __future__ import print_function
2 from __future__ import with_statement
4 import os, sys, re, sqlite3, time
5 from cStringIO import StringIO
6 from xml.etree.cElementTree import ElementTree
7 from helpers import gzread
9 import gettext
10 #t = gettext.translation("jblite")
11 #_ = t.ugettext
12 gettext.install("jblite")
16 def do_time(fn, *args, **kwargs):
17 """Wraps a function call and prints the result.
19 Technically, this will also wrap an object instantiation, and will
20 return the object. (This lets us time ElementTree instantiation.)
22 """
23 start = time.time()
24 result = fn(*args, **kwargs)
25 end = time.time()
26 print("do_time: Fn=%s, Time=%f" % (repr(fn), end-start))
27 return result
30 class Database(object):
32 """Top level object for SQLite 3-based JMdict database."""
34 def __init__(self, filename, init_from_file=None):
35 self.conn = sqlite3.connect(filename)
36 self.cursor = self.conn.cursor()
37 self.tables = self._create_tables()
38 if init_from_file is not None:
39 raw_data = do_time(gzread, init_from_file)
41 entities = do_time(self._get_entities, raw_data)
42 infile = StringIO(raw_data)
43 etree = do_time(ElementTree, file=infile)
44 infile.close()
46 do_time(self._create_new_tables)
47 do_time(self._populate_database, etree, entities)
49 def search(self, query, pref_lang=None):
50 raise NotImplementedError()
52 def _create_tables(self):
53 """Creates table objects.
55 Returns a dictionary of table name to table object.
57 """
58 class_mappings = {
59 "entry": EntryTable, # key->int ID
60 "r_ele": REleTable, # key-value plus nokanji flag
61 "sense": SenseTable, # one-many group mapping for sense info
62 "audit": AuditTable, # key->(update_date, update_details)
63 "lsource": LSourceTable, # key -> lang, type=full/part, wasei=t/f
64 "gloss": GlossTable, # key -> lang, g_gend, value, pri flag
65 "links": LinksTable, # key -> tag, desc, uri
66 "bibl": BiblTable, # key -> tag, txt
67 "entity": EntityTable, # Info from JMdict XML entities
70 # Set up key/value and key/entity tables
71 kv_tables = [ # key-value tables (id -> text blob)
72 "k_ele",
73 "ke_pri",
74 "re_restr",
75 "re_pri",
76 "etym",
77 "stagk",
78 "stagr",
79 "xref", # (#PCDATA)* - why the *?
80 "ant", # (#PCDATA)* - why the *?
81 "s_inf",
82 "example",
83 "pri",
85 kv_entity_tables = [ # key-value tables where val == entity
86 "ke_inf",
87 "re_inf",
88 "dial",
89 "field",
90 "misc",
91 "pos",
93 for tbl in kv_tables:
94 class_mappings[tbl] = KeyValueTable
95 for tbl in kv_entity_tables:
96 class_mappings[tbl] = KeyEntityTable
98 # Create all table objects
99 table_mappings = {}
100 for tbl, cls in class_mappings.iteritems():
101 table_mappings[tbl] = cls(self.cursor, tbl)
103 return table_mappings
105 def _create_new_tables(self):
106 """(Re)creates the database tables."""
107 for tbl, tbl_obj in self.tables.iteritems():
108 self.cursor.execute("DROP TABLE IF EXISTS %s" % tbl)
109 tbl_obj.create()
111 def _populate_database(self, etree, entities):
112 """Imports XML data into SQLite database.
114 table_d: table to table_object dictionary
115 etree: ElementTree object for JMdict
116 entities: entity name to description dictionary
119 # NOTE: this is waaay too long. Should be broken up somehow.
120 # For now this will work though...
122 # Populate entities table and get integer keys
123 # NOTE: we'll be mapping from *expanded* entities to ints.
124 entity_int_d = {}
125 tbl = self.tables['entity']
126 for entity, expansion in entities.iteritems():
127 i = tbl.insert(entity, expansion)
128 entity_int_d[expansion] = i
130 self.conn.commit()
132 # Iterate through each entry
133 #print("========================================")
134 for i, entry in enumerate(etree.findall("entry")):
136 # entry table
137 ent_seq = entry.find("ent_seq")
138 entry_id = self.tables["entry"].insert(int(ent_seq.text))
140 for k_ele in entry.findall("k_ele"):
141 # k_ele
142 value = k_ele.find("keb").text
143 k_ele_id = self.tables["k_ele"].insert(entry_id, value)
145 # ke_inf
146 for ke_inf in k_ele.findall("ke_inf"):
147 value = ke_inf.text.strip()
148 entity_id = entity_int_d[value]
149 self.tables["ke_inf"].insert(k_ele_id, entity_id)
151 # ke_pri
152 for ke_pri in k_ele.findall("ke_pri"):
153 value = ke_pri.text
154 self.tables["ke_pri"].insert(k_ele_id, value)
156 for r_ele in entry.findall("r_ele"):
157 # r_ele
158 value = r_ele.find("reb").text
159 # For nokanji: currently it's an empty tag, so
160 # treating it as true/false.
161 nokanji = 1 if r_ele.find("nokanji") is not None else 0
162 r_ele_id = self.tables["r_ele"].insert(entry_id, value, nokanji)
164 # re_restr
165 for re_restr in r_ele.findall("re_restr"):
166 value = re_restr.text
167 self.tables["re_restr"].insert(r_ele_id, value)
169 # re_inf
170 for re_inf in r_ele.findall("re_inf"):
171 value = re_inf.text.strip()
172 entity_id = entity_int_d[value]
173 self.tables["re_inf"].insert(r_ele_id, entity_id)
175 # re_pri
176 for re_pri in r_ele.findall("re_pri"):
177 value = re_pri.text
178 self.tables["re_pri"].insert(r_ele_id, value)
180 # info
181 # (Although children of an info node, since there's only
182 # one per entry, let's connect directly to the entry.)
183 info = entry.find("info")
184 if info is not None:
185 for links in info.findall("links"):
186 link_tag = links.find("link_tag").text
187 link_desc = links.find("link_desc").text
188 link_uri = links.find("link_uri").text
189 self.tables["links"].insert(entry_id, link_tag, link_desc,
190 link_uri)
191 for bibl in info.findall("bibl"):
192 bib_tag = links.find("bib_tag")
193 bib_txt = links.find("bib_txt")
194 bib_tag = bib_tag.text if bib_tag is not None else ""
195 bib_txt = bib_txt.text if bib_txt is not None else ""
196 self.tables["bibl"].insert(entry_id, bib_tag, bib_txt)
197 for etym in info.findall("etym"):
198 self.tables["etym"].insert(entry_id, etym.text)
199 for audit in info.findall("audit"):
200 upd_date = audit.find("upd_date").text
201 upd_detl = audit.find("upd_detl").text
202 self.tables["audit"].insert(entry_id, upd_date, upd_detl)
204 # sense
205 key_entity_tables = ["pos", "field", "misc", "dial"]
206 key_value_tables = ["stagk", "stagr", "xref", "ant", "s_inf", "example"]
208 for sense in entry.findall("sense"):
209 # Each sense gets its own ID, for grouping purposes
210 sense_id = self.tables["sense"].insert(entry_id)
212 for elem_name in key_value_tables:
213 for element in sense.findall(elem_name):
214 self.tables[elem_name].insert(sense_id, element.text)
216 for elem_name in key_entity_tables:
217 for element in sense.findall(elem_name):
218 entity_id = entity_int_d[element.text.strip()]
219 self.tables[elem_name].insert(sense_id, entity_id)
221 for lsource in sense.findall("lsource"):
222 lang = lsource.get("xml:lang", "eng")
223 ls_type = lsource.get("ls_type") # implied "full" if absent, "part" otherwise
224 ls_wasei = lsource.get("ls_wasei") # usually "y"... just a flag.
226 partial = 1 if ls_type is not None else 0
227 if ls_wasei is None:
228 wasei = 0
229 elif ls_wasei == "y":
230 wasei = 1
231 else:
232 raise ValueError(
233 'Only known valid ls_wasei attribute value '
234 'is "y", found:', ls_wasei.text)
236 self.tables["lsource"].insert(sense_id,
237 lang, partial, wasei)
238 for gloss in sense.findall("gloss"):
239 lang = gloss.get("xml:lang", "eng")
240 g_gend = gloss.get("g_gend")
241 pri_list = gloss.getchildren()
242 if len(pri_list) > 1:
243 gloss_id = self.tables['gloss'].insert(
244 sense_id, lang, g_gend, gloss.text, 1)
245 for pri in pri_list:
246 self.tables['pri'].insert(gloss_id, pri.text)
247 else:
248 self.tables['gloss'].insert(sense_id, lang, g_gend,
249 gloss.text, 0)
251 ########################################
253 if (i+1) % 100 == 0:
254 self.conn.commit()
255 #print("========================================")
257 self.conn.commit()
260 def _get_entities(self, xml_data):
261 """Gets the ENTITY definitions from JMdict.
263 Finds the built-in DTD and extracts all ENTITY definitions.
266 dtd = self._get_dtd(xml_data)
267 # do some logic to find all entities...
268 entities = {}
269 regex = '<!ENTITY[ ]+([a-zA-Z0-9-]+)[ ]+"(.*?)">'
270 for match in re.finditer(regex, xml_data):
271 key, value = match.groups()[0:2]
272 entities[key] = value
273 return entities
275 def _get_dtd(self, xml_data):
276 """Gets the DTD from JMdict."""
277 # This works for JMdict (as it is at the time of writing), but is
278 # not a general solution.
279 start_index = xml_data.find("<!DOCTYPE")
280 if start_index == -1:
281 raise Exception("Could not find start of internal DTD")
282 end_index = xml_data.find("]>")
283 if end_index == -1:
284 raise Exception("Could not find end ofinternal DTD")
285 end_index += 2
286 dtd = xml_data[start_index:end_index]
287 return dtd
290 class Table(object):
292 """Base class for tables."""
294 # These queries must be specified in child classes.
295 create_query = None
296 insert_query = None
298 # Index queries are not required. If specified, they are assumed
299 # to take the format:
301 # CREATE INDEX %s_XXX ON %s (YYY)",
303 # where %s is a placeholder for the table name, and XXX/YYY are
304 # replaced as desired.
305 index_queries = []
307 def __init__(self, cursor, table_name):
308 self.cursor = cursor
309 self.__next_id = 1
310 self.table_name = table_name
312 def create(self):
313 """Creates table, plus indices if supplied in class definition."""
314 query = self._get_create_query()
315 #print(query)
316 self.cursor.execute(query)
317 index_queries = self._get_index_queries()
318 for query in index_queries:
319 #print(query)
320 self.cursor.execute(query)
322 def insert(self, *args):
323 """Runs an insert with the specified arguments.
325 Returns the row id of the insert. (cursor.lastrowid)
328 query = self._get_insert_query()
330 #try:
331 # uni_args = u"(%s)" % u", ".join(
332 # [unicode(o) for o in args])
333 # print(query, uni_args)
334 #except UnicodeEncodeError:
335 # print("(UnicodeEncodeError)", query, args)
337 try:
338 self.cursor.execute(query, args)
339 except:
340 print("EXCEPTION OCCURRED ON INSERT: query=%s, args=%s" %
341 (repr(query), repr(args)))
342 raise
343 #print("INSERT result:", self.cursor.lastrowid)
344 return self.cursor.lastrowid
346 def _get_create_query(self):
347 if self.table_name is None:
348 raise ValueError(
349 "table_name must be specified in class definition")
350 return self.create_query % self.table_name
352 def _get_insert_query(self):
353 if self.table_name is None:
354 raise ValueError(
355 "table_name must be specified in class definition")
356 return self.insert_query % self.table_name
358 def _get_index_queries(self):
359 if self.table_name is None:
360 raise ValueError(
361 "table_name must be specified in class definition")
362 if (not (isinstance(self.index_queries, list))
363 or (len(self.index_queries) == 0)):
364 return []
365 else:
366 # Each query needs to have the table name merged in two
367 # places.
368 queries = [q % (self.table_name, self.table_name)
369 for q in self.index_queries]
370 return queries
373 class EntryTable(Table):
374 create_query = ("CREATE TABLE %s "
375 "(id INTEGER PRIMARY KEY, ent_seq INTEGER)")
376 insert_query = "INSERT INTO %s VALUES (NULL, ?)"
377 index_queries = [
378 "CREATE INDEX %s_seq ON %s (ent_seq)",
382 class KeyValueTable(Table):
383 """General key/value table for one-many relations."""
384 create_query = ("CREATE TABLE %s "
385 "(id INTEGER PRIMARY KEY, fk INTEGER, value TEXT)")
386 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
387 index_queries = [
388 "CREATE INDEX %s_fk ON %s (fk)",
392 class KeyEntityTable(KeyValueTable):
393 """Just like a KeyValueTable, but with 'entity' instead of 'value'."""
394 create_query = ("CREATE TABLE %s "
395 "(id INTEGER PRIMARY KEY, fk INTEGER, entity INTEGER)")
398 class REleTable(Table):
399 create_query = ("CREATE TABLE %s "
400 "(id INTEGER PRIMARY KEY, fk INTEGER,"
401 " value TEXT, nokanji INTEGER)")
402 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
403 index_queries = [
404 "CREATE INDEX %s_fk ON %s (fk)",
408 class SenseTable(Table):
409 """Corresponds to <sense> tag. Functions as group for glosses, etc."""
410 create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
411 insert_query = "INSERT INTO %s VALUES (NULL, ?)"
412 index_queries = [
413 "CREATE INDEX %s_fk ON %s (fk)",
417 class AuditTable(Table):
418 create_query = ("CREATE TABLE %s "
419 "(id INTEGER PRIMARY KEY, fk INTEGER,"
420 " update_date TEXT, update_details TEXT)")
421 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
422 index_queries = [
423 "CREATE INDEX %s_fk ON %s (fk)",
427 class LSourceTable(Table):
428 """Represents the <lsource> element from JMdict.
430 Important changes:
431 ls_type=full/part => partial=1/0
432 ls_wasei=y/null => wasei=1/0
435 create_query = ("CREATE TABLE %s "
436 "(id INTEGER PRIMARY KEY, fk INTEGER,"
437 " lang TEXT, partial INTEGER, wasei INTEGER)")
438 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
439 index_queries = [
440 "CREATE INDEX %s_fk ON %s (fk)",
444 class GlossTable(Table):
445 create_query = ("CREATE TABLE %s "
446 "(id INTEGER PRIMARY KEY, fk INTEGER,"
447 " lang TEXT, g_gend TEXT, value TEXT, pri INTEGER)")
448 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
449 index_queries = [
450 "CREATE INDEX %s_fk ON %s (fk)",
451 "CREATE INDEX %s_lang ON %s (lang)",
452 "CREATE INDEX %s_value ON %s (value)",
456 class LinksTable(Table):
457 create_query = ("CREATE TABLE %s "
458 "(id INTEGER PRIMARY KEY, fk INTEGER,"
459 " tag TEXT, desc TEXT, uri TEXT)")
460 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
461 index_queries = [
462 "CREATE INDEX %s_fk ON %s (fk)",
466 class BiblTable(Table):
467 create_query = ("CREATE TABLE %s "
468 "(id INTEGER PRIMARY KEY, fk INTEGER,"
469 " tag TEXT, txt TEXT)")
470 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
471 index_queries = [
472 "CREATE INDEX %s_fk ON %s (fk)",
476 class EntityTable(Table):
477 create_query = ("CREATE TABLE %s "
478 "(id INTEGER PRIMARY KEY, entity TEXT, expansion TEXT)")
479 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
482 ######################################################################
484 def main():
485 if len(sys.argv) < 3:
486 print(_("Please specify"), file=sys.stderr)
487 db = Database(sys.argv[1], init_from_file=sys.argv[2])
489 if __name__ == "__main__":
490 main()