1 from __future__
import print_function
2 from __future__
import with_statement
4 import os
, sys
, re
, sqlite3
, time
5 from cStringIO
import StringIO
6 from xml
.etree
.cElementTree
import ElementTree
7 from helpers
import gzread
10 #t = gettext.translation("jblite")
12 gettext
.install("jblite")
16 def do_time(fn
, *args
, **kwargs
):
17 """Wraps a function call and prints the result.
19 Technically, this will also wrap an object instantiation, and will
20 return the object. (This lets us time ElementTree instantiation.)
24 result
= fn(*args
, **kwargs
)
26 print("do_time: Fn=%s, Time=%f" % (repr(fn
), end
-start
))
30 class Database(object):
32 """Top level object for SQLite 3-based JMdict database."""
34 def __init__(self
, filename
, init_from_file
=None):
35 self
.conn
= sqlite3
.connect(filename
)
36 self
.cursor
= self
.conn
.cursor()
37 self
.tables
= self
._create
_tables
()
38 if init_from_file
is not None:
39 raw_data
= do_time(gzread
, init_from_file
)
41 entities
= do_time(self
._get
_entities
, raw_data
)
42 infile
= StringIO(raw_data
)
43 etree
= do_time(ElementTree
, file=infile
)
46 do_time(self
._create
_new
_tables
)
47 do_time(self
._populate
_database
, etree
, entities
)
49 def search(self
, query
, pref_lang
=None):
50 raise NotImplementedError()
52 def _create_tables(self
):
53 """Creates table objects.
55 Returns a dictionary of table name to table object.
59 "entry": EntryTable
, # key->int ID
60 "r_ele": REleTable
, # key-value plus nokanji flag
61 "sense": SenseTable
, # one-many group mapping for sense info
62 "audit": AuditTable
, # key->(update_date, update_details)
63 "lsource": LSourceTable
, # key -> lang, type=full/part, wasei=t/f
64 "gloss": GlossTable
, # key -> lang, g_gend, value, pri flag
65 "links": LinksTable
, # key -> tag, desc, uri
66 "bibl": BiblTable
, # key -> tag, txt
67 "entity": EntityTable
, # Info from JMdict XML entities
70 # Set up key/value and key/entity tables
71 kv_tables
= [ # key-value tables (id -> text blob)
79 "xref", # (#PCDATA)* - why the *?
80 "ant", # (#PCDATA)* - why the *?
85 kv_entity_tables
= [ # key-value tables where val == entity
94 class_mappings
[tbl
] = KeyValueTable
95 for tbl
in kv_entity_tables
:
96 class_mappings
[tbl
] = KeyEntityTable
98 # Create all table objects
100 for tbl
, cls
in class_mappings
.iteritems():
101 table_mappings
[tbl
] = cls(self
.cursor
, tbl
)
103 return table_mappings
105 def _create_new_tables(self
):
106 """(Re)creates the database tables."""
107 for tbl
, tbl_obj
in self
.tables
.iteritems():
108 self
.cursor
.execute("DROP TABLE IF EXISTS %s" % tbl
)
111 def _populate_database(self
, etree
, entities
):
112 """Imports XML data into SQLite database.
114 table_d: table to table_object dictionary
115 etree: ElementTree object for JMdict
116 entities: entity name to description dictionary
119 # NOTE: this is waaay too long. Should be broken up somehow.
120 # For now this will work though...
122 # Populate entities table and get integer keys
123 # NOTE: we'll be mapping from *expanded* entities to ints.
125 tbl
= self
.tables
['entity']
126 for entity
, expansion
in entities
.iteritems():
127 i
= tbl
.insert(entity
, expansion
)
128 entity_int_d
[expansion
] = i
132 # Iterate through each entry
133 #print("========================================")
134 for i
, entry
in enumerate(etree
.findall("entry")):
137 ent_seq
= entry
.find("ent_seq")
138 entry_id
= self
.tables
["entry"].insert(int(ent_seq
.text
))
140 for k_ele
in entry
.findall("k_ele"):
142 value
= k_ele
.find("keb").text
143 k_ele_id
= self
.tables
["k_ele"].insert(entry_id
, value
)
146 for ke_inf
in k_ele
.findall("ke_inf"):
147 value
= ke_inf
.text
.strip()
148 entity_id
= entity_int_d
[value
]
149 self
.tables
["ke_inf"].insert(k_ele_id
, entity_id
)
152 for ke_pri
in k_ele
.findall("ke_pri"):
154 self
.tables
["ke_pri"].insert(k_ele_id
, value
)
156 for r_ele
in entry
.findall("r_ele"):
158 value
= r_ele
.find("reb").text
159 # For nokanji: currently it's an empty tag, so
160 # treating it as true/false.
161 nokanji
= 1 if r_ele
.find("nokanji") is not None else 0
162 r_ele_id
= self
.tables
["r_ele"].insert(entry_id
, value
, nokanji
)
165 for re_restr
in r_ele
.findall("re_restr"):
166 value
= re_restr
.text
167 self
.tables
["re_restr"].insert(r_ele_id
, value
)
170 for re_inf
in r_ele
.findall("re_inf"):
171 value
= re_inf
.text
.strip()
172 entity_id
= entity_int_d
[value
]
173 self
.tables
["re_inf"].insert(r_ele_id
, entity_id
)
176 for re_pri
in r_ele
.findall("re_pri"):
178 self
.tables
["re_pri"].insert(r_ele_id
, value
)
181 # (Although children of an info node, since there's only
182 # one per entry, let's connect directly to the entry.)
183 info
= entry
.find("info")
185 for links
in info
.findall("links"):
186 link_tag
= links
.find("link_tag").text
187 link_desc
= links
.find("link_desc").text
188 link_uri
= links
.find("link_uri").text
189 self
.tables
["links"].insert(entry_id
, link_tag
, link_desc
,
191 for bibl
in info
.findall("bibl"):
192 bib_tag
= links
.find("bib_tag")
193 bib_txt
= links
.find("bib_txt")
194 bib_tag
= bib_tag
.text
if bib_tag
is not None else ""
195 bib_txt
= bib_txt
.text
if bib_txt
is not None else ""
196 self
.tables
["bibl"].insert(entry_id
, bib_tag
, bib_txt
)
197 for etym
in info
.findall("etym"):
198 self
.tables
["etym"].insert(entry_id
, etym
.text
)
199 for audit
in info
.findall("audit"):
200 upd_date
= audit
.find("upd_date").text
201 upd_detl
= audit
.find("upd_detl").text
202 self
.tables
["audit"].insert(entry_id
, upd_date
, upd_detl
)
205 key_entity_tables
= ["pos", "field", "misc", "dial"]
206 key_value_tables
= ["stagk", "stagr", "xref", "ant", "s_inf", "example"]
208 for sense
in entry
.findall("sense"):
209 # Each sense gets its own ID, for grouping purposes
210 sense_id
= self
.tables
["sense"].insert(entry_id
)
212 for elem_name
in key_value_tables
:
213 for element
in sense
.findall(elem_name
):
214 self
.tables
[elem_name
].insert(sense_id
, element
.text
)
216 for elem_name
in key_entity_tables
:
217 for element
in sense
.findall(elem_name
):
218 entity_id
= entity_int_d
[element
.text
.strip()]
219 self
.tables
[elem_name
].insert(sense_id
, entity_id
)
221 for lsource
in sense
.findall("lsource"):
222 lang
= lsource
.get("xml:lang", "eng")
223 ls_type
= lsource
.get("ls_type") # implied "full" if absent, "part" otherwise
224 ls_wasei
= lsource
.get("ls_wasei") # usually "y"... just a flag.
226 partial
= 1 if ls_type
is not None else 0
229 elif ls_wasei
== "y":
233 'Only known valid ls_wasei attribute value '
234 'is "y", found:', ls_wasei
.text
)
236 self
.tables
["lsource"].insert(sense_id
,
237 lang
, partial
, wasei
)
238 for gloss
in sense
.findall("gloss"):
239 lang
= gloss
.get("xml:lang", "eng")
240 g_gend
= gloss
.get("g_gend")
241 pri_list
= gloss
.getchildren()
242 if len(pri_list
) > 1:
243 gloss_id
= self
.tables
['gloss'].insert(
244 sense_id
, lang
, g_gend
, gloss
.text
, 1)
246 self
.tables
['pri'].insert(gloss_id
, pri
.text
)
248 self
.tables
['gloss'].insert(sense_id
, lang
, g_gend
,
251 ########################################
255 #print("========================================")
260 def _get_entities(self
, xml_data
):
261 """Gets the ENTITY definitions from JMdict.
263 Finds the built-in DTD and extracts all ENTITY definitions.
266 dtd
= self
._get
_dtd
(xml_data
)
267 # do some logic to find all entities...
269 regex
= '<!ENTITY[ ]+([a-zA-Z0-9-]+)[ ]+"(.*?)">'
270 for match
in re
.finditer(regex
, xml_data
):
271 key
, value
= match
.groups()[0:2]
272 entities
[key
] = value
275 def _get_dtd(self
, xml_data
):
276 """Gets the DTD from JMdict."""
277 # This works for JMdict (as it is at the time of writing), but is
278 # not a general solution.
279 start_index
= xml_data
.find("<!DOCTYPE")
280 if start_index
== -1:
281 raise Exception("Could not find start of internal DTD")
282 end_index
= xml_data
.find("]>")
284 raise Exception("Could not find end ofinternal DTD")
286 dtd
= xml_data
[start_index
:end_index
]
292 """Base class for tables."""
294 # These queries must be specified in child classes.
298 # Index queries are not required. If specified, they are assumed
299 # to take the format:
301 # CREATE INDEX %s_XXX ON %s (YYY)",
303 # where %s is a placeholder for the table name, and XXX/YYY are
304 # replaced as desired.
307 def __init__(self
, cursor
, table_name
):
310 self
.table_name
= table_name
313 """Creates table, plus indices if supplied in class definition."""
314 query
= self
._get
_create
_query
()
316 self
.cursor
.execute(query
)
317 index_queries
= self
._get
_index
_queries
()
318 for query
in index_queries
:
320 self
.cursor
.execute(query
)
322 def insert(self
, *args
):
323 """Runs an insert with the specified arguments.
325 Returns the row id of the insert. (cursor.lastrowid)
328 query
= self
._get
_insert
_query
()
331 # uni_args = u"(%s)" % u", ".join(
332 # [unicode(o) for o in args])
333 # print(query, uni_args)
334 #except UnicodeEncodeError:
335 # print("(UnicodeEncodeError)", query, args)
338 self
.cursor
.execute(query
, args
)
340 print("EXCEPTION OCCURRED ON INSERT: query=%s, args=%s" %
341 (repr(query
), repr(args
)))
343 #print("INSERT result:", self.cursor.lastrowid)
344 return self
.cursor
.lastrowid
346 def _get_create_query(self
):
347 if self
.table_name
is None:
349 "table_name must be specified in class definition")
350 return self
.create_query
% self
.table_name
352 def _get_insert_query(self
):
353 if self
.table_name
is None:
355 "table_name must be specified in class definition")
356 return self
.insert_query
% self
.table_name
358 def _get_index_queries(self
):
359 if self
.table_name
is None:
361 "table_name must be specified in class definition")
362 if (not (isinstance(self
.index_queries
, list))
363 or (len(self
.index_queries
) == 0)):
366 # Each query needs to have the table name merged in two
368 queries
= [q
% (self
.table_name
, self
.table_name
)
369 for q
in self
.index_queries
]
373 class EntryTable(Table
):
374 create_query
= ("CREATE TABLE %s "
375 "(id INTEGER PRIMARY KEY, ent_seq INTEGER)")
376 insert_query
= "INSERT INTO %s VALUES (NULL, ?)"
378 "CREATE INDEX %s_seq ON %s (ent_seq)",
382 class KeyValueTable(Table
):
383 """General key/value table for one-many relations."""
384 create_query
= ("CREATE TABLE %s "
385 "(id INTEGER PRIMARY KEY, fk INTEGER, value TEXT)")
386 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?)"
388 "CREATE INDEX %s_fk ON %s (fk)",
392 class KeyEntityTable(KeyValueTable
):
393 """Just like a KeyValueTable, but with 'entity' instead of 'value'."""
394 create_query
= ("CREATE TABLE %s "
395 "(id INTEGER PRIMARY KEY, fk INTEGER, entity INTEGER)")
398 class REleTable(Table
):
399 create_query
= ("CREATE TABLE %s "
400 "(id INTEGER PRIMARY KEY, fk INTEGER,"
401 " value TEXT, nokanji INTEGER)")
402 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
404 "CREATE INDEX %s_fk ON %s (fk)",
408 class SenseTable(Table
):
409 """Corresponds to <sense> tag. Functions as group for glosses, etc."""
410 create_query
= ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
411 insert_query
= "INSERT INTO %s VALUES (NULL, ?)"
413 "CREATE INDEX %s_fk ON %s (fk)",
417 class AuditTable(Table
):
418 create_query
= ("CREATE TABLE %s "
419 "(id INTEGER PRIMARY KEY, fk INTEGER,"
420 " update_date TEXT, update_details TEXT)")
421 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
423 "CREATE INDEX %s_fk ON %s (fk)",
427 class LSourceTable(Table
):
428 """Represents the <lsource> element from JMdict.
431 ls_type=full/part => partial=1/0
432 ls_wasei=y/null => wasei=1/0
435 create_query
= ("CREATE TABLE %s "
436 "(id INTEGER PRIMARY KEY, fk INTEGER,"
437 " lang TEXT, partial INTEGER, wasei INTEGER)")
438 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
440 "CREATE INDEX %s_fk ON %s (fk)",
444 class GlossTable(Table
):
445 create_query
= ("CREATE TABLE %s "
446 "(id INTEGER PRIMARY KEY, fk INTEGER,"
447 " lang TEXT, g_gend TEXT, value TEXT, pri INTEGER)")
448 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
450 "CREATE INDEX %s_fk ON %s (fk)",
451 "CREATE INDEX %s_lang ON %s (lang)",
452 "CREATE INDEX %s_value ON %s (value)",
456 class LinksTable(Table
):
457 create_query
= ("CREATE TABLE %s "
458 "(id INTEGER PRIMARY KEY, fk INTEGER,"
459 " tag TEXT, desc TEXT, uri TEXT)")
460 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
462 "CREATE INDEX %s_fk ON %s (fk)",
466 class BiblTable(Table
):
467 create_query
= ("CREATE TABLE %s "
468 "(id INTEGER PRIMARY KEY, fk INTEGER,"
469 " tag TEXT, txt TEXT)")
470 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
472 "CREATE INDEX %s_fk ON %s (fk)",
476 class EntityTable(Table
):
477 create_query
= ("CREATE TABLE %s "
478 "(id INTEGER PRIMARY KEY, entity TEXT, expansion TEXT)")
479 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?)"
482 ######################################################################
485 if len(sys
.argv
) < 3:
486 print(_("Please specify"), file=sys
.stderr
)
487 db
= Database(sys
.argv
[1], init_from_file
=sys
.argv
[2])
489 if __name__
== "__main__":