1 from __future__
import print_function
2 from __future__
import with_statement
4 import os
, sys
, re
, sqlite3
, time
5 from cStringIO
import StringIO
6 from xml
.etree
.cElementTree
import ElementTree
7 from helpers
import gzread
10 #t = gettext.translation("jblite")
12 gettext
.install("jblite")
16 def do_time(fn
, *args
, **kwargs
):
17 """Wraps a function call and prints the result."""
19 result
= fn(*args
, **kwargs
)
21 print("do_time: Fn=%s, Time=%f" % (repr(fn
), end
-start
))
25 class Database(object):
27 """Top level object for SQLite 3-based JMdict database."""
29 def __init__(self
, filename
, init_from_file
=None):
30 self
.conn
= sqlite3
.connect(filename
)
31 self
.cursor
= self
.conn
.cursor()
32 self
.tables
= self
._create
_tables
()
33 if init_from_file
is not None:
34 raw_data
= do_time(gzread
, init_from_file
)
36 entities
= do_time(self
._get
_entities
, raw_data
)
37 infile
= StringIO(raw_data
)
38 etree
= do_time(ElementTree
, file=infile
)
41 do_time(self
._create
_new
_tables
)
42 do_time(self
._populate
_database
, etree
, entities
)
44 def search(self
, query
, pref_lang
=None):
45 raise NotImplementedError()
47 def _create_tables(self
):
48 """Creates table objects.
50 Returns a dictionary of table name to table object.
54 "entry": EntryTable
, # key->int ID
55 "r_ele": REleTable
, # key-value plus nokanji flag
56 "sense": SenseTable
, # one-many group mapping for sense info
57 "audit": AuditTable
, # key->(update_date, update_details)
58 "lsource": LSourceTable
, # key -> lang, type=full/part, wasei=t/f
59 "gloss": GlossTable
, # key -> lang, g_gend, value, pri flag
60 "links": LinksTable
, # key -> tag, desc, uri
61 "bibl": BiblTable
, # key -> tag, txt
62 "entity": EntityTable
, # Info from JMdict XML entities
65 # Set up key/value and key/entity tables
66 kv_tables
= [ # key-value tables (id -> text blob)
74 "xref", # (#PCDATA)* - why the *?
75 "ant", # (#PCDATA)* - why the *?
80 kv_entity_tables
= [ # key-value tables where val == entity
89 class_mappings
[tbl
] = KeyValueTable
90 for tbl
in kv_entity_tables
:
91 class_mappings
[tbl
] = KeyEntityTable
93 # Create all table objects
95 for tbl
, cls
in class_mappings
.iteritems():
96 table_mappings
[tbl
] = cls(self
.cursor
, tbl
)
100 def _create_new_tables(self
):
101 """(Re)creates the database tables."""
102 for tbl
, tbl_obj
in self
.tables
.iteritems():
103 self
.cursor
.execute("DROP TABLE IF EXISTS %s" % tbl
)
106 def _populate_database(self
, etree
, entities
):
107 """Imports XML data into SQLite database.
109 table_d: table to table_object dictionary
110 etree: ElementTree object for JMdict
111 entities: entity name to description dictionary
114 # NOTE: this is waaay too long. Should be broken up somehow.
115 # For now this will work though...
117 # Populate entities table and get integer keys
118 # NOTE: we'll be mapping from *expanded* entities to ints.
120 tbl
= self
.tables
['entity']
121 for entity
, expansion
in entities
.iteritems():
122 i
= tbl
.insert(entity
, expansion
)
123 entity_int_d
[expansion
] = i
127 # Iterate through each entry
128 print("========================================")
129 for i
, entry
in enumerate(etree
.findall("entry")):
134 ent_seq
= entry
.find("ent_seq")
135 entry_id
= self
.tables
["entry"].insert(int(ent_seq
.text
))
137 for k_ele
in entry
.findall("k_ele"):
139 value
= k_ele
.find("keb").text
140 k_ele_id
= self
.tables
["k_ele"].insert(entry_id
, value
)
143 for ke_inf
in k_ele
.findall("ke_inf"):
144 value
= ke_inf
.text
.strip()
145 entity_id
= entity_int_d
[value
]
146 self
.tables
["ke_inf"].insert(k_ele_id
, entity_id
)
149 for ke_pri
in k_ele
.findall("ke_pri"):
151 self
.tables
["ke_pri"].insert(k_ele_id
, value
)
153 for r_ele
in entry
.findall("r_ele"):
155 value
= r_ele
.find("reb").text
156 # For nokanji: currently it's an empty tag, so
157 # treating it as true/false.
158 nokanji
= 1 if r_ele
.find("nokanji") is not None else 0
159 r_ele_id
= self
.tables
["r_ele"].insert(entry_id
, value
, nokanji
)
162 for re_restr
in r_ele
.findall("re_restr"):
163 value
= re_restr
.text
164 self
.tables
["re_restr"].insert(r_ele_id
, value
)
167 for re_inf
in r_ele
.findall("re_inf"):
168 value
= re_inf
.text
.strip()
169 entity_id
= entity_int_d
[value
]
170 self
.tables
["re_inf"].insert(r_ele_id
, entity_id
)
173 for re_pri
in r_ele
.findall("re_pri"):
175 self
.tables
["re_pri"].insert(r_ele_id
, value
)
178 # (Although children of an info node, since there's only
179 # one per entry, let's connect directly to the entry.)
180 info
= entry
.find("info")
182 for links
in info
.findall("links"):
183 link_tag
= links
.find("link_tag")
184 link_desc
= links
.find("link_desc")
185 link_uri
= links
.find("link_uri")
186 self
.tables
["links"].insert(entry_id
, link_tag
, link_desc
,
188 for bibl
in info
.findall("bibl"):
189 bib_tag
= links
.find("bib_tag")
190 bib_txt
= links
.find("bib_txt")
191 bib_tag
= bib_tag
.text
if bib_tag
is not None else ""
192 bib_txt
= bib_txt
.text
if bib_txt
is not None else ""
193 self
.tables
["bibl"].insert(entry_id
, bib_tag
, bib_txt
)
194 for etym
in info
.findall("etym"):
195 self
.tables
["etym"].insert(entry_id
, etym
.text
)
196 for audit
in info
.findall("audit"):
197 upd_date
= audit
.find("upd_date")
198 upd_detl
= audit
.find("upd_detl")
199 self
.tables
["audit"].insert(entry_id
, upd_dte
, upd_detl
)
202 key_entity_tables
= ["pos", "field", "misc", "dial"]
203 key_value_tables
= ["stagk", "stagr", "xref", "ant", "s_inf", "example"]
205 for sense
in entry
.findall("sense"):
206 # Each sense gets its own ID, for grouping purposes
207 sense_id
= self
.tables
["sense"].insert(entry_id
)
209 for elem_name
in key_value_tables
:
210 for element
in sense
.findall(elem_name
):
211 self
.tables
[elem_name
].insert(sense_id
, element
.text
)
213 for elem_name
in key_entity_tables
:
214 for element
in sense
.findall(elem_name
):
215 entity_id
= entity_int_d
[element
.text
.strip()]
216 self
.tables
[elem_name
].insert(sense_id
, entity_id
)
218 for lsource
in sense
.findall("lsource"):
219 lang
= lsource
.get("xml:lang", "eng")
220 ls_type
= lsource
.get("ls_type") # implied "full" if absent, "part" otherwise
221 ls_wasei
= lsource
.get("ls_wasei") # usually "y"... just a flag.
223 partial
= 1 if ls_type
is not None else 0
226 elif ls_wasei
.text
== "y":
230 'Only known valid ls_wasei attribute value '
231 'is "y", found:', ls_wasei
.text
)
233 self
.tables
["lsource"].insert(sense_id
,
234 lang
, partial
, wasei
)
235 for gloss
in sense
.findall("gloss"):
236 lang
= gloss
.get("xml:lang", "eng")
237 g_gend
= gloss
.get("g_gend")
238 pri_list
= gloss
.getchildren()
239 if len(pri_list
) > 1:
240 gloss_id
= self
.tables
['gloss'].insert(
241 sense_id
, lang
, g_gend
, gloss
.text
, 1)
243 self
.tables
['pri'].insert(gloss_id
, pri
.text
)
245 self
.tables
['gloss'].insert(sense_id
, lang
, g_gend
,
248 ########################################
251 print("========================================")
253 def _get_entities(self
, xml_data
):
254 """Gets the ENTITY definitions from JMdict.
256 Finds the built-in DTD and extracts all ENTITY definitions.
259 dtd
= self
._get
_dtd
(xml_data
)
260 # do some logic to find all entities...
262 regex
= '<!ENTITY[ ]+([a-zA-Z0-9-]+)[ ]+"(.*?)">'
263 for match
in re
.finditer(regex
, xml_data
):
264 key
, value
= match
.groups()[0:2]
265 entities
[key
] = value
268 def _get_dtd(self
, xml_data
):
269 """Gets the DTD from JMdict."""
270 # This works for JMdict (as it is at the time of writing), but is
271 # not a general solution.
272 start_index
= xml_data
.find("<!DOCTYPE")
273 if start_index
== -1:
274 raise Exception("Could not find start of internal DTD")
275 end_index
= xml_data
.find("]>")
277 raise Exception("Could not find end ofinternal DTD")
279 dtd
= xml_data
[start_index
:end_index
]
285 """Base class for tables."""
287 # These queries must be specified in child classes.
291 # Index queries are not required. If specified, they are assumed
292 # to take the format:
294 # CREATE INDEX %s_XXX ON %s (YYY)",
296 # where %s is a placeholder for the table name, and XXX/YYY are
297 # replaced as desired.
300 def __init__(self
, cursor
, table_name
):
303 self
.table_name
= table_name
306 """Creates table, plus indices if supplied in class definition."""
307 query
= self
._get
_create
_query
()
309 self
.cursor
.execute(self
._get
_create
_query
())
310 index_queries
= self
._get
_index
_queries
()
311 for query
in index_queries
:
313 self
.cursor
.execute(query
)
315 def insert(self
, *args
):
316 """Runs an insert with the specified arguments.
318 Returns the row id of the insert. (cursor.lastrowid)
321 query
= self
._get
_insert
_query
()
324 uni_args
= u
"(%s)" % u
", ".join(
325 [unicode(o
) for o
in args
])
326 print(query
, uni_args
)
327 except UnicodeEncodeError:
328 print("(UnicodeEncodeError)", query
, args
)
330 self
.cursor
.execute(query
, args
)
331 print("INSERT result:", self
.cursor
.lastrowid
)
332 return self
.cursor
.lastrowid
334 def _get_create_query(self
):
335 if self
.table_name
is None:
337 "table_name must be specified in class definition")
338 return self
.create_query
% self
.table_name
340 def _get_insert_query(self
):
341 if self
.table_name
is None:
343 "table_name must be specified in class definition")
344 return self
.insert_query
% self
.table_name
346 def _get_index_queries(self
):
347 if self
.table_name
is None:
349 "table_name must be specified in class definition")
350 if (not (isinstance(self
.index_queries
, list))
351 or (len(self
.index_queries
) == 0)):
354 # Each query needs to have the table name merged in two
356 queries
= [q
% (self
.table_name
, self
.table_name
)
357 for q
in self
.index_queries
]
361 class EntryTable(Table
):
362 create_query
= ("CREATE TABLE %s "
363 "(id INTEGER PRIMARY KEY, ent_seq INTEGER)")
364 insert_query
= "INSERT INTO %s VALUES (NULL, ?)"
366 "CREATE INDEX %s_seq ON %s (ent_seq)",
370 class KeyValueTable(Table
):
371 """General key/value table for one-many relations."""
372 create_query
= ("CREATE TABLE %s "
373 "(id INTEGER PRIMARY KEY, fk INTEGER, value TEXT)")
374 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?)"
376 "CREATE INDEX %s_fk ON %s (fk)",
380 class KeyEntityTable(KeyValueTable
):
381 """Just like a KeyValueTable, but with 'entity' instead of 'value'."""
382 create_query
= ("CREATE TABLE %s "
383 "(id INTEGER PRIMARY KEY, fk INTEGER, entity INTEGER)")
386 class REleTable(Table
):
387 create_query
= ("CREATE TABLE %s "
388 "(id INTEGER PRIMARY KEY, fk INTEGER,"
389 " value TEXT, nokanji INTEGER)")
390 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
392 "CREATE INDEX %s_fk ON %s (fk)",
396 class SenseTable(Table
):
397 """Corresponds to <sense> tag. Functions as group for glosses, etc."""
398 create_query
= ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
399 insert_query
= "INSERT INTO %s VALUES (NULL, ?)"
401 "CREATE INDEX %s_fk ON %s (fk)",
405 class AuditTable(Table
):
406 create_query
= ("CREATE TABLE %s "
407 "(id INTEGER PRIMARY KEY, fk INTEGER,"
408 " update_date TEXT, update_details TEXT)")
409 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
411 "CREATE INDEX %s_fk ON %s (fk)",
415 class LSourceTable(Table
):
416 """Represents the <lsource> element from JMdict.
419 ls_type=full/part => partial=1/0
420 ls_wasei=y/null => wasei=1/0
423 create_query
= ("CREATE TABLE %s "
424 "(id INTEGER PRIMARY KEY, fk INTEGER,"
425 " lang TEXT, partial INTEGER, wasei INTEGER)")
426 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
428 "CREATE INDEX %s_fk ON %s (fk)",
432 class GlossTable(Table
):
433 create_query
= ("CREATE TABLE %s "
434 "(id INTEGER PRIMARY KEY, fk INTEGER,"
435 " lang TEXT, g_gend TEXT, value TEXT, pri INTEGER)")
436 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
438 "CREATE INDEX %s_fk ON %s (fk)",
439 "CREATE INDEX %s_lang ON %s (lang)",
440 "CREATE INDEX %s_value ON %s (value)",
444 class LinksTable(Table
):
445 create_query
= ("CREATE TABLE %s "
446 "(id INTEGER PRIMARY KEY, fk INTEGER,"
447 " tag TEXT, desc TEXT, uri TEXT)")
448 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
450 "CREATE INDEX %s_fk ON %s (fk)",
454 class BiblTable(Table
):
455 create_query
= ("CREATE TABLE %s "
456 "(id INTEGER PRIMARY KEY, fk INTEGER,"
457 " tag TEXT, txt TEXT)")
458 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
460 "CREATE INDEX %s_fk ON %s (fk)",
464 class EntityTable(Table
):
465 create_query
= ("CREATE TABLE %s "
466 "(id INTEGER PRIMARY KEY, entity TEXT, expansion TEXT)")
467 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?)"
470 ######################################################################
473 if len(sys
.argv
) < 3:
474 print(_("Please specify"), file=sys
.stderr
)
475 db
= Database(sys
.argv
[1], init_from_file
=sys
.argv
[2])
477 if __name__
== "__main__":