Started to update kd2.py. Small changes also done to jmdict.py.
[jblite.git] / jblite / jmdict.py
blobc3e2338961f4e56f9aa123618bfd0a75ffce2d83
1 # -*- coding: utf-8 -*-
2 """JMdict support."""
4 # This could be a bit cleaner if I used something like SQLalchemy
5 # perhaps... The create/insert/index bits were done decent enough,
6 # but lookups are done in straight SQL due to the potential
7 # complexity, and this sadly does break the abstraction of the table
8 # objects...
10 from __future__ import print_function
11 from __future__ import with_statement
13 import os, sys, re, sqlite3
14 from cStringIO import StringIO
15 from xml.etree.cElementTree import ElementTree
16 from helpers import gzread
17 from db import Database as BaseDatabase
18 from table import Table, ChildTable, KeyValueTable
20 import gettext
21 #t = gettext.translation("jblite")
22 #_ = t.ugettext
23 gettext.install("jblite")
25 # Full expansion of xml:lang
26 XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"
29 # Copied from kd2.py...
30 get_encoding = sys.getfilesystemencoding
32 # FORMAT OF TABLE MAP:
33 # dictionary entry: table: (children | None)
34 # table: table_name | (table_name, table_type, *args, **kwargs)
36 # Ideas:
37 # Value = dict: take keys as child tables, lookup all rows, and take values as grandchildren.
38 # Value = list: take items as child tables, lookup all rows, assume no children.
41 # entry:
42 # data = tables["entry"].lookup()
43 # children_map = TABLE_MAP["entry"]
44 # children = get_data(children_map["k_ele"])
45 # result = TableData(data, children)
48 # {"k_ele": {"data": [...],
49 # "children": {...}}}
51 # Table data object:
52 # obj.data: {}, # single db row
53 # obj.children: {"key": table_object}
56 # breadth first creation? depth?
58 # Map of tables to their children maps. Empty {} means no children.
61 class Entry(object):
63 def __init__(self, record):
64 self._record = record
66 def __unicode__(self):
67 """Basic string representation of the entry."""
68 rec = self._record
69 lines = []
71 k_eles = rec.find_children("k_ele")
72 if len(k_eles) > 0:
73 lines.append(_(u"Kanji readings:"))
74 for k_ele_index, k_ele in enumerate(k_eles):
75 k_ele_index += 1
76 lines.append(_(u" Reading %d:") % k_ele_index)
77 lines.append(_(u" Blob: %s") % k_ele.data['value'])
79 r_eles = rec.find_children("r_ele")
80 if len(r_eles) > 0:
81 lines.append(_(u"Kana readings:"))
82 for r_ele_index, r_ele in enumerate(r_eles):
83 r_ele_index += 1
84 lines.append(_(u" Reading %d:") % r_ele_index)
85 lines.append(_(u" Blob: %s") % r_ele.data['value'])
87 senses = rec.find_children("sense")
88 if len(senses) > 0:
89 lines.append(_(u"Glosses:"))
90 for sense_index, sense in enumerate(senses):
91 sense_index += 1
92 lines.append(_(u" Sense %d:") % sense_index)
93 glosses = sense.find_children("gloss")
95 gloss_d = {}
96 for gloss in glosses:
97 gloss_d.setdefault(gloss.data["lang"], []).append(gloss)
98 # Output glosses by language
99 for lang in sorted(gloss_d.keys()):
100 gloss_recs = gloss_d[lang]
101 lines.append(_(u" Lang: %s") % lang)
102 for gloss_index, gloss in enumerate(gloss_recs):
103 gloss_index += 1
104 val = gloss.data['value']
105 lines.append(_(u" Gloss %d: %s") % (gloss_index, val))
106 return u"\n".join(lines)
108 def __repr__(self):
109 return repr(self._record)
112 class Database(BaseDatabase):
114 """Top level object for SQLite 3-based JMdict database."""
116 entry_class = Entry
117 table_map = {
118 u"entry": {
119 u"k_ele": {
120 u"ke_inf": {},
121 u"ke_pri": {},
123 u"r_ele": {
124 u"re_restr": {},
125 u"re_inf": {},
126 u"re_pri": {},
128 u"links": {},
129 u"bibl": {},
130 u"etym": {},
131 u"audit": {},
132 u"sense": {
133 u"pos": {},
134 u"field": {},
135 u"misc": {},
136 u"dial": {},
137 u"stagk": {},
138 u"stagr": {},
139 u"xref": {},
140 u"ant": {},
141 u"s_inf": {},
142 u"example": {},
143 u"lsource": {},
144 u"gloss": {
145 u"pri": {},
151 def __init__(self, filename, init_from_file=None):
152 self.conn = sqlite3.connect(filename)
153 self.conn.row_factory = sqlite3.Row # keyword accessors for rows
154 self.cursor = self.conn.cursor()
155 self.tables = self._create_table_objects()
156 if init_from_file is not None:
157 raw_data = gzread(init_from_file)
159 entities = self._get_entities(raw_data)
160 infile = StringIO(raw_data)
161 etree = ElementTree(file=infile)
162 infile.close()
164 self._create_new_tables()
165 self._populate_database(etree, entities)
166 self.conn.commit()
168 def search(self, query, lang=None):
169 # Search
170 # Two main methods: to and from Japanese.
171 # 1. Guess which direction we're searching.
172 # 2. Search preferred method.
173 # 3. Search remaining method.
174 entries_from = self.search_from_japanese(query)
175 entries_to = self.search_to_japanese(query, lang=lang)
177 results = entries_from + entries_to
178 return results
180 def search_from_japanese(self, query):
181 # Japanese search locations:
182 # 1. Kanji elements
183 # 2. Reading elements
184 # 3. Any indices (none yet)
186 # Preferred orderings
187 # 1. Location of query in result
188 # 1. Exact match
189 # 2. Begins with
190 # 3. Anywhere
191 # 2. Ranking of usage (the (P) option in EDICT, for example)
193 # FOR NOW: just get the searching working.
194 # This puts us on roughly the same level as J-Ben 1.2.x.
195 encoding = get_encoding()
196 wrapped_query = "%%%s%%" % query # Wrap in wildcards
197 unicode_query = wrapped_query.decode(encoding)
199 entries_by_keb = self._search_keb(unicode_query)
200 entries_by_reb = self._search_reb(unicode_query)
201 #entries_by_indices = self._search_indices_from_ja(unicode_query)
203 # Merge results into one list and return.
204 results = []
205 for lst in (entries_by_keb, entries_by_reb):
206 for o in lst:
207 if o not in results:
208 results.append(o)
209 return results
211 def _search_keb(self, unicode_query):
212 """Searches kanji elements (Japanese readings with kanji).
214 Returns a list of entry IDs.
217 # keb: entry.id -> k_ele.fk, k_ele.value
218 query = "SELECT fk FROM k_ele WHERE value LIKE ?"
219 args = (unicode_query,)
220 self.cursor.execute(query, args)
221 rows = self.cursor.fetchall()
222 return [row[0] for row in rows]
224 def _search_reb(self, unicode_query):
225 """Searches reading elements (Japanese readings without kanji).
227 Returns a list of entry IDs.
230 # reb: entry.id -> r_ele.fk, r_ele.value
231 query = "SELECT fk FROM r_ele WHERE value LIKE ?"
232 args = (unicode_query,)
233 self.cursor.execute(query, args)
234 rows = self.cursor.fetchall()
235 return [row[0] for row in rows]
237 def _search_indices_from_ja(self, unicode_query):
238 raise NotImplementedError
240 def search_to_japanese(self, query, lang):
241 # Foreign language search locations:
242 # 1. Glosses
243 # 2. Any indices (none yet)
245 # For other considerations, see search_from_japanese().
246 encoding = get_encoding()
247 wrapped_query = "%%%s%%" % query # Wrap in wildcards
248 unicode_query = wrapped_query.decode(encoding)
250 entries_by_glosses = self._search_glosses(unicode_query, lang)
251 #entries_by_indices = self._search_indices_to_ja(unicode_query, lang)
253 # Merge results into one list and return.
254 results = []
255 for lst in (entries_by_glosses,):
256 for o in lst:
257 if o not in results:
258 results.append(o)
259 return results
262 def _search_glosses(self, unicode_query, lang):
263 """Searches foreign language glosses.
265 If lang is not None, only entries which match the lang
266 parameter are returned.
268 Returns a list of entry IDs.
271 # entry.id -> sense.fk, sense.id -> gloss.fk
272 if lang is not None:
273 query = (
274 "SELECT e.id FROM gloss g, sense s, entry e "
275 "WHERE g.lang = ? AND g.value LIKE ? "
276 "AND g.fk = s.id AND s.fk = e.id"
278 args = (lang, unicode_query)
279 else:
280 query = (
281 "SELECT e.id FROM gloss g, sense s, entry e "
282 "WHERE g.value LIKE ?"
284 args = (unicode_query,)
286 self.cursor.execute(query, args)
287 rows = self.cursor.fetchall()
288 return [row[0] for row in rows]
290 def _search_indices_to_ja(self, unicode_query, lang):
291 raise NotImplementedError
293 def lookup(self, id):
294 return BaseDatabase.lookup(self, "entry", id)
296 def query_db(self, *args, **kwargs):
297 """Helper. Wraps the execute/fetchall idiom on the DB cursor."""
298 self.cursor.execute(*args, **kwargs)
299 return self.cursor.fetchall()
301 def _convert_entities(self, entities):
302 """Expands a list of entities.
304 Returns a list of the entity expansions. The order of the
305 returned expansions matches the order of the input entities.
308 args = list(sorted(set(entities)))
309 template = ", ".join(["?"] * len(args))
310 query = "SELECT entity, expansion " \
311 "FROM entity WHERE entity IN (%s)" % template
312 rows = self.query_db(query, args)
313 d = {}
314 for entity, expansion in rows:
315 d[entity] = expansion
316 result = [d[entity] for entity in entities]
317 return result
319 def _create_table_objects(self):
320 """Creates table objects.
322 Returns a dictionary of table name to table object.
325 class_mappings = {
326 "entry": EntryTable, # key->int ID
327 "r_ele": REleTable, # key-value plus nokanji flag
328 "sense": SenseTable, # one-many group mapping for sense info
329 "audit": AuditTable, # key->(update_date, update_details)
330 "lsource": LSourceTable, # key -> lang, type=full/part, wasei=t/f
331 "gloss": GlossTable, # key -> lang, g_gend, value, pri flag
332 "links": LinksTable, # key -> tag, desc, uri
333 "bibl": BiblTable, # key -> tag, txt
334 "entity": EntityTable, # Info from JMdict XML entities
337 # Set up key/value and key/entity tables
338 kv_tables = [ # key-value tables (id -> text blob)
339 "k_ele",
340 "ke_pri",
341 "re_restr",
342 "re_pri",
343 "etym",
344 "stagk",
345 "stagr",
346 "xref", # (#PCDATA)* - why the *?
347 "ant", # (#PCDATA)* - why the *?
348 "s_inf",
349 "example",
350 "pri",
352 kv_entity_tables = [ # key-value tables where val == entity
353 "ke_inf",
354 "re_inf",
355 "dial",
356 "field",
357 "misc",
358 "pos",
360 for tbl in kv_tables:
361 class_mappings[tbl] = KeyValueTable
362 for tbl in kv_entity_tables:
363 class_mappings[tbl] = KeyEntityTable
365 # Create all table objects
366 table_mappings = {}
367 for tbl, cls in class_mappings.iteritems():
368 table_mappings[tbl] = cls(self.cursor, tbl)
370 return table_mappings
372 def _create_new_tables(self):
373 """(Re)creates the database tables."""
374 for tbl, tbl_obj in self.tables.iteritems():
375 self.cursor.execute("DROP TABLE IF EXISTS %s" % tbl)
376 tbl_obj.create()
378 def _populate_database(self, etree, entities):
379 """Imports XML data into SQLite database.
381 table_d: table to table_object dictionary
382 etree: ElementTree object for JMdict
383 entities: entity name to description dictionary
386 # NOTE: this is waaay too long. Should be broken up somehow.
387 # For now this will work though...
389 # Populate entities table and get integer keys
390 # NOTE: we'll be mapping from *expanded* entities to ints.
391 entity_int_d = {}
392 tbl = self.tables['entity']
393 for entity, expansion in entities.iteritems():
394 i = tbl.insert(entity, expansion)
395 entity_int_d[expansion] = i
397 # Iterate through each entry
398 for entry in etree.findall("entry"):
400 # entry table
401 ent_seq = entry.find("ent_seq")
402 entry_id = self.tables["entry"].insert(int(ent_seq.text))
404 for k_ele in entry.findall("k_ele"):
405 # k_ele
406 value = k_ele.find("keb").text
407 k_ele_id = self.tables["k_ele"].insert(entry_id, value)
409 # ke_inf
410 for ke_inf in k_ele.findall("ke_inf"):
411 value = ke_inf.text.strip()
412 entity_id = entity_int_d[value]
413 self.tables["ke_inf"].insert(k_ele_id, entity_id)
415 # ke_pri
416 for ke_pri in k_ele.findall("ke_pri"):
417 value = ke_pri.text
418 self.tables["ke_pri"].insert(k_ele_id, value)
420 for r_ele in entry.findall("r_ele"):
421 # r_ele
422 value = r_ele.find("reb").text
423 # For nokanji: currently it's an empty tag, so
424 # treating it as true/false.
425 nokanji = 1 if r_ele.find("nokanji") is not None else 0
426 r_ele_id = self.tables["r_ele"].insert(entry_id, value, nokanji)
428 # re_restr
429 for re_restr in r_ele.findall("re_restr"):
430 value = re_restr.text
431 self.tables["re_restr"].insert(r_ele_id, value)
433 # re_inf
434 for re_inf in r_ele.findall("re_inf"):
435 value = re_inf.text.strip()
436 entity_id = entity_int_d[value]
437 self.tables["re_inf"].insert(r_ele_id, entity_id)
439 # re_pri
440 for re_pri in r_ele.findall("re_pri"):
441 value = re_pri.text
442 self.tables["re_pri"].insert(r_ele_id, value)
444 # info
445 # (Although children of an info node, since there's only
446 # one per entry, let's connect directly to the entry.)
447 info = entry.find("info")
448 if info is not None:
449 for links in info.findall("links"):
450 link_tag = links.find("link_tag").text
451 link_desc = links.find("link_desc").text
452 link_uri = links.find("link_uri").text
453 self.tables["links"].insert(entry_id, link_tag, link_desc,
454 link_uri)
455 for bibl in info.findall("bibl"):
456 bib_tag = links.find("bib_tag")
457 bib_txt = links.find("bib_txt")
458 bib_tag = bib_tag.text if bib_tag is not None else None
459 bib_txt = bib_txt.text if bib_txt is not None else None
460 self.tables["bibl"].insert(entry_id, bib_tag, bib_txt)
461 for etym in info.findall("etym"):
462 self.tables["etym"].insert(entry_id, etym.text)
463 for audit in info.findall("audit"):
464 upd_date = audit.find("upd_date").text
465 upd_detl = audit.find("upd_detl").text
466 self.tables["audit"].insert(entry_id, upd_date, upd_detl)
468 # sense
469 key_entity_tables = ["pos", "field", "misc", "dial"]
470 key_value_tables = ["stagk", "stagr", "xref", "ant", "s_inf", "example"]
472 for sense in entry.findall("sense"):
473 # Each sense gets its own ID, for grouping purposes
474 sense_id = self.tables["sense"].insert(entry_id)
476 for elem_name in key_value_tables:
477 for element in sense.findall(elem_name):
478 self.tables[elem_name].insert(sense_id, element.text)
480 for elem_name in key_entity_tables:
481 for element in sense.findall(elem_name):
482 entity_id = entity_int_d[element.text.strip()]
483 self.tables[elem_name].insert(sense_id, entity_id)
485 for lsource in sense.findall("lsource"):
486 lang = lsource.get(XML_LANG, "eng")
487 ls_type = lsource.get("ls_type") # implied "full" if absent, "part" otherwise
488 ls_wasei = lsource.get("ls_wasei") # usually "y"... just a flag.
490 partial = 1 if ls_type is not None else 0
491 if ls_wasei is None:
492 wasei = 0
493 elif ls_wasei == "y":
494 wasei = 1
495 else:
496 raise ValueError(
497 'Only known valid ls_wasei attribute value '
498 'is "y", found:', ls_wasei.text)
500 self.tables["lsource"].insert(sense_id,
501 lang, partial, wasei)
502 for gloss in sense.findall("gloss"):
503 lang = gloss.get(XML_LANG, "eng")
504 g_gend = gloss.get("g_gend")
505 pri_list = gloss.getchildren()
506 if len(pri_list) > 1:
507 gloss_id = self.tables['gloss'].insert(
508 sense_id, lang, g_gend, gloss.text, 1)
509 for pri in pri_list:
510 self.tables['pri'].insert(gloss_id, pri.text)
511 else:
512 self.tables['gloss'].insert(sense_id, lang, g_gend,
513 gloss.text, 0)
515 def _get_entities(self, xml_data):
516 """Gets the ENTITY definitions from JMdict.
518 Finds the built-in DTD and extracts all ENTITY definitions.
521 dtd = self._get_dtd(xml_data)
522 # do some logic to find all entities...
523 entities = {}
524 regex = '<!ENTITY[ ]+([a-zA-Z0-9-]+)[ ]+"(.*?)">'
525 for match in re.finditer(regex, xml_data):
526 key, value = match.groups()[0:2]
527 entities[key] = value
528 return entities
530 def _get_dtd(self, xml_data):
531 """Gets the DTD from JMdict."""
532 # This works for JMdict (as it is at the time of writing), but is
533 # not a general solution.
534 start_index = xml_data.find("<!DOCTYPE")
535 if start_index == -1:
536 raise Exception("Could not find start of internal DTD")
537 end_index = xml_data.find("]>")
538 if end_index == -1:
539 raise Exception("Could not find end ofinternal DTD")
540 end_index += 2
541 dtd = xml_data[start_index:end_index]
542 return dtd
545 class EntryTable(Table):
546 create_query = ("CREATE TABLE %s "
547 "(id INTEGER PRIMARY KEY, ent_seq INTEGER)")
548 insert_query = "INSERT INTO %s VALUES (NULL, ?)"
549 index_queries = [
550 "CREATE INDEX %s_seq ON %s (ent_seq)",
554 class KeyEntityTable(KeyValueTable):
555 """Just like a KeyValueTable, but with 'entity' instead of 'value'."""
556 create_query = ("CREATE TABLE %s "
557 "(id INTEGER PRIMARY KEY, fk INTEGER, entity INTEGER)")
560 class REleTable(ChildTable):
561 create_query = ("CREATE TABLE %s "
562 "(id INTEGER PRIMARY KEY, fk INTEGER,"
563 " value TEXT, nokanji INTEGER)")
564 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
565 index_queries = [
566 "CREATE INDEX %s_fk ON %s (fk)",
570 class SenseTable(ChildTable):
571 """Corresponds to <sense> tag. Functions as group for glosses, etc."""
572 create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
573 insert_query = "INSERT INTO %s VALUES (NULL, ?)"
574 index_queries = [
575 "CREATE INDEX %s_fk ON %s (fk)",
579 class AuditTable(ChildTable):
580 create_query = ("CREATE TABLE %s "
581 "(id INTEGER PRIMARY KEY, fk INTEGER,"
582 " update_date TEXT, update_details TEXT)")
583 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
584 index_queries = [
585 "CREATE INDEX %s_fk ON %s (fk)",
589 class LSourceTable(ChildTable):
590 """Represents the <lsource> element from JMdict.
592 Important changes:
593 ls_type=full/part => partial=1/0
594 ls_wasei=y/null => wasei=1/0
597 create_query = ("CREATE TABLE %s "
598 "(id INTEGER PRIMARY KEY, fk INTEGER,"
599 " lang TEXT, partial INTEGER, wasei INTEGER)")
600 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
601 index_queries = [
602 "CREATE INDEX %s_fk ON %s (fk)",
606 class GlossTable(ChildTable):
607 create_query = ("CREATE TABLE %s "
608 "(id INTEGER PRIMARY KEY, fk INTEGER,"
609 " lang TEXT, g_gend TEXT, value TEXT, pri INTEGER)")
610 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
611 index_queries = [
612 "CREATE INDEX %s_fk ON %s (fk)",
613 "CREATE INDEX %s_lang ON %s (lang)",
614 "CREATE INDEX %s_value ON %s (value)",
618 class LinksTable(ChildTable):
619 create_query = ("CREATE TABLE %s "
620 "(id INTEGER PRIMARY KEY, fk INTEGER,"
621 " tag TEXT, desc TEXT, uri TEXT)")
622 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
623 index_queries = [
624 "CREATE INDEX %s_fk ON %s (fk)",
628 class BiblTable(ChildTable):
629 create_query = ("CREATE TABLE %s "
630 "(id INTEGER PRIMARY KEY, fk INTEGER,"
631 " tag TEXT, txt TEXT)")
632 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
633 index_queries = [
634 "CREATE INDEX %s_fk ON %s (fk)",
638 class EntityTable(Table):
639 create_query = ("CREATE TABLE %s "
640 "(id INTEGER PRIMARY KEY, entity TEXT, expansion TEXT)")
641 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
644 ######################################################################
646 def parse_args():
647 from optparse import OptionParser
648 op = OptionParser(usage="%prog [options] <db_filename> [search_query]")
649 op.add_option("-i", "--initialize",
650 dest="init_fname", metavar="XML_SOURCE",
651 help=_("Initialize database from file."))
652 op.add_option("-L", "--lang",
653 help=_("Specify preferred language for searching."))
654 options, args = op.parse_args()
655 if len(args) < 1:
656 op.print_help()
657 exit(-1)
658 return (options, args)
660 def main():
661 # Copied *almost* verbatim from kd2.py.
662 options, args = parse_args()
663 db_fname = args[0]
665 if options.init_fname is not None:
666 db = Database(db_fname, init_from_file=options.init_fname)
667 else:
668 db = Database(db_fname)
670 results = []
671 if len(args) > 1:
672 # Do search
673 # To be nice, we'll join all remaining args with spaces.
674 search_query = " ".join(args[1:])
675 if options.lang is not None:
676 results = db.search(search_query, lang=options.lang)
677 else:
678 results = db.search(search_query)
680 if len(results) > 0:
681 from pprint import pprint
682 encoding = get_encoding()
683 for index, result in enumerate(results):
684 index += 1
685 print(_("[Entry %d]") % index)
686 entry = db.lookup(result)
688 print(unicode(entry).encode(encoding))
689 print()
690 else:
691 print(_("No results found."))
693 if __name__ == "__main__":
694 main()