1 # -*- coding: utf-8 -*-
4 # This could be a bit cleaner if I used something like SQLalchemy
5 # perhaps... The create/insert/index bits were done decent enough,
6 # but lookups are done in straight SQL due to the potential
7 # complexity, and this sadly does break the abstraction of the table
10 from __future__
import print_function
11 from __future__
import with_statement
13 import os
, sys
, re
, sqlite3
14 from cStringIO
import StringIO
15 from xml
.etree
.cElementTree
import ElementTree
16 from helpers
import gzread
17 from db
import Database
as BaseDatabase
18 from table
import Table
, ChildTable
, KeyValueTable
21 #t = gettext.translation("jblite")
23 gettext
.install("jblite")
25 # Full expansion of xml:lang
26 XML_LANG
= "{http://www.w3.org/XML/1998/namespace}lang"
29 # Copied from kd2.py...
30 get_encoding
= sys
.getfilesystemencoding
32 # FORMAT OF TABLE MAP:
33 # dictionary entry: table: (children | None)
34 # table: table_name | (table_name, table_type, *args, **kwargs)
37 # Value = dict: take keys as child tables, lookup all rows, and take values as grandchildren.
38 # Value = list: take items as child tables, lookup all rows, assume no children.
42 # data = tables["entry"].lookup()
43 # children_map = TABLE_MAP["entry"]
44 # children = get_data(children_map["k_ele"])
45 # result = TableData(data, children)
48 # {"k_ele": {"data": [...],
52 # obj.data: {}, # single db row
53 # obj.children: {"key": table_object}
56 # breadth first creation? depth?
58 # Map of tables to their children maps. Empty {} means no children.
63 def __init__(self
, record
):
66 def __unicode__(self
):
67 """Basic string representation of the entry."""
71 k_eles
= rec
.find_children("k_ele")
73 lines
.append(_(u
"Kanji readings:"))
74 for k_ele_index
, k_ele
in enumerate(k_eles
):
76 lines
.append(_(u
" Reading %d:") % k_ele_index
)
77 lines
.append(_(u
" Blob: %s") % k_ele
.data
['value'])
79 r_eles
= rec
.find_children("r_ele")
81 lines
.append(_(u
"Kana readings:"))
82 for r_ele_index
, r_ele
in enumerate(r_eles
):
84 lines
.append(_(u
" Reading %d:") % r_ele_index
)
85 lines
.append(_(u
" Blob: %s") % r_ele
.data
['value'])
87 senses
= rec
.find_children("sense")
89 lines
.append(_(u
"Glosses:"))
90 for sense_index
, sense
in enumerate(senses
):
92 lines
.append(_(u
" Sense %d:") % sense_index
)
93 glosses
= sense
.find_children("gloss")
97 gloss_d
.setdefault(gloss
.data
["lang"], []).append(gloss
)
98 # Output glosses by language
99 for lang
in sorted(gloss_d
.keys()):
100 gloss_recs
= gloss_d
[lang
]
101 lines
.append(_(u
" Lang: %s") % lang
)
102 for gloss_index
, gloss
in enumerate(gloss_recs
):
104 val
= gloss
.data
['value']
105 lines
.append(_(u
" Gloss %d: %s") % (gloss_index
, val
))
106 return u
"\n".join(lines
)
109 return repr(self
._record
)
112 class Database(BaseDatabase
):
114 """Top level object for SQLite 3-based JMdict database."""
151 def __init__(self
, filename
, init_from_file
=None):
152 self
.conn
= sqlite3
.connect(filename
)
153 self
.conn
.row_factory
= sqlite3
.Row
# keyword accessors for rows
154 self
.cursor
= self
.conn
.cursor()
155 self
.tables
= self
._create
_table
_objects
()
156 if init_from_file
is not None:
157 raw_data
= gzread(init_from_file
)
159 entities
= self
._get
_entities
(raw_data
)
160 infile
= StringIO(raw_data
)
161 etree
= ElementTree(file=infile
)
164 self
._create
_new
_tables
()
165 self
._populate
_database
(etree
, entities
)
168 def search(self
, query
, lang
=None):
170 # Two main methods: to and from Japanese.
171 # 1. Guess which direction we're searching.
172 # 2. Search preferred method.
173 # 3. Search remaining method.
174 entries_from
= self
.search_from_japanese(query
)
175 entries_to
= self
.search_to_japanese(query
, lang
=lang
)
177 results
= entries_from
+ entries_to
180 def search_from_japanese(self
, query
):
181 # Japanese search locations:
183 # 2. Reading elements
184 # 3. Any indices (none yet)
186 # Preferred orderings
187 # 1. Location of query in result
191 # 2. Ranking of usage (the (P) option in EDICT, for example)
193 # FOR NOW: just get the searching working.
194 # This puts us on roughly the same level as J-Ben 1.2.x.
195 encoding
= get_encoding()
196 wrapped_query
= "%%%s%%" % query
# Wrap in wildcards
197 unicode_query
= wrapped_query
.decode(encoding
)
199 entries_by_keb
= self
._search
_keb
(unicode_query
)
200 entries_by_reb
= self
._search
_reb
(unicode_query
)
201 #entries_by_indices = self._search_indices_from_ja(unicode_query)
203 # Merge results into one list and return.
205 for lst
in (entries_by_keb
, entries_by_reb
):
211 def _search_keb(self
, unicode_query
):
212 """Searches kanji elements (Japanese readings with kanji).
214 Returns a list of entry IDs.
217 # keb: entry.id -> k_ele.fk, k_ele.value
218 query
= "SELECT fk FROM k_ele WHERE value LIKE ?"
219 args
= (unicode_query
,)
220 self
.cursor
.execute(query
, args
)
221 rows
= self
.cursor
.fetchall()
222 return [row
[0] for row
in rows
]
224 def _search_reb(self
, unicode_query
):
225 """Searches reading elements (Japanese readings without kanji).
227 Returns a list of entry IDs.
230 # reb: entry.id -> r_ele.fk, r_ele.value
231 query
= "SELECT fk FROM r_ele WHERE value LIKE ?"
232 args
= (unicode_query
,)
233 self
.cursor
.execute(query
, args
)
234 rows
= self
.cursor
.fetchall()
235 return [row
[0] for row
in rows
]
237 def _search_indices_from_ja(self
, unicode_query
):
238 raise NotImplementedError
240 def search_to_japanese(self
, query
, lang
):
241 # Foreign language search locations:
243 # 2. Any indices (none yet)
245 # For other considerations, see search_from_japanese().
246 encoding
= get_encoding()
247 wrapped_query
= "%%%s%%" % query
# Wrap in wildcards
248 unicode_query
= wrapped_query
.decode(encoding
)
250 entries_by_glosses
= self
._search
_glosses
(unicode_query
, lang
)
251 #entries_by_indices = self._search_indices_to_ja(unicode_query, lang)
253 # Merge results into one list and return.
255 for lst
in (entries_by_glosses
,):
262 def _search_glosses(self
, unicode_query
, lang
):
263 """Searches foreign language glosses.
265 If lang is not None, only entries which match the lang
266 parameter are returned.
268 Returns a list of entry IDs.
271 # entry.id -> sense.fk, sense.id -> gloss.fk
274 "SELECT e.id FROM gloss g, sense s, entry e "
275 "WHERE g.lang = ? AND g.value LIKE ? "
276 "AND g.fk = s.id AND s.fk = e.id"
278 args
= (lang
, unicode_query
)
281 "SELECT e.id FROM gloss g, sense s, entry e "
282 "WHERE g.value LIKE ?"
284 args
= (unicode_query
,)
286 self
.cursor
.execute(query
, args
)
287 rows
= self
.cursor
.fetchall()
288 return [row
[0] for row
in rows
]
290 def _search_indices_to_ja(self
, unicode_query
, lang
):
291 raise NotImplementedError
293 def lookup(self
, id):
294 return BaseDatabase
.lookup(self
, "entry", id)
296 def query_db(self
, *args
, **kwargs
):
297 """Helper. Wraps the execute/fetchall idiom on the DB cursor."""
298 self
.cursor
.execute(*args
, **kwargs
)
299 return self
.cursor
.fetchall()
301 def _convert_entities(self
, entities
):
302 """Expands a list of entities.
304 Returns a list of the entity expansions. The order of the
305 returned expansions matches the order of the input entities.
308 args
= list(sorted(set(entities
)))
309 template
= ", ".join(["?"] * len(args
))
310 query
= "SELECT entity, expansion " \
311 "FROM entity WHERE entity IN (%s)" % template
312 rows
= self
.query_db(query
, args
)
314 for entity
, expansion
in rows
:
315 d
[entity
] = expansion
316 result
= [d
[entity
] for entity
in entities
]
319 def _create_table_objects(self
):
320 """Creates table objects.
322 Returns a dictionary of table name to table object.
326 "entry": EntryTable
, # key->int ID
327 "r_ele": REleTable
, # key-value plus nokanji flag
328 "sense": SenseTable
, # one-many group mapping for sense info
329 "audit": AuditTable
, # key->(update_date, update_details)
330 "lsource": LSourceTable
, # key -> lang, type=full/part, wasei=t/f
331 "gloss": GlossTable
, # key -> lang, g_gend, value, pri flag
332 "links": LinksTable
, # key -> tag, desc, uri
333 "bibl": BiblTable
, # key -> tag, txt
334 "entity": EntityTable
, # Info from JMdict XML entities
337 # Set up key/value and key/entity tables
338 kv_tables
= [ # key-value tables (id -> text blob)
346 "xref", # (#PCDATA)* - why the *?
347 "ant", # (#PCDATA)* - why the *?
352 kv_entity_tables
= [ # key-value tables where val == entity
360 for tbl
in kv_tables
:
361 class_mappings
[tbl
] = KeyValueTable
362 for tbl
in kv_entity_tables
:
363 class_mappings
[tbl
] = KeyEntityTable
365 # Create all table objects
367 for tbl
, cls
in class_mappings
.iteritems():
368 table_mappings
[tbl
] = cls(self
.cursor
, tbl
)
370 return table_mappings
372 def _create_new_tables(self
):
373 """(Re)creates the database tables."""
374 for tbl
, tbl_obj
in self
.tables
.iteritems():
375 self
.cursor
.execute("DROP TABLE IF EXISTS %s" % tbl
)
378 def _populate_database(self
, etree
, entities
):
379 """Imports XML data into SQLite database.
381 table_d: table to table_object dictionary
382 etree: ElementTree object for JMdict
383 entities: entity name to description dictionary
386 # NOTE: this is waaay too long. Should be broken up somehow.
387 # For now this will work though...
389 # Populate entities table and get integer keys
390 # NOTE: we'll be mapping from *expanded* entities to ints.
392 tbl
= self
.tables
['entity']
393 for entity
, expansion
in entities
.iteritems():
394 i
= tbl
.insert(entity
, expansion
)
395 entity_int_d
[expansion
] = i
397 # Iterate through each entry
398 for entry
in etree
.findall("entry"):
401 ent_seq
= entry
.find("ent_seq")
402 entry_id
= self
.tables
["entry"].insert(int(ent_seq
.text
))
404 for k_ele
in entry
.findall("k_ele"):
406 value
= k_ele
.find("keb").text
407 k_ele_id
= self
.tables
["k_ele"].insert(entry_id
, value
)
410 for ke_inf
in k_ele
.findall("ke_inf"):
411 value
= ke_inf
.text
.strip()
412 entity_id
= entity_int_d
[value
]
413 self
.tables
["ke_inf"].insert(k_ele_id
, entity_id
)
416 for ke_pri
in k_ele
.findall("ke_pri"):
418 self
.tables
["ke_pri"].insert(k_ele_id
, value
)
420 for r_ele
in entry
.findall("r_ele"):
422 value
= r_ele
.find("reb").text
423 # For nokanji: currently it's an empty tag, so
424 # treating it as true/false.
425 nokanji
= 1 if r_ele
.find("nokanji") is not None else 0
426 r_ele_id
= self
.tables
["r_ele"].insert(entry_id
, value
, nokanji
)
429 for re_restr
in r_ele
.findall("re_restr"):
430 value
= re_restr
.text
431 self
.tables
["re_restr"].insert(r_ele_id
, value
)
434 for re_inf
in r_ele
.findall("re_inf"):
435 value
= re_inf
.text
.strip()
436 entity_id
= entity_int_d
[value
]
437 self
.tables
["re_inf"].insert(r_ele_id
, entity_id
)
440 for re_pri
in r_ele
.findall("re_pri"):
442 self
.tables
["re_pri"].insert(r_ele_id
, value
)
445 # (Although children of an info node, since there's only
446 # one per entry, let's connect directly to the entry.)
447 info
= entry
.find("info")
449 for links
in info
.findall("links"):
450 link_tag
= links
.find("link_tag").text
451 link_desc
= links
.find("link_desc").text
452 link_uri
= links
.find("link_uri").text
453 self
.tables
["links"].insert(entry_id
, link_tag
, link_desc
,
455 for bibl
in info
.findall("bibl"):
456 bib_tag
= links
.find("bib_tag")
457 bib_txt
= links
.find("bib_txt")
458 bib_tag
= bib_tag
.text
if bib_tag
is not None else None
459 bib_txt
= bib_txt
.text
if bib_txt
is not None else None
460 self
.tables
["bibl"].insert(entry_id
, bib_tag
, bib_txt
)
461 for etym
in info
.findall("etym"):
462 self
.tables
["etym"].insert(entry_id
, etym
.text
)
463 for audit
in info
.findall("audit"):
464 upd_date
= audit
.find("upd_date").text
465 upd_detl
= audit
.find("upd_detl").text
466 self
.tables
["audit"].insert(entry_id
, upd_date
, upd_detl
)
469 key_entity_tables
= ["pos", "field", "misc", "dial"]
470 key_value_tables
= ["stagk", "stagr", "xref", "ant", "s_inf", "example"]
472 for sense
in entry
.findall("sense"):
473 # Each sense gets its own ID, for grouping purposes
474 sense_id
= self
.tables
["sense"].insert(entry_id
)
476 for elem_name
in key_value_tables
:
477 for element
in sense
.findall(elem_name
):
478 self
.tables
[elem_name
].insert(sense_id
, element
.text
)
480 for elem_name
in key_entity_tables
:
481 for element
in sense
.findall(elem_name
):
482 entity_id
= entity_int_d
[element
.text
.strip()]
483 self
.tables
[elem_name
].insert(sense_id
, entity_id
)
485 for lsource
in sense
.findall("lsource"):
486 lang
= lsource
.get(XML_LANG
, "eng")
487 ls_type
= lsource
.get("ls_type") # implied "full" if absent, "part" otherwise
488 ls_wasei
= lsource
.get("ls_wasei") # usually "y"... just a flag.
490 partial
= 1 if ls_type
is not None else 0
493 elif ls_wasei
== "y":
497 'Only known valid ls_wasei attribute value '
498 'is "y", found:', ls_wasei
.text
)
500 self
.tables
["lsource"].insert(sense_id
,
501 lang
, partial
, wasei
)
502 for gloss
in sense
.findall("gloss"):
503 lang
= gloss
.get(XML_LANG
, "eng")
504 g_gend
= gloss
.get("g_gend")
505 pri_list
= gloss
.getchildren()
506 if len(pri_list
) > 1:
507 gloss_id
= self
.tables
['gloss'].insert(
508 sense_id
, lang
, g_gend
, gloss
.text
, 1)
510 self
.tables
['pri'].insert(gloss_id
, pri
.text
)
512 self
.tables
['gloss'].insert(sense_id
, lang
, g_gend
,
515 def _get_entities(self
, xml_data
):
516 """Gets the ENTITY definitions from JMdict.
518 Finds the built-in DTD and extracts all ENTITY definitions.
521 dtd
= self
._get
_dtd
(xml_data
)
522 # do some logic to find all entities...
524 regex
= '<!ENTITY[ ]+([a-zA-Z0-9-]+)[ ]+"(.*?)">'
525 for match
in re
.finditer(regex
, xml_data
):
526 key
, value
= match
.groups()[0:2]
527 entities
[key
] = value
530 def _get_dtd(self
, xml_data
):
531 """Gets the DTD from JMdict."""
532 # This works for JMdict (as it is at the time of writing), but is
533 # not a general solution.
534 start_index
= xml_data
.find("<!DOCTYPE")
535 if start_index
== -1:
536 raise Exception("Could not find start of internal DTD")
537 end_index
= xml_data
.find("]>")
539 raise Exception("Could not find end ofinternal DTD")
541 dtd
= xml_data
[start_index
:end_index
]
545 class EntryTable(Table
):
546 create_query
= ("CREATE TABLE %s "
547 "(id INTEGER PRIMARY KEY, ent_seq INTEGER)")
548 insert_query
= "INSERT INTO %s VALUES (NULL, ?)"
550 "CREATE INDEX %s_seq ON %s (ent_seq)",
554 class KeyEntityTable(KeyValueTable
):
555 """Just like a KeyValueTable, but with 'entity' instead of 'value'."""
556 create_query
= ("CREATE TABLE %s "
557 "(id INTEGER PRIMARY KEY, fk INTEGER, entity INTEGER)")
560 class REleTable(ChildTable
):
561 create_query
= ("CREATE TABLE %s "
562 "(id INTEGER PRIMARY KEY, fk INTEGER,"
563 " value TEXT, nokanji INTEGER)")
564 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
566 "CREATE INDEX %s_fk ON %s (fk)",
570 class SenseTable(ChildTable
):
571 """Corresponds to <sense> tag. Functions as group for glosses, etc."""
572 create_query
= ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
573 insert_query
= "INSERT INTO %s VALUES (NULL, ?)"
575 "CREATE INDEX %s_fk ON %s (fk)",
579 class AuditTable(ChildTable
):
580 create_query
= ("CREATE TABLE %s "
581 "(id INTEGER PRIMARY KEY, fk INTEGER,"
582 " update_date TEXT, update_details TEXT)")
583 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
585 "CREATE INDEX %s_fk ON %s (fk)",
589 class LSourceTable(ChildTable
):
590 """Represents the <lsource> element from JMdict.
593 ls_type=full/part => partial=1/0
594 ls_wasei=y/null => wasei=1/0
597 create_query
= ("CREATE TABLE %s "
598 "(id INTEGER PRIMARY KEY, fk INTEGER,"
599 " lang TEXT, partial INTEGER, wasei INTEGER)")
600 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
602 "CREATE INDEX %s_fk ON %s (fk)",
606 class GlossTable(ChildTable
):
607 create_query
= ("CREATE TABLE %s "
608 "(id INTEGER PRIMARY KEY, fk INTEGER,"
609 " lang TEXT, g_gend TEXT, value TEXT, pri INTEGER)")
610 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
612 "CREATE INDEX %s_fk ON %s (fk)",
613 "CREATE INDEX %s_lang ON %s (lang)",
614 "CREATE INDEX %s_value ON %s (value)",
618 class LinksTable(ChildTable
):
619 create_query
= ("CREATE TABLE %s "
620 "(id INTEGER PRIMARY KEY, fk INTEGER,"
621 " tag TEXT, desc TEXT, uri TEXT)")
622 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
624 "CREATE INDEX %s_fk ON %s (fk)",
628 class BiblTable(ChildTable
):
629 create_query
= ("CREATE TABLE %s "
630 "(id INTEGER PRIMARY KEY, fk INTEGER,"
631 " tag TEXT, txt TEXT)")
632 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
634 "CREATE INDEX %s_fk ON %s (fk)",
638 class EntityTable(Table
):
639 create_query
= ("CREATE TABLE %s "
640 "(id INTEGER PRIMARY KEY, entity TEXT, expansion TEXT)")
641 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?)"
644 ######################################################################
647 from optparse
import OptionParser
648 op
= OptionParser(usage
="%prog [options] <db_filename> [search_query]")
649 op
.add_option("-i", "--initialize",
650 dest
="init_fname", metavar
="XML_SOURCE",
651 help=_("Initialize database from file."))
652 op
.add_option("-L", "--lang",
653 help=_("Specify preferred language for searching."))
654 options
, args
= op
.parse_args()
658 return (options
, args
)
661 # Copied *almost* verbatim from kd2.py.
662 options
, args
= parse_args()
665 if options
.init_fname
is not None:
666 db
= Database(db_fname
, init_from_file
=options
.init_fname
)
668 db
= Database(db_fname
)
673 # To be nice, we'll join all remaining args with spaces.
674 search_query
= " ".join(args
[1:])
675 if options
.lang
is not None:
676 results
= db
.search(search_query
, lang
=options
.lang
)
678 results
= db
.search(search_query
)
681 from pprint
import pprint
682 encoding
= get_encoding()
683 for index
, result
in enumerate(results
):
685 print(_("[Entry %d]") % index
)
686 entry
= db
.lookup(result
)
688 print(unicode(entry
).encode(encoding
))
691 print(_("No results found."))
693 if __name__
== "__main__":