Finished up changes to the KANJIDIC2 code.
[jblite.git] / jblite / kd2.py
blob85cc6d9472ff86d199e4938181b0fe10343f792d
1 # -*- coding: utf-8 -*-
2 from __future__ import print_function
3 from __future__ import with_statement
5 import os, sys, re, sqlite3, time
6 from cStringIO import StringIO
7 from xml.etree.cElementTree import ElementTree
8 from helpers import gzread
9 from db import Database as BaseDatabase
10 from table import Table, ChildTable, KeyValueTable
12 import gettext
13 #t = gettext.translation("jblite")
14 #_ = t.ugettext
# Install _() into builtins so every module can mark translatable text.
gettext.install("jblite")


def get_encoding():
    """Return the name of the encoding used for command-line text.

    This method of getting the encoding might not be the best, but it
    works for now and avoids hacks with setdefaultencoding.

    On Python 2, sys.getfilesystemencoding() may return None (meaning
    "use the system default encoding"); passing None on to
    str.decode()/unicode.encode() raises TypeError, so fall back to
    sys.getdefaultencoding() in that case.
    """
    return sys.getfilesystemencoding() or sys.getdefaultencoding()
def convert_kunyomi(coded_str):
    """Render a coded kun-yomi string in a reader-friendly form.

    Two transformations are applied, in this order:

    1. Whatever follows the first '.' (if there is one) is wrapped in
       parentheses, and the '.' itself is dropped.

    2. Every '-' is shown as '~'.

    """
    head, dot, tail = coded_str.partition(u'.')
    if dot:
        readable = head + u"(" + tail + u")"
    else:
        readable = head
    return readable.replace(u"-", u"~")
def get_jouyou_str(grade_int):
    """Return the display text for an integer Jouyou grade code.

    For now the text is simply the number itself; no descriptive
    names are substituted for any grade code.
    """
    # DO LATER: map grade codes to descriptive text.
    grade_text = str(grade_int)
    return grade_text
class Entry(object):

    """Read-only view of one ``character`` record, printable as text."""

    def __init__(self, record):
        """Keep a reference to the database record being presented."""
        self._record = record

    @staticmethod
    def _codepoint(literal):
        """Return the Unicode code point of a one-character literal.

        On narrow Python 2 builds a character outside the Basic
        Multilingual Plane is stored as a UTF-16 surrogate pair, i.e. a
        string of length 2, and ord() raises TypeError on it.  Such a
        pair is combined by hand; anything else is passed to ord().
        """
        if len(literal) == 2:
            high, low = ord(literal[0]), ord(literal[1])
            if 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF:
                return 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
        return ord(literal)

    def __unicode__(self):
        """Basic string representation of the entry.

        Returns a multi-line unicode string: the literal and its code
        point, each reading/meaning group (kun-yomi, on-yomi, meanings
        per language), then nanori, stroke count, frequency, Jouyou
        grade and JLPT grade where the record has them.
        """
        char_rec = self._record
        lines = []

        literal = char_rec.data["literal"]
        lines.append(_(u"Literal: %s (0x%X)")
                     % (literal, self._codepoint(literal)))

        rmgroup_recs = char_rec.find_children("rmgroup")
        for group_index, rmgroup_rec in enumerate(rmgroup_recs, 1):
            lines.append(_(u"Group %d:") % group_index)

            reading_recs = rmgroup_rec.find_children("reading")

            # kun-yomi needs ./- translated
            kunyomi = [convert_kunyomi(r.data['value'])
                       for r in reading_recs
                       if r.data["type"] == "ja_kun"]
            lines.append(_(u"  Kun-yomi: %s") % u"、".join(kunyomi))

            onyomi = [r.data['value'] for r in reading_recs
                      if r.data["type"] == "ja_on"]
            lines.append(_(u"  On-yomi: %s") % u"、".join(onyomi))

            # Bucket the meanings by language, then list the languages
            # in sorted order so the output is stable.
            meaning_d = {}
            for r in rmgroup_rec.find_children("meaning"):
                meaning_d.setdefault(r.data['lang'], []).append(
                    r.data['value'])
            for lang in sorted(meaning_d):
                meaning_str = u"; ".join(meaning_d[lang])
                lines.append(_(u"  Meanings (%s): %s")
                             % (lang, meaning_str))

        nanori_recs = char_rec.find_children("nanori")
        if len(nanori_recs) > 0:
            nanori_str = u"、".join([r.data["value"] for r in nanori_recs])
            lines.append(_(u"Nanori: %s") % nanori_str)

        # The first stroke count is shown as the real one; any further
        # counts are listed as miscounts.
        stroke_recs = char_rec.find_children("stroke_count")
        strokes = [r.data['count'] for r in stroke_recs]
        if len(strokes) == 1:
            lines.append(_(u"Stroke count: %d") % strokes[0])
        elif len(strokes) > 1:
            miscounts = ", ".join(map(str, strokes[1:]))
            lines.append(_(u"Stroke count: %d (miscounts: %s)") %
                         (strokes[0], miscounts))
        # else: no stroke count info; don't print anything

        freq = char_rec.data["freq"]
        if freq is not None:
            lines.append(_(u"Frequency: %d") % freq)

        grade = char_rec.data["grade"]
        if grade is not None:
            # Jouyou grade codes have special meanings; a conversion is
            # needed.
            grade_str = get_jouyou_str(grade)
            lines.append(_(u"Jouyou grade: %s") % grade_str)

        jlpt = char_rec.data["jlpt"]
        if jlpt is not None:
            lines.append(_(u"JLPT grade: %d") % jlpt)

        return u"\n".join(lines)

    def __repr__(self):
        """Delegate to the repr of the wrapped record."""
        return repr(self._record)
class Database(BaseDatabase):

    """Top level object for SQLite 3-based KANJIDIC2 database.

    Opens (and optionally builds from a KANJIDIC2 XML file) an SQLite
    database, and offers searches by reading, meaning, nanori and exact
    literal.
    """

    entry_class = Entry
    # Parent -> child nesting of the data tables.
    table_map = {
        u"character": {
            u"codepoint": {},
            u"radical": {},
            u"stroke_count": {},
            u"variant": {},
            u"rad_name": {},
            u"dic_number": {},
            u"query_code": {},
            u"rmgroup": {
                u"reading": {},
                u"meaning": {},
                },
            u"nanori": {},
            }
        }

    def __init__(self, filename, init_from_file=None):
        """Open the SQLite database stored in *filename*.

        If *init_from_file* is given, it is read with gzread(), parsed
        as KANJIDIC2 XML, and the tables are (re)created and filled
        from it before the supplemental index table is built.
        """
        self.conn = sqlite3.connect(filename)
        self.conn.row_factory = sqlite3.Row  # keyword accessors for rows
        self.cursor = self.conn.cursor()
        self.tables = self._create_table_objects()
        if init_from_file is not None:
            raw_data = gzread(init_from_file)

            infile = StringIO(raw_data)
            try:
                etree = ElementTree(file=infile)
            finally:
                infile.close()

            # Create the core database
            self._create_new_tables()
            self._populate_database(etree)
            self.conn.commit()

            # Create supplemental indices
            self._create_index_tables()
            self.conn.commit()

    def search(self, query, lang=None, options=None):
        """Return entries whose reading, meaning or nanori contains *query*.

        query: search text, either an encoded byte string (decoded with
            get_encoding()) or already-decoded unicode text.
        lang: if given, restricts the meaning search to that language.
        options: optional object with a ``verbose`` attribute; when it
            is true, the intermediate result lists are printed.

        Returns a list of lookup() results ordered by character ID.
        """
        # Only decode byte strings.  Calling .decode() on unicode text
        # makes Python 2 implicitly encode it as ASCII first, which
        # fails for any Japanese query.
        if isinstance(query, bytes):
            query = query.decode(get_encoding())
        unicode_query = u"%%%s%%" % query  # Wrap in wildcards

        verbose = (options is not None) and bool(options.verbose)

        # NOTE(review): this message is only printed on Windows; the
        # condition may have been meant to be the opposite -- confirm.
        if verbose and os.name == "nt":
            print(u"Searching for \"%s\", lang=%s..." %
                  (unicode_query, repr(lang)),
                  file=sys.stderr)

        entries_r = self._search_by_reading(unicode_query)
        entries_m = self._search_by_meaning(unicode_query,
                                            lang=lang)
        entries_n = self._search_by_nanori(unicode_query)
        entries_i = self._search_by_indices(unicode_query, lang=lang)

        # DEBUG CODE
        if verbose:
            self._print_debug_rows(
                "READINGS:", "No 'reading' results found.", entries_r)
            self._print_debug_rows(
                "NANORI:", "No 'nanori' results found.", entries_n)
            self._print_debug_rows(
                "MEANINGS:", "No 'meaning' results found.", entries_m)

            print("INDICES:")
            if len(entries_i) == 0:
                print("No indexed results found.")
            for ent_id in entries_i:
                print(u"ID: %d" % (ent_id,))

        # Get sorted list of unique character IDs
        unique_ids = set(entries_i)
        for rows in (entries_r, entries_m, entries_n):
            unique_ids.update(row[0] for row in rows)
        char_ids = sorted(unique_ids)

        results = [self.lookup(char_id) for char_id in char_ids]
        return results

    @staticmethod
    def _print_debug_rows(heading, empty_message, rows):
        """Print (id, literal) rows under *heading* for verbose mode."""
        print(heading)
        if len(rows) == 0:
            print(empty_message)
        for ent_id, literal in rows:
            try:
                print(u"ID: %d, literal: %s" % (ent_id, literal))
            except UnicodeEncodeError:
                # The output stream cannot show the character itself.
                print(u"ID: %d, literal (repr): %s"
                      % (ent_id, repr(literal)))

    def _search_by_reading(self, query):
        """Return (id, literal) rows of characters with a matching reading."""
        # reading -> rmgroup -> character
        self.cursor.execute(
            "SELECT id, literal FROM character WHERE id IN "
            "(SELECT fk FROM rmgroup WHERE id IN "
            "(SELECT fk FROM reading WHERE value LIKE ?))", (query,))
        rows = self.cursor.fetchall()
        return rows

    def _search_by_nanori(self, query):
        """Return (id, literal) rows of characters with a matching nanori."""
        # nanori -> character
        self.cursor.execute(
            "SELECT id, literal FROM character WHERE id IN "
            "(SELECT fk FROM nanori WHERE value LIKE ?)", (query,))
        rows = self.cursor.fetchall()
        return rows

    def _search_by_meaning(self, query, lang=None):
        """Return (id, literal) rows of characters with a matching meaning.

        When *lang* is not None only meanings in that language match.
        """
        # meaning -> rmgroup -> character
        if lang is None:
            self.cursor.execute(
                "SELECT id, literal FROM character WHERE id IN "
                "(SELECT fk FROM rmgroup WHERE id IN "
                "(SELECT fk FROM meaning WHERE value LIKE ?))", (query,))
        else:
            self.cursor.execute(
                "SELECT id, literal FROM character WHERE id IN "
                "(SELECT fk FROM rmgroup WHERE id IN "
                "(SELECT fk FROM meaning WHERE lang = ? AND value LIKE ?))",
                (lang, query))
        rows = self.cursor.fetchall()
        return rows

    def _search_by_indices(self, query, lang=None):
        """Return character IDs found through the kun-yomi lookup table."""
        # Get IDs from index table
        # Note: lang is currently unused.
        self.cursor.execute(
            "SELECT character_id FROM kunyomi_lookup WHERE reading LIKE ?",
            (query,))
        rows = self.cursor.fetchall()
        return [row[0] for row in rows]

    def search_by_literal(self, literal):
        """Return the entry for the exact character *literal*, or None."""
        # Not much of a "search", but avoids overlap with
        # BaseDictionary.lookup.
        self.cursor.execute("SELECT id FROM character WHERE literal = ?",
                            (literal,))
        rows = self.cursor.fetchall()
        if len(rows) < 1:
            return None
        char_id = rows[0][0]
        return self.lookup(char_id)

    def lookup(self, id):
        """Look up the ``character`` record with the given ID."""
        return BaseDatabase.lookup(self, "character", id)

    def _create_table_objects(self):
        """Creates table objects.

        Returns a dictionary of table name to table object.

        """
        class_mappings = {
            "header": HeaderTable,
            "character": CharacterTable,
            "codepoint": TypeValueTable,
            "radical": TypeValueTable,
            "stroke_count": StrokeCountTable,
            "variant": TypeValueTable,
            "rad_name": KeyValueTable,
            "dic_number": DicNumberTable,
            "query_code": QueryCodeTable,
            "rmgroup": RMGroupTable,
            "reading": ReadingTable,
            "meaning": MeaningTable,
            "nanori": KeyValueTable,
            }

        # Create all table objects
        table_mappings = {}
        for tbl, cls in class_mappings.items():
            table_mappings[tbl] = cls(self.cursor, tbl)

        return table_mappings

    def _create_new_tables(self):
        """(Re)creates the database tables."""
        for tbl, tbl_obj in self.tables.items():
            self._drop_table(tbl)
            tbl_obj.create()

    @staticmethod
    def _child_int(parent, tag):
        """Return the integer text of child *tag*, or None if absent."""
        node = parent.find(tag)
        return int(node.text) if node is not None else None

    def _populate_database(self, etree):
        """Imports XML data into SQLite database.

        etree: ElementTree object for KANJIDIC2

        """
        tables = self.tables

        # Grab header
        header = etree.find("header")
        file_ver = header.find("file_version").text
        db_ver = header.find("database_version").text
        date = header.find("date_of_creation").text
        tables['header'].insert(file_ver, db_ver, date)

        # Iterate through characters
        for character in etree.findall("character"):
            # Character table
            literal = character.find("literal").text

            # Grab misc node - we'll store a few things from it in the
            # main character table, too.
            misc = character.find("misc")
            grade = self._child_int(misc, "grade")
            freq = self._child_int(misc, "freq")
            jlpt = self._child_int(misc, "jlpt")

            char_id = tables['character'].insert(literal, grade,
                                                 freq, jlpt)

            codepoint = character.find("codepoint")
            for cp_value in codepoint.findall("cp_value"):
                tables['codepoint'].insert(
                    char_id, cp_value.get("cp_type"), cp_value.text)

            radical = character.find("radical")
            for rad_value in radical.findall("rad_value"):
                tables['radical'].insert(
                    char_id, rad_value.get("rad_type"), rad_value.text)

            # Tables generated from <misc> begin here
            for stroke_count in misc.findall("stroke_count"):
                tables['stroke_count'].insert(
                    char_id, int(stroke_count.text))

            for variant in misc.findall("variant"):
                tables['variant'].insert(
                    char_id, variant.get("var_type"), variant.text)

            for rad_name in misc.findall("rad_name"):
                tables['rad_name'].insert(char_id, rad_name.text)

            # Remaining direct descendents of <character>...
            dic_number = character.find("dic_number")
            if dic_number is not None:
                for dic_ref in dic_number.findall("dic_ref"):
                    tables['dic_number'].insert(
                        char_id,
                        dic_ref.get("dr_type"),
                        dic_ref.get("m_vol", None),
                        dic_ref.get("m_page", None),
                        dic_ref.text)

            query_code = character.find("query_code")
            if query_code is not None:
                for q_code in query_code.findall("q_code"):
                    tables['query_code'].insert(
                        char_id,
                        q_code.get("qc_type"),
                        q_code.get("skip_misclass", None),
                        q_code.text)

            reading_meaning = character.find("reading_meaning")
            if reading_meaning is not None:
                # Each table is addressed by name here.  Rebinding one
                # shared variable inside this loop would make a second
                # <rmgroup> insert into the wrong table.
                for rmgroup in reading_meaning.findall("rmgroup"):
                    group_id = tables['rmgroup'].insert(char_id)
                    for reading in rmgroup.findall("reading"):
                        tables['reading'].insert(
                            group_id,
                            reading.get("r_type"),
                            reading.get("on_type"),
                            reading.get("r_status"),
                            reading.text)
                    for meaning in rmgroup.findall("meaning"):
                        # Meanings without m_lang are stored as "en".
                        tables['meaning'].insert(
                            group_id,
                            meaning.get("m_lang", "en"),
                            meaning.text)
                for nanori in reading_meaning.findall("nanori"):
                    tables['nanori'].insert(char_id, nanori.text)

    def _drop_table(self, name):
        """Drop table *name* if it exists.

        *name* is interpolated directly into the SQL, so it must be one
        of this module's own table names, never external input.
        """
        self.cursor.execute("DROP TABLE IF EXISTS %s" % name)

    def _create_index_tables(self):
        """Creates extra tables to help with common searches.

        Supplementary tables include:

        1. Reading search table: kun-yomi to character ID.  Kun-yomi
           is modified for easier searching (no "." or "-" markers).

        """
        self._create_reading_search_table()

    def _create_reading_search_table(self):
        """Creates "sanitized" reading to character ID search table."""

        # Mapping is from reading to character ID...
        # r.fk -> rg.id, rg.fk -> c.id.
        # The literal is single-quoted: a double-quoted "ja_kun" is an
        # identifier in SQL and only works through a legacy SQLite quirk.
        query = (
            "SELECT r.value, c.id "
            "FROM reading r, rmgroup rg, character c "
            "WHERE r.type = 'ja_kun' AND r.fk = rg.id AND rg.fk = c.id"
            )
        self.cursor.execute(query)

        # Sanitize strings by removing "." and "-".  Building the pairs
        # directly also copes with an empty result set.
        rows = [(value.replace(u".", u"").replace(u"-", u""), char_id)
                for value, char_id in self.cursor.fetchall()]

        # Create new table
        tbl_name = "kunyomi_lookup"
        self.tables[tbl_name] = tbl = ReadingLookupTable(self.cursor, tbl_name)
        self._drop_table(tbl_name)
        tbl.create()

        # Store all sanitized strings and their keys in the table
        if rows:
            tbl.insertmany(rows)
481 ######################################################################
482 # KANJIDIC2 data tables
483 ######################################################################
class HeaderTable(Table):
    """Stores the three KANJIDIC2 header fields, all as TEXT."""

    create_query = (
        "CREATE TABLE %s (file_version TEXT, "
        "database_version TEXT, date_of_creation TEXT)"
    )
    insert_query = "INSERT INTO %s VALUES (?, ?, ?)"
class CharacterTable(Table):
    """One row per kanji: literal plus grade, freq and jlpt numbers."""

    create_query = (
        "CREATE TABLE %s (id INTEGER PRIMARY KEY, "
        "literal TEXT, grade INTEGER, "
        "freq INTEGER, jlpt INTEGER)"
    )
    # The id column is passed as NULL so SQLite assigns it.
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_literal ON %s (literal)",
    ]
class TypeValueTable(ChildTable):
    """Generic child table holding a (type, value) pair per row."""

    create_query = (
        "CREATE TABLE %s (id INTEGER PRIMARY KEY, "
        "fk INTEGER, type TEXT, value TEXT)"
    )
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
    ]
class StrokeCountTable(ChildTable):
    """Child table holding one integer stroke count per row."""

    create_query = (
        "CREATE TABLE %s "
        "(id INTEGER PRIMARY KEY, fk INTEGER, count INTEGER)"
    )
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
    ]
class DicNumberTable(ChildTable):
    """Child table of dictionary references (type, m_vol, m_page, value)."""

    create_query = (
        "CREATE TABLE %s (id INTEGER PRIMARY KEY, "
        "fk INTEGER, type TEXT, "
        "m_vol TEXT, m_page TEXT, value TEXT)"
    )
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
    ]
class QueryCodeTable(ChildTable):
    """Child table of query codes (type, skip_misclass, value)."""

    create_query = (
        "CREATE TABLE %s (id INTEGER PRIMARY KEY, "
        "fk INTEGER, type TEXT, "
        "skip_misclass TEXT, value TEXT)"
    )
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
    ]
class RMGroupTable(ChildTable):
    """Child table with one row per reading/meaning group."""

    create_query = "CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)"
    insert_query = "INSERT INTO %s VALUES (NULL, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
    ]
class ReadingTable(ChildTable):
    """Child table of readings (type, on_type, r_status, value)."""

    create_query = (
        "CREATE TABLE %s (id INTEGER PRIMARY KEY, "
        "fk INTEGER, type TEXT, "
        "on_type TEXT, r_status TEXT, value TEXT)"
    )
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
    # Indexed on fk and on the reading text itself.
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        "CREATE INDEX %s_value ON %s (value)",
    ]
class MeaningTable(ChildTable):
    """Child table of meanings, each tagged with its language."""

    create_query = (
        "CREATE TABLE %s (id INTEGER PRIMARY KEY, "
        "fk INTEGER, lang TEXT, value TEXT)"
    )
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
    # Indexed on fk and on the (lang, value) pair.
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        "CREATE INDEX %s_lang_value ON %s (lang, value)",
    ]
572 ######################################################################
573 # Index tables (not part of actual KANJIDIC2)
574 ######################################################################
class ReadingLookupTable(Table):
    """Maps reading to character IDs."""

    # Used for: kunyomi (KANJIDIC2 r_type==ja_kun)
    create_query = (
        "CREATE TABLE %s (id INTEGER PRIMARY KEY, "
        "reading TEXT, character_id INTEGER)"
    )
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_reading ON %s (reading)",
    ]
590 ######################################################################
def parse_args():
    """Parse the command line and return an ``(options, args)`` tuple.

    args[0] is the database filename; any further items form the
    search or lookup query.

    Exits with status -1 (SystemExit) when no database filename is
    given, or when --lookup and --search are both requested.
    """
    from optparse import OptionParser
    op = OptionParser(usage="%prog [options] <db_filename> [search_query]")
    op.add_option("-i", "--initialize",
                  dest="init_fname", metavar="XML_SOURCE",
                  help=_("Initialize database from file."))
    op.add_option("-s", "--search", action="store_true",
                  help=_("Search for kanji by readings or meanings"))
    op.add_option("-l", "--lookup", action="store_true",
                  help=_("Look up exact character"))
    op.add_option("-L", "--lang",
                  help=_("Specify preferred language for searching."))
    op.add_option("-v", "--verbose", action="store_true",
                  help=_("Verbose mode (print debug strings)"))
    options, args = op.parse_args()
    # sys.exit() is used rather than the bare exit() builtin: the
    # latter is injected by the site module for interactive use and is
    # not guaranteed to exist (e.g. under "python -S").
    if len(args) < 1:
        op.print_help()
        sys.exit(-1)
    if options.lookup and options.search:
        print(_("Cannot --lookup and --search at the same time."),
              file=sys.stderr)
        sys.exit(-1)
    return (options, args)
def main():
    """Command-line entry point.

    Opens the database named by the first argument (building it first
    when --initialize was given), then runs the requested search or
    lookup on the remaining arguments and prints each resulting entry.
    """
    options, args = parse_args()

    init_fname = options.init_fname
    if init_fname is None:
        db = Database(args[0])
    else:
        db = Database(args[0], init_from_file=init_fname)

    query_args = args[1:]
    if not query_args:
        # No search was requested; we can exit here.
        return

    if options.search:
        # To be nice, all remaining args are joined with spaces.
        search_query = " ".join(query_args)
        if options.lang is None:
            results = db.search(search_query, options=options)
        else:
            results = db.search(search_query,
                                lang=options.lang, options=options)
    elif options.lookup:
        # Each character of the first query argument is looked up on
        # its own; characters not in the database are skipped.
        lookup_query = query_args[0].decode(get_encoding())
        found = [db.search_by_literal(character)
                 for character in lookup_query]
        results = [entry for entry in found if entry is not None]
    else:
        print(_("For searches or lookups, the --search or --lookup flag is "
                "required."))
        return

    if len(results) == 0:
        print(_("No results found."))
        return

    # To do: visualize results
    encoding = get_encoding()
    for index, result in enumerate(results, 1):
        print(_("[Entry %d]") % index)
        print(unicode(result).encode(encoding))
        print()
670 if __name__ == "__main__":
671 main()