Updated kd2.py and helpers.py to allow for unicode objects as search input.
[jblite.git] / jblite / kd2.py
blobea5b8bd41fc4fbd75f9cff5330951c8b613ed25e
1 # -*- coding: utf-8 -*-
2 from __future__ import print_function
3 from __future__ import with_statement
5 import os, sys, re, sqlite3, time
6 from cStringIO import StringIO
7 from xml.etree.cElementTree import ElementTree
8 from helpers import gzread, get_encoding, convert_query_to_unicode
9 from db import Database as BaseDatabase
10 from table import Table, ChildTable, KeyValueTable
12 import gettext
13 #t = gettext.translation("jblite")
14 #_ = t.ugettext
15 gettext.install("jblite")
18 def convert_kunyomi(coded_str):
19 """Converts a kunyomi string with ./- to a user-friendly format.
21 Specifically, two things occur:
23 1. Strings are split on '.' and the right half, if any, is
24 enclosed in parentheses.
26 2. '-' is replaced with '~'.
28 """
29 pieces = coded_str.split(u'.', 1)
30 if len(pieces) > 1:
31 intermediate = u"".join([pieces[0], u"(", pieces[1], u")"])
32 else:
33 intermediate = pieces[0]
34 result = intermediate.replace(u"-", u"~")
35 return result
37 def get_jouyou_str(grade_int):
38 """Converts an integer Jouyou grade code into a string."""
39 # DO LATER
40 return str(grade_int)
43 class Entry(object):
45 def __init__(self, record):
46 self._record = record
48 def __unicode__(self):
49 """Basic string representation of the entry."""
51 char_rec = self._record
52 lines = []
54 literal = char_rec.data["literal"]
55 lines.append(_(u"Literal: %s (0x%X)") % (literal, ord(literal)))
57 rmgroup_recs = char_rec.find_children("rmgroup")
58 for i, rmgroup_rec in enumerate(rmgroup_recs):
59 group_index = i + 1
60 lines.append(_(u"Group %d:") % group_index)
62 reading_recs = rmgroup_rec.find_children("reading")
64 kunyomi = [r.data['value'] for r in reading_recs
65 if r.data["type"] == "ja_kun"]
66 # kun-yomi needs ./- translated
67 kunyomi = map(convert_kunyomi, kunyomi)
68 lines.append(_(u" Kun-yomi: %s") % u"、".join(kunyomi))
70 onyomi = [r.data['value'] for r in reading_recs
71 if r.data["type"] == "ja_on"]
72 lines.append(_(u" On-yomi: %s") % u"、".join(onyomi))
74 meaning_recs = rmgroup_rec.find_children("meaning")
75 meaning_d = {}
76 for r in meaning_recs:
77 meanings = meaning_d.setdefault(r.data['lang'], [])
78 meanings.append(r.data['value'])
79 for lang in sorted(meaning_d.keys()):
80 meanings = meaning_d[lang]
81 meaning_str = "; ".join(meanings)
82 lines.append(_(u" Meanings (%s): %s") % (lang, meaning_str))
84 nanori_recs = char_rec.find_children("nanori")
85 if len(nanori_recs) > 0:
86 nanori = [r.data["value"] for r in nanori_recs]
87 nanori_str = u"、".join(nanori)
88 lines.append(_(u"Nanori: %s") % nanori_str)
90 stroke_recs = char_rec.find_children("stroke_count")
91 strokes = [r.data['count'] for r in stroke_recs]
92 if len(strokes) == 1:
93 lines.append(_(u"Stroke count: %d") % strokes[0])
94 elif len(strokes) > 1:
95 miscounts = ", ".join(map(str, strokes[1:]))
96 lines.append(_(u"Stroke count: %d (miscounts: %s)") %
97 (strokes[0], miscounts))
98 else:
99 pass # No stroke count info; don't print anything
101 freq = char_rec.data["freq"]
102 if freq is not None:
103 lines.append(_(u"Frequency: %d") % freq)
105 grade = char_rec.data["grade"]
106 if grade is not None:
107 # Jouyou grade codes has special meanings; a conversion is
108 # needed.
109 grade_str = get_jouyou_str(grade)
110 lines.append(_(u"Jouyou grade: %s") % grade_str)
112 jlpt = char_rec.data["jlpt"]
113 if jlpt is not None:
114 lines.append(_(u"JLPT grade: %d") % jlpt)
116 return u"\n".join(lines)
118 def __repr__(self):
119 return repr(self._record)
122 class Database(BaseDatabase):
124 """Top level object for SQLite 3-based KANJIDIC2 database."""
126 entry_class = Entry
127 table_map = {
128 u"character": {
129 u"codepoint": {},
130 u"radical": {},
131 u"stroke_count": {},
132 u"variant": {},
133 u"rad_name": {},
134 u"dic_number": {},
135 u"query_code": {},
136 u"rmgroup": {
137 u"reading": {},
138 u"meaning": {},
140 u"nanori": {},
144 def __init__(self, filename, init_from_file=None):
145 self.conn = sqlite3.connect(filename)
146 self.conn.row_factory = sqlite3.Row # keyword accessors for rows
147 self.cursor = self.conn.cursor()
148 self.tables = self._create_table_objects()
149 if init_from_file is not None:
150 raw_data = gzread(init_from_file)
152 infile = StringIO(raw_data)
153 etree = ElementTree(file=infile)
154 infile.close()
156 # Create the core database
157 self._create_new_tables()
158 self._populate_database(etree)
159 self.conn.commit()
161 # Create supplemental indices
162 self._create_index_tables()
163 self.conn.commit()
165 def search(self, query, lang=None, options=None):
166 unicode_query = convert_query_to_unicode(query)
168 verbose = (options is not None) and (options.verbose == True)
170 if verbose and os.name == "nt":
171 print(u"Searching for \"%s\", lang=%s..." %
172 (unicode_query, repr(lang)),
173 file=sys.stderr)
175 # Do some search stuff here...
177 # 1. Find by reading
179 entries_r = self._search_by_reading(unicode_query)
180 entries_m = self._search_by_meaning(unicode_query,
181 lang=lang)
182 entries_n = self._search_by_nanori(unicode_query)
183 entries_i = self._search_by_indices(unicode_query, lang=lang)
185 # DEBUG CODE
186 if verbose:
187 print("READINGS:")
188 if len(entries_r) == 0:
189 print("No 'reading' results found.")
190 for ent_id, literal in entries_r:
191 try:
192 print(u"ID: %d, literal: %s" % (ent_id, literal))
193 except UnicodeEncodeError:
194 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
196 print("NANORI:")
197 if len(entries_n) == 0:
198 print("No 'nanori' results found.")
199 for ent_id, literal in entries_n:
200 try:
201 print(u"ID: %d, literal: %s" % (ent_id, literal))
202 except UnicodeEncodeError:
203 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
205 print("MEANINGS:")
206 if len(entries_m) == 0:
207 print("No 'meaning' results found.")
208 for ent_id, literal in entries_m:
209 try:
210 print(u"ID: %d, literal: %s" % (ent_id, literal))
211 except UnicodeEncodeError:
212 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
214 print("INDICES:")
215 if len(entries_i) == 0:
216 print("No indexed results found.")
217 for ent_id in entries_i:
218 print(u"ID: %d" % (ent_id,))
220 # Get list of unique character IDs
221 char_ids = []
222 for lst in (entries_r, entries_m, entries_n):
223 for row in lst:
224 if row[0] not in char_ids:
225 char_ids.append(row[0])
226 for char_id in entries_i:
227 if char_id not in char_ids:
228 char_ids.append(char_id)
230 char_ids = list(sorted(char_ids))
232 results = [self.lookup(char_id) for char_id in char_ids]
233 return results
235 def _search_by_reading(self, query):
236 # reading -> rmgroup -> character
237 self.cursor.execute(
238 "SELECT id, literal FROM character WHERE id IN "
239 "(SELECT fk FROM rmgroup WHERE id IN "
240 "(SELECT fk FROM reading WHERE value LIKE ?))", (query,))
241 rows = self.cursor.fetchall()
242 return rows
244 def _search_by_nanori(self, query):
245 # nanori -> character
246 self.cursor.execute(
247 "SELECT id, literal FROM character WHERE id IN "
248 "(SELECT fk FROM nanori WHERE value LIKE ?)", (query,))
249 rows = self.cursor.fetchall()
250 return rows
252 def _search_by_meaning(self, query, lang=None):
253 # meaning -> rmgroup -> character
254 if lang is None:
255 self.cursor.execute(
256 "SELECT id, literal FROM character WHERE id IN "
257 "(SELECT fk FROM rmgroup WHERE id IN "
258 "(SELECT fk FROM meaning WHERE value LIKE ?))", (query,))
259 else:
260 self.cursor.execute(
261 "SELECT id, literal FROM character WHERE id IN "
262 "(SELECT fk FROM rmgroup WHERE id IN "
263 "(SELECT fk FROM meaning WHERE lang = ? AND value LIKE ?))",
264 (lang, query))
265 rows = self.cursor.fetchall()
266 return rows
268 def _search_by_indices(self, query, lang=None):
269 # Get IDs from index table
270 # Note: lang is currently unused.
271 self.cursor.execute(
272 "SELECT character_id FROM kunyomi_lookup WHERE reading LIKE ?",
273 (query,))
274 rows = self.cursor.fetchall()
275 return [row[0] for row in rows]
277 def search_by_literal(self, literal):
278 # Not much of a "search", but avoids overlap with BaseDictionary.lookup.
279 self.cursor.execute("SELECT id FROM character WHERE literal = ?",
280 (literal,))
281 rows = self.cursor.fetchall()
282 if len(rows) < 1:
283 return None
284 else:
285 char_id = rows[0][0]
286 return self.lookup(char_id)
288 def lookup(self, id):
289 return BaseDatabase.lookup(self, "character", id)
291 def _create_table_objects(self):
292 """Creates table objects.
294 Returns a dictionary of table name to table object.
297 class_mappings = {
298 "header": HeaderTable,
299 "character": CharacterTable,
300 "codepoint": TypeValueTable,
301 "radical": TypeValueTable,
302 "stroke_count": StrokeCountTable,
303 "variant": TypeValueTable,
304 "rad_name": KeyValueTable,
305 "dic_number": DicNumberTable,
306 "query_code": QueryCodeTable,
307 "rmgroup": RMGroupTable,
308 "reading": ReadingTable,
309 "meaning": MeaningTable,
310 "nanori": KeyValueTable,
313 # Create all table objects
314 table_mappings = {}
315 for tbl, cls in class_mappings.iteritems():
316 table_mappings[tbl] = cls(self.cursor, tbl)
318 return table_mappings
320 def _create_new_tables(self):
321 """(Re)creates the database tables."""
322 for tbl, tbl_obj in self.tables.iteritems():
323 self._drop_table(tbl)
324 tbl_obj.create()
326 def _populate_database(self, etree):
327 """Imports XML data into SQLite database.
329 table_d: table to table_object dictionary
330 etree: ElementTree object for KANJIDIC2
333 # Grab header
334 header = etree.find("header")
335 file_ver = header.find("file_version").text
336 db_ver = header.find("database_version").text
337 date = header.find("date_of_creation").text
338 self.tables['header'].insert(file_ver, db_ver, date)
340 # Iterate through characters
341 for character in etree.findall("character"):
342 # Character table
343 literal = character.find("literal").text
345 # Grab misc node - we'll store a few things from it in the
346 # main character table, too.
347 misc = character.find("misc")
348 grade = misc.find("grade")
349 grade = int(grade.text) if grade is not None else None
350 freq = misc.find("freq")
351 freq = int(freq.text) if freq is not None else None
352 jlpt = misc.find("jlpt")
353 jlpt = int(jlpt.text) if jlpt is not None else None
355 char_id = self.tables['character'].insert(literal, grade,
356 freq, jlpt)
358 table = self.tables['codepoint']
359 codepoint = character.find("codepoint")
360 for cp_value in codepoint.findall("cp_value"):
361 value = cp_value.text
362 cp_type = cp_value.get("cp_type")
363 table.insert(char_id, cp_type, value)
365 table = self.tables['radical']
366 radical = character.find("radical")
367 for rad_value in radical.findall("rad_value"):
368 value = rad_value.text
369 rad_type = rad_value.get("rad_type")
370 table.insert(char_id, rad_type, value)
372 # Tables generated from <misc> begin here
373 table = self.tables['stroke_count']
374 for stroke_count in misc.findall("stroke_count"):
375 count = int(stroke_count.text)
376 table.insert(char_id, count)
378 table = self.tables['variant']
379 for variant in misc.findall("variant"):
380 value = variant.text
381 var_type = variant.get("var_type")
382 table.insert(char_id, var_type, value)
384 table = self.tables['rad_name']
385 for rad_name in misc.findall("rad_name"):
386 value = rad_name.text
387 table.insert(char_id, value)
389 # Remaining direct descendents of <character>...
390 dic_number = character.find("dic_number")
391 if dic_number is not None:
392 table = self.tables['dic_number']
393 for dic_ref in dic_number.findall("dic_ref"):
394 dr_type = dic_ref.get("dr_type")
395 m_vol = dic_ref.get("m_vol", None)
396 m_page = dic_ref.get("m_page", None)
397 value = dic_ref.text
398 table.insert(char_id, dr_type, m_vol, m_page, value)
400 query_code = character.find("query_code")
401 if query_code is not None:
402 table = self.tables['query_code']
403 for q_code in query_code.findall("q_code"):
404 qc_type = q_code.get("qc_type")
405 skip_misclass = q_code.get("skip_misclass", None)
406 value = q_code.text
407 table.insert(char_id, qc_type, skip_misclass, value)
409 reading_meaning = character.find("reading_meaning")
410 if reading_meaning is not None:
411 table = self.tables['rmgroup']
412 for rmgroup in reading_meaning.findall("rmgroup"):
413 group_id = table.insert(char_id)
414 table = self.tables['reading']
415 for reading in rmgroup.findall("reading"):
416 r_type = reading.get("r_type")
417 on_type = reading.get("on_type")
418 r_status = reading.get("r_status")
419 value = reading.text
420 table.insert(group_id, r_type, on_type, r_status, value)
421 table = self.tables['meaning']
422 for meaning in rmgroup.findall("meaning"):
423 lang = meaning.get("m_lang", "en")
424 value = meaning.text
425 table.insert(group_id, lang, value)
426 table = self.tables['nanori']
427 for nanori in reading_meaning.findall("nanori"):
428 table.insert(char_id, nanori.text)
430 def _drop_table(self, name):
431 self.cursor.execute("DROP TABLE IF EXISTS %s" % name)
433 def _create_index_tables(self):
434 """Creates extra tables to help with common searches.
436 Supplementary tables include:
438 1. Reading search table: kun-yomi to character ID. Kun-yomi
439 is modified for easier searching (no "." or "-" markers).
442 self._create_reading_search_table()
444 def _create_reading_search_table(self):
445 """Creates "sanitized" reading to character ID search table."""
447 # Mapping is from reading to character ID...
448 # r.fk -> rg.id, rg.fk -> c.id.
449 query = (
450 "SELECT r.value, c.id "
451 "FROM reading r, rmgroup rg, character c "
452 'WHERE r.type = "ja_kun" AND r.fk = rg.id AND rg.fk = c.id'
454 self.cursor.execute(query)
455 rows = self.cursor.fetchall()
456 values, ids = zip(*rows) # unzip idiom (see zip doc)
458 # Sanitize strings by removing "." and "-"
459 values = [value.replace(u".", u"").replace(u"-", u"")
460 for value in values]
462 # Create new table
463 tbl_name = "kunyomi_lookup"
464 self.tables[tbl_name] = tbl = ReadingLookupTable(self.cursor, tbl_name)
465 self._drop_table(tbl_name)
466 tbl.create()
468 # Store all sanitized strings and their keys in the table
469 rows = zip(values, ids)
470 tbl.insertmany(rows)
473 ######################################################################
474 # KANJIDIC2 data tables
475 ######################################################################
478 class HeaderTable(Table):
479 create_query = ("CREATE TABLE %s "
480 "(file_version TEXT, "
481 "database_version TEXT, "
482 "date_of_creation TEXT)")
483 insert_query = "INSERT INTO %s VALUES (?, ?, ?)"
486 class CharacterTable(Table):
487 create_query = ("CREATE TABLE %s "
488 "(id INTEGER PRIMARY KEY, literal TEXT, "
489 "grade INTEGER, freq INTEGER, jlpt INTEGER)")
490 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
491 index_queries = [
492 "CREATE INDEX %s_literal ON %s (literal)",
496 class TypeValueTable(ChildTable):
497 create_query = ("CREATE TABLE %s "
498 "(id INTEGER PRIMARY KEY, fk INTEGER, "
499 "type TEXT, value TEXT)")
500 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
501 index_queries = [
502 "CREATE INDEX %s_fk ON %s (fk)",
506 class StrokeCountTable(ChildTable):
507 create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, "
508 "fk INTEGER, count INTEGER)")
509 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
510 index_queries = [
511 "CREATE INDEX %s_fk ON %s (fk)",
515 class DicNumberTable(ChildTable):
516 create_query = ("CREATE TABLE %s "
517 "(id INTEGER PRIMARY KEY, fk INTEGER, "
518 "type TEXT, m_vol TEXT, m_page TEXT, value TEXT)")
519 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
520 index_queries = [
521 "CREATE INDEX %s_fk ON %s (fk)",
525 class QueryCodeTable(ChildTable):
526 create_query = ("CREATE TABLE %s "
527 "(id INTEGER PRIMARY KEY, fk INTEGER, "
528 "type TEXT, skip_misclass TEXT, value TEXT)")
529 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
530 index_queries = [
531 "CREATE INDEX %s_fk ON %s (fk)",
535 class RMGroupTable(ChildTable):
536 create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
537 insert_query = "INSERT INTO %s VALUES (NULL, ?)"
538 index_queries = [
539 "CREATE INDEX %s_fk ON %s (fk)",
542 class ReadingTable(ChildTable):
543 create_query = ("CREATE TABLE %s "
544 "(id INTEGER PRIMARY KEY, fk INTEGER, "
545 "type TEXT, on_type TEXT, r_status TEXT, value TEXT)")
546 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
547 index_queries = [
548 "CREATE INDEX %s_fk ON %s (fk)",
549 "CREATE INDEX %s_value ON %s (value)",
553 class MeaningTable(ChildTable):
554 create_query = ("CREATE TABLE %s "
555 "(id INTEGER PRIMARY KEY, fk INTEGER, "
556 "lang TEXT, value TEXT)")
557 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
558 index_queries = [
559 "CREATE INDEX %s_fk ON %s (fk)",
560 "CREATE INDEX %s_lang_value ON %s (lang, value)",
564 ######################################################################
565 # Index tables (not part of actual KANJIDIC2)
566 ######################################################################
569 class ReadingLookupTable(Table):
570 """Maps reading to character IDs."""
571 # Used for: kunyomi (KANJIDIC2 r_type==ja_kun)
572 create_query = ("CREATE TABLE %s "
573 "(id INTEGER PRIMARY KEY, "
574 "reading TEXT, character_id INTEGER)")
575 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
576 index_queries = [
577 "CREATE INDEX %s_reading ON %s (reading)",
582 ######################################################################
584 def parse_args():
585 from optparse import OptionParser
586 op = OptionParser(usage="%prog [options] <db_filename> [search_query]")
587 op.add_option("-i", "--initialize",
588 dest="init_fname", metavar="XML_SOURCE",
589 help=_("Initialize database from file."))
590 op.add_option("-s", "--search", action="store_true",
591 help=_("Search for kanji by readings or meanings"))
592 op.add_option("-l", "--lookup", action="store_true",
593 help=_("Look up exact character"))
594 op.add_option("-L", "--lang",
595 help=_("Specify preferred language for searching."))
596 op.add_option("-v", "--verbose", action="store_true",
597 help=_("Verbose mode (print debug strings)"))
598 options, args = op.parse_args()
599 if len(args) < 1:
600 op.print_help()
601 exit(-1)
602 if options.lookup and options.search:
603 print(_("Cannot --lookup and --search at the same time."),
604 file=sys.stderr)
605 exit(-1)
606 return (options, args)
608 def main():
609 options, args = parse_args()
610 db_fname = args[0]
612 if options.init_fname is not None:
613 db = Database(db_fname, init_from_file=options.init_fname)
614 else:
615 db = Database(db_fname)
617 results = []
618 if len(args) <= 1:
619 # No search was requested; we can exit here.
620 return
622 if options.search == True:
623 # Do search
624 # To be nice, we'll join all remaining args with spaces.
625 search_query = " ".join(args[1:])
627 if options.lang is not None:
628 results = db.search(search_query,
629 lang=options.lang, options=options)
630 else:
631 results = db.search(search_query, options=options)
632 elif options.lookup == True:
633 # Do lookup
634 encoding = get_encoding()
635 lookup_query = args[1].decode(encoding)
636 results = []
637 for character in lookup_query:
638 result = db.search_by_literal(character)
639 if result is not None:
640 results.append(result)
641 else:
642 # No lookup
643 print(_("For searches or lookups, the --search or --lookup flag is "
644 "required."))
645 return
647 # To do: visualize results
648 # Not as important; now we know we can at least do our needed
649 # lookups...
650 if len(results) > 0:
651 encoding = get_encoding()
652 # DEBUG: until lookup_by_id is implemented, this will work.
653 for index, result in enumerate(results):
654 index += 1
655 print(_("[Entry %d]") % index)
657 print(unicode(result).encode(encoding))
658 print()
659 else:
660 print(_("No results found."))
662 if __name__ == "__main__":
663 main()