More appropriate version number.
[jblite.git] / jblite / kd2.py
blobae12728041d700a7944662a27b87798237c52ad1
1 # -*- coding: utf-8 -*-
2 from __future__ import print_function
3 from __future__ import with_statement
5 import os, sys, re, sqlite3, time
6 from cStringIO import StringIO
7 from xml.etree.cElementTree import ElementTree
8 from helpers import gzread, get_encoding, convert_query_to_unicode
9 from db import Database as BaseDatabase
10 from table import Table, ChildTable, KeyValueTable
12 import gettext
13 #t = gettext.translation("jblite")
14 #_ = t.ugettext
15 gettext.install("jblite")
18 def convert_kunyomi(coded_str):
19 """Converts a kunyomi string with ./- to a user-friendly format.
21 Specifically, two things occur:
23 1. Strings are split on '.' and the right half, if any, is
24 enclosed in parentheses.
26 2. '-' is replaced with '~'.
28 """
29 pieces = coded_str.split(u'.', 1)
30 if len(pieces) > 1:
31 intermediate = u"".join([pieces[0], u"(", pieces[1], u")"])
32 else:
33 intermediate = pieces[0]
34 result = intermediate.replace(u"-", u"~")
35 return result
37 def get_jouyou_str(grade_int):
38 """Converts an integer Jouyou grade code into a string."""
39 # DO LATER
40 return str(grade_int)
43 class Entry(object):
45 def __init__(self, record):
46 self._record = record
48 def __unicode__(self):
49 """Basic string representation of the entry."""
51 char_rec = self._record
52 lines = []
54 literal = char_rec.data["literal"]
55 lines.append(_(u"Literal: %s (0x%X)") % (literal, ord(literal)))
57 rmgroup_recs = char_rec.find_children("rmgroup")
58 for i, rmgroup_rec in enumerate(rmgroup_recs):
59 group_index = i + 1
60 lines.append(_(u"Group %d:") % group_index)
62 reading_recs = rmgroup_rec.find_children("reading")
64 kunyomi = [r.data['value'] for r in reading_recs
65 if r.data["type"] == "ja_kun"]
66 # kun-yomi needs ./- translated
67 kunyomi = map(convert_kunyomi, kunyomi)
68 lines.append(_(u" Kun-yomi: %s") % u"、".join(kunyomi))
70 onyomi = [r.data['value'] for r in reading_recs
71 if r.data["type"] == "ja_on"]
72 lines.append(_(u" On-yomi: %s") % u"、".join(onyomi))
74 meaning_recs = rmgroup_rec.find_children("meaning")
75 meaning_d = {}
76 for r in meaning_recs:
77 meanings = meaning_d.setdefault(r.data['lang'], [])
78 meanings.append(r.data['value'])
79 for lang in sorted(meaning_d.keys()):
80 meanings = meaning_d[lang]
81 meaning_str = "; ".join(meanings)
82 lines.append(_(u" Meanings (%s): %s") % (lang, meaning_str))
84 nanori_recs = char_rec.find_children("nanori")
85 if len(nanori_recs) > 0:
86 nanori = [r.data["value"] for r in nanori_recs]
87 nanori_str = u"、".join(nanori)
88 lines.append(_(u"Nanori: %s") % nanori_str)
90 stroke_recs = char_rec.find_children("stroke_count")
91 strokes = [r.data['count'] for r in stroke_recs]
92 if len(strokes) == 1:
93 lines.append(_(u"Stroke count: %d") % strokes[0])
94 elif len(strokes) > 1:
95 miscounts = ", ".join(map(str, strokes[1:]))
96 lines.append(_(u"Stroke count: %d (miscounts: %s)") %
97 (strokes[0], miscounts))
98 else:
99 pass # No stroke count info; don't print anything
101 freq = char_rec.data["freq"]
102 if freq is not None:
103 lines.append(_(u"Frequency: %d") % freq)
105 grade = char_rec.data["grade"]
106 if grade is not None:
107 # Jouyou grade codes has special meanings; a conversion is
108 # needed.
109 grade_str = get_jouyou_str(grade)
110 lines.append(_(u"Jouyou grade: %s") % grade_str)
112 jlpt = char_rec.data["jlpt"]
113 if jlpt is not None:
114 lines.append(_(u"JLPT grade: %d") % jlpt)
116 return u"\n".join(lines)
118 def __repr__(self):
119 return repr(self._record)
122 class Database(BaseDatabase):
124 """Top level object for SQLite 3-based KANJIDIC2 database."""
126 entry_class = Entry
127 table_map = {
128 u"character": {
129 u"codepoint": {},
130 u"radical": {},
131 u"stroke_count": {},
132 u"variant": {},
133 u"rad_name": {},
134 u"dic_number": {},
135 u"query_code": {},
136 u"rmgroup": {
137 u"reading": {},
138 u"meaning": {},
140 u"nanori": {},
144 def __init__(self, filename, init_from_file=None):
145 self.conn = sqlite3.connect(filename)
146 self.conn.row_factory = sqlite3.Row # keyword accessors for rows
147 self.cursor = self.conn.cursor()
148 self.tables = self._create_table_objects()
149 if init_from_file is not None:
150 raw_data = gzread(init_from_file)
152 infile = StringIO(raw_data)
153 etree = ElementTree(file=infile)
154 infile.close()
156 # Create the core database
157 self._create_new_tables()
158 self._populate_database(etree)
159 self.conn.commit()
161 # Create supplemental indices
162 self._create_index_tables()
163 self.conn.commit()
165 def search(self, query, lang=None, options=None):
166 query = convert_query_to_unicode(query)
167 query = "%%%s%%" % query # Wrap in wildcards
169 verbose = (options is not None) and (options.verbose == True)
171 if verbose and os.name == "nt":
172 print(u"Searching for \"%s\", lang=%s..." %
173 (query, repr(lang)),
174 file=sys.stderr)
176 # Do some search stuff here...
178 # 1. Find by reading
180 entries_r = self._search_by_reading(query)
181 entries_m = self._search_by_meaning(query,
182 lang=lang)
183 entries_n = self._search_by_nanori(query)
184 entries_i = self._search_by_indices(query, lang=lang)
186 # DEBUG CODE
187 if verbose:
188 print("READINGS:")
189 if len(entries_r) == 0:
190 print("No 'reading' results found.")
191 for ent_id, literal in entries_r:
192 try:
193 print(u"ID: %d, literal: %s" % (ent_id, literal))
194 except UnicodeEncodeError:
195 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
197 print("NANORI:")
198 if len(entries_n) == 0:
199 print("No 'nanori' results found.")
200 for ent_id, literal in entries_n:
201 try:
202 print(u"ID: %d, literal: %s" % (ent_id, literal))
203 except UnicodeEncodeError:
204 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
206 print("MEANINGS:")
207 if len(entries_m) == 0:
208 print("No 'meaning' results found.")
209 for ent_id, literal in entries_m:
210 try:
211 print(u"ID: %d, literal: %s" % (ent_id, literal))
212 except UnicodeEncodeError:
213 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
215 print("INDICES:")
216 if len(entries_i) == 0:
217 print("No indexed results found.")
218 for ent_id in entries_i:
219 print(u"ID: %d" % (ent_id,))
221 # Get list of unique character IDs
222 char_ids = []
223 for lst in (entries_r, entries_m, entries_n):
224 for row in lst:
225 if row[0] not in char_ids:
226 char_ids.append(row[0])
227 for char_id in entries_i:
228 if char_id not in char_ids:
229 char_ids.append(char_id)
231 char_ids = list(sorted(char_ids))
233 results = [self.lookup(char_id) for char_id in char_ids]
234 return results
236 def _search_by_reading(self, query):
237 # reading -> rmgroup -> character
238 self.cursor.execute(
239 "SELECT id, literal FROM character WHERE id IN "
240 "(SELECT fk FROM rmgroup WHERE id IN "
241 "(SELECT fk FROM reading WHERE value LIKE ?))", (query,))
242 rows = self.cursor.fetchall()
243 return rows
245 def _search_by_nanori(self, query):
246 # nanori -> character
247 self.cursor.execute(
248 "SELECT id, literal FROM character WHERE id IN "
249 "(SELECT fk FROM nanori WHERE value LIKE ?)", (query,))
250 rows = self.cursor.fetchall()
251 return rows
253 def _search_by_meaning(self, query, lang=None):
254 # meaning -> rmgroup -> character
255 if lang is None:
256 self.cursor.execute(
257 "SELECT id, literal FROM character WHERE id IN "
258 "(SELECT fk FROM rmgroup WHERE id IN "
259 "(SELECT fk FROM meaning WHERE value LIKE ?))", (query,))
260 else:
261 self.cursor.execute(
262 "SELECT id, literal FROM character WHERE id IN "
263 "(SELECT fk FROM rmgroup WHERE id IN "
264 "(SELECT fk FROM meaning WHERE lang = ? AND value LIKE ?))",
265 (lang, query))
266 rows = self.cursor.fetchall()
267 return rows
269 def _search_by_indices(self, query, lang=None):
270 # Get IDs from index table
271 # Note: lang is currently unused.
272 self.cursor.execute(
273 "SELECT character_id FROM kunyomi_lookup WHERE reading LIKE ?",
274 (query,))
275 rows = self.cursor.fetchall()
276 return [row[0] for row in rows]
278 def search_by_literal(self, literal):
279 # Not much of a "search", but avoids overlap with BaseDictionary.lookup.
280 self.cursor.execute("SELECT id FROM character WHERE literal = ?",
281 (literal,))
282 rows = self.cursor.fetchall()
283 if len(rows) < 1:
284 return None
285 else:
286 char_id = rows[0][0]
287 return self.lookup(char_id)
289 def lookup(self, id):
290 return BaseDatabase.lookup(self, "character", id)
292 def _create_table_objects(self):
293 """Creates table objects.
295 Returns a dictionary of table name to table object.
298 class_mappings = {
299 "header": HeaderTable,
300 "character": CharacterTable,
301 "codepoint": TypeValueTable,
302 "radical": TypeValueTable,
303 "stroke_count": StrokeCountTable,
304 "variant": TypeValueTable,
305 "rad_name": KeyValueTable,
306 "dic_number": DicNumberTable,
307 "query_code": QueryCodeTable,
308 "rmgroup": RMGroupTable,
309 "reading": ReadingTable,
310 "meaning": MeaningTable,
311 "nanori": KeyValueTable,
314 # Create all table objects
315 table_mappings = {}
316 for tbl, cls in class_mappings.iteritems():
317 table_mappings[tbl] = cls(self.cursor, tbl)
319 return table_mappings
321 def _create_new_tables(self):
322 """(Re)creates the database tables."""
323 for tbl, tbl_obj in self.tables.iteritems():
324 self._drop_table(tbl)
325 tbl_obj.create()
327 def _populate_database(self, etree):
328 """Imports XML data into SQLite database.
330 table_d: table to table_object dictionary
331 etree: ElementTree object for KANJIDIC2
334 # Grab header
335 header = etree.find("header")
336 file_ver = header.find("file_version").text
337 db_ver = header.find("database_version").text
338 date = header.find("date_of_creation").text
339 self.tables['header'].insert(file_ver, db_ver, date)
341 # Iterate through characters
342 for character in etree.findall("character"):
343 # Character table
344 literal = character.find("literal").text
346 # Grab misc node - we'll store a few things from it in the
347 # main character table, too.
348 misc = character.find("misc")
349 grade = misc.find("grade")
350 grade = int(grade.text) if grade is not None else None
351 freq = misc.find("freq")
352 freq = int(freq.text) if freq is not None else None
353 jlpt = misc.find("jlpt")
354 jlpt = int(jlpt.text) if jlpt is not None else None
356 char_id = self.tables['character'].insert(literal, grade,
357 freq, jlpt)
359 table = self.tables['codepoint']
360 codepoint = character.find("codepoint")
361 for cp_value in codepoint.findall("cp_value"):
362 value = cp_value.text
363 cp_type = cp_value.get("cp_type")
364 table.insert(char_id, cp_type, value)
366 table = self.tables['radical']
367 radical = character.find("radical")
368 for rad_value in radical.findall("rad_value"):
369 value = rad_value.text
370 rad_type = rad_value.get("rad_type")
371 table.insert(char_id, rad_type, value)
373 # Tables generated from <misc> begin here
374 table = self.tables['stroke_count']
375 for stroke_count in misc.findall("stroke_count"):
376 count = int(stroke_count.text)
377 table.insert(char_id, count)
379 table = self.tables['variant']
380 for variant in misc.findall("variant"):
381 value = variant.text
382 var_type = variant.get("var_type")
383 table.insert(char_id, var_type, value)
385 table = self.tables['rad_name']
386 for rad_name in misc.findall("rad_name"):
387 value = rad_name.text
388 table.insert(char_id, value)
390 # Remaining direct descendents of <character>...
391 dic_number = character.find("dic_number")
392 if dic_number is not None:
393 table = self.tables['dic_number']
394 for dic_ref in dic_number.findall("dic_ref"):
395 dr_type = dic_ref.get("dr_type")
396 m_vol = dic_ref.get("m_vol", None)
397 m_page = dic_ref.get("m_page", None)
398 value = dic_ref.text
399 table.insert(char_id, dr_type, m_vol, m_page, value)
401 query_code = character.find("query_code")
402 if query_code is not None:
403 table = self.tables['query_code']
404 for q_code in query_code.findall("q_code"):
405 qc_type = q_code.get("qc_type")
406 skip_misclass = q_code.get("skip_misclass", None)
407 value = q_code.text
408 table.insert(char_id, qc_type, skip_misclass, value)
410 reading_meaning = character.find("reading_meaning")
411 if reading_meaning is not None:
412 table = self.tables['rmgroup']
413 for rmgroup in reading_meaning.findall("rmgroup"):
414 group_id = table.insert(char_id)
415 table = self.tables['reading']
416 for reading in rmgroup.findall("reading"):
417 r_type = reading.get("r_type")
418 on_type = reading.get("on_type")
419 r_status = reading.get("r_status")
420 value = reading.text
421 table.insert(group_id, r_type, on_type, r_status, value)
422 table = self.tables['meaning']
423 for meaning in rmgroup.findall("meaning"):
424 lang = meaning.get("m_lang", "en")
425 value = meaning.text
426 table.insert(group_id, lang, value)
427 table = self.tables['nanori']
428 for nanori in reading_meaning.findall("nanori"):
429 table.insert(char_id, nanori.text)
431 def _drop_table(self, name):
432 self.cursor.execute("DROP TABLE IF EXISTS %s" % name)
434 def _create_index_tables(self):
435 """Creates extra tables to help with common searches.
437 Supplementary tables include:
439 1. Reading search table: kun-yomi to character ID. Kun-yomi
440 is modified for easier searching (no "." or "-" markers).
443 self._create_reading_search_table()
445 def _create_reading_search_table(self):
446 """Creates "sanitized" reading to character ID search table."""
448 # Mapping is from reading to character ID...
449 # r.fk -> rg.id, rg.fk -> c.id.
450 query = (
451 "SELECT r.value, c.id "
452 "FROM reading r, rmgroup rg, character c "
453 'WHERE r.type = "ja_kun" AND r.fk = rg.id AND rg.fk = c.id'
455 self.cursor.execute(query)
456 rows = self.cursor.fetchall()
457 values, ids = zip(*rows) # unzip idiom (see zip doc)
459 # Sanitize strings by removing "." and "-"
460 values = [value.replace(u".", u"").replace(u"-", u"")
461 for value in values]
463 # Create new table
464 tbl_name = "kunyomi_lookup"
465 self.tables[tbl_name] = tbl = ReadingLookupTable(self.cursor, tbl_name)
466 self._drop_table(tbl_name)
467 tbl.create()
469 # Store all sanitized strings and their keys in the table
470 rows = zip(values, ids)
471 tbl.insertmany(rows)
474 ######################################################################
475 # KANJIDIC2 data tables
476 ######################################################################
479 class HeaderTable(Table):
480 create_query = ("CREATE TABLE %s "
481 "(file_version TEXT, "
482 "database_version TEXT, "
483 "date_of_creation TEXT)")
484 insert_query = "INSERT INTO %s VALUES (?, ?, ?)"
487 class CharacterTable(Table):
488 create_query = ("CREATE TABLE %s "
489 "(id INTEGER PRIMARY KEY, literal TEXT, "
490 "grade INTEGER, freq INTEGER, jlpt INTEGER)")
491 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
492 index_queries = [
493 "CREATE INDEX %s_literal ON %s (literal)",
497 class TypeValueTable(ChildTable):
498 create_query = ("CREATE TABLE %s "
499 "(id INTEGER PRIMARY KEY, fk INTEGER, "
500 "type TEXT, value TEXT)")
501 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
502 index_queries = [
503 "CREATE INDEX %s_fk ON %s (fk)",
507 class StrokeCountTable(ChildTable):
508 create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, "
509 "fk INTEGER, count INTEGER)")
510 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
511 index_queries = [
512 "CREATE INDEX %s_fk ON %s (fk)",
516 class DicNumberTable(ChildTable):
517 create_query = ("CREATE TABLE %s "
518 "(id INTEGER PRIMARY KEY, fk INTEGER, "
519 "type TEXT, m_vol TEXT, m_page TEXT, value TEXT)")
520 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
521 index_queries = [
522 "CREATE INDEX %s_fk ON %s (fk)",
526 class QueryCodeTable(ChildTable):
527 create_query = ("CREATE TABLE %s "
528 "(id INTEGER PRIMARY KEY, fk INTEGER, "
529 "type TEXT, skip_misclass TEXT, value TEXT)")
530 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
531 index_queries = [
532 "CREATE INDEX %s_fk ON %s (fk)",
536 class RMGroupTable(ChildTable):
537 create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
538 insert_query = "INSERT INTO %s VALUES (NULL, ?)"
539 index_queries = [
540 "CREATE INDEX %s_fk ON %s (fk)",
543 class ReadingTable(ChildTable):
544 create_query = ("CREATE TABLE %s "
545 "(id INTEGER PRIMARY KEY, fk INTEGER, "
546 "type TEXT, on_type TEXT, r_status TEXT, value TEXT)")
547 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
548 index_queries = [
549 "CREATE INDEX %s_fk ON %s (fk)",
550 "CREATE INDEX %s_value ON %s (value)",
554 class MeaningTable(ChildTable):
555 create_query = ("CREATE TABLE %s "
556 "(id INTEGER PRIMARY KEY, fk INTEGER, "
557 "lang TEXT, value TEXT)")
558 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
559 index_queries = [
560 "CREATE INDEX %s_fk ON %s (fk)",
561 "CREATE INDEX %s_lang_value ON %s (lang, value)",
565 ######################################################################
566 # Index tables (not part of actual KANJIDIC2)
567 ######################################################################
570 class ReadingLookupTable(Table):
571 """Maps reading to character IDs."""
572 # Used for: kunyomi (KANJIDIC2 r_type==ja_kun)
573 create_query = ("CREATE TABLE %s "
574 "(id INTEGER PRIMARY KEY, "
575 "reading TEXT, character_id INTEGER)")
576 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
577 index_queries = [
578 "CREATE INDEX %s_reading ON %s (reading)",
583 ######################################################################
585 def parse_args():
586 from optparse import OptionParser
587 op = OptionParser(usage="%prog [options] <db_filename> [search_query]")
588 op.add_option("-i", "--initialize",
589 dest="init_fname", metavar="XML_SOURCE",
590 help=_("Initialize database from file."))
591 op.add_option("-s", "--search", action="store_true",
592 help=_("Search for kanji by readings or meanings"))
593 op.add_option("-l", "--lookup", action="store_true",
594 help=_("Look up exact character"))
595 op.add_option("-L", "--lang",
596 help=_("Specify preferred language for searching."))
597 op.add_option("-v", "--verbose", action="store_true",
598 help=_("Verbose mode (print debug strings)"))
599 options, args = op.parse_args()
600 if len(args) < 1:
601 op.print_help()
602 exit(-1)
603 if options.lookup and options.search:
604 print(_("Cannot --lookup and --search at the same time."),
605 file=sys.stderr)
606 exit(-1)
607 return (options, args)
609 def main():
610 options, args = parse_args()
611 db_fname = args[0]
613 if options.init_fname is not None:
614 db = Database(db_fname, init_from_file=options.init_fname)
615 else:
616 db = Database(db_fname)
618 results = []
619 if len(args) <= 1:
620 # No search was requested; we can exit here.
621 return
623 if options.search == True:
624 # Do search
625 # To be nice, we'll join all remaining args with spaces.
626 search_query = " ".join(args[1:])
628 if options.lang is not None:
629 results = db.search(search_query,
630 lang=options.lang, options=options)
631 else:
632 results = db.search(search_query, options=options)
633 elif options.lookup == True:
634 # Do lookup
635 encoding = get_encoding()
636 lookup_query = args[1].decode(encoding)
637 results = []
638 for character in lookup_query:
639 result = db.search_by_literal(character)
640 if result is not None:
641 results.append(result)
642 else:
643 # No lookup
644 print(_("For searches or lookups, the --search or --lookup flag is "
645 "required."))
646 return
648 # To do: visualize results
649 # Not as important; now we know we can at least do our needed
650 # lookups...
651 if len(results) > 0:
652 encoding = get_encoding()
653 # DEBUG: until lookup_by_id is implemented, this will work.
654 for index, result in enumerate(results):
655 index += 1
656 print(_("[Entry %d]") % index)
658 print(unicode(result).encode(encoding))
659 print()
660 else:
661 print(_("No results found."))
663 if __name__ == "__main__":
664 main()