jblite/kd2.py

   1 # -*- coding: utf-8 -*-
   2 from __future__ import print_function
   3 from __future__ import with_statement
   4
   5 import os, sys, re, sqlite3, time
   6 from cStringIO import StringIO
   7 from xml.etree.cElementTree import ElementTree
   8 from helpers import gzread
   9 from db import Database as BaseDatabase
  10 from table import Table, ChildTable, KeyValueTable
  11
  12 import gettext
  13 #t = gettext.translation("jblite")
  14 #_ = t.ugettext
  15 gettext.install("jblite")
  16
  17
  18 # This method of getting the encoding might not be the best...
  19 # but it works for now, and avoids hacks with
  20 # setdefaultencoding.
  21 get_encoding = sys.getfilesystemencoding
  22
  23
  24 class Entry(object):
  25
  26     def __init__(self, record):
  27         self._record = record
  28
  29     def __unicode__(self):
  30         """Basic string representation of the entry."""
  31         # *** TO DO ***
  32         return unicode(self._record)
  33
  34     def __repr__(self):
  35         return repr(self._record)
  36
  37
  38 class Database(BaseDatabase):
  39
  40     """Top level object for SQLite 3-based KANJIDIC2 database."""
  41
  42     entry_class = Entry
  43     table_map = {
  44         u"character": {
  45             u"codepoint": {},
  46             u"radical": {},
  47             u"stroke_count": {},
  48             u"variant": {},
  49             u"rad_name": {},
  50             u"dic_number": {},
  51             u"query_code": {},
  52             u"rmgroup": {
  53                 u"reading": {},
  54                 u"meaning": {},
  55                 },
  56             u"nanori": {},
  57             }
  58         }
  59
  60     def __init__(self, filename, init_from_file=None):
  61         self.conn = sqlite3.connect(filename)
  62         self.conn.row_factory = sqlite3.Row  # keyword accessors for rows
  63         self.cursor = self.conn.cursor()
  64         self.tables = self._create_table_objects()
  65         if init_from_file is not None:
  66             raw_data = gzread(init_from_file)
  67
  68             infile = StringIO(raw_data)
  69             etree = ElementTree(file=infile)
  70             infile.close()
  71
  72             # Create the core database
  73             self._create_new_tables()
  74             self._populate_database(etree)
  75             self.conn.commit()
  76
  77             # Create supplemental indices
  78             self._create_index_tables()
  79             self.conn.commit()
  80
  81     def search(self, query, lang=None):
  82         encoding = get_encoding()
  83         wrapped_query = "%%%s%%" % query  # Wrap in wildcards
  84         unicode_query = wrapped_query.decode(encoding)
  85
  86         if os.name == "nt":
  87             print(u"Searching for \"%s\", lang=%s..." %
  88                   (unicode_query, repr(lang)),
  89                   file=sys.stderr)
  90
  91         # Do some search stuff here...
  92
  93         # 1. Find by reading
  94
  95         entries_r = self.search_by_reading(unicode_query)
  96         entries_m = self.search_by_meaning(unicode_query,
  97                                            lang=lang)
  98         entries_n = self.search_by_nanori(unicode_query)
  99         entries_i = self.search_by_indices(unicode_query, lang=lang)
 100
 101         # DEBUG CODE
 102         print("READINGS:")
 103         if len(entries_r) == 0:
 104             print("No 'reading' results found.")
 105         for ent_id, literal in entries_r:
 106             try:
 107                 print(u"ID: %d, literal: %s" % (ent_id, literal))
 108             except UnicodeEncodeError:
 109                 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
 110
 111         print("NANORI:")
 112         if len(entries_n) == 0:
 113             print("No 'nanori' results found.")
 114         for ent_id, literal in entries_n:
 115             try:
 116                 print(u"ID: %d, literal: %s" % (ent_id, literal))
 117             except UnicodeEncodeError:
 118                 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
 119
 120         print("MEANINGS:")
 121         if len(entries_m) == 0:
 122             print("No 'meaning' results found.")
 123         for ent_id, literal in entries_m:
 124             try:
 125                 print(u"ID: %d, literal: %s" % (ent_id, literal))
 126             except UnicodeEncodeError:
 127                 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
 128
 129         print("INDICES:")
 130         if len(entries_i) == 0:
 131             print("No indexed results found.")
 132         for ent_id in entries_i:
 133             print(u"ID: %d" % (ent_id,))
 134
 135         # Results: character IDs
 136         results = list(sorted([row[0] for row in
 137                                (entries_r + entries_m + entries_n)]
 138                               + entries_i))
 139
 140         return results
 141
 142     def search_by_reading(self, query):
 143         # reading -> rmgroup -> character
 144         self.cursor.execute(
 145             "SELECT id, literal FROM character WHERE id IN "
 146             "(SELECT fk FROM rmgroup WHERE id IN "
 147             "(SELECT fk FROM reading WHERE value LIKE ?))", (query,))
 148         rows = self.cursor.fetchall()
 149         return rows
 150
 151     def search_by_nanori(self, query):
 152         # nanori -> character
 153         self.cursor.execute(
 154             "SELECT id, literal FROM character WHERE id IN "
 155             "(SELECT fk FROM nanori WHERE value LIKE ?)", (query,))
 156         rows = self.cursor.fetchall()
 157         return rows
 158
 159     def search_by_meaning(self, query, lang=None):
 160         # meaning -> rmgroup -> character
 161         if lang is None:
 162             self.cursor.execute(
 163                 "SELECT id, literal FROM character WHERE id IN "
 164                 "(SELECT fk FROM rmgroup WHERE id IN "
 165                 "(SELECT fk FROM meaning WHERE value LIKE ?))", (query,))
 166         else:
 167             self.cursor.execute(
 168                 "SELECT id, literal FROM character WHERE id IN "
 169                 "(SELECT fk FROM rmgroup WHERE id IN "
 170                 "(SELECT fk FROM meaning WHERE lang = ? AND value LIKE ?))",
 171                 (lang, query))
 172         rows = self.cursor.fetchall()
 173         return rows
 174
 175     def search_by_indices(self, query, lang=None):
 176         # Get IDs from index table
 177         # Note: lang is currently unused.
 178         self.cursor.execute(
 179             "SELECT character_id FROM kunyomi_lookup WHERE reading LIKE ?",
 180             (query,))
 181         rows = self.cursor.fetchall()
 182         return [row[0] for row in rows]
 183
 184     def search_by_literal(self, literal):
 185         # Not much of a "search", but avoids overlap with BaseDictionary.lookup.
 186         self.cursor.execute("SELECT id FROM character WHERE literal = ?",
 187                             (literal,))
 188         rows = self.cursor.fetchall()
 189         if len(rows) < 1:
 190             return None
 191         else:
 192             return rows[0][0]
 193
 194     def lookup(self, id):
 195         return BaseDatabase.lookup(self, "character", id)
 196
 197     def _create_table_objects(self):
 198         """Creates table objects.
 199
 200         Returns a dictionary of table name to table object.
 201
 202         """
 203         class_mappings = {
 204             "header": HeaderTable,
 205             "character": CharacterTable,
 206             "codepoint": TypeValueTable,
 207             "radical": TypeValueTable,
 208             "stroke_count": StrokeCountTable,
 209             "variant": TypeValueTable,
 210             "rad_name": KeyValueTable,
 211             "dic_number": DicNumberTable,
 212             "query_code": QueryCodeTable,
 213             "rmgroup": RMGroupTable,
 214             "reading": ReadingTable,
 215             "meaning": MeaningTable,
 216             "nanori": KeyValueTable,
 217             }
 218
 219         # Create all table objects
 220         table_mappings = {}
 221         for tbl, cls in class_mappings.iteritems():
 222             table_mappings[tbl] = cls(self.cursor, tbl)
 223
 224         return table_mappings
 225
 226     def _create_new_tables(self):
 227         """(Re)creates the database tables."""
 228         for tbl, tbl_obj in self.tables.iteritems():
 229             self._drop_table(tbl)
 230             tbl_obj.create()
 231
 232     def _populate_database(self, etree):
 233         """Imports XML data into SQLite database.
 234
 235         table_d: table to table_object dictionary
 236         etree: ElementTree object for KANJIDIC2
 237
 238         """
 239         # Grab header
 240         header = etree.find("header")
 241         file_ver = header.find("file_version").text
 242         db_ver = header.find("database_version").text
 243         date = header.find("date_of_creation").text
 244         self.tables['header'].insert(file_ver, db_ver, date)
 245
 246         # Iterate through characters
 247         for character in etree.findall("character"):
 248             # Character table
 249             literal = character.find("literal").text
 250
 251             # Grab misc node - we'll store a few things from it in the
 252             # main character table, too.
 253             misc = character.find("misc")
 254             grade = misc.find("grade")
 255             grade = int(grade.text) if grade is not None else None
 256             freq = misc.find("freq")
 257             freq = int(freq.text) if freq is not None else None
 258             jlpt = misc.find("jlpt")
 259             jlpt = int(jlpt.text) if jlpt is not None else None
 260
 261             char_id = self.tables['character'].insert(literal, grade,
 262                                                       freq, jlpt)
 263
 264             table = self.tables['codepoint']
 265             codepoint = character.find("codepoint")
 266             for cp_value in codepoint.findall("cp_value"):
 267                 value = cp_value.text
 268                 cp_type = cp_value.get("cp_type")
 269                 table.insert(char_id, cp_type, value)
 270
 271             table = self.tables['radical']
 272             radical = character.find("radical")
 273             for rad_value in radical.findall("rad_value"):
 274                 value = rad_value.text
 275                 rad_type = rad_value.get("rad_type")
 276                 table.insert(char_id, rad_type, value)
 277
 278             # Tables generated from <misc> begin here
 279             table = self.tables['stroke_count']
 280             for stroke_count in misc.findall("stroke_count"):
 281                 count = int(stroke_count.text)
 282                 table.insert(char_id, count)
 283
 284             table = self.tables['variant']
 285             for variant in misc.findall("variant"):
 286                 value = variant.text
 287                 var_type = variant.get("var_type")
 288                 table.insert(char_id, var_type, value)
 289
 290             table = self.tables['rad_name']
 291             for rad_name in misc.findall("rad_name"):
 292                 value = rad_name.text
 293                 table.insert(char_id, value)
 294
 295             # Remaining direct descendents of <character>...
 296             dic_number = character.find("dic_number")
 297             if dic_number is not None:
 298                 table = self.tables['dic_number']
 299                 for dic_ref in dic_number.findall("dic_ref"):
 300                     dr_type = dic_ref.get("dr_type")
 301                     m_vol = dic_ref.get("m_vol", None)
 302                     m_page = dic_ref.get("m_page", None)
 303                     value = dic_ref.text
 304                     table.insert(char_id, dr_type, m_vol, m_page, value)
 305
 306             query_code = character.find("query_code")
 307             if query_code is not None:
 308                 table = self.tables['query_code']
 309                 for q_code in query_code.findall("q_code"):
 310                     qc_type = q_code.get("qc_type")
 311                     skip_misclass = q_code.get("skip_misclass", None)
 312                     value = q_code.text
 313                     table.insert(char_id, qc_type, skip_misclass, value)
 314
 315             reading_meaning = character.find("reading_meaning")
 316             if reading_meaning is not None:
 317                 table = self.tables['rmgroup']
 318                 for rmgroup in reading_meaning.findall("rmgroup"):
 319                     group_id = table.insert(char_id)
 320                     table = self.tables['reading']
 321                     for reading in rmgroup.findall("reading"):
 322                         r_type = reading.get("r_type")
 323                         on_type = reading.get("on_type")
 324                         r_status = reading.get("r_status")
 325                         value = reading.text
 326                         table.insert(group_id, r_type, on_type, r_status, value)
 327                     table = self.tables['meaning']
 328                     for meaning in rmgroup.findall("meaning"):
 329                         lang = meaning.get("m_lang", "en")
 330                         value = meaning.text
 331                         table.insert(group_id, lang, value)
 332                 table = self.tables['nanori']
 333                 for nanori in reading_meaning.findall("nanori"):
 334                     table.insert(char_id, nanori.text)
 335
 336     def _drop_table(self, name):
 337         self.cursor.execute("DROP TABLE IF EXISTS %s" % name)
 338
 339     def _create_index_tables(self):
 340         """Creates extra tables to help with common searches.
 341
 342         Supplementary tables include:
 343
 344         1. Reading search table: kun-yomi to character ID.  Kun-yomi
 345            is modified for easier searching (no "." or "-" markers).
 346
 347         """
 348         self._create_reading_search_table()
 349
 350     def _create_reading_search_table(self):
 351         """Creates "sanitized" reading to character ID search table."""
 352
 353         # Mapping is from reading to character ID...
 354         # r.fk -> rg.id, rg.fk -> c.id.
 355         query = (
 356             "SELECT r.value, c.id "
 357             "FROM reading r, rmgroup rg, character c "
 358             'WHERE r.type = "ja_kun" AND r.fk = rg.id AND rg.fk = c.id'
 359             )
 360         self.cursor.execute(query)
 361         rows = self.cursor.fetchall()
 362         values, ids = zip(*rows)  # unzip idiom (see zip doc)
 363
 364         # Sanitize strings by removing "." and "-"
 365         values = [value.replace(u".", u"").replace(u"-", u"")
 366                   for value in values]
 367
 368         # Create new table
 369         tbl_name = "kunyomi_lookup"
 370         self.tables[tbl_name] = tbl = ReadingLookupTable(self.cursor, tbl_name)
 371         self._drop_table(tbl_name)
 372         tbl.create()
 373
 374         # Store all sanitized strings and their keys in the table
 375         rows = zip(values, ids)
 376         tbl.insertmany(rows)
 377
 378
 379 ######################################################################
 380 # KANJIDIC2 data tables
 381 ######################################################################
 382
 383
 384 class HeaderTable(Table):
 385     create_query = ("CREATE TABLE %s "
 386                     "(file_version TEXT, "
 387                     "database_version TEXT, "
 388                     "date_of_creation TEXT)")
 389     insert_query = "INSERT INTO %s VALUES (?, ?, ?)"
 390
 391
 392 class CharacterTable(Table):
 393     create_query = ("CREATE TABLE %s "
 394                     "(id INTEGER PRIMARY KEY, literal TEXT, "
 395                     "grade INTEGER, freq INTEGER, jlpt INTEGER)")
 396     insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
 397     index_queries = [
 398         "CREATE INDEX %s_literal ON %s (literal)",
 399         ]
 400
 401
 402 class TypeValueTable(ChildTable):
 403     create_query = ("CREATE TABLE %s "
 404                     "(id INTEGER PRIMARY KEY, fk INTEGER, "
 405                     "type TEXT, value TEXT)")
 406     insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
 407     index_queries = [
 408         "CREATE INDEX %s_fk ON %s (fk)",
 409         ]
 410
 411
 412 class StrokeCountTable(ChildTable):
 413     create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, "
 414                     "fk INTEGER, count INTEGER)")
 415     insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
 416     index_queries = [
 417         "CREATE INDEX %s_fk ON %s (fk)",
 418         ]
 419
 420
 421 class DicNumberTable(ChildTable):
 422     create_query = ("CREATE TABLE %s "
 423                     "(id INTEGER PRIMARY KEY, fk INTEGER, "
 424                     "type TEXT, m_vol TEXT, m_page TEXT, value TEXT)")
 425     insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
 426     index_queries = [
 427         "CREATE INDEX %s_fk ON %s (fk)",
 428         ]
 429
 430
 431 class QueryCodeTable(ChildTable):
 432     create_query = ("CREATE TABLE %s "
 433                     "(id INTEGER PRIMARY KEY, fk INTEGER, "
 434                     "type TEXT, skip_misclass TEXT, value TEXT)")
 435     insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
 436     index_queries = [
 437         "CREATE INDEX %s_fk ON %s (fk)",
 438         ]
 439
 440
 441 class RMGroupTable(ChildTable):
 442     create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
 443     insert_query = "INSERT INTO %s VALUES (NULL, ?)"
 444     index_queries = [
 445         "CREATE INDEX %s_fk ON %s (fk)",
 446         ]
 447
 448 class ReadingTable(ChildTable):
 449     create_query = ("CREATE TABLE %s "
 450                     "(id INTEGER PRIMARY KEY, fk INTEGER, "
 451                     "type TEXT, on_type TEXT, r_status TEXT, value TEXT)")
 452     insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
 453     index_queries = [
 454         "CREATE INDEX %s_fk ON %s (fk)",
 455         "CREATE INDEX %s_value ON %s (value)",
 456         ]
 457
 458
 459 class MeaningTable(ChildTable):
 460     create_query = ("CREATE TABLE %s "
 461                     "(id INTEGER PRIMARY KEY, fk INTEGER, "
 462                     "lang TEXT, value TEXT)")
 463     insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
 464     index_queries = [
 465         "CREATE INDEX %s_fk ON %s (fk)",
 466         "CREATE INDEX %s_lang_value ON %s (lang, value)",
 467         ]
 468
 469
 470 ######################################################################
 471 # Index tables (not part of actual KANJIDIC2)
 472 ######################################################################
 473
 474
 475 class ReadingLookupTable(Table):
 476     """Maps reading to character IDs."""
 477     # Used for: kunyomi (KANJIDIC2 r_type==ja_kun)
 478     create_query = ("CREATE TABLE %s "
 479                     "(id INTEGER PRIMARY KEY, "
 480                     "reading TEXT, character_id INTEGER)")
 481     insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
 482     index_queries = [
 483         "CREATE INDEX %s_reading ON %s (reading)",
 484         ]
 485
 486
 487
 488 ######################################################################
 489
 490 def parse_args():
 491     from optparse import OptionParser
 492     op = OptionParser(usage="%prog [options] <db_filename> [search_query]")
 493     op.add_option("-i", "--initialize",
 494                   dest="init_fname", metavar="XML_SOURCE",
 495                   help=_("Initialize database from file."))
 496     op.add_option("-s", "--search", action="store_true",
 497                   help=_("Search for kanji by readings or meanings"))
 498     op.add_option("-l", "--lookup", action="store_true",
 499                   help=_("Look up exact character"))
 500     op.add_option("-L", "--lang",
 501                   help=_("Specify preferred language for searching."))
 502     options, args = op.parse_args()
 503     if len(args) < 1:
 504         op.print_help()
 505         exit(-1)
 506     if options.lookup and options.search:
 507         print(_("Cannot --lookup and --search at the same time."),
 508               file=sys.stderr)
 509         exit(-1)
 510     return (options, args)
 511
 512 def main():
 513     options, args = parse_args()
 514     db_fname = args[0]
 515
 516     if options.init_fname is not None:
 517         db = Database(db_fname, init_from_file=options.init_fname)
 518     else:
 519         db = Database(db_fname)
 520
 521     results = []
 522     if len(args) <= 1:
 523         # No search was requested; we can exit here.
 524         return
 525
 526     if options.search == True:
 527         # Do search
 528         # To be nice, we'll join all remaining args with spaces.
 529         search_query = " ".join(args[1:])
 530
 531         if options.lang is not None:
 532             results = db.search(search_query, lang=options.lang)
 533         else:
 534             results = db.search(search_query)
 535     elif options.lookup == True:
 536         # Do lookup
 537         encoding = get_encoding()
 538         lookup_query = args[1].decode(encoding)
 539         results = []
 540         for character in lookup_query:
 541             result = db.search_by_literal(character)
 542             if result is not None:
 543                 results.append(result)
 544     else:
 545         # No lookup
 546         print(_("For searches or lookups, the --search or --lookup flag is "
 547                 "required."))
 548         return
 549
 550     # To do: visualize results
 551     # Not as important; now we know we can at least do our needed
 552     # lookups...
 553     if len(results) > 0:
 554         from pprint import pprint
 555         # DEBUG: until lookup_by_id is implemented, this will work.
 556         pprint(results)
 557         for result in results:
 558             entry = db.lookup(result)
 559             pprint(entry)
 560             print()
 561     else:
 562         print(_("No results found."))
 563
 564 if __name__ == "__main__":
 565     main()