Added search-by-index support to kd2.py.
[jblite.git] / jblite / kd2.py
blobaeee0ae825503fa3145c7eda41ec8d70401c6ef4
1 # -*- coding: utf-8 -*-
2 from __future__ import print_function
3 from __future__ import with_statement
5 import os, sys, re, sqlite3, time
6 from cStringIO import StringIO
7 from xml.etree.cElementTree import ElementTree
8 from helpers import gzread
9 from db import Database as BaseDatabase
10 from table import Table, ChildTable, KeyValueTable
12 import gettext
13 #t = gettext.translation("jblite")
14 #_ = t.ugettext
15 gettext.install("jblite")
18 # This method of getting the encoding might not be the best...
19 # but it works for now, and avoids hacks with
20 # setdefaultencoding.
21 get_encoding = sys.getfilesystemencoding
24 class Entry(object):
26 def __init__(self, record):
27 self._record = record
29 def __unicode__(self):
30 """Basic string representation of the entry."""
31 # *** TO DO ***
32 return unicode(self._record)
34 def __repr__(self):
35 return repr(self._record)
38 class Database(BaseDatabase):
40 """Top level object for SQLite 3-based KANJIDIC2 database."""
42 entry_class = Entry
43 table_map = {
44 u"character": {
45 u"codepoint": {},
46 u"radical": {},
47 u"stroke_count": {},
48 u"variant": {},
49 u"rad_name": {},
50 u"dic_number": {},
51 u"query_code": {},
52 u"rmgroup": {
53 u"reading": {},
54 u"meaning": {},
56 u"nanori": {},
60 def __init__(self, filename, init_from_file=None):
61 self.conn = sqlite3.connect(filename)
62 self.conn.row_factory = sqlite3.Row # keyword accessors for rows
63 self.cursor = self.conn.cursor()
64 self.tables = self._create_table_objects()
65 if init_from_file is not None:
66 raw_data = gzread(init_from_file)
68 infile = StringIO(raw_data)
69 etree = ElementTree(file=infile)
70 infile.close()
72 # Create the core database
73 self._create_new_tables()
74 self._populate_database(etree)
75 self.conn.commit()
77 # Create supplemental indices
78 self._create_index_tables()
79 self.conn.commit()
81 def search(self, query, lang=None):
82 encoding = get_encoding()
83 wrapped_query = "%%%s%%" % query # Wrap in wildcards
84 unicode_query = wrapped_query.decode(encoding)
86 if os.name == "nt":
87 print(u"Searching for \"%s\", lang=%s..." %
88 (unicode_query, repr(lang)),
89 file=sys.stderr)
91 # Do some search stuff here...
93 # 1. Find by reading
95 entries_r = self.search_by_reading(unicode_query)
96 entries_m = self.search_by_meaning(unicode_query,
97 lang=lang)
98 entries_n = self.search_by_nanori(unicode_query)
99 entries_i = self.search_by_indices(unicode_query, lang=lang)
101 # DEBUG CODE
102 print("READINGS:")
103 if len(entries_r) == 0:
104 print("No 'reading' results found.")
105 for ent_id, literal in entries_r:
106 try:
107 print(u"ID: %d, literal: %s" % (ent_id, literal))
108 except UnicodeEncodeError:
109 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
111 print("NANORI:")
112 if len(entries_n) == 0:
113 print("No 'nanori' results found.")
114 for ent_id, literal in entries_n:
115 try:
116 print(u"ID: %d, literal: %s" % (ent_id, literal))
117 except UnicodeEncodeError:
118 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
120 print("MEANINGS:")
121 if len(entries_m) == 0:
122 print("No 'meaning' results found.")
123 for ent_id, literal in entries_m:
124 try:
125 print(u"ID: %d, literal: %s" % (ent_id, literal))
126 except UnicodeEncodeError:
127 print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))
129 print("INDICES:")
130 if len(entries_i) == 0:
131 print("No indexed results found.")
132 for ent_id in entries_i:
133 print(u"ID: %d" % (ent_id,))
135 # Results: character IDs
136 results = list(sorted([row[0] for row in
137 (entries_r + entries_m + entries_n)]
138 + entries_i))
140 return results
142 def search_by_reading(self, query):
143 # reading -> rmgroup -> character
144 self.cursor.execute(
145 "SELECT id, literal FROM character WHERE id IN "
146 "(SELECT fk FROM rmgroup WHERE id IN "
147 "(SELECT fk FROM reading WHERE value LIKE ?))", (query,))
148 rows = self.cursor.fetchall()
149 return rows
151 def search_by_nanori(self, query):
152 # nanori -> character
153 self.cursor.execute(
154 "SELECT id, literal FROM character WHERE id IN "
155 "(SELECT fk FROM nanori WHERE value LIKE ?)", (query,))
156 rows = self.cursor.fetchall()
157 return rows
159 def search_by_meaning(self, query, lang=None):
160 # meaning -> rmgroup -> character
161 if lang is None:
162 self.cursor.execute(
163 "SELECT id, literal FROM character WHERE id IN "
164 "(SELECT fk FROM rmgroup WHERE id IN "
165 "(SELECT fk FROM meaning WHERE value LIKE ?))", (query,))
166 else:
167 self.cursor.execute(
168 "SELECT id, literal FROM character WHERE id IN "
169 "(SELECT fk FROM rmgroup WHERE id IN "
170 "(SELECT fk FROM meaning WHERE lang = ? AND value LIKE ?))",
171 (lang, query))
172 rows = self.cursor.fetchall()
173 return rows
175 def search_by_indices(self, query, lang=None):
176 # Get IDs from index table
177 # Note: lang is currently unused.
178 self.cursor.execute(
179 "SELECT character_id FROM kunyomi_lookup WHERE reading LIKE ?",
180 (query,))
181 rows = self.cursor.fetchall()
182 return [row[0] for row in rows]
184 def search_by_literal(self, literal):
185 # Not much of a "search", but avoids overlap with BaseDictionary.lookup.
186 self.cursor.execute("SELECT id FROM character WHERE literal = ?",
187 (literal,))
188 rows = self.cursor.fetchall()
189 if len(rows) < 1:
190 return None
191 else:
192 return rows[0][0]
194 def lookup(self, id):
195 return BaseDatabase.lookup(self, "character", id)
197 def _create_table_objects(self):
198 """Creates table objects.
200 Returns a dictionary of table name to table object.
203 class_mappings = {
204 "header": HeaderTable,
205 "character": CharacterTable,
206 "codepoint": TypeValueTable,
207 "radical": TypeValueTable,
208 "stroke_count": StrokeCountTable,
209 "variant": TypeValueTable,
210 "rad_name": KeyValueTable,
211 "dic_number": DicNumberTable,
212 "query_code": QueryCodeTable,
213 "rmgroup": RMGroupTable,
214 "reading": ReadingTable,
215 "meaning": MeaningTable,
216 "nanori": KeyValueTable,
219 # Create all table objects
220 table_mappings = {}
221 for tbl, cls in class_mappings.iteritems():
222 table_mappings[tbl] = cls(self.cursor, tbl)
224 return table_mappings
226 def _create_new_tables(self):
227 """(Re)creates the database tables."""
228 for tbl, tbl_obj in self.tables.iteritems():
229 self._drop_table(tbl)
230 tbl_obj.create()
232 def _populate_database(self, etree):
233 """Imports XML data into SQLite database.
235 table_d: table to table_object dictionary
236 etree: ElementTree object for KANJIDIC2
239 # Grab header
240 header = etree.find("header")
241 file_ver = header.find("file_version").text
242 db_ver = header.find("database_version").text
243 date = header.find("date_of_creation").text
244 self.tables['header'].insert(file_ver, db_ver, date)
246 # Iterate through characters
247 for character in etree.findall("character"):
248 # Character table
249 literal = character.find("literal").text
251 # Grab misc node - we'll store a few things from it in the
252 # main character table, too.
253 misc = character.find("misc")
254 grade = misc.find("grade")
255 grade = int(grade.text) if grade is not None else None
256 freq = misc.find("freq")
257 freq = int(freq.text) if freq is not None else None
258 jlpt = misc.find("jlpt")
259 jlpt = int(jlpt.text) if jlpt is not None else None
261 char_id = self.tables['character'].insert(literal, grade,
262 freq, jlpt)
264 table = self.tables['codepoint']
265 codepoint = character.find("codepoint")
266 for cp_value in codepoint.findall("cp_value"):
267 value = cp_value.text
268 cp_type = cp_value.get("cp_type")
269 table.insert(char_id, cp_type, value)
271 table = self.tables['radical']
272 radical = character.find("radical")
273 for rad_value in radical.findall("rad_value"):
274 value = rad_value.text
275 rad_type = rad_value.get("rad_type")
276 table.insert(char_id, rad_type, value)
278 # Tables generated from <misc> begin here
279 table = self.tables['stroke_count']
280 for stroke_count in misc.findall("stroke_count"):
281 count = int(stroke_count.text)
282 table.insert(char_id, count)
284 table = self.tables['variant']
285 for variant in misc.findall("variant"):
286 value = variant.text
287 var_type = variant.get("var_type")
288 table.insert(char_id, var_type, value)
290 table = self.tables['rad_name']
291 for rad_name in misc.findall("rad_name"):
292 value = rad_name.text
293 table.insert(char_id, value)
295 # Remaining direct descendents of <character>...
296 dic_number = character.find("dic_number")
297 if dic_number is not None:
298 table = self.tables['dic_number']
299 for dic_ref in dic_number.findall("dic_ref"):
300 dr_type = dic_ref.get("dr_type")
301 m_vol = dic_ref.get("m_vol", None)
302 m_page = dic_ref.get("m_page", None)
303 value = dic_ref.text
304 table.insert(char_id, dr_type, m_vol, m_page, value)
306 query_code = character.find("query_code")
307 if query_code is not None:
308 table = self.tables['query_code']
309 for q_code in query_code.findall("q_code"):
310 qc_type = q_code.get("qc_type")
311 skip_misclass = q_code.get("skip_misclass", None)
312 value = q_code.text
313 table.insert(char_id, qc_type, skip_misclass, value)
315 reading_meaning = character.find("reading_meaning")
316 if reading_meaning is not None:
317 table = self.tables['rmgroup']
318 for rmgroup in reading_meaning.findall("rmgroup"):
319 group_id = table.insert(char_id)
320 table = self.tables['reading']
321 for reading in rmgroup.findall("reading"):
322 r_type = reading.get("r_type")
323 on_type = reading.get("on_type")
324 r_status = reading.get("r_status")
325 value = reading.text
326 table.insert(group_id, r_type, on_type, r_status, value)
327 table = self.tables['meaning']
328 for meaning in rmgroup.findall("meaning"):
329 lang = meaning.get("m_lang", "en")
330 value = meaning.text
331 table.insert(group_id, lang, value)
332 table = self.tables['nanori']
333 for nanori in reading_meaning.findall("nanori"):
334 table.insert(char_id, nanori.text)
336 def _drop_table(self, name):
337 self.cursor.execute("DROP TABLE IF EXISTS %s" % name)
339 def _create_index_tables(self):
340 """Creates extra tables to help with common searches.
342 Supplementary tables include:
344 1. Reading search table: kun-yomi to character ID. Kun-yomi
345 is modified for easier searching (no "." or "-" markers).
348 self._create_reading_search_table()
350 def _create_reading_search_table(self):
351 """Creates "sanitized" reading to character ID search table."""
353 # Mapping is from reading to character ID...
354 # r.fk -> rg.id, rg.fk -> c.id.
355 query = (
356 "SELECT r.value, c.id "
357 "FROM reading r, rmgroup rg, character c "
358 'WHERE r.type = "ja_kun" AND r.fk = rg.id AND rg.fk = c.id'
360 self.cursor.execute(query)
361 rows = self.cursor.fetchall()
362 values, ids = zip(*rows) # unzip idiom (see zip doc)
364 # Sanitize strings by removing "." and "-"
365 values = [value.replace(u".", u"").replace(u"-", u"")
366 for value in values]
368 # Create new table
369 tbl_name = "kunyomi_lookup"
370 self.tables[tbl_name] = tbl = ReadingLookupTable(self.cursor, tbl_name)
371 self._drop_table(tbl_name)
372 tbl.create()
374 # Store all sanitized strings and their keys in the table
375 rows = zip(values, ids)
376 tbl.insertmany(rows)
379 ######################################################################
380 # KANJIDIC2 data tables
381 ######################################################################
384 class HeaderTable(Table):
385 create_query = ("CREATE TABLE %s "
386 "(file_version TEXT, "
387 "database_version TEXT, "
388 "date_of_creation TEXT)")
389 insert_query = "INSERT INTO %s VALUES (?, ?, ?)"
392 class CharacterTable(Table):
393 create_query = ("CREATE TABLE %s "
394 "(id INTEGER PRIMARY KEY, literal TEXT, "
395 "grade INTEGER, freq INTEGER, jlpt INTEGER)")
396 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
397 index_queries = [
398 "CREATE INDEX %s_literal ON %s (literal)",
402 class TypeValueTable(ChildTable):
403 create_query = ("CREATE TABLE %s "
404 "(id INTEGER PRIMARY KEY, fk INTEGER, "
405 "type TEXT, value TEXT)")
406 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
407 index_queries = [
408 "CREATE INDEX %s_fk ON %s (fk)",
412 class StrokeCountTable(ChildTable):
413 create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, "
414 "fk INTEGER, count INTEGER)")
415 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
416 index_queries = [
417 "CREATE INDEX %s_fk ON %s (fk)",
421 class DicNumberTable(ChildTable):
422 create_query = ("CREATE TABLE %s "
423 "(id INTEGER PRIMARY KEY, fk INTEGER, "
424 "type TEXT, m_vol TEXT, m_page TEXT, value TEXT)")
425 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
426 index_queries = [
427 "CREATE INDEX %s_fk ON %s (fk)",
431 class QueryCodeTable(ChildTable):
432 create_query = ("CREATE TABLE %s "
433 "(id INTEGER PRIMARY KEY, fk INTEGER, "
434 "type TEXT, skip_misclass TEXT, value TEXT)")
435 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
436 index_queries = [
437 "CREATE INDEX %s_fk ON %s (fk)",
441 class RMGroupTable(ChildTable):
442 create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
443 insert_query = "INSERT INTO %s VALUES (NULL, ?)"
444 index_queries = [
445 "CREATE INDEX %s_fk ON %s (fk)",
448 class ReadingTable(ChildTable):
449 create_query = ("CREATE TABLE %s "
450 "(id INTEGER PRIMARY KEY, fk INTEGER, "
451 "type TEXT, on_type TEXT, r_status TEXT, value TEXT)")
452 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
453 index_queries = [
454 "CREATE INDEX %s_fk ON %s (fk)",
455 "CREATE INDEX %s_value ON %s (value)",
459 class MeaningTable(ChildTable):
460 create_query = ("CREATE TABLE %s "
461 "(id INTEGER PRIMARY KEY, fk INTEGER, "
462 "lang TEXT, value TEXT)")
463 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
464 index_queries = [
465 "CREATE INDEX %s_fk ON %s (fk)",
466 "CREATE INDEX %s_lang_value ON %s (lang, value)",
470 ######################################################################
471 # Index tables (not part of actual KANJIDIC2)
472 ######################################################################
475 class ReadingLookupTable(Table):
476 """Maps reading to character IDs."""
477 # Used for: kunyomi (KANJIDIC2 r_type==ja_kun)
478 create_query = ("CREATE TABLE %s "
479 "(id INTEGER PRIMARY KEY, "
480 "reading TEXT, character_id INTEGER)")
481 insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
482 index_queries = [
483 "CREATE INDEX %s_reading ON %s (reading)",
488 ######################################################################
490 def parse_args():
491 from optparse import OptionParser
492 op = OptionParser(usage="%prog [options] <db_filename> [search_query]")
493 op.add_option("-i", "--initialize",
494 dest="init_fname", metavar="XML_SOURCE",
495 help=_("Initialize database from file."))
496 op.add_option("-s", "--search", action="store_true",
497 help=_("Search for kanji by readings or meanings"))
498 op.add_option("-l", "--lookup", action="store_true",
499 help=_("Look up exact character"))
500 op.add_option("-L", "--lang",
501 help=_("Specify preferred language for searching."))
502 options, args = op.parse_args()
503 if len(args) < 1:
504 op.print_help()
505 exit(-1)
506 if options.lookup and options.search:
507 print(_("Cannot --lookup and --search at the same time."),
508 file=sys.stderr)
509 exit(-1)
510 return (options, args)
512 def main():
513 options, args = parse_args()
514 db_fname = args[0]
516 if options.init_fname is not None:
517 db = Database(db_fname, init_from_file=options.init_fname)
518 else:
519 db = Database(db_fname)
521 results = []
522 if len(args) <= 1:
523 # No search was requested; we can exit here.
524 return
526 if options.search == True:
527 # Do search
528 # To be nice, we'll join all remaining args with spaces.
529 search_query = " ".join(args[1:])
531 if options.lang is not None:
532 results = db.search(search_query, lang=options.lang)
533 else:
534 results = db.search(search_query)
535 elif options.lookup == True:
536 # Do lookup
537 encoding = get_encoding()
538 lookup_query = args[1].decode(encoding)
539 results = []
540 for character in lookup_query:
541 result = db.search_by_literal(character)
542 if result is not None:
543 results.append(result)
544 else:
545 # No lookup
546 print(_("For searches or lookups, the --search or --lookup flag is "
547 "required."))
548 return
550 # To do: visualize results
551 # Not as important; now we know we can at least do our needed
552 # lookups...
553 if len(results) > 0:
554 from pprint import pprint
555 # DEBUG: until lookup_by_id is implemented, this will work.
556 pprint(results)
557 for result in results:
558 entry = db.lookup(result)
559 pprint(entry)
560 print()
561 else:
562 print(_("No results found."))
564 if __name__ == "__main__":
565 main()