1 # -*- coding: utf-8 -*-
2 from __future__
import print_function
3 from __future__
import with_statement
5 import os
, sys
, re
, sqlite3
, time
6 from cStringIO
import StringIO
7 from xml
.etree
.cElementTree
import ElementTree
8 from helpers
import gzread
9 from db
import Database
as BaseDatabase
10 from table
import Table
, ChildTable
, KeyValueTable
13 #t = gettext.translation("jblite")
15 gettext
.install("jblite")
18 # This method of getting the encoding might not be the best...
19 # but it works for now, and avoids hacks with
21 get_encoding
= sys
.getfilesystemencoding
26 def __init__(self
, record
):
29 def __unicode__(self
):
30 """Basic string representation of the entry."""
32 return unicode(self
._record
)
35 return repr(self
._record
)
38 class Database(BaseDatabase
):
40 """Top level object for SQLite 3-based KANJIDIC2 database."""
60 def __init__(self
, filename
, init_from_file
=None):
61 self
.conn
= sqlite3
.connect(filename
)
62 self
.conn
.row_factory
= sqlite3
.Row
# keyword accessors for rows
63 self
.cursor
= self
.conn
.cursor()
64 self
.tables
= self
._create
_table
_objects
()
65 if init_from_file
is not None:
66 raw_data
= gzread(init_from_file
)
68 infile
= StringIO(raw_data
)
69 etree
= ElementTree(file=infile
)
72 # Create the core database
73 self
._create
_new
_tables
()
74 self
._populate
_database
(etree
)
77 # Create supplemental indices
78 self
._create
_index
_tables
()
81 def search(self
, query
, lang
=None):
82 encoding
= get_encoding()
83 wrapped_query
= "%%%s%%" % query
# Wrap in wildcards
84 unicode_query
= wrapped_query
.decode(encoding
)
87 print(u
"Searching for \"%s\", lang=%s..." %
88 (unicode_query
, repr(lang
)),
91 # Do some search stuff here...
95 entries_r
= self
.search_by_reading(unicode_query
)
96 entries_m
= self
.search_by_meaning(unicode_query
,
98 entries_n
= self
.search_by_nanori(unicode_query
)
99 entries_i
= self
.search_by_indices(unicode_query
, lang
=lang
)
103 if len(entries_r
) == 0:
104 print("No 'reading' results found.")
105 for ent_id
, literal
in entries_r
:
107 print(u
"ID: %d, literal: %s" % (ent_id
, literal
))
108 except UnicodeEncodeError:
109 print(u
"ID: %d, literal (repr): %s" % (ent_id
, repr(literal
)))
112 if len(entries_n
) == 0:
113 print("No 'nanori' results found.")
114 for ent_id
, literal
in entries_n
:
116 print(u
"ID: %d, literal: %s" % (ent_id
, literal
))
117 except UnicodeEncodeError:
118 print(u
"ID: %d, literal (repr): %s" % (ent_id
, repr(literal
)))
121 if len(entries_m
) == 0:
122 print("No 'meaning' results found.")
123 for ent_id
, literal
in entries_m
:
125 print(u
"ID: %d, literal: %s" % (ent_id
, literal
))
126 except UnicodeEncodeError:
127 print(u
"ID: %d, literal (repr): %s" % (ent_id
, repr(literal
)))
130 if len(entries_i
) == 0:
131 print("No indexed results found.")
132 for ent_id
in entries_i
:
133 print(u
"ID: %d" % (ent_id
,))
135 # Results: character IDs
136 results
= list(sorted([row
[0] for row
in
137 (entries_r
+ entries_m
+ entries_n
)]
142 def search_by_reading(self
, query
):
143 # reading -> rmgroup -> character
145 "SELECT id, literal FROM character WHERE id IN "
146 "(SELECT fk FROM rmgroup WHERE id IN "
147 "(SELECT fk FROM reading WHERE value LIKE ?))", (query
,))
148 rows
= self
.cursor
.fetchall()
151 def search_by_nanori(self
, query
):
152 # nanori -> character
154 "SELECT id, literal FROM character WHERE id IN "
155 "(SELECT fk FROM nanori WHERE value LIKE ?)", (query
,))
156 rows
= self
.cursor
.fetchall()
159 def search_by_meaning(self
, query
, lang
=None):
160 # meaning -> rmgroup -> character
163 "SELECT id, literal FROM character WHERE id IN "
164 "(SELECT fk FROM rmgroup WHERE id IN "
165 "(SELECT fk FROM meaning WHERE value LIKE ?))", (query
,))
168 "SELECT id, literal FROM character WHERE id IN "
169 "(SELECT fk FROM rmgroup WHERE id IN "
170 "(SELECT fk FROM meaning WHERE lang = ? AND value LIKE ?))",
172 rows
= self
.cursor
.fetchall()
175 def search_by_indices(self
, query
, lang
=None):
176 # Get IDs from index table
177 # Note: lang is currently unused.
179 "SELECT character_id FROM kunyomi_lookup WHERE reading LIKE ?",
181 rows
= self
.cursor
.fetchall()
182 return [row
[0] for row
in rows
]
184 def search_by_literal(self
, literal
):
185 # Not much of a "search", but avoids overlap with BaseDictionary.lookup.
186 self
.cursor
.execute("SELECT id FROM character WHERE literal = ?",
188 rows
= self
.cursor
.fetchall()
194 def lookup(self
, id):
195 return BaseDatabase
.lookup(self
, "character", id)
197 def _create_table_objects(self
):
198 """Creates table objects.
200 Returns a dictionary of table name to table object.
204 "header": HeaderTable
,
205 "character": CharacterTable
,
206 "codepoint": TypeValueTable
,
207 "radical": TypeValueTable
,
208 "stroke_count": StrokeCountTable
,
209 "variant": TypeValueTable
,
210 "rad_name": KeyValueTable
,
211 "dic_number": DicNumberTable
,
212 "query_code": QueryCodeTable
,
213 "rmgroup": RMGroupTable
,
214 "reading": ReadingTable
,
215 "meaning": MeaningTable
,
216 "nanori": KeyValueTable
,
219 # Create all table objects
221 for tbl
, cls
in class_mappings
.iteritems():
222 table_mappings
[tbl
] = cls(self
.cursor
, tbl
)
224 return table_mappings
226 def _create_new_tables(self
):
227 """(Re)creates the database tables."""
228 for tbl
, tbl_obj
in self
.tables
.iteritems():
229 self
._drop
_table
(tbl
)
232 def _populate_database(self
, etree
):
233 """Imports XML data into SQLite database.
235 table_d: table to table_object dictionary
236 etree: ElementTree object for KANJIDIC2
240 header
= etree
.find("header")
241 file_ver
= header
.find("file_version").text
242 db_ver
= header
.find("database_version").text
243 date
= header
.find("date_of_creation").text
244 self
.tables
['header'].insert(file_ver
, db_ver
, date
)
246 # Iterate through characters
247 for character
in etree
.findall("character"):
249 literal
= character
.find("literal").text
251 # Grab misc node - we'll store a few things from it in the
252 # main character table, too.
253 misc
= character
.find("misc")
254 grade
= misc
.find("grade")
255 grade
= int(grade
.text
) if grade
is not None else None
256 freq
= misc
.find("freq")
257 freq
= int(freq
.text
) if freq
is not None else None
258 jlpt
= misc
.find("jlpt")
259 jlpt
= int(jlpt
.text
) if jlpt
is not None else None
261 char_id
= self
.tables
['character'].insert(literal
, grade
,
264 table
= self
.tables
['codepoint']
265 codepoint
= character
.find("codepoint")
266 for cp_value
in codepoint
.findall("cp_value"):
267 value
= cp_value
.text
268 cp_type
= cp_value
.get("cp_type")
269 table
.insert(char_id
, cp_type
, value
)
271 table
= self
.tables
['radical']
272 radical
= character
.find("radical")
273 for rad_value
in radical
.findall("rad_value"):
274 value
= rad_value
.text
275 rad_type
= rad_value
.get("rad_type")
276 table
.insert(char_id
, rad_type
, value
)
278 # Tables generated from <misc> begin here
279 table
= self
.tables
['stroke_count']
280 for stroke_count
in misc
.findall("stroke_count"):
281 count
= int(stroke_count
.text
)
282 table
.insert(char_id
, count
)
284 table
= self
.tables
['variant']
285 for variant
in misc
.findall("variant"):
287 var_type
= variant
.get("var_type")
288 table
.insert(char_id
, var_type
, value
)
290 table
= self
.tables
['rad_name']
291 for rad_name
in misc
.findall("rad_name"):
292 value
= rad_name
.text
293 table
.insert(char_id
, value
)
295 # Remaining direct descendents of <character>...
296 dic_number
= character
.find("dic_number")
297 if dic_number
is not None:
298 table
= self
.tables
['dic_number']
299 for dic_ref
in dic_number
.findall("dic_ref"):
300 dr_type
= dic_ref
.get("dr_type")
301 m_vol
= dic_ref
.get("m_vol", None)
302 m_page
= dic_ref
.get("m_page", None)
304 table
.insert(char_id
, dr_type
, m_vol
, m_page
, value
)
306 query_code
= character
.find("query_code")
307 if query_code
is not None:
308 table
= self
.tables
['query_code']
309 for q_code
in query_code
.findall("q_code"):
310 qc_type
= q_code
.get("qc_type")
311 skip_misclass
= q_code
.get("skip_misclass", None)
313 table
.insert(char_id
, qc_type
, skip_misclass
, value
)
315 reading_meaning
= character
.find("reading_meaning")
316 if reading_meaning
is not None:
317 table
= self
.tables
['rmgroup']
318 for rmgroup
in reading_meaning
.findall("rmgroup"):
319 group_id
= table
.insert(char_id
)
320 table
= self
.tables
['reading']
321 for reading
in rmgroup
.findall("reading"):
322 r_type
= reading
.get("r_type")
323 on_type
= reading
.get("on_type")
324 r_status
= reading
.get("r_status")
326 table
.insert(group_id
, r_type
, on_type
, r_status
, value
)
327 table
= self
.tables
['meaning']
328 for meaning
in rmgroup
.findall("meaning"):
329 lang
= meaning
.get("m_lang", "en")
331 table
.insert(group_id
, lang
, value
)
332 table
= self
.tables
['nanori']
333 for nanori
in reading_meaning
.findall("nanori"):
334 table
.insert(char_id
, nanori
.text
)
336 def _drop_table(self
, name
):
337 self
.cursor
.execute("DROP TABLE IF EXISTS %s" % name
)
339 def _create_index_tables(self
):
340 """Creates extra tables to help with common searches.
342 Supplementary tables include:
344 1. Reading search table: kun-yomi to character ID. Kun-yomi
345 is modified for easier searching (no "." or "-" markers).
348 self
._create
_reading
_search
_table
()
350 def _create_reading_search_table(self
):
351 """Creates "sanitized" reading to character ID search table."""
353 # Mapping is from reading to character ID...
354 # r.fk -> rg.id, rg.fk -> c.id.
356 "SELECT r.value, c.id "
357 "FROM reading r, rmgroup rg, character c "
358 'WHERE r.type = "ja_kun" AND r.fk = rg.id AND rg.fk = c.id'
360 self
.cursor
.execute(query
)
361 rows
= self
.cursor
.fetchall()
362 values
, ids
= zip(*rows
) # unzip idiom (see zip doc)
364 # Sanitize strings by removing "." and "-"
365 values
= [value
.replace(u
".", u
"").replace(u
"-", u
"")
369 tbl_name
= "kunyomi_lookup"
370 self
.tables
[tbl_name
] = tbl
= ReadingLookupTable(self
.cursor
, tbl_name
)
371 self
._drop
_table
(tbl_name
)
374 # Store all sanitized strings and their keys in the table
375 rows
= zip(values
, ids
)
379 ######################################################################
380 # KANJIDIC2 data tables
381 ######################################################################
384 class HeaderTable(Table
):
385 create_query
= ("CREATE TABLE %s "
386 "(file_version TEXT, "
387 "database_version TEXT, "
388 "date_of_creation TEXT)")
389 insert_query
= "INSERT INTO %s VALUES (?, ?, ?)"
392 class CharacterTable(Table
):
393 create_query
= ("CREATE TABLE %s "
394 "(id INTEGER PRIMARY KEY, literal TEXT, "
395 "grade INTEGER, freq INTEGER, jlpt INTEGER)")
396 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
398 "CREATE INDEX %s_literal ON %s (literal)",
402 class TypeValueTable(ChildTable
):
403 create_query
= ("CREATE TABLE %s "
404 "(id INTEGER PRIMARY KEY, fk INTEGER, "
405 "type TEXT, value TEXT)")
406 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
408 "CREATE INDEX %s_fk ON %s (fk)",
412 class StrokeCountTable(ChildTable
):
413 create_query
= ("CREATE TABLE %s (id INTEGER PRIMARY KEY, "
414 "fk INTEGER, count INTEGER)")
415 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?)"
417 "CREATE INDEX %s_fk ON %s (fk)",
421 class DicNumberTable(ChildTable
):
422 create_query
= ("CREATE TABLE %s "
423 "(id INTEGER PRIMARY KEY, fk INTEGER, "
424 "type TEXT, m_vol TEXT, m_page TEXT, value TEXT)")
425 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
427 "CREATE INDEX %s_fk ON %s (fk)",
431 class QueryCodeTable(ChildTable
):
432 create_query
= ("CREATE TABLE %s "
433 "(id INTEGER PRIMARY KEY, fk INTEGER, "
434 "type TEXT, skip_misclass TEXT, value TEXT)")
435 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
437 "CREATE INDEX %s_fk ON %s (fk)",
441 class RMGroupTable(ChildTable
):
442 create_query
= ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
443 insert_query
= "INSERT INTO %s VALUES (NULL, ?)"
445 "CREATE INDEX %s_fk ON %s (fk)",
448 class ReadingTable(ChildTable
):
449 create_query
= ("CREATE TABLE %s "
450 "(id INTEGER PRIMARY KEY, fk INTEGER, "
451 "type TEXT, on_type TEXT, r_status TEXT, value TEXT)")
452 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
454 "CREATE INDEX %s_fk ON %s (fk)",
455 "CREATE INDEX %s_value ON %s (value)",
459 class MeaningTable(ChildTable
):
460 create_query
= ("CREATE TABLE %s "
461 "(id INTEGER PRIMARY KEY, fk INTEGER, "
462 "lang TEXT, value TEXT)")
463 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
465 "CREATE INDEX %s_fk ON %s (fk)",
466 "CREATE INDEX %s_lang_value ON %s (lang, value)",
470 ######################################################################
471 # Index tables (not part of actual KANJIDIC2)
472 ######################################################################
475 class ReadingLookupTable(Table
):
476 """Maps reading to character IDs."""
477 # Used for: kunyomi (KANJIDIC2 r_type==ja_kun)
478 create_query
= ("CREATE TABLE %s "
479 "(id INTEGER PRIMARY KEY, "
480 "reading TEXT, character_id INTEGER)")
481 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?)"
483 "CREATE INDEX %s_reading ON %s (reading)",
488 ######################################################################
491 from optparse
import OptionParser
492 op
= OptionParser(usage
="%prog [options] <db_filename> [search_query]")
493 op
.add_option("-i", "--initialize",
494 dest
="init_fname", metavar
="XML_SOURCE",
495 help=_("Initialize database from file."))
496 op
.add_option("-s", "--search", action
="store_true",
497 help=_("Search for kanji by readings or meanings"))
498 op
.add_option("-l", "--lookup", action
="store_true",
499 help=_("Look up exact character"))
500 op
.add_option("-L", "--lang",
501 help=_("Specify preferred language for searching."))
502 options
, args
= op
.parse_args()
506 if options
.lookup
and options
.search
:
507 print(_("Cannot --lookup and --search at the same time."),
510 return (options
, args
)
513 options
, args
= parse_args()
516 if options
.init_fname
is not None:
517 db
= Database(db_fname
, init_from_file
=options
.init_fname
)
519 db
= Database(db_fname
)
523 # No search was requested; we can exit here.
526 if options
.search
== True:
528 # To be nice, we'll join all remaining args with spaces.
529 search_query
= " ".join(args
[1:])
531 if options
.lang
is not None:
532 results
= db
.search(search_query
, lang
=options
.lang
)
534 results
= db
.search(search_query
)
535 elif options
.lookup
== True:
537 encoding
= get_encoding()
538 lookup_query
= args
[1].decode(encoding
)
540 for character
in lookup_query
:
541 result
= db
.search_by_literal(character
)
542 if result
is not None:
543 results
.append(result
)
546 print(_("For searches or lookups, the --search or --lookup flag is "
550 # To do: visualize results
551 # Not as important; now we know we can at least do our needed
554 from pprint
import pprint
555 # DEBUG: until lookup_by_id is implemented, this will work.
557 for result
in results
:
558 entry
= db
.lookup(result
)
562 print(_("No results found."))
564 if __name__
== "__main__":