1 # -*- coding: utf-8 -*-
2 from __future__
import print_function
3 from __future__
import with_statement
5 import os
, sys
, re
, sqlite3
, time
6 from cStringIO
import StringIO
7 from xml
.etree
.cElementTree
import ElementTree
8 from helpers
import gzread
9 from db
import Database
as BaseDatabase
10 from table
import Table
, ChildTable
, KeyValueTable
#t = gettext.translation("jblite")
# Install the "jblite" translation domain; gettext.install() places _()
# in the builtins namespace, which is how _() is used further down.
gettext.install("jblite")

# This method of getting the encoding might not be the best...
# but it works for now, and avoids hacks with
# NOTE(review): the comment above ends mid-sentence in this copy of the
# file; restore the missing remainder if it is known.
# get_encoding() returns the name of the file system encoding; it is
# used below to decode command line text and to encode printed output.
get_encoding = sys.getfilesystemencoding
def convert_kunyomi(coded_str):
    """Converts a kunyomi string with ./- to a user-friendly format.

    Specifically, two things occur:

    1. Strings are split on the first '.' and the right half, if any,
       is enclosed in parentheses.

    2. '-' is replaced with '~'.

    coded_str: unicode reading string as stored in the database.

    Returns the converted unicode string.

    """
    pieces = coded_str.split(u'.', 1)
    if len(pieces) > 1:
        intermediate = u"".join([pieces[0], u"(", pieces[1], u")"])
    else:
        # No '.' in the input: there is no right half to parenthesize.
        intermediate = pieces[0]
    return intermediate.replace(u"-", u"~")
def get_jouyou_str(grade_int):
    """Converts an integer Jouyou grade code into a string.

    grade_int: integer grade code as stored in the character table.

    Returns the code rendered as a string.  No per-grade wording is
    applied; the number is simply stringified so that callers always
    receive text rather than None.

    """
    return str(grade_int)
def __init__(self, record):
    """Wrap a database record for display.

    record: the character record; the sibling methods read it back
        through ``self._record`` (``.data[...]`` and
        ``.find_children(...)``).

    """
    self._record = record
def __unicode__(self):
    """Basic string representation of the entry.

    Builds one line per piece of information (literal, reading/meaning
    groups, nanori, stroke count, frequency, Jouyou grade, JLPT grade)
    and returns them joined with newlines as a unicode string.
    Optional fields that are absent (None / no child records) are
    skipped.

    """
    char_rec = self._record
    lines = []

    literal = char_rec.data["literal"]
    lines.append(_(u"Literal: %s (0x%X)") % (literal, ord(literal)))

    rmgroup_recs = char_rec.find_children("rmgroup")
    for i, rmgroup_rec in enumerate(rmgroup_recs):
        group_index = i + 1  # groups are shown 1-based
        lines.append(_(u"Group %d:") % group_index)

        reading_recs = rmgroup_rec.find_children("reading")

        kunyomi = [r.data['value'] for r in reading_recs
                   if r.data["type"] == "ja_kun"]
        # kun-yomi needs ./- translated
        kunyomi = map(convert_kunyomi, kunyomi)
        lines.append(_(u" Kun-yomi: %s") % u"、".join(kunyomi))

        onyomi = [r.data['value'] for r in reading_recs
                  if r.data["type"] == "ja_on"]
        lines.append(_(u" On-yomi: %s") % u"、".join(onyomi))

        # Group the meanings by language so that each language is
        # printed on its own line, in sorted language order.
        meaning_recs = rmgroup_rec.find_children("meaning")
        meaning_d = {}
        for r in meaning_recs:
            meanings = meaning_d.setdefault(r.data['lang'], [])
            meanings.append(r.data['value'])
        for lang in sorted(meaning_d.keys()):
            meanings = meaning_d[lang]
            meaning_str = "; ".join(meanings)
            lines.append(_(u" Meanings (%s): %s") % (lang, meaning_str))

    nanori_recs = char_rec.find_children("nanori")
    if len(nanori_recs) > 0:
        nanori = [r.data["value"] for r in nanori_recs]
        nanori_str = u"、".join(nanori)
        lines.append(_(u"Nanori: %s") % nanori_str)

    stroke_recs = char_rec.find_children("stroke_count")
    strokes = [r.data['count'] for r in stroke_recs]
    if len(strokes) == 1:
        lines.append(_(u"Stroke count: %d") % strokes[0])
    elif len(strokes) > 1:
        # The first count is shown as the stroke count; any further
        # ones are listed as miscounts.
        miscounts = ", ".join(map(str, strokes[1:]))
        lines.append(_(u"Stroke count: %d (miscounts: %s)") %
                     (strokes[0], miscounts))
    else:
        pass  # No stroke count info; don't print anything

    # freq, grade and jlpt are stored as None when the source entry
    # has no such element, so each is only printed when present.
    freq = char_rec.data["freq"]
    if freq is not None:
        lines.append(_(u"Frequency: %d") % freq)

    grade = char_rec.data["grade"]
    if grade is not None:
        # Jouyou grade codes has special meanings; a conversion is
        # needed.
        grade_str = get_jouyou_str(grade)
        lines.append(_(u"Jouyou grade: %s") % grade_str)

    jlpt = char_rec.data["jlpt"]
    if jlpt is not None:
        lines.append(_(u"JLPT grade: %d") % jlpt)

    return u"\n".join(lines)
def __repr__(self):
    """Return the repr() of the wrapped record."""
    return repr(self._record)
class Database(BaseDatabase):

    """Top level object for SQLite 3-based KANJIDIC2 database."""

    def __init__(self, filename, init_from_file=None):
        """Open the database, optionally (re)building it from XML.

        filename: path of the SQLite database file to connect to.
        init_from_file: optional path of a KANJIDIC2 XML source, read
            through ``gzread``.  When given, all tables are dropped,
            recreated and populated from it, after which the lookup
            index table is built.

        """
        self.conn = sqlite3.connect(filename)
        self.conn.row_factory = sqlite3.Row  # keyword accessors for rows
        self.cursor = self.conn.cursor()
        self.tables = self._create_table_objects()
        if init_from_file is not None:
            raw_data = gzread(init_from_file)

            infile = StringIO(raw_data)
            try:
                etree = ElementTree(file=infile)
            finally:
                infile.close()

            # Create the core database
            self._create_new_tables()
            self._populate_database(etree)
            # Without a commit the imported rows are discarded when the
            # connection is closed.
            self.conn.commit()

            # Create supplemental indices
            self._create_index_tables()
            self.conn.commit()

    def search(self, query, lang=None, options=None):
        """Search readings, meanings, nanori and the kun-yomi index.

        query: search text; byte strings are decoded with the encoding
            reported by ``get_encoding()``, unicode is used as is.  The
            text is wrapped in ``%`` wildcards for a LIKE match.
        lang: optional language code restricting the meaning search.
        options: optional object with a ``verbose`` attribute; when
            truthy, per-category debug output is printed.

        Returns the list of ``self.lookup(id)`` results for every
        matching character ID, each ID once, in ascending ID order.

        """
        encoding = get_encoding()
        wrapped_query = "%%%s%%" % query  # Wrap in wildcards
        if isinstance(wrapped_query, bytes):
            unicode_query = wrapped_query.decode(encoding)
        else:
            # Already unicode; decoding again would force an implicit
            # ASCII encode and fail on non-ASCII text.
            unicode_query = wrapped_query

        verbose = (options is not None) and bool(options.verbose)

        if verbose and os.name == "nt":
            print(u"Searching for \"%s\", lang=%s..." %
                  (unicode_query, repr(lang)),
                  file=sys.stderr)

        # Do some search stuff here...
        entries_r = self._search_by_reading(unicode_query)
        entries_m = self._search_by_meaning(unicode_query,
                                            lang=lang)
        entries_n = self._search_by_nanori(unicode_query)
        entries_i = self._search_by_indices(unicode_query, lang=lang)

        if verbose:
            # Debug output only; the caller is responsible for
            # presenting the actual results.
            self._print_matches("reading", entries_r)
            self._print_matches("nanori", entries_n)
            self._print_matches("meaning", entries_m)
            if len(entries_i) == 0:
                print("No indexed results found.")
            for ent_id in entries_i:
                print(u"ID: %d" % (ent_id,))

        # Get list of unique character IDs
        char_ids = set()
        for lst in (entries_r, entries_m, entries_n):
            char_ids.update(row[0] for row in lst)
        char_ids.update(entries_i)

        return [self.lookup(char_id) for char_id in sorted(char_ids)]

    @staticmethod
    def _print_matches(label, entries):
        """Print the (id, literal) rows found for one search category."""
        if len(entries) == 0:
            print("No '%s' results found." % label)
        for ent_id, literal in entries:
            try:
                print(u"ID: %d, literal: %s" % (ent_id, literal))
            except UnicodeEncodeError:
                # Output stream cannot represent the character; fall
                # back to an escaped form.
                print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))

    def _search_by_reading(self, query):
        """Return (id, literal) rows of characters with a matching reading."""
        # reading -> rmgroup -> character
        self.cursor.execute(
            "SELECT id, literal FROM character WHERE id IN "
            "(SELECT fk FROM rmgroup WHERE id IN "
            "(SELECT fk FROM reading WHERE value LIKE ?))", (query,))
        return self.cursor.fetchall()

    def _search_by_nanori(self, query):
        """Return (id, literal) rows of characters with a matching nanori."""
        # nanori -> character
        self.cursor.execute(
            "SELECT id, literal FROM character WHERE id IN "
            "(SELECT fk FROM nanori WHERE value LIKE ?)", (query,))
        return self.cursor.fetchall()

    def _search_by_meaning(self, query, lang=None):
        """Return (id, literal) rows of characters with a matching meaning.

        When lang is given, only meanings stored under that language
        are considered.

        """
        # meaning -> rmgroup -> character
        if lang is None:
            self.cursor.execute(
                "SELECT id, literal FROM character WHERE id IN "
                "(SELECT fk FROM rmgroup WHERE id IN "
                "(SELECT fk FROM meaning WHERE value LIKE ?))", (query,))
        else:
            self.cursor.execute(
                "SELECT id, literal FROM character WHERE id IN "
                "(SELECT fk FROM rmgroup WHERE id IN "
                "(SELECT fk FROM meaning WHERE lang = ? AND value LIKE ?))",
                (lang, query))
        return self.cursor.fetchall()

    def _search_by_indices(self, query, lang=None):
        """Return character IDs whose sanitized kun-yomi matches query."""
        # Get IDs from index table
        # Note: lang is currently unused.
        self.cursor.execute(
            "SELECT character_id FROM kunyomi_lookup WHERE reading LIKE ?",
            (query,))
        rows = self.cursor.fetchall()
        return [row[0] for row in rows]

    def search_by_literal(self, literal):
        """Look up the entry for one exact character.

        Returns the ``lookup`` result for the first character row whose
        literal equals the argument, or None if there is no such row.

        """
        # Not much of a "search", but avoids overlap with BaseDictionary.lookup.
        self.cursor.execute("SELECT id FROM character WHERE literal = ?",
                            (literal,))
        rows = self.cursor.fetchall()
        if len(rows) < 1:
            return None
        char_id = rows[0][0]
        return self.lookup(char_id)

    def lookup(self, id):
        """Fetch the "character" record with the given ID via the base class."""
        return BaseDatabase.lookup(self, "character", id)

    def _create_table_objects(self):
        """Creates table objects.

        Returns a dictionary of table name to table object.

        """
        class_mappings = {
            "header": HeaderTable,
            "character": CharacterTable,
            "codepoint": TypeValueTable,
            "radical": TypeValueTable,
            "stroke_count": StrokeCountTable,
            "variant": TypeValueTable,
            "rad_name": KeyValueTable,
            "dic_number": DicNumberTable,
            "query_code": QueryCodeTable,
            "rmgroup": RMGroupTable,
            "reading": ReadingTable,
            "meaning": MeaningTable,
            "nanori": KeyValueTable,
            }

        # Create all table objects
        table_mappings = {}
        for tbl, cls in class_mappings.items():
            table_mappings[tbl] = cls(self.cursor, tbl)

        return table_mappings

    def _create_new_tables(self):
        """(Re)creates the database tables."""
        for tbl, tbl_obj in self.tables.items():
            self._drop_table(tbl)
            # NOTE(review): assumes the Table base class (table module,
            # not visible here) provides create() -- confirm.
            tbl_obj.create()

    @staticmethod
    def _int_or_none(node):
        """Return int(node.text), or None when the element is absent."""
        return int(node.text) if node is not None else None

    def _populate_database(self, etree):
        """Imports XML data into SQLite database.

        etree: ElementTree object for KANJIDIC2

        Rows are inserted through the table objects in ``self.tables``;
        nothing is committed here.

        """
        tables = self.tables

        # Grab header
        header = etree.find("header")
        file_ver = header.find("file_version").text
        db_ver = header.find("database_version").text
        date = header.find("date_of_creation").text
        tables['header'].insert(file_ver, db_ver, date)

        # Iterate through characters
        for character in etree.findall("character"):
            # Character table
            literal = character.find("literal").text

            # Grab misc node - we'll store a few things from it in the
            # main character table, too.
            misc = character.find("misc")
            grade = self._int_or_none(misc.find("grade"))
            freq = self._int_or_none(misc.find("freq"))
            jlpt = self._int_or_none(misc.find("jlpt"))

            char_id = tables['character'].insert(literal, grade,
                                                 freq, jlpt)

            table = tables['codepoint']
            codepoint = character.find("codepoint")
            for cp_value in codepoint.findall("cp_value"):
                value = cp_value.text
                cp_type = cp_value.get("cp_type")
                table.insert(char_id, cp_type, value)

            table = tables['radical']
            radical = character.find("radical")
            for rad_value in radical.findall("rad_value"):
                value = rad_value.text
                rad_type = rad_value.get("rad_type")
                table.insert(char_id, rad_type, value)

            # Tables generated from <misc> begin here
            table = tables['stroke_count']
            for stroke_count in misc.findall("stroke_count"):
                count = int(stroke_count.text)
                table.insert(char_id, count)

            table = tables['variant']
            for variant in misc.findall("variant"):
                value = variant.text
                var_type = variant.get("var_type")
                table.insert(char_id, var_type, value)

            table = tables['rad_name']
            for rad_name in misc.findall("rad_name"):
                value = rad_name.text
                table.insert(char_id, value)

            # Remaining direct descendents of <character>...
            dic_number = character.find("dic_number")
            if dic_number is not None:
                table = tables['dic_number']
                for dic_ref in dic_number.findall("dic_ref"):
                    dr_type = dic_ref.get("dr_type")
                    m_vol = dic_ref.get("m_vol", None)
                    m_page = dic_ref.get("m_page", None)
                    value = dic_ref.text
                    table.insert(char_id, dr_type, m_vol, m_page, value)

            query_code = character.find("query_code")
            if query_code is not None:
                table = tables['query_code']
                for q_code in query_code.findall("q_code"):
                    qc_type = q_code.get("qc_type")
                    skip_misclass = q_code.get("skip_misclass", None)
                    value = q_code.text
                    table.insert(char_id, qc_type, skip_misclass, value)

            reading_meaning = character.find("reading_meaning")
            if reading_meaning is not None:
                # Each table gets its own name: reusing a single
                # variable inside the loop would make a second
                # <rmgroup> insert into the wrong table.
                rmgroup_table = tables['rmgroup']
                reading_table = tables['reading']
                meaning_table = tables['meaning']
                for rmgroup in reading_meaning.findall("rmgroup"):
                    group_id = rmgroup_table.insert(char_id)
                    for reading in rmgroup.findall("reading"):
                        r_type = reading.get("r_type")
                        on_type = reading.get("on_type")
                        r_status = reading.get("r_status")
                        value = reading.text
                        reading_table.insert(group_id, r_type, on_type,
                                             r_status, value)
                    for meaning in rmgroup.findall("meaning"):
                        # Meanings without m_lang are stored as "en".
                        lang = meaning.get("m_lang", "en")
                        value = meaning.text
                        meaning_table.insert(group_id, lang, value)
                table = tables['nanori']
                for nanori in reading_meaning.findall("nanori"):
                    table.insert(char_id, nanori.text)

    def _drop_table(self, name):
        """Drop the named table if it exists.

        The name is interpolated into the SQL text (identifiers cannot
        be bound as parameters), so it must come from trusted code; in
        this class it is always one of the fixed table names.

        """
        self.cursor.execute("DROP TABLE IF EXISTS %s" % name)

    def _create_index_tables(self):
        """Creates extra tables to help with common searches.

        Supplementary tables include:

        1. Reading search table: kun-yomi to character ID.  Kun-yomi
           is modified for easier searching (no "." or "-" markers).

        """
        self._create_reading_search_table()

    def _create_reading_search_table(self):
        """Creates "sanitized" reading to character ID search table."""

        # Mapping is from reading to character ID...
        # r.fk -> rg.id, rg.fk -> c.id.
        # The string constant uses single quotes: SQLite only accepts a
        # double-quoted string literal as a legacy fallback.
        query = (
            "SELECT r.value, c.id "
            "FROM reading r, rmgroup rg, character c "
            "WHERE r.type = 'ja_kun' AND r.fk = rg.id AND rg.fk = c.id"
            )
        self.cursor.execute(query)
        fetched = self.cursor.fetchall()

        # Sanitize strings by removing "." and "-".  Built row by row so
        # that an empty result set is handled (unpacking zip(*rows)
        # fails when there are no rows).
        rows = [(value.replace(u".", u"").replace(u"-", u""), char_id)
                for value, char_id in fetched]

        tbl_name = "kunyomi_lookup"
        self.tables[tbl_name] = tbl = ReadingLookupTable(self.cursor, tbl_name)
        self._drop_table(tbl_name)
        # NOTE(review): assumes the Table base class provides create()
        # -- confirm against the table module.
        tbl.create()

        # Store all sanitized strings and their keys in the table
        for reading, char_id in rows:
            tbl.insert(reading, char_id)
481 ######################################################################
482 # KANJIDIC2 data tables
483 ######################################################################
class HeaderTable(Table):
    """Table for the header fields: file version, database version, date."""

    # %s is the table name placeholder.
    create_query = (
        "CREATE TABLE %s "
        "(file_version TEXT, database_version TEXT, date_of_creation TEXT)"
    )
    insert_query = "INSERT INTO %s VALUES (?, ?, ?)"
class CharacterTable(Table):
    """Main character table: literal plus grade, freq and jlpt values."""

    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, literal TEXT, "
                    "grade INTEGER, freq INTEGER, jlpt INTEGER)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
    # NOTE(review): the attribute name is assumed to be the one the
    # Table base class reads when creating indices -- confirm against
    # the table module.
    index_queries = [
        "CREATE INDEX %s_literal ON %s (literal)",
        ]
class TypeValueTable(ChildTable):
    """Child table of (type, value) pairs keyed to a parent row by fk."""

    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER, "
                    "type TEXT, value TEXT)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
    # NOTE(review): the attribute name is assumed to be the one the
    # Table base class reads when creating indices -- confirm against
    # the table module.
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]
class StrokeCountTable(ChildTable):
    """Child table of integer stroke counts keyed to a parent row by fk."""

    create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, "
                    "fk INTEGER, count INTEGER)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
    # NOTE(review): the attribute name is assumed to be the one the
    # Table base class reads when creating indices -- confirm against
    # the table module.
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]
class DicNumberTable(ChildTable):
    """Child table of dictionary references (type, m_vol, m_page, value)."""

    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER, "
                    "type TEXT, m_vol TEXT, m_page TEXT, value TEXT)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
    # NOTE(review): the attribute name is assumed to be the one the
    # Table base class reads when creating indices -- confirm against
    # the table module.
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]
class QueryCodeTable(ChildTable):
    """Child table of query codes (type, skip_misclass, value)."""

    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER, "
                    "type TEXT, skip_misclass TEXT, value TEXT)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
    # NOTE(review): the attribute name is assumed to be the one the
    # Table base class reads when creating indices -- confirm against
    # the table module.
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]
class RMGroupTable(ChildTable):
    """Child table of reading/meaning groups; each row only links to its parent."""

    create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?)"
    # NOTE(review): the attribute name is assumed to be the one the
    # Table base class reads when creating indices -- confirm against
    # the table module.
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]
class ReadingTable(ChildTable):
    """Child table of readings (type, on_type, r_status, value)."""

    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER, "
                    "type TEXT, on_type TEXT, r_status TEXT, value TEXT)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
    # NOTE(review): the attribute name is assumed to be the one the
    # Table base class reads when creating indices -- confirm against
    # the table module.
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        "CREATE INDEX %s_value ON %s (value)",
        ]
class MeaningTable(ChildTable):
    """Child table of meanings (lang, value)."""

    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER, "
                    "lang TEXT, value TEXT)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
    # NOTE(review): the attribute name is assumed to be the one the
    # Table base class reads when creating indices -- confirm against
    # the table module.
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        "CREATE INDEX %s_lang_value ON %s (lang, value)",
        ]
572 ######################################################################
573 # Index tables (not part of actual KANJIDIC2)
574 ######################################################################
class ReadingLookupTable(Table):
    """Maps reading to character IDs."""

    # Used for: kunyomi (KANJIDIC2 r_type==ja_kun)
    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, "
                    "reading TEXT, character_id INTEGER)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
    # NOTE(review): the attribute name is assumed to be the one the
    # Table base class reads when creating indices -- confirm against
    # the table module.
    index_queries = [
        "CREATE INDEX %s_reading ON %s (reading)",
        ]
590 ######################################################################
def parse_args():
    """Parse the command line.

    Returns an (options, args) tuple as produced by optparse; args[0]
    is the database filename.  Prints a message and exits with status
    -1 when no database filename is given or when --lookup and
    --search are combined.

    """
    from optparse import OptionParser
    op = OptionParser(usage="%prog [options] <db_filename> [search_query]")
    op.add_option("-i", "--initialize",
                  dest="init_fname", metavar="XML_SOURCE",
                  help=_("Initialize database from file."))
    op.add_option("-s", "--search", action="store_true",
                  help=_("Search for kanji by readings or meanings"))
    op.add_option("-l", "--lookup", action="store_true",
                  help=_("Look up exact character"))
    op.add_option("-L", "--lang",
                  help=_("Specify preferred language for searching."))
    op.add_option("-v", "--verbose", action="store_true",
                  help=_("Verbose mode (print debug strings)"))
    options, args = op.parse_args()
    if len(args) < 1:
        # The database filename is mandatory (see usage string).
        op.print_help()
        sys.exit(-1)
    if options.lookup and options.search:
        print(_("Cannot --lookup and --search at the same time."),
              file=sys.stderr)
        sys.exit(-1)
    return (options, args)
def main():
    """Command line entry point.

    Opens (and, with --initialize, rebuilds) the database named by the
    first positional argument, then runs a --search or --lookup using
    the remaining arguments and prints each resulting entry.

    """
    options, args = parse_args()
    db_fname = args[0]

    if options.init_fname is not None:
        db = Database(db_fname, init_from_file=options.init_fname)
    else:
        db = Database(db_fname)

    if len(args) <= 1:
        # No search was requested; we can exit here.
        return

    if options.search:
        # To be nice, we'll join all remaining args with spaces.
        search_query = " ".join(args[1:])

        if options.lang is not None:
            results = db.search(search_query,
                                lang=options.lang, options=options)
        else:
            results = db.search(search_query, options=options)
    elif options.lookup:
        encoding = get_encoding()
        lookup_query = args[1].decode(encoding)
        # Each character of the argument is looked up on its own;
        # characters not present in the database are skipped.
        results = []
        for character in lookup_query:
            result = db.search_by_literal(character)
            if result is not None:
                results.append(result)
    else:
        print(_("For searches or lookups, the --search or --lookup flag is "
                "required."))
        return

    # To do: visualize results
    # Not as important; now we know we can at least do our needed
    # lookups...
    if len(results) > 0:
        encoding = get_encoding()
        # DEBUG: until lookup_by_id is implemented, this will work.
        for index, result in enumerate(results):
            print(_("[Entry %d]") % index)
            print(unicode(result).encode(encoding))
    else:
        print(_("No results found."))

if __name__ == "__main__":
    main()