1 # -*- coding: utf-8 -*-
2 from __future__
import print_function
3 from __future__
import with_statement
5 import os
, sys
, re
, sqlite3
, time
6 from cStringIO
import StringIO
7 from xml
.etree
.cElementTree
import ElementTree
8 from helpers
import gzread
, get_encoding
, convert_query_to_unicode
9 from db
import Database
as BaseDatabase
10 from table
import Table
, ChildTable
, KeyValueTable
13 #t = gettext.translation("jblite")
15 gettext
.install("jblite")
18 def convert_kunyomi(coded_str
):
19 """Converts a kunyomi string with ./- to a user-friendly format.
21 Specifically, two things occur:
23 1. Strings are split on '.' and the right half, if any, is
24 enclosed in parentheses.
26 2. '-' is replaced with '~'.
29 pieces
= coded_str
.split(u
'.', 1)
31 intermediate
= u
"".join([pieces
[0], u
"(", pieces
[1], u
")"])
33 intermediate
= pieces
[0]
34 result
= intermediate
.replace(u
"-", u
"~")
37 def get_jouyou_str(grade_int
):
38 """Converts an integer Jouyou grade code into a string."""
45 def __init__(self
, record
):
48 def __unicode__(self
):
49 """Basic string representation of the entry."""
51 char_rec
= self
._record
54 literal
= char_rec
.data
["literal"]
55 lines
.append(_(u
"Literal: %s (0x%X)") % (literal
, ord(literal
)))
57 rmgroup_recs
= char_rec
.find_children("rmgroup")
58 for i
, rmgroup_rec
in enumerate(rmgroup_recs
):
60 lines
.append(_(u
"Group %d:") % group_index
)
62 reading_recs
= rmgroup_rec
.find_children("reading")
64 kunyomi
= [r
.data
['value'] for r
in reading_recs
65 if r
.data
["type"] == "ja_kun"]
66 # kun-yomi needs ./- translated
67 kunyomi
= map(convert_kunyomi
, kunyomi
)
68 lines
.append(_(u
" Kun-yomi: %s") % u
"、".join(kunyomi
))
70 onyomi
= [r
.data
['value'] for r
in reading_recs
71 if r
.data
["type"] == "ja_on"]
72 lines
.append(_(u
" On-yomi: %s") % u
"、".join(onyomi
))
74 meaning_recs
= rmgroup_rec
.find_children("meaning")
76 for r
in meaning_recs
:
77 meanings
= meaning_d
.setdefault(r
.data
['lang'], [])
78 meanings
.append(r
.data
['value'])
79 for lang
in sorted(meaning_d
.keys()):
80 meanings
= meaning_d
[lang
]
81 meaning_str
= "; ".join(meanings
)
82 lines
.append(_(u
" Meanings (%s): %s") % (lang
, meaning_str
))
84 nanori_recs
= char_rec
.find_children("nanori")
85 if len(nanori_recs
) > 0:
86 nanori
= [r
.data
["value"] for r
in nanori_recs
]
87 nanori_str
= u
"、".join(nanori
)
88 lines
.append(_(u
"Nanori: %s") % nanori_str
)
90 stroke_recs
= char_rec
.find_children("stroke_count")
91 strokes
= [r
.data
['count'] for r
in stroke_recs
]
93 lines
.append(_(u
"Stroke count: %d") % strokes
[0])
94 elif len(strokes
) > 1:
95 miscounts
= ", ".join(map(str, strokes
[1:]))
96 lines
.append(_(u
"Stroke count: %d (miscounts: %s)") %
97 (strokes
[0], miscounts
))
99 pass # No stroke count info; don't print anything
101 freq
= char_rec
.data
["freq"]
103 lines
.append(_(u
"Frequency: %d") % freq
)
105 grade
= char_rec
.data
["grade"]
106 if grade
is not None:
107 # Jouyou grade codes has special meanings; a conversion is
109 grade_str
= get_jouyou_str(grade
)
110 lines
.append(_(u
"Jouyou grade: %s") % grade_str
)
112 jlpt
= char_rec
.data
["jlpt"]
114 lines
.append(_(u
"JLPT grade: %d") % jlpt
)
116 return u
"\n".join(lines
)
119 return repr(self
._record
)
122 class Database(BaseDatabase
):
124 """Top level object for SQLite 3-based KANJIDIC2 database."""
144 def __init__(self
, filename
, init_from_file
=None):
145 self
.conn
= sqlite3
.connect(filename
)
146 self
.conn
.row_factory
= sqlite3
.Row
# keyword accessors for rows
147 self
.cursor
= self
.conn
.cursor()
148 self
.tables
= self
._create
_table
_objects
()
149 if init_from_file
is not None:
150 raw_data
= gzread(init_from_file
)
152 infile
= StringIO(raw_data
)
153 etree
= ElementTree(file=infile
)
156 # Create the core database
157 self
._create
_new
_tables
()
158 self
._populate
_database
(etree
)
161 # Create supplemental indices
162 self
._create
_index
_tables
()
165 def search(self
, query
, lang
=None, options
=None):
166 unicode_query
= convert_query_to_unicode(query
)
168 verbose
= (options
is not None) and (options
.verbose
== True)
170 if verbose
and os
.name
== "nt":
171 print(u
"Searching for \"%s\", lang=%s..." %
172 (unicode_query
, repr(lang
)),
175 # Do some search stuff here...
179 entries_r
= self
._search
_by
_reading
(unicode_query
)
180 entries_m
= self
._search
_by
_meaning
(unicode_query
,
182 entries_n
= self
._search
_by
_nanori
(unicode_query
)
183 entries_i
= self
._search
_by
_indices
(unicode_query
, lang
=lang
)
188 if len(entries_r
) == 0:
189 print("No 'reading' results found.")
190 for ent_id
, literal
in entries_r
:
192 print(u
"ID: %d, literal: %s" % (ent_id
, literal
))
193 except UnicodeEncodeError:
194 print(u
"ID: %d, literal (repr): %s" % (ent_id
, repr(literal
)))
197 if len(entries_n
) == 0:
198 print("No 'nanori' results found.")
199 for ent_id
, literal
in entries_n
:
201 print(u
"ID: %d, literal: %s" % (ent_id
, literal
))
202 except UnicodeEncodeError:
203 print(u
"ID: %d, literal (repr): %s" % (ent_id
, repr(literal
)))
206 if len(entries_m
) == 0:
207 print("No 'meaning' results found.")
208 for ent_id
, literal
in entries_m
:
210 print(u
"ID: %d, literal: %s" % (ent_id
, literal
))
211 except UnicodeEncodeError:
212 print(u
"ID: %d, literal (repr): %s" % (ent_id
, repr(literal
)))
215 if len(entries_i
) == 0:
216 print("No indexed results found.")
217 for ent_id
in entries_i
:
218 print(u
"ID: %d" % (ent_id
,))
220 # Get list of unique character IDs
222 for lst
in (entries_r
, entries_m
, entries_n
):
224 if row
[0] not in char_ids
:
225 char_ids
.append(row
[0])
226 for char_id
in entries_i
:
227 if char_id
not in char_ids
:
228 char_ids
.append(char_id
)
230 char_ids
= list(sorted(char_ids
))
232 results
= [self
.lookup(char_id
) for char_id
in char_ids
]
235 def _search_by_reading(self
, query
):
236 # reading -> rmgroup -> character
238 "SELECT id, literal FROM character WHERE id IN "
239 "(SELECT fk FROM rmgroup WHERE id IN "
240 "(SELECT fk FROM reading WHERE value LIKE ?))", (query
,))
241 rows
= self
.cursor
.fetchall()
244 def _search_by_nanori(self
, query
):
245 # nanori -> character
247 "SELECT id, literal FROM character WHERE id IN "
248 "(SELECT fk FROM nanori WHERE value LIKE ?)", (query
,))
249 rows
= self
.cursor
.fetchall()
252 def _search_by_meaning(self
, query
, lang
=None):
253 # meaning -> rmgroup -> character
256 "SELECT id, literal FROM character WHERE id IN "
257 "(SELECT fk FROM rmgroup WHERE id IN "
258 "(SELECT fk FROM meaning WHERE value LIKE ?))", (query
,))
261 "SELECT id, literal FROM character WHERE id IN "
262 "(SELECT fk FROM rmgroup WHERE id IN "
263 "(SELECT fk FROM meaning WHERE lang = ? AND value LIKE ?))",
265 rows
= self
.cursor
.fetchall()
268 def _search_by_indices(self
, query
, lang
=None):
269 # Get IDs from index table
270 # Note: lang is currently unused.
272 "SELECT character_id FROM kunyomi_lookup WHERE reading LIKE ?",
274 rows
= self
.cursor
.fetchall()
275 return [row
[0] for row
in rows
]
277 def search_by_literal(self
, literal
):
278 # Not much of a "search", but avoids overlap with BaseDictionary.lookup.
279 self
.cursor
.execute("SELECT id FROM character WHERE literal = ?",
281 rows
= self
.cursor
.fetchall()
286 return self
.lookup(char_id
)
288 def lookup(self
, id):
289 return BaseDatabase
.lookup(self
, "character", id)
291 def _create_table_objects(self
):
292 """Creates table objects.
294 Returns a dictionary of table name to table object.
298 "header": HeaderTable
,
299 "character": CharacterTable
,
300 "codepoint": TypeValueTable
,
301 "radical": TypeValueTable
,
302 "stroke_count": StrokeCountTable
,
303 "variant": TypeValueTable
,
304 "rad_name": KeyValueTable
,
305 "dic_number": DicNumberTable
,
306 "query_code": QueryCodeTable
,
307 "rmgroup": RMGroupTable
,
308 "reading": ReadingTable
,
309 "meaning": MeaningTable
,
310 "nanori": KeyValueTable
,
313 # Create all table objects
315 for tbl
, cls
in class_mappings
.iteritems():
316 table_mappings
[tbl
] = cls(self
.cursor
, tbl
)
318 return table_mappings
320 def _create_new_tables(self
):
321 """(Re)creates the database tables."""
322 for tbl
, tbl_obj
in self
.tables
.iteritems():
323 self
._drop
_table
(tbl
)
326 def _populate_database(self
, etree
):
327 """Imports XML data into SQLite database.
329 table_d: table to table_object dictionary
330 etree: ElementTree object for KANJIDIC2
334 header
= etree
.find("header")
335 file_ver
= header
.find("file_version").text
336 db_ver
= header
.find("database_version").text
337 date
= header
.find("date_of_creation").text
338 self
.tables
['header'].insert(file_ver
, db_ver
, date
)
340 # Iterate through characters
341 for character
in etree
.findall("character"):
343 literal
= character
.find("literal").text
345 # Grab misc node - we'll store a few things from it in the
346 # main character table, too.
347 misc
= character
.find("misc")
348 grade
= misc
.find("grade")
349 grade
= int(grade
.text
) if grade
is not None else None
350 freq
= misc
.find("freq")
351 freq
= int(freq
.text
) if freq
is not None else None
352 jlpt
= misc
.find("jlpt")
353 jlpt
= int(jlpt
.text
) if jlpt
is not None else None
355 char_id
= self
.tables
['character'].insert(literal
, grade
,
358 table
= self
.tables
['codepoint']
359 codepoint
= character
.find("codepoint")
360 for cp_value
in codepoint
.findall("cp_value"):
361 value
= cp_value
.text
362 cp_type
= cp_value
.get("cp_type")
363 table
.insert(char_id
, cp_type
, value
)
365 table
= self
.tables
['radical']
366 radical
= character
.find("radical")
367 for rad_value
in radical
.findall("rad_value"):
368 value
= rad_value
.text
369 rad_type
= rad_value
.get("rad_type")
370 table
.insert(char_id
, rad_type
, value
)
372 # Tables generated from <misc> begin here
373 table
= self
.tables
['stroke_count']
374 for stroke_count
in misc
.findall("stroke_count"):
375 count
= int(stroke_count
.text
)
376 table
.insert(char_id
, count
)
378 table
= self
.tables
['variant']
379 for variant
in misc
.findall("variant"):
381 var_type
= variant
.get("var_type")
382 table
.insert(char_id
, var_type
, value
)
384 table
= self
.tables
['rad_name']
385 for rad_name
in misc
.findall("rad_name"):
386 value
= rad_name
.text
387 table
.insert(char_id
, value
)
389 # Remaining direct descendents of <character>...
390 dic_number
= character
.find("dic_number")
391 if dic_number
is not None:
392 table
= self
.tables
['dic_number']
393 for dic_ref
in dic_number
.findall("dic_ref"):
394 dr_type
= dic_ref
.get("dr_type")
395 m_vol
= dic_ref
.get("m_vol", None)
396 m_page
= dic_ref
.get("m_page", None)
398 table
.insert(char_id
, dr_type
, m_vol
, m_page
, value
)
400 query_code
= character
.find("query_code")
401 if query_code
is not None:
402 table
= self
.tables
['query_code']
403 for q_code
in query_code
.findall("q_code"):
404 qc_type
= q_code
.get("qc_type")
405 skip_misclass
= q_code
.get("skip_misclass", None)
407 table
.insert(char_id
, qc_type
, skip_misclass
, value
)
409 reading_meaning
= character
.find("reading_meaning")
410 if reading_meaning
is not None:
411 table
= self
.tables
['rmgroup']
412 for rmgroup
in reading_meaning
.findall("rmgroup"):
413 group_id
= table
.insert(char_id
)
414 table
= self
.tables
['reading']
415 for reading
in rmgroup
.findall("reading"):
416 r_type
= reading
.get("r_type")
417 on_type
= reading
.get("on_type")
418 r_status
= reading
.get("r_status")
420 table
.insert(group_id
, r_type
, on_type
, r_status
, value
)
421 table
= self
.tables
['meaning']
422 for meaning
in rmgroup
.findall("meaning"):
423 lang
= meaning
.get("m_lang", "en")
425 table
.insert(group_id
, lang
, value
)
426 table
= self
.tables
['nanori']
427 for nanori
in reading_meaning
.findall("nanori"):
428 table
.insert(char_id
, nanori
.text
)
430 def _drop_table(self
, name
):
431 self
.cursor
.execute("DROP TABLE IF EXISTS %s" % name
)
433 def _create_index_tables(self
):
434 """Creates extra tables to help with common searches.
436 Supplementary tables include:
438 1. Reading search table: kun-yomi to character ID. Kun-yomi
439 is modified for easier searching (no "." or "-" markers).
442 self
._create
_reading
_search
_table
()
444 def _create_reading_search_table(self
):
445 """Creates "sanitized" reading to character ID search table."""
447 # Mapping is from reading to character ID...
448 # r.fk -> rg.id, rg.fk -> c.id.
450 "SELECT r.value, c.id "
451 "FROM reading r, rmgroup rg, character c "
452 'WHERE r.type = "ja_kun" AND r.fk = rg.id AND rg.fk = c.id'
454 self
.cursor
.execute(query
)
455 rows
= self
.cursor
.fetchall()
456 values
, ids
= zip(*rows
) # unzip idiom (see zip doc)
458 # Sanitize strings by removing "." and "-"
459 values
= [value
.replace(u
".", u
"").replace(u
"-", u
"")
463 tbl_name
= "kunyomi_lookup"
464 self
.tables
[tbl_name
] = tbl
= ReadingLookupTable(self
.cursor
, tbl_name
)
465 self
._drop
_table
(tbl_name
)
468 # Store all sanitized strings and their keys in the table
469 rows
= zip(values
, ids
)
473 ######################################################################
474 # KANJIDIC2 data tables
475 ######################################################################
478 class HeaderTable(Table
):
479 create_query
= ("CREATE TABLE %s "
480 "(file_version TEXT, "
481 "database_version TEXT, "
482 "date_of_creation TEXT)")
483 insert_query
= "INSERT INTO %s VALUES (?, ?, ?)"
486 class CharacterTable(Table
):
487 create_query
= ("CREATE TABLE %s "
488 "(id INTEGER PRIMARY KEY, literal TEXT, "
489 "grade INTEGER, freq INTEGER, jlpt INTEGER)")
490 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
492 "CREATE INDEX %s_literal ON %s (literal)",
496 class TypeValueTable(ChildTable
):
497 create_query
= ("CREATE TABLE %s "
498 "(id INTEGER PRIMARY KEY, fk INTEGER, "
499 "type TEXT, value TEXT)")
500 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
502 "CREATE INDEX %s_fk ON %s (fk)",
506 class StrokeCountTable(ChildTable
):
507 create_query
= ("CREATE TABLE %s (id INTEGER PRIMARY KEY, "
508 "fk INTEGER, count INTEGER)")
509 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?)"
511 "CREATE INDEX %s_fk ON %s (fk)",
515 class DicNumberTable(ChildTable
):
516 create_query
= ("CREATE TABLE %s "
517 "(id INTEGER PRIMARY KEY, fk INTEGER, "
518 "type TEXT, m_vol TEXT, m_page TEXT, value TEXT)")
519 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
521 "CREATE INDEX %s_fk ON %s (fk)",
525 class QueryCodeTable(ChildTable
):
526 create_query
= ("CREATE TABLE %s "
527 "(id INTEGER PRIMARY KEY, fk INTEGER, "
528 "type TEXT, skip_misclass TEXT, value TEXT)")
529 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
531 "CREATE INDEX %s_fk ON %s (fk)",
535 class RMGroupTable(ChildTable
):
536 create_query
= ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
537 insert_query
= "INSERT INTO %s VALUES (NULL, ?)"
539 "CREATE INDEX %s_fk ON %s (fk)",
542 class ReadingTable(ChildTable
):
543 create_query
= ("CREATE TABLE %s "
544 "(id INTEGER PRIMARY KEY, fk INTEGER, "
545 "type TEXT, on_type TEXT, r_status TEXT, value TEXT)")
546 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
548 "CREATE INDEX %s_fk ON %s (fk)",
549 "CREATE INDEX %s_value ON %s (value)",
553 class MeaningTable(ChildTable
):
554 create_query
= ("CREATE TABLE %s "
555 "(id INTEGER PRIMARY KEY, fk INTEGER, "
556 "lang TEXT, value TEXT)")
557 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
559 "CREATE INDEX %s_fk ON %s (fk)",
560 "CREATE INDEX %s_lang_value ON %s (lang, value)",
564 ######################################################################
565 # Index tables (not part of actual KANJIDIC2)
566 ######################################################################
569 class ReadingLookupTable(Table
):
570 """Maps reading to character IDs."""
571 # Used for: kunyomi (KANJIDIC2 r_type==ja_kun)
572 create_query
= ("CREATE TABLE %s "
573 "(id INTEGER PRIMARY KEY, "
574 "reading TEXT, character_id INTEGER)")
575 insert_query
= "INSERT INTO %s VALUES (NULL, ?, ?)"
577 "CREATE INDEX %s_reading ON %s (reading)",
582 ######################################################################
585 from optparse
import OptionParser
586 op
= OptionParser(usage
="%prog [options] <db_filename> [search_query]")
587 op
.add_option("-i", "--initialize",
588 dest
="init_fname", metavar
="XML_SOURCE",
589 help=_("Initialize database from file."))
590 op
.add_option("-s", "--search", action
="store_true",
591 help=_("Search for kanji by readings or meanings"))
592 op
.add_option("-l", "--lookup", action
="store_true",
593 help=_("Look up exact character"))
594 op
.add_option("-L", "--lang",
595 help=_("Specify preferred language for searching."))
596 op
.add_option("-v", "--verbose", action
="store_true",
597 help=_("Verbose mode (print debug strings)"))
598 options
, args
= op
.parse_args()
602 if options
.lookup
and options
.search
:
603 print(_("Cannot --lookup and --search at the same time."),
606 return (options
, args
)
609 options
, args
= parse_args()
612 if options
.init_fname
is not None:
613 db
= Database(db_fname
, init_from_file
=options
.init_fname
)
615 db
= Database(db_fname
)
619 # No search was requested; we can exit here.
622 if options
.search
== True:
624 # To be nice, we'll join all remaining args with spaces.
625 search_query
= " ".join(args
[1:])
627 if options
.lang
is not None:
628 results
= db
.search(search_query
,
629 lang
=options
.lang
, options
=options
)
631 results
= db
.search(search_query
, options
=options
)
632 elif options
.lookup
== True:
634 encoding
= get_encoding()
635 lookup_query
= args
[1].decode(encoding
)
637 for character
in lookup_query
:
638 result
= db
.search_by_literal(character
)
639 if result
is not None:
640 results
.append(result
)
643 print(_("For searches or lookups, the --search or --lookup flag is "
647 # To do: visualize results
648 # Not as important; now we know we can at least do our needed
651 encoding
= get_encoding()
652 # DEBUG: until lookup_by_id is implemented, this will work.
653 for index
, result
in enumerate(results
):
655 print(_("[Entry %d]") % index
)
657 print(unicode(result
).encode(encoding
))
660 print(_("No results found."))
662 if __name__
== "__main__":