# -*- coding: utf-8 -*-
from __future__ import print_function
from __future__ import with_statement
import os, sys, re, sqlite3, time
from cStringIO import StringIO
from xml.etree.cElementTree import ElementTree
from helpers import gzread, get_encoding, convert_query_to_unicode
from db import Database as BaseDatabase
from table import Table, ChildTable, KeyValueTable

import gettext
#t = gettext.translation("jblite")
gettext.install("jblite")


def convert_kunyomi(coded_str):
    """Converts a kunyomi string with ./- to a user-friendly format.

    Specifically, two things occur:

    1. Strings are split on '.' and the right half, if any, is
       enclosed in parentheses.

    2. '-' is replaced with '~'.
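
    For example (illustrative input), u"た.べる" becomes u"た(べる)",
    and u"-ぐみ" becomes u"~ぐみ".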

    """
    pieces = coded_str.split(u'.', 1)
    if len(pieces) > 1:
        intermediate = u"".join([pieces[0], u"(", pieces[1], u")"])
    else:
        intermediate = pieces[0]
    result = intermediate.replace(u"-", u"~")
    return result


def get_jouyou_str(grade_int):
    """Converts an integer Jouyou grade code into a string."""


class Entry(object):

    def __init__(self, record):
        self._record = record

    def __unicode__(self):
        """Basic string representation of the entry."""
        char_rec = self._record
        lines = []

        literal = char_rec.data["literal"]
        lines.append(_(u"Literal: %s (0x%X)") % (literal, ord(literal)))

        rmgroup_recs = char_rec.find_children("rmgroup")
        for i, rmgroup_rec in enumerate(rmgroup_recs):
            group_index = i + 1
            lines.append(_(u"Group %d:") % group_index)

            reading_recs = rmgroup_rec.find_children("reading")

            kunyomi = [r.data['value'] for r in reading_recs
                       if r.data["type"] == "ja_kun"]
            # kun-yomi needs ./- translated
            kunyomi = map(convert_kunyomi, kunyomi)
            lines.append(_(u" Kun-yomi: %s") % u"、".join(kunyomi))

            onyomi = [r.data['value'] for r in reading_recs
                      if r.data["type"] == "ja_on"]
            lines.append(_(u" On-yomi: %s") % u"、".join(onyomi))

            meaning_recs = rmgroup_rec.find_children("meaning")
            meaning_d = {}
            for r in meaning_recs:
                meanings = meaning_d.setdefault(r.data['lang'], [])
                meanings.append(r.data['value'])
            for lang in sorted(meaning_d.keys()):
                meanings = meaning_d[lang]
                meaning_str = "; ".join(meanings)
                lines.append(_(u" Meanings (%s): %s") % (lang, meaning_str))

        nanori_recs = char_rec.find_children("nanori")
        if len(nanori_recs) > 0:
            nanori = [r.data["value"] for r in nanori_recs]
            nanori_str = u"、".join(nanori)
            lines.append(_(u"Nanori: %s") % nanori_str)

        stroke_recs = char_rec.find_children("stroke_count")
        strokes = [r.data['count'] for r in stroke_recs]
        if len(strokes) == 1:
            lines.append(_(u"Stroke count: %d") % strokes[0])
        elif len(strokes) > 1:
            miscounts = ", ".join(map(str, strokes[1:]))
            lines.append(_(u"Stroke count: %d (miscounts: %s)") %
                         (strokes[0], miscounts))
        else:
            pass  # No stroke count info; don't print anything

        freq = char_rec.data["freq"]
        if freq is not None:
            lines.append(_(u"Frequency: %d") % freq)

        grade = char_rec.data["grade"]
        if grade is not None:
            # Jouyou grade codes have special meanings; a conversion is
            # needed to produce a readable string.
            grade_str = get_jouyou_str(grade)
            lines.append(_(u"Jouyou grade: %s") % grade_str)

        jlpt = char_rec.data["jlpt"]
        if jlpt is not None:
            lines.append(_(u"JLPT grade: %d") % jlpt)

        return u"\n".join(lines)

    def __repr__(self):
        return repr(self._record)


class Database(BaseDatabase):

    """Top level object for SQLite 3-based KANJIDIC2 database."""

    def __init__(self, filename, init_from_file=None):
        self.conn = sqlite3.connect(filename)
        self.conn.row_factory = sqlite3.Row  # keyword accessors for rows
        self.cursor = self.conn.cursor()
        self.tables = self._create_table_objects()
        if init_from_file is not None:
            raw_data = gzread(init_from_file)

            infile = StringIO(raw_data)
            etree = ElementTree(file=infile)

            # Create the core database
            self._create_new_tables()
            self._populate_database(etree)
            self.conn.commit()

            # Create supplemental indices
            self._create_index_tables()
            self.conn.commit()

    def search(self, query, lang=None, options=None):
        query = convert_query_to_unicode(query)
        query = "%%%s%%" % query  # Wrap in wildcards

        verbose = (options is not None) and (options.verbose == True)

        if verbose and os.name == "nt":
            # Windows consoles often can't print the raw unicode query.
            print(u"Searching for \"%s\", lang=%s..." %
                  (repr(query), lang))
        elif verbose:
            print(u"Searching for \"%s\", lang=%s..." % (query, lang))

        # Do some search stuff here...

        entries_r = self._search_by_reading(query)
        entries_m = self._search_by_meaning(query, lang=lang)
        entries_n = self._search_by_nanori(query)
        entries_i = self._search_by_indices(query, lang=lang)

        if verbose:
            if len(entries_r) == 0:
                print("No 'reading' results found.")
            for ent_id, literal in entries_r:
                try:
                    print(u"ID: %d, literal: %s" % (ent_id, literal))
                except UnicodeEncodeError:
                    print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))

            if len(entries_n) == 0:
                print("No 'nanori' results found.")
            for ent_id, literal in entries_n:
                try:
                    print(u"ID: %d, literal: %s" % (ent_id, literal))
                except UnicodeEncodeError:
                    print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))

            if len(entries_m) == 0:
                print("No 'meaning' results found.")
            for ent_id, literal in entries_m:
                try:
                    print(u"ID: %d, literal: %s" % (ent_id, literal))
                except UnicodeEncodeError:
                    print(u"ID: %d, literal (repr): %s" % (ent_id, repr(literal)))

            if len(entries_i) == 0:
                print("No indexed results found.")
            for ent_id in entries_i:
                print(u"ID: %d" % (ent_id,))

        # Get list of unique character IDs
        char_ids = []
        for lst in (entries_r, entries_m, entries_n):
            for row in lst:
                if row[0] not in char_ids:
                    char_ids.append(row[0])
        for char_id in entries_i:
            if char_id not in char_ids:
                char_ids.append(char_id)

        char_ids = list(sorted(char_ids))

        results = [self.lookup(char_id) for char_id in char_ids]
        return results

    def _search_by_reading(self, query):
        # reading -> rmgroup -> character
        self.cursor.execute(
            "SELECT id, literal FROM character WHERE id IN "
            "(SELECT fk FROM rmgroup WHERE id IN "
            "(SELECT fk FROM reading WHERE value LIKE ?))", (query,))
        rows = self.cursor.fetchall()
        return rows

    def _search_by_nanori(self, query):
        # nanori -> character
        self.cursor.execute(
            "SELECT id, literal FROM character WHERE id IN "
            "(SELECT fk FROM nanori WHERE value LIKE ?)", (query,))
        rows = self.cursor.fetchall()
        return rows

    def _search_by_meaning(self, query, lang=None):
        # meaning -> rmgroup -> character
        if lang is None:
            self.cursor.execute(
                "SELECT id, literal FROM character WHERE id IN "
                "(SELECT fk FROM rmgroup WHERE id IN "
                "(SELECT fk FROM meaning WHERE value LIKE ?))", (query,))
        else:
            self.cursor.execute(
                "SELECT id, literal FROM character WHERE id IN "
                "(SELECT fk FROM rmgroup WHERE id IN "
                "(SELECT fk FROM meaning WHERE lang = ? AND value LIKE ?))",
                (lang, query))
        rows = self.cursor.fetchall()
        return rows

    def _search_by_indices(self, query, lang=None):
        # Get IDs from index table
        # Note: lang is currently unused.
        self.cursor.execute(
            "SELECT character_id FROM kunyomi_lookup WHERE reading LIKE ?",
            (query,))
        rows = self.cursor.fetchall()
        return [row[0] for row in rows]

    def search_by_literal(self, literal):
        # Not much of a "search", but avoids overlap with BaseDatabase.lookup.
        self.cursor.execute("SELECT id FROM character WHERE literal = ?",
                            (literal,))
        rows = self.cursor.fetchall()
        if len(rows) == 0:
            return None
        char_id = rows[0][0]
        return self.lookup(char_id)

    def lookup(self, id):
        return BaseDatabase.lookup(self, "character", id)

    def _create_table_objects(self):
        """Creates table objects.

        Returns a dictionary of table name to table object.

        """
        class_mappings = {
            "header": HeaderTable,
            "character": CharacterTable,
            "codepoint": TypeValueTable,
            "radical": TypeValueTable,
            "stroke_count": StrokeCountTable,
            "variant": TypeValueTable,
            "rad_name": KeyValueTable,
            "dic_number": DicNumberTable,
            "query_code": QueryCodeTable,
            "rmgroup": RMGroupTable,
            "reading": ReadingTable,
            "meaning": MeaningTable,
            "nanori": KeyValueTable,
            }

        # Create all table objects
        table_mappings = {}
        for tbl, cls in class_mappings.iteritems():
            table_mappings[tbl] = cls(self.cursor, tbl)

        return table_mappings

    def _create_new_tables(self):
        """(Re)creates the database tables."""
        for tbl, tbl_obj in self.tables.iteritems():
            self._drop_table(tbl)
            tbl_obj.create()  # assumes table objects build their own schema

    def _populate_database(self, etree):
        """Imports XML data into SQLite database.

        etree: ElementTree object for KANJIDIC2

        """
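        # For reference, each <character> element consumed below looks
        # roughly like this (abridged, illustrative values; see the
        # KANJIDIC2 DTD for the full format):
        #
        #   <character>
        #     <literal>亜</literal>
        #     <codepoint><cp_value cp_type="ucs">4e9c</cp_value></codepoint>
        #     <radical><rad_value rad_type="classical">7</rad_value></radical>
        #     <misc><grade>8</grade><stroke_count>7</stroke_count></misc>
        #     <reading_meaning>
        #       <rmgroup>
        #         <reading r_type="ja_on">ア</reading>
        #         <meaning>Asia</meaning>
        #       </rmgroup>
        #       <nanori>つぐ</nanori>
        #     </reading_meaning>
        #   </character>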
        header = etree.find("header")
        file_ver = header.find("file_version").text
        db_ver = header.find("database_version").text
        date = header.find("date_of_creation").text
        self.tables['header'].insert(file_ver, db_ver, date)

        # Iterate through characters
        for character in etree.findall("character"):
            literal = character.find("literal").text

            # Grab misc node - we'll store a few things from it in the
            # main character table, too.
            misc = character.find("misc")
            grade = misc.find("grade")
            grade = int(grade.text) if grade is not None else None
            freq = misc.find("freq")
            freq = int(freq.text) if freq is not None else None
            jlpt = misc.find("jlpt")
            jlpt = int(jlpt.text) if jlpt is not None else None

            char_id = self.tables['character'].insert(literal, grade,
                                                      freq, jlpt)

            table = self.tables['codepoint']
            codepoint = character.find("codepoint")
            for cp_value in codepoint.findall("cp_value"):
                value = cp_value.text
                cp_type = cp_value.get("cp_type")
                table.insert(char_id, cp_type, value)

            table = self.tables['radical']
            radical = character.find("radical")
            for rad_value in radical.findall("rad_value"):
                value = rad_value.text
                rad_type = rad_value.get("rad_type")
                table.insert(char_id, rad_type, value)

            # Tables generated from <misc> begin here
            table = self.tables['stroke_count']
            for stroke_count in misc.findall("stroke_count"):
                count = int(stroke_count.text)
                table.insert(char_id, count)

            table = self.tables['variant']
            for variant in misc.findall("variant"):
                value = variant.text
                var_type = variant.get("var_type")
                table.insert(char_id, var_type, value)

            table = self.tables['rad_name']
            for rad_name in misc.findall("rad_name"):
                value = rad_name.text
                table.insert(char_id, value)

            # Remaining direct descendents of <character>...
            dic_number = character.find("dic_number")
            if dic_number is not None:
                table = self.tables['dic_number']
                for dic_ref in dic_number.findall("dic_ref"):
                    dr_type = dic_ref.get("dr_type")
                    m_vol = dic_ref.get("m_vol", None)
                    m_page = dic_ref.get("m_page", None)
                    value = dic_ref.text
                    table.insert(char_id, dr_type, m_vol, m_page, value)

            query_code = character.find("query_code")
            if query_code is not None:
                table = self.tables['query_code']
                for q_code in query_code.findall("q_code"):
                    qc_type = q_code.get("qc_type")
                    skip_misclass = q_code.get("skip_misclass", None)
                    value = q_code.text
                    table.insert(char_id, qc_type, skip_misclass, value)

            reading_meaning = character.find("reading_meaning")
            if reading_meaning is not None:
                for rmgroup in reading_meaning.findall("rmgroup"):
                    table = self.tables['rmgroup']
                    group_id = table.insert(char_id)
                    table = self.tables['reading']
                    for reading in rmgroup.findall("reading"):
                        r_type = reading.get("r_type")
                        on_type = reading.get("on_type")
                        r_status = reading.get("r_status")
                        value = reading.text
                        table.insert(group_id, r_type, on_type, r_status, value)
                    table = self.tables['meaning']
                    for meaning in rmgroup.findall("meaning"):
                        lang = meaning.get("m_lang", "en")
                        value = meaning.text
                        table.insert(group_id, lang, value)
                table = self.tables['nanori']
                for nanori in reading_meaning.findall("nanori"):
                    table.insert(char_id, nanori.text)

    def _drop_table(self, name):
        self.cursor.execute("DROP TABLE IF EXISTS %s" % name)

    def _create_index_tables(self):
        """Creates extra tables to help with common searches.

        Supplementary tables include:

        1. Reading search table: kun-yomi to character ID.  Kun-yomi
           is modified for easier searching (no "." or "-" markers).
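
           For example (illustrative), a stored reading of u"た.べる"
           is indexed here as u"たべる".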

        """
        self._create_reading_search_table()

    def _create_reading_search_table(self):
        """Creates "sanitized" reading to character ID search table."""
        # Mapping is from reading to character ID...
        # r.fk -> rg.id, rg.fk -> c.id.
        query = (
            "SELECT r.value, c.id "
            "FROM reading r, rmgroup rg, character c "
            'WHERE r.type = "ja_kun" AND r.fk = rg.id AND rg.fk = c.id'
            )
        self.cursor.execute(query)
        rows = self.cursor.fetchall()
        values, ids = zip(*rows)  # unzip idiom (see zip doc)

        # Sanitize strings by removing "." and "-"
        values = [value.replace(u".", u"").replace(u"-", u"")
                  for value in values]

        tbl_name = "kunyomi_lookup"
        self.tables[tbl_name] = tbl = ReadingLookupTable(self.cursor, tbl_name)
        self._drop_table(tbl_name)
        tbl.create()

        # Store all sanitized strings and their keys in the table
        rows = zip(values, ids)
        for value, char_id in rows:
            tbl.insert(value, char_id)


######################################################################
# KANJIDIC2 data tables
######################################################################


class HeaderTable(Table):
    create_query = ("CREATE TABLE %s "
                    "(file_version TEXT, "
                    "database_version TEXT, "
                    "date_of_creation TEXT)")
    insert_query = "INSERT INTO %s VALUES (?, ?, ?)"


class CharacterTable(Table):
    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, literal TEXT, "
                    "grade INTEGER, freq INTEGER, jlpt INTEGER)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_literal ON %s (literal)",
        ]


class TypeValueTable(ChildTable):
    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER, "
                    "type TEXT, value TEXT)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]


class StrokeCountTable(ChildTable):
    create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, "
                    "fk INTEGER, count INTEGER)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]


class DicNumberTable(ChildTable):
    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER, "
                    "type TEXT, m_vol TEXT, m_page TEXT, value TEXT)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]


class QueryCodeTable(ChildTable):
    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER, "
                    "type TEXT, skip_misclass TEXT, value TEXT)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]


class RMGroupTable(ChildTable):
    create_query = ("CREATE TABLE %s (id INTEGER PRIMARY KEY, fk INTEGER)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        ]


class ReadingTable(ChildTable):
    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER, "
                    "type TEXT, on_type TEXT, r_status TEXT, value TEXT)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        "CREATE INDEX %s_value ON %s (value)",
        ]


class MeaningTable(ChildTable):
    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, fk INTEGER, "
                    "lang TEXT, value TEXT)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_fk ON %s (fk)",
        "CREATE INDEX %s_lang_value ON %s (lang, value)",
        ]


######################################################################
# Index tables (not part of actual KANJIDIC2)
######################################################################


class ReadingLookupTable(Table):
    """Maps reading to character IDs."""
    # Used for: kunyomi (KANJIDIC2 r_type==ja_kun)
    create_query = ("CREATE TABLE %s "
                    "(id INTEGER PRIMARY KEY, "
                    "reading TEXT, character_id INTEGER)")
    insert_query = "INSERT INTO %s VALUES (NULL, ?, ?)"
    index_queries = [
        "CREATE INDEX %s_reading ON %s (reading)",
        ]


######################################################################


def parse_args():
    from optparse import OptionParser
    op = OptionParser(usage="%prog [options] <db_filename> [search_query]")
    op.add_option("-i", "--initialize",
                  dest="init_fname", metavar="XML_SOURCE",
                  help=_("Initialize database from file."))
    op.add_option("-s", "--search", action="store_true",
                  help=_("Search for kanji by readings or meanings"))
    op.add_option("-l", "--lookup", action="store_true",
                  help=_("Look up exact character"))
    op.add_option("-L", "--lang",
                  help=_("Specify preferred language for searching."))
    op.add_option("-v", "--verbose", action="store_true",
                  help=_("Verbose mode (print debug strings)"))
    options, args = op.parse_args()

    if options.lookup and options.search:
        print(_("Cannot --lookup and --search at the same time."),
              file=sys.stderr)
        sys.exit(1)

    return (options, args)
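

# Example command lines (the module and database file names below are
# illustrative, not defined by this file):
#
#   python kd2.py -i kanjidic2.xml.gz kanjidic2.db   # build a new database
#   python kd2.py -s kanjidic2.db water              # search readings/meanings
#   python kd2.py -l kanjidic2.db <kanji>            # exact-literal lookup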
def main():
    options, args = parse_args()

    db_fname = args[0]
    if options.init_fname is not None:
        db = Database(db_fname, init_from_file=options.init_fname)
    else:
        db = Database(db_fname)

    if len(args) < 2:
        # No search was requested; we can exit here.
        return

    if options.search == True:
        # To be nice, we'll join all remaining args with spaces.
        search_query = " ".join(args[1:])

        if options.lang is not None:
            results = db.search(search_query,
                                lang=options.lang, options=options)
        else:
            results = db.search(search_query, options=options)
    elif options.lookup == True:
        encoding = get_encoding()
        lookup_query = args[1].decode(encoding)
        results = []
        for character in lookup_query:
            result = db.search_by_literal(character)
            if result is not None:
                results.append(result)
    else:
        print(_("For searches or lookups, the --search or --lookup flag is "
                "required."), file=sys.stderr)
        return

    # To do: visualize results
    # Not as important; now we know we can at least do our needed queries.
    if len(results) > 0:
        encoding = get_encoding()
        # DEBUG: until lookup_by_id is implemented, this will work.
        for index, result in enumerate(results):
            print(_("[Entry %d]") % index)
            print(unicode(result).encode(encoding))
    else:
        print(_("No results found."))


if __name__ == "__main__":
    main()