1 from __future__
import absolute_import
4 from .helpers
import with_db
5 from jbparse
import kanjidic2
7 gettext
.install("jblite")
10 class KD2Converter(object):
12 def __init__(self
, kd2_fname
, db_fname
, verbose
=False):
13 if os
.path
.exists(db_fname
):
14 assert os
.path
.isfile(db_fname
), _("Specified path is not a file.")
15 assert os
.access(db_fname
, os
.W_OK
), \
16 _("Cannot write to specified file.")
17 self
.kd2_fname
= kd2_fname
18 self
.db_fname
= db_fname
19 self
.verbose
= verbose
22 parser
= kanjidic2
.Parser(self
.kd2_fname
)
23 with_db(self
.db_fname
, self
.create_db
,
24 parser
.header
, parser
.characters
)
26 print _("Database committed. Conversion complete.")
28 def create_db(self
, cur
, header
, data
):
29 """Main function for creating a KANJIDIC2-based SQLite database."""
30 self
.create_tables(cur
)
31 self
.populate_tables(cur
, header
, data
)
33 def drop_tables(self
, cur
):
35 print _("Dropping existing tables... ")
53 cur
.execute("DROP TABLE IF EXISTS %s" % tbl
)
55 def create_tables(self
, cur
):
56 """Creates tables for storing kanji information.
58 This format should be fully compatible with all data currently
59 stored in KANJIDIC2. (KANJIDIC2 file_version: 4)
61 If tables already exist, they will be silently dropped beforehand.
66 print _("Creating empty tables... ")
68 "CREATE TABLE header "
69 "(file_version TEXT, database_version TEXT, date_of_creation TEXT)")
72 "(literal TEXT PRIMARY KEY,"
73 " grade INTEGER, freq INTEGER, jlpt INTEGER, strokes INTEGER)")
74 for tbltype
in ("codepoints", "radicals", "variants", "dict_codes"):
77 "(literal TEXT, seq INTEGER, type TEXT, value TEXT,"
78 " PRIMARY KEY (literal, seq))" % tbltype
)
79 for tbltype
in ("radical_names", "nanori"):
82 "(literal TEXT, seq INTEGER, value TEXT,"
83 " PRIMARY KEY (literal, seq))" % tbltype
)
85 "CREATE TABLE stroke_miscounts "
86 "(literal TEXT, seq INTEGER, strokes INTEGER,"
87 " PRIMARY KEY (literal, seq))")
89 "CREATE TABLE query_codes "
90 "(literal TEXT, seq INTEGER, type TEXT, value TEXT,"
91 " skip_misclass TEXT, PRIMARY KEY (literal, seq))")
93 "CREATE TABLE readings "
94 "(literal TEXT, sense INTEGER, seq INTEGER, type TEXT, value TEXT,"
95 " on_type TEXT, status TEXT, PRIMARY KEY (literal, sense, seq))")
97 "CREATE TABLE meanings "
98 "(literal TEXT, sense INTEGER, seq INTEGER, lang TEXT, value TEXT,"
99 " PRIMARY KEY (literal, sense, seq))")
101 def populate_tables(self
, cur
, header
, data
):
103 print _("Populating tables... ")
104 self
.populate_header(cur
, header
)
105 total_kanji
= len(data
)
106 for i
, kanji
in enumerate(data
):
108 if i
% 1000 == 0 and i
!= 0:
109 print _("%d/%d kanji converted.") % (i
, total_kanji
)
110 self
.populate_senses(cur
, kanji
)
111 self
.populate_nanori(cur
, kanji
)
112 self
.populate_misc(cur
, kanji
)
113 self
.populate_codepoints(cur
, kanji
)
114 self
.populate_radicals(cur
, kanji
)
115 self
.populate_radical_names(cur
, kanji
)
116 self
.populate_variants(cur
, kanji
)
117 self
.populate_dict_codes(cur
, kanji
)
118 self
.populate_query_codes(cur
, kanji
)
120 print _("All kanji converted. Committing...")
122 def populate_header(self
, cur
, header
):
123 file_version
= header
.find("file_version").text
124 database_version
= header
.find("database_version").text
125 date
= header
.find("date_of_creation").text
127 "INSERT INTO header "
128 "(file_version, database_version, date_of_creation) VALUES (?, ?, ?)",
129 (file_version
, database_version
, date
))
131 def populate_meanings(self
, cur
, kanji
, sense
, sense_id
):
132 meanings
= sense
._get
_meaning
_nodes
()
136 for lang
in sorted(meanings
):
137 pairs
.extend([(lang
, o
.text
) for o
in meanings
[lang
]])
138 pairs
= [(kanji
.literal
, sense_id
, i
+1, lang
, gloss
)
139 for i
, (lang
, gloss
) in enumerate(pairs
)]
141 "INSERT INTO meanings (literal, sense, seq, lang, value) "
142 "VALUES (?, ?, ?, ?, ?)", pairs
)
144 def populate_readings(self
, cur
, kanji
, sense
, sense_id
):
145 readings
= sense
._get
_reading
_nodes
()
149 for r_type
in sorted(readings
):
151 [(r_type
, o
.text
, o
.attrib
.get("on_type"), o
.attrib
.get("r_status"))
152 for o
in readings
[r_type
]])
153 pairs
= [(kanji
.literal
, sense_id
, i
+1, r_type
, reading
, on_type
, status
)
154 for i
, (r_type
, reading
, on_type
, status
) in enumerate(pairs
)]
156 "INSERT INTO readings "
157 "(literal, sense, seq, type, value, on_type, status) "
158 "VALUES (?, ?, ?, ?, ?, ?, ?)", pairs
)
160 def populate_senses(self
, cur
, kanji
):
161 senses
= kanji
._get
_sense
_nodes
()
164 for i
, sense
in enumerate(senses
):
166 self
.populate_readings(cur
, kanji
, sense
, i
)
167 self
.populate_meanings(cur
, kanji
, sense
, i
)
169 def populate_stroke_miscounts(self
, cur
, kanji
, miscounts
):
170 data
= [(kanji
.literal
, i
+1, count
) for i
, count
in enumerate(miscounts
)]
172 "INSERT INTO stroke_miscounts (literal, seq, strokes) "
173 "VALUES (?, ?, ?)", data
)
175 def populate_misc(self
, cur
, kanji
):
176 strokes
, miscounts
= kanji
.get_strokes()
177 grade
= kanji
.get_grade()
178 freq
= kanji
.get_freq()
179 jlpt
= kanji
.get_jlpt()
181 "INSERT INTO misc (literal, grade, freq, jlpt, strokes) "
182 "VALUES (?, ?, ?, ?, ?)", (kanji
.literal
, grade
, freq
, jlpt
, strokes
))
184 self
.populate_stroke_miscounts(cur
, kanji
, miscounts
)
186 def _populate_val_table(self
, cur
, kanji
, table_name
, nodes
):
189 data
= [(kanji
.literal
, i
+1, node
.text
) for i
, node
in enumerate(nodes
)]
191 "INSERT INTO %s (literal, seq, value) VALUES (?, ?, ?)" % table_name
,
194 def _populate_type_val_table(self
, cur
, kanji
, table_name
, node_d
):
199 for typ
in sorted(node_d
):
200 data
.extend([(typ
, o
.text
) for o
in node_d
[typ
]])
201 data
= [(kanji
.literal
, i
+1, typ
, value
)
202 for i
, (typ
, value
) in enumerate(data
)]
204 "INSERT INTO %s (literal, seq, type, value) "
205 "VALUES (?, ?, ?, ?)" % table_name
, data
)
207 def populate_nanori(self
, cur
, kanji
):
208 self
._populate
_val
_table
(cur
, kanji
, "nanori", kanji
._get
_nanori
_nodes
())
210 def populate_radical_names(self
, cur
, kanji
):
211 self
._populate
_val
_table
(cur
, kanji
, "radical_names",
212 kanji
._get
_radical
_name
_nodes
())
214 def populate_radicals(self
, cur
, kanji
):
215 self
._populate
_type
_val
_table
(cur
, kanji
, "radicals",
216 kanji
._get
_radical
_nodes
())
218 def populate_codepoints(self
, cur
, kanji
):
219 self
._populate
_type
_val
_table
(cur
, kanji
, "codepoints",
220 kanji
._get
_codepoint
_nodes
())
222 def populate_variants(self
, cur
, kanji
):
223 self
._populate
_type
_val
_table
(cur
, kanji
, "variants",
224 kanji
._get
_variant
_nodes
())
226 def populate_dict_codes(self
, cur
, kanji
):
229 def populate_query_codes(self
, cur
, kanji
):