Added unicode string printing to kd2 Entries.
[jblite.git] / jblite / jd2_old.py
blob8831b35e28a42276c13eee0586e14a2c92df88ce
1 from __future__ import absolute_import
3 import os
4 from .helpers import with_db
5 from jbparse import kanjidic2
6 import gettext
7 gettext.install("jblite")
class KD2Converter(object):
    """Converts a KANJIDIC2 XML file into a KANJIDIC2-based SQLite database.

    Typical use is ``KD2Converter(kd2_fname, db_fname).run()``.  Any tables
    left over from a previous conversion are dropped and rebuilt.

    Relies on ``_`` having been installed as a builtin (``gettext.install``)
    for all user-visible messages.

    """

    def __init__(self, kd2_fname, db_fname, verbose=False):
        """Store conversion settings after sanity-checking the output path.

        Arguments:
        kd2_fname -- path of the KANJIDIC2 file handed to the parser
        db_fname -- path of the SQLite database to write
        verbose -- if true, print progress messages while converting

        Raises AssertionError if db_fname exists but is not a regular,
        writable file.

        """
        if os.path.exists(db_fname):
            # Raised explicitly rather than via ``assert`` so the checks
            # survive ``python -O``; the exception type is kept for callers.
            if not os.path.isfile(db_fname):
                raise AssertionError(_("Specified path is not a file."))
            if not os.access(db_fname, os.W_OK):
                raise AssertionError(_("Cannot write to specified file."))
        self.kd2_fname = kd2_fname
        self.db_fname = db_fname
        self.verbose = verbose

    def run(self):
        """Parse the KANJIDIC2 file and write its contents to the database."""
        parser = kanjidic2.Parser(self.kd2_fname)
        # NOTE(review): with_db is defined in .helpers; presumably it opens
        # the database, calls create_db(cursor, header, characters) and
        # commits afterwards -- confirm against the helper.
        with_db(self.db_fname, self.create_db,
                parser.header, parser.characters)
        if self.verbose:
            print(_("Database committed. Conversion complete."))

    def create_db(self, cur, header, data):
        """Main function for creating a KANJIDIC2-based SQLite database."""
        self.create_tables(cur)
        self.populate_tables(cur, header, data)

    def drop_tables(self, cur):
        """Drop every table this converter knows about, if present."""
        if self.verbose:
            print(_("Dropping existing tables... "))
        # "senses" and "misstrokes" are not created by create_tables();
        # presumably they are left over from an older schema.  Dropping
        # them is harmless because of IF EXISTS.
        tables = (
            "header",
            "stroke_miscounts",
            "misc",
            "codepoints",
            "radicals",
            "variants",
            "dict_codes",
            "radical_names",
            "nanori",
            "misstrokes",
            "query_codes",
            "senses",
            "readings",
            "meanings",
        )
        for tbl in tables:
            # Table names are the fixed constants above, never user input.
            cur.execute("DROP TABLE IF EXISTS %s" % tbl)

    def create_tables(self, cur):
        """Creates tables for storing kanji information.

        This format should be fully compatible with all data currently
        stored in KANJIDIC2.  (KANJIDIC2 file_version: 4)

        If tables already exist, they will be silently dropped beforehand.

        """
        self.drop_tables(cur)
        if self.verbose:
            print(_("Creating empty tables... "))
        cur.execute(
            "CREATE TABLE header "
            "(file_version TEXT, database_version TEXT, date_of_creation TEXT)")
        cur.execute(
            "CREATE TABLE misc "
            "(literal TEXT PRIMARY KEY,"
            " grade INTEGER, freq INTEGER, jlpt INTEGER, strokes INTEGER)")
        # Tables holding (type, value) pairs per kanji.
        for tbltype in ("codepoints", "radicals", "variants", "dict_codes"):
            cur.execute(
                "CREATE TABLE %s "
                "(literal TEXT, seq INTEGER, type TEXT, value TEXT,"
                " PRIMARY KEY (literal, seq))" % tbltype)
        # Tables holding plain values per kanji.
        for tbltype in ("radical_names", "nanori"):
            cur.execute(
                "CREATE TABLE %s "
                "(literal TEXT, seq INTEGER, value TEXT,"
                " PRIMARY KEY (literal, seq))" % tbltype)
        cur.execute(
            "CREATE TABLE stroke_miscounts "
            "(literal TEXT, seq INTEGER, strokes INTEGER,"
            " PRIMARY KEY (literal, seq))")
        cur.execute(
            "CREATE TABLE query_codes "
            "(literal TEXT, seq INTEGER, type TEXT, value TEXT,"
            " skip_misclass TEXT, PRIMARY KEY (literal, seq))")
        cur.execute(
            "CREATE TABLE readings "
            "(literal TEXT, sense INTEGER, seq INTEGER, type TEXT, value TEXT,"
            " on_type TEXT, status TEXT, PRIMARY KEY (literal, sense, seq))")
        cur.execute(
            "CREATE TABLE meanings "
            "(literal TEXT, sense INTEGER, seq INTEGER, lang TEXT, value TEXT,"
            " PRIMARY KEY (literal, sense, seq))")

    def populate_tables(self, cur, header, data):
        """Insert the header row, then all per-kanji rows for each entry.

        data must support len() and iteration; each item is passed to the
        individual populate_* methods as the kanji object.

        """
        if self.verbose:
            print(_("Populating tables... "))
        self.populate_header(cur, header)
        total_kanji = len(data)
        for i, kanji in enumerate(data):
            # Progress report every 1000 entries (but not before the first).
            if self.verbose and i != 0 and i % 1000 == 0:
                print(_("%d/%d kanji converted.") % (i, total_kanji))
            self.populate_senses(cur, kanji)
            self.populate_nanori(cur, kanji)
            self.populate_misc(cur, kanji)
            self.populate_codepoints(cur, kanji)
            self.populate_radicals(cur, kanji)
            self.populate_radical_names(cur, kanji)
            self.populate_variants(cur, kanji)
            self.populate_dict_codes(cur, kanji)
            self.populate_query_codes(cur, kanji)
        if self.verbose:
            print(_("All kanji converted. Committing..."))

    def populate_header(self, cur, header):
        """Insert the single header row.

        header must offer find(tag) returning an object with a .text
        attribute for "file_version", "database_version" and
        "date_of_creation".

        """
        file_version = header.find("file_version").text
        database_version = header.find("database_version").text
        date = header.find("date_of_creation").text
        cur.execute(
            "INSERT INTO header "
            "(file_version, database_version, date_of_creation) VALUES (?, ?, ?)",
            (file_version, database_version, date))

    def populate_meanings(self, cur, kanji, sense, sense_id):
        """Insert the meanings of one sense, numbered from 1.

        sense._get_meaning_nodes() is used as a mapping from language to a
        sequence of nodes with a .text attribute.  Rows are ordered by
        language (sorted), then by node order within the language.

        """
        meanings = sense._get_meaning_nodes()
        if not meanings:
            return
        pairs = []
        for lang in sorted(meanings):
            pairs.extend((lang, o.text) for o in meanings[lang])
        rows = [(kanji.literal, sense_id, i + 1, lang, gloss)
                for i, (lang, gloss) in enumerate(pairs)]
        cur.executemany(
            "INSERT INTO meanings (literal, sense, seq, lang, value) "
            "VALUES (?, ?, ?, ?, ?)", rows)

    def populate_readings(self, cur, kanji, sense, sense_id):
        """Insert the readings of one sense, numbered from 1.

        sense._get_reading_nodes() is used as a mapping from reading type
        to a sequence of nodes; each node supplies .text plus the optional
        "on_type" and "r_status" attributes (stored as NULL when absent).

        """
        readings = sense._get_reading_nodes()
        if not readings:
            return
        pairs = []
        for r_type in sorted(readings):
            pairs.extend(
                (r_type, o.text, o.attrib.get("on_type"),
                 o.attrib.get("r_status"))
                for o in readings[r_type])
        rows = [(kanji.literal, sense_id, i + 1, r_type, reading, on_type, status)
                for i, (r_type, reading, on_type, status) in enumerate(pairs)]
        cur.executemany(
            "INSERT INTO readings "
            "(literal, sense, seq, type, value, on_type, status) "
            "VALUES (?, ?, ?, ?, ?, ?, ?)", rows)

    def populate_senses(self, cur, kanji):
        """Insert readings and meanings for every sense of a kanji.

        Senses are numbered from 1 in the order returned by
        kanji._get_sense_nodes().

        """
        senses = kanji._get_sense_nodes()
        if not senses:
            return
        for i, sense in enumerate(senses):
            sense_id = i + 1
            self.populate_readings(cur, kanji, sense, sense_id)
            self.populate_meanings(cur, kanji, sense, sense_id)

    def populate_stroke_miscounts(self, cur, kanji, miscounts):
        """Insert the given stroke miscounts for a kanji, numbered from 1."""
        data = [(kanji.literal, i + 1, count)
                for i, count in enumerate(miscounts)]
        cur.executemany(
            "INSERT INTO stroke_miscounts (literal, seq, strokes) "
            "VALUES (?, ?, ?)", data)

    def populate_misc(self, cur, kanji):
        """Insert the misc row (grade, freq, jlpt, strokes) for a kanji.

        kanji.get_strokes() is unpacked as (strokes, miscounts); any
        miscounts are stored separately in stroke_miscounts.

        """
        strokes, miscounts = kanji.get_strokes()
        grade = kanji.get_grade()
        freq = kanji.get_freq()
        jlpt = kanji.get_jlpt()
        cur.execute(
            "INSERT INTO misc (literal, grade, freq, jlpt, strokes) "
            "VALUES (?, ?, ?, ?, ?)", (kanji.literal, grade, freq, jlpt, strokes))
        if miscounts:
            self.populate_stroke_miscounts(cur, kanji, miscounts)

    def _populate_val_table(self, cur, kanji, table_name, nodes):
        """Insert node texts into a (literal, seq, value) table.

        nodes is a sequence of objects with a .text attribute; seq starts
        at 1.  Does nothing when nodes is empty or None.  table_name is
        interpolated into the SQL and must be a trusted constant.

        """
        if not nodes:
            return
        data = [(kanji.literal, i + 1, node.text)
                for i, node in enumerate(nodes)]
        cur.executemany(
            "INSERT INTO %s (literal, seq, value) VALUES (?, ?, ?)" % table_name,
            data)

    def _populate_type_val_table(self, cur, kanji, table_name, node_d):
        """Insert typed node texts into a (literal, seq, type, value) table.

        node_d maps a type name to a sequence of objects with a .text
        attribute.  Rows are ordered by type (sorted), then by node order,
        with seq starting at 1.  Does nothing when node_d is empty or None.
        table_name is interpolated into the SQL and must be a trusted
        constant.

        """
        if not node_d:
            return

        pairs = []
        for typ in sorted(node_d):
            pairs.extend((typ, o.text) for o in node_d[typ])
        data = [(kanji.literal, i + 1, typ, value)
                for i, (typ, value) in enumerate(pairs)]
        cur.executemany(
            "INSERT INTO %s (literal, seq, type, value) "
            "VALUES (?, ?, ?, ?)" % table_name, data)

    def populate_nanori(self, cur, kanji):
        """Insert the nanori nodes of a kanji into the nanori table."""
        self._populate_val_table(cur, kanji, "nanori", kanji._get_nanori_nodes())

    def populate_radical_names(self, cur, kanji):
        """Insert the radical name nodes of a kanji into radical_names."""
        self._populate_val_table(cur, kanji, "radical_names",
                                 kanji._get_radical_name_nodes())

    def populate_radicals(self, cur, kanji):
        """Insert the radical nodes of a kanji into the radicals table."""
        self._populate_type_val_table(cur, kanji, "radicals",
                                      kanji._get_radical_nodes())

    def populate_codepoints(self, cur, kanji):
        """Insert the codepoint nodes of a kanji into the codepoints table."""
        self._populate_type_val_table(cur, kanji, "codepoints",
                                      kanji._get_codepoint_nodes())

    def populate_variants(self, cur, kanji):
        """Insert the variant nodes of a kanji into the variants table."""
        self._populate_type_val_table(cur, kanji, "variants",
                                      kanji._get_variant_nodes())

    def populate_dict_codes(self, cur, kanji):
        """Not implemented: the dict_codes table is created but left empty."""
        # TODO: populate dict_codes once the parser accessor is settled.
        pass

    def populate_query_codes(self, cur, kanji):
        """Not implemented: the query_codes table is created but left empty."""
        # TODO: populate query_codes (including skip_misclass).
        pass