1 https://github.com/google/mozc/issues/462
3 --- /src/dictionary/gen_pos_map.py
4 +++ /src/dictionary/gen_pos_map.py
6 from build_tools import code_generator_util
9 -HEADER = """// Copyright 2009 Google Inc. All Rights Reserved.
10 +HEADER = b"""// Copyright 2009 Google Inc. All Rights Reserved.
13 #ifndef MOZC_DICTIONARY_POS_MAP_H_
15 // POS conversion rules
16 const POSMap kPOSMap[] = {
21 #endif // MOZC_DICTIONARY_POS_MAP_H_
24 def ParseUserPos(user_pos_file):
25 - with open(user_pos_file, 'r') as stream:
26 + with open(user_pos_file, 'rb') as stream:
27 stream = code_generator_util.SkipLineComment(stream)
28 stream = code_generator_util.ParseColumnStream(stream, num_column=2)
29 return dict((key, enum_value) for key, enum_value in stream)
31 user_pos_map = ParseUserPos(user_pos_file)
34 - with open(third_party_pos_map_file, 'r') as stream:
35 + with open(third_party_pos_map_file, 'rb') as stream:
36 stream = code_generator_util.SkipLineComment(stream)
37 for columns in code_generator_util.ParseColumnStream(stream, num_column=2):
38 third_party_pos_name, mozc_pos = (columns + [None])[:2]
40 result[third_party_pos_name] = mozc_pos
42 # Create mozc_pos to mozc_pos map.
43 - for key, value in user_pos_map.iteritems():
44 + for key, value in user_pos_map.items():
46 assert (result[key] == value)
52 - 'static_cast< ::mozc::user_dictionary::UserDictionary::PosType>(-1)')
53 + b'static_cast< ::mozc::user_dictionary::UserDictionary::PosType>(-1)')
55 - value = '::mozc::user_dictionary::UserDictionary::' + value
56 - output.write(' { %s, %s },\n' % (key, value))
57 + value = b'::mozc::user_dictionary::UserDictionary::' + value
58 + output.write(b' { %s, %s },\n' % (key, value))
63 pos_map = GeneratePosMap(options.third_party_pos_map_file,
64 options.user_pos_file)
66 - with open(options.output, 'w') as stream:
67 + with open(options.output, 'wb') as stream:
68 OutputPosMap(pos_map, stream)
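
The pattern in this hunk (and in most of the dictionary hunks below) is to keep the generator in bytes end to end: inputs are opened with 'rb', literals become b"...", and the output is written with 'wb'. A minimal sketch, not mozc code, of why mixing the two types no longer works under Python 3:

    key = b'NOUN'
    value = b'::mozc::user_dictionary::UserDictionary::NOUN'
    with open('pos_map_demo.h', 'wb') as out:
        # bytes %-formatting exists since Python 3.5, which is what the
        # b' { %s, %s },\n' % (key, value) line above relies on.
        out.write(b' { %s, %s },\n' % (key, value))
        # out.write(' { %s, %s },\n' % (key, value))
        #   -> TypeError: a bytes-like object is required, not 'str'
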
71 --- /src/dictionary/gen_pos_rewrite_rule.py
72 +++ /src/dictionary/gen_pos_rewrite_rule.py
76 def LoadRewriteMapRule(filename):
78 + fh = open(filename, 'rb')
81 - line = line.rstrip('\n')
82 - if not line or line.startswith('#'):
83 + line = line.rstrip(b'\n')
84 + if not line or line.startswith(b'#'):
87 rule.append([fields[0], fields[1]])
92 def ReadPOSID(id_file, special_pos_file):
95 - for line in open(id_file, 'r'):
96 + fh = open(id_file, 'rb')
99 pos_list.append(fields[1])
102 - for line in open(special_pos_file, 'r'):
103 - if len(line) <= 1 or line[0] == '#':
104 + fh = open(special_pos_file, 'rb')
106 + if len(line) <= 1 or line[0:1] == b'#':
108 fields = line.split()
109 pos_list.append(fields[0])
117 with open(opts.output, 'wb') as f:
118 - f.write(''.join(chr(id) for id in ids))
119 + f.write(bytes(ids))
122 if __name__ == '__main__':
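
Two details here are easy to miss. Indexing a bytes object in Python 3 returns an int, which is why the comment check becomes a one-byte slice comparison, and bytes(ids) reproduces the Python 2 ''.join(chr(id) for id in ids) output byte for byte even when an id is 128 or larger (encoding via UTF-8 would expand such values to two bytes). A minimal sketch, not mozc code:

    line = b'# comment line'
    assert line[0] == 0x23      # indexing bytes yields an int in Python 3
    assert line[0:1] == b'#'    # a length-1 slice keeps the bytes type

    ids = [1, 5, 200]
    assert bytes(ids) == b'\x01\x05\xc8'                # one byte per id
    assert ''.join(chr(i) for i in ids).encode('utf-8') == b'\x01\x05\xc3\x88'
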
123 --- /src/dictionary/gen_suffix_data.py
124 +++ /src/dictionary/gen_suffix_data.py
126 opts = _ParseOptions()
129 - with open(opts.input, 'r') as stream:
130 + with open(opts.input, 'rb') as stream:
132 - line = line.rstrip('\r\n')
133 - fields = line.split('\t')
134 + line = line.rstrip(b'\r\n')
135 + fields = line.split(b'\t')
146 result.append((key, value, lid, rid, cost))
148 --- /src/dictionary/gen_user_pos_data.py
149 +++ /src/dictionary/gen_user_pos_data.py
151 f.write(struct.pack('<H', conjugation_id))
153 serialized_string_array_builder.SerializeToFile(
154 - sorted(string_index.iterkeys()), output_string_array)
155 + sorted(x.encode('utf-8') for x in string_index.keys()), output_string_array)
161 if options.output_pos_list:
162 serialized_string_array_builder.SerializeToFile(
163 - [pos for (pos, _) in user_pos.data], options.output_pos_list)
164 + [pos.encode('utf-8') for (pos, _) in user_pos.data], options.output_pos_list)
167 if __name__ == '__main__':
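
serialized_string_array_builder.SerializeToFile now receives byte strings, so the str keys and POS names are encoded up front. Encoding before sorting is safe because UTF-8 byte order matches code-point order; a minimal sketch, not mozc code:

    words = ['サ変名詞', '動詞', '名詞']
    # encode-then-sort and sort-then-encode give the same sequence
    assert (sorted(w.encode('utf-8') for w in words)
            == [w.encode('utf-8') for w in sorted(words)])
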
168 --- /src/dictionary/gen_zip_code_seed.py
169 +++ /src/dictionary/gen_zip_code_seed.py
171 address = unicodedata.normalize('NFKC', self.address)
172 line = '\t'.join([zip_code, '0', '0', str(ZIP_CODE_COST),
173 address, ZIP_CODE_LABEL])
174 - print line.encode('utf-8')
175 + print(line)
178 def ProcessZipCodeCSV(file_name):
179 @@ -105,26 +105,26 @@
181 def ReadZipCodeEntries(zip_code, level1, level2, level3):
182 """Read zip code entries."""
183 - return [ZipEntry(zip_code, u''.join([level1, level2, town]))
184 + return [ZipEntry(zip_code, ''.join([level1, level2, town]))
185 for town in ParseTownName(level3)]
188 def ReadJigyosyoEntry(zip_code, level1, level2, level3, name):
189 """Read jigyosyo entry."""
190 return ZipEntry(zip_code,
191 - u''.join([level1, level2, level3, u' ', name]))
192 + ''.join([level1, level2, level3, ' ', name]))
195 def ParseTownName(level3):
196 """Parse town name."""
197 - if level3.find(u'以下に掲載がない場合') != -1:
198 + if level3.find('以下に掲載がない場合') != -1:
201 assert CanParseAddress(level3), ('failed to be merged %s'
202 % level3.encode('utf-8'))
204 # We ignore additional information here.
205 - level3 = re.sub(u'（.*）', u'', level3, re.U)
206 + level3 = re.sub('（.*）', '', level3, re.U)
208 # For 地割, we have these cases.
213 # We simply use XX for them.
214 - chiwari_match = re.match(u'(\D*?)第?\d+地割.*', level3, re.U)
215 + chiwari_match = re.match(r'(\D*?)第?\d+地割.*', level3, re.U)
217 town = chiwari_match.group(1)
219 @@ -144,21 +144,21 @@
220 # -> XX町YY and (XX町)ZZ
223 - chou_match = re.match(u'(.*町)?(.*)', level3, re.U)
224 + chou_match = re.match('(.*町)?(.*)', level3, re.U)
228 if chou_match.group(1):
229 chou = chou_match.group(1)
230 rests = chou_match.group(2)
231 - return [chou + rest for rest in rests.split(u'、')]
232 + return [chou + rest for rest in rests.split('、')]
237 def CanParseAddress(address):
238 """Return true for valid address."""
239 - return (address.find(u'（') == -1 or
240 - address.find(u'）') != -1)
241 + return (address.find('（') == -1 or
242 + address.find('）') != -1)
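
Under Python 3 every str literal is already unicode, so the u'' prefixes simply disappear. The one subtle spot is stdout: printing an encoded value would emit the b'...' repr, which is why the entry line above is printed as a str. If the seed output must be UTF-8 bytes regardless of the build machine's locale, a hypothetical alternative (not part of this patch) is to write to the binary buffer directly:

    import sys

    def emit(line):
        # Hypothetical helper: bypass the text layer and force UTF-8 output.
        sys.stdout.buffer.write(line.encode('utf-8') + b'\n')

    emit('1000001\t0\t0\t7000\t東京都千代田区千代田\tZIP_CODE')  # sample values only
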
246 --- /src/dictionary/zip_code_util.py
247 +++ /src/dictionary/zip_code_util.py
252 - SpecialMergeZip(u'5900111', u'大阪府', u'堺市中区', [u'三原台']),
253 - SpecialMergeZip(u'8710046', u'大分県', u'中津市',
254 - [u'金谷', u'西堀端', u'東堀端', u'古金谷']),
255 - SpecialMergeZip(u'9218046', u'石川県', u'金沢市',
256 - [u'大桑町', u'三小牛町']),
257 + SpecialMergeZip('5900111', '大阪府', '堺市中区', ['三原台']),
258 + SpecialMergeZip('8710046', '大分県', '中津市',
259 + ['金谷', '西堀端', '東堀端', '古金谷']),
260 + SpecialMergeZip('9218046', '石川県', '金沢市',
265 --- /src/gui/character_pad/data/gen_cp932_map.py
266 +++ /src/gui/character_pad/data/gen_cp932_map.py
273 kUnicodePat = re.compile(r'0x[0-9A-Fa-f]{2,4}')
274 def IsValidUnicode(n):
276 fh = open(sys.argv[1])
278 for line in fh.readlines():
282 - array = string.split(line)
283 + array = line.split()
286 if eval(sjis) < 32 or not IsValidUnicode(ucs2):
288 result.setdefault(ucs2, sjis)
291 keys = sorted(result.keys())
293 - print "struct CP932MapData {"
294 - print " unsigned int ucs4;"
295 - print " unsigned short int sjis;"
298 - print "static const size_t kCP932MapDataSize = %d;" % (len(keys))
299 - print "static const CP932MapData kCP932MapData[] = {"
300 + print("struct CP932MapData {")
301 + print(" unsigned int ucs4;")
302 + print(" unsigned short int sjis;")
305 + print("static const size_t kCP932MapDataSize = %d;" % (len(keys)))
306 + print("static const CP932MapData kCP932MapData[] = {")
308 - print " { %s, %s }," % (n ,result[n])
311 + print(" { %s, %s }," % (n ,result[n]))
312 + print(" { 0, 0 }");
315 if __name__ == "__main__":
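
The module-level helpers string.split(), string.replace(), etc. were removed in Python 3; the str methods take over, and the now-unused 'import string' can be dropped along with them. Minimal sketch, not mozc code:

    # Python 2:  import string; array = string.split(line)
    # Python 3:  use the method on the object itself.
    line = '0x8140 0x3000 # IDEOGRAPHIC SPACE'
    array = line.split()
    assert array[:2] == ['0x8140', '0x3000']
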
317 --- /src/gui/character_pad/data/gen_local_character_map.py
318 +++ /src/gui/character_pad/data/gen_local_character_map.py
330 for line in fh.readlines():
334 - array = string.split(line)
335 + array = line.split()
336 jis = array[0].replace('0x', '')
337 ucs2 = array[1].replace('0x', '')
341 if IsValidUnicode(ucs2):
342 result.append([jis, ucs2])
345 return ["JISX0201", result]
350 for line in fh.readlines():
355 jis = array[1].replace('0x', '')
356 ucs2 = array[2].replace('0x', '')
357 if IsValidUnicode(ucs2):
358 result.append([jis, ucs2])
361 return ["JISX0208", result]
366 for line in fh.readlines():
371 jis = array[0].replace('0x', '')
372 ucs2 = array[1].replace('0x', '')
373 if IsValidUnicode(ucs2):
374 result.append([jis, ucs2])
377 return ["JISX0212", result]
382 for line in fh.readlines():
387 sjis = array[0].replace('0x', '')
388 @@ -100,19 +102,20 @@
390 if IsValidUnicode(ucs2):
391 result.append([sjis, ucs2])
394 return ["CP932", result]
399 - print "static const size_t k%sMapSize = %d;" % (name, len(result))
400 - print "static const mozc::gui::CharacterPalette::LocalCharacterMap k%sMap[] = {" % (name)
401 + print("static const size_t k%sMapSize = %d;" % (name, len(result)))
402 + print("static const mozc::gui::CharacterPalette::LocalCharacterMap k%sMap[] = {" % (name))
404 - print " { 0x%s, 0x%s }," % (n[0] ,n[1])
408 + print(" { 0x%s, 0x%s }," % (n[0] ,n[1]))
409 + print(" { 0, 0 }");
413 if __name__ == "__main__":
414 Output(LoadJISX0201(sys.argv[1]))
415 --- /src/gui/character_pad/data/gen_unicode_blocks.py
416 +++ /src/gui/character_pad/data/gen_unicode_blocks.py
421 -re = re.compile('^(.....?)\.\.(.....?); (.+)')
422 +re = re.compile(r'^(.....?)\.\.(.....?); (.+)')
425 - print "static const mozc::gui::CharacterPalette::UnicodeBlock kUnicodeBlockTable[] = {"
426 + print("static const mozc::gui::CharacterPalette::UnicodeBlock kUnicodeBlockTable[] = {")
427 fh = open(sys.argv[1])
428 for line in fh.readlines():
435 end = int(m.group(2), 16)
437 if start <= 0x2FFFF and end <= 0x2FFFF:
438 - print " { \"%s\", { %d, %d } }," % (name, start, end)
439 + print(" { \"%s\", { %d, %d } }," % (name, start, end))
442 - print " { NULL, { 0, 0 } }"
445 + print(" { NULL, { 0, 0 } }")
449 if __name__ == "__main__":
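
The added r prefix matters because '\.' inside a plain string literal is a deprecated escape sequence from Python 3.6 on (and slated to become a SyntaxError); the compiled pattern itself is unchanged. A minimal sketch, not mozc code, using the Blocks.txt line format the script parses:

    import re

    pattern = re.compile(r'^(.....?)\.\.(.....?); (.+)')
    m = pattern.match('0000..007F; Basic Latin')
    assert (m.group(1), m.group(2), m.group(3)) == ('0000', '007F', 'Basic Latin')
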
451 --- /src/gui/character_pad/data/gen_unicode_data.py
452 +++ /src/gui/character_pad/data/gen_unicode_data.py
456 results.append(" { %d, \"%s\" }," % (code, desc))
459 - print "struct UnicodeData {";
460 - print " char32 ucs4;";
461 - print " const char *description;";
464 - print "static const size_t kUnicodeDataSize = %d;" % (len(results))
465 - print "static const UnicodeData kUnicodeData[] = {";
466 + print("struct UnicodeData {");
467 + print(" char32 ucs4;");
468 + print(" const char *description;");
471 + print("static const size_t kUnicodeDataSize = %d;" % (len(results)))
472 + print("static const UnicodeData kUnicodeData[] = {");
475 - print " { 0, NULL }";
478 + print(" { 0, NULL }");
481 if __name__ == "__main__":
483 --- /src/gui/character_pad/data/gen_unihan_data.py
484 +++ /src/gui/character_pad/data/gen_unihan_data.py
494 - if n is not "NULL":
496 return "\"%s\"" % (n)
501 - if n is not "NULL":
502 - n = string.replace(n, '0-', 'JIS X 0208: 0x')
503 - n = string.replace(n, '1-', 'JIS X 0212: 0x')
504 - n = string.replace(n, '3-', 'JIS X 0213: 0x')
505 - n = string.replace(n, '4-', 'JIS X 0213: 0x')
506 - n = string.replace(n, 'A-', 'Vendors Ideographs: 0x')
507 - n = string.replace(n, '3A', 'JIS X 0213 2000: 0x')
509 + n = n.replace('0-', 'JIS X 0208: 0x')
510 + n = n.replace('1-', 'JIS X 0212: 0x')
511 + n = n.replace('3-', 'JIS X 0213: 0x')
512 + n = n.replace('4-', 'JIS X 0213: 0x')
513 + n = n.replace('A-', 'Vendors Ideographs: 0x')
514 + n = n.replace('3A', 'JIS X 0213 2000: 0x')
520 pat = re.compile(r'^(\d+)\.')
521 - if n is not "NULL":
525 result = rs[m.group(1)]
526 - return "\"%s\"" % (result.encode('string_escape'))
527 + return "\"%s\"" % result
538 pat = re.compile(r'^U\+(\S+)\s+(kTotalStrokes|kJapaneseKun|kJapaneseOn|kRSUnicode|kIRG_JSource)\t(.+)')
540 n = int(m.group(1), 16)
542 dic.setdefault(key, {}).setdefault(field, value)
545 keys = sorted(dic.keys())
547 - print "struct UnihanData {";
548 - print " unsigned int ucs4;";
549 + print("struct UnihanData {");
550 + print(" unsigned int ucs4;");
551 # Since the total strokes defined in Unihan data is Chinese-based
552 # number, we can't use it.
553 # print " unsigned char total_strokes;";
554 - print " const char *japanese_kun;";
555 - print " const char *japanese_on;";
556 + print(" const char *japanese_kun;");
557 + print(" const char *japanese_on;");
558 # Since the radical information defined in Unihan data is Chinese-based
559 # number, we can't use it.
560 # print " const char *radical;";
561 - print " const char *IRG_jsource;";
563 - print "static const size_t kUnihanDataSize = %d;" % (len(keys))
564 - print "static const UnihanData kUnihanData[] = {"
565 + print(" const char *IRG_jsource;");
567 + print("static const size_t kUnihanDataSize = %d;" % (len(keys)))
568 + print("static const UnihanData kUnihanData[] = {")
571 total_strokes = dic[key].get("kTotalStrokes", "0")
573 rad = GetRadical(dic[key].get("kRSUnicode", "NULL"))
574 code = GetCode(dic[key].get("kIRG_JSource", "NULL"))
575 # print " { 0x%s, %s, %s, %s, %s, %s }," % (key, total_strokes, kun, on, rad, code)
576 - print " { 0x%s, %s, %s, %s }," % (key, kun, on, code)
577 + print(" { 0x%s, %s, %s, %s }," % (key, kun, on, code))
582 if __name__ == "__main__":
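
Two final Python 3 details in this file: comparing against a string literal with is / is not relies on interning and raises a SyntaxWarning from Python 3.8 on, so the checks should read n != "NULL"; and the 'string_escape' codec no longer exists, which is why the .encode('string_escape') call is dropped. A minimal sketch of the shape of the check (hypothetical helper, not the actual mozc function):

    def quote_or_null(n):
        # Equality, not identity: `n is not "NULL"` warns under Python 3.8+.
        if n != 'NULL':
            return '"%s"' % n
        return n

    assert quote_or_null('NULL') == 'NULL'
    assert quote_or_null('0-3021') == '"0-3021"'
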