Update mojo sdk to rev 1dc8a9a5db73d3718d99917fadf31f5fb2ebad4f
[chromium-blink-merge.git] / third_party / gtk+ / gtk / compose-parse.py
blobc9729e16b1446939d13cec942408afd000891ec1
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
4 # compose-parse.py, version 1.3
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
13 from re import findall, match, split, sub
14 from string import atoi
15 from unicodedata import normalize
16 from urllib import urlretrieve
17 from os.path import isfile, getsize
18 from copy import copy
20 import sys
21 import getopt
# We grab files off the web, left and right.
URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.2.0/ucd/UnicodeData.txt'
# Optional local file; its sequences are merged on top of the upstream Compose file.
FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt'

# We currently support keysyms of size 2; once upstream xorg gets sorted,
# we might produce some tables with size 2 and some with size 4.
SIZEOFINT = 2

# Current max compose sequence length; in case it gets increased.
WIDTHOFCOMPOSETABLE = 5

# keysym name -> value, filled from gdkkeysyms.h by process_gdkkeysymsh().
keysymdatabase = {}
# keysym name -> unicode codepoint, filled from keysyms.txt by process_keysymstxt().
keysymunicodedatabase = {}
# codepoint -> [name, decomposition fields, combining class], filled by process_unicodedata_file().
unicodedatabase = {}
# C header boilerplate emitted before/after the generated compose table (--gtk mode).
headerfile_start = """/* GTK - The GIMP Tool Kit
 * Copyright (C) 2007, 2008 GNOME Foundation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

/*
 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
 * using the input files
 *  Input   : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
 *  Input   : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
 *  Input   : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 *
 * This table is optimised for space and requires special handling to access the content.
 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
 *
 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
 */

/*
 * Modified by the GTK+ Team and others 2007, 2008.  See the AUTHORS
 * file for a list of people on the GTK+ Team.  See the ChangeLog
 * files for a list of changes.  These files are distributed with
 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
 */

#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__

/* === These are the original comments of the file; we keep for historical purposes ===
 *
 * The following table was generated from the X compose tables include with
 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
 * to obtain the relevant perl scripts.
 *
 * The following compose letter letter sequences confliced
 *   Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
 *   ETH (Icelandic, Faroese, old English, IPA)  [ D- -D d- -d ]
 *   Amacron/amacron and ordfeminine; resolved to ordfeminine  [ _A A_ a_ _a ]
 *   Amacron/amacron and Atilde/atilde; resolved to atilde  [ -A A- a- -a ]
 *   Omacron/Omacron and masculine; resolved to masculine  [ _O O_ o_ _o ]
 *   Omacron/omacron and Otilde/atilde; resolved to otilde  [ -O O- o- -o ]
 *
 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
 *   spanish. atilde and otilde are used at least for Portuguese ]
 *
 *   at and Aring; resolved to Aring  [ AA ]
 *   guillemotleft and caron; resolved to guillemotleft  [ << ]
 *   ogonek and cedilla; resolved to cedilla  [ ,, ]
 *
 * This probably should be resolved by first checking an additional set of compose tables
 * that depend on the locale or selected input method.
 */

static const guint16 gtk_compose_seqs_compact[] = {"""

headerfile_end = """};

#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
"""
def stringtohex(str):
    """Parse a hexadecimal string (e.g. "03F3") into an integer.

    Uses the builtin int() with base 16 instead of the long-deprecated
    string.atoi() (removed in Python 3); behaviour is identical for the
    hex fields this script feeds it.
    """
    # NOTE: the parameter keeps its original name `str` (shadowing the
    # builtin) so the external signature is unchanged.
    return int(str, 16)
def factorial(n):
    """Return n! (factorial); any n <= 1 yields 1."""
    result = 1
    while n > 1:
        result *= n
        n -= 1
    return result
def uniq(*args):
    """Concatenate the given lists and drop duplicates, keeping the
    first-seen order.  Uses equality (not hashing), so unhashable
    elements such as lists are fine."""
    merged = []
    for one_list in args:
        merged.extend(one_list)
    deduped = []
    for element in merged:
        if element not in deduped:
            deduped.append(element)
    return deduped
def all_permutations(seq):
    """Yield every permutation of seq (list or string), in the same order
    as the ASPN cookbook recipe the original was borrowed from."""
    if len(seq) <= 1:
        yield seq
    else:
        head = seq[0:1]  # one-element slice works for both lists and strings
        for tail_perm in all_permutations(seq[1:]):
            for cut in range(len(tail_perm) + 1):
                yield tail_perm[:cut] + head + tail_perm[cut:]
def usage():
    # Print the command-line help text (Python 2 print statement with a
    # triple-quoted block; spacing inside the string is display-only).
    print """compose-parse available parameters:
    -h, --help              this craft
    -s, --statistics        show overall statistics (both algorithmic, non-algorithmic)
    -a, --algorithmic       show sequences saved with algorithmic optimisation
    -g, --gtk               show entries that go to GTK+
    -u, --unicodedatatxt    show compose sequences derived from UnicodeData.txt (from unicode.org)
    -v, --verbose           show verbose output
    -p, --plane1            show plane1 compose sequences
    -n, --numeric           when used with --gtk, create file with numeric values only
    -e, --gtk-expanded      when used with --gtk, create file that repeats first column; not usable in GTK+
        --all-sequences     when used with --gtk, create file with entries rejected by default
    Default is to show statistics.
    """
# Parse the command line; any getopt failure prints the help text and exits.
try:
    opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
        "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded", "all-sequences"])
except:
    usage()
    sys.exit(2)

# Output-mode flags, all off by default; set from the options below.
opt_statistics = False
opt_algorithmic = False
opt_gtk = False
opt_unicodedatatxt = False
opt_verbose = False
opt_plane1 = False
opt_numeric = False
opt_gtkexpanded = False
opt_allsequences = False

for o, a in opts:
    if o in ("-h", "--help"):
        usage()
        sys.exit()
    if o in ("-s", "--statistics"):
        opt_statistics = True
    if o in ("-a", "--algorithmic"):
        opt_algorithmic = True
    if o in ("-g", "--gtk"):
        opt_gtk = True
    if o in ("-u", "--unicodedatatxt"):
        opt_unicodedatatxt = True
    if o in ("-v", "--verbose"):
        opt_verbose = True
    if o in ("-p", "--plane1"):
        opt_plane1 = True
    if o in ("-n", "--numeric"):
        opt_numeric = True
    if o in ("-e", "--gtk-expanded"):
        opt_gtkexpanded = True
    if o == "--all-sequences":
        opt_allsequences = True

# When no explicit mode was requested, fall back to printing statistics.
if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt:
    opt_statistics = True
def download_hook(blocks_transferred, block_size, file_size):
    """ A download hook to provide some feedback when downloading """
    # Called by urlretrieve() once per transferred block: print a header on
    # the first call (verbose mode only), then one '#' per block as a
    # crude progress bar.
    if blocks_transferred == 0:
        if file_size > 0:
            if opt_verbose:
                print "Downloading", file_size, "bytes: ",
        else:
            if opt_verbose:
                print "Downloading: ",
    sys.stdout.write('#')
    sys.stdout.flush()
def download_file(url):
    """ Downloads a file provided a URL. Returns the filename. """
    """ Borks on failure """
    # The local name is the URL's last path component; an existing,
    # non-empty local file is treated as a valid cached copy.
    localfilename = url.split('/')[-1]
    if not isfile(localfilename) or getsize(localfilename) <= 0:
        if opt_verbose:
            print "Downloading ", url, "..."
        try:
            urlretrieve(url, localfilename, download_hook)
        except IOError, (errno, strerror):
            print "I/O error(%s): %s" % (errno, strerror)
            sys.exit(-1)
        except:
            print "Unexpected error: ", sys.exc_info()[0]
            sys.exit(-1)
        print " done."
    else:
        if opt_verbose:
            print "Using cached file for ", url
    return localfilename
def process_gdkkeysymsh():
    """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
    """ Fills up keysymdb with contents """
    # Returns a dict mapping keysym name (without the GDK_KEY_ prefix) to
    # its numeric value; exits on any malformed line.
    filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH)
    try:
        gdkkeysymsh = open(filename_gdkkeysymsh, 'r')
    except IOError, (errno, strerror):
        print "I/O error(%s): %s" % (errno, strerror)
        sys.exit(-1)
    except:
        print "Unexpected error: ", sys.exc_info()[0]
        sys.exit(-1)

    """ Parse the gdkkeysyms.h file and place contents in keysymdb """
    linenum_gdkkeysymsh = 0
    keysymdb = {}
    for line in gdkkeysymsh.readlines():
        linenum_gdkkeysymsh += 1
        line = line.strip()
        # Only '#define GDK_KEY_<name> <hexvalue>' lines are of interest.
        if line == "" or not match('^#define GDK_KEY_', line):
            continue
        components = split('\s+', line)
        if len(components) < 3:
            print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
                % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
            print "Was expecting 3 items in the line"
            sys.exit(-1)
        if not match('^GDK_KEY_', components[1]):
            print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
                % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
            print "Was expecting a keysym starting with GDK_KEY_"
            sys.exit(-1)
        if match('^0x[0-9a-fA-F]+$', components[2]):
            unival = long(components[2][2:], 16)
            if unival == 0:
                continue
            # Strip the 8-character 'GDK_KEY_' prefix from the name.
            keysymdb[components[1][8:]] = unival
        else:
            print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
                % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line}
            print "Was expecting a hexadecimal number at the end of the line"
            sys.exit(-1)
    gdkkeysymsh.close()

    """ Patch up the keysymdb with some of our own stuff """

    """ This is for a missing keysym from the currently upstream file """
    #keysymdb['dead_stroke'] = 0x338

    """ This is for a missing keysym from the currently upstream file """
    ###keysymdb['dead_belowring'] = 0x323
    ###keysymdb['dead_belowmacron'] = 0x331
    ###keysymdb['dead_belowcircumflex'] = 0x32d
    ###keysymdb['dead_belowtilde'] = 0x330
    ###keysymdb['dead_belowbreve'] = 0x32e
    ###keysymdb['dead_belowdiaeresis'] = 0x324

    """ This is^Wwas preferential treatment for Greek """
    # keysymdb['dead_tilde'] = 0x342
    """ This is^was preferential treatment for Greek """
    #keysymdb['combining_tilde'] = 0x342

    """ Fixing VoidSymbol """
    keysymdb['VoidSymbol'] = 0xFFFF

    return keysymdb
def process_keysymstxt():
    """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
    """ This file keeps a record between keysyms <-> unicode chars """
    # Returns a dict mapping keysym name -> unicode codepoint, patched
    # with entries missing from the upstream file.
    filename_keysymstxt = download_file(URL_KEYSYMSTXT)
    try:
        keysymstxt = open(filename_keysymstxt, 'r')
    except IOError, (errno, strerror):
        print "I/O error(%s): %s" % (errno, strerror)
        sys.exit(-1)
    except:
        print "Unexpected error: ", sys.exc_info()[0]
        sys.exit(-1)

    """ Parse the keysyms.txt file and place content in keysymdb """
    linenum_keysymstxt = 0
    keysymdb = {}
    for line in keysymstxt.readlines():
        linenum_keysymstxt += 1
        line = line.strip()
        if line == "" or match('^#', line):
            continue
        components = split('\s+', line)
        if len(components) < 5:
            print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
                % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line}
            print "Was expecting 5 items in the line"
            sys.exit(-1)
        # Column 2 holds the unicode value as 'Uxxxx'; column 5 the keysym name.
        if match('^U[0-9a-fA-F]+$', components[1]):
            unival = long(components[1][1:], 16)
            if unival == 0:
                continue
            keysymdb[components[4]] = unival
    keysymstxt.close()

    """ Patch up the keysymdb with some of our own stuff """
    """ This is for a missing keysym from the currently upstream file """
    ###keysymdb['dead_belowring'] = 0x323
    ###keysymdb['dead_belowmacron'] = 0x331
    ###keysymdb['dead_belowcircumflex'] = 0x32d
    ###keysymdb['dead_belowtilde'] = 0x330
    ###keysymdb['dead_belowbreve'] = 0x32e
    ###keysymdb['dead_belowdiaeresis'] = 0x324

    """ This is preferential treatment for Greek """
    """ => we get more savings if used for Greek """
    # keysymdb['dead_tilde'] = 0x342
    """ This is preferential treatment for Greek """
    # keysymdb['combining_tilde'] = 0x342

    """ This is for a missing keysym from Markus Kuhn's db """
    keysymdb['dead_stroke'] = 0x338
    """ This is for a missing keysym from Markus Kuhn's db """
    keysymdb['Oslash'] = 0x0d8
    """ This is for a missing keysym from Markus Kuhn's db """
    keysymdb['Ssharp'] = 0x1e9e

    """ This is for a missing (recently added) keysym """
    keysymdb['dead_psili'] = 0x313
    """ This is for a missing (recently added) keysym """
    keysymdb['dead_dasia'] = 0x314

    """ Allows to import Multi_key sequences """
    keysymdb['Multi_key'] = 0xff20

    # Subscript digits plus two dead accents absent from keysyms.txt.
    keysymdb['zerosubscript'] = 0x2080
    keysymdb['onesubscript'] = 0x2081
    keysymdb['twosubscript'] = 0x2082
    keysymdb['threesubscript'] = 0x2083
    keysymdb['foursubscript'] = 0x2084
    keysymdb['fivesubscript'] = 0x2085
    keysymdb['sixsubscript'] = 0x2086
    keysymdb['sevensubscript'] = 0x2087
    keysymdb['eightsubscript'] = 0x2088
    keysymdb['ninesubscript'] = 0x2089
    keysymdb['dead_doublegrave'] = 0x030F
    keysymdb['dead_invertedbreve'] = 0x0311

    return keysymdb
def keysymvalue(keysym, file = "n/a", linenum = 0):
    """ Extracts a value from the keysym """
    """ Find the value of keysym, using the data from keysyms """
    """ Use file and linenum to when reporting errors """
    # Resolution order: empty string -> 0; gdkkeysyms.h table; 'U89AB'
    # notation; '0x...' literal.  Anything else is fatal.
    # NOTE(review): the file/linenum parameters are accepted but never
    # actually used in the error message below.
    if keysym == "":
        return 0
    if keysymdatabase.has_key(keysym):
        return keysymdatabase[keysym]
    elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
        return atoi(keysym[1:], 16)
    elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
        return atoi(keysym[2:], 16)
    else:
        print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
        #return -1
        sys.exit(-1)
def keysymunicodevalue(keysym, file = "n/a", linenum = 0):
    """ Extracts a value from the keysym """
    """ Find the value of keysym, using the data from keysyms """
    """ Use file and linenum to when reporting errors """
    # Same lookup chain as keysymvalue(), but against the keysyms.txt
    # unicode table instead of the gdkkeysyms.h table.
    # NOTE(review): file/linenum are accepted but unused, as above.
    if keysym == "":
        return 0
    if keysymunicodedatabase.has_key(keysym):
        return keysymunicodedatabase[keysym]
    elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]):
        return atoi(keysym[1:], 16)
    elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]):
        return atoi(keysym[2:], 16)
    else:
        print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym }
        sys.exit(-1)
def rename_combining(seq):
    """Map 'combining_*' keysym names onto their 'dead_*' equivalents and
    normalise the two multi-word dead-accent spellings."""
    renamed = []
    for keysym in seq:
        if keysym.startswith('combining_'):
            keysym = 'dead_' + keysym[len('combining_'):]
        if keysym == 'dead_double_grave':
            keysym = 'dead_doublegrave'
        elif keysym == 'dead_inverted_breve':
            keysym = 'dead_invertedbreve'
        renamed.append(keysym)
    return renamed
# Build the two keysym lookup tables from their upstream sources.
keysymunicodedatabase = process_keysymstxt()
keysymdatabase = process_gdkkeysymsh()

""" Grab and open the compose file from upstream """
filename_compose = download_file(URL_COMPOSE)
try:
    composefile = open(filename_compose, 'r')
except IOError, (errno, strerror):
    print "I/O error(%s): %s" % (errno, strerror)
    sys.exit(-1)
except:
    print "Unexpected error: ", sys.exc_info()[0]
    sys.exit(-1)

""" Look if there is a lookaside (supplementary) compose file in the current
    directory, and if so, open, then merge with upstream Compose file.
"""
xorg_compose_sequences_raw = []
for seq in composefile.readlines():
    xorg_compose_sequences_raw.append(seq)

try:
    composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r')
    for seq in composefile_lookaside.readlines():
        xorg_compose_sequences_raw.append(seq)
except IOError, (errno, strerror):
    # A missing lookaside file is not an error; it is purely optional.
    if opt_verbose:
        print "I/O error(%s): %s" % (errno, strerror)
        print "Did not find lookaside compose file. Continuing..."
except:
    print "Unexpected error: ", sys.exc_info()[0]
    sys.exit(-1)
""" Parse the compose file in xorg_compose_sequences"""
# Each accepted Compose line becomes [keysym, ..., keysym, codepoint].
# Sequences whose modifiers NFC-normalise with the base char to a single
# character are diverted into xorg_compose_sequences_algorithmic instead.
xorg_compose_sequences = []
xorg_compose_sequences_algorithmic = []
linenum_compose = 0
comment_nest_depth = 0
for line in xorg_compose_sequences_raw:
    linenum_compose += 1
    line = line.strip()
    if match("^XCOMM", line) or match("^#", line):
        continue

    # Remove C-style comments that open and close on the same line.
    line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line)

    comment_start = line.find("/*")

    if comment_start >= 0:
        # An unclosed '/*': keep only the text before it (or nothing if we
        # are already inside a comment) and track the nesting depth.
        if comment_nest_depth == 0:
            line = line[:comment_start]
        else:
            line = ""

        comment_nest_depth += 1
    else:
        comment_end = line.find("*/")

        if comment_end >= 0:
            comment_nest_depth -= 1

            if comment_nest_depth < 0:
                print "Invalid comment %(linenum_compose)d in %(filename)s: \
Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose }
                exit(-1)

            if comment_nest_depth > 0:
                line = ""
            else:
                line = line[comment_end + 2:]

    # NOTE(review): 'is ""' tests identity rather than equality; it works
    # in CPython only because the empty string is interned — '== ""' would
    # be the correct comparison.
    if line is "":
        continue

    #line = line[:-1]
    components = split(':', line)
    if len(components) != 2:
        print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
/value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose }
        exit(-1)
    (seq, val ) = split(':', line)
    seq = seq.strip()
    val = val.strip()
    # Keysym names on the left of ':'; result char and codepoint on the right.
    raw_sequence = findall('\w+', seq)
    values = split('\s+', val)
    unichar_temp = split('"', values[0])
    unichar = unichar_temp[1]
    if len(values) == 1:
        continue
    codepointstr = values[1]
    if values[1] == '#':
        # No codepoints that are >1 characters yet.
        continue
    # Rewrite a leading 'Uxxxx' keysym into '0xxxxx' notation.
    if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]):
        raw_sequence[0] = '0x' + raw_sequence[0][1:]
    if match('^U[0-9a-fA-F]+$', codepointstr):
        codepoint = long(codepointstr[1:], 16)
    elif keysymunicodedatabase.has_key(codepointstr):
        #if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
        #print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
        #print raw_sequence, codepointstr
        codepoint = keysymunicodedatabase[codepointstr]
    else:
        print
        print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
 %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line }
        exit(-1)
    sequence = rename_combining(raw_sequence)
    # Reject sequences containing keysyms above the BMP (or unknown ones).
    reject_this = False
    for i in sequence:
        if keysymvalue(i) > 0xFFFF:
            reject_this = True
            if opt_plane1:
                print sequence
            break
        if keysymvalue(i) < 0:
            reject_this = True
            break
    if reject_this:
        continue
    # Skip the Greek combining accents handled specially elsewhere.
    if "U0342" in sequence or \
       "U0313" in sequence or \
       "U0314" in sequence or \
       "0x0313" in sequence or \
       "0x0342" in sequence or \
       "0x0314" in sequence:
        continue
    # Below-accents are unsupported; drop those sequences entirely.
    if "dead_belowring" in sequence or\
       "dead_currency" in sequence or\
       "dead_belowcomma" in sequence or\
       "dead_belowmacron" in sequence or\
       "dead_belowtilde" in sequence or\
       "dead_belowbreve" in sequence or\
       "dead_belowdiaeresis" in sequence or\
       "dead_belowcircumflex" in sequence:
        continue
    #for i in range(len(sequence)):
    #	if sequence[i] == "0x0342":
    #		sequence[i] = "dead_tilde"
    if "Multi_key" not in sequence:
        """ Ignore for now >0xFFFF keysyms """
        if codepoint < 0xFFFF:
            original_sequence = copy(sequence)
            stats_sequence = copy(sequence)
            base = sequence.pop()
            basechar = keysymvalue(base, filename_compose, linenum_compose)

            if basechar < 0xFFFF:
                counter = 1
                unisequence = []
                not_normalised = True
                skipping_this = False
                for i in range(0, len(sequence)):
                    """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
                        because of lack of dead_perispomeni (i.e. conflict)
                    """
                    bc = basechar
                    """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
                        skipping_this = True
                        break
                    if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
                        skipping_this = True
                        break
                    if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
                        skipping_this = True
                        break
                    if sequence[-1] == "dead_psili":
                        sequence[i] = "dead_horn"
                    if sequence[-1] == "dead_dasia":
                        sequence[-1] = "dead_ogonek"
                    """
                    # Pop each modifier keysym and collect its unicode char.
                    unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose)))

                if skipping_this:
                    unisequence = []
                # Try every modifier ordering; if any NFC-normalises with
                # the base char to one character, the sequence is algorithmic.
                for perm in all_permutations(unisequence):
                    # print counter, original_sequence, unichr(basechar) + "".join(perm)
                    # print counter, map(unichr, perm)
                    normalized = normalize('NFC', unichr(basechar) + "".join(perm))
                    if len(normalized) == 1:
                        # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
                        # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
                        # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
                        stats_sequence_data = map(keysymunicodevalue, stats_sequence)
                        stats_sequence_data.append(normalized)
                        xorg_compose_sequences_algorithmic.append(stats_sequence_data)
                        not_normalised = False
                        break;
                    counter += 1
                if not_normalised or opt_allsequences:
                    original_sequence.append(codepoint)
                    xorg_compose_sequences.append(original_sequence)
                    """ print xorg_compose_sequences[-1] """

            else:
                print "Error in base char !?!"
                exit(-2)
        else:
            print "OVER", sequence
            exit(-1)
    else:
        # Multi_key sequences are kept verbatim in the main table.
        sequence.append(codepoint)
        xorg_compose_sequences.append(sequence)
        """ print xorg_compose_sequences[-1] """
def sequence_cmp(x, y):
    """Old-style cmp for compose rows: order by the first keysym's value,
    then by row length, then by the remaining keysym values in turn.
    Positions beyond the row length compare equal (returns 0)."""
    lhs = keysymvalue(x[0])
    rhs = keysymvalue(y[0])
    if lhs != rhs:
        return 1 if lhs > rhs else -1
    if len(x) != len(y):
        return 1 if len(x) > len(y) else -1
    for pos in (1, 2, 3, 4):
        # From the third keysym on, stop once the rows are too short.
        if pos >= 2 and len(x) < pos + 2:
            return 0
        lhs = keysymvalue(x[pos])
        rhs = keysymvalue(y[pos])
        if lhs != rhs:
            return 1 if lhs > rhs else -1
    return 0
def sequence_unicode_cmp(x, y):
    """Old-style cmp identical in shape to sequence_cmp(), but comparing
    via the keysyms.txt unicode values instead of the GDK values."""
    lhs = keysymunicodevalue(x[0])
    rhs = keysymunicodevalue(y[0])
    if lhs != rhs:
        return 1 if lhs > rhs else -1
    if len(x) != len(y):
        return 1 if len(x) > len(y) else -1
    for pos in (1, 2, 3, 4):
        # From the third keysym on, stop once the rows are too short.
        if pos >= 2 and len(x) < pos + 2:
            return 0
        lhs = keysymunicodevalue(x[pos])
        rhs = keysymunicodevalue(y[pos])
        if lhs != rhs:
            return 1 if lhs > rhs else -1
    return 0
def sequence_algorithmic_cmp(x, y):
    """Old-style cmp: shorter sequences sort first; equal-length
    sequences compare element by element."""
    if len(x) != len(y):
        return -1 if len(x) < len(y) else 1
    for left, right in zip(x, y):
        if left != right:
            return -1 if left < right else 1
    return 0
# Sort the main table (py2 cmp-style sort), then drop unicode-equal duplicates.
xorg_compose_sequences.sort(sequence_cmp)

xorg_compose_sequences_uniqued = []
first_time = True
item = None
for next_item in xorg_compose_sequences:
    if first_time:
        first_time = False
        item = next_item
    if sequence_unicode_cmp(item, next_item) != 0:
        xorg_compose_sequences_uniqued.append(item)
        item = next_item
# NOTE(review): the final 'item' is never appended after the loop, so the
# last distinct sequence is dropped from the uniqued list.

xorg_compose_sequences = copy(xorg_compose_sequences_uniqued)

# Count remaining sequences that involve Multi_key (for the statistics output).
counter_multikey = 0
for item in xorg_compose_sequences:
    if findall('Multi_key', "".join(item[:-1])) != []:
        counter_multikey += 1

xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp)
xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic)

# Table-shape statistics: number of distinct first keysyms, padding zeroes a
# flat 6-wide array would need, total entries, and Greek algorithmic entries.
firstitem = ""
num_first_keysyms = 0
zeroes = 0
num_entries = 0
num_algorithmic_greek = 0
for sequence in xorg_compose_sequences:
    if keysymvalue(firstitem) != keysymvalue(sequence[0]):
        firstitem = sequence[0]
        num_first_keysyms += 1
    zeroes += 6 - len(sequence) + 1
    num_entries += 1

for sequence in xorg_compose_sequences_algorithmic_uniqued:
    ch = ord(sequence[-1:][0])
    # Greek blocks: 0x370-0x3ff and 0x1f00-0x1fff.
    if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
        num_algorithmic_greek += 1

if opt_algorithmic:
    for sequence in xorg_compose_sequences_algorithmic_uniqued:
        letter = "".join(sequence[-1:])
        print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] },
        for elem in sequence[:-2]:
            print "<0x%(keysym)04X>," % { 'keysym': elem },
        """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
        print "], recomposed as", letter.encode('utf-8'), "verified"
def num_of_keysyms(seq):
    """A compose row is [keysym, ..., codepoint]; every element except the
    trailing codepoint is a keysym."""
    return len(seq) - 1
def convert_UnotationToHex(arg):
    """Rewrite a four-digit uppercase U-notation string ('U017F') as a hex
    literal ('0x017F'); any other value is returned untouched."""
    if isinstance(arg, str):
        if match('^U[0-9A-F]{4}$', arg):
            return '0x' + arg[1:]
    return arg
def addprefix_GDK(arg):
    """Format one table cell: hex literals pass through unchanged, plain
    keysym names gain a 'GDK_KEY_' prefix; both get a trailing ', '."""
    prefix = '' if arg.startswith('0x') else 'GDK_KEY_'
    return prefix + arg + ', '
if opt_gtk:
    # Build the compact two-part table:
    #   part 1: one row per distinct first keysym, holding offsets into
    #           part 2 for each possible sequence length;
    #   part 2: the sequences themselves, first keysym omitted.
    first_keysym = ""
    sequence = []
    compose_table = []
    ct_second_part = []
    ct_sequence_width = 2
    start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1)
    we_finished = False
    counter = 0

    sequence_iterator = iter(xorg_compose_sequences)
    sequence = sequence_iterator.next()
    while True:
        first_keysym = sequence[0]					# Set the first keysym
        compose_table.append([first_keysym, 0, 0, 0, 0, 0])
        while sequence[0] == first_keysym:
            # Count sequences of each length sharing this first keysym.
            compose_table[counter][num_of_keysyms(sequence)-1] += 1
            try:
                sequence = sequence_iterator.next()
            except StopIteration:
                we_finished = True
                break
        if we_finished:
            break
        counter += 1

    # Convert the per-length counts into cumulative offsets into part 2.
    ct_index = start_offset
    for line_num in range(len(compose_table)):
        for i in range(WIDTHOFCOMPOSETABLE):
            occurences = compose_table[line_num][i+1]
            compose_table[line_num][i+1] = ct_index
            ct_index += occurences * (i+2)

    for sequence in xorg_compose_sequences:
        ct_second_part.append(map(convert_UnotationToHex, sequence))

    print headerfile_start
    for i in compose_table:
        if opt_gtkexpanded:
            print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) },
            print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) }
        elif not match('^0x', i[0]):
            print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
        else:
            print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) }
    for i in ct_second_part:
        if opt_numeric:
            # Numeric mode: every keysym printed as a resolved hex value.
            for ks in i[1:][:-1]:
                print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
            print '0x%(cp)04X, ' % { 'cp':i[-1] }
            """
            for ks in i[:-1]:
                print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
            print '0x%(cp)04X, ' % { 'cp':i[-1] }
            """
        elif opt_gtkexpanded:
            print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] }
        else:
            # Default: drop the first keysym (it lives in part 1 already).
            print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] }
    print headerfile_end
def redecompose(codepoint):
    """Recursively expand codepoint via the unicodedatabase decompositions.

    Returns a (possibly nested) list: [codepoint] when no decomposition is
    recorded, otherwise one redecompose() result per decomposition field,
    skipping a leading "<tag>" compatibility marker when present.
    """
    (name, decomposition, combiningclass) = unicodedatabase[codepoint]
    if decomposition[0] == '' or decomposition[0] == '0':
        return [codepoint]
    if match('<\w+>', decomposition[0]):
        fields = decomposition[1:]
    else:
        fields = decomposition
    return [redecompose(stringtohex(field)) for field in fields]
def process_unicodedata_file(verbose = False):
    """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
    # Fills the global unicodedatabase, then counts how many characters can
    # be produced algorithmically via NFC recomposition (printing each one
    # when verbose, or a statistics summary otherwise).
    filename_unicodedatatxt = download_file(URL_UNICODEDATATXT)
    try:
        unicodedatatxt = open(filename_unicodedatatxt, 'r')
    except IOError, (errno, strerror):
        print "I/O error(%s): %s" % (errno, strerror)
        sys.exit(-1)
    except:
        print "Unexpected error: ", sys.exc_info()[0]
        sys.exit(-1)
    for line in unicodedatatxt.readlines():
        # NOTE(review): line[0] == "" can never be true for a non-empty
        # string, and an empty line would raise IndexError first.
        if line[0] == "" or line[0] == '#':
            continue
        line = line[:-1]
        uniproperties = split(';', line)
        codepoint = stringtohex(uniproperties[0])
        """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
        if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF):
            continue
        name = uniproperties[1]
        category = uniproperties[2]
        combiningclass = uniproperties[3]
        decomposition = uniproperties[5]
        unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass]

    counter_combinations = 0
    counter_combinations_greek = 0
    counter_entries = 0
    counter_entries_greek = 0

    for item in unicodedatabase.keys():
        (name, decomposition, combiningclass) = unicodedatabase[item]
        if decomposition[0] == '':
            continue
            # NOTE(review): unreachable -- follows a 'continue'.
            print name, "is empty"
        elif match('<\w+>', decomposition[0]):
            continue
            # NOTE(review): unreachable -- follows a 'continue'.
            print name, "has weird", decomposition[0]
        else:
            sequence = map(stringtohex, decomposition)
            chrsequence = map(unichr, sequence)
            normalized = normalize('NFC', "".join(chrsequence))

            """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """
            # Flatten the (up to doubly-nested) lists redecompose() returns.
            decomposedsequence = []
            for subseq in map(redecompose, sequence):
                for seqitem in subseq:
                    if isinstance(seqitem, list):
                        for i in seqitem:
                            if isinstance(i, list):
                                for j in i:
                                    decomposedsequence.append(j)
                            else:
                                decomposedsequence.append(i)
                    else:
                        decomposedsequence.append(seqitem)
            recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence)))
            # Count only multi-char sequences that recompose to one char.
            if len(recomposedchar) == 1 and len(decomposedsequence) > 1:
                counter_entries += 1
                counter_combinations += factorial(len(decomposedsequence)-1)
                ch = item
                # Greek blocks: 0x370-0x3ff and 0x1f00-0x1fff.
                if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff:
                    counter_entries_greek += 1
                    counter_combinations_greek += factorial(len(decomposedsequence)-1)
                if verbose:
                    print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) },
                    print "[",
                    for elem in decomposedsequence:
                        print '<0x%(hex)04X>,' % { 'hex': elem },
                    print "], recomposed as", recomposedchar,
                    if unichr(item) == recomposedchar:
                        print "verified"

    if verbose == False:
        print "Unicode statistics from UnicodeData.txt"
        print "Number of entries that can be algorithmically produced     :", counter_entries
        print "  of which are for Greek                                   :", counter_entries_greek
        print "Number of compose sequence combinations requiring          :", counter_combinations
        print "  of which are for Greek                                   :", counter_combinations_greek
        print "Note: We do not include partial compositions, "
        print "thus the slight discrepancy in the figures"
        print
# Final reporting: verbose UnicodeData dump and/or the statistics summary.
if opt_unicodedatatxt:
    process_unicodedata_file(True)

if opt_statistics:
    print
    print "Total number of compose sequences (from file)              :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic)
    print "  of which can be expressed algorithmically                :", len(xorg_compose_sequences_algorithmic)
    print "  of which cannot be expressed algorithmically             :", len(xorg_compose_sequences)
    print "    of which have Multi_key                                :", counter_multikey
    print
    print "Algorithmic (stats for Xorg Compose file)"
    print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic)
    print "Number of sequences off due to algo (uniq(sort(array)))    :", len(xorg_compose_sequences_algorithmic_uniqued)
    print "  of which are for Greek                                   :", num_algorithmic_greek
    print
    process_unicodedata_file()
    print "Not algorithmic (stats from Xorg Compose file)"
    print "Number of sequences                                        :", len(xorg_compose_sequences)
    print "Flat array looks like                                      :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
    print "Flat array would have taken up (in bytes)                  :", num_entries * 2 * 6, "bytes from the GTK+ library"
    print "Number of items in flat array                              :", len(xorg_compose_sequences) * 6
    print "  of which are zeroes                                      :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent"
    print "Number of different first items                            :", num_first_keysyms
    print "Number of max bytes (if using flat array)                  :", num_entries * 2 * 6
    print "Number of savings                                          :", zeroes * 2 - num_first_keysyms * 2 * 5
    print
    print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
    print "                                                           :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5
    print
    print "Existing (old) implementation in GTK+"
    print "Number of sequences in old gtkimcontextsimple.c            :", 691
    print "The existing (old) implementation in GTK+ takes up         :", 691 * 2 * 12, "bytes"