2 # -*- coding: utf-8 -*-
4 # compose-parse.py, version 1.3
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
13 from re
import findall
, match
, split
, sub
14 from string
import atoi
15 from unicodedata
import normalize
16 from urllib
import urlretrieve
17 from os
.path
import isfile
, getsize
23 # We grab files off the web, left and right.
24 URL_COMPOSE
= 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
25 URL_KEYSYMSTXT
= "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
26 URL_GDKKEYSYMSH
= "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
27 URL_UNICODEDATATXT
= 'http://www.unicode.org/Public/5.2.0/ucd/UnicodeData.txt'
28 FILENAME_COMPOSE_SUPPLEMENTARY
= 'gtk-compose-lookaside.txt'
30 # We currently support keysyms of size 2; once upstream xorg gets sorted,
31 # we might produce some tables with size 2 and some with size 4.
34 # Current max compose sequence length; in case it gets increased.
35 WIDTHOFCOMPOSETABLE
= 5
38 keysymunicodedatabase
= {}
41 headerfile_start
= """/* GTK - The GIMP Tool Kit
42 * Copyright (C) 2007, 2008 GNOME Foundation
44 * This library is free software; you can redistribute it and/or
45 * modify it under the terms of the GNU Lesser General Public
46 * License as published by the Free Software Foundation; either
47 * version 2 of the License, or (at your option) any later version.
49 * This library is distributed in the hope that it will be useful,
50 * but WITHOUT ANY WARRANTY; without even the implied warranty of
51 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
52 * Lesser General Public License for more details.
54 * You should have received a copy of the GNU Lesser General Public
55 * License along with this library; if not, write to the
56 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
57 * Boston, MA 02111-1307, USA.
61 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
62 * using the input files
63 * Input : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
64 * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
65 * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
67 * This table is optimised for space and requires special handling to access the content.
68 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
70 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
71 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
75 * Modified by the GTK+ Team and others 2007, 2008. See the AUTHORS
76 * file for a list of people on the GTK+ Team. See the ChangeLog
77 * files for a list of changes. These files are distributed with
78 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
81 #ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
82 #define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
84 /* === These are the original comments of the file; we keep for historical purposes ===
86 * The following table was generated from the X compose tables include with
87 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
88 * to obtain the relevant perl scripts.
90 * The following compose letter letter sequences confliced
91 * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
92 * ETH (Icelandic, Faroese, old English, IPA) [ D- -D d- -d ]
93 * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ]
94 * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ]
95 * Omacron/Omacron and masculine; resolved to masculine [ _O O_ o_ _o ]
96 * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ]
98 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
99 * spanish. atilde and otilde are used at least for Portuguese ]
101 * at and Aring; resolved to Aring [ AA ]
102 * guillemotleft and caron; resolved to guillemotleft [ << ]
103 * ogonek and cedilla; resolved to cedilla [ ,, ]
105 * This probably should be resolved by first checking an additional set of compose tables
106 * that depend on the locale or selected input method.
109 static const guint16 gtk_compose_seqs_compact[] = {"""
111 headerfile_end
= """};
113 #endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
def stringtohex(str):
    """ Converts a hexadecimal string (e.g. '0301') to its integer value. """
    # int() with an explicit base replaces the deprecated string.atoi();
    # behaviour is identical for the hex strings this script feeds in.
    # NOTE(review): the parameter name shadows the builtin 'str'; kept
    # unchanged so any keyword callers elsewhere in the file still work.
    return int(str, 16)
122 return n
* factorial(n
-1)
125 """ Performs a uniq operation on a list or lists """
128 theInputList
+= theList
130 for elem
in theInputList
:
131 if elem
not in theFinalList
:
132 theFinalList
.append(elem
)
137 def all_permutations(seq
):
138 """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
139 """ Produces all permutations of the items of a list """
143 for perm
in all_permutations(seq
[1:]):
144 for i
in range(len(perm
)+1):
145 #nb str[0:1] works in both string and list contexts
146 yield perm
[:i
] + seq
[0:1] + perm
[i
:]
149 print """compose-parse available parameters:
150 -h, --help this craft
151 -s, --statistics show overall statistics (both algorithmic, non-algorithmic)
152 -a, --algorithmic show sequences saved with algorithmic optimisation
153 -g, --gtk show entries that go to GTK+
154 -u, --unicodedatatxt show compose sequences derived from UnicodeData.txt (from unicode.org)
155 -v, --verbose show verbose output
156 -p, --plane1 show plane1 compose sequences
157 -n, --numeric when used with --gtk, create file with numeric values only
158 -e, --gtk-expanded when used with --gtk, create file that repeats first column; not usable in GTK+
159 --all-sequences when used with --gtk, create file with entries rejected by default
160 Default is to show statistics.
164 opts
, args
= getopt
.getopt(sys
.argv
[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
165 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded", "all-sequences"])
170 opt_statistics
= False
171 opt_algorithmic
= False
173 opt_unicodedatatxt
= False
177 opt_gtkexpanded
= False
178 opt_allsequences
= False
181 if o
in ("-h", "--help"):
184 if o
in ("-s", "--statistics"):
185 opt_statistics
= True
186 if o
in ("-a", "--algorithmic"):
187 opt_algorithmic
= True
188 if o
in ("-g", "--gtk"):
190 if o
in ("-u", "--unicodedatatxt"):
191 opt_unicodedatatxt
= True
192 if o
in ("-v", "--verbose"):
194 if o
in ("-p", "--plane1"):
196 if o
in ("-n", "--numeric"):
198 if o
in ("-e", "--gtk-expanded"):
199 opt_gtkexpanded
= True
200 if o
== "--all-sequences":
201 opt_allsequences
= True
203 if not opt_algorithmic
and not opt_gtk
and not opt_unicodedatatxt
:
204 opt_statistics
= True
206 def download_hook(blocks_transferred
, block_size
, file_size
):
207 """ A download hook to provide some feedback when downloading """
208 if blocks_transferred
== 0:
211 print "Downloading", file_size
, "bytes: ",
214 print "Downloading: ",
215 sys
.stdout
.write('#')
219 def download_file(url
):
220 """ Downloads a file provided a URL. Returns the filename. """
221 """ Borks on failure """
222 localfilename
= url
.split('/')[-1]
223 if not isfile(localfilename
) or getsize(localfilename
) <= 0:
225 print "Downloading ", url
, "..."
227 urlretrieve(url
, localfilename
, download_hook
)
228 except IOError, (errno
, strerror
):
229 print "I/O error(%s): %s" % (errno
, strerror
)
232 print "Unexpected error: ", sys
.exc_info()[0]
237 print "Using cached file for ", url
240 def process_gdkkeysymsh():
241 """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
242 """ Fills up keysymdb with contents """
243 filename_gdkkeysymsh
= download_file(URL_GDKKEYSYMSH
)
245 gdkkeysymsh
= open(filename_gdkkeysymsh
, 'r')
246 except IOError, (errno
, strerror
):
247 print "I/O error(%s): %s" % (errno
, strerror
)
250 print "Unexpected error: ", sys
.exc_info()[0]
253 """ Parse the gdkkeysyms.h file and place contents in keysymdb """
254 linenum_gdkkeysymsh
= 0
256 for line
in gdkkeysymsh
.readlines():
257 linenum_gdkkeysymsh
+= 1
259 if line
== "" or not match('^#define GDK_KEY_', line
):
261 components
= split('\s+', line
)
262 if len(components
) < 3:
263 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
264 % {'linenum': linenum_gdkkeysymsh
, 'filename': filename_gdkkeysymsh
, 'line': line
}
265 print "Was expecting 3 items in the line"
267 if not match('^GDK_KEY_', components
[1]):
268 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
269 % {'linenum': linenum_gdkkeysymsh
, 'filename': filename_gdkkeysymsh
, 'line': line
}
270 print "Was expecting a keysym starting with GDK_KEY_"
272 if match('^0x[0-9a-fA-F]+$', components
[2]):
273 unival
= long(components
[2][2:], 16)
276 keysymdb
[components
[1][8:]] = unival
278 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
279 % {'linenum': linenum_gdkkeysymsh
, 'filename': filename_gdkkeysymsh
, 'line': line
}
280 print "Was expecting a hexadecimal number at the end of the line"
284 """ Patch up the keysymdb with some of our own stuff """
286 """ This is for a missing keysym from the currently upstream file """
287 #keysymdb['dead_stroke'] = 0x338
289 """ This is for a missing keysym from the currently upstream file """
290 ###keysymdb['dead_belowring'] = 0x323
291 ###keysymdb['dead_belowmacron'] = 0x331
292 ###keysymdb['dead_belowcircumflex'] = 0x32d
293 ###keysymdb['dead_belowtilde'] = 0x330
294 ###keysymdb['dead_belowbreve'] = 0x32e
295 ###keysymdb['dead_belowdiaeresis'] = 0x324
297 """ This is^Wwas preferential treatment for Greek """
298 # keysymdb['dead_tilde'] = 0x342
299 """ This is^was preferential treatment for Greek """
300 #keysymdb['combining_tilde'] = 0x342
302 """ Fixing VoidSymbol """
303 keysymdb
['VoidSymbol'] = 0xFFFF
307 def process_keysymstxt():
308 """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
309 """ This file keeps a record between keysyms <-> unicode chars """
310 filename_keysymstxt
= download_file(URL_KEYSYMSTXT
)
312 keysymstxt
= open(filename_keysymstxt
, 'r')
313 except IOError, (errno
, strerror
):
314 print "I/O error(%s): %s" % (errno
, strerror
)
317 print "Unexpected error: ", sys
.exc_info()[0]
320 """ Parse the keysyms.txt file and place content in keysymdb """
321 linenum_keysymstxt
= 0
323 for line
in keysymstxt
.readlines():
324 linenum_keysymstxt
+= 1
326 if line
== "" or match('^#', line
):
328 components
= split('\s+', line
)
329 if len(components
) < 5:
330 print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
331 % {'linenum': linenum_keysymstxt
, 'filename': filename_keysymstxt
, 'line': line
}
332 print "Was expecting 5 items in the line"
334 if match('^U[0-9a-fA-F]+$', components
[1]):
335 unival
= long(components
[1][1:], 16)
338 keysymdb
[components
[4]] = unival
341 """ Patch up the keysymdb with some of our own stuff """
342 """ This is for a missing keysym from the currently upstream file """
343 ###keysymdb['dead_belowring'] = 0x323
344 ###keysymdb['dead_belowmacron'] = 0x331
345 ###keysymdb['dead_belowcircumflex'] = 0x32d
346 ###keysymdb['dead_belowtilde'] = 0x330
347 ###keysymdb['dead_belowbreve'] = 0x32e
348 ###keysymdb['dead_belowdiaeresis'] = 0x324
350 """ This is preferential treatment for Greek """
351 """ => we get more savings if used for Greek """
352 # keysymdb['dead_tilde'] = 0x342
353 """ This is preferential treatment for Greek """
354 # keysymdb['combining_tilde'] = 0x342
356 """ This is for a missing keysym from Markus Kuhn's db """
357 keysymdb
['dead_stroke'] = 0x338
358 """ This is for a missing keysym from Markus Kuhn's db """
359 keysymdb
['Oslash'] = 0x0d8
360 """ This is for a missing keysym from Markus Kuhn's db """
361 keysymdb
['Ssharp'] = 0x1e9e
363 """ This is for a missing (recently added) keysym """
364 keysymdb
['dead_psili'] = 0x313
365 """ This is for a missing (recently added) keysym """
366 keysymdb
['dead_dasia'] = 0x314
368 """ Allows to import Multi_key sequences """
369 keysymdb
['Multi_key'] = 0xff20
371 keysymdb
['zerosubscript'] = 0x2080
372 keysymdb
['onesubscript'] = 0x2081
373 keysymdb
['twosubscript'] = 0x2082
374 keysymdb
['threesubscript'] = 0x2083
375 keysymdb
['foursubscript'] = 0x2084
376 keysymdb
['fivesubscript'] = 0x2085
377 keysymdb
['sixsubscript'] = 0x2086
378 keysymdb
['sevensubscript'] = 0x2087
379 keysymdb
['eightsubscript'] = 0x2088
380 keysymdb
['ninesubscript'] = 0x2089
381 keysymdb
['dead_doublegrave'] = 0x030F
382 keysymdb
['dead_invertedbreve'] = 0x0311
386 def keysymvalue(keysym
, file = "n/a", linenum
= 0):
387 """ Extracts a value from the keysym """
388 """ Find the value of keysym, using the data from keysyms """
389 """ Use file and linenum to when reporting errors """
392 if keysymdatabase
.has_key(keysym
):
393 return keysymdatabase
[keysym
]
394 elif keysym
[0] == 'U' and match('[0-9a-fA-F]+$', keysym
[1:]):
395 return atoi(keysym
[1:], 16)
396 elif keysym
[:2] == '0x' and match('[0-9a-fA-F]+$', keysym
[2:]):
397 return atoi(keysym
[2:], 16)
399 print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym
}
403 def keysymunicodevalue(keysym
, file = "n/a", linenum
= 0):
404 """ Extracts a value from the keysym """
405 """ Find the value of keysym, using the data from keysyms """
406 """ Use file and linenum to when reporting errors """
409 if keysymunicodedatabase
.has_key(keysym
):
410 return keysymunicodedatabase
[keysym
]
411 elif keysym
[0] == 'U' and match('[0-9a-fA-F]+$', keysym
[1:]):
412 return atoi(keysym
[1:], 16)
413 elif keysym
[:2] == '0x' and match('[0-9a-fA-F]+$', keysym
[2:]):
414 return atoi(keysym
[2:], 16)
416 print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym
}
def rename_combining(seq):
    """ Maps 'combining_*' keysym names in a compose sequence to their
    'dead_*' equivalents, returning a new list. """
    # Two names additionally need their underscore dropped to match the
    # spelling used by the keysym databases (dead_doublegrave etc.).
    filtered_sequence = []
    for ks in seq:
        if findall('^combining_', ks):
            ks = sub('^combining_', 'dead_', ks)
        if ks == 'dead_double_grave':
            ks = 'dead_doublegrave'
        if ks == 'dead_inverted_breve':
            ks = 'dead_invertedbreve'
        filtered_sequence.append(ks)
    return filtered_sequence
432 keysymunicodedatabase
= process_keysymstxt()
433 keysymdatabase
= process_gdkkeysymsh()
435 """ Grab and open the compose file from upstream """
436 filename_compose
= download_file(URL_COMPOSE
)
438 composefile
= open(filename_compose
, 'r')
439 except IOError, (errno
, strerror
):
440 print "I/O error(%s): %s" % (errno
, strerror
)
443 print "Unexpected error: ", sys
.exc_info()[0]
446 """ Look if there is a lookaside (supplementary) compose file in the current
447 directory, and if so, open, then merge with upstream Compose file.
449 xorg_compose_sequences_raw
= []
450 for seq
in composefile
.readlines():
451 xorg_compose_sequences_raw
.append(seq
)
454 composefile_lookaside
= open(FILENAME_COMPOSE_SUPPLEMENTARY
, 'r')
455 for seq
in composefile_lookaside
.readlines():
456 xorg_compose_sequences_raw
.append(seq
)
457 except IOError, (errno
, strerror
):
459 print "I/O error(%s): %s" % (errno
, strerror
)
460 print "Did not find lookaside compose file. Continuing..."
462 print "Unexpected error: ", sys
.exc_info()[0]
465 """ Parse the compose file in xorg_compose_sequences"""
466 xorg_compose_sequences
= []
467 xorg_compose_sequences_algorithmic
= []
469 comment_nest_depth
= 0
470 for line
in xorg_compose_sequences_raw
:
473 if match("^XCOMM", line
) or match("^#", line
):
476 line
= sub(r
"\/\*([^\*]*|[\*][^/])\*\/", "", line
)
478 comment_start
= line
.find("/*")
480 if comment_start
>= 0:
481 if comment_nest_depth
== 0:
482 line
= line
[:comment_start
]
486 comment_nest_depth
+= 1
488 comment_end
= line
.find("*/")
491 comment_nest_depth
-= 1
493 if comment_nest_depth
< 0:
494 print "Invalid comment %(linenum_compose)d in %(filename)s: \
495 Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose
, "filename": filename_compose
}
498 if comment_nest_depth
> 0:
501 line
= line
[comment_end
+ 2:]
507 components
= split(':', line
)
508 if len(components
) != 2:
509 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
510 /value pair found" % { "linenum_compose": linenum_compose
, "filename": filename_compose
}
512 (seq
, val
) = split(':', line
)
515 raw_sequence
= findall('\w+', seq
)
516 values
= split('\s+', val
)
517 unichar_temp
= split('"', values
[0])
518 unichar
= unichar_temp
[1]
521 codepointstr
= values
[1]
523 # No codepoints that are >1 characters yet.
525 if raw_sequence
[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence
[0][1:]):
526 raw_sequence
[0] = '0x' + raw_sequence
[0][1:]
527 if match('^U[0-9a-fA-F]+$', codepointstr
):
528 codepoint
= long(codepointstr
[1:], 16)
529 elif keysymunicodedatabase
.has_key(codepointstr
):
530 #if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
531 #print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
532 #print raw_sequence, codepointstr
533 codepoint
= keysymunicodedatabase
[codepointstr
]
536 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
537 %(line)s" % { "linenum_compose": linenum_compose
, "filename": filename_compose
, "line": line
}
539 sequence
= rename_combining(raw_sequence
)
542 if keysymvalue(i
) > 0xFFFF:
547 if keysymvalue(i
) < 0:
552 if "U0342" in sequence
or \
553 "U0313" in sequence
or \
554 "U0314" in sequence
or \
555 "0x0313" in sequence
or \
556 "0x0342" in sequence
or \
557 "0x0314" in sequence
:
559 if "dead_belowring" in sequence
or\
560 "dead_currency" in sequence
or\
561 "dead_belowcomma" in sequence
or\
562 "dead_belowmacron" in sequence
or\
563 "dead_belowtilde" in sequence
or\
564 "dead_belowbreve" in sequence
or\
565 "dead_belowdiaeresis" in sequence
or\
566 "dead_belowcircumflex" in sequence
:
568 #for i in range(len(sequence)):
569 # if sequence[i] == "0x0342":
570 # sequence[i] = "dead_tilde"
571 if "Multi_key" not in sequence
:
572 """ Ignore for now >0xFFFF keysyms """
573 if codepoint
< 0xFFFF:
574 original_sequence
= copy(sequence
)
575 stats_sequence
= copy(sequence
)
576 base
= sequence
.pop()
577 basechar
= keysymvalue(base
, filename_compose
, linenum_compose
)
579 if basechar
< 0xFFFF:
582 not_normalised
= True
583 skipping_this
= False
584 for i
in range(0, len(sequence
)):
585 """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
586 because of lack of dead_perispomeni (i.e. conflict)
589 """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
592 if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
595 if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
598 if sequence[-1] == "dead_psili":
599 sequence[i] = "dead_horn"
600 if sequence[-1] == "dead_dasia":
601 sequence[-1] = "dead_ogonek"
603 unisequence
.append(unichr(keysymunicodevalue(sequence
.pop(), filename_compose
, linenum_compose
)))
607 for perm
in all_permutations(unisequence
):
608 # print counter, original_sequence, unichr(basechar) + "".join(perm)
609 # print counter, map(unichr, perm)
610 normalized
= normalize('NFC', unichr(basechar
) + "".join(perm
))
611 if len(normalized
) == 1:
612 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
613 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
614 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
615 stats_sequence_data
= map(keysymunicodevalue
, stats_sequence
)
616 stats_sequence_data
.append(normalized
)
617 xorg_compose_sequences_algorithmic
.append(stats_sequence_data
)
618 not_normalised
= False
621 if not_normalised
or opt_allsequences
:
622 original_sequence
.append(codepoint
)
623 xorg_compose_sequences
.append(original_sequence
)
624 """ print xorg_compose_sequences[-1] """
627 print "Error in base char !?!"
630 print "OVER", sequence
633 sequence
.append(codepoint
)
634 xorg_compose_sequences
.append(sequence
)
635 """ print xorg_compose_sequences[-1] """
637 def sequence_cmp(x
, y
):
638 if keysymvalue(x
[0]) > keysymvalue(y
[0]):
640 elif keysymvalue(x
[0]) < keysymvalue(y
[0]):
642 elif len(x
) > len(y
):
644 elif len(x
) < len(y
):
646 elif keysymvalue(x
[1]) > keysymvalue(y
[1]):
648 elif keysymvalue(x
[1]) < keysymvalue(y
[1]):
652 elif keysymvalue(x
[2]) > keysymvalue(y
[2]):
654 elif keysymvalue(x
[2]) < keysymvalue(y
[2]):
658 elif keysymvalue(x
[3]) > keysymvalue(y
[3]):
660 elif keysymvalue(x
[3]) < keysymvalue(y
[3]):
664 elif keysymvalue(x
[4]) > keysymvalue(y
[4]):
666 elif keysymvalue(x
[4]) < keysymvalue(y
[4]):
671 def sequence_unicode_cmp(x
, y
):
672 if keysymunicodevalue(x
[0]) > keysymunicodevalue(y
[0]):
674 elif keysymunicodevalue(x
[0]) < keysymunicodevalue(y
[0]):
676 elif len(x
) > len(y
):
678 elif len(x
) < len(y
):
680 elif keysymunicodevalue(x
[1]) > keysymunicodevalue(y
[1]):
682 elif keysymunicodevalue(x
[1]) < keysymunicodevalue(y
[1]):
686 elif keysymunicodevalue(x
[2]) > keysymunicodevalue(y
[2]):
688 elif keysymunicodevalue(x
[2]) < keysymunicodevalue(y
[2]):
692 elif keysymunicodevalue(x
[3]) > keysymunicodevalue(y
[3]):
694 elif keysymunicodevalue(x
[3]) < keysymunicodevalue(y
[3]):
698 elif keysymunicodevalue(x
[4]) > keysymunicodevalue(y
[4]):
700 elif keysymunicodevalue(x
[4]) < keysymunicodevalue(y
[4]):
705 def sequence_algorithmic_cmp(x
, y
):
708 elif len(x
) > len(y
):
711 for i
in range(len(x
)):
719 xorg_compose_sequences
.sort(sequence_cmp
)
721 xorg_compose_sequences_uniqued
= []
724 for next_item
in xorg_compose_sequences
:
728 if sequence_unicode_cmp(item
, next_item
) != 0:
729 xorg_compose_sequences_uniqued
.append(item
)
732 xorg_compose_sequences
= copy(xorg_compose_sequences_uniqued
)
735 for item
in xorg_compose_sequences
:
736 if findall('Multi_key', "".join(item
[:-1])) != []:
737 counter_multikey
+= 1
739 xorg_compose_sequences_algorithmic
.sort(sequence_algorithmic_cmp
)
740 xorg_compose_sequences_algorithmic_uniqued
= uniq(xorg_compose_sequences_algorithmic
)
743 num_first_keysyms
= 0
746 num_algorithmic_greek
= 0
747 for sequence
in xorg_compose_sequences
:
748 if keysymvalue(firstitem
) != keysymvalue(sequence
[0]):
749 firstitem
= sequence
[0]
750 num_first_keysyms
+= 1
751 zeroes
+= 6 - len(sequence
) + 1
754 for sequence
in xorg_compose_sequences_algorithmic_uniqued
:
755 ch
= ord(sequence
[-1:][0])
756 if ch
>= 0x370 and ch
<= 0x3ff or ch
>= 0x1f00 and ch
<= 0x1fff:
757 num_algorithmic_greek
+= 1
761 for sequence
in xorg_compose_sequences_algorithmic_uniqued
:
762 letter
= "".join(sequence
[-1:])
763 print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter
)), 'uni': letter
.encode('utf-8'), 'base': sequence
[-2] },
764 for elem
in sequence
[:-2]:
765 print "<0x%(keysym)04X>," % { 'keysym': elem
},
766 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
767 print "], recomposed as", letter
.encode('utf-8'), "verified"
769 def num_of_keysyms(seq
):
def convert_UnotationToHex(arg):
    """ Rewrites a U-notation codepoint string ('U0301') as a hex literal
    string ('0x0301'); anything else is returned unchanged. """
    # Only exactly-four-uppercase-hex-digit forms are converted; other
    # strings (keysym names) and non-strings (integer codepoints) pass
    # through untouched, since this is applied with map() over mixed
    # sequences.
    if isinstance(arg, str):
        if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
            return sub('^U', '0x', arg)
    return arg
def addprefix_GDK(arg):
    """ Formats a keysym for the expanded GTK+ table: hex literals are
    emitted as-is, symbolic keysyms get the GDK_KEY_ prefix; a trailing
    comma-space is appended either way. """
    # Guard clause instead of if/else: a hex value needs no prefix.
    if match('^0x', arg):
        return '%(arg)s, ' % { 'arg': arg }
    return 'GDK_KEY_%(arg)s, ' % { 'arg': arg }
789 ct_sequence_width
= 2
790 start_offset
= num_first_keysyms
* (WIDTHOFCOMPOSETABLE
+1)
794 sequence_iterator
= iter(xorg_compose_sequences
)
795 sequence
= sequence_iterator
.next()
797 first_keysym
= sequence
[0] # Set the first keysym
798 compose_table
.append([first_keysym
, 0, 0, 0, 0, 0])
799 while sequence
[0] == first_keysym
:
800 compose_table
[counter
][num_of_keysyms(sequence
)-1] += 1
802 sequence
= sequence_iterator
.next()
803 except StopIteration:
810 ct_index
= start_offset
811 for line_num
in range(len(compose_table
)):
812 for i
in range(WIDTHOFCOMPOSETABLE
):
813 occurences
= compose_table
[line_num
][i
+1]
814 compose_table
[line_num
][i
+1] = ct_index
815 ct_index
+= occurences
* (i
+2)
817 for sequence
in xorg_compose_sequences
:
818 ct_second_part
.append(map(convert_UnotationToHex
, sequence
))
820 print headerfile_start
821 for i
in compose_table
:
823 print "0x%(ks)04X," % { "ks": keysymvalue(i
[0]) },
824 print '%(str)s' % { 'str': "".join(map(lambda x
: str(x
) + ", ", i
[1:])) }
825 elif not match('^0x', i
[0]):
826 print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x
: str(x
) + ", ", i
)) }
828 print '%(str)s' % { 'str': "".join(map(lambda x
: str(x
) + ", ", i
)) }
829 for i
in ct_second_part
:
831 for ks
in i
[1:][:-1]:
832 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks
) },
833 print '0x%(cp)04X, ' % { 'cp':i
[-1] }
836 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
837 print '0x%(cp)04X, ' % { 'cp':i[-1] }
839 elif opt_gtkexpanded
:
840 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK
, i
[:-1])), 'cp':i
[-1] }
842 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK
, i
[:-1][1:])), 'cp':i
[-1] }
845 def redecompose(codepoint
):
846 (name
, decomposition
, combiningclass
) = unicodedatabase
[codepoint
]
847 if decomposition
[0] == '' or decomposition
[0] == '0':
849 if match('<\w+>', decomposition
[0]):
850 numdecomposition
= map(stringtohex
, decomposition
[1:])
851 return map(redecompose
, numdecomposition
)
852 numdecomposition
= map(stringtohex
, decomposition
)
853 return map(redecompose
, numdecomposition
)
855 def process_unicodedata_file(verbose
= False):
856 """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
857 filename_unicodedatatxt
= download_file(URL_UNICODEDATATXT
)
859 unicodedatatxt
= open(filename_unicodedatatxt
, 'r')
860 except IOError, (errno
, strerror
):
861 print "I/O error(%s): %s" % (errno
, strerror
)
864 print "Unexpected error: ", sys
.exc_info()[0]
866 for line
in unicodedatatxt
.readlines():
867 if line
[0] == "" or line
[0] == '#':
870 uniproperties
= split(';', line
)
871 codepoint
= stringtohex(uniproperties
[0])
872 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
873 if codepoint
> 0xFFFF or (codepoint
>= 0x4E00 and codepoint
<= 0x9FFF) or (codepoint
>= 0xF900 and codepoint
<= 0xFAFF):
875 name
= uniproperties
[1]
876 category
= uniproperties
[2]
877 combiningclass
= uniproperties
[3]
878 decomposition
= uniproperties
[5]
879 unicodedatabase
[codepoint
] = [name
, split('\s+', decomposition
), combiningclass
]
881 counter_combinations
= 0
882 counter_combinations_greek
= 0
884 counter_entries_greek
= 0
886 for item
in unicodedatabase
.keys():
887 (name
, decomposition
, combiningclass
) = unicodedatabase
[item
]
888 if decomposition
[0] == '':
890 print name
, "is empty"
891 elif match('<\w+>', decomposition
[0]):
893 print name
, "has weird", decomposition
[0]
895 sequence
= map(stringtohex
, decomposition
)
896 chrsequence
= map(unichr, sequence
)
897 normalized
= normalize('NFC', "".join(chrsequence
))
899 """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """
900 decomposedsequence
= []
901 for subseq
in map(redecompose
, sequence
):
902 for seqitem
in subseq
:
903 if isinstance(seqitem
, list):
905 if isinstance(i
, list):
907 decomposedsequence
.append(j
)
909 decomposedsequence
.append(i
)
911 decomposedsequence
.append(seqitem
)
912 recomposedchar
= normalize('NFC', "".join(map(unichr, decomposedsequence
)))
913 if len(recomposedchar
) == 1 and len(decomposedsequence
) > 1:
915 counter_combinations
+= factorial(len(decomposedsequence
)-1)
917 if ch
>= 0x370 and ch
<= 0x3ff or ch
>= 0x1f00 and ch
<= 0x1fff:
918 counter_entries_greek
+= 1
919 counter_combinations_greek
+= factorial(len(decomposedsequence
)-1)
921 print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item
, 'uni':unichr(item
) },
923 for elem
in decomposedsequence
:
924 print '<0x%(hex)04X>,' % { 'hex': elem
},
925 print "], recomposed as", recomposedchar
,
926 if unichr(item
) == recomposedchar
:
930 print "Unicode statistics from UnicodeData.txt"
931 print "Number of entries that can be algorithmically produced :", counter_entries
932 print " of which are for Greek :", counter_entries_greek
933 print "Number of compose sequence combinations requiring :", counter_combinations
934 print " of which are for Greek :", counter_combinations_greek
935 print "Note: We do not include partial compositions, "
936 print "thus the slight discrepancy in the figures"
939 if opt_unicodedatatxt
:
940 process_unicodedata_file(True)
944 print "Total number of compose sequences (from file) :", len(xorg_compose_sequences
) + len(xorg_compose_sequences_algorithmic
)
945 print " of which can be expressed algorithmically :", len(xorg_compose_sequences_algorithmic
)
946 print " of which cannot be expressed algorithmically :", len(xorg_compose_sequences
)
947 print " of which have Multi_key :", counter_multikey
949 print "Algorithmic (stats for Xorg Compose file)"
950 print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic
)
951 print "Number of sequences off due to algo (uniq(sort(array))) :", len(xorg_compose_sequences_algorithmic_uniqued
)
952 print " of which are for Greek :", num_algorithmic_greek
954 process_unicodedata_file()
955 print "Not algorithmic (stats from Xorg Compose file)"
956 print "Number of sequences :", len(xorg_compose_sequences
)
957 print "Flat array looks like :", len(xorg_compose_sequences
), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
958 print "Flat array would have taken up (in bytes) :", num_entries
* 2 * 6, "bytes from the GTK+ library"
959 print "Number of items in flat array :", len(xorg_compose_sequences
) * 6
960 print " of which are zeroes :", zeroes
, "or ", (100 * zeroes
) / (len(xorg_compose_sequences
) * 6), " per cent"
961 print "Number of different first items :", num_first_keysyms
962 print "Number of max bytes (if using flat array) :", num_entries
* 2 * 6
963 print "Number of savings :", zeroes
* 2 - num_first_keysyms
* 2 * 5
965 print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
966 print " :", num_entries
* 2 * 6 - zeroes
* 2 + num_first_keysyms
* 2 * 5
968 print "Existing (old) implementation in GTK+"
969 print "Number of sequences in old gtkimcontextsimple.c :", 691
970 print "The existing (old) implementation in GTK+ takes up :", 691 * 2 * 12, "bytes"