2 # -*- coding: utf-8 -*-
4 # compose-parse.py, version 1.3
6 # multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c)
7 # the script produces statistics and information about the whole process, run with --help for more.
9 # You may need to switch your python installation to utf-8, if you get 'ascii' codec errors.
11 # Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft.
13 from re
import findall
, match
, split
, sub
14 from string
import atoi
15 from unicodedata
import normalize
16 from urllib
import urlretrieve
17 from os
.path
import isfile
, getsize
23 # We grab files off the web, left and right.
24 URL_COMPOSE
= 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre'
25 URL_KEYSYMSTXT
= "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt"
26 URL_GDKKEYSYMSH
= "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h"
27 URL_UNICODEDATATXT
= 'http://www.unicode.org/Public/5.2.0/ucd/UnicodeData.txt'
28 FILENAME_COMPOSE_SUPPLEMENTARY
= 'gtk-compose-lookaside.txt'
30 # We currently support keysyms of size 2; once upstream xorg gets sorted,
31 # we might produce some tables with size 2 and some with size 4.
34 # Current max compose sequence length; in case it gets increased.
35 WIDTHOFCOMPOSETABLE
= 5
38 keysymunicodedatabase
= {}
41 headerfile_start
= """/* GTK - The GIMP Tool Kit
42 * Copyright (C) 2007, 2008 GNOME Foundation
44 * This library is free software; you can redistribute it and/or
45 * modify it under the terms of the GNU Lesser General Public
46 * License as published by the Free Software Foundation; either
47 * version 2 of the License, or (at your option) any later version.
49 * This library is distributed in the hope that it will be useful,
50 * but WITHOUT ANY WARRANTY; without even the implied warranty of
51 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
52 * Lesser General Public License for more details.
54 * You should have received a copy of the GNU Lesser General Public
55 * License along with this library; if not, write to the
56 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
57 * Boston, MA 02111-1307, USA.
61 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896
62 * using the input files
63 * Input : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre
64 * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt
65 * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
67 * This table is optimised for space and requires special handling to access the content.
68 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c
70 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h
71 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896
75 * Modified by the GTK+ Team and others 2007, 2008. See the AUTHORS
76 * file for a list of people on the GTK+ Team. See the ChangeLog
77 * files for a list of changes. These files are distributed with
78 * GTK+ at ftp://ftp.gtk.org/pub/gtk/.
81 #ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
82 #define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__
84 /* === These are the original comments of the file; we keep for historical purposes ===
86 * The following table was generated from the X compose tables include with
87 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com>
88 * to obtain the relevant perl scripts.
90 * The following compose letter letter sequences confliced
91 * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over
92 * ETH (Icelandic, Faroese, old English, IPA) [ D- -D d- -d ]
93 * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ]
94 * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ]
95 * Omacron/Omacron and masculine; resolved to masculine [ _O O_ o_ _o ]
96 * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ]
98 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for
99 * spanish. atilde and otilde are used at least for Portuguese ]
101 * at and Aring; resolved to Aring [ AA ]
102 * guillemotleft and caron; resolved to guillemotleft [ << ]
103 * ogonek and cedilla; resolved to cedilla [ ,, ]
105 * This probably should be resolved by first checking an additional set of compose tables
106 * that depend on the locale or selected input method.
109 static const guint16 gtk_compose_seqs_compact[] = {"""
111 headerfile_end
= """};
113 #endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */
def stringtohex(str):
    """ Converts a hexadecimal string (e.g. '0301') to its integer value. """
    # int() with an explicit base replaces the deprecated string.atoi();
    # behaviour is identical for the hex strings this script feeds in.
    # NOTE(review): the parameter name shadows the builtin 'str'; kept
    # unchanged so any keyword callers elsewhere in the file still work.
    return int(str, 16)
122 return n
* factorial(n
-1)
125 """ Performs a uniq operation on a list or lists """
128 theInputList
+= theList
130 for elem
in theInputList
:
131 if elem
not in theFinalList
:
132 theFinalList
.append(elem
)
137 def all_permutations(seq
):
138 """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """
139 """ Produces all permutations of the items of a list """
143 for perm
in all_permutations(seq
[1:]):
144 for i
in range(len(perm
)+1):
145 #nb str[0:1] works in both string and list contexts
146 yield perm
[:i
] + seq
[0:1] + perm
[i
:]
149 print """compose-parse available parameters:
150 -h, --help this craft
151 -s, --statistics show overall statistics (both algorithmic, non-algorithmic)
152 -a, --algorithmic show sequences saved with algorithmic optimisation
153 -g, --gtk show entries that go to GTK+
154 -u, --unicodedatatxt show compose sequences derived from UnicodeData.txt (from unicode.org)
155 -v, --verbose show verbose output
156 -p, --plane1 show plane1 compose sequences
157 -n, --numeric when used with --gtk, create file with numeric values only
158 -e, --gtk-expanded when used with --gtk, create file that repeats first column; not usable in GTK+
159 --all-sequences when used with --gtk, create file with entries rejected by default
160 Default is to show statistics.
164 opts
, args
= getopt
.getopt(sys
.argv
[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt",
165 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded", "all-sequences"])
170 opt_statistics
= False
171 opt_algorithmic
= False
173 opt_unicodedatatxt
= False
177 opt_gtkexpanded
= False
178 opt_allsequences
= False
181 if o
in ("-h", "--help"):
184 if o
in ("-s", "--statistics"):
185 opt_statistics
= True
186 if o
in ("-a", "--algorithmic"):
187 opt_algorithmic
= True
188 if o
in ("-g", "--gtk"):
190 if o
in ("-u", "--unicodedatatxt"):
191 opt_unicodedatatxt
= True
192 if o
in ("-v", "--verbose"):
194 if o
in ("-p", "--plane1"):
196 if o
in ("-n", "--numeric"):
198 if o
in ("-e", "--gtk-expanded"):
199 opt_gtkexpanded
= True
200 if o
== "--all-sequences":
201 opt_allsequences
= True
203 if not opt_algorithmic
and not opt_gtk
and not opt_unicodedatatxt
:
204 opt_statistics
= True
206 def download_hook(blocks_transferred
, block_size
, file_size
):
207 """ A download hook to provide some feedback when downloading """
208 if blocks_transferred
== 0:
211 print "Downloading", file_size
, "bytes: ",
214 print "Downloading: ",
215 sys
.stdout
.write('#')
219 def download_file(url
):
220 """ Downloads a file provided a URL. Returns the filename. """
221 """ Borks on failure """
222 localfilename
= url
.split('/')[-1]
223 if not isfile(localfilename
) or getsize(localfilename
) <= 0:
225 print "Downloading ", url
, "..."
227 urlretrieve(url
, localfilename
, download_hook
)
228 except IOError, (errno
, strerror
):
229 print "I/O error(%s): %s" % (errno
, strerror
)
232 print "Unexpected error: ", sys
.exc_info()[0]
237 print "Using cached file for ", url
240 def process_gdkkeysymsh():
241 """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """
242 """ Fills up keysymdb with contents """
243 filename_gdkkeysymsh
= download_file(URL_GDKKEYSYMSH
)
245 gdkkeysymsh
= open(filename_gdkkeysymsh
, 'r')
246 except IOError, (errno
, strerror
):
247 print "I/O error(%s): %s" % (errno
, strerror
)
250 print "Unexpected error: ", sys
.exc_info()[0]
253 """ Parse the gdkkeysyms.h file and place contents in keysymdb """
254 linenum_gdkkeysymsh
= 0
256 for line
in gdkkeysymsh
.readlines():
257 linenum_gdkkeysymsh
+= 1
259 if line
== "" or not match('^#define GDK_KEY_', line
):
261 components
= split('\s+', line
)
262 if len(components
) < 3:
263 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
264 % {'linenum': linenum_gdkkeysymsh
, 'filename': filename_gdkkeysymsh
, 'line': line
}
265 print "Was expecting 3 items in the line"
267 if not match('^GDK_KEY_', components
[1]):
268 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
269 % {'linenum': linenum_gdkkeysymsh
, 'filename': filename_gdkkeysymsh
, 'line': line
}
270 print "Was expecting a keysym starting with GDK_KEY_"
272 if match('^0x[0-9a-fA-F]+$', components
[2]):
273 unival
= long(components
[2][2:], 16)
276 keysymdb
[components
[1][8:]] = unival
278 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\
279 % {'linenum': linenum_gdkkeysymsh
, 'filename': filename_gdkkeysymsh
, 'line': line
}
280 print "Was expecting a hexadecimal number at the end of the line"
284 """ Patch up the keysymdb with some of our own stuff """
286 """ This is for a missing keysym from the currently upstream file """
287 #keysymdb['dead_stroke'] = 0x338
289 """ This is for a missing keysym from the currently upstream file """
290 ###keysymdb['dead_belowring'] = 0x323
291 ###keysymdb['dead_belowmacron'] = 0x331
292 ###keysymdb['dead_belowcircumflex'] = 0x32d
293 ###keysymdb['dead_belowtilde'] = 0x330
294 ###keysymdb['dead_belowbreve'] = 0x32e
295 ###keysymdb['dead_belowdiaeresis'] = 0x324
297 """ This is^Wwas preferential treatment for Greek """
298 # keysymdb['dead_tilde'] = 0x342
299 """ This is^was preferential treatment for Greek """
300 #keysymdb['combining_tilde'] = 0x342
302 """ Fixing VoidSymbol """
303 keysymdb
['VoidSymbol'] = 0xFFFF
307 def process_keysymstxt():
308 """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """
309 """ This file keeps a record between keysyms <-> unicode chars """
310 filename_keysymstxt
= download_file(URL_KEYSYMSTXT
)
312 keysymstxt
= open(filename_keysymstxt
, 'r')
313 except IOError, (errno
, strerror
):
314 print "I/O error(%s): %s" % (errno
, strerror
)
317 print "Unexpected error: ", sys
.exc_info()[0]
320 """ Parse the keysyms.txt file and place content in keysymdb """
321 linenum_keysymstxt
= 0
323 for line
in keysymstxt
.readlines():
324 linenum_keysymstxt
+= 1
326 if line
== "" or match('^#', line
):
328 components
= split('\s+', line
)
329 if len(components
) < 5:
330 print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\
331 % {'linenum': linenum_keysymstxt
, 'filename': filename_keysymstxt
, 'line': line
}
332 print "Was expecting 5 items in the line"
334 if match('^U[0-9a-fA-F]+$', components
[1]):
335 unival
= long(components
[1][1:], 16)
338 keysymdb
[components
[4]] = unival
341 """ Patch up the keysymdb with some of our own stuff """
342 """ This is for a missing keysym from the currently upstream file """
343 ###keysymdb['dead_belowring'] = 0x323
344 ###keysymdb['dead_belowmacron'] = 0x331
345 ###keysymdb['dead_belowcircumflex'] = 0x32d
346 ###keysymdb['dead_belowtilde'] = 0x330
347 ###keysymdb['dead_belowbreve'] = 0x32e
348 ###keysymdb['dead_belowdiaeresis'] = 0x324
350 """ This is preferential treatment for Greek """
351 """ => we get more savings if used for Greek """
352 # keysymdb['dead_tilde'] = 0x342
353 """ This is preferential treatment for Greek """
354 # keysymdb['combining_tilde'] = 0x342
356 """ This is for a missing keysym from Markus Kuhn's db """
357 keysymdb
['dead_stroke'] = 0x338
358 """ This is for a missing keysym from Markus Kuhn's db """
359 keysymdb
['Oslash'] = 0x0d8
360 """ This is for a missing keysym from Markus Kuhn's db """
361 keysymdb
['Ssharp'] = 0x1e9e
363 """ This is for a missing (recently added) keysym """
364 keysymdb
['dead_psili'] = 0x313
365 """ This is for a missing (recently added) keysym """
366 keysymdb
['dead_dasia'] = 0x314
368 """ Allows to import Multi_key sequences """
369 keysymdb
['Multi_key'] = 0xff20
371 keysymdb
['zerosubscript'] = 0x2080
372 keysymdb
['onesubscript'] = 0x2081
373 keysymdb
['twosubscript'] = 0x2082
374 keysymdb
['threesubscript'] = 0x2083
375 keysymdb
['foursubscript'] = 0x2084
376 keysymdb
['fivesubscript'] = 0x2085
377 keysymdb
['sixsubscript'] = 0x2086
378 keysymdb
['sevensubscript'] = 0x2087
379 keysymdb
['eightsubscript'] = 0x2088
380 keysymdb
['ninesubscript'] = 0x2089
381 keysymdb
['dead_doublegrave'] = 0x030F
382 keysymdb
['dead_invertedbreve'] = 0x0311
386 def keysymvalue(keysym
, file = "n/a", linenum
= 0):
387 """ Extracts a value from the keysym """
388 """ Find the value of keysym, using the data from keysyms """
389 """ Use file and linenum to when reporting errors """
392 if keysymdatabase
.has_key(keysym
):
393 return keysymdatabase
[keysym
]
394 elif keysym
[0] == 'U' and match('[0-9a-fA-F]+$', keysym
[1:]):
395 return atoi(keysym
[1:], 16)
396 elif keysym
[:2] == '0x' and match('[0-9a-fA-F]+$', keysym
[2:]):
397 return atoi(keysym
[2:], 16)
399 print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym
}
403 def keysymunicodevalue(keysym
, file = "n/a", linenum
= 0):
404 """ Extracts a value from the keysym """
405 """ Find the value of keysym, using the data from keysyms """
406 """ Use file and linenum to when reporting errors """
409 if keysymunicodedatabase
.has_key(keysym
):
410 return keysymunicodedatabase
[keysym
]
411 elif keysym
[0] == 'U' and match('[0-9a-fA-F]+$', keysym
[1:]):
412 return atoi(keysym
[1:], 16)
413 elif keysym
[:2] == '0x' and match('[0-9a-fA-F]+$', keysym
[2:]):
414 return atoi(keysym
[2:], 16)
416 print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym
}
def rename_combining(seq):
    """ Maps 'combining_*' keysym names in a compose sequence to their
    'dead_*' equivalents, returning a new list. """
    # Two names additionally need their underscore dropped to match the
    # spelling used by the keysym databases (dead_doublegrave etc.).
    filtered_sequence = []
    for ks in seq:
        if findall('^combining_', ks):
            ks = sub('^combining_', 'dead_', ks)
        if ks == 'dead_double_grave':
            ks = 'dead_doublegrave'
        if ks == 'dead_inverted_breve':
            ks = 'dead_invertedbreve'
        filtered_sequence.append(ks)
    return filtered_sequence
432 keysymunicodedatabase
= process_keysymstxt()
433 keysymdatabase
= process_gdkkeysymsh()
435 """ Grab and open the compose file from upstream """
436 filename_compose
= download_file(URL_COMPOSE
)
438 composefile
= open(filename_compose
, 'r')
439 except IOError, (errno
, strerror
):
440 print "I/O error(%s): %s" % (errno
, strerror
)
443 print "Unexpected error: ", sys
.exc_info()[0]
446 """ Look if there is a lookaside (supplementary) compose file in the current
447 directory, and if so, open, then merge with upstream Compose file.
449 xorg_compose_sequences_raw
= []
450 for seq
in composefile
.readlines():
451 xorg_compose_sequences_raw
.append(seq
)
454 composefile_lookaside
= open(FILENAME_COMPOSE_SUPPLEMENTARY
, 'r')
455 for seq
in composefile_lookaside
.readlines():
456 xorg_compose_sequences_raw
.append(seq
)
457 except IOError, (errno
, strerror
):
459 print "I/O error(%s): %s" % (errno
, strerror
)
460 print "Did not find lookaside compose file. Continuing..."
462 print "Unexpected error: ", sys
.exc_info()[0]
465 """ Parse the compose file in xorg_compose_sequences"""
466 xorg_compose_sequences
= []
467 xorg_compose_sequences_algorithmic
= []
469 comment_nest_depth
= 0
470 for line
in xorg_compose_sequences_raw
:
473 if match("^XCOMM", line
) or match("^#", line
):
476 line
= sub(r
"\/\*([^\*]*|[\*][^/])\*\/", "", line
)
478 comment_start
= line
.find("/*")
480 if comment_start
>= 0:
481 if comment_nest_depth
== 0:
482 line
= line
[:comment_start
]
486 comment_nest_depth
+= 1
488 comment_end
= line
.find("*/")
491 comment_nest_depth
-= 1
493 if comment_nest_depth
< 0:
494 print "Invalid comment %(linenum_compose)d in %(filename)s: \
495 Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose
, "filename": filename_compose
}
498 if comment_nest_depth
> 0:
501 line
= line
[comment_end
+ 2:]
507 components
= split(':', line
)
508 if len(components
) != 2:
509 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\
510 /value pair found" % { "linenum_compose": linenum_compose
, "filename": filename_compose
}
512 (seq
, val
) = split(':', line
)
515 raw_sequence
= findall('\w+', seq
)
516 values
= split('\s+', val
)
517 unichar_temp
= split('"', values
[0])
518 unichar
= unichar_temp
[1]
521 codepointstr
= values
[1]
523 # No codepoints that are >1 characters yet.
525 if raw_sequence
[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence
[0][1:]):
526 raw_sequence
[0] = '0x' + raw_sequence
[0][1:]
527 if match('^U[0-9a-fA-F]+$', codepointstr
):
528 codepoint
= long(codepointstr
[1:], 16)
529 elif keysymunicodedatabase
.has_key(codepointstr
):
530 #if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]:
531 #print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]},
532 #print raw_sequence, codepointstr
533 codepoint
= keysymunicodedatabase
[codepointstr
]
536 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\
537 %(line)s" % { "linenum_compose": linenum_compose
, "filename": filename_compose
, "line": line
}
539 sequence
= rename_combining(raw_sequence
)
542 if keysymvalue(i
) > 0xFFFF:
547 if keysymvalue(i
) < 0:
552 if "U0342" in sequence
or \
553 "U0313" in sequence
or \
554 "U0314" in sequence
or \
555 "0x0313" in sequence
or \
556 "0x0342" in sequence
or \
557 "0x0314" in sequence
:
559 if "dead_belowring" in sequence
or\
560 "dead_currency" in sequence
or\
561 "dead_belowcomma" in sequence
or\
562 "dead_belowmacron" in sequence
or\
563 "dead_belowtilde" in sequence
or\
564 "dead_belowbreve" in sequence
or\
565 "dead_belowdiaeresis" in sequence
or\
566 "dead_belowcircumflex" in sequence
:
568 #for i in range(len(sequence)):
569 # if sequence[i] == "0x0342":
570 # sequence[i] = "dead_tilde"
571 if "Multi_key" not in sequence
:
572 """ Ignore for now >0xFFFF keysyms """
573 if codepoint
< 0xFFFF:
574 original_sequence
= copy(sequence
)
575 stats_sequence
= copy(sequence
)
576 base
= sequence
.pop()
577 basechar
= keysymvalue(base
, filename_compose
, linenum_compose
)
579 if basechar
< 0xFFFF:
582 not_normalised
= True
583 skipping_this
= False
584 for i
in range(0, len(sequence
)):
585 """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically
586 because of lack of dead_perispomeni (i.e. conflict)
589 """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
592 if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
595 if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff):
598 if sequence[-1] == "dead_psili":
599 sequence[i] = "dead_horn"
600 if sequence[-1] == "dead_dasia":
601 sequence[-1] = "dead_ogonek"
603 unisequence
.append(unichr(keysymunicodevalue(sequence
.pop(), filename_compose
, linenum_compose
)))
607 for perm
in all_permutations(unisequence
):
608 # print counter, original_sequence, unichr(basechar) + "".join(perm)
609 # print counter, map(unichr, perm)
610 normalized
= normalize('NFC', unichr(basechar
) + "".join(perm
))
611 if len(normalized
) == 1:
612 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \
613 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint },
614 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter }
615 stats_sequence_data
= map(keysymunicodevalue
, stats_sequence
)
616 stats_sequence_data
.append(normalized
)
617 xorg_compose_sequences_algorithmic
.append(stats_sequence_data
)
618 not_normalised
= False
621 if not_normalised
or opt_allsequences
:
622 original_sequence
.append(codepoint
)
623 xorg_compose_sequences
.append(original_sequence
)
624 """ print xorg_compose_sequences[-1] """
627 print "Error in base char !?!"
630 print "OVER", sequence
633 sequence
.append(codepoint
)
634 xorg_compose_sequences
.append(sequence
)
635 """ print xorg_compose_sequences[-1] """
637 def sequence_cmp(x
, y
):
638 if keysymvalue(x
[0]) > keysymvalue(y
[0]):
640 elif keysymvalue(x
[0]) < keysymvalue(y
[0]):
642 elif len(x
) > len(y
):
644 elif len(x
) < len(y
):
646 elif keysymvalue(x
[1]) > keysymvalue(y
[1]):
648 elif keysymvalue(x
[1]) < keysymvalue(y
[1]):
652 elif keysymvalue(x
[2]) > keysymvalue(y
[2]):
654 elif keysymvalue(x
[2]) < keysymvalue(y
[2]):
658 elif keysymvalue(x
[3]) > keysymvalue(y
[3]):
660 elif keysymvalue(x
[3]) < keysymvalue(y
[3]):
664 elif keysymvalue(x
[4]) > keysymvalue(y
[4]):
666 elif keysymvalue(x
[4]) < keysymvalue(y
[4]):
671 def sequence_unicode_cmp(x
, y
):
672 if keysymunicodevalue(x
[0]) > keysymunicodevalue(y
[0]):
674 elif keysymunicodevalue(x
[0]) < keysymunicodevalue(y
[0]):
676 elif len(x
) > len(y
):
678 elif len(x
) < len(y
):
680 elif keysymunicodevalue(x
[1]) > keysymunicodevalue(y
[1]):
682 elif keysymunicodevalue(x
[1]) < keysymunicodevalue(y
[1]):
686 elif keysymunicodevalue(x
[2]) > keysymunicodevalue(y
[2]):
688 elif keysymunicodevalue(x
[2]) < keysymunicodevalue(y
[2]):
692 elif keysymunicodevalue(x
[3]) > keysymunicodevalue(y
[3]):
694 elif keysymunicodevalue(x
[3]) < keysymunicodevalue(y
[3]):
698 elif keysymunicodevalue(x
[4]) > keysymunicodevalue(y
[4]):
700 elif keysymunicodevalue(x
[4]) < keysymunicodevalue(y
[4]):
705 def sequence_algorithmic_cmp(x
, y
):
708 elif len(x
) > len(y
):
711 for i
in range(len(x
)):
719 xorg_compose_sequences
.sort(sequence_cmp
)
721 xorg_compose_sequences_uniqued
= []
724 for next_item
in xorg_compose_sequences
:
728 if sequence_unicode_cmp(item
, next_item
) != 0:
729 xorg_compose_sequences_uniqued
.append(item
)
732 xorg_compose_sequences
= copy(xorg_compose_sequences_uniqued
)
735 for item
in xorg_compose_sequences
:
736 if findall('Multi_key', "".join(item
[:-1])) != []:
737 counter_multikey
+= 1
739 xorg_compose_sequences_algorithmic
.sort(sequence_algorithmic_cmp
)
740 xorg_compose_sequences_algorithmic_uniqued
= uniq(xorg_compose_sequences_algorithmic
)
743 num_first_keysyms
= 0
746 num_algorithmic_greek
= 0
747 for sequence
in xorg_compose_sequences
:
748 if keysymvalue(firstitem
) != keysymvalue(sequence
[0]):
749 firstitem
= sequence
[0]
750 num_first_keysyms
+= 1
751 zeroes
+= 6 - len(sequence
) + 1
754 for sequence
in xorg_compose_sequences_algorithmic_uniqued
:
755 ch
= ord(sequence
[-1:][0])
756 if ch
>= 0x370 and ch
<= 0x3ff or ch
>= 0x1f00 and ch
<= 0x1fff:
757 num_algorithmic_greek
+= 1
761 for sequence
in xorg_compose_sequences_algorithmic_uniqued
:
762 letter
= "".join(sequence
[-1:])
763 print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter
)), 'uni': letter
.encode('utf-8'), 'base': sequence
[-2] },
764 for elem
in sequence
[:-2]:
765 print "<0x%(keysym)04X>," % { 'keysym': elem
},
766 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """
767 print "], recomposed as", letter
.encode('utf-8'), "verified"
769 def num_of_keysyms(seq
):
def convert_UnotationToHex(arg):
    """ Rewrites a U-notation codepoint string ('U0301') as a hex literal
    string ('0x0301'); anything else is returned unchanged. """
    # Only exactly-four-uppercase-hex-digit forms are converted; other
    # strings (keysym names) and non-strings (integer codepoints) pass
    # through untouched, since this is applied with map() over mixed
    # sequences.
    if isinstance(arg, str):
        if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg):
            return sub('^U', '0x', arg)
    return arg
def addprefix_GDK(arg):
    """ Formats a keysym for the expanded GTK+ table: hex literals are
    emitted as-is, symbolic keysyms get the GDK_KEY_ prefix; a trailing
    comma-space is appended either way. """
    # Guard clause instead of if/else: a hex value needs no prefix.
    if match('^0x', arg):
        return '%(arg)s, ' % { 'arg': arg }
    return 'GDK_KEY_%(arg)s, ' % { 'arg': arg }
789 ct_sequence_width
= 2
790 start_offset
= num_first_keysyms
* (WIDTHOFCOMPOSETABLE
+1)
794 sequence_iterator
= iter(xorg_compose_sequences
)
795 sequence
= sequence_iterator
.next()
797 first_keysym
= sequence
[0] # Set the first keysym
798 compose_table
.append([first_keysym
, 0, 0, 0, 0, 0])
799 while sequence
[0] == first_keysym
:
800 compose_table
[counter
][num_of_keysyms(sequence
)-1] += 1
802 sequence
= sequence_iterator
.next()
803 except StopIteration:
810 ct_index
= start_offset
811 for line_num
in range(len(compose_table
)):
812 for i
in range(WIDTHOFCOMPOSETABLE
):
813 occurences
= compose_table
[line_num
][i
+1]
814 compose_table
[line_num
][i
+1] = ct_index
815 ct_index
+= occurences
* (i
+2)
817 for sequence
in xorg_compose_sequences
:
818 ct_second_part
.append(map(convert_UnotationToHex
, sequence
))
820 print headerfile_start
821 for i
in compose_table
:
823 print "0x%(ks)04X," % { "ks": keysymvalue(i
[0]) },
824 print '%(str)s' % { 'str': "".join(map(lambda x
: str(x
) + ", ", i
[1:])) }
825 elif not match('^0x', i
[0]):
826 print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x
: str(x
) + ", ", i
)) }
828 print '%(str)s' % { 'str': "".join(map(lambda x
: str(x
) + ", ", i
)) }
829 for i
in ct_second_part
:
831 for ks
in i
[1:][:-1]:
832 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks
) },
833 print '0x%(cp)04X, ' % { 'cp':i
[-1] }
836 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) },
837 print '0x%(cp)04X, ' % { 'cp':i[-1] }
839 elif opt_gtkexpanded
:
840 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK
, i
[:-1])), 'cp':i
[-1] }
842 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK
, i
[:-1][1:])), 'cp':i
[-1] }
845 def redecompose(codepoint
):
846 (name
, decomposition
, combiningclass
) = unicodedatabase
[codepoint
]
847 if decomposition
[0] == '' or decomposition
[0] == '0':
849 if match('<\w+>', decomposition
[0]):
850 numdecomposition
= map(stringtohex
, decomposition
[1:])
851 return map(redecompose
, numdecomposition
)
852 numdecomposition
= map(stringtohex
, decomposition
)
853 return map(redecompose
, numdecomposition
)
855 def process_unicodedata_file(verbose
= False):
856 """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """
857 filename_unicodedatatxt
= download_file(URL_UNICODEDATATXT
)
859 unicodedatatxt
= open(filename_unicodedatatxt
, 'r')
860 except IOError, (errno
, strerror
):
861 print "I/O error(%s): %s" % (errno
, strerror
)
864 print "Unexpected error: ", sys
.exc_info()[0]
866 for line
in unicodedatatxt
.readlines():
867 if line
[0] == "" or line
[0] == '#':
870 uniproperties
= split(';', line
)
871 codepoint
= stringtohex(uniproperties
[0])
872 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """
873 if codepoint
> 0xFFFF or (codepoint
>= 0x4E00 and codepoint
<= 0x9FFF) or (codepoint
>= 0xF900 and codepoint
<= 0xFAFF):
875 name
= uniproperties
[1]
876 category
= uniproperties
[2]
877 combiningclass
= uniproperties
[3]
878 decomposition
= uniproperties
[5]
879 unicodedatabase
[codepoint
] = [name
, split('\s+', decomposition
), combiningclass
]
881 counter_combinations
= 0
882 counter_combinations_greek
= 0
884 counter_entries_greek
= 0
886 for item
in unicodedatabase
.keys():
887 (name
, decomposition
, combiningclass
) = unicodedatabase
[item
]
888 if decomposition
[0] == '':
890 print name
, "is empty"
891 elif match('<\w+>', decomposition
[0]):
893 print name
, "has weird", decomposition
[0]
895 sequence
= map(stringtohex
, decomposition
)
896 chrsequence
= map(unichr, sequence
)
897 normalized
= normalize('NFC', "".join(chrsequence
))
899 """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """
900 decomposedsequence
= []
901 for subseq
in map(redecompose
, sequence
):
902 for seqitem
in subseq
:
903 if isinstance(seqitem
, list):
905 if isinstance(i
, list):
907 decomposedsequence
.append(j
)
909 decomposedsequence
.append(i
)
911 decomposedsequence
.append(seqitem
)
912 recomposedchar
= normalize('NFC', "".join(map(unichr, decomposedsequence
)))
913 if len(recomposedchar
) == 1 and len(decomposedsequence
) > 1:
915 counter_combinations
+= factorial(len(decomposedsequence
)-1)
917 if ch
>= 0x370 and ch
<= 0x3ff or ch
>= 0x1f00 and ch
<= 0x1fff:
918 counter_entries_greek
+= 1
919 counter_combinations_greek
+= factorial(len(decomposedsequence
)-1)
921 print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item
, 'uni':unichr(item
) },
923 for elem
in decomposedsequence
:
924 print '<0x%(hex)04X>,' % { 'hex': elem
},
925 print "], recomposed as", recomposedchar
,
926 if unichr(item
) == recomposedchar
:
930 print "Unicode statistics from UnicodeData.txt"
931 print "Number of entries that can be algorithmically produced :", counter_entries
932 print " of which are for Greek :", counter_entries_greek
933 print "Number of compose sequence combinations requiring :", counter_combinations
934 print " of which are for Greek :", counter_combinations_greek
935 print "Note: We do not include partial compositions, "
936 print "thus the slight discrepancy in the figures"
939 if opt_unicodedatatxt
:
940 process_unicodedata_file(True)
944 print "Total number of compose sequences (from file) :", len(xorg_compose_sequences
) + len(xorg_compose_sequences_algorithmic
)
945 print " of which can be expressed algorithmically :", len(xorg_compose_sequences_algorithmic
)
946 print " of which cannot be expressed algorithmically :", len(xorg_compose_sequences
)
947 print " of which have Multi_key :", counter_multikey
949 print "Algorithmic (stats for Xorg Compose file)"
950 print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic
)
951 print "Number of sequences off due to algo (uniq(sort(array))) :", len(xorg_compose_sequences_algorithmic_uniqued
)
952 print " of which are for Greek :", num_algorithmic_greek
954 process_unicodedata_file()
955 print "Not algorithmic (stats from Xorg Compose file)"
956 print "Number of sequences :", len(xorg_compose_sequences
)
957 print "Flat array looks like :", len(xorg_compose_sequences
), "rows of 6 integers (2 bytes per int, or 12 bytes per row)"
958 print "Flat array would have taken up (in bytes) :", num_entries
* 2 * 6, "bytes from the GTK+ library"
959 print "Number of items in flat array :", len(xorg_compose_sequences
) * 6
960 print " of which are zeroes :", zeroes
, "or ", (100 * zeroes
) / (len(xorg_compose_sequences
) * 6), " per cent"
961 print "Number of different first items :", num_first_keysyms
962 print "Number of max bytes (if using flat array) :", num_entries
* 2 * 6
963 print "Number of savings :", zeroes
* 2 - num_first_keysyms
* 2 * 5
965 print "Memory needs if both algorithmic+optimised table in latest Xorg compose file"
966 print " :", num_entries
* 2 * 6 - zeroes
* 2 + num_first_keysyms
* 2 * 5
968 print "Existing (old) implementation in GTK+"
969 print "Number of sequences in old gtkimcontextsimple.c :", 691
970 print "The existing (old) implementation in GTK+ takes up :", 691 * 2 * 12, "bytes"