xapian-core/unicode/uniParse.tcl

   1 # uniParse.tcl --
   2 #
   3 #       This program parses the UnicodeData file and generates the
   4 #       corresponding source file with compressed character
   5 #       data tables.  The input to this program should be the latest
   6 #       UnicodeData file from:
   7 #           ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
   8 #
   9 # Copyright (c) 1998-1999 by Scriptics Corporation.
  10 # All rights reserved.
  11
  12
  13 namespace eval uni {
  14     set shift 8;                # number of bits of data within a page
  15                                 # This value can be adjusted to find the
  16                                 # best split to minimize table size
  17
  18     variable pMap;              # map from page to page index, each entry is
  19                                 # an index into the pages table, indexed by
  20                                 # page number
  21     variable pages;             # map from page index to page info, each
  22                                 # entry is a list of indices into the groups
  23                                 # table, the list is indexed by the offset
  24     variable groups;            # list of character info values, indexed by
  25                                 # group number, initialized with the
  26                                 # unassigned character group
  27
  28     variable categories {
  29         Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
  30         Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
  31     };                          # Ordered list of character categories, must
  32                                 # match the enumeration in the header file.
  33 }
  34
  35 proc uni::getValue {items index} {
  36     variable categories
  37
  38     # Extract character info
  39
  40     set category [lindex $items 2]
  41     if {[scan [lindex $items 12] %x toupper] == 1} {
  42         set toupper [expr {$index - $toupper}]
  43     } else {
  44         set toupper 0
  45     }
  46     if {[scan [lindex $items 13] %x tolower] == 1} {
  47         set tolower [expr {$tolower - $index}]
  48     } else {
  49         set tolower 0
  50     }
  51     if {[scan [lindex $items 14] %x totitle] == 1} {
  52         set totitle [expr {$index - $totitle}]
  53     } elseif {$tolower} {
  54         set totitle 0
  55     } else {
  56         set totitle $toupper
  57     }
  58
  59     set categoryIndex [lsearch -exact $categories $category]
  60     if {$categoryIndex < 0} {
  61         puts "Unexpected character category: $index($category)"
  62         set categoryIndex 0
  63     }
  64
  65     return [list $categoryIndex $toupper $tolower $totitle]
  66 }
  67
  68 proc uni::getGroup {value} {
  69     variable groups
  70
  71     set gIndex [lsearch -exact $groups $value]
  72     if {$gIndex == -1} {
  73         set gIndex [llength $groups]
  74         lappend groups $value
  75     }
  76     return $gIndex
  77 }
  78
  79 proc uni::addPage {info} {
  80     variable pMap
  81     variable pages
  82
  83     set pIndex [lsearch -exact $pages $info]
  84     if {$pIndex == -1} {
  85         set pIndex [llength $pages]
  86         lappend pages $info
  87     }
  88     lappend pMap $pIndex
  89     return
  90 }
  91
  92 proc uni::buildTables {data} {
  93     variable shift
  94
  95     variable pMap {}
  96     variable pages {}
  97     variable groups {{0 0 0 0}}
  98     variable next 0
  99     set info {}                 ;# temporary page info
 100
 101     set mask [expr {(1 << $shift) - 1}]
 102
 103     foreach line [split $data \n] {
 104         if {$line eq ""} {
 105             if {!($next & $mask)} {
 106                 # next character is already on page boundary
 107                 continue
 108             }
 109             # fill remaining page
 110             set line [format %X [expr {($next-1)|$mask}]]
 111             append line ";;Cn;0;ON;;;;;N;;;;;\n"
 112         }
 113
 114         set items [split $line \;]
 115
 116         scan [lindex $items 0] %x index
 117         if {$index > 0x2ffff} then {
 118             # Ignore non-BMP characters, as long as Tcl doesn't support them
 119             #continue
 120         }
 121         set index [format %d $index]
 122
 123         set gIndex [getGroup [getValue $items $index]]
 124
 125         # Since the input table omits unassigned characters, these will
 126         # show up as gaps in the index sequence.  There are a few special cases
 127         # where the gaps correspond to a uniform block of assigned characters.
 128         # These are indicated as such in the character name.
 129
 130         # Enter all unassigned characters up to the current character.
 131         if {($index > $next) \
 132                 && ![regexp "Last>$" [lindex $items 1]]} {
 133             for {} {$next < $index} {incr next} {
 134                 lappend info 0
 135                 if {($next & $mask) == $mask} {
 136                     addPage $info
 137                     set info {}
 138                 }
 139             }
 140         }
 141
 142         # Enter all assigned characters up to the current character
 143         for {set i $next} {$i <= $index} {incr i} {
 144             # Add the group index to the info for the current page
 145             lappend info $gIndex
 146
 147             # If this is the last entry in the page, add the page
 148             if {($i & $mask) == $mask} {
 149                 addPage $info
 150                 set info {}
 151             }
 152         }
 153         set next [expr {$index + 1}]
 154     }
 155     return
 156 }
 157
 158 proc uni::main {} {
 159     global argc argv0 argv
 160     variable pMap
 161     variable pages
 162     variable groups
 163     variable shift
 164     variable next
 165     variable max_delta
 166
 167     if {$argc != 3} {
 168         puts stderr "\nusage: $argv0 <datafile> <version> <outfile>\n"
 169         exit 1
 170     }
 171
 172     set f [open [lindex $argv 0] r]
 173     set data [read $f]
 174     close $f
 175
 176     buildTables $data
 177     puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
 178     set size [expr {[llength $pMap] + ([llength $pages]<<$shift)}]
 179     puts "shift = $shift, space = $size"
 180
 181     set f [open [lindex $argv 2] w]
 182     fconfigure $f -translation lf
 183     puts $f "/*
 184  * [lindex $argv 2] --
 185  *
 186  *      Declarations of Unicode [lindex $argv 1] character information tables.  This
 187  *      file is automatically generated by a modified version of the
 188  *      tools/uniParse.tcl script from the Tcl sources.
 189  *
 190  *      Do not modify this file by hand!
 191  *
 192  * Copyright (c) 1998 by Scriptics Corporation.
 193  * All rights reserved.
 194  */
 195
 196 #include <config.h>
 197
 198 #include <xapian/unicode.h>
 199
 200 /*
 201  * A 16-bit Unicode character is split into two parts in order to index
 202  * into the following tables.  The lower OFFSET_BITS comprise an offset
 203  * into a page of characters.  The upper bits comprise the page number.
 204  */
 205
 206 #define OFFSET_BITS $shift
 207
 208 /*
 209  * The pageMap is indexed by page number and returns an alternate page number
 210  * that identifies a unique page of characters.  Many Unicode characters map
 211  * to the same alternate page number.
 212  */
 213
 214 static const unsigned char pageMap\[\] = {"
 215     set line "    "
 216     set last [expr {[llength $pMap] - 1}]
 217     for {set i 0} {$i <= $last} {incr i} {
 218 #       if {$i == [expr {0x10000 >> $shift}]} {
 219 #           set line [string trimright $line " \t,"]
 220 #           puts $f $line
 221 #           set lastpage [expr {[lindex $line end] >> $shift}]
 222 #           puts stdout "lastpage: $lastpage"
 223 #           puts $f "#if TCL_UTF_MAX > 3"
 224 #           set line "    ,"
 225 #       }
 226         append line [lindex $pMap $i]
 227         if {$i != $last} {
 228             append line ", "
 229         }
 230         if {[string length $line] > 70} {
 231             puts $f [string trimright $line]
 232             set line "    "
 233         }
 234     }
 235     puts $f $line
 236 #    puts $f "#endif /* TCL_UTF_MAX > 3 */"
 237     puts $f "};
 238
 239 /*
 240  * The groupMap is indexed by combining the alternate page number with
 241  * the page offset and returns a group number that identifies a unique
 242  * set of character attributes.
 243  */
 244
 245 static const unsigned char groupMap\[\] = {"
 246     set line "    "
 247     set lasti [expr {[llength $pages] - 1}]
 248     for {set i 0} {$i <= $lasti} {incr i} {
 249         set page [lindex $pages $i]
 250         set lastj [expr {[llength $page] - 1}]
 251 #       if {$i == ($lastpage + 1)} {
 252 #           puts $f [string trimright $line " \t,"]
 253 #           puts $f "#if TCL_UTF_MAX > 3"
 254 #           set line "    ,"
 255 #       }
 256         for {set j 0} {$j <= $lastj} {incr j} {
 257             append line [lindex $page $j]
 258             if {$j != $lastj || $i != $lasti} {
 259                 append line ", "
 260             }
 261             if {[string length $line] > 70} {
 262                 puts $f [string trimright $line]
 263                 set line "    "
 264             }
 265         }
 266     }
 267     puts $f $line
 268 #    puts $f "#endif /* TCL_UTF_MAX > 3 */"
 269     puts $f "};
 270
 271 /*
 272  * Each group represents a unique set of character attributes.  The attributes
 273  * are encoded into a 32-bit value as follows:
 274  *
 275  * Bits 0-4     Character category: see the constants listed below.
 276  *
 277  * Bits 5-7     Case delta type: 000 = identity
 278  *                               010 = add delta for lower
 279  *                               011 = add delta for lower, add 1 for title
 280  *                               100 = subtract delta for title/upper
 281  *                               101 = sub delta for upper, sub 1 for title
 282  *                               110 = sub delta for upper, add delta for lower
 283  *
 284  * Bits 8-31    Case delta: delta for case conversions.  This should be the
 285  *                          highest field so we can easily sign extend.
 286  */
 287
 288 static const int groups\[\] = {"
 289     set line "    "
 290     set last [expr {[llength $groups] - 1}]
 291     set max_delta -1
 292     for {set i 0} {$i <= $last} {incr i} {
 293         foreach {type toupper tolower totitle} [lindex $groups $i] {}
 294
 295         # Compute the case conversion type and delta
 296
 297         if {$totitle} {
 298             if {$totitle == $toupper} {
 299                 # subtract delta for title or upper
 300                 set case 4
 301                 set delta $toupper
 302                 if {$tolower} {
 303                     error "New case conversion type needed: $toupper $tolower $totitle"
 304                 }
 305             } elseif {$toupper} {
 306                 # subtract delta for upper, subtract 1 for title
 307                 set case 5
 308                 set delta $toupper
 309                 if {($totitle != 1) || $tolower} {
 310                     error "New case conversion type needed: $toupper $tolower $totitle"
 311                 }
 312             } else {
 313                 # add delta for lower, add 1 for title
 314                 set case 3
 315                 set delta $tolower
 316                 if {$totitle != -1} {
 317                     error "New case conversion type needed: $toupper $tolower $totitle"
 318                 }
 319             }
 320         } elseif {$toupper} {
 321             # subtract delta for upper, add delta for lower
 322             set case 6
 323             set delta $toupper
 324             if {$tolower != $toupper} {
 325                 error "New case conversion type needed: $toupper $tolower $totitle"
 326             }
 327         } elseif {$tolower} {
 328             # add delta for lower
 329             set case 2
 330             set delta $tolower
 331         } else {
 332             # noop
 333             set case 0
 334             set delta 0
 335         }
 336
 337         if {$delta >= (1 << 23) || $delta < -(1<<23)} {
 338             error "delta $delta out of range"
 339         }
 340         if {$delta > $max_delta} {
 341             set max_delta $delta
 342         } elseif {-$delta > $max_delta} {
 343             set max_delta [expr {-$delta}]
 344         }
 345         append line [expr {($delta << 8) | ($case << 5) | $type}]
 346         if {$i != $last} {
 347             append line ", "
 348         }
 349         if {[string length $line] > 65} {
 350             puts $f [string trimright $line]
 351             set line "    "
 352         }
 353     }
 354     puts "max_delta = $max_delta"
 355     puts $f $line
 356     puts -nonewline $f "};
 357
 358 #if 0
 359
 360 #if TCL_UTF_MAX > 3
 361 #   define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next])
 362 #else
 363 #   define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0)
 364 #endif
 365
 366 /*
 367  * The following constants are used to determine the category of a
 368  * Unicode character.
 369  */
 370
 371 enum {
 372     UNASSIGNED,
 373     UPPERCASE_LETTER,
 374     LOWERCASE_LETTER,
 375     TITLECASE_LETTER,
 376     MODIFIER_LETTER,
 377     OTHER_LETTER,
 378     NON_SPACING_MARK,
 379     ENCLOSING_MARK,
 380     COMBINING_SPACING_MARK,
 381     DECIMAL_DIGIT_NUMBER,
 382     LETTER_NUMBER,
 383     OTHER_NUMBER,
 384     SPACE_SEPARATOR,
 385     LINE_SEPARATOR,
 386     PARAGRAPH_SEPARATOR,
 387     CONTROL,
 388     FORMAT,
 389     PRIVATE_USE,
 390     SURROGATE,
 391     CONNECTOR_PUNCTUATION,
 392     DASH_PUNCTUATION,
 393     OPEN_PUNCTUATION,
 394     CLOSE_PUNCTUATION,
 395     INITIAL_QUOTE_PUNCTUATION,
 396     FINAL_QUOTE_PUNCTUATION,
 397     OTHER_PUNCTUATION,
 398     MATH_SYMBOL,
 399     CURRENCY_SYMBOL,
 400     MODIFIER_SYMBOL,
 401     OTHER_SYMBOL
 402 };
 403
 404 /*
 405  * The following macros extract the fields of the character info.  The
 406  * GetDelta() macro is complicated because we can't rely on the C compiler
 407  * to do sign extension on right shifts.
 408  */
 409
 410 #define GetCaseType(info) (((info) & 0xe0) >> 5)
 411 #define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f)
 412 #define GetDelta(info) ((info) >> 8)
 413 #endif
 414
 415 /** Extract information about a Unicode character.
 416  *
 417  *  This function extracts the information about a character from the
 418  *  Unicode character tables.
 419  */
 420 int
 421 Xapian::Unicode::Internal::get_character_info(unsigned ch) XAPIAN_NOEXCEPT
 422 {
 423     if (rare(ch >= 0x110000)) {
 424         // Categorise non-Unicode values as UNASSIGNED with no case variants.
 425         return Xapian::Unicode::UNASSIGNED;
 426     }
 427     auto group = (pageMap\[int(ch) >> OFFSET_BITS\] << OFFSET_BITS) |
 428                  ((ch) & ((1 << OFFSET_BITS) - 1));
 429     return groups\[groupMap\[group\]\];
 430 }
 431 "
 432
 433     close $f
 434 }
 435
 436 uni::main
 437
 438 return