3 # This program parses the UnicodeData file and generates the
4 # corresponding source file with compressed character
5 # data tables. The input to this program should be the latest
6 # UnicodeData file from:
7 # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
9 # Copyright (c) 1998-1999 by Scriptics Corporation.
10 # All rights reserved.
14 set shift
8; # number of bits of data within a page
15 # This value can be adjusted to find the
16 # best split to minimize table size
18 variable pMap
; # map from page to page index, each entry is
19 # an index into the pages table, indexed by
21 variable pages
; # map from page index to page info, each
22 # entry is a list of indices into the groups
23 # table, the list is indexed by the offset
24 variable groups
; # list of character info values, indexed by
25 # group number, initialized with the
26 # unassigned character group
29 Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
30 Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
31 }; # Ordered list of character categories, must
32 # match the enumeration in the header file.
35 proc uni
::getValue {items index
} {
38 # Extract character info
40 set category
[lindex $items 2]
41 if {[scan [lindex $items 12] %x toupper
] == 1} {
42 set toupper
[expr {$index - $toupper}]
46 if {[scan [lindex $items 13] %x tolower
] == 1} {
47 set tolower
[expr {$tolower - $index}]
51 if {[scan [lindex $items 14] %x totitle
] == 1} {
52 set totitle
[expr {$index - $totitle}]
59 set categoryIndex
[lsearch -exact $categories $category]
60 if {$categoryIndex < 0} {
61 puts "Unexpected character category: $index($category)"
65 return [list $categoryIndex $toupper $tolower $totitle]
68 proc uni
::getGroup {value
} {
71 set gIndex
[lsearch -exact $groups $value]
73 set gIndex
[llength $groups]
79 proc uni
::addPage {info} {
83 set pIndex
[lsearch -exact $pages $info]
85 set pIndex
[llength $pages]
92 proc uni
::buildTables {data
} {
97 variable groups
{{0 0 0 0}}
99 set info {} ;# temporary page info
101 set mask
[expr {(1 << $shift) - 1}]
103 foreach line
[split $data \n] {
105 if {!($next & $mask)} {
106 # next character is already on page boundary
109 # fill remaining page
110 set line
[format %X
[expr {($next-1)|
$mask}]]
111 append line
";;Cn;0;ON;;;;;N;;;;;\n"
114 set items
[split $line \;]
116 scan [lindex $items 0] %x index
117 if {$index > 0x2ffff} then
{
118 # Ignore characters beyond plane 2 (code points above U+2FFFF); planes 1 and 2 are processed
121 set index
[format %d
$index]
123 set gIndex
[getGroup
[getValue
$items $index]]
125 # Since the input table omits unassigned characters, these will
126 # show up as gaps in the index sequence. There are a few special cases
127 # where the gaps correspond to a uniform block of assigned characters.
128 # These are indicated as such in the character name.
130 # Enter all unassigned characters up to the current character.
131 if {($index > $next) \
132 && ![regexp "Last>$" [lindex $items 1]]} {
133 for {} {$next < $index} {incr next
} {
135 if {($next & $mask) == $mask} {
142 # Enter all assigned characters up to the current character
143 for {set i
$next} {$i <= $index} {incr i
} {
144 # Add the group index to the info for the current page
147 # If this is the last entry in the page, add the page
148 if {($i & $mask) == $mask} {
153 set next
[expr {$index + 1}]
159 global argc argv0 argv
168 puts stderr
"\nusage: $argv0 <datafile> <version> <outfile>\n"
172 set f
[open [lindex $argv 0] r
]
177 puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
178 set size
[expr {[llength $pMap] + ([llength $pages]<<$shift)}]
179 puts "shift = $shift, space = $size"
181 set f
[open [lindex $argv 2] w
]
182 fconfigure $f -translation lf
184 * [lindex $argv 2] --
186 * Declarations of Unicode [lindex $argv 1] character information tables. This
187 * file is automatically generated by a modified version of the
188 * tools/uniParse.tcl script from the Tcl sources.
190 * Do not modify this file by hand!
192 * Copyright (c) 1998 by Scriptics Corporation.
193 * All rights reserved.
198 #include <xapian/unicode.h>
201 * A 16-bit Unicode character is split into two parts in order to index
202 * into the following tables. The lower OFFSET_BITS comprise an offset
203 * into a page of characters. The upper bits comprise the page number.
206 #define OFFSET_BITS $shift
209 * The pageMap is indexed by page number and returns an alternate page number
210 * that identifies a unique page of characters. Many Unicode characters map
211 * to the same alternate page number.
214 static const unsigned char pageMap\[\] = {"
216 set last
[expr {[llength $pMap] - 1}]
217 for {set i
0} {$i <= $last} {incr i
} {
218 # if {$i == [expr {0x10000 >> $shift}]} {
219 # set line [string trimright $line " \t,"]
221 # set lastpage [expr {[lindex $line end] >> $shift}]
222 # puts stdout "lastpage: $lastpage"
223 # puts $f "#if TCL_UTF_MAX > 3"
226 append line
[lindex $pMap $i]
230 if {[string length
$line] > 70} {
231 puts $f [string trimright
$line]
236 # puts $f "#endif /* TCL_UTF_MAX > 3 */"
240 * The groupMap is indexed by combining the alternate page number with
241 * the page offset and returns a group number that identifies a unique
242 * set of character attributes.
245 static const unsigned char groupMap\[\] = {"
247 set lasti
[expr {[llength $pages] - 1}]
248 for {set i
0} {$i <= $lasti} {incr i
} {
249 set page
[lindex $pages $i]
250 set lastj
[expr {[llength $page] - 1}]
251 # if {$i == ($lastpage + 1)} {
252 # puts $f [string trimright $line " \t,"]
253 # puts $f "#if TCL_UTF_MAX > 3"
256 for {set j
0} {$j <= $lastj} {incr j
} {
257 append line
[lindex $page $j]
258 if {$j != $lastj ||
$i != $lasti} {
261 if {[string length
$line] > 70} {
262 puts $f [string trimright
$line]
268 # puts $f "#endif /* TCL_UTF_MAX > 3 */"
272 * Each group represents a unique set of character attributes. The attributes
273 * are encoded into a 32-bit value as follows:
275 * Bits 0-4 Character category: see the constants listed below.
277 * Bits 5-7 Case delta type: 000 = identity
278 * 010 = add delta for lower
279 * 011 = add delta for lower, add 1 for title
280 * 100 = subtract delta for title/upper
281 * 101 = sub delta for upper, sub 1 for title
282 * 110 = sub delta for upper, add delta for lower
284 * Bits 8-31 Case delta: delta for case conversions. This should be the
285 * highest field so we can easily sign extend.
288 static const int groups\[\] = {"
290 set last
[expr {[llength $groups] - 1}]
292 for {set i
0} {$i <= $last} {incr i
} {
293 foreach {type toupper tolower totitle
} [lindex $groups $i] {}
295 # Compute the case conversion type and delta
298 if {$totitle == $toupper} {
299 # subtract delta for title or upper
303 error "New case conversion type needed: $toupper $tolower $totitle"
305 } elseif
{$toupper} {
306 # subtract delta for upper, subtract 1 for title
309 if {($totitle != 1) ||
$tolower} {
310 error "New case conversion type needed: $toupper $tolower $totitle"
313 # add delta for lower, add 1 for title
316 if {$totitle != -1} {
317 error "New case conversion type needed: $toupper $tolower $totitle"
320 } elseif
{$toupper} {
321 # subtract delta for upper, add delta for lower
324 if {$tolower != $toupper} {
325 error "New case conversion type needed: $toupper $tolower $totitle"
327 } elseif
{$tolower} {
328 # add delta for lower
337 if {$delta >= (1 << 23) ||
$delta < -(1<<23)} {
338 error "delta $delta out of range"
340 if {$delta > $max_delta} {
342 } elseif
{-$delta > $max_delta} {
343 set max_delta
[expr {-$delta}]
345 append line
[expr {($delta << 8) |
($case << 5) |
$type}]
349 if {[string length
$line] > 65} {
350 puts $f [string trimright
$line]
354 puts "max_delta = $max_delta"
356 puts -nonewline $f "};
361 # define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next])
363 # define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0)
367 * The following constants are used to determine the category of a
380 COMBINING_SPACING_MARK,
381 DECIMAL_DIGIT_NUMBER,
391 CONNECTOR_PUNCTUATION,
395 INITIAL_QUOTE_PUNCTUATION,
396 FINAL_QUOTE_PUNCTUATION,
405 * The following macros extract the fields of the character info. The
406 * GetDelta() macro is complicated because we can't rely on the C compiler
407 * to do sign extension on right shifts.
410 #define GetCaseType(info) (((info) & 0xe0) >> 5)
411 #define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f)
412 #define GetDelta(info) ((info) >> 8)
415 /** Extract information about a Unicode character.
417 * This function extracts the information about a character from the
418 * Unicode character tables.
421 Xapian::Unicode::Internal::get_character_info(unsigned ch) XAPIAN_NOEXCEPT
423 if (rare(ch >= 0x110000)) {
424 // Categorise non-Unicode values as UNASSIGNED with no case variants.
425 return Xapian::Unicode::UNASSIGNED;
427 auto group = (pageMap\[int(ch) >> OFFSET_BITS\] << OFFSET_BITS) |
428 ((ch) & ((1 << OFFSET_BITS) - 1));
429 return groups\[groupMap\[group\]\];