bootstrap: Update to omega to latest common
[xapian.git] / xapian-core / unicode / uniParse.tcl
blob3aa88a9c3e74e6ec18d604e24814266727b8f995
1 # uniParse.tcl --
3 # This program parses the UnicodeData file and generates the
4 # corresponding source file with compressed character
5 # data tables. The input to this program should be the latest
6 # UnicodeData file from:
7 # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
9 # Copyright (c) 1998-1999 by Scriptics Corporation.
10 # All rights reserved.
13 namespace eval uni {
14 set shift 8; # number of bits of data within a page
15 # This value can be adjusted to find the
16 # best split to minimize table size
18 variable pMap; # map from page to page index, each entry is
19 # an index into the pages table, indexed by
20 # page number
21 variable pages; # map from page index to page info, each
22 # entry is a list of indices into the groups
23 # table, the list is indexed by the offset
24 variable groups; # list of character info values, indexed by
25 # group number, initialized with the
26 # unassigned character group
28 variable categories {
29 Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
30 Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
31 }; # Ordered list of character categories, must
32 # match the enumeration in the header file.
35 proc uni::getValue {items index} {
36 variable categories
38 # Extract character info
40 set category [lindex $items 2]
41 if {[scan [lindex $items 12] %x toupper] == 1} {
42 set toupper [expr {$index - $toupper}]
43 } else {
44 set toupper 0
46 if {[scan [lindex $items 13] %x tolower] == 1} {
47 set tolower [expr {$tolower - $index}]
48 } else {
49 set tolower 0
51 if {[scan [lindex $items 14] %x totitle] == 1} {
52 set totitle [expr {$index - $totitle}]
53 } elseif {$tolower} {
54 set totitle 0
55 } else {
56 set totitle $toupper
59 set categoryIndex [lsearch -exact $categories $category]
60 if {$categoryIndex < 0} {
61 puts "Unexpected character category: $index($category)"
62 set categoryIndex 0
65 return [list $categoryIndex $toupper $tolower $totitle]
68 proc uni::getGroup {value} {
69 variable groups
71 set gIndex [lsearch -exact $groups $value]
72 if {$gIndex == -1} {
73 set gIndex [llength $groups]
74 lappend groups $value
76 return $gIndex
79 proc uni::addPage {info} {
80 variable pMap
81 variable pages
83 set pIndex [lsearch -exact $pages $info]
84 if {$pIndex == -1} {
85 set pIndex [llength $pages]
86 lappend pages $info
88 lappend pMap $pIndex
89 return
92 proc uni::buildTables {data} {
93 variable shift
95 variable pMap {}
96 variable pages {}
97 variable groups {{0 0 0 0}}
98 variable next 0
99 set info {} ;# temporary page info
101 set mask [expr {(1 << $shift) - 1}]
103 foreach line [split $data \n] {
104 if {$line eq ""} {
105 if {!($next & $mask)} {
106 # next character is already on page boundary
107 continue
109 # fill remaining page
110 set line [format %X [expr {($next-1)|$mask}]]
111 append line ";;Cn;0;ON;;;;;N;;;;;\n"
114 set items [split $line \;]
116 scan [lindex $items 0] %x index
117 if {$index > 0x2ffff} then {
118 # Ignore non-BMP characters, as long as Tcl doesn't support them
119 #continue
121 set index [format %d $index]
123 set gIndex [getGroup [getValue $items $index]]
125 # Since the input table omits unassigned characters, these will
126 # show up as gaps in the index sequence. There are a few special cases
127 # where the gaps correspond to a uniform block of assigned characters.
128 # These are indicated as such in the character name.
130 # Enter all unassigned characters up to the current character.
131 if {($index > $next) \
132 && ![regexp "Last>$" [lindex $items 1]]} {
133 for {} {$next < $index} {incr next} {
134 lappend info 0
135 if {($next & $mask) == $mask} {
136 addPage $info
137 set info {}
142 # Enter all assigned characters up to the current character
143 for {set i $next} {$i <= $index} {incr i} {
144 # Add the group index to the info for the current page
145 lappend info $gIndex
147 # If this is the last entry in the page, add the page
148 if {($i & $mask) == $mask} {
149 addPage $info
150 set info {}
153 set next [expr {$index + 1}]
155 return
158 proc uni::main {} {
159 global argc argv0 argv
160 variable pMap
161 variable pages
162 variable groups
163 variable shift
164 variable next
165 variable max_delta
167 if {$argc != 3} {
168 puts stderr "\nusage: $argv0 <datafile> <version> <outfile>\n"
169 exit 1
172 set f [open [lindex $argv 0] r]
173 set data [read $f]
174 close $f
176 buildTables $data
177 puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]"
178 set size [expr {[llength $pMap] + ([llength $pages]<<$shift)}]
179 puts "shift = $shift, space = $size"
181 set f [open [lindex $argv 2] w]
182 fconfigure $f -translation lf
183 puts $f "/*
184 * [lindex $argv 2] --
186 * Declarations of Unicode [lindex $argv 1] character information tables. This
187 * file is automatically generated by a modified version of the
188 * tools/uniParse.tcl script from the Tcl sources.
190 * Do not modify this file by hand!
192 * Copyright (c) 1998 by Scriptics Corporation.
193 * All rights reserved.
196 #include <config.h>
198 #include <xapian/unicode.h>
201 * A 16-bit Unicode character is split into two parts in order to index
202 * into the following tables. The lower OFFSET_BITS comprise an offset
203 * into a page of characters. The upper bits comprise the page number.
206 #define OFFSET_BITS $shift
209 * The pageMap is indexed by page number and returns an alternate page number
210 * that identifies a unique page of characters. Many Unicode characters map
211 * to the same alternate page number.
214 static const unsigned char pageMap\[\] = {"
215 set line " "
216 set last [expr {[llength $pMap] - 1}]
217 for {set i 0} {$i <= $last} {incr i} {
218 # if {$i == [expr {0x10000 >> $shift}]} {
219 # set line [string trimright $line " \t,"]
220 # puts $f $line
221 # set lastpage [expr {[lindex $line end] >> $shift}]
222 # puts stdout "lastpage: $lastpage"
223 # puts $f "#if TCL_UTF_MAX > 3"
224 # set line " ,"
226 append line [lindex $pMap $i]
227 if {$i != $last} {
228 append line ", "
230 if {[string length $line] > 70} {
231 puts $f [string trimright $line]
232 set line " "
235 puts $f $line
236 # puts $f "#endif /* TCL_UTF_MAX > 3 */"
237 puts $f "};
240 * The groupMap is indexed by combining the alternate page number with
241 * the page offset and returns a group number that identifies a unique
242 * set of character attributes.
245 static const unsigned char groupMap\[\] = {"
246 set line " "
247 set lasti [expr {[llength $pages] - 1}]
248 for {set i 0} {$i <= $lasti} {incr i} {
249 set page [lindex $pages $i]
250 set lastj [expr {[llength $page] - 1}]
251 # if {$i == ($lastpage + 1)} {
252 # puts $f [string trimright $line " \t,"]
253 # puts $f "#if TCL_UTF_MAX > 3"
254 # set line " ,"
256 for {set j 0} {$j <= $lastj} {incr j} {
257 append line [lindex $page $j]
258 if {$j != $lastj || $i != $lasti} {
259 append line ", "
261 if {[string length $line] > 70} {
262 puts $f [string trimright $line]
263 set line " "
267 puts $f $line
268 # puts $f "#endif /* TCL_UTF_MAX > 3 */"
269 puts $f "};
272 * Each group represents a unique set of character attributes. The attributes
273 * are encoded into a 32-bit value as follows:
275 * Bits 0-4 Character category: see the constants listed below.
277 * Bits 5-7 Case delta type: 000 = identity
278 * 010 = add delta for lower
279 * 011 = add delta for lower, add 1 for title
280 * 100 = subtract delta for title/upper
281 * 101 = sub delta for upper, sub 1 for title
282 * 110 = sub delta for upper, add delta for lower
284 * Bits 8-31 Case delta: delta for case conversions. This should be the
285 * highest field so we can easily sign extend.
288 static const int groups\[\] = {"
289 set line " "
290 set last [expr {[llength $groups] - 1}]
291 set max_delta -1
292 for {set i 0} {$i <= $last} {incr i} {
293 foreach {type toupper tolower totitle} [lindex $groups $i] {}
295 # Compute the case conversion type and delta
297 if {$totitle} {
298 if {$totitle == $toupper} {
299 # subtract delta for title or upper
300 set case 4
301 set delta $toupper
302 if {$tolower} {
303 error "New case conversion type needed: $toupper $tolower $totitle"
305 } elseif {$toupper} {
306 # subtract delta for upper, subtract 1 for title
307 set case 5
308 set delta $toupper
309 if {($totitle != 1) || $tolower} {
310 error "New case conversion type needed: $toupper $tolower $totitle"
312 } else {
313 # add delta for lower, add 1 for title
314 set case 3
315 set delta $tolower
316 if {$totitle != -1} {
317 error "New case conversion type needed: $toupper $tolower $totitle"
320 } elseif {$toupper} {
321 # subtract delta for upper, add delta for lower
322 set case 6
323 set delta $toupper
324 if {$tolower != $toupper} {
325 error "New case conversion type needed: $toupper $tolower $totitle"
327 } elseif {$tolower} {
328 # add delta for lower
329 set case 2
330 set delta $tolower
331 } else {
332 # noop
333 set case 0
334 set delta 0
337 if {$delta >= (1 << 23) || $delta < -(1<<23)} {
338 error "delta $delta out of range"
340 if {$delta > $max_delta} {
341 set max_delta $delta
342 } elseif {-$delta > $max_delta} {
343 set max_delta [expr {-$delta}]
345 append line [expr {($delta << 8) | ($case << 5) | $type}]
346 if {$i != $last} {
347 append line ", "
349 if {[string length $line] > 65} {
350 puts $f [string trimright $line]
351 set line " "
354 puts "max_delta = $max_delta"
355 puts $f $line
356 puts -nonewline $f "};
358 #if 0
360 #if TCL_UTF_MAX > 3
361 # define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next])
362 #else
363 # define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0)
364 #endif
367 * The following constants are used to determine the category of a
368 * Unicode character.
371 enum {
372 UNASSIGNED,
373 UPPERCASE_LETTER,
374 LOWERCASE_LETTER,
375 TITLECASE_LETTER,
376 MODIFIER_LETTER,
377 OTHER_LETTER,
378 NON_SPACING_MARK,
379 ENCLOSING_MARK,
380 COMBINING_SPACING_MARK,
381 DECIMAL_DIGIT_NUMBER,
382 LETTER_NUMBER,
383 OTHER_NUMBER,
384 SPACE_SEPARATOR,
385 LINE_SEPARATOR,
386 PARAGRAPH_SEPARATOR,
387 CONTROL,
388 FORMAT,
389 PRIVATE_USE,
390 SURROGATE,
391 CONNECTOR_PUNCTUATION,
392 DASH_PUNCTUATION,
393 OPEN_PUNCTUATION,
394 CLOSE_PUNCTUATION,
395 INITIAL_QUOTE_PUNCTUATION,
396 FINAL_QUOTE_PUNCTUATION,
397 OTHER_PUNCTUATION,
398 MATH_SYMBOL,
399 CURRENCY_SYMBOL,
400 MODIFIER_SYMBOL,
401 OTHER_SYMBOL
405 * The following macros extract the fields of the character info. The
406 * GetDelta() macro is complicated because we can't rely on the C compiler
407 * to do sign extension on right shifts.
410 #define GetCaseType(info) (((info) & 0xe0) >> 5)
411 #define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f)
412 #define GetDelta(info) ((info) >> 8)
413 #endif
415 /** Extract information about a Unicode character.
417 * This function extracts the information about a character from the
418 * Unicode character tables.
421 Xapian::Unicode::Internal::get_character_info(unsigned ch) XAPIAN_NOEXCEPT
423 if (rare(ch >= 0x110000)) {
424 // Categorise non-Unicode values as UNASSIGNED with no case variants.
425 return Xapian::Unicode::UNASSIGNED;
427 auto group = (pageMap\[int(ch) >> OFFSET_BITS\] << OFFSET_BITS) |
428 ((ch) & ((1 << OFFSET_BITS) - 1));
429 return groups\[groupMap\[group\]\];
433 close $f
436 uni::main
438 return