3 # Parameter $zName must be a path to the file UnicodeData.txt. This command
4 # reads the file and returns a list of mappings required to remove all
5 # diacritical marks from a unicode string. Each mapping is itself a list
6 # consisting of two elements - the unicode codepoint and the single ASCII
7 # character that it should be replaced with, or an empty string if the
8 # codepoint should simply be removed from the input. Examples:
10 # { 224 a } (replace codepoint 224 with "a")
11 # { 769 "" } (remove codepoint 769 from input)
13 # Mappings are only returned for non-upper case codepoints. It is assumed
14 # that the input has already been folded to lower case.
# zName - path of the data file to read (see the description above).
# Builds lRet, a list of two-element lists (codepoint, replacement), and
# sorts it by codepoint before finishing.
16 proc rd_load_unicodedata_text
{zName
} {
17 global tl_lookup_table
24 canonical_combining_classes
25 bidirectional_category
26 character_decomposition_mapping
32 iso10646_comment_field
39 while { ![eof $fd] } {
41 if {$line == ""} continue
43 set fields
[split $line ";"]
# Every record must supply exactly one value per field name in lField.
44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
# Assign each value to the variable named by the matching lField entry.
45 foreach $lField $fields {}
46 if { [llength $character_decomposition_mapping]!=2
47 ||
[string is xdigit
[lindex $character_decomposition_mapping 0]]==0
# The codepoint and both decomposition elements are hexadecimal text;
# convert them to integers.
52 set iCode
[expr "0x$code"]
53 set iAscii
[expr "0x[lindex $character_decomposition_mapping 0]"]
54 set iDia
[expr "0x[lindex $character_decomposition_mapping 1]"]
# Codepoints that have an entry in tl_lookup_table are skipped.
56 if {[info exists tl_lookup_table
($iCode)]} continue
# Only decompositions whose first element is an ASCII letter (a-z or A-Z)
# produce a mapping; the letter is stored in lower case.
58 if { ($iAscii >= 97 && $iAscii <= 122)
59 ||
($iAscii >= 65 && $iAscii <= 90)
61 lappend lRet
[list $iCode [string tolower
[format %c
$iAscii]]]
# Every key of the dia array is mapped to the empty string.
66 foreach d
[array names dia
] {
67 lappend lRet
[list $d ""]
# Order the mappings by their first element (the codepoint).
69 set lRet
[lsort -integer -index 0 $lRet]
77 global tl_lookup_table
82 set iFirst
[lindex $map 0 0]
83 set cPrev
[lindex $map 0 1]
85 foreach m
[lrange $map 1 end
] {
89 for {set j
[expr $iFirst+$nRange]} {$j<$i} {incr j
} {
90 if {[info exists tl_lookup_table
($j)]==0} break
94 set nNew
[expr {(1 + $i - $iFirst)}]
102 lappend lRange
[list $iFirst $nRange]
109 lappend lRange
[list $iFirst $nRange]
113 puts "** If the argument is a codepoint corresponding to a lowercase letter"
114 puts "** in the ASCII range with a diacritic added, return the codepoint"
115 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
116 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
117 puts "** E\"). The resuls of passing a codepoint that corresponds to an"
118 puts "** uppercase letter are undefined."
120 puts "static int remove_diacritic(int c)\{"
121 puts " unsigned short aDia\[\] = \{"
122 puts -nonewline " 0, "
125 foreach {iCode nRange
} $r {}
126 if {($i % 8)==0} {puts "" ; puts -nonewline " " }
129 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
134 puts " char aChar\[\] = \{"
135 puts -nonewline " '\\0', "
139 if {$c == ""} { set str
"'\\0', " }
141 if {($i % 12)==0} {puts "" ; puts -nonewline " " }
143 puts -nonewline "$str"
148 unsigned int key
= (((unsigned int
)c
)<<3) |
0x00000007;
150 int iHi
= sizeof
(aDia
)/sizeof
(aDia
[0]) - 1;
153 int iTest
= (iHi
+ iLo
) / 2;
154 if( key
>= aDia
[iTest
] ){
161 assert
( key
>=aDia
[iRes
] );
162 return ((c
> (aDia
[iRes
]>>3) + (aDia
[iRes
]&0x07)) ? c
: (int
)aChar
[iRes
]);}
# Print to stdout a C function named $zFunc that reports whether a codepoint
# is a diacritical modifier character.
#
#   zFunc - name given to the generated C function.
#   map   - list of (code, char) pairs; every pair with a non-zero code and
#           an empty char is collected into lCode.
#
# The collected codes are sorted, and the offset of each code from the
# smallest one (iFirst) is recorded as a bit in i1 (bit i) or in i2
# (bit i-32). The two values are emitted as mask0 and mask1. The generated
# function returns 0 for arguments outside iFirst..iLast and otherwise tests
# the matching bit.
#
# NOTE(review): two 32-bit masks cover offsets 0..63 only; no check that
# iLast-iFirst stays below 64 is visible in this block - confirm.
166 proc print_isdiacritic
{zFunc map
} {
170 foreach {code char
} $m {}
171 if {$code && $char == ""} { lappend lCode
$code }
173 set lCode
[lsort -integer $lCode]
174 set iFirst
[lindex $lCode 0]
175 set iLast
[lindex $lCode end
]
181 set i
[expr $c - $iFirst]
183 set i1
[expr {$i1 |
(1<<$i)}]
185 set i2
[expr {$i2 |
(1<<($i-32))}]
190 puts "** Return true if the argument interpreted as a unicode codepoint"
191 puts "** is a diacritical modifier character."
193 puts "int ${zFunc}\(int c)\{"
194 puts " unsigned int mask0 = [format "0x
%08X
" $i1];"
195 puts " unsigned int mask1 = [format "0x
%08X
" $i2];"
197 puts " if( c<$iFirst || c>$iLast ) return 0;"
198 puts " return (c < $iFirst+32) ?"
199 puts " (mask0 & (1 << (c-$iFirst))) :"
200 puts " (mask1 & (1 << (c-$iFirst-32)));"
205 #-------------------------------------------------------------------------
207 # Parameter $zName must be a path to the file UnicodeData.txt. This command
208 # reads the file and returns a list of codepoints (integers). The list
209 # contains all codepoints in UnicodeData.txt assigned to any "General
210 # Category" that is not a "Letter", a "Number" or "Co" (private use).
# zName - path of the data file to read (see the description above).
# Collects in lRet the integer value of every codepoint for which bAlnum
# is false.
212 proc an_load_unicodedata_text
{zName
} {
218 canonical_combining_classes
219 bidirectional_category
220 character_decomposition_mapping
226 iso10646_comment_field
233 while { ![eof $fd] } {
# Blank lines carry no record.
235 if {$line == ""} continue
237 set fields
[split $line ";"]
# Every record must supply exactly one value per field name in lField.
238 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
# Assign each value to the variable named by the matching lField entry.
239 foreach $lField $fields {}
# The codepoint is hexadecimal text; convert it to an integer.
241 set iCode
[expr "0x$code"]
243 [lsearch {L N
} [string range
$general_category 0 0]] >= 0
244 ||
$general_category=="Co"
# The condition above accepts categories whose first letter is L or N, plus
# the category Co; codepoints for which bAlnum is false are collected.
247 if { !$bAlnum } { lappend lRet
$iCode }
# Load the codepoint list produced by an_load_unicodedata_text for the file
# named by the global variable unicodedata.txt, and collapse it into a list
# (lRange) of two-element lists: first codepoint iFirst and run length
# nRange.
#
# A codepoint equal to iFirst+nRange is the direct successor of the current
# run (see the elseif test). The statements executed in each branch are not
# all visible here; presumably the run is extended in that case and is
# otherwise appended to lRange and restarted - confirm.
254 proc an_load_separator_ranges
{} {
255 global unicodedata.txt
256 set lSep
[an_load_unicodedata_text
${unicodedata.txt
}]
# Start with no run in progress.
257 unset -nocomplain iFirst
258 unset -nocomplain nRange
261 if {0==[info exists iFirst
]} {
264 } elseif
{ $sep == ($iFirst+$nRange) } {
267 lappend lRange
[list $iFirst $nRange]
272 lappend lRange
[list $iFirst $nRange]
# Print the C array aEntry, one unsigned 32-bit value per range in lRange.
#
#   lRange - list of two-element lists: first codepoint iFirst and range
#            size nRange.
#
# Each value is computed as (iFirst<<10) + nRange and printed as 0x%08X.
# An error is raised when the largest iFirst needs more than 22 bits or the
# largest nRange needs more than 10 bits, because such a value cannot be
# packed this way.
#
# NOTE(review): the C comment emitted by this proc describes the value as
# ((C<<22) + N), whereas the code below shifts by 10. The shift by 10 matches
# the 22-bit/10-bit split described in the same comment; the "<<22" in the
# emitted text looks like a typo - confirm before changing generated output.
276 proc an_print_range_array
{lRange
} {
279 foreach range
$lRange {
280 foreach {iFirst nRange
} $range {}
281 if {$iFirst > $iFirstMax} {set iFirstMax
$iFirst}
282 if {$nRange > $nRangeMax} {set nRangeMax
$nRange}
# The packed format leaves 22 bits for the first codepoint and 10 bits for
# the range size.
284 if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
285 if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}
289 /* Each unsigned integer in the following
array corresponds to a contiguous
290 ** range of unicode codepoints that are not either letters or numbers
(i.e.
291 ** codepoints
for which this function should
return 0).
293 ** The most significant
22 bits in each
32-bit value contain the first
294 ** codepoint in the range. The least significant
10 bits are used to store
295 ** the size of the range
(always at least
1). In other words
, the value
296 ** ((C
<<22) + N
) represents a range of N codepoints starting with codepoint
297 ** C. It is not possible to represent a range larger than
1023 codepoints
298 ** using this
format.
301 puts -nonewline " static const unsigned int aEntry\[\] = \{"
303 foreach range
$lRange {
304 foreach {iFirst nRange
} $range {}
305 set u32
[format "0x%08X" [expr ($iFirst<<10) + $nRange]]
307 if {($i % 5)==0} {puts "" ; puts -nonewline " "}
308 puts -nonewline " $u32,"
# Print the C array aAscii[4], a bitmap of four 32-bit words.
#
#   lRange - list of two-element lists: first codepoint iFirst and range
#            size nRange.
#
# For every codepoint c <= 127 covered by a range in lRange, bit (c & 0x1F)
# of word (c >> 5) is set. Each word is printed as 0x%08X.
315 proc an_print_ascii_bitmap
{lRange
} {
316 foreach range
$lRange {
317 foreach {iFirst nRange
} $range {}
318 for {set i
$iFirst} {$i < ($iFirst+$nRange)} {incr i
} {
319 if {$i<=127} { set a
($i) 1 }
# Four 32-bit words give one bit for each codepoint in 0..127.
323 set aAscii
[list 0 0 0 0]
324 foreach key
[array names a
] {
325 set idx
[expr $key >> 5]
326 lset aAscii
$idx [expr [lindex $aAscii $idx] |
(1 << ($key&0x001F))]
329 puts " static const unsigned int aAscii\[4\] = \{"
331 foreach v
$aAscii { puts -nonewline [format " 0x%08X," $v] }
# Print to stdout a C function "int $zFunc(int c)" that classifies a
# codepoint as alphanumeric or not.
#
#   zFunc  - name given to the generated C function.
#   lRange - list of (iFirst, nRange) ranges, passed unchanged to
#            an_print_range_array and an_print_ascii_bitmap, which emit the
#            aEntry and aAscii tables inside the function.
#
# The generated code has two visible paths: one returns true when the bit
# for c in aAscii is clear; the other, taken for c < (1<<22), builds
# key = (c<<10)|0x3FF, searches aEntry for an entry not greater than key, and
# returns true when c is at or beyond the end of the range stored there
# (start in the upper 22 bits, size in the low 10 bits).
336 proc print_isalnum
{zFunc lRange
} {
338 puts "** Return true if the argument corresponds to a unicode codepoint"
339 puts "** classified as either a letter or a number. Otherwise false."
341 puts "** The results are undefined if the value passed to this function"
342 puts "** is less than zero."
344 puts "int ${zFunc}\(int c)\{"
345 an_print_range_array
$lRange
346 an_print_ascii_bitmap
$lRange
349 return ( (aAscii
[c
>> 5] & (1 << (c
& 0x001F)))==0 );
350 }else if( c
<(1<<22) ){
351 unsigned int key
= (((unsigned int
)c
)<<10) |
0x000003FF;
353 int iHi
= sizeof
(aEntry
)/sizeof
(aEntry
[0]) - 1;
356 int iTest
= (iHi
+ iLo
) / 2;
357 if( key
>= aEntry
[iTest
] ){
364 assert
( aEntry
[0]<key
);
365 assert
( key
>=aEntry
[iRes
] );
366 return (((unsigned int
)c
) >= ((aEntry
[iRes
]>>10) + (aEntry
[iRes
]&0x3FF)));
# Print to stdout the C test function isalnum_test(int *piCode), which
# checks the generated function $zFunc against tables built from lRange.
#
#   zFunc  - name of the C function under test.
#   lRange - list of (iFirst, nRange) ranges; every codepoint covered by a
#            range is recorded as a key of the array a.
#
# Three tables are emitted:
#   aAlnum      - one flag for each codepoint 0..69999, 1 when the codepoint
#                 is not a key of a, 0 when it is.
#   aLargeSep   - keys of a that are 70000 or larger; $zFunc must return 0
#                 for each of them.
#   aLargeOther - direct neighbours (key-1 and key+1) of those keys that are
#                 not themselves keys of a; $zFunc must return 1 for each.
# The generated loops store an offending codepoint through piCode.
372 proc print_test_isalnum
{zFunc lRange
} {
373 foreach range
$lRange {
374 foreach {iFirst nRange
} $range {}
375 for {set i
$iFirst} {$i < ($iFirst+$nRange)} {incr i
} { set a
($i) 1 }
378 puts "static int isalnum_test(int *piCode)\{"
379 puts -nonewline " unsigned char aAlnum\[\] = \{"
380 for {set i
0} {$i < 70000} {incr i
} {
381 if {($i % 32)==0} { puts "" ; puts -nonewline " " }
382 set bFlag
[expr ![info exists a
($i)]]
383 puts -nonewline "${bFlag},"
# Codepoints recorded in a that lie at or above 70000.
388 puts -nonewline " int aLargeSep\[\] = \{"
390 foreach iSep
[lsort -integer [array names a
]] {
391 if {$iSep<70000} continue
392 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
393 puts -nonewline " $iSep,"
# Neighbours of those codepoints that are not recorded in a.
398 puts -nonewline " int aLargeOther\[\] = \{"
400 foreach iSep
[lsort -integer [array names a
]] {
401 if {$iSep<70000} continue
402 if {[info exists a
([expr $iSep-1])]==0} {
403 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
404 puts -nonewline " [expr $iSep-1],"
407 if {[info exists a
([expr $iSep+1])]==0} {
408 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
409 puts -nonewline " [expr $iSep+1],"
416 puts [subst -nocommands {
418 for(i
=0; i
<sizeof
(aAlnum
)/sizeof
(aAlnum
[0]); i
++){
419 if( ${zFunc
}(i
)!=aAlnum
[i
] ){
424 for(i
=0; i
<sizeof
(aLargeSep
)/sizeof
(aLargeSep
[0]); i
++){
425 if( ${zFunc
}(aLargeSep
[i
])!=0 ){
426 *piCode
= aLargeSep
[i
];
430 for(i
=0; i
<sizeof
(aLargeOther
)/sizeof
(aLargeOther
[0]); i
++){
431 if( ${zFunc
}(aLargeOther
[i
])!=1 ){
432 *piCode
= aLargeOther
[i
];
441 #-------------------------------------------------------------------------
# Populate the global array tl_lookup_table from a semicolon-separated text
# file.
#
#   zName - file to read (the script passes the CaseFolding.txt path).
#
# Each data line is split on ";" into the fields a, b, c and d. Fields a and
# c hold hexadecimal values that are converted to integers (a2, c2); b and d
# are trimmed. Only lines whose b field is "C" or "S" are stored, as
# tl_lookup_table(a2) = c2.
443 proc tl_load_casefolding_txt
{zName
} {
444 global tl_lookup_table
447 while { ![eof $fd] } {
# Skip comment lines and blank lines.
449 if {[string range
$line 0 0] == "#"} continue
450 if {$line == ""} continue
# Drop the values left over from the previous line before splitting.
452 foreach x
{a b c d
} {unset -nocomplain $x}
453 foreach {a b c d
} [split $line ";"] {}
457 foreach elem
$a { lappend a2
[expr "0x[string trim $elem]"] }
458 foreach elem
$c { lappend c2
[expr "0x[string trim $elem]"] }
459 set b
[string trim
$b]
460 set d
[string trim
$d]
# Only entries with status C or S are recorded.
462 if {$b=="C" ||
$b=="S"} { set tl_lookup_table
($a2) $c2 }
# Group the entries of the global array tl_lookup_table into records and
# collect them in lRecord.
#
# The codepoints are visited in ascending integer order. Each record is a
# four-element list:
#   iFirst - first codepoint of the run
#   nIncr  - step between consecutive codepoints of the run
#   nRange - number of codepoints in the run
#   nOff   - mapping minus codepoint, shared by the whole run
#
# A run is closed (its record appended) when the distance from the last
# codepoint of the run differs from nIncr, or when the offset of the current
# codepoint differs from nOff. A closed run of length one is stored with
# nIncr forced to 1.
466 proc tl_create_records
{} {
467 global tl_lookup_table
475 foreach code
[lsort -integer [array names tl_lookup_table
]] {
476 set mapping
$tl_lookup_table($code)
479 set nOff
[expr $mapping - $code]
# Distance between this codepoint and the last codepoint of the current run.
483 set diff
[expr $code - ($iFirst + ($nIncr * ($nRange - 1)))]
# NOTE(review): the body of this branch is not visible; presumably a
# single-element run adopts a step of 1 or 2 here - confirm.
484 if { $nRange==1 && ($diff==1 ||
$diff==2) } {
# The step or the offset changed: close the current run.
488 if {$diff != $nIncr ||
($mapping - $code)!=$nOff} {
489 if { $nRange==1 } {set nIncr
1}
490 lappend lRecord
[list $iFirst $nIncr $nRange $nOff]
492 set nOff
[expr $mapping - $code]
501 lappend lRecord
[list $iFirst $nIncr $nRange $nOff]
# Print the opening of the generated fold table: a C comment explaining how
# each entry encodes a folding rule (iCode, flags, nRange and the aiOff
# index held in flags), followed by the declaration of struct TableEntry and
# the start of the aEntry initializer. Takes no arguments.
506 proc tl_print_table_header
{} {
509 /* Each
entry in the following
array defines a rule
for folding a range
510 ** of codepoints to
lower case. The rule applies to a range of nRange
511 ** codepoints starting at codepoint iCode.
513 ** If the least significant bit in flags is clear
, then the rule applies
514 ** to all nRange codepoints
(i.e. all nRange codepoints are upper case and
515 ** need to be folded
). Or
, if it is
set, then the rule only applies to
516 ** every second codepoint in the range
, starting with codepoint C.
518 ** The
7 most significant bits in flags are an index into the aiOff
[]
519 ** array. If a specific codepoint C does require folding
, then its
lower
520 ** case equivalent is
((C
+ aiOff
[flags
>>1]) & 0xFFFF).
522 ** The contents of this
array are generated by parsing the CaseFolding.txt
523 ** file distributed as part of the
"Unicode Character Database". See
524 ** http://www.unicode.org
for details.
527 puts " static const struct TableEntry \{"
528 puts " unsigned short iCode;"
529 puts " unsigned char flags;"
530 puts " unsigned char nRange;"
531 puts " \} aEntry\[\] = \{"
# Print one initializer of the form "{iFirst, flags, nRange}," for the
# generated aEntry table.
#
#   togglevar - name of a caller variable; presumably linked to the local
#               column counter t (the linking statement is not visible
#               here) - confirm.
#   entry     - record (iFirst nIncr nRange nOff).
#   liOff     - list of distinct offsets; the position of nOff in it is
#               stored in the flags value.
#
# Returns 1 without printing anything when iFirst is greater than 1<<16.
# Raises an error when nOff is not found in liOff.
534 proc tl_print_table_entry
{togglevar
entry liOff
} {
536 foreach {iFirst nIncr nRange nOff
} $entry {}
# NOTE(review): iFirst equal to 65536 passes this test, although the table
# field iCode is declared as an unsigned short - confirm the bound.
538 if {$iFirst > (1<<16)} { return 1 }
540 if {[info exists t
]==0} {set t
0}
541 if {$t==0} { puts -nonewline " " }
# A step of two sets bit 0 of flags; the range then spans twice as many
# codepoints.
544 if {$nIncr==2} { set flags
1 ; set nRange
[expr $nRange * 2]}
# Negative offsets are wrapped into the 16-bit range.
545 if {$nOff<0} { incr nOff
[expr (1<<16)] }
547 set idx
[lsearch $liOff $nOff]
548 if {$idx<0} {error "malfunction generating aiOff"}
# The offset index occupies the bits above bit 0.
549 set flags
[expr $flags + $idx*2]
551 set txt
"{$iFirst, $flags, $nRange},"
555 puts -nonewline [format "% -23s" $txt]
# t cycles through 0, 1, 2.
557 set t
[expr ($t+1)%3]
562 proc tl_print_table_footer
{togglevar
} {
# Print a C "else if" branch that folds the codepoints of one record by
# adding a constant offset.
#
#   entry - record (iFirst nIncr nRange nOff).
#
# The branch covers iFirst <= c < iFirst+nRange and assigns ret = c + nOff.
# Records with a step of two are not supported and raise an error.
568 proc tl_print_if_entry
{entry} {
569 foreach {iFirst nIncr nRange nOff
} $entry {}
570 if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}
572 puts " else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{"
573 puts " ret = c + $nOff;"
# Compute liOff, the sorted list of distinct offsets used by the records in
# lRecord.
#
#   lRecord - list of records (iFirst nIncr nRange iOff).
#
# Negative offsets are wrapped by adding 1<<16. The distinct values are the
# keys of the array a, sorted as integers. More than 128 distinct offsets is
# an error.
577 proc tl_generate_ioff_table
{lRecord
} {
578 foreach entry $lRecord {
579 foreach {iFirst nIncr nRange iOff
} $entry {}
580 if {$iOff<0} { incr iOff
[expr (1<<16)] }
# Offsets that were already recorded need no further work.
581 if {[info exists a
($iOff)]} continue
585 set liOff
[lsort -integer [array names a
]]
586 if {[llength $liOff]>128} { error "Too many distinct ioffs" }
# Print the C array aiOff (unsigned short), eight values per row, each value
# left-aligned in a field of seven characters.
#
#   liOff - offsets to emit; presumably the loop that walks this list
#           supplies the variables off and i used below - confirm.
590 proc tl_print_ioff_table
{liOff
} {
591 puts -nonewline " static const unsigned short aiOff\[\] = \{"
594 if {($i % 8)==0} {puts "" ; puts -nonewline " "}
595 puts -nonewline [format "% -7s" "$off,"]
# Print to stdout the C function "int $zFunc(int c, int bRemoveDiacritic)"
# that folds a codepoint to lower case.
#
#   zFunc - name given to the generated C function.
#
# The records come from tl_create_records and the offset list from
# tl_generate_ioff_table. The proc emits the table header, one table entry
# per record via tl_print_table_entry, the table footer and the aiOff array,
# followed by the C lookup code. Records listed in lHigh are emitted as
# "else if" branches through tl_print_if_entry; presumably lHigh holds the
# records for which tl_print_table_entry returned 1 - confirm.
#
# Visible parts of the generated C code: ASCII A-Z is folded by adding
# 'a'-'A'; other codepoints are searched for in aEntry by iCode, and a
# matching rule yields (c + aiOff[flags>>1]) & 0xFFFF; when bRemoveDiacritic
# is non-zero the result is passed through remove_diacritic().
603 proc print_fold
{zFunc
} {
605 set lRecord
[tl_create_records
]
609 puts "** Interpret the argument as a unicode codepoint. If the codepoint"
610 puts "** is an upper case character that has a lower case equivalent,"
611 puts "** return the codepoint corresponding to the lower case version."
612 puts "** Otherwise, return a copy of the argument."
614 puts "** The results are undefined if the value passed to this function"
615 puts "** is less than zero."
617 puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
619 set liOff
[tl_generate_ioff_table
$lRecord]
620 tl_print_table_header
621 foreach entry $lRecord {
622 if {[tl_print_table_entry toggle
$entry $liOff]} {
626 tl_print_table_footer toggle
627 tl_print_ioff_table
$liOff
633 assert
( sizeof
(unsigned short
)==2 && sizeof
(unsigned char
)==1 );
636 if( c
>='A'
&& c
<='Z'
) ret
= c
+ ('a'
- 'A'
);
638 int iHi
= sizeof
(aEntry
)/sizeof
(aEntry
[0]) - 1;
643 int iTest
= (iHi
+ iLo
) / 2;
644 int cmp
= (c
- aEntry
[iTest
].iCode
);
652 assert
( iRes
<0 || c
>=aEntry
[iRes
].iCode
);
655 const struct TableEntry
*p
= &aEntry
[iRes
];
656 if( c
<(p-
>iCode
+ p-
>nRange
) && 0==(0x01 & p-
>flags
& (p-
>iCode ^ c
)) ){
657 ret
= (c
+ (aiOff
[p-
>flags
>>1])) & 0x0000FFFF;
662 if( bRemoveDiacritic
) ret
= remove_diacritic
(ret
);
666 foreach entry $lHigh {
667 tl_print_if_entry
$entry
# Print to stdout the C test function fold_test(int *piCode), which checks
# the generated function $zFunc against expected values.
#
#   zFunc    - name of the C function under test.
#   mappings - list of mappings; the first element of each one is used as a
#              key of the array extra.
#
# The table aLookup holds two values for each codepoint 0..69999: expected,
# taken from tl_lookup_table when an entry exists, and expected2, which is
# expected passed through extra when an entry exists. The generated loop
# calls $zFunc with (i/2, i&1) for every table index i and stores the
# codepoint through piCode on a mismatch.
675 proc print_fold_test
{zFunc mappings
} {
676 global tl_lookup_table
678 foreach m
$mappings {
681 set extra
([lindex $m 0]) 0
684 set extra
([lindex $m 0]) $i
688 puts "static int fold_test(int *piCode)\{"
689 puts -nonewline " static int aLookup\[\] = \{"
690 for {set i
0} {$i < 70000} {incr i
} {
# catch suppresses the error raised when the array has no element for the
# key, so the variable keeps the value it already had.
693 catch { set expected
$tl_lookup_table($i) }
694 set expected2
$expected
695 catch { set expected2
$extra($expected2) }
697 if {($i % 4)==0} { puts "" ; puts -nonewline " " }
698 puts -nonewline "$expected, $expected2, "
702 puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
703 puts " int iCode = (i/2);"
704 puts " int bFlag = i & 0x0001;"
705 puts " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
706 puts " *piCode = iCode;"
# Print the preamble of the generated C file: the copyright disclaimer and
# blessing, a notice that the file is machine generated, the opening
# SQLITE_DISABLE_FTS3_UNICODE and SQLITE_ENABLE_FTS3/FTS4 preprocessor
# guards, and the include of assert.h. Takes no arguments.
715 proc print_fileheader
{} {
720 ** The author disclaims copyright to this
source code. In
place of
721 ** a legal notice
, here is a blessing
:
723 ** May you do good and not evil.
724 ** May you find forgiveness
for yourself and forgive others.
725 ** May you share freely
, never taking more than you give.
727 ******************************************************************************
731 ** DO NOT EDIT THIS MACHINE GENERATED FILE.
735 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"
736 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
738 puts "#include <assert.h>"
# Print the C main() function of the generated test program. The emitted
# code runs isalnum_test() and fold_test(), prints for each either the
# offending codepoint or a "test passed" message, and returns non-zero when
# either test reported a problem. Takes no arguments.
742 proc print_test_main
{} {
744 puts "#include <stdio.h>"
746 puts "int main(int argc, char **argv)\{"
749 puts " r1 = isalnum_test(&code);"
750 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
751 puts " else printf(\"isalnum(): test passed\\n\");"
752 puts " r2 = fold_test(&code);"
753 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
754 puts " else printf(\"fold(): test passed\\n\");"
755 puts " return (r1 || r2);"
759 # Process the command line arguments. Exit early if they are not to
763 puts -nonewline stderr
"Usage: $::argv0 ?-test? "
764 puts stderr
"<CaseFolding.txt file> <UnicodeData.txt file>"
767 if {[llength $argv]!=2 && [llength $argv]!=3} usage
768 if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage
769 set unicodedata.txt
[lindex $argv end
]
770 set casefolding.txt
[lindex $argv end-1
]
771 set generate_test_code
[expr {[llength $argv]==3}]
775 # Print the isalnum() function to stdout.
777 set lRange
[an_load_separator_ranges
]
778 print_isalnum sqlite3FtsUnicodeIsalnum
$lRange
780 # Leave a gap between the two generated C functions.
785 # Load the fold data. This is used by the [rd_XXX] commands
786 # as well as [print_fold].
787 tl_load_casefolding_txt
${casefolding.txt
}
789 set mappings
[rd_load_unicodedata_text
${unicodedata.txt
}]
793 print_isdiacritic sqlite3FtsUnicodeIsdiacritic
$mappings
797 # Print the fold() function to stdout.
799 print_fold sqlite3FtsUnicodeFold
801 # Print the test routines and main() function to stdout, if -test
804 if {$::generate_test_code} {
805 print_test_isalnum sqlite3FtsUnicodeIsalnum
$lRange
806 print_fold_test sqlite3FtsUnicodeFold
$mappings
810 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
811 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"