Fixes default log output to console for macOS
[sqlcipher.git] / ext / fts3 / unicode / parseunicode.tcl
blob7c246a4a0925670669e81093951760047f7b7b05
2 #--------------------------------------------------------------------------
3 # Parameter $zName must be a path to the file UnicodeData.txt. This command
4 # reads the file and returns a list of mappings required to remove all
5 # diacritical marks from a unicode string. Each mapping is itself a list
6 # consisting of two elements - the unicode codepoint and the single ASCII
7 # character that it should be replaced with, or an empty string if the
8 # codepoint should simply be removed from the input. Examples:
10 # { 224 a 0 } (replace codepoint 224 to "a")
11 # { 769 "" 0 } (remove codepoint 769 from input)
13 # Mappings are only returned for non-upper case codepoints. It is assumed
14 # that the input has already been folded to lower case.
16 # The third value in the list is always either 0 or 1. 0 if the
17 # UnicodeData.txt file maps the codepoint to a single ASCII character and
18 # a diacritic, or 1 if the mapping is indirect. For example, consider the
19 # two entries:
21 # 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
22 # 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
24 # The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
25 # diacritic). The second is an indirect mapping, as it maps to the
26 # first codepoint plus 0302 (a diacritic).
28 proc rd_load_unicodedata_text {zName} {
29 global tl_lookup_table
31 set fd [open $zName]
32 set lField {
33 code
34 character_name
35 general_category
36 canonical_combining_classes
37 bidirectional_category
38 character_decomposition_mapping
39 decimal_digit_value
40 digit_value
41 numeric_value
42 mirrored
43 unicode_1_name
44 iso10646_comment_field
45 uppercase_mapping
46 lowercase_mapping
47 titlecase_mapping
49 set lRet [list]
51 while { ![eof $fd] } {
52 set line [gets $fd]
53 if {$line == ""} continue
55 set fields [split $line ";"]
56 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
57 foreach $lField $fields {}
58 if { [llength $character_decomposition_mapping]!=2
59 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
60 } {
61 continue
64 set iCode [expr "0x$code"]
65 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
66 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
68 # Filter out upper-case characters, as they will be mapped to their
69 # lower-case equivalents before this data is used.
70 if {[info exists tl_lookup_table($iCode)]} continue
72 # Check if this is an indirect mapping. If so, set bIndirect to true
73 # and change $iAscii to the indirectly mappped ASCII character.
74 set bIndirect 0
75 if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
76 set iAscii $mapping($iAscii)
77 set bIndirect 1
80 if { ($iAscii >= 97 && $iAscii <= 122)
81 || ($iAscii >= 65 && $iAscii <= 90)
82 } {
83 lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
84 set mapping($iCode) $iAscii
85 set dia($iDia) 1
89 foreach d [array names dia] {
90 lappend lRet [list $d "" 0]
92 set lRet [lsort -integer -index 0 $lRet]
94 close $fd
95 set lRet
98 #-------------------------------------------------------------------------
99 # Parameter $zName must be a path to the file UnicodeData.txt. This command
100 # reads the file and returns a list of codepoints (integers). The list
101 # contains all codepoints in the UnicodeData.txt assigned to any "General
102 # Category" that is not a "Letter" or "Number".
104 proc an_load_unicodedata_text {zName} {
105 set fd [open $zName]
106 set lField {
107 code
108 character_name
109 general_category
110 canonical_combining_classes
111 bidirectional_category
112 character_decomposition_mapping
113 decimal_digit_value
114 digit_value
115 numeric_value
116 mirrored
117 unicode_1_name
118 iso10646_comment_field
119 uppercase_mapping
120 lowercase_mapping
121 titlecase_mapping
123 set lRet [list]
125 while { ![eof $fd] } {
126 set line [gets $fd]
127 if {$line == ""} continue
129 set fields [split $line ";"]
130 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
131 foreach $lField $fields {}
133 set iCode [expr "0x$code"]
134 set bAlnum [expr {
135 [lsearch {L N} [string range $general_category 0 0]] >= 0
136 || $general_category=="Co"
139 if { !$bAlnum } { lappend lRet $iCode }
142 close $fd
143 set lRet
146 proc tl_load_casefolding_txt {zName} {
147 global tl_lookup_table
149 set fd [open $zName]
150 while { ![eof $fd] } {
151 set line [gets $fd]
152 if {[string range $line 0 0] == "#"} continue
153 if {$line == ""} continue
155 foreach x {a b c d} {unset -nocomplain $x}
156 foreach {a b c d} [split $line ";"] {}
158 set a2 [list]
159 set c2 [list]
160 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
161 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
162 set b [string trim $b]
163 set d [string trim $d]
165 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
169 proc cc_load_unicodedata_text {zName} {
170 set fd [open $zName]
171 set lField {
172 code
173 character_name
174 general_category
175 canonical_combining_classes
176 bidirectional_category
177 character_decomposition_mapping
178 decimal_digit_value
179 digit_value
180 numeric_value
181 mirrored
182 unicode_1_name
183 iso10646_comment_field
184 uppercase_mapping
185 lowercase_mapping
186 titlecase_mapping
188 set lRet [list]
190 while { ![eof $fd] } {
191 set line [gets $fd]
192 if {$line == ""} continue
194 set fields [split $line ";"]
195 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
196 foreach $lField $fields {}
198 lappend lRet [list $code $general_category]
201 close $fd
202 set lRet