ext/fts3/unicode/parseunicode.tcl

   1
   2 #--------------------------------------------------------------------------
   3 # Parameter $zName must be a path to the file UnicodeData.txt. This command
   4 # reads the file and returns a list of mappings required to remove all
   5 # diacritical marks from a unicode string. Each mapping is itself a list
   6 # consisting of two elements - the unicode codepoint and the single ASCII
   7 # character that it should be replaced with, or an empty string if the
   8 # codepoint should simply be removed from the input. Examples:
   9 #
  10 #   { 224 a  0 }     (replace codepoint 224 to "a")
  11 #   { 769 "" 0 }     (remove codepoint 769 from input)
  12 #
  13 # Mappings are only returned for non-upper case codepoints. It is assumed
  14 # that the input has already been folded to lower case.
  15 #
  16 # The third value in the list is always either 0 or 1. 0 if the
  17 # UnicodeData.txt file maps the codepoint to a single ASCII character and
  18 # a diacritic, or 1 if the mapping is indirect. For example, consider the
  19 # two entries:
  20 #
  21 # 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
  22 # 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
  23 #
  24 # The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a
  25 # diacritic). The second is an indirect mapping, as it maps to the
  26 # first codepoint plus 0302 (a diacritic).
  27 #
  28 proc rd_load_unicodedata_text {zName} {
  29   global tl_lookup_table
  30
  31   set fd [open $zName]
  32   set lField {
  33     code
  34     character_name
  35     general_category
  36     canonical_combining_classes
  37     bidirectional_category
  38     character_decomposition_mapping
  39     decimal_digit_value
  40     digit_value
  41     numeric_value
  42     mirrored
  43     unicode_1_name
  44     iso10646_comment_field
  45     uppercase_mapping
  46     lowercase_mapping
  47     titlecase_mapping
  48   }
  49   set lRet [list]
  50
  51   while { ![eof $fd] } {
  52     set line [gets $fd]
  53     if {$line == ""} continue
  54
  55     set fields [split $line ";"]
  56     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
  57     foreach $lField $fields {}
  58     if { [llength $character_decomposition_mapping]!=2
  59       || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
  60     } {
  61       continue
  62     }
  63
  64     set iCode  [expr "0x$code"]
  65     set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
  66     set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
  67
  68     # Filter out upper-case characters, as they will be mapped to their
  69     # lower-case equivalents before this data is used.
  70     if {[info exists tl_lookup_table($iCode)]} continue
  71
  72     # Check if this is an indirect mapping. If so, set bIndirect to true
  73     # and change $iAscii to the indirectly mappped ASCII character.
  74     set bIndirect 0
  75     if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
  76       set iAscii $mapping($iAscii)
  77       set bIndirect 1
  78     }
  79
  80     if { ($iAscii >= 97 && $iAscii <= 122)
  81       || ($iAscii >= 65 && $iAscii <= 90)
  82     } {
  83       lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
  84       set mapping($iCode) $iAscii
  85       set dia($iDia) 1
  86     }
  87   }
  88
  89   foreach d [array names dia] {
  90     lappend lRet [list $d "" 0]
  91   }
  92   set lRet [lsort -integer -index 0 $lRet]
  93
  94   close $fd
  95   set lRet
  96 }
  97
  98 #-------------------------------------------------------------------------
  99 # Parameter $zName must be a path to the file UnicodeData.txt. This command
 100 # reads the file and returns a list of codepoints (integers). The list
 101 # contains all codepoints in the UnicodeData.txt assigned to any "General
 102 # Category" that is not a "Letter" or "Number".
 103 #
 104 proc an_load_unicodedata_text {zName} {
 105   set fd [open $zName]
 106   set lField {
 107     code
 108     character_name
 109     general_category
 110     canonical_combining_classes
 111     bidirectional_category
 112     character_decomposition_mapping
 113     decimal_digit_value
 114     digit_value
 115     numeric_value
 116     mirrored
 117     unicode_1_name
 118     iso10646_comment_field
 119     uppercase_mapping
 120     lowercase_mapping
 121     titlecase_mapping
 122   }
 123   set lRet [list]
 124
 125   while { ![eof $fd] } {
 126     set line [gets $fd]
 127     if {$line == ""} continue
 128
 129     set fields [split $line ";"]
 130     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
 131     foreach $lField $fields {}
 132
 133     set iCode [expr "0x$code"]
 134     set bAlnum [expr {
 135          [lsearch {L N} [string range $general_category 0 0]] >= 0
 136       || $general_category=="Co"
 137     }]
 138
 139     if { !$bAlnum } { lappend lRet $iCode }
 140   }
 141
 142   close $fd
 143   set lRet
 144 }
 145
 146 proc tl_load_casefolding_txt {zName} {
 147   global tl_lookup_table
 148
 149   set fd [open $zName]
 150   while { ![eof $fd] } {
 151     set line [gets $fd]
 152     if {[string range $line 0 0] == "#"} continue
 153     if {$line == ""} continue
 154
 155     foreach x {a b c d} {unset -nocomplain $x}
 156     foreach {a b c d} [split $line ";"] {}
 157
 158     set a2 [list]
 159     set c2 [list]
 160     foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
 161     foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
 162     set b [string trim $b]
 163     set d [string trim $d]
 164
 165     if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
 166   }
 167 }
 168
 169 proc cc_load_unicodedata_text {zName} {
 170   set fd [open $zName]
 171   set lField {
 172     code
 173     character_name
 174     general_category
 175     canonical_combining_classes
 176     bidirectional_category
 177     character_decomposition_mapping
 178     decimal_digit_value
 179     digit_value
 180     numeric_value
 181     mirrored
 182     unicode_1_name
 183     iso10646_comment_field
 184     uppercase_mapping
 185     lowercase_mapping
 186     titlecase_mapping
 187   }
 188   set lRet [list]
 189
 190   while { ![eof $fd] } {
 191     set line [gets $fd]
 192     if {$line == ""} continue
 193
 194     set fields [split $line ";"]
 195     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
 196     foreach $lField $fields {}
 197
 198     lappend lRet [list $code $general_category]
 199   }
 200
 201   close $fd
 202   set lRet
 203 }
 204
 205