1 # Id: UCD.pm,v 1.1.1.1 2003/06/04 00:27:53 marka Exp
3 # Copyright (c) 2000,2001 Japan Network Information Center.
6 # By using this file, you agree to the terms and conditions set forth bellow.
8 # LICENSE TERMS AND CONDITIONS
10 # The following License Terms and Conditions apply, unless a different
11 # license is obtained from Japan Network Information Center ("JPNIC"),
12 # a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
13 # Chiyoda-ku, Tokyo 101-0047, Japan.
15 # 1. Use, Modification and Redistribution (including distribution of any
16 # modified or derived work) in source and/or binary forms is permitted
17 # under this License Terms and Conditions.
19 # 2. Redistribution of source code must retain the copyright notices as they
20 # appear in each source code file, this License Terms and Conditions.
22 # 3. Redistribution in binary form must reproduce the Copyright Notice,
23 # this License Terms and Conditions, in the documentation and/or other
24 # materials provided with the distribution. For the purposes of binary
25 # distribution the "Copyright Notice" refers to the following language:
26 # "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
28 # 4. The name of JPNIC may not be used to endorse or promote products
29 # derived from this Software without specific prior written approval of
32 # 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
33 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
34 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
35 # PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
36 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
37 # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
38 # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
39 # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40 # WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
41 # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42 # ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
48 # UCD.pm -- parser for Unicode Character Database files.
50 # This file is an aggregation of the following modules, each of which
51 # provides a parser for a specific data file of UCD.
52 # UCD::UnicodeData -- for UnicodeData.txt
53 # UCD::CaseFolding -- for CaseFolding.txt
54 # UCD::SpecialCasing -- for SpecialCasing.txt
55 # UCD::CompositionExclusions -- for CompositionExclusions-1.txt
57 # Each module provides two subroutines:
59 # $line = getline(\*HANDLE);
60 # reads next non-comment line from HANDLE, and returns it.
61 # undef will be returned upon EOF.
63 # %fields = parse($line);
64 # parses a line and extract fields, and returns a list of
65 # field name and its value, suitable for assignment to a hash.
68 package UCD
::UnicodeData
;
76 $s =~ s/\r?\n$// if $s;
83 my @f = split /;/, $s, -1;
84 return (CODE
=> hex($f[0]),
89 DECOMP
=> dcmap
($f[5]),
90 DECIMAL
=> dvalue
($f[6]),
91 DIGIT
=> dvalue
($f[7]),
92 NUMERIC
=> dvalue
($f[8]),
93 MIRRORED
=> $f[9] eq 'Y',
96 UPPER
=> ucode
($f[12]),
97 LOWER
=> ucode
($f[13]),
98 TITLE
=> ucode
($f[14]));
103 return undef if $v eq '';
104 $v =~ /^(?:(<[^>]+>)\s*)?(\S.*)/
105 or croak
"invalid decomposition mapping \"$v\"";
107 [$tag, map {hex($_)} split(' ', $2)];
112 return undef if $v eq '';
118 return undef if $v eq '';
122 package UCD
::CaseFolding
;
128 while (defined(my $s = <$fh>)) {
130 next if $s =~ /^\s*$/;
139 my @f = split /;\s*/, $s, -1;
140 return (CODE
=> hex($f[0]),
142 MAP
=> [map(hex, split ' ', $f[2])],
146 package UCD
::SpecialCasing
;
152 while (defined(my $s = <$fh>)) {
154 next if $s =~ /^\s*$/;
164 my @f = split /;\s*/, $s, -1;
165 my $cond = (@f > 5) ?
$f[4] : undef;
166 return (CODE
=> hex($f[0]),
167 LOWER
=> [map(hex, split ' ', $f[1])],
168 TITLE
=> [map(hex, split ' ', $f[2])],
169 UPPER
=> [map(hex, split ' ', $f[3])],
173 package UCD
::CompositionExclusions
;
179 while (defined(my $s = <$fh>)) {
181 next if $s =~ /^\s*$/;
191 return (CODE
=> hex($&));