Update contact email
[latex2e.git] / trunk / support / load-unicode-data.tex
blobb9011421e23470f2c6b80fb1631a1e133f196818
1 % File load-unicode-data.tex
3 % Copyright 2015,2016 The LaTeX3 Project
5 % It may be distributed and/or modified under the conditions of
6 % the LaTeX Project Public License (LPPL), either version 1.3c of
7 % this license or (at your option) any later version. The latest
8 % version of this license is in the file
9 % http://www.latex-project.org/lppl.txt.
11 % Issues with this file should be reported at
12 % https://github.com/latex3/unicode-data
% This file parses a number of data files provided by the Unicode Consortium
% and, when used with a Unicode-capable engine, sets up a range of TeX-related
% parameters based on the extracted information.
18 % From the file UnicodeData.txt the following properties are set:
19 % - \catcode 11 for all letters (Unicode class "L")
20 % - \catcode 11 for all combining marks (Unicode class "M")
21 % - \sfcode 999 for all code points of class "Lu" (upper case letters)
22 % - \lccode for all of class "Ll" (lower case letters) to the code point
23 % itself, and \uccode to the upper case mapping (or if not given
24 % to the code point itself)
25 % - \uccode for all of class "Lu" (upper case letters) to the code point
26 % itself, and \lccode to the lower case mapping (or if not given
27 % to the code point itself)
28 % - \lccode and \uccode for all of class "Lt" (title case letters) to the
29 % lower and upper case mappings (or if not given to the code point itself)
30 % - \lccode and \uccode for all other letter code points are set to
31 % the code point itself
32 % - \lccode and/or \uccode for non-letter code points for which an upper
33 % or lower case mapping is given
34 % - \sfcode 0 (ignored) for code points of Unicode classes "Pe" (closing
35 % punctuation marks) and "Pf" (final quotation marks)
36 % - \Umathcode for all letters in the base plane class 7 (var) and for
37 % the supplementary plane class 0 (regular)
39 % =============================================================================
% Only Unicode engines can make use of this data. At present that means XeTeX
% and LuaTeX, both of which define \Umathcode: on any other engine stop
% reading the file here.
\ifx\undefined\Umathcode \expandafter\endinput \fi
% The code below relies on the e-TeX extensions (for example \unless), so
% also stop if they are not available.
\ifx\undefined\eTeXversion \expandafter\endinput \fi
% When this file is read by IniTeX the characters |{|, |}| and |#| may not
% yet have their usual category codes, so they are set explicitly. All of the
% work happens inside a group: only the assignments made with \global survive
% once the group is closed again.
\begingroup
\catcode`\{=1\relax
\catcode`\}=2\relax
\catcode`\#=6\relax
% Announce the file in the log. |^| has to be a superscript character for the
% |^^J| notation to work, and |^^J| is made the new-line character so that
% each message ends its line.
\catcode`\^=7\relax
\newlinechar=`\^^J\relax
\message{^^J}%
\message{load-unicode-data.tex v1.2 (2016-02-02)^^J}%
\message{Reading Unicode data^^J}%
% The first stage of parsing deals with the fact that each line consists of
% many data items separated by |;|. Only a few of them are needed, so they are
% picked out and everything else is dropped. There is one complication: a few
% lines in the data file describe ranges, marked by the descriptor |First| and
% a matching |Last|. A separate routine is used to handle these cases.
%
% |\parseunicodedataI| reads the first nine fields of a line. It hands on
%   #1 - the code point,
%   #3 - the Unicode class,
%   #2 - the descriptor, followed by the marker | First>| so that the next
%        step can find out by delimiter matching whether the descriptor
%        itself already contains that marker.
% Fields #4 to #9 are discarded; the rest of the line (up to the |\relax|
% supplied by the caller) stays in the input for the later steps.
\def\parseunicodedataI#1;#2;#3;#4;#5;#6;#7;#8;#9;{%
  \parseunicodedataII#1;#3;#2 First>\relax
}%
% Decide between a normal line and the start of a range.
%   #1 - the code point
%   #2 - the Unicode class
%   #3 - the descriptor up to the first | First>|
%   #4 - whatever is left before the |\relax| added by |\parseunicodedataI|
% #4 is empty when the only | First>| is the marker appended by the previous
% step, i.e. for a normal line; it is non-empty when the descriptor itself
% contained | First>|. The selected macro is expanded past the |\fi| and then
% picks up |#1;#2;| followed by the rest of the line still in the input.
\def\parseunicodedataII#1;#2;#3 First>#4\relax{%
  \ifx\relax#4\relax
    \expandafter\parseunicodedataIII
  \else
    \expandafter\parseunicodedataVII
  \fi
    #1;#2;%
}%
% Split the remainder of a normal line. #1 and #2 are the code point and the
% Unicode class handed on by the earlier steps; #6 and #7 are passed on as the
% upper and lower case mappings. The other fields (#3 to #5 and the final #8)
% are dropped.
\def\parseunicodedataIII#1;#2;#3;#4;#5;#6;#7;#8\relax{%
  \parseunicodedataIV{#1}{#2}{#6}{#7}%
}%
83 % At this stage we have a `normal' data line with four pieces of information:
84 % the code point, the Unicode class and the (possibly empty) upper and lower
85 % case mappings. A few utility macros are defined, then we branch based on the
86 % Unicode class. Notice that for all letter-like code points we first set the
87 % |\lccode| and |\uccode| values to the code point itself then test for the
88 % classes where a different setting might be appropriate. For non-letters
89 % there is a check to see if any mappings are available, and also for trailing
90 % punctuation to set the appropriate |\sfcode|.
% Comparison macros for the Unicode classes that need special treatment: each
% one is a parameterless macro expanding to its own name, so that it can be
% tested with |\ifx| against a macro holding the class read from the file.
\def\declareclassname#1{\expandafter\def\csname#1\endcsname{#1}}%
\declareclassname{Ll}%
\declareclassname{Lt}%
\declareclassname{Lu}%
\declareclassname{Pe}%
\declareclassname{Pf}%
% Expands to the first token (or brace group) of the material up to |\relax|.
% Callers append a |?| to the material so that an empty field still provides
% something to pick up.
\def\firsttoken#1#2\relax{#1}%
% Apply the settings for one normal data line.
%   #1 - the code point as hexadecimal digits (used below as |"#1|)
%   #2 - the Unicode class
%   #3 - the upper case mapping, possibly empty
%   #4 - the lower case mapping, possibly empty
% The first test yields a number greater than zero when the class starts with
% |L| (letters) or |M| (combining marks).
\def\parseunicodedataIV#1#2#3#4{%
  \ifnum 0%
    \if L\firsttoken#2?\relax 1\fi
    \if M\firsttoken#2?\relax 1\fi
    >0 %
% Letter-like code point: start with category code 11 and case mappings to
% the code point itself, then refine the mappings for cased letters.
    \parseunicodedataV{"#1}%
    \def\temp{#2}%
    \ifx\Ll\temp
      \parseunicodedataVI\uccode{#1}{#3}%
    \fi
    \ifx\Lt\temp
      \parseunicodedataVI\uccode{#1}{#3}%
      \parseunicodedataVI\lccode{#1}{#4}%
    \fi
    \ifx\Lu\temp
      \parseunicodedataVI\lccode{#1}{#4}%
      \global\sfcode"#1=999 %
    \fi
% Letters in base plane are class~$7$, those in the supplementary plane are
% class~$0$. The limit has to be written as the hexadecimal number |"10000|:
% a bare |10000| is read as decimal (that is |"2710|) and would put most of
% the base plane into class~$0$.
    \ifnum"#1<"10000 %
      \global\Umathcode"#1="7"01"#1 %
    \else
      \global\Umathcode"#1="0"01"#1 %
    \fi
  \else
% Not letter-like: closing punctuation and final quotation marks get
% |\sfcode| 0, and any case mapping given in the data is still applied.
    \def\temp{#2}%
    \ifnum 0\ifx\temp\Pe 1\fi\ifx\temp\Pf 1\fi>0 %
      \global\sfcode"#1=0 %
    \fi
    \ifx\relax#3\relax
    \else
      \global\uccode"#1="#3 %
    \fi
    \ifx\relax#4\relax
    \else
      \global\lccode"#1="#4 %
    \fi
  \fi
}%
% A simple auxiliary for all letter-like code points: the |\lccode| and
% |\uccode| may get reset for cased letters but this means the initial
% setting can't be forgotten.
%   #1 - the code point in any form TeX accepts as a number; the callers pass
%        either |"| followed by hexadecimal digits or a count register
%        followed by a space.
% All three assignments are global so that they survive the enclosing group.
\def\parseunicodedataV#1{%
  \global\catcode#1=11 %
  \global\lccode#1=#1 %
  \global\uccode#1=#1 %
}%
% An auxiliary to deal with the fact that some cased letters don't actually
% have a case mapping available.
%   #1 - the table to set (|\lccode| or |\uccode|)
%   #2 - the code point as hexadecimal digits
%   #3 - the mapping as hexadecimal digits, possibly empty
% Nothing is done for an empty #3, which leaves the value set earlier (the
% code point itself) in place.
\def\parseunicodedataVI#1#2#3{%
  \ifx\relax#3\relax
  \else
    \global#1"#2="#3 %
  \fi
}%
% For lines that were the |First>| of a range, read the data source again for
% the matching last line. Lines for letters then trigger a loop over the
% entire range. These are always non-cased letters.
%   #1 - the first code point of the range
%   #2 - the Unicode class
%   #3 - the remainder of the |First>| line, which is discarded
% The next line is read from stream 0 and passed on, followed by |#1;#2|.
\def\parseunicodedataVII#1;#2;#3\relax{%
  \read0 to \unicodedataline
  \expandafter\parseunicodedataXII\unicodedataline\relax#1;#2\relax
}%
% Process a complete range.
%   #1 - the first field of the line just read: the last code point
%   #2 - the remainder of that line, which is discarded
%   #3 - the first code point of the range
%   #4 - the Unicode class
% Only classes starting with |L| lead to any action: every code point from
% |"#3| up to and including |"#1| is passed to |\parseunicodedataV|. The loop
% runs inside a group so that the changes to |\count0| (and to the macros
% used by |\loop|) stay local; the settings themselves are global.
\def\parseunicodedataXII#1;#2\relax#3;#4\relax{%
  \if L\firsttoken#4?\relax
    \begingroup
      \count0="#3 %
      \loop
        \unless\ifnum\count0>"#1 %
          \parseunicodedataV{\count0 }%
          \advance\count0 by 1 %
      \repeat
    \endgroup
  \fi
}%
% From plain: may not be defined (yet).
% |\loop| stores everything up to |\repeat| as |\body| and starts iterating.
% The body has to end with an unfinished conditional: |\iterate| supplies the
% matching |\else| and |\fi|, running again while the test is true and
% stopping otherwise. |\repeat| is made equal to |\fi| so that a |\loop|
% inside skipped conditional text is still balanced.
\def\loop#1\repeat{\def\body{#1}\iterate}%
\def\iterate{%
  \body
    \let\next\iterate
  \else
    \let\next\relax
  \fi
  \next
}%
\let\repeat\fi
% There is no version data in |UnicodeData.txt|: log that it is being used with
% a hard-coded date (when the download took place). This obviously needs to be
% updated when a new download takes place!
\message{\string# UnicodeData-8.0.0.txt^^J}%
\message{\string# Downloaded 2015-12-01 09:00:00 GMT [JAW]^^J}%
% Actually loading the file requires an input stream, done directly.
% There is a blank line at the end of the data source so there is a check
% here for a |\par|: such a line is skipped rather than parsed. Every other
% line is expanded once and handed to |\parseunicodedataI|, with |\relax|
% marking the end of the line.
\def\storedpar{\par}%
\openin0=UnicodeData.txt %
\loop\unless\ifeof0 %
  \read0 to \unicodedataline
  \unless\ifx\unicodedataline\storedpar
    \expandafter\parseunicodedataI\unicodedataline\relax
  \fi
\repeat
\closein0 %
% All of the other data files have some common aspects to their format. We
% therefore begin with some shared code. First a check for a comment line:
% these can be skipped. (Currently only |MathClass.txt| is used, but this code
% is also usable with the other Unicode data files.)
% |\hash| holds a |#| character as produced by |\string|, for comparing with
% the first character of a line.
\edef\hash{\string#}%
% Generic entry point for one line (#1, terminated by |\relax|). Lines whose
% first token is the character stored in |\hash| are comments and are
% dropped; everything else goes on to |\parseunicodedataII|. The |?| makes
% sure |\firsttoken| finds something even if the line is empty.
\def\parseunicodedataI#1\relax{%
  \unless\if\hash\firsttoken#1?\relax
    \parseunicodedataII#1\relax
  \fi
}%
% The first entry in all of the files is a code point or range of code points:
% set up to find a range. The definition of |\parseunicodedataIV| will depend on
% the data being processed and may need to split the remainder of the line
% further.
%   #1 - the first field: a code point or a range written with |..|
%   #2 - the remainder of the line
% The four dots appended to #1 guarantee that the delimiters of
% |\parseunicodedataIII| are found even when #1 contains no |..| itself.
\def\parseunicodedataII#1;#2\relax{%
  \parseunicodedataIII#1....\relax{#2}%
}%
% Normalise a single code point or a range to a pair of code points.
%   #1 - the (first) code point
%   #2 - the last code point of a range; empty for a single code point
%   #3 - left-over dots from the padding, discarded
%   #4 - the remainder of the data line
% |\parseunicodedataIV| always receives two braced code points (a single code
% point is given twice) followed by the unbraced remainder and |\relax|.
\def\parseunicodedataIII#1..#2..#3\relax#4{%
  \ifx\relax#2\relax
    \parseunicodedataIV{#1}{#1}#4\relax
  \else
    \parseunicodedataIV{#1}{#2}#4\relax
  \fi
}%
% A shared routine for reading the data files: only one part of the parser
% has to be altered.
%   #1 - the name of the data file without the |.txt| extension
% While the file is read |#| is given category code 12, so that lines
% containing it can be stored and compared; it is set back to 6 afterwards.
\def\readandparse#1{%
  \catcode`\#=12 %
  \openin0=#1.txt %
% Read two lines from the source file to extract the version information: it is
% always the first two lines of the file.
  \read0 to \unicodedataline
  \message{\unicodedataline ^^J}%
  \read0 to \unicodedataline
  \message{\unicodedataline ^^J}%
% Every remaining line goes to the parser, apart from blank lines, which
% |\read| returns as |\par|.
  \loop\unless\ifeof0 %
    \read0 to \unicodedataline
    \unless\ifx\unicodedataline\storedpar
      \expandafter\parseunicodedataI\unicodedataline\relax
    \fi
  \repeat
  \catcode`\#=6 %
  \closein0 %
}%
% Close the group opened at the start of the file: only the global settings
% made while parsing remain in force.
\endgroup