Tidy up char 127
[latex2e.git] / trunk / support / load-unicode-xetex-classes.tex
blob816488ef80ba941127f6d129c0fe1876b5319380
1 % File load-unicode-xetex-classes.tex
3 % Copyright 2015 The LaTeX3 Project
5 % It may be distributed and/or modified under the conditions of
6 % the LaTeX Project Public License (LPPL), either version 1.3c of
7 % this license or (at your option) any later version. The latest
8 % version of this license is in the file
9 % http://www.latex-project.org/lppl.txt.
11 % Issues with this file should be reported at
12 % https://github.com/latex3/unicode-data
14 % This file parses EastAsianWidth.txt and LineBreak.txt, provided by the
15 % Unicode Consortium, and when used with XeTeX sets \XeTeXcharclass for
16 % the following classes of code point:
17 % - "ID" (ideographic)
18 % - "OP" (opener)
19 % - "CL" (closer)
20 % - "NS" (non-starter)
21 % - "EX" (exclamation)
22 % - "IS" (infix separator)
23 % - "CM" (combining marks)
25 % All code points of class "ID" are assigned to a \XeTeXcharclass, but for
26 % other classes this only occurs when they fall into east Asian width type
27 % "F", "H" or "W" (full-, half- and wide-width).
29 % The following mappings between Unicode and XeTeX classes occur
30 % - "ID" is class 1
31 % - "OP" is class 2
32 % - "CL", "NS", "EX", "IS" are class 3
33 % - "CM" is class 256 (ignored)
35 % This file does _not_ activate XeTeX's inter-character token mechanism
36 % (\XeTeXinterchartokenstate is not set) nor does it install any material in
37 % the inter-character token registers.
39 % Note that this file is separate from the main loader as the data structure
40 % here may need more refinement at the macro level.
42 % =============================================================================
% Everything below relies on \XeTeXcharclass, a primitive which only XeTeX
% provides. If it is missing, stop reading this file: the \expandafter closes
% the conditional before \endinput is executed. (The test assumes that
% \undefined itself has no definition.)
\ifx\undefined\XeTeXcharclass
  \expandafter\endinput
\fi
% The file may be read by IniTeX, where |{|, |}| and |#| do not necessarily
% have their usual category codes, so the ones needed are set explicitly.
% All of the work happens inside a group: only assignments made globally
% remain in force once the file has been read.
\begingroup
\catcode`\{=1 %
\catcode`\}=2 %
% Identify the file in the log. |^| has to be a superscript character for
% the |^^J| notation to work, and |^^J| is made the new-line character so that
% each \message ends its line.
\catcode`\^=7 %
\newlinechar=`\^^J %
\message{^^J}%
\message{load-unicode-xetex-classes.tex v0.6 (2015-12-09)^^J}%
\message{Reading Unicode east Asian character class data^^J}%
% Comment lines in the data files start with |#|, so a category code 12
% ("other") copy of that character is stored in \hash for comparisons.
% |#| is first made the parameter character, as needed by the definitions
% that follow; \string then turns it into the "other" token while \hash is
% being defined.
\catcode`\#=6 %
\edef\hash{\string#}%
% Expands to the first token (or brace group) of the material that ends at
% the delimiting \relax.
\def\firsttoken#1#2\relax{#1}%
% First stage of the line parser. #1 is one complete line of a data file,
% delimited by \relax. A line whose first token is |#| is a comment and is
% dropped; any other line is handed on to \parseunicodedataII. The |?| makes
% sure that \firsttoken always finds a token to pick up, even when the line
% is empty.
\def\parseunicodedataI#1\relax{%
  \unless\if\hash\firsttoken#1?\relax
    \parseunicodedataII#1\relax
  \fi
}%
% Second stage. A data line is split at the first |;| and at the first space
% after it: #1 is a single code point or a range written first..last, #2 is
% the property value and #3 (the rest of the line) is discarded. Four dots
% are appended so that \parseunicodedataIII can always split at "..", finding
% an empty second code point when the line holds no range.
\def\parseunicodedataII#1;#2 #3\relax{%
  \parseunicodedataIII#1....\relax{#2}%
}%
% \loop ... \repeat as in plain TeX, which may not have been loaded (yet).
% \loop stores its argument in \body and starts \iterate. \iterate runs \body,
% whose last (unclosed) conditional decides whether \next becomes \iterate
% again or \relax. \repeat is \let to \fi so that a loop placed in a skipped
% branch of a conditional stays balanced.
% Note that \body is a single macro: a loop used inside another loop
% overwrites it unless the inner loop is grouped or \body is saved.
\def\loop#1\repeat{\def\body{#1}\iterate}%
\def\iterate{%
  \body
    \let\next\iterate
  \else
    \let\next\relax
  \fi
  \next
}%
\let\repeat\fi
% First pass (east Asian width data): store the width value #4 of each code
% point in the macro \EAW@<code point in decimal>. #1 is the (first) code
% point in hexadecimal; #2 is the last code point of a range and is empty for
% a single code point.
% The range loop runs inside the \loop of \readandparse, and both loops keep
% their body in \body. The outer body is therefore saved and put back after
% the inner loop: otherwise \readandparse would stop reading after the first
% range in the file. A group is not an option here, as the \EAW@... macros are
% local definitions that have to survive until the second pass.
\def\parseunicodedataIII#1..#2..#3\relax#4{%
  \expandafter\def\csname EAW@\number"#1\endcsname{#4}%
  \ifx\relax#2\relax
  \else
    \let\savedbody\body
    \count0="#1 %
    \loop
      \ifnum\count0<"#2 %
        \advance\count0 by 1 %
        \expandafter\def\csname EAW@\number\count0\endcsname{#4}%
    \repeat
    \let\body\savedbody
  \fi
}%
% A shared routine for reading the data files: only \parseunicodedataIII has
% to be redefined between the two passes. #1 is the file name without the
% .txt extension.
% \storedpar is what \read delivers for an empty line; such lines are skipped.
\def\storedpar{\par}%
\def\readandparse#1{%
  \openin0=#1.txt %
% If the file could not be opened \ifeof is true straight away. Reading from
% the stream would then ask for input on the terminal, so raise an error and
% skip the parsing instead.
  \ifeof0 %
    \errmessage{Unicode data file #1.txt not found}%
  \else
% |#| is read as an "other" character so that comment lines can be compared
% with \hash. The first two lines of the file hold the version information
% and are copied to the log.
    \catcode`\#=12 %
    \read0 to \unicodedataline
    \message{\unicodedataline ^^J}%
    \read0 to \unicodedataline
    \message{\unicodedataline ^^J}%
    \loop\unless\ifeof0 %
      \read0 to \unicodedataline
      \unless\ifx\unicodedataline\storedpar
        \expandafter\parseunicodedataI\unicodedataline\relax
      \fi
    \repeat
    \catcode`\#=6 %
  \fi
  \closein0 %
}%
% Read the east Asian width data: no settings are made at this stage.
\readandparse{EastAsianWidth}%
% The XeTeX class used for each line break class that is recognised: the
% name of each constant ends in the Unicode line break class.
\chardef\XeTeXcharclassID=1 %
\chardef\XeTeXcharclassOP=2 %
\chardef\XeTeXcharclassCL=3 %
\chardef\XeTeXcharclassEX=3 %
\chardef\XeTeXcharclassIS=3 %
\chardef\XeTeXcharclassNS=3 %
\chardef\XeTeXcharclassCM=256 %
% Second pass (line break data): #1, #2 are as before and #4 is the line break
% class. Code points of class |ID| are always given their XeTeX class, using
% \parseunicodedataIV for the single code point or the whole range.
% For any other class nothing happens unless a \XeTeXcharclass<class> constant
% exists (this avoids a "Missing number" error for classes not listed above).
% If it does, every code point from #1 to #2 (or just #1) is checked, and
% those with east Asian width |F|, |H| or |W| are assigned the class. The
% \ifnum test collects a digit 1 for each matching width letter after the
% initial 0, so it is true exactly when one of the three \if tests succeeds.
% The loop is grouped as it runs inside the \loop of \readandparse; the
% \XeTeXcharclass assignments are global and so survive the group.
\def\ID{ID}%
\def\parseunicodedataIII#1..#2..#3\relax#4{%
  \def\temp{#4}%
  \ifx\temp\ID
    \ifx\relax#2\relax
      \parseunicodedataIV{#1}{#1}%
    \else
      \parseunicodedataIV{#1}{#2}%
    \fi
  \else
    \ifcsname XeTeXcharclass#4\endcsname
      \begingroup
        \count0="#1 %
        \ifx\relax#2\relax
          \count2=\count0 %
        \else
          \count2="#2 %
        \fi
        \loop
          \ifnum 0%
            \if F\csname EAW@\number\count0\endcsname 1\fi
            \if H\csname EAW@\number\count0\endcsname 1\fi
            \if W\csname EAW@\number\count0\endcsname 1\fi
            >0 %
            \global\XeTeXcharclass\count0=\csname XeTeXcharclass#4\endcsname
          \fi
          \ifnum\count0<\count2 %
            \advance\count0 by 1 %
        \repeat
      \endgroup
    \fi
  \fi
}%
% Assign the XeTeX class for |ID| to every code point from #1 to #2
% inclusive (both in hexadecimal); #1 and #2 are equal for a single code
% point. The assignment comes before the test so that the last code point,
% and hence also a single code point, is included.
% As we are inside a loop already, there needs to be a group here to preserve
% the iterator; the assignments are global and so survive the group.
\def\parseunicodedataIV#1#2{%
  \begingroup
    \count0="#1 %
    \loop
      \global\XeTeXcharclass\count0=\XeTeXcharclassID%
      \ifnum\count0<"#2 %
        \advance\count0 by 1 %
    \repeat
  \endgroup
}%
% Second pass: read the line break data, which makes the (global)
% \XeTeXcharclass assignments, then close the group opened at the start of
% the file so that all local definitions and category code changes are
% discarded.
\readandparse{LineBreak}%
\endgroup