1 % File load-unicode-xetex-classes.tex
3 % Copyright 2015 The LaTeX3 Project
5 % It may be distributed and/or modified under the conditions of
6 % the LaTeX Project Public License (LPPL), either version 1.3c of
7 % this license or (at your option) any later version. The latest
8 % version of this license is in the file
9 % http://www.latex-project.org/lppl.txt.
11 % Issues with this file should be reported at
12 % https://github.com/latex3/unicode-data
14 % This file parses EastAsianWidth.txt and LineBreak.txt, provided by the
15 % Unicode Consortium, and when used with XeTeX sets \XeTeXcharclass for
16 % the following classes of code point:
17 % - "ID" (ideographic)
18 % - "OP" (open punctuation)
19 % - "CL" (close punctuation)
20 % - "NS" (non-starter)
21 % - "EX" (exclamation)
22 % - "IS" (infix separator)
23 % - "CM" (combining marks)
25 % All code points of class "ID" are assigned to a \XeTeXcharclass, but for
26 % other classes this only occurs when they fall into east Asian width type
27 % "F", "H" or "W" (full-, half- and wide-width).
29 % The following mappings between Unicode and XeTeX classes occur:
30 % - "ID" is class 1
31 % - "OP" is class 2
32 % - "CL", "NS", "EX", "IS" are class 3
33 % - "CM" is class 256 (ignored)
35 % This file does _not_ activate XeTeX's inter-character token mechanism
36 % (\XeTeXinterchartokenstate is not set) nor does it install any material in
37 % the inter-character token registers.
39 % Note that this file is separate from the main loader as the data structure
40 % here may need more refinement at the macro level.
42 % =============================================================================
44 % The data loaded here can currently only be used by XeTeX: check for the
45 % appropriate primitive.
46 \ifx\XeTeXcharclass\undefined
49 % This file can be loaded in IniTeX mode so the category codes of |{|, |}| and
50 % |#| may not be correct. Everything is done in a group so that only the
51 % settings we want to propagate are made available generally.
55 % Write some basic information to the log.
% Two messages are issued: the file name with its version and date, then a
% notice that the east Asian character class data is being read. Each ends
% with ^^J, presumably intended as a line break in the output; that relies on
% \newlinechar, which is not set in the lines visible here -- TODO confirm.
59 \message{load-unicode-xetex-classes.tex v0.6 (
2015-
12-
09)^^J
}%
60 \message{Reading Unicode east Asian character class data^^J
}%
61 % A string version of |#| will be needed to look for comment lines in the
62 % source. Once that is done proper parsing can begin.
% \firsttoken<tokens>\relax expands to the first token (or brace group) of
% <tokens>; everything else up to the delimiting \relax is discarded. The
% caller in \parseunicodedataI appends a |?| after the line before the \relax.
66 \def\firsttoken#1#2\relax{#1}%
67 \def\parseunicodedataI#1\relax{%
68 \unless\if\hash\firsttoken#1?
\relax
69 \parseunicodedataII#1\relax
72 % Both files to be parsed here have potential ranges of code points: find the
73 % first entry and search for the second.
74 \def\parseunicodedataII#1;
#2 #3\relax{%
75 \parseunicodedataIII#1....
\relax{#2}%
77 % From plain: may not be defined (yet).
% \loop ... \repeat stores everything up to \repeat in \body and then hands
% control to \iterate. NOTE(review): \iterate is not defined in the lines
% visible here; it is expected to be provided separately.
78 \def\loop#1\repeat{\def\body{#1}\iterate}%
88 % For the East Asian width data, save the class of the current token.
89 \def\parseunicodedataIII#1..
#2..
#3\relax#4{%
90 \expandafter\def\csname EAW@
\number"
#1\endcsname{#4}%
96 \advance\count0 by
1 %
97 \expandafter\def\csname EAW@
\number\count0\endcsname{#4}%
101 % A shared routine for reading the data files: only one part of the parser
102 % (\parseunicodedataIII) changes between the two files.
% \storedpar holds a single \par token so that a line read into
% \unicodedataline can be compared against it with \ifx in \readandparse
% (presumably to skip blank lines in the data file, which \read returns as
% \par).
103 \def\storedpar{\par}%
104 \def\readandparse#1{%
106 % Read two lines from the source file to extract the version information
108 \read0 to
\unicodedataline
109 \message{\unicodedataline ^^J
}%
110 \read0 to
\unicodedataline
111 \message{\unicodedataline ^^J
}%
112 \loop\unless\ifeof0 %
113 \read0 to
\unicodedataline
114 \unless\ifx\unicodedataline\storedpar
115 \expandafter\parseunicodedataI\unicodedataline\relax
121 % Read the east Asian width data: no settings are made at this stage.
% With the first definition of \parseunicodedataIII in force, this pass only
% records the width type of each code point in a macro named
% EAW@<decimal code point>.
122 \readandparse{EastAsianWidth
}%
123 % Set up the different line break classes recognised.
% Each Unicode line break class XX handled here gets a constant
% \XeTeXcharclassXX holding the XeTeX class number to assign; the second
% definition of \parseunicodedataIII looks these up by name with
% \csname XeTeXcharclass<class>\endcsname.
124 \chardef\XeTeXcharclassID=
1 % ideographic
125 \chardef\XeTeXcharclassOP=
2 % line break class "OP"
126 \chardef\XeTeXcharclassCL=
3 % line break class "CL"
127 \chardef\XeTeXcharclassEX=
3 % exclamation: shares class 3
128 \chardef\XeTeXcharclassIS=
3 % infix separator: shares class 3
129 \chardef\XeTeXcharclassNS=
3 % non-starter: shares class 3
130 \chardef\XeTeXcharclassCM=
256 % combining marks: class 256 (ignored)
131 % Check the line break class and if necessary the east Asian width for the
132 % current code point. For code points of class |ID| there may be a range to
133 % set, and these are always recorded. In other cases check the east Asian width
134 % and set the class if appropriate.
136 \def\parseunicodedataIII#1..
#2..
#3\relax#4{%
140 \parseunicodedataIV{#1}{#1}%
142 \parseunicodedataIV{#1}{#2}%
146 \if F
\csname EAW@
\number"
#1\endcsname 1\fi
147 \if H
\csname EAW@
\number"
#1\endcsname 1\fi
148 \if W
\csname EAW@
\number"
#1\endcsname 1\fi
150 \global\XeTeXcharclass"
#1=
\csname XeTeXcharclass
#4\endcsname
154 % As we are inside a loop already, there needs to be a group here to preserve
155 % the definition of \body belonging to the outer loop.
156 \def\parseunicodedataIV#1#2{%
161 \global\XeTeXcharclass\count0=
1 %
162 \advance\count0 by
1 %
% Read the line break data. The second definition of \parseunicodedataIII
% is now in force, so this is the pass that makes the global
% \XeTeXcharclass assignments.
166 \readandparse{LineBreak
}%