trunk/support/load-unicode-data.tex

   1 % File load-unicode-data.tex
   2 %
   3 % Copyright 2015,2016 The LaTeX3 Project
   4 %
   5 % It may be distributed and/or modified under the conditions of
   6 % the LaTeX Project Public License (LPPL), either version 1.3c of
   7 % this license or (at your option) any later version. The latest
   8 % version of this license is in the file
   9 % http://www.latex-project.org/lppl.txt.
  10 %
  11 % Issues with this file should be reported at
  12 % https://github.com/latex3/unicode-data
  13 %
  14 % This file parses a number of data files provided by the Unicode Consortium
  15 % and, when used with a Unicode-capable engine, sets up a range of TeX-related
  16 % parameters based on the extracted information.
  17 %
  18 % From the file UnicodeData.txt the following properties are set:
  19 % - \catcode 11 for all letters (Unicode class "L")
  20 % - \catcode 11 for all combining marks (Unicode class "M")
  21 % - \sfcode 999 for all code points of class "Lu" (upper case letters)
  22 % - \lccode for all of class "Ll" (lower case letters) to the code point
  23 %   itself, and \uccode to the upper case mapping (or if not given
  24 %   to the code point itself)
  25 % - \uccode for all of class "Lu" (upper case letters) to the code point
  26 %   itself, and \lccode to the lower case mapping (or if not given
  27 %   to the code point itself)
  28 % - \lccode and \uccode for all of class "Lt" (title case letters) to the
  29 %   lower and upper case mappings (or if not given to the code point itself)
  30 % - \lccode and \uccode for all other letter code points are set to
  31 %   the code point itself
  32 % - \lccode and/or \uccode for non-letter code points for which an upper
  33 %   or lower case mapping is given
  34 % - \sfcode 0 (ignored) for code points of Unicode classes "Pe" (closing
  35 %   punctuation marks) and "Pf" (final quotation marks)
  36 % - \Umathcode for all letters in the base plane class 7 (var) and for
  37 %   the supplementary plane class 0 (regular)
  38 %
  39 % =============================================================================
  40 %
  41 % Unicode engines only: at present that means XeTeX and LuaTeX, and both of
  42 % them provide \Umathcode. \expandafter closes the \ifx before \endinput acts.
  43 \ifx\undefined\Umathcode
  44   \expandafter\endinput
  45 \fi
  46 % The code below relies on e-TeX primitives such as \unless: check for them.
  47 \ifx\undefined\eTeXversion
  48   \expandafter\endinput
  49 \fi
  50 % This file can be loaded in IniTeX mode so the category codes of |{|, |}| and
  51 % |#| may not be correct. Everything is done in a group so that only the
  52 % settings we want to propagate are made available generally.
  53 \begingroup
  54   \catcode"7B=1 % left brace: begin-group
  55   \catcode"7D=2 % right brace: end-group
  56   \catcode"23=6 % hash: macro parameter
  57 % Write some basic information to the log, using ^^J to break lines.
  58   \catcode"5E=7 % circumflex: superscript, needed for the ^^J notation
  59   \newlinechar="0A %
  60   \message{^^J}%
  61   \message{load-unicode-data.tex v1.2 (2016-02-02)^^J}%
  62   \message{Reading Unicode data^^J}%
  63 % The first stage of parsing is dealing with the fact that there are lots of
  64 % data items separated by |;|. Of those, only a few are needed so they are
  65 % picked out and everything else is dropped. There is one complication: there
  66 % are a few cases in the data file of ranges which are marked by the descriptor
  67 % |First| and a matching |Last|. A separate routine is used to handle these
  68 % cases.
     % |\parseunicodedataI| grabs the first nine |;|-terminated fields of a line;
     % the rest of the line, up to the |\relax| added by the caller, stays in the
     % input. It passes on #1 (code point) and #3 (Unicode class), followed by #2
     % (the field that may carry the range marker) with | First>| appended so that
     % the next step can tell whether the marker was already present.
  69   \def\parseunicodedataI#1;#2;#3;#4;#5;#6;#7;#8;#9;{%
  70     \parseunicodedataII#1;#3;#2 First>\relax
  71   }%
  72   \def\parseunicodedataII#1;#2;#3 First>#4\relax{%
     % #1 = code point, #2 = Unicode class. #4 is empty for a normal line (only
     % the | First>| appended by the caller was found) and non-empty when the
     % field in #3 itself ended in | First>|, i.e. for the start of a range.
     % The chosen macro is expanded after the conditional has been closed, so it
     % sees |#1;#2;| followed by the unread remainder of the data line.
  73     \ifx\relax#4\relax
  74       \expandafter\parseunicodedataIII
  75     \else
  76       \expandafter\parseunicodedataVII
  77     \fi
  78     #1;#2;%
  79   }%
  80   \def\parseunicodedataIII#1;#2;#3;#4;#5;#6;#7;#8\relax{%
     % Normal line: #1 = code point, #2 = Unicode class, #6 and #7 = upper and
     % lower case mappings; #3--#5 and the tail #8 are not needed and are dropped.
  81     \parseunicodedataIV{#1}{#2}{#6}{#7}%
  82   }%
  83 % At this stage we have a `normal' data line with four pieces of information:
  84 % the code point, the Unicode class and the (possibly empty) upper and lower
  85 % case mappings. A few utility macros are defined, then we branch based on the
  86 % Unicode class. Notice that for all letter-like code points we first set the
  87 % |\lccode| and |\uccode| values to the code point itself then test for the
  88 % classes where a different setting might be appropriate. For non-letters
  89 % there is a check to see if any mappings are available, and also for trailing
  90 % punctuation to set the appropriate |\sfcode|.
  91   \def\firsttoken#1#2\relax{#1}% first token (or group) of text ended by \relax
  92   \def\Lu{Lu}% upper case letters
  93   \def\Ll{Ll}% lower case letters
  94   \def\Lt{Lt}% title case letters
  95   \def\Pf{Pf}% final quotation marks
  96   \def\Pe{Pe}% closing punctuation marks
  97   \def\parseunicodedataIV#1#2#3#4{%
     % #1 = code point (hexadecimal digits without the |"|), #2 = Unicode class,
     % #3 = upper case mapping, #4 = lower case mapping (either may be empty).
     % The numerical test appends a |1| for each class test that succeeds (|L|
     % letter, |M| mark); the |?| keeps |\firsttoken| safe for an empty class.
  98     \ifnum 0%
  99       \if L\firsttoken#2?\relax 1\fi
 100       \if M\firsttoken#2?\relax 1\fi
 101       >0 %
 102       \parseunicodedataV{"#1}%
 103       \def\temp{#2}%
 104       \ifx\Ll\temp
 105         \parseunicodedataVI\uccode{#1}{#3}%
 106       \fi
 107       \ifx\Lt\temp
 108         \parseunicodedataVI\uccode{#1}{#3}%
 109         \parseunicodedataVI\lccode{#1}{#4}%
 110       \fi
 111       \ifx\Lu\temp
 112         \parseunicodedataVI\lccode{#1}{#4}%
 113         \global\sfcode"#1=999 %
 114       \fi
 115 % Letters in base plane are class~$7$, those in the supplementary plane are
 116 % class~$0$. The code point is hexadecimal so the bound has to be as well: the
     % base plane ends at |"FFFF|, and a decimal 10000 would only be |"2710|.
 117       \ifnum"#1<"10000 %
 118         \global\Umathcode"#1="7"01"#1 %
 119       \else
 120         \global\Umathcode"#1="0"01"#1 %
 121       \fi
 122     \else
     % Not letter-like: set the space factor for closing punctuation and final
     % quotes, and record whichever case mappings the data line supplies.
 123       \def\temp{#2}%
 124       \ifnum 0\ifx\temp\Pe 1\fi\ifx\temp\Pf 1\fi>0 %
 125         \global\sfcode"#1=0 %
 126       \fi
 127       \ifx\relax#3\relax
 128       \else
 129         \global\uccode"#1="#3 %
 130       \fi
 131       \ifx\relax#4\relax
 132       \else
 133         \global\lccode"#1="#4 %
 134       \fi
 135     \fi
 136   }%
 137 % Common settings for every letter-like code point #1, which is a <number>
 138 % (|"|-prefixed hexadecimal or a count register): it is made a letter and maps
 139 % to itself for both cases. Cased letters have these defaults overridden later.
 140   \def\parseunicodedataV#1{%
 141     \global\uccode#1=#1 %
 142     \global\lccode#1=#1 %
 143     \global\catcode#1=11 %
 144   }%
 145 % Set a case mapping only when the data supply one: some cased letters have
 146 % no mapping, in which case #3 is empty and the existing value is kept.
 147   \def\parseunicodedataVI#1#2#3{% #1 = \lccode or \uccode, #2 = code point
 148     \if\relax\detokenize{#3}\relax
 149     \else
 150       \global#1"#2="#3 %
 151     \fi
 152   }%
 153 % For lines that were the |First>| of a range, read the data source again to
 154 % get the matching |Last| line. Lines for letters then trigger a loop over the
 155 % entire range. These are always non-cased letters.
 156   \def\parseunicodedataVII#1;#2;#3\relax{%
     % #1 = first code point, #2 = Unicode class; #3, the rest of the |First>|
     % line, is dropped. The freshly read line is expanded before the next step
     % so that its leading field can be picked off as the end of the range.
 157     \read0 to \unicodedataline
 158     \expandafter\parseunicodedataXII\unicodedataline\relax#1;#2\relax
 159   }%
 160   \def\parseunicodedataXII#1;#2\relax#3;#4\relax{%
     % #1 = last code point (first field of the line just read), #2 = rest of
     % that line (unused), #3 = first code point, #4 = Unicode class.
     % Only classes starting with |L| are processed. The group keeps |\count0|
     % and the |\body| set by the nested |\loop| local; the settings made by
     % |\parseunicodedataV| are global and so survive it.
 161     \if L\firsttoken#4?\relax
 162       \begingroup
 163         \count0="#3 %
 164         \loop
 165           \unless\ifnum\count0>"#1 %
 166             \parseunicodedataV{\count0 }%
 167             \advance\count0 by 1 %
 168         \repeat
 169       \endgroup
 170     \fi
 171   }%
 172 % From plain: may not be defined (yet).
     % |\loop| stores everything up to |\repeat| in |\body|; that text has to end
     % inside an open conditional, which |\iterate| completes with the |\else| and
     % |\fi| below, running |\body| again for as long as the test is true.
     % |\repeat| is made equal to |\fi| so that a |\loop| ... |\repeat| which sits
     % inside skipped conditional text still balances.
 173   \def\loop#1\repeat{\def\body{#1}\iterate}%
 174   \def\iterate{%
 175     \body
 176       \let\next\iterate
 177     \else
 178       \let\next\relax
 179     \fi
 180     \next
 181   }%
 182   \let\repeat\fi
 183 % |UnicodeData.txt| carries no version line of its own, so the log entry uses
 184 % a hard-coded version and download date. Both have to be edited by hand
 185 % whenever a newer copy of the file is downloaded!
 186   \message{\string# UnicodeData-8.0.0.txt^^J}%
 187   \message{\string# Downloaded 2015-12-01 09:00:00 GMT [JAW]^^J}%
 188 % The file is read line by line from input stream 0. A blank line, such as
 189 % the one at the end of the data source, is read as |\par|: those lines are
 190 % recognised by comparison with |\storedpar| and skipped.
 191   \def\storedpar{\par}%
 192   \openin0=UnicodeData.txt %
 193   \loop\unless\ifeof0 %
 194     \read0 to \unicodedataline
 195     \ifx\unicodedataline\storedpar\else
 196       \expandafter\parseunicodedataI\unicodedataline\relax
 197     \fi
 198   \repeat
 199   \closein0 %
 200 % All of the other data files have some common aspects to their format. We
 201 % therefore begin with some shared code. First a check for a comment line:
 202 % these can be skipped. (Currently only |MathClass.txt| is used, but this code
 203 % is also usable with the other Unicode data files.)
 204   \edef\hash{\string#}%
 205   \def\parseunicodedataI#1\relax{% #1 = one complete line of the data file
 206     \if\hash\firsttoken#1?\relax\else
 207       \parseunicodedataII#1\relax
 208     \fi
 209   }%
 210 % The first entry in all of the files is a code point or range of code points:
 211 % set up to find a range. The definition of |\parseunicodedataIV| will depend on
 212 % the data being processed and may need to split the remainder of the line
 213 % further.
 214   \def\parseunicodedataII#1;#2\relax{%
     % #1 = code point or |first..last| range, #2 = remainder of the line. The
     % extra |....| ensures the delimiters of the next step are always present,
     % even when #1 is a single code point.
 215     \parseunicodedataIII#1....\relax{#2}%
 216   }%
 217   \def\parseunicodedataIII#1..#2..#3\relax#4{%
     % #1 = first (or only) code point, #2 = last code point, empty for a single
     % code point, #3 = whatever is left of the |....| padding (unused),
     % #4 = remainder of the data line. |\parseunicodedataIV| always receives a
     % start and an end code point, followed by that remainder and |\relax|.
 218     \ifx\relax#2\relax
 219       \parseunicodedataIV{#1}{#1}#4\relax
 220     \else
 221       \parseunicodedataIV{#1}{#2}#4\relax
 222     \fi
 223   }%
 224 % Shared reader for the data file |#1.txt|: only the file-specific part of
 225 % the parser has to be altered before this is called.
 226   \def\readandparse#1{%
 227     \openin0=#1.txt %
 228     \catcode`\#=12 %
 229 % The first two lines of the file always hold the version information: copy
 230 % them to the log before the data lines are parsed.
 231     \read0 to \unicodedataline
 232     \message{\unicodedataline ^^J}%
 233     \read0 to \unicodedataline
 234     \message{\unicodedataline ^^J}%
 235     \loop\unless\ifeof0 %
 236       \read0 to \unicodedataline
 237       \ifx\unicodedataline\storedpar\else
 238         \expandafter\parseunicodedataI\unicodedataline\relax
 239       \fi
 240     \repeat
 241     \closein0 %
 242     \catcode`\#=6 %
 243   }%
 244 \endgroup