trunk/support/load-unicode-xetex-classes.tex

   1 % File load-unicode-xetex-classes.tex
   2 %
   3 % Copyright 2015 The LaTeX3 Project
   4 %
   5 % It may be distributed and/or modified under the conditions of
   6 % the LaTeX Project Public License (LPPL), either version 1.3c of
   7 % this license or (at your option) any later version. The latest
   8 % version of this license is in the file
   9 % http://www.latex-project.org/lppl.txt.
  10 %
  11 % Issues with this file should be reported at
  12 % https://github.com/latex3/unicode-data
  13 %
  14 % This file parses EastAsianWidth.txt and LineBreak.txt, provided by the
  15 % Unicode Consortium, and when used with XeTeX sets \XeTeXcharclass for
  16 % the following classes of code point:
  17 % - "ID" (ideographic)
  18 % - "OP" (opener)
  19 % - "CL" (closer)
  20 % - "NS" (non-starter)
  21 % - "EX" (exclamation)
  22 % - "IS" (infix separator)
  23 % - "CM" (combining marks)
  24 %
  25 % All code points of class "ID" are assigned to a \XeTeXcharclass, but for
  26 % other classes this only occurs when they fall into East Asian width type
  27 % "F", "H" or "W" (fullwidth, halfwidth and wide).
  28 %
  29 % The following mappings between Unicode and XeTeX classes occur
  30 % - "ID" is class 1
  31 % - "OP" is class 2
  32 % - "CL", "NS", "EX", "IS" are class 3
  33 % - "CM" is class 256 (ignored)
  34 %
  35 % This file does _not_ activate XeTeX's inter-character token mechanism
  36 % (\XeTeXinterchartokenstate is not set) nor does it install any material in
  37 % the inter-character token registers.
  38 %
  39 % Note that this file is separate from the main loader as the data structure
  40 % here may need more refinement at the macro level.
  41 %
  42 % =============================================================================
  43 %
  44 % The data loaded here can currently only be used by XeTeX: check for the
  45 % appropriate primitive and stop reading this file when it is absent.
     % The test assumes that \undefined itself has no definition.
  46 \ifx\XeTeXcharclass\undefined
     % \expandafter expands the \fi first, so the conditional is already closed
     % when \endinput acts.
  47   \expandafter\endinput
  48 \fi
  49 % This file can be loaded in IniTeX mode so the category codes of |{|, |}| and
  50 % |#| may not be correct. Everything is done in a group so that only the
  51 % settings we want to propagate are made available generally.
     % The group opened here is closed by the \endgroup on the last line of the
     % file, so every local assignment made in between is discarded there.
  52 \begingroup
  53   \catcode`\{=1 %
  54   \catcode`\}=2 %
  55 % Write some basic information to the log.
     % |^| is given category code 7 so that the double-caret notation for the
     % line feed character can be used; that character is then made the
     % \newlinechar so that it starts a new line in \message output.
  56   \catcode`\^=7 %
  57   \newlinechar=`\^^J %
  58   \message{^^J}%
  59   \message{load-unicode-xetex-classes.tex v0.6 (2015-12-09)^^J}%
  60   \message{Reading Unicode east Asian character class data^^J}%
  61 % A string version of |#| will be needed to look for comment lines in the
  62 % source. Once that is done proper parsing can begin.
     % \hash is defined while |#| has category code 12 ("other"), so it expands
     % to a plain character token. The code is then set to 6 (parameter) so that
     % the macro definitions which follow can use numbered parameters.
  63   \catcode`\#=12 %
  64   \def\hash{#}%
  65   \catcode`\#=6 %
     % \firsttoken expands to the first token (or brace group) of the material
     % up to the delimiting \relax and discards everything after it.
  66   \def\firsttoken#1#2\relax{#1}%
  67   \def\parseunicodedataI#1\relax{%
     % The argument is one line of a data file. When its first token is the
     % category code 12 hash character held in \hash the line is a comment and
     % nothing is done; any other line is handed on to \parseunicodedataII.
     % The |?| makes sure \firsttoken always finds a first argument.
  68     \if\hash\firsttoken#1?\relax\else
  69       \parseunicodedataII#1\relax
  70     \fi
  71   }%
  72 % Both files to be parsed here have potential ranges of code points: find the
  73 % first entry and search for the second.
     % The first argument is the code point or range before the semicolon, the
     % second is the property value up to the first space, and the third is the
     % rest of the line, which is dropped. The appended |....| supplies the |..|
     % delimiters that \parseunicodedataIII looks for, so that its second
     % argument is empty when the line holds a single code point.
  74   \def\parseunicodedataII#1;#2 #3\relax{%
  75     \parseunicodedataIII#1....\relax{#2}%
  76   }%
  77 % From plain: may not be defined (yet).
     % \loop stores everything up to \repeat in \body and starts \iterate. The
     % body has to end inside an unclosed conditional: while that conditional is
     % true \iterate runs again, otherwise the \else branch ends the loop.
     % \body and \next are shared by every use of \loop, so a loop that runs
     % inside another loop overwrites the outer \body unless it is grouped or
     % the outer \body is saved and restored.
  78   \def\loop#1\repeat{\def\body{#1}\iterate}%
  79   \def\iterate{%
  80     \body
  81       \let\next\iterate
  82     \else
  83       \let\next\relax
  84     \fi
  85     \next
  86   }%
     % \repeat is made equivalent to \fi so that a \loop ... \repeat sitting in
     % skipped conditional text still balances.
  87   \let\repeat\fi
  88 % For the East Asian width data, save the class of the current token.
     % Arguments: #1 is the first (or only) code point in hexadecimal, #2 the
     % last code point of a range (empty for a single code point), #3 is parser
     % padding and #4 the width value. The value is stored in a local macro
     % named |EAW@| followed by the decimal code point, for every code point
     % from #1 to #2 inclusive.
  89   \def\parseunicodedataIII#1..#2..#3\relax#4{%
  90     \expandafter\def\csname EAW@\number"#1\endcsname{#4}%
  91     \ifx\relax#2\relax
  92     \else
     % This macro runs inside the \loop of \readandparse. A nested \loop
     % redefines \body, so the outer body is saved here and restored after the
     % range has been stored; otherwise the outer loop would re-run the inner
     % body and stop after the first range line. A group cannot be used instead
     % because the |EAW@| definitions are local and must survive.
           \let\savedbody\body
  93       \count0="#1 %
  94       \loop
  95         \ifnum\count0<"#2 %
  96           \advance\count0 by 1 %
  97           \expandafter\def\csname EAW@\number\count0\endcsname{#4}%
  98       \repeat
           \let\body\savedbody
  99     \fi
 100   }%
 101 % A shared routine for reading the data files: only one part of the parser
 102 % has to be altered.
     % \storedpar holds the \par token that \read returns for an empty line, so
     % that such lines can be recognised with \ifx.
 103   \def\storedpar{\par}%
     % \readandparse takes the base name of a data file, opens the file with
     % the extension .txt on input stream 0, writes its first two lines to the
     % log and hands every later non-empty line to \parseunicodedataI. The hash
     % character has category code 12 while lines are read, so that a leading
     % hash matches \hash, and is reset to 6 once the loop has finished.
     % NOTE(review): nothing checks that \openin found the file before the
     % first \read is attempted.
 104   \def\readandparse#1{%
 105     \openin0=#1.txt %
 106 % Read two lines from the source file to extract the version information
 107     \catcode`\#=12 %
 108     \read0 to \unicodedataline
 109     \message{\unicodedataline ^^J}%
 110     \read0 to \unicodedataline
 111     \message{\unicodedataline ^^J}%
     % Main loop: runs until the end of the file has been reached.
 112     \loop\unless\ifeof0 %
 113       \read0 to \unicodedataline
 114       \unless\ifx\unicodedataline\storedpar
 115         \expandafter\parseunicodedataI\unicodedataline\relax
 116       \fi
 117     \repeat
 118     \catcode`\#=6 %
 119     \closein0 %
 120   }%
 121 % Read the East Asian width data: no settings are made at this stage.
     % The width of each code point is only stored in a local |EAW@| macro,
     % which the line break pass further down consults.
 122   \readandparse{EastAsianWidth}%
 123 % Set up the different line break classes recognised.
     % Each name is |XeTeXcharclass| followed by the Unicode line break class,
     % so the XeTeX class number can be looked up with \csname from the class
     % read from the data file. CL, EX, IS and NS share one XeTeX class.
     % NOTE(review): 256 is the class the file header describes as "ignored";
     % confirm that this number is still right for the XeTeX release in use.
 124   \chardef\XeTeXcharclassID=1 %
 125   \chardef\XeTeXcharclassOP=2 %
 126   \chardef\XeTeXcharclassCL=3 %
 127   \chardef\XeTeXcharclassEX=3 %
 128   \chardef\XeTeXcharclassIS=3 %
 129   \chardef\XeTeXcharclassNS=3 %
 130   \chardef\XeTeXcharclassCM=256 %
 131 % Check the line break class and if necessary the East Asian width for the
 132 % current code point. For code points of class |ID| there may be a range to
 133 % set, and these are always recorded. In other cases check the East Asian width
 134 % and set the class if appropriate.
     % Arguments: #1 is the first (or only) code point in hexadecimal, #2 the
     % last code point of a range (empty for a single code point), #3 is parser
     % padding and #4 the line break class. For classes other than |ID| only
     % the code point #1 is examined, even when the line gives a range.
 135   \def\ID{ID}%
 136   \def\parseunicodedataIII#1..#2..#3\relax#4{%
 137     \def\temp{#4}%
 138     \ifx\temp\ID
 139       \ifx\relax#2\relax
 140         \parseunicodedataIV{#1}{#1}%
 141       \else
 142         \parseunicodedataIV{#1}{#2}%
 143       \fi
 144     \else
     % The number tested below is 0 followed by a 1 when the stored width of
     % the code point is F, H or W, so it is positive exactly for those widths.
 145       \ifnum 0%
 146         \if F\csname EAW@\number"#1\endcsname 1\fi
 147         \if H\csname EAW@\number"#1\endcsname 1\fi
 148         \if W\csname EAW@\number"#1\endcsname 1\fi
 149          >0 %
     % Only classes with a \chardef above are mapped. For any other line break
     % class \csname yields \relax, and using that as the class number would
     % raise a "Missing number" error, so such code points are left alone.
             \expandafter\ifx\csname XeTeXcharclass#4\endcsname\relax
             \else
 150           \global\XeTeXcharclass"#1=\csname XeTeXcharclass#4\endcsname
             \fi
 151       \fi
 152     \fi
 153   }%
 154 % As we are inside a loop already, there needs to be a group here to preserve
 155 % the iterator.
     % Assign XeTeX class 1 to every code point from #1 to #2 inclusive (both
     % hexadecimal). The assignment is made before the test so that the last
     % code point of a range, and a single code point passed as both
     % arguments, are set as well. The assignments are \global so that they
     % survive the group; the change to \count0 stays local to it.
 156   \def\parseunicodedataIV#1#2{%
 157     \begingroup
 158       \count0="#1 %
 159       \loop
 160         \global\XeTeXcharclass\count0=1 %
 161         \ifnum\count0<"#2 %
 162           \advance\count0 by 1 %
 163       \repeat
 164     \endgroup
 165   }%
 166   \readandparse{LineBreak}%
     % The line break pass above is the one that sets \XeTeXcharclass. Closing
     % the group opened near the start of the file discards every local macro
     % and category code change; only the \global class assignments remain.
 167 \endgroup