1 % \iffalse meta-comment
4 % The LaTeX3 Project and any individual authors listed elsewhere
7 % This file is part of the LaTeX base system.
8 % -------------------------------------------
10 % It may be distributed and/or modified under the
11 % conditions of the LaTeX Project Public License, either version 1.3c
12 % of this license or (at your option) any later version.
13 % The latest version of this license is in
14 % http://www.latex-project.org/lppl.txt
15 % and version 1.3c or later is part of all distributions of LaTeX
16 % version 2005/12/01 or later.
18 % This file has the LPPL maintenance status "maintained".
20 % The list of all files belonging to the LaTeX base distribution is
21 % given in the file `manifest.txt'. See also `legal.txt' for additional
24 % The list of derived (unpacked) files belonging to the distribution
25 % and covered by LPPL is defined by the unpacking scripts (with
26 % extension .ins) which are part of the distribution.
28 % -----------------------------------------------------------------------------
30 % The same approach as used in \pkg{DocStrip}: if \cs{documentclass}
31 % is undefined then skip the driver, allowing the file to be used directly.
32 % This works as the \cs{fi} is only seen if \LaTeX{} is not in use. The odd
33 % \cs{jobname} business allows the extraction to work with \LaTeX{} provided
34 % an appropriate \texttt{.ins} file is set up.
% NOTE(review): DocStrip-style driver guard; some surrounding lines are not
% visible here, so the exact conditional structure should be verified
% against the full source.
37 \let\documentclass\undefined
% \csname makes an undefined \documentclass equal to \relax, so the \ifx
% test below is true exactly when LaTeX is NOT in use and the typesetting
% driver should be skipped.
39 \begingroup\expandafter\expandafter\expandafter\endgroup
40 \expandafter\ifx\csname documentclass\endcsname\relax
% Documentation driver: typeset this file itself with the ltxdoc class.
46 \ProvidesFile{ltunicode.dtx}
47 [2015/08/10 v1.0l LaTeX Kernel (Unicode data)]
48 \documentclass{ltxdoc}
50 \DocInput{\jobname.dtx}
58 % \GetFileInfo{ltunicode.dtx}
59 % \title{The \texttt{ltunicode.dtx} file\thanks
60 % {This file has version number \fileversion, dated \filedate.}\\
61 % for use with \LaTeXe}
62 % \author{The \LaTeX3 Project}
64 % \MaintainedByLaTeXTeam{latex}
67 % This script extracts data from the Unicode Consortium files
68 % |UnicodeData.txt|, |EastAsianWidth.txt| and |LineBreak.txt| to be used for
69 % setting up \LaTeXe{} (or plain \TeX{}) with sane default settings when using
70 % the Xe\TeX{} and Lua\TeX{} engines. Details of the process are included in
73 % To create the extracted file, run this file in a location containing
74 % the three input data files using \texttt{pdftex}. (The code requires
75 % \cs{pdfmdfivesum} and the e-\TeX{} extensions: it could be adapted for
84 % \section{General set up}
86 % The script is designed to work with plain \TeX{} and so |@| is made into
87 % a `letter' using the primitive approach.
92 % \begin{macro}{\gobble}
93 % \begin{macro}{\gobblethree}
94 % \begin{macro}{\firsttoken}
% Argument grabbers used throughout the script: \gobblethree drops three
% arguments entirely.
98 \long\def\gobblethree#1#2#3{}
% Expandably return the first token of the material before \relax: #1 grabs
% exactly one token, #2 swallows the rest.
99 \long\def\firsttoken#1#2\relax{#1}
105 % \begin{macro}{\storedpar}
106 % A simple piece of test setup: the final line of the read file will be
107 % tokenized by \TeX{} as \cs{par} which can be tested by \cs{ifx} provided
108 % we have an equivalent available.
114 % \begin{macro}{\return}
115 % A stored |^^M| for string comparisons.
124 % \begin{macro}{\sourceforhex}
125 % \begin{macro}{\sethex}
126 % \begin{macro}{\dohex}
127 % \begin{macro}{\hexdigit}
128 % Some parts of the code here will need to be able to convert integers
129 % to their hexadecimal equivalent. That is easiest to do for the requirements
130 % here using a modified version of some code from Appendix~D of \emph{The
% Scratch counter holding the integer still to be converted to hexadecimal.
133 \newcount\sourceforhex
% NOTE(review): the lines below are the interiors of \sethex/\dohex/\hexdigit
% (their \def lines are not shown in this view); they implement the
% divide-by-16 conversion from Appendix D of The TeXbook.
136 \sourceforhex=#2\relax
137 \ifnum\sourceforhex=0 %
% Remember the current value, then divide by 16 to peel off one digit.
145 \count0=\sourceforhex
146 \divide\sourceforhex by 16 %
147 \ifnum\sourceforhex>0 %
% \count2 becomes -(16 * quotient); adding it to \count0 leaves the
% remainder, i.e. the current hex digit, in \count0.
150 \count2=\sourceforhex
151 \multiply\count2 by -16 %
152 \advance\count0 by\count2
% Close the group while preserving the accumulated digit string in #1.
154 \expandafter\endgroup
155 \expandafter\def\expandafter#1\expandafter{#1}%
% Digits 0-9 are appended directly; values 10-15 map to A-F via \ifcase
% after subtracting 10.
159 \edef#1{#1\number\count0}%
161 \advance\count0 by -10 %
162 \edef#1{#1\ifcase\count0 A\or B\or C\or D\or E\or F\fi}%
171 % \begin{macro}{\unicoderead, \unicodewrite}
172 % Set up the streams for data.
% Output stream used for everything written to unicode-letters.def.
175 \newwrite\unicodewrite
179 % \section{Verbatim copying}
181 % \begin{macro}{\verbatimcopy}
182 % \begin{macro}{\endverbatimcopy}
183 % \begin{macro}{\verbatimcopy@auxi}
184 % \begin{macro}{\verbatimcopy@auxii}
185 % \begin{macro}{\verbatim@endmarker}
186 % Set up to read some material verbatim and write it to the output stream.
187 % There needs to be a dedicated `clean up first line' macro, but other than
188 % that life is simple enough.
% NOTE(review): defined inside a group with special catcodes (the ^^M in
% the parameter texts is the line end); trailing % signs guard those line
% ends. \verbatimcopy@auxi strips the first (partial) line, then hands
% over to the line-by-line copier.
204 \gdef\verbatimcopy@auxi#1^^M{%
205 \expandafter\verbatimcopy@auxii\gobble#1^^M%
% Copy one line at a time: stop at the end marker, skip writing empty
% lines, otherwise write the line to the output stream and recurse.
% (\temp is set from #1 in lines not shown here.)
207 \gdef\verbatimcopy@auxii#1^^M{%
209 \ifx\temp\verbatim@endmarker%
210 \expandafter\endgroup%
212 \ifx\temp\empty\else%
213 \immediate\write\unicodewrite{#1}%
215 \expandafter\verbatimcopy@auxii%
% Build the literal end-marker text "\endverbatimcopy": \string\\ plus
% \gobble yields a catcode-12 backslash to prefix the name.
219 \edef\verbatim@endmarker{\expandafter\gobble\string\\}
220 \edef\verbatim@endmarker{\verbatim@endmarker endverbatimcopy}
228 % \section{File header section}
230 % \changes{v1.0d}{2015/03/26}{Renamed data file to
231 % \texttt{unicode-letters.def}}
232 % With the mechanisms set up, open the data file for writing.
% Open the generated data file for (immediate) writing; the trailing
% space ends the file name scan.
234 \immediate\openout\unicodewrite=unicode-letters.def %
236 % There are various lines that now need to go at the start of the file.
237 % First, there is some header information. Parts of it are auto-generated,
238 % so there is some interspersing of verbatim and non-verbatim parts.
241 %% This is the file `unicode-letters.def',
242 %% generated using the script ltunicode.dtx.
244 %% The data here are derived from the files
248 % \changes{v1.0b}{2015/03/25}{Include Unicode version data in generated
250 % \changes{v1.0c}{2015/03/25}{Include MD5 sums for sources in generated
252 % \changes{v1.0f}{2015/03/26}{Include dates for sources in generated
254 % \begin{macro}{\parseunicodedata}
255 % \begin{macro}{\parseunicodedata@auxi}
256 % \begin{macro}{\parseunicodedata@auxii}
257 % \begin{macro}{\mdfiveinfo}
258 % To ensure that there is a full audit trail for the data, we record
259 % both the reported file version (if available) and the checksum for each
260 % of the source files. This is done by reading the first line of each file
261 % and parsing for the version string and if found reading the second line
262 % for a date/time, and then `catching' the entire files inside a macro to
263 % work out the checksums.
% Record provenance for data file #1: check the file exists, read its first
% line(s) for a version string and date, and log an MD5 checksum of the
% whole file to the output.
265 \def\parseunicodedata#1{%
266 \openin\unicoderead=#1.txt %
268 \errmessage{Data file missing: #1.txt}%
270 \immediate\write\unicodewrite{%
% \string\% plus \gobble writes a literal catcode-12 "%" character.
271 \expandafter\gobble\string\%\expandafter\gobble\string\%
274 \readline\unicoderead to \unicodedataline
% Append detokenized "-.txt" so the delimited parse below always has its
% delimiters available even when the line carries no version information.
275 \edef\unicodedataline{\unicodedataline\detokenize{-.txt}}%
276 \expandafter\parseunicodedata@auxi\unicodedataline\relax{#1}%
% NOTE(review): the brace at the end of the next line looks unbalanced in
% this view; interleaved lines are elided, verify against the full source.
283 \def\parseunicodedata@auxi#1-#2.TXT#3\relax#4}%
% The second line of the data file is expected to hold the date/time.
287 \readline\unicoderead to \unicodedataline
288 \expandafter\parseunicodedata@auxii\unicodedataline\relax
% Catch the entire file inside a macro (\everyeof suppresses trouble at
% end-of-file) so \pdfmdfivesum can checksum its contents.
292 \everyeof{\noexpand}%
294 \edef\mdfiveinfo{\input#4.txt\space}%
295 \expandafter\endgroup
296 \expandafter\def\expandafter\mdfiveinfo\expandafter{\mdfiveinfo}%
297 \immediate\write\unicodewrite{%
298 \expandafter\gobble\string\%\expandafter\gobble\string\%
302 Version #2 dated \temp^^J%
303 \expandafter\gobble\string\%\expandafter\gobble\string\%
306 MD5 sum \pdfmdfivesum\expandafter{\mdfiveinfo}%
% Presumably splits a "...: <date>, <time> <tz> ..." style second line;
% confirm against the headers of the actual data files.
309 \def\parseunicodedata@auxii#1: #2, #3 #4\relax{%
% Audit information for each of the three Unicode Consortium source files.
312 \parseunicodedata{UnicodeData}
313 \parseunicodedata{EastAsianWidth}
314 \parseunicodedata{LineBreak}
323 %% which are maintained by the Unicode Consortium.
328 % Automatically include the current date.
% Write the generation date as a "%%" comment line, zero-padding
% single-digit month and day numbers.
330 \immediate\write\unicodewrite{%
331 \expandafter\gobble\string\%\expandafter\gobble\string\%
332 Generated on \the\year
333 -\ifnum\month>9 \else 0\fi \the\month
334 -\ifnum\day>9 \else 0\fi \the\day.
338 % Back to simple text copying
342 %% Copyright 2014-2015
343 %% The LaTeX3 Project and any individual authors listed elsewhere
346 %% This file is part of the LaTeX base system.
347 %% -------------------------------------------
349 %% It may be distributed and/or modified under the
350 %% conditions of the LaTeX Project Public License, either version 1.3c
351 %% of this license or (at your option) any later version.
352 %% The latest version of this license is in
353 %% http://www.latex-project.org/lppl.txt
354 %% and version 1.3c or later is part of all distributions of LaTeX
355 %% version 2005/12/01 or later.
357 %% This file has the LPPL maintenance status "maintained".
359 %% The list of all files belonging to the LaTeX base distribution is
360 %% given in the file `manifest.txt'. See also `legal.txt' for additional
365 % \section{Unicode character data}
367 % \changes{v1.0e}{2015/03/26}{Correctly parse ranges in
368 % \texttt{UnicodeData.txt}}
369 % \begin{macro}{\parseunicodedata}
370 % \begin{macro}{\parseunicodedata@auxi}
371 % \begin{macro}{\parseunicodedata@auxii}
372 % \begin{macro}{\parseunicodedata@auxiii}
373 % \begin{macro}{\parseunicodedata@auxiv}
374 % \begin{macro}{\parseunicodedata@auxv}
375 % \begin{macro}{\parseunicodedata@auxvi}
376 % The first step of parsing a line of data is to check that it's not come
377 % from a blank in the source, which will have been tokenized as \cs{par}.
378 % Assuming that is not the case, there are lots of data items separated by
379 % |;|. Of those, only a few are needed so they are picked out and everything
380 % else is dropped. There is one complication: there are a few cases in the
381 % data file of ranges which are marked by the descriptor |First| and a
382 % matching |Last|. A separate routine is used to handle these cases.
% Redefined parser for UnicodeData.txt (semicolon-separated fields).
% Blank source lines arrive tokenized as \par and are filtered out in
% lines not shown in this view.
384 \def\parseunicodedata#1{%
387 \expandafter\parseunicodedata@auxi#1\relax
% Grab the first nine ;-separated fields and re-order to
% code;category;name so the " First>" range test below can see the name.
390 \def\parseunicodedata@auxi#1;#2;#3;#4;#5;#6;#7;#8;#9;{%
391 \parseunicodedata@auxii#1;#3;#2 First>\relax
% If " First>" occurs in the name this line opens a range: branch to the
% range handler (auxv); otherwise continue with the single code point.
393 \def\parseunicodedata@auxii#1;#2;#3 First>#4\relax{%
395 \expandafter\parseunicodedata@auxiii
397 \expandafter\parseunicodedata@auxv
% Keep only the code point, general category and the two case mappings;
% all other fields are dropped.
401 \def\parseunicodedata@auxiii#1;#2;#3;#4;#5;#6;#7;#8\relax{%
402 \parseunicodedata@auxiv{#1}{#2}{#6}{#7}%
405 % At this stage we have only four pieces of data
407 % \item The code value
408 % \item The general class
409 % \item The uppercase mapping
410 % \item The lowercase mapping
412 % where one or both of the last two may be empty. Everything here could
413 % be done in a single conditional within a \cs{write}, but that would be
414 % tricky to follow. Instead, a series of defined auxiliaries are used to
415 % show the flow. Notice that combining marks are treated as letters here
416 % (the second `letter' test).
% Dispatch on the general category (#2): categories starting L (letters)
% and M (combining marks, the second `letter' test) are written as
% letters, everything else as a potentially cased non-letter.
418 \def\parseunicodedata@auxiv#1#2#3{%
419 \if L\firsttoken#2?\relax
420 \expandafter\unicodeletter
422 \if M\firsttoken#2?\relax
423 \expandafter\expandafter\expandafter\unicodeletter
425 \expandafter\expandafter\expandafter\unicodenonletter
431 % \changes{v1.0i}{2015/06/20}{Include first code point in a range in output}
432 % In the case where the first code point for a range was found, we
433 % assume the next line is the last code point (it always is). It's then
434 % a question of checking if the range is a set of letters or not, and if
435 % so going though them all and adding to the data file.
% Range handling: #1 is the first code point of a "First"/"Last" pair; the
% next line of the file is assumed to hold the matching last code point.
437 \def\parseunicodedata@auxv#1;#2;#3\relax{%
438 \read\unicoderead to \unicodedataline
439 \expandafter\parseunicodedata@auxvi\unicodedataline\relax#1;#2\relax
% Here #1 is the last code point and #3;#4 the first code point with its
% general category.
441 \def\parseunicodedata@auxvi#1;#2\relax#3;#4\relax{%
442 \if L\firsttoken#4?\relax
% Loop (driven by lines not shown here) while \count@ has not passed "#1,
% writing each code point as a caseless letter (upper = lower = itself).
446 \unless\ifnum\count@>"#1 %
447 \sethex\temp{\count@}%
448 \unicodeletter\temp\temp\temp
463 % \changes{v1.0g}{2015/03/26}{Add missing \cs{global} in definition of \cs{C}}
464 % \changes{v1.0j}{2015/08/05}{Compress data for caseless letters}
465 % \changes{v1.0j}{2015/08/05}{Save some space by dropping end-of-line
467 % \begin{macro}{\unicodeletter, \unicodenonletter}
468 % \begin{macro}{\writeunicodedatafull}
469 % \begin{macro}{\writeunicodedatacompact}
470 % For `letters', we always want to write the data to file, and the only
471 % question here is if the character has case mappings or these point back
472 % to the character itself. If there are no mappings or the mappings are
473 % all equivalent to the same code point then use a shorter version of
% Write one letter: #1 code point, #2 uppercase, #3 lowercase mapping.
% If both mappings are absent or map back to #1 itself, the compact
% caseless form \l is used, otherwise the full \L entry.
476 \def\unicodeletter#1#2#3{%
478 \writeunicodedatacompact\l{#1}%
% NOTE(review): an empty #2/#3 makes \ifx\relax...\relax true, in which
% case the code point itself stands in for the mapping; interleaved lines
% are elided here, so verify the branch structure against the full source.
481 \ifnum"#1="\ifx\relax#2\relax#1 \else#2 \fi\else1\fi
482 \ifnum"#1="\ifx\relax#3\relax#1 \else#3 \fi\else1\fi
484 \writeunicodedatafull\L{#1}{#2}{#3}%
486 \writeunicodedatacompact\l{#1}%
491 % Cased non-letters can also exist: they can be detected as they have at
492 % least one case mapping. Write these in much the same way as letters,
493 % but always with a full mapping (must be the case to require the entry
% Cased non-letters (those with at least one case mapping) are written
% with the full \C form; the detection tests live in lines not shown here.
496 \def\unicodenonletter#1#2#3{%
499 \writeunicodedatafull\C{#1}{#2}{#3}%
503 % Actually write the data. In all cases both upper- and lower-case mappings
504 % are given, so there is a need to test that both were actually available and
505 % if not set up to do nothing. Cases where both mappings are no-ops will
506 % already have been filtered out and are written in a shorter form: this
507 % saves a significant amount of space in the file.
% Write a full entry: #1 is the runtime macro (\L or \C), #2 the code
% point, #3/#4 the upper/lower mappings (defaults handled in elided lines).
509 \def\writeunicodedatafull#1#2#3#4{%
510 \immediate\write\unicodewrite{%
% Compact entry for caseless letters: just the macro (\l) and code point,
% saving significant space in the generated file.
526 \def\writeunicodedatacompact#1#2{%
527 \immediate\write\unicodewrite{%
538 % There is now a lead-in section which creates the macros which take the
539 % processed data and do the code assignments. Everything is done within a
540 % group so that there is no need to worry about names.
546 % Cased non-letters simply need to have the case mappings set.
547 % For letters, there are a few things to sort out. First, the case mappings are
548 % defined as for non-letters. Category code is then set to $11$ before a check
549 % to see if this is an upper case letter. If it is then the \cs{sfcode} is set
550 % to $999$. Finally there is a need to deal with Unicode math codes, where base
551 % plane letters are class $7$ but supplementary plane letters are class~$1$.
552 % Older versions of Xe\TeX{} used a different name here: easy to pick up as
553 % we know that this primitive must be defined in some way. There is also an issue
554 % with the supplementary plane and older Xe\TeX{} versions, which is dealt with
555 % using a check at run time.
% Runtime code (copied into unicode-letters.def): the body of \L sets the
% case mappings, letter catcode, the \sfcode 999 for uppercase letters,
% and the Unicode math code (class 7 for base-plane letters; the class
% used in the other branch differs -- surrounding conditional lines are
% not shown here).
560 \global\uccode"#1="#2 %
561 \global\lccode"#1="#3 %
565 \global\catcode"#1=11 %
568 \global\sfcode"#1=999 %
571 \global\Umathcode"#1="7"01"#1 %
573 \global\Umathcode"#1="0"01"#1 %
% Caseless letter: expand to the full form with identical case mappings.
576 \def\l#1 {\L#1 #1 #1 }
% Older XeTeX releases provided \XeTeXmathcode instead of \Umathcode.
577 \ifx\Umathcode\undefined
578 \let\Umathcode\XeTeXmathcode
581 \ifx\XeTeXversion\undefined
% Parse the decimal part of \XeTeXrevision: sufficiently new revisions
% (> 996 in this scheme) are fine, otherwise skip the rest of the file by
% gobbling up to \endgroup.
583 \def\XeTeXcheck.#1.#2-#3\relax{#1}
584 \ifnum\expandafter\XeTeXcheck\XeTeXrevision.-\relax>996 %
589 \long\def\XeTeXcheck##1\endgroup{\endgroup}
590 \expandafter\XeTeXcheck
597 % Read the data and write the resulting code assignments to the file.
% Feed UnicodeData.txt line by line through the parser defined above.
599 \openin\unicoderead=UnicodeData.txt %
600 \loop\unless\ifeof\unicoderead
601 \read\unicoderead to \unicodedataline
602 \parseunicodedata\unicodedataline
605 % End the group for setting character codes and assign a couple of special
% Right single/double quotation marks (U+2019, U+201D) get \sfcode 0 so
% they do not alter the current space factor.
610 \global\sfcode"2019=0 %
611 \global\sfcode"201D=0 %
615 % \section{Xe\TeX{} Character classes}
617 % The Xe\TeX{} engine includes the concept of character classes, which allow
618 % insertion of tokens into the input stream at defined boundaries. Setting
619 % up this data requires a two-part process as the information is split over
622 % \begin{macro}{\parseunicodedata}
623 % \begin{macro}{\parseunicodedata@auxi}
624 % \begin{macro}{\parseunicodedata@auxii}
625 % The parsing system is redefined to parse a detokenized input line which
626 % may be a comment starting with |#|. Assuming that is not the case, the
627 % data line will start with a code point potentially forming part of a range.
628 % The range is extracted and the width stored for each code point.
% Parser for EastAsianWidth.txt: skip comment lines (leading #, compared
% via a catcode-12 # from \string\#), then process data lines.
630 \def\parseunicodedata#1{%
633 \if\expandafter\gobble\string\#\expandafter\firsttoken#1?\relax
635 \expandafter\parseunicodedata@auxi#1\relax
% Split off the width class (#2); append "...." so the ".." range split
% below always matches, even for a single code point.
639 \def\parseunicodedata@auxi#1;#2 #3\relax{%
640 \parseunicodedata@auxii#1....\relax{#2}%
% Store the width class under EAW@<hex> for the first code point, then (in
% lines not shown) loop over the remainder of the range doing the same.
642 \def\parseunicodedata@auxii#1..#2..#3\relax#4{%
643 \expandafter\gdef\csname EAW@#1\endcsname{#4}%
651 \sethex\temp{\count@}%
652 \expandafter\gdef\csname EAW@\temp\endcsname{#4}%
662 % With the right parser in place, read the data file.
% Read the width data with \readline so #, ; and the like arrive with
% catcode 12 ("other") and are safe to parse.
664 \openin\unicoderead=EastAsianWidth.txt %
665 \loop\unless\ifeof\unicoderead
666 \readline\unicoderead to \unicodedataline
667 \parseunicodedata\unicodedataline
671 % \changes{v1.0j}{2015/08/05}{Compress East Asian width data by recording
672 % ranges for code points of type \texttt{ID}}
673 % \begin{macro}{\parseunicodedata@auxii}
674 % \begin{macro}{\parseunicodedata@auxiii}
675 % \begin{macro}{\parseunicodedata@auxiv}
676 % \begin{macro}{\parseunicodedata@auxv}
684 % The final file to read, |LineBreak.txt|, uses the same format as
685 % |EastAsianWidth.txt|. As such, only the final parts of the parser have to be
686 % redefined. The first stage here is to check if the line breaking class
687 % is known, and if so if it is equal to |ID| (class one).
% LineBreak.txt second stage: #4 is the line breaking class. Only classes
% with a macro defined here matter; class ID (value \@ne) ranges go to
% auxiii, other known classes to auxiv, everything else is gobbled.
689 \def\parseunicodedata@auxii#1..#2..#3\relax#4{%
690 \ifcsname #4\endcsname
691 \ifnum\csname #4\endcsname=\@ne
692 \expandafter\expandafter\expandafter\parseunicodedata@auxiii
694 \expandafter\expandafter\expandafter\parseunicodedata@auxiv
697 \expandafter\gobblethree
702 % For ranges of class \texttt{ID}, the entire range is written to the data
703 % file as a single block: no need to check on the width data.
% Whole ID ranges are written as one entry: the class macro name followed
% by both range ends (for a single code point, the end repeats the start).
705 \def\parseunicodedata@auxiii#1#2#3{%
706 \immediate\write\unicodewrite{%
708 \expandafter\string\csname #3\endcsname
710 #1 \ifx\relax#2\relax#1\else#2\fi
714 % For other cases, loop over each code point separately. If the
715 % code point is of width |F|, |H| or |W| then the line breaking
716 % property is written to the data file. The earlier check means that
717 % this only happens for characters of classes \texttt{OP} (opener),
718 % \texttt{CL} (closer), \texttt{EX} (exclamation), \texttt{IS} (infix sep),
719 % \texttt{NS} (non-starter) and \texttt{CM} (combining marks) characters
720 % (the latter need to be transparent to the mechanism).
% For other classes, handle each code point of the range individually.
722 \def\parseunicodedata@auxiv#1#2#3{%
723 \parseunicodedata@auxv{#1}{#3}%
731 \sethex\temp{\count@}%
732 \expandafter\parseunicodedata@auxv\expandafter{\temp}{#3}%
% Only write an entry when the code point's East Asian width is F (full),
% H (half) or W (wide): each matching \if contributes a "1" to a test
% whose remainder lives in lines not shown here.
737 \def\parseunicodedata@auxv#1#2{%
739 \if F\csname EAW@#1\endcsname 1\fi
740 \if H\csname EAW@#1\endcsname 1\fi
741 \if W\csname EAW@#1\endcsname 1\fi
743 \immediate\write\unicodewrite{%
745 \expandafter\string\csname #2\endcsname
752 % The East Asian width class mappings.
774 % \changes{v1.0h}{2015/05/11}{Apply category codes to East Asian
775 % chars in all cases}
776 % \changes{v1.0j}{2015/08/05}{Move the stop point for Lua\TeX{}}
777 % \changes{v1.0j}{2015/08/05}{Only set \cs{XeTeXcharclass} from
778 % East Asian char data}
779 % \changes{v1.0k}{2015/08/06}{Add missing \cs{endgroup}}
780 % \changes{v1.0l}{2015/08/10}{Add missing \cs{endgroup}}
781 % Before actually reading the line breaking data file, the appropriate
782 % temporary code is added to the output. As described above, only a limited
783 % number of classes need to be covered: they are hard-coded as classes
784 % $1$, $2$ and $3$ following the convention adopted by plain Xe\TeX{}.
% Runtime code for the generated file: if the character class mechanism is
% unavailable (both control sequences undefined, hence \ifx-equal), stop
% reading the remainder of the file.
788 \ifx\XeTeXchartoks\XeTeXcharclass
789 \endgroup\expandafter\endinput
% Assign class #3 to every code point from #1 up to #2, recursing with
% #1 + 1; the termination test is in lines not shown here.
791 \def\setclass#1#2#3{%
795 \expandafter\firstofone
798 \global\XeTeXcharclass#1=#3 %
799 \expandafter\setclass\expandafter
800 {\number\numexpr#1+1\relax}{#2}{#3}%
804 \def\firstofone#1{#1}
% Data-entry macros: ID = class 1 (takes a range), OP = class 2 (openers),
% CL/EX/IS/NS = class 3, CM = class 256 (transparent combining marks).
805 \def\ID#1 #2 {\setclass{"#1}{"#2}{1}}
806 \def\OP#1 {\setclass{"#1}{"#1}{2}}
807 \def\CL#1 {\setclass{"#1}{"#1}{3}}
808 \def\EX#1 {\setclass{"#1}{"#1}{3}}
809 \def\IS#1 {\setclass{"#1}{"#1}{3}}
810 \def\NS#1 {\setclass{"#1}{"#1}{3}}
811 \def\CM#1 {\setclass{"#1}{"#1}{256}}
816 % Read the line breaking data and save to the output.
% Feed LineBreak.txt through the parser defined above.
818 \openin\unicoderead=LineBreak.txt %
819 \loop\unless\ifeof\unicoderead
820 \readline\unicoderead to \unicodedataline
821 \parseunicodedata\unicodedataline
825 % \changes{v1.0a}{2015/03/25}{Use \cs{hskip} rather than \cs{hspace}
827 % Set up material to be inserted between character classes, matching
828 % that provided by plain Xe\TeX{}. Using \cs{hskip} here means the code will
829 % work with plain as well as \LaTeXe{}.
% Runtime inter-class material: stretch-only glue between CJK characters
% and a breakable inter-word-like space next to class 0 (everything else).
833 \gdef\xtxHanGlue{\hskip0pt plus 0.1em\relax}
834 \gdef\xtxHanSpace{\hskip0.2em plus 0.2em minus 0.1em\relax}
% Class 0 boundary cases; \nobreak forbids a break where one would be
% typographically wrong (before closers, after openers).
835 \global\XeTeXinterchartoks 0 1 = {\xtxHanSpace}
836 \global\XeTeXinterchartoks 0 2 = {\xtxHanSpace}
837 \global\XeTeXinterchartoks 0 3 = {\nobreak\xtxHanSpace}
838 \global\XeTeXinterchartoks 1 0 = {\xtxHanSpace}
839 \global\XeTeXinterchartoks 2 0 = {\nobreak\xtxHanSpace}
840 \global\XeTeXinterchartoks 3 0 = {\xtxHanSpace}
% CJK-to-CJK pairs: glue only, again with \nobreak guarding the forbidden
% break positions.
841 \global\XeTeXinterchartoks 1 1 = {\xtxHanGlue}
842 \global\XeTeXinterchartoks 1 2 = {\xtxHanGlue}
843 \global\XeTeXinterchartoks 1 3 = {\nobreak\xtxHanGlue}
844 \global\XeTeXinterchartoks 2 1 = {\nobreak\xtxHanGlue}
845 \global\XeTeXinterchartoks 2 2 = {\nobreak\xtxHanGlue}
846 \global\XeTeXinterchartoks 2 3 = {\xtxHanGlue}
847 \global\XeTeXinterchartoks 3 1 = {\xtxHanGlue}
848 \global\XeTeXinterchartoks 3 2 = {\xtxHanGlue}
849 \global\XeTeXinterchartoks 3 3 = {\nobreak\xtxHanGlue}
853 % Done: end the script.