1 % \iffalse meta-comment
4 % The LaTeX3 Project and any individual authors listed elsewhere
7 % This file is part of the LaTeX base system.
8 % -------------------------------------------
10 % It may be distributed and/or modified under the
11 % conditions of the LaTeX Project Public License, either version 1.3c
12 % of this license or (at your option) any later version.
13 % The latest version of this license is in
14 % http://www.latex-project.org/lppl.txt
15 % and version 1.3c or later is part of all distributions of LaTeX
16 % version 2005/12/01 or later.
18 % This file has the LPPL maintenance status "maintained".
20 % The list of all files belonging to the LaTeX base distribution is
% given in the file `manifest.txt'. See also `legal.txt' for additional
% information.
24 % The list of derived (unpacked) files belonging to the distribution
25 % and covered by LPPL is defined by the unpacking scripts (with
26 % extension .ins) which are part of the distribution.
28 % -----------------------------------------------------------------------------
% The same approach is used as in \pkg{DocStrip}: if \cs{documentclass}
% is undefined then the driver is skipped, allowing the file to be used directly.
32 % This works as the \cs{fi} is only seen if \LaTeX{} is not in use. The odd
33 % \cs{jobname} business allows the extraction to work with \LaTeX{} provided
34 % an appropriate \texttt{.ins} file is set up.
37 \let\documentclass\undefined
39 \begingroup\expandafter\expandafter\expandafter\endgroup
40 \expandafter\ifx\csname documentclass\endcsname\relax
46 \ProvidesFile{ltunicode.dtx}
47 [2015/03/26 v1.0d LaTeX Kernel (Unicode data)]
48 \documentclass{ltxdoc}
50 \DocInput{\jobname.dtx}
58 % \GetFileInfo{ltunicode.dtx}
59 % \title{The \texttt{ltunicode.dtx} file\thanks
60 % {This file has version number \fileversion, dated \filedate.}\\
61 % for use with \LaTeXe}
62 % \author{The \LaTeX3 Project}
66 % This script extracts data from the Unicode Consortium files
67 % |UnicodeData.txt|, |EastAsianWidth.txt| and |LineBreak.txt| to be used for
68 % setting up \LaTeXe{} (or plain \TeX{}) with sane default settings when using
69 % the Xe\TeX{} and Lua\TeX{} engines. Details of the process are included in
72 % To create the extracted file, run this file in a location containing
73 % the three input data files using \texttt{pdftex}. (The code requires
74 % \cs{pdfmdfivesum} and the e-\TeX{} extensions: it could be adapted for
83 % \section{General set up}
85 % The script is designed to work with plain \TeX{} and so |@| is made into
86 % a `letter' using the primitive approach.
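% For reference, a minimal sketch of the primitive form in question (the
% file's own code takes the same approach):
% \begin{verbatim}
% \catcode`\@=11 %
% \end{verbatim}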
91 % \begin{macro}{\gobble}
92 % \begin{macro}{\firsttoken}
96 \long\def\firsttoken#1#2\relax{#1}
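% The companion \cs{gobble} simply discards a single argument. As a sketch of
% the sort of definition assumed (the real one lives with the code for this
% block), together with an illustration of \cs{firsttoken}:
% \begin{verbatim}
% \long\def\gobble#1{}
% \edef\temp{\firsttoken Lu?\relax}% \temp now holds `L'
% \end{verbatim}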
101 % \begin{macro}{\storedpar}
% A simple piece of test setup: the final line of the file being read will be
% tokenized by \TeX{} as \cs{par}, which can be tested with \cs{ifx} provided
% we have an equivalent available.
110 % \begin{macro}{\return}
111 % A stored |^^M| for string comparisons.
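% As a minimal sketch of one way such a token can be stored (the file's own
% definition may differ in detail; note the trailing comment characters,
% needed while the category code of |^^M| is changed):
% \begin{verbatim}
% \begingroup
% \catcode`\^^M=12 %
% \gdef\return{^^M}%
% \endgroup
% \end{verbatim}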
120 % \begin{macro}{\sourceforhex}
121 % \begin{macro}{\sethex}
122 % \begin{macro}{\dohex}
123 % \begin{macro}{\hexdigit}
124 % Some parts of the code here will need to be able to convert integers
125 % to their hexadecimal equivalent. That is easiest to do for the requirements
% here using a modified version of some code from Appendix~D of \emph{The
% \TeX book}.
129 \newcount\sourceforhex
132 \sourceforhex=#2\relax
133 \ifnum\sourceforhex=0 %
141 \count0=\sourceforhex
142 \divide\sourceforhex by 16 %
143 \ifnum\sourceforhex>0 %
146 \count2=\sourceforhex
147 \multiply\count2 by -16 %
148 \advance\count0 by\count2
150 \expandafter\endgroup
151 \expandafter\def\expandafter#1\expandafter{#1}%
155 \edef#1{#1\number\count0}%
157 \advance\count0 by -10 %
158 \edef#1{#1\ifcase\count0 A\or B\or C\or D\or E\or F\fi}%
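% As a usage illustration (a sketch of the expected behaviour, assuming the
% full definition above): after
% \begin{verbatim}
% \sethex\temp{43981}
% \end{verbatim}
% \noindent the macro \cs{temp} should expand to the four hexadecimal digits
% |ABCD|.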
167 % \begin{macro}{\unicoderead, \unicodewrite}
168 % Set up the streams for data.
171 \newwrite\unicodewrite
175 % \section{Verbatim copying}
177 % \begin{macro}{\verbatimcopy}
178 % \begin{macro}{\endverbatimcopy}
% \begin{macro}{\verbatimcopy@auxi}
180 % \begin{macro}{\verbatimcopy@auxii}
181 % \begin{macro}{\verbatim@endmarker}
182 % Set up to read some material verbatim and write it to the output stream.
183 % There needs to be a dedicated `clean up first line' macro, but other than
184 % that life is simple enough.
200 \gdef\verbatimcopy@auxi#1^^M{%
201 \expandafter\verbatimcopy@auxii\gobble#1^^M%
203 \gdef\verbatimcopy@auxii#1^^M{%
205 \ifx\temp\verbatim@endmarker%
206 \expandafter\endgroup%
208 \ifx\temp\empty\else%
209 \immediate\write\unicodewrite{#1}%
211 \expandafter\verbatimcopy@auxii%
215 \edef\verbatim@endmarker{\expandafter\gobble\string\\}
216 \edef\verbatim@endmarker{\verbatim@endmarker endverbatimcopy}
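% For illustration, the intended usage pattern (as employed for the header
% sections below) is
% \begin{verbatim}
% \verbatimcopy
% %% Text written literally to the output file.
% \endverbatimcopy
% \end{verbatim}
% \noindent with each line between the two markers written straight to the
% output stream.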
224 % \section{File header section}
226 % \changes{v1.0d}{2015/03/26}{Renamed data file to
227 % \texttt{unicode-letters.def}}
228 % With the mechanisms set up, open the data file for writing.
230 \immediate\openout\unicodewrite=unicode-letters.def %
232 % There are various lines that now need to go at the start of the file.
233 % First, there is some header information. Parts of it are auto-generated,
234 % so there is some interspersing of verbatim and non-verbatim parts.
237 %% This is the file `unicode-letters.def',
238 %% generated using the script ltunicode.dtx.
240 %% The data here are derived from the files
% \changes{v1.0b}{2015/03/25}{Include Unicode version data in generated
%   file}
% \changes{v1.0c}{2015/03/25}{Include MD5 sums for sources in generated
%   file}
248 % \begin{macro}{\parseunicodedata}
249 % \begin{macro}{\parseunicodedataauxi}
250 % To ensure that there is a full audit trail for the data, we record
251 % both the reported file version (if available) and the checksum for each
252 % of the source files. This is done by reading the first line of each file
% and parsing for the version string, then `catching' each entire file
% inside a macro to work out its checksum.
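% The upshot is that the generated file carries, for each source, a comment
% recording the reported version (where the first line of the source provides
% one) and the MD5 sum. Purely as an illustration of the kind of information
% recorded (the version and checksum shown are placeholders, and the exact
% layout is determined by the writes below):
% \begin{verbatim}
% %% EastAsianWidth-7.0.0.txt
% %% MD5 sum 0123456789abcdef0123456789abcdef
% \end{verbatim}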
256 \def\parseunicodedata#1{%
257 \openin\unicoderead=#1.txt %
259 \errmessage{Data file missing: #1.txt}%
261 \immediate\write\unicodewrite{%
262 \expandafter\gobble\string\%\expandafter\gobble\string\%
265 \readline\unicoderead to \unicodedataline
267 \edef\unicodedataline{\unicodedataline\detokenize{-.txt}}%
268 \expandafter\parseunicodedataauxi\unicodedataline\relax{#1}%
\def\parseunicodedataauxi#1-#2.TXT#3\relax#4{%
278 \everyeof{\noexpand}%
280 \edef\unicodedataline{\input#4.txt\space}%
281 \immediate\write\unicodewrite{%
282 \expandafter\gobble\string\%\expandafter\gobble\string\%
289 MD5 sum \pdfmdfivesum\expandafter{\unicodedataline}%
294 \parseunicodedata{UnicodeData}
295 \parseunicodedata{EastAsianWidth}
296 \parseunicodedata{LineBreak}
303 %% which are maintained by the Unicode Consortium.
308 % Automatically include the current date.
310 \immediate\write\unicodewrite{%
311 \expandafter\gobble\string\%\expandafter\gobble\string\%
Generated on \the\year%
-\ifnum\month>9 \else 0\fi \the\month%
314 -\ifnum\day>9 \else 0\fi \the\day.
318 % Back to simple text copying
322 %% Copyright 2014-2015
323 %% The LaTeX3 Project and any individual authors listed elsewhere
326 %% This file is part of the LaTeX base system.
327 %% -------------------------------------------
329 %% It may be distributed and/or modified under the
330 %% conditions of the LaTeX Project Public License, either version 1.3c
331 %% of this license or (at your option) any later version.
332 %% The latest version of this license is in
333 %% http://www.latex-project.org/lppl.txt
334 %% and version 1.3c or later is part of all distributions of LaTeX
335 %% version 2005/12/01 or later.
337 %% This file has the LPPL maintenance status "maintained".
339 %% The list of all files belonging to the LaTeX base distribution is
%% given in the file `manifest.txt'. See also `legal.txt' for additional
%% information.
345 % \section{Unicode character data}
347 % \begin{macro}{\parseunicodedata}
348 % \begin{macro}{\parseunicodedata@auxi}
349 % \begin{macro}{\parseunicodedata@auxii}
350 % \begin{macro}{\parseunicodedata@auxiii}
% The first step of parsing a line of data is to check that it has not come
% from a blank line in the source, which will have been tokenized as \cs{par}.
% Assuming that is not the case, there are lots of data items separated by
% |;|. Of those, only a few are needed, so they are picked out and everything
% else is discarded.
357 \def\parseunicodedata#1{%
360 \expandafter\parseunicodedata@auxi#1\relax
363 \def\parseunicodedata@auxi#1;#2;#3;#4;#5;#6;#7;#8;#9;{%
364 \parseunicodedata@auxii#1;#3;
366 \def\parseunicodedata@auxii#1;#2;#3;#4;#5;#6;#7;#8\relax{%
367 \parseunicodedata@auxiii{#1}{#2}{#6}{#7}%
370 % At this stage we have only four pieces of data
372 % \item The code value
373 % \item The general class
374 % \item The uppercase mapping
375 % \item The lowercase mapping
% where one or both of the last two may be empty. Everything here could
378 % be done in a single conditional within a \cs{write}, but that would be
379 % tricky to follow. Instead, a series of defined auxiliaries are used to
380 % show the flow. Notice that combining marks are treated as letters here
381 % (the second `letter' test).
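% For orientation, a typical line of |UnicodeData.txt| looks like this (the
% entry for U+0061, LATIN SMALL LETTER A):
% \begin{verbatim}
% 0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041
% \end{verbatim}
% \noindent Of these fields the parser keeps the code point (|0061|), the
% general category (|Ll|), the uppercase mapping (|0041|) and the lowercase
% mapping (empty here, meaning the character maps to itself).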
383 \def\parseunicodedata@auxiii#1#2#3#4{%
384 \if L\firsttoken#2?\relax
385 \expandafter\unicodeletter
387 \if M\firsttoken#2?\relax
388 \expandafter\expandafter\expandafter\unicodeletter
390 \expandafter\expandafter\expandafter\unicodenonletter
401 % \begin{macro}{\unicodeletter, \unicodenonletter}
402 % \begin{macro}{\writeunicodedata}
% For `letters', we always want to write the data to the file; the only
% question is whether the character has its own case mappings or whether
% these point back to the character itself.
407 \def\unicodeletter#1#2#3{%
408 \writeunicodedata\L{#1}{#2}{#3}%
411 % Cased non-letters can also exist: they can be detected as they have at
412 % least one case mapping. Write these in much the same way as letters.
414 \def\unicodenonletter#1#2#3{%
417 \writeunicodedata\C{#1}{#2}{#3}%
% Actually write the data. In all cases both upper- and lower-case mappings
% are given, so there is a need to test whether both were actually available
% and, if not, to arrange that nothing is done.
425 \def\writeunicodedata#1#2#3#4{%
426 \immediate\write\unicodewrite{%
440 \expandafter\gobble\string\%
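% The result is a set of compact data lines in the generated file. Based on
% the \cs{L} and \cs{C} consumers set up below, they should take roughly the
% form
% \begin{verbatim}
% \L 0041 0041 0061
% \end{verbatim}
% \noindent that is, the code point followed by its upper- and lower-case
% mappings, all in hexadecimal.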
% There is now a lead-in section which creates the macros that take the
% processed data and make the code assignments. Everything is done within a
% group so that there is no need to worry about names.
455 % Cased non-letters simply need to have the case mappings set.
456 % For letters, there are a few things to sort out. First, the case mappings are
457 % defined as for non-letters. Category code is then set to $11$ before a check
458 % to see if this is an upper case letter. If it is then the \cs{sfcode} is set
459 % to $999$. Finally there is a need to deal with Unicode math codes, where base
% plane letters are class $7$ but supplementary plane letters are class~$0$.
461 % Older versions of Xe\TeX{} used a different name here: easy to pick up as
462 % we know that this primitive must be defined in some way. There is also an issue
463 % with the supplementary plane and older Xe\TeX{} versions, which is dealt with
464 % using a check at run time.
469 \global\uccode"#1="#2 %
470 \global\lccode"#1="#3 %
477 \global\sfcode"#1=999 %
480 \global\Umathcode"#1="7"01"#1 %
482 \global\Umathcode"#1="0"01"#1 %
485 \ifx\Umathcode\undefined
486 \let\Umathcode\XeTeXmathcode
489 \ifx\XeTeXversion\undefined
491 \def\XeTeXcheck.#1.#2-#3\relax{#1}
492 \ifnum\expandafter\XeTeXcheck\XeTeXrevision.-\relax>996 %
497 \long\def\XeTeXcheck##1\endgroup{\endgroup}
498 \expandafter\XeTeXcheck
505 % Read the data and write the resulting code assignments to the file.
507 \openin\unicoderead=UnicodeData.txt %
508 \loop\unless\ifeof\unicoderead
509 \read\unicoderead to \unicodedataline
510 \parseunicodedata\unicodedataline
% End the group for setting character codes and assign a couple of special
% \cs{sfcode} values.
518 \global\sfcode"2019=0 %
519 \global\sfcode"201D=0 %
522 % Lua\TeX{} and older versions of Xe\TeX{} stop here: character classes are a
523 % Xe\TeX{}-only concept.
526 \ifx\XeTeXchartoks\XeTeXcharclass
527 \expandafter\endinput
532 % \section{Xe\TeX{} Character classes}
534 % The Xe\TeX{} engine includes the concept of character classes, which allow
535 % insertion of tokens into the input stream at defined boundaries. Setting
% up this data requires a two-part process, as the information is split over
% two of the source data files.
539 % \begin{macro}{\parseunicodedata}
540 % \begin{macro}{\parseunicodedata@auxi}
541 % \begin{macro}{\parseunicodedata@auxii}
542 % The parsing system is redefined to parse a detokenized input line which
543 % may be a comment starting with |#|. Assuming that is not the case, the
% data line will start with a code point, potentially forming part of a range.
545 % The range is extracted and the width stored for each code point.
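% For orientation, entries in |EastAsianWidth.txt| have the general form
% shown below (trailing comments abbreviated here): a single code point or a
% range, followed by the width class.
% \begin{verbatim}
% 3000;F          # Zs    IDEOGRAPHIC SPACE
% 3400..4DB5;W    # Lo    CJK UNIFIED IDEOGRAPH-3400..
% \end{verbatim}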
547 \def\parseunicodedata#1{%
550 \if\expandafter\gobble\string\#\expandafter\firsttoken#1?\relax
552 \expandafter\parseunicodedata@auxi#1\relax
556 \def\parseunicodedata@auxi#1;#2 #3\relax{%
557 \parseunicodedata@auxii#1....\relax{#2}%
559 \def\parseunicodedata@auxii#1..#2..#3\relax#4{%
560 \expandafter\gdef\csname EAW@#1\endcsname{#4}%
568 \sethex\temp{\count@}%
569 \expandafter\gdef\csname EAW@\temp\endcsname{#4}%
579 % With the right parser in place, read the data file.
581 \openin\unicoderead=EastAsianWidth.txt %
582 \loop\unless\ifeof\unicoderead
583 \readline\unicoderead to \unicodedataline
584 \parseunicodedata\unicodedataline
588 % \begin{macro}{\parseunicodedata@auxii}
589 % \begin{macro}{\parseunicodedata@auxiii}
590 % \begin{macro}{\parseunicodedata@auxiv}
% The final file to read, |LineBreak.txt|, uses the same format as
% |EastAsianWidth.txt|. As such, only the final parts of the parser have to be
% redefined.
602 \def\parseunicodedata@auxii#1..#2..#3\relax#4{%
603 \parseunicodedata@auxiii{#1}{#4}%
611 \sethex\temp{\count@}%
612 \expandafter\parseunicodedata@auxiii\expandafter{\temp}{#4}%
618 % Adding data to the processed file depends on two factors: the
619 % classification in the line-breaking file and (possibly) the width data
620 % too. Any characters of class \texttt{ID} (ideograph) are stored: they
621 % always need special treatment. For characters of classes \texttt{OP}
622 % (opener), \texttt{CL} (closer), \texttt{EX} (exclamation), \texttt{IS}
623 % (infix sep) and \texttt{NS} (non-starter) the data is stored if the
624 % character is full, half or wide width. The same is true for
% \texttt{CM} (combining marks) characters, which need to be transparent
% to the inter-character class mechanism.
628 \def\parseunicodedata@auxiii#1#2{%
629 \ifcsname #2\endcsname
630 \ifnum\csname #2\endcsname=1 %
631 \parseunicodedata@auxiv{#1}{#2}%
634 \if F\csname EAW@#1\endcsname 1\fi
635 \if H\csname EAW@#1\endcsname 1\fi
636 \if W\csname EAW@#1\endcsname 1\fi
638 \parseunicodedata@auxiv{#1}{#2}%
643 \def\parseunicodedata@auxiv#1#2{%
644 \immediate\write\unicodewrite{%
646 \expandafter\string\csname #2\endcsname
649 \expandafter\gobble\string\%
671 % Before actually reading the line breaking data file, the appropriate
672 % temporary code is added to the output. As described above, only a limited
673 % number of classes need to be covered: they are hard-coded as classes
674 % $1$, $2$ and $3$ following the convention adopted by plain Xe\TeX{}.
678 \def\ID#1 {\global\XeTeXcharclass"#1=1 \global\catcode"#1=11 }
679 \def\OP#1 {\global\XeTeXcharclass"#1=2 }
680 \def\CL#1 {\global\XeTeXcharclass"#1=3 }
681 \def\EX#1 {\global\XeTeXcharclass"#1=3 }
682 \def\IS#1 {\global\XeTeXcharclass"#1=3 }
683 \def\NS#1 {\global\XeTeXcharclass"#1=3 }
684 \def\CM#1 {\global\XeTeXcharclass"#1=256 }
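% So, as a sketch (the exact form of the written lines is determined by the
% parser above), a data line such as
% \begin{verbatim}
% \CL 3001
% \end{verbatim}
% \noindent for U+3001, IDEOGRAPHIC COMMA (line-break class |CL|), will assign
% character class~$3$ to that code point when the generated file is read.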
688 % Read the line breaking data and save to the output.
690 \openin\unicoderead=LineBreak.txt %
691 \loop\unless\ifeof\unicoderead
692 \readline\unicoderead to \unicodedataline
693 \parseunicodedata\unicodedataline
% \changes{v1.0a}{2015/03/25}{Use \cs{hskip} rather than \cs{hspace}}
% Set up material to be inserted between character classes. This follows
% that provided by plain Xe\TeX{}. Using \cs{hskip} rather than \cs{hspace}
% means the code will work with plain \TeX{} as well as \LaTeXe{}.
705 \gdef\xtxHanGlue{\hskip0pt plus 0.1em\relax}
706 \gdef\xtxHanSpace{\hskip0.2em plus 0.2em minus 0.1em\relax}
707 \global\XeTeXinterchartoks 0 1 = {\xtxHanSpace}
708 \global\XeTeXinterchartoks 0 2 = {\xtxHanSpace}
709 \global\XeTeXinterchartoks 0 3 = {\nobreak\xtxHanSpace}
710 \global\XeTeXinterchartoks 1 0 = {\xtxHanSpace}
711 \global\XeTeXinterchartoks 2 0 = {\nobreak\xtxHanSpace}
712 \global\XeTeXinterchartoks 3 0 = {\xtxHanSpace}
713 \global\XeTeXinterchartoks 1 1 = {\xtxHanGlue}
714 \global\XeTeXinterchartoks 1 2 = {\xtxHanGlue}
715 \global\XeTeXinterchartoks 1 3 = {\nobreak\xtxHanGlue}
716 \global\XeTeXinterchartoks 2 1 = {\nobreak\xtxHanGlue}
717 \global\XeTeXinterchartoks 2 2 = {\nobreak\xtxHanGlue}
718 \global\XeTeXinterchartoks 2 3 = {\xtxHanGlue}
719 \global\XeTeXinterchartoks 3 1 = {\xtxHanGlue}
720 \global\XeTeXinterchartoks 3 2 = {\xtxHanGlue}
721 \global\XeTeXinterchartoks 3 3 = {\nobreak\xtxHanGlue}
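% Note that these token lists only take effect once the inter-character token
% mechanism is active; a format which does not already enable it would need
% something along the lines of
% \begin{verbatim}
% \XeTeXinterchartokenstate=1
% \end{verbatim}
% \noindent (whether that belongs in the generated file or in the format
% loading it is left to the surrounding code).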
725 % Done: end the script.