1 % \iffalse meta-comment
4 % The LaTeX3 Project and any individual authors listed elsewhere
7 % This file is part of the LaTeX base system.
8 % -------------------------------------------
10 % It may be distributed and/or modified under the
11 % conditions of the LaTeX Project Public License, either version 1.3c
12 % of this license or (at your option) any later version.
13 % The latest version of this license is in
14 % http://www.latex-project.org/lppl.txt
15 % and version 1.3c or later is part of all distributions of LaTeX
16 % version 2005/12/01 or later.
18 % This file has the LPPL maintenance status "maintained".
20 % The list of all files belonging to the LaTeX base distribution is
% given in the file `manifest.txt'. See also `legal.txt' for additional
% information.
24 % The list of derived (unpacked) files belonging to the distribution
25 % and covered by LPPL is defined by the unpacking scripts (with
26 % extension .ins) which are part of the distribution.
28 % -----------------------------------------------------------------------------
% The same approach is used as in \pkg{DocStrip}: if \cs{documentclass}
% is undefined then the driver is skipped, allowing the file to be used directly.
32 % This works as the \cs{fi} is only seen if \LaTeX{} is not in use. The odd
33 % \cs{jobname} business allows the extraction to work with \LaTeX{} provided
34 % an appropriate \texttt{.ins} file is set up.
37 \let\documentclass\undefined
39 \begingroup\expandafter\expandafter\expandafter\endgroup
40 \expandafter\ifx\csname documentclass\endcsname\relax
46 \ProvidesFile{ltunicode.dtx}
47 [2015/03/26 v1.0d LaTeX Kernel (Unicode data)]
48 \documentclass{ltxdoc}
50 \DocInput{\jobname.dtx}
58 % \GetFileInfo{ltunicode.dtx}
59 % \title{The \texttt{ltunicode.dtx} file\thanks
60 % {This file has version number \fileversion, dated \filedate.}\\
61 % for use with \LaTeXe}
62 % \author{The \LaTeX3 Project}
66 % This script extracts data from the Unicode Consortium files
67 % |UnicodeData.txt|, |EastAsianWidth.txt| and |LineBreak.txt| to be used for
68 % setting up \LaTeXe{} (or plain \TeX{}) with sane default settings when using
69 % the Xe\TeX{} and Lua\TeX{} engines. Details of the process are included in
72 % To create the extracted file, run this file in a location containing
73 % the three input data files using \texttt{pdftex}. (The code requires
74 % \cs{pdfmdfivesum} and the e-\TeX{} extensions: it could be adapted for
83 % \section{General set up}
85 % The script is designed to work with plain \TeX{} and so |@| is made into
86 % a `letter' using the primitive approach.
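% For reference, a minimal sketch of the primitive form in question (the
% file's own code takes the same approach):
% \begin{verbatim}
% \catcode`\@=11 %
% \end{verbatim}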
91 % \begin{macro}{\gobble}
92 % \begin{macro}{\firsttoken}
96 \long\def\firsttoken#1#2\relax{#1}
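% The companion \cs{gobble} simply discards a single argument. As a sketch of
% the sort of definition assumed (the real one lives with the code for this
% block), together with an illustration of \cs{firsttoken}:
% \begin{verbatim}
% \long\def\gobble#1{}
% \edef\temp{\firsttoken Lu?\relax}% \temp now holds `L'
% \end{verbatim}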
101 % \begin{macro}{\storedpar}
% A simple piece of test setup: the final line of the file being read will be
% tokenized by \TeX{} as \cs{par}, which can be tested with \cs{ifx} provided
% we have an equivalent available.
110 % \begin{macro}{\return}
111 % A stored |^^M| for string comparisons.
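% As a minimal sketch of one way such a token can be stored (the file's own
% definition may differ in detail; note the trailing comment characters,
% needed while the category code of |^^M| is changed):
% \begin{verbatim}
% \begingroup
% \catcode`\^^M=12 %
% \gdef\return{^^M}%
% \endgroup
% \end{verbatim}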
120 % \begin{macro}{\sourceforhex}
121 % \begin{macro}{\sethex}
122 % \begin{macro}{\dohex}
123 % \begin{macro}{\hexdigit}
124 % Some parts of the code here will need to be able to convert integers
125 % to their hexadecimal equivalent. That is easiest to do for the requirements
% here using a modified version of some code from Appendix~D of \emph{The
% \TeX book}.
129 \newcount\sourceforhex
132 \sourceforhex=#2\relax
133 \ifnum\sourceforhex=0 %
141 \count0=\sourceforhex
142 \divide\sourceforhex by 16 %
143 \ifnum\sourceforhex>0 %
146 \count2=\sourceforhex
147 \multiply\count2 by -16 %
148 \advance\count0 by\count2
150 \expandafter\endgroup
151 \expandafter\def\expandafter#1\expandafter{#1}%
155 \edef#1{#1\number\count0}%
157 \advance\count0 by -10 %
158 \edef#1{#1\ifcase\count0 A\or B\or C\or D\or E\or F\fi}%
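% As a usage illustration (a sketch of the expected behaviour, assuming the
% full definition above): after
% \begin{verbatim}
% \sethex\temp{43981}
% \end{verbatim}
% \noindent the macro \cs{temp} should expand to the four hexadecimal digits
% |ABCD|.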
167 % \begin{macro}{\unicoderead, \unicodewrite}
168 % Set up the streams for data.
171 \newwrite\unicodewrite
175 % \section{Verbatim copying}
177 % \begin{macro}{\verbatimcopy}
178 % \begin{macro}{\endverbatimcopy}
% \begin{macro}{\verbatimcopy@auxi}
180 % \begin{macro}{\verbatimcopy@auxii}
181 % \begin{macro}{\verbatim@endmarker}
182 % Set up to read some material verbatim and write it to the output stream.
183 % There needs to be a dedicated `clean up first line' macro, but other than
184 % that life is simple enough.
200 \gdef\verbatimcopy@auxi#1^^M{%
201 \expandafter\verbatimcopy@auxii\gobble#1^^M%
203 \gdef\verbatimcopy@auxii#1^^M{%
205 \ifx\temp\verbatim@endmarker%
206 \expandafter\endgroup%
208 \ifx\temp\empty\else%
209 \immediate\write\unicodewrite{#1}%
211 \expandafter\verbatimcopy@auxii%
215 \edef\verbatim@endmarker{\expandafter\gobble\string\\}
216 \edef\verbatim@endmarker{\verbatim@endmarker endverbatimcopy}
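% For illustration, the intended usage pattern (as employed for the header
% sections below) is
% \begin{verbatim}
% \verbatimcopy
% %% Text written literally to the output file.
% \endverbatimcopy
% \end{verbatim}
% \noindent with each line between the two markers written straight to the
% output stream.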
224 % \section{File header section}
226 % \changes{v1.0d}{2015/03/26}{Renamed data file to
227 % \texttt{unicode-letters.def}}
228 % With the mechanisms set up, open the data file for writing.
230 \immediate\openout\unicodewrite=unicode-letters.def %
232 % There are various lines that now need to go at the start of the file.
233 % First, there is some header information. Parts of it are auto-generated,
234 % so there is some interspersing of verbatim and non-verbatim parts.
237 %% This is the file `unicode-letters.def',
238 %% generated using the script ltunicode.dtx.
240 %% The data here are derived from the files
% \changes{v1.0b}{2015/03/25}{Include Unicode version data in generated
%   file}
% \changes{v1.0c}{2015/03/25}{Include MD5 sums for sources in generated
%   file}
248 % \begin{macro}{\parseunicodedata}
249 % \begin{macro}{\parseunicodedataauxi}
250 % To ensure that there is a full audit trail for the data, we record
251 % both the reported file version (if available) and the checksum for each
252 % of the source files. This is done by reading the first line of each file
% and parsing for the version string, then `catching' each entire file
% inside a macro to work out its checksum.
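% The upshot is that the generated file carries, for each source, a comment
% recording the reported version (where the first line of the source provides
% one) and the MD5 sum. Purely as an illustration of the kind of information
% recorded (the version and checksum shown are placeholders, and the exact
% layout is determined by the writes below):
% \begin{verbatim}
% %% EastAsianWidth-7.0.0.txt
% %% MD5 sum 0123456789abcdef0123456789abcdef
% \end{verbatim}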
256 \def\parseunicodedata#1{%
257 \openin\unicoderead=#1.txt %
259 \errmessage{Data file missing: #1.txt}%
261 \immediate\write\unicodewrite{%
262 \expandafter\gobble\string\%\expandafter\gobble\string\%
265 \readline\unicoderead to \unicodedataline
267 \edef\unicodedataline{\unicodedataline\detokenize{-.txt}}%
268 \expandafter\parseunicodedataauxi\unicodedataline\relax{#1}%
\def\parseunicodedataauxi#1-#2.TXT#3\relax#4{%
278 \everyeof{\noexpand}%
280 \edef\unicodedataline{\input#4.txt\space}%
281 \immediate\write\unicodewrite{%
282 \expandafter\gobble\string\%\expandafter\gobble\string\%
289 MD5 sum \pdfmdfivesum\expandafter{\unicodedataline}%
294 \parseunicodedata{UnicodeData}
295 \parseunicodedata{EastAsianWidth}
296 \parseunicodedata{LineBreak}
303 %% which are maintained by the Unicode Consortium.
308 % Automatically include the current date.
310 \immediate\write\unicodewrite{%
311 \expandafter\gobble\string\%\expandafter\gobble\string\%
Generated on \the\year%
-\ifnum\month>9 \else 0\fi \the\month%
314 -\ifnum\day>9 \else 0\fi \the\day.
318 % Back to simple text copying
322 %% Copyright 2014-2015
323 %% The LaTeX3 Project and any individual authors listed elsewhere
326 %% This file is part of the LaTeX base system.
327 %% -------------------------------------------
329 %% It may be distributed and/or modified under the
330 %% conditions of the LaTeX Project Public License, either version 1.3c
331 %% of this license or (at your option) any later version.
332 %% The latest version of this license is in
333 %% http://www.latex-project.org/lppl.txt
334 %% and version 1.3c or later is part of all distributions of LaTeX
335 %% version 2005/12/01 or later.
337 %% This file has the LPPL maintenance status "maintained".
339 %% The list of all files belonging to the LaTeX base distribution is
%% given in the file `manifest.txt'. See also `legal.txt' for additional
%% information.
345 % \section{Unicode character data}
347 % \begin{macro}{\parseunicodedata}
348 % \begin{macro}{\parseunicodedata@auxi}
349 % \begin{macro}{\parseunicodedata@auxii}
350 % \begin{macro}{\parseunicodedata@auxiii}
% The first step of parsing a line of data is to check that it has not come
% from a blank line in the source, which will have been tokenized as \cs{par}.
% Assuming that is not the case, there are lots of data items separated by
% |;|. Of those, only a few are needed, so they are picked out and everything
% else is discarded.
357 \def\parseunicodedata#1{%
360 \expandafter\parseunicodedata@auxi#1\relax
363 \def\parseunicodedata@auxi#1;#2;#3;#4;#5;#6;#7;#8;#9;{%
364 \parseunicodedata@auxii#1;#3;
366 \def\parseunicodedata@auxii#1;#2;#3;#4;#5;#6;#7;#8\relax{%
367 \parseunicodedata@auxiii{#1}{#2}{#6}{#7}%
370 % At this stage we have only four pieces of data
372 % \item The code value
373 % \item The general class
374 % \item The uppercase mapping
375 % \item The lowercase mapping
% where one or both of the last two may be empty. Everything here could
378 % be done in a single conditional within a \cs{write}, but that would be
379 % tricky to follow. Instead, a series of defined auxiliaries are used to
380 % show the flow. Notice that combining marks are treated as letters here
381 % (the second `letter' test).
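% For orientation, a typical line of |UnicodeData.txt| looks like this (the
% entry for U+0061, LATIN SMALL LETTER A):
% \begin{verbatim}
% 0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041
% \end{verbatim}
% \noindent Of these fields the parser keeps the code point (|0061|), the
% general category (|Ll|), the uppercase mapping (|0041|) and the lowercase
% mapping (empty here, meaning the character maps to itself).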
383 \def\parseunicodedata@auxiii#1#2#3#4{%
384 \if L\firsttoken#2?\relax
385 \expandafter\unicodeletter
387 \if M\firsttoken#2?\relax
388 \expandafter\expandafter\expandafter\unicodeletter
390 \expandafter\expandafter\expandafter\unicodenonletter
401 % \begin{macro}{\unicodeletter, \unicodenonletter}
402 % \begin{macro}{\writeunicodedata}
% For `letters', we always want to write the data to the file; the only
% question is whether the character has its own case mappings or whether
% these point back to the character itself.
407 \def\unicodeletter#1#2#3{%
408 \writeunicodedata\L{#1}{#2}{#3}%
411 % Cased non-letters can also exist: they can be detected as they have at
412 % least one case mapping. Write these in much the same way as letters.
414 \def\unicodenonletter#1#2#3{%
417 \writeunicodedata\C{#1}{#2}{#3}%
% Actually write the data. In all cases both upper- and lower-case mappings
% are given, so there is a need to test whether both were actually available
% and, if not, to arrange that nothing is done.
425 \def\writeunicodedata#1#2#3#4{%
426 \immediate\write\unicodewrite{%
440 \expandafter\gobble\string\%
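% The result is a set of compact data lines in the generated file. Based on
% the \cs{L} and \cs{C} consumers set up below, they should take roughly the
% form
% \begin{verbatim}
% \L 0041 0041 0061
% \end{verbatim}
% \noindent that is, the code point followed by its upper- and lower-case
% mappings, all in hexadecimal.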
% There is now a lead-in section which creates the macros that take the
% processed data and make the code assignments. Everything is done within a
% group so that there is no need to worry about names.
455 % Cased non-letters simply need to have the case mappings set.
456 % For letters, there are a few things to sort out. First, the case mappings are
457 % defined as for non-letters. Category code is then set to $11$ before a check
458 % to see if this is an upper case letter. If it is then the \cs{sfcode} is set
459 % to $999$. Finally there is a need to deal with Unicode math codes, where base
% plane letters are class $7$ but supplementary plane letters are class~$0$.
461 % Older versions of Xe\TeX{} used a different name here: easy to pick up as
462 % we know that this primitive must be defined in some way. There is also an issue
463 % with the supplementary plane and older Xe\TeX{} versions, which is dealt with
464 % using a check at run time.
469 \global\uccode"#1="#2 %
470 \global\lccode"#1="#3 %
477 \global\sfcode"#1=999 %
480 \global\Umathcode"#1="7"01"#1 %
482 \global\Umathcode"#1="0"01"#1 %
485 \ifx\Umathcode\undefined
486 \let\Umathcode\XeTeXmathcode
489 \ifx\XeTeXversion\undefined
491 \def\XeTeXcheck.#1.#2-#3\relax{#1}
492 \ifnum\expandafter\XeTeXcheck\XeTeXrevision.-\relax>996 %
497 \long\def\XeTeXcheck##1\endgroup{\endgroup}
498 \expandafter\XeTeXcheck
505 % Read the data and write the resulting code assignments to the file.
507 \openin\unicoderead=UnicodeData.txt %
508 \loop\unless\ifeof\unicoderead
509 \read\unicoderead to \unicodedataline
510 \parseunicodedata\unicodedataline
% End the group for setting character codes and assign a couple of special
% \cs{sfcode} values.
518 \global\sfcode"2019=0 %
519 \global\sfcode"201D=0 %
522 % Lua\TeX{} and older versions of Xe\TeX{} stop here: character classes are a
523 % Xe\TeX{}-only concept.
526 \ifx\XeTeXchartoks\XeTeXcharclass
527 \expandafter\endinput
532 % \section{Xe\TeX{} Character classes}
534 % The Xe\TeX{} engine includes the concept of character classes, which allow
535 % insertion of tokens into the input stream at defined boundaries. Setting
% up this data requires a two-part process, as the information is split over
% two of the source data files.
539 % \begin{macro}{\parseunicodedata}
540 % \begin{macro}{\parseunicodedata@auxi}
541 % \begin{macro}{\parseunicodedata@auxii}
542 % The parsing system is redefined to parse a detokenized input line which
543 % may be a comment starting with |#|. Assuming that is not the case, the
% data line will start with a code point, potentially forming part of a range.
545 % The range is extracted and the width stored for each code point.
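% For orientation, entries in |EastAsianWidth.txt| have the general form
% shown below (trailing comments abbreviated here): a single code point or a
% range, followed by the width class.
% \begin{verbatim}
% 3000;F          # Zs    IDEOGRAPHIC SPACE
% 3400..4DB5;W    # Lo    CJK UNIFIED IDEOGRAPH-3400..
% \end{verbatim}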
547 \def\parseunicodedata#1{%
550 \if\expandafter\gobble\string\#\expandafter\firsttoken#1?\relax
552 \expandafter\parseunicodedata@auxi#1\relax
556 \def\parseunicodedata@auxi#1;#2 #3\relax{%
557 \parseunicodedata@auxii#1....\relax{#2}%
559 \def\parseunicodedata@auxii#1..#2..#3\relax#4{%
560 \expandafter\gdef\csname EAW@#1\endcsname{#4}%
568 \sethex\temp{\count@}%
569 \expandafter\gdef\csname EAW@\temp\endcsname{#4}%
579 % With the right parser in place, read the data file.
581 \openin\unicoderead=EastAsianWidth.txt %
582 \loop\unless\ifeof\unicoderead
583 \readline\unicoderead to \unicodedataline
584 \parseunicodedata\unicodedataline
588 % \begin{macro}{\parseunicodedata@auxii}
589 % \begin{macro}{\parseunicodedata@auxiii}
590 % \begin{macro}{\parseunicodedata@auxiv}
% The final file to read, |LineBreak.txt|, uses the same format as
% |EastAsianWidth.txt|. As such, only the final parts of the parser have to be
% redefined.
602 \def\parseunicodedata@auxii#1..#2..#3\relax#4{%
603 \parseunicodedata@auxiii{#1}{#4}%
611 \sethex\temp{\count@}%
612 \expandafter\parseunicodedata@auxiii\expandafter{\temp}{#4}%
618 % Adding data to the processed file depends on two factors: the
619 % classification in the line-breaking file and (possibly) the width data
620 % too. Any characters of class \texttt{ID} (ideograph) are stored: they
621 % always need special treatment. For characters of classes \texttt{OP}
622 % (opener), \texttt{CL} (closer), \texttt{EX} (exclamation), \texttt{IS}
623 % (infix sep) and \texttt{NS} (non-starter) the data is stored if the
624 % character is full, half or wide width. The same is true for
% \texttt{CM} (combining marks) characters, which need to be transparent
% to the inter-character class mechanism.
628 \def\parseunicodedata@auxiii#1#2{%
629 \ifcsname #2\endcsname
630 \ifnum\csname #2\endcsname=1 %
631 \parseunicodedata@auxiv{#1}{#2}%
634 \if F\csname EAW@#1\endcsname 1\fi
635 \if H\csname EAW@#1\endcsname 1\fi
636 \if W\csname EAW@#1\endcsname 1\fi
638 \parseunicodedata@auxiv{#1}{#2}%
643 \def\parseunicodedata@auxiv#1#2{%
644 \immediate\write\unicodewrite{%
646 \expandafter\string\csname #2\endcsname
649 \expandafter\gobble\string\%
671 % Before actually reading the line breaking data file, the appropriate
672 % temporary code is added to the output. As described above, only a limited
673 % number of classes need to be covered: they are hard-coded as classes
674 % $1$, $2$ and $3$ following the convention adopted by plain Xe\TeX{}.
678 \def\ID#1 {\global\XeTeXcharclass"#1=1 \global\catcode"#1=11 }
679 \def\OP#1 {\global\XeTeXcharclass"#1=2 }
680 \def\CL#1 {\global\XeTeXcharclass"#1=3 }
681 \def\EX#1 {\global\XeTeXcharclass"#1=3 }
682 \def\IS#1 {\global\XeTeXcharclass"#1=3 }
683 \def\NS#1 {\global\XeTeXcharclass"#1=3 }
684 \def\CM#1 {\global\XeTeXcharclass"#1=256 }
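% So, as a sketch (the exact form of the written lines is determined by the
% parser above), a data line such as
% \begin{verbatim}
% \CL 3001
% \end{verbatim}
% \noindent for U+3001, IDEOGRAPHIC COMMA (line-break class |CL|), will assign
% character class~$3$ to that code point when the generated file is read.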
688 % Read the line breaking data and save to the output.
690 \openin\unicoderead=LineBreak.txt %
691 \loop\unless\ifeof\unicoderead
692 \readline\unicoderead to \unicodedataline
693 \parseunicodedata\unicodedataline
% \changes{v1.0a}{2015/03/25}{Use \cs{hskip} rather than \cs{hspace}}
% Set up material to be inserted between character classes. This follows
% that provided by plain Xe\TeX{}. Using \cs{hskip} rather than \cs{hspace}
% means the code will work with plain \TeX{} as well as \LaTeXe{}.
705 \gdef\xtxHanGlue{\hskip0pt plus 0.1em\relax}
706 \gdef\xtxHanSpace{\hskip0.2em plus 0.2em minus 0.1em\relax}
707 \global\XeTeXinterchartoks 0 1 = {\xtxHanSpace}
708 \global\XeTeXinterchartoks 0 2 = {\xtxHanSpace}
709 \global\XeTeXinterchartoks 0 3 = {\nobreak\xtxHanSpace}
710 \global\XeTeXinterchartoks 1 0 = {\xtxHanSpace}
711 \global\XeTeXinterchartoks 2 0 = {\nobreak\xtxHanSpace}
712 \global\XeTeXinterchartoks 3 0 = {\xtxHanSpace}
713 \global\XeTeXinterchartoks 1 1 = {\xtxHanGlue}
714 \global\XeTeXinterchartoks 1 2 = {\xtxHanGlue}
715 \global\XeTeXinterchartoks 1 3 = {\nobreak\xtxHanGlue}
716 \global\XeTeXinterchartoks 2 1 = {\nobreak\xtxHanGlue}
717 \global\XeTeXinterchartoks 2 2 = {\nobreak\xtxHanGlue}
718 \global\XeTeXinterchartoks 2 3 = {\xtxHanGlue}
719 \global\XeTeXinterchartoks 3 1 = {\xtxHanGlue}
720 \global\XeTeXinterchartoks 3 2 = {\xtxHanGlue}
721 \global\XeTeXinterchartoks 3 3 = {\nobreak\xtxHanGlue}
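% Note that these token lists only take effect once the inter-character token
% mechanism is active; a format which does not already enable it would need
% something along the lines of
% \begin{verbatim}
% \XeTeXinterchartokenstate=1
% \end{verbatim}
% \noindent (whether that belongs in the generated file or in the format
% loading it is left to the surrounding code).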
725 % Done: end the script.