1 % \iffalse meta-comment
4 % The LaTeX3 Project and any individual authors listed elsewhere
7 % This file is part of the LaTeX base system.
8 % -------------------------------------------
10 % It may be distributed and/or modified under the
11 % conditions of the LaTeX Project Public License, either version 1.3c
12 % of this license or (at your option) any later version.
13 % The latest version of this license is in
14 % http://www.latex-project.org/lppl.txt
15 % and version 1.3c or later is part of all distributions of LaTeX
16 % version 2005/12/01 or later.
18 % This file has the LPPL maintenance status "maintained".
20 % The list of all files belonging to the LaTeX base distribution is
21 % given in the file `manifest.txt'. See also `legal.txt' for additional
24 % The list of derived (unpacked) files belonging to the distribution
25 % and covered by LPPL is defined by the unpacking scripts (with
26 % extension .ins) which are part of the distribution.
28 % -----------------------------------------------------------------------------
30 % The same approach as used in \pkg{DocStrip}: if \cs{documentclass}
31 % is undefined then skip the driver, allowing the file to be used directly.
32 % This works as the \cs{fi} is only seen if \LaTeX{} is not in use. The odd
33 % \cs{jobname} business allows the extraction to work with \LaTeX{} provided
34 % an appropriate \texttt{.ins} file is set up.
% NOTE(review): DocStrip-style driver guard; some surrounding lines are not
% visible here, so the exact conditional structure should be verified
% against the full source.
37 \let\documentclass\undefined
% \csname makes an undefined \documentclass equal to \relax, so the \ifx
% test below is true exactly when LaTeX is NOT in use and the typesetting
% driver should be skipped.
39 \begingroup\expandafter\expandafter\expandafter\endgroup
40 \expandafter\ifx\csname documentclass\endcsname\relax
% Documentation driver: typeset this file itself with the ltxdoc class.
46 \ProvidesFile{ltunicode.dtx}
47 [2015/08/10 v1.0l LaTeX Kernel (Unicode data)]
48 \documentclass{ltxdoc}
50 \DocInput{\jobname.dtx}
58 % \GetFileInfo{ltunicode.dtx}
59 % \title{The \texttt{ltunicode.dtx} file\thanks
60 % {This file has version number \fileversion, dated \filedate.}\\
61 % for use with \LaTeXe}
62 % \author{The \LaTeX3 Project}
64 % \MaintainedByLaTeXTeam{latex}
67 % This script extracts data from the Unicode Consortium files
68 % |UnicodeData.txt|, |EastAsianWidth.txt| and |LineBreak.txt| to be used for
69 % setting up \LaTeXe{} (or plain \TeX{}) with sane default settings when using
70 % the Xe\TeX{} and Lua\TeX{} engines. Details of the process are included in
73 % To create the extracted file, run this file in a location containing
74 % the three input data files using \texttt{pdftex}. (The code requires
75 % \cs{pdfmdfivesum} and the e-\TeX{} extensions: it could be adapted for
84 % \section{General set up}
86 % The script is designed to work with plain \TeX{} and so |@| is made into
87 % a `letter' using the primitive approach.
92 % \begin{macro}{\gobble}
93 % \begin{macro}{\gobblethree}
94 % \begin{macro}{\firsttoken}
% Argument grabbers used throughout the script: \gobblethree drops three
% arguments entirely.
98 \long\def\gobblethree#1#2#3{}
% Expandably return the first token of the material before \relax: #1 grabs
% exactly one token, #2 swallows the rest.
99 \long\def\firsttoken#1#2\relax{#1}
105 % \begin{macro}{\storedpar}
106 % A simple piece of test setup: the final line of the read file will be
107 % tokenized by \TeX{} as \cs{par} which can be tested by \cs{ifx} provided
108 % we have an equivalent available.
114 % \begin{macro}{\return}
115 % A stored |^^M| for string comparisons.
124 % \begin{macro}{\sourceforhex}
125 % \begin{macro}{\sethex}
126 % \begin{macro}{\dohex}
127 % \begin{macro}{\hexdigit}
128 % Some parts of the code here will need to be able to convert integers
129 % to their hexadecimal equivalent. That is easiest to do for the requirements
130 % here using a modified version of some code from Appendix~D of \emph{The
% Scratch counter holding the integer still to be converted to hexadecimal.
133 \newcount\sourceforhex
% NOTE(review): the lines below are the interiors of \sethex/\dohex/\hexdigit
% (their \def lines are not shown in this view); they implement the
% divide-by-16 conversion from Appendix D of The TeXbook.
136 \sourceforhex=#2\relax
137 \ifnum\sourceforhex=0 %
% Remember the current value, then divide by 16 to peel off one digit.
145 \count0=\sourceforhex
146 \divide\sourceforhex by 16 %
147 \ifnum\sourceforhex>0 %
% \count2 becomes -(16 * quotient); adding it to \count0 leaves the
% remainder, i.e. the current hex digit, in \count0.
150 \count2=\sourceforhex
151 \multiply\count2 by -16 %
152 \advance\count0 by\count2
% Close the group while preserving the accumulated digit string in #1.
154 \expandafter\endgroup
155 \expandafter\def\expandafter#1\expandafter{#1}%
% Digits 0-9 are appended directly; values 10-15 map to A-F via \ifcase
% after subtracting 10.
159 \edef#1{#1\number\count0}%
161 \advance\count0 by -10 %
162 \edef#1{#1\ifcase\count0 A\or B\or C\or D\or E\or F\fi}%
171 % \begin{macro}{\unicoderead, \unicodewrite}
172 % Set up the streams for data.
% Output stream used for everything written to unicode-letters.def.
175 \newwrite\unicodewrite
179 % \section{Verbatim copying}
181 % \begin{macro}{\verbatimcopy}
182 % \begin{macro}{\endverbatimcopy}
183 % \begin{macro}{\verbatimcopy@auxi}
184 % \begin{macro}{\verbatimcopy@auxii}
185 % \begin{macro}{\verbatim@endmarker}
186 % Set up to read some material verbatim and write it to the output stream.
187 % There needs to be a dedicated `clean up first line' macro, but other than
188 % that life is simple enough.
% NOTE(review): defined inside a group with special catcodes (the ^^M in
% the parameter texts is the line end); trailing % signs guard those line
% ends. \verbatimcopy@auxi strips the first (partial) line, then hands
% over to the line-by-line copier.
204 \gdef\verbatimcopy@auxi#1^^M{%
205 \expandafter\verbatimcopy@auxii\gobble#1^^M%
% Copy one line at a time: stop at the end marker, skip writing empty
% lines, otherwise write the line to the output stream and recurse.
% (\temp is set from #1 in lines not shown here.)
207 \gdef\verbatimcopy@auxii#1^^M{%
209 \ifx\temp\verbatim@endmarker%
210 \expandafter\endgroup%
212 \ifx\temp\empty\else%
213 \immediate\write\unicodewrite{#1}%
215 \expandafter\verbatimcopy@auxii%
% Build the literal end-marker text "\endverbatimcopy": \string\\ plus
% \gobble yields a catcode-12 backslash to prefix the name.
219 \edef\verbatim@endmarker{\expandafter\gobble\string\\}
220 \edef\verbatim@endmarker{\verbatim@endmarker endverbatimcopy}
228 % \section{File header section}
230 % \changes{v1.0d}{2015/03/26}{Renamed data file to
231 % \texttt{unicode-letters.def}}
232 % With the mechanisms set up, open the data file for writing.
% Open the generated data file for (immediate) writing; the trailing
% space ends the file name scan.
234 \immediate\openout\unicodewrite=unicode-letters.def %
236 % There are various lines that now need to go at the start of the file.
237 % First, there is some header information. Parts of it are auto-generated,
238 % so there is some interspersing of verbatim and non-verbatim parts.
241 %% This is the file `unicode-letters.def',
242 %% generated using the script ltunicode.dtx.
244 %% The data here are derived from the files
248 % \changes{v1.0b}{2015/03/25}{Include Unicode version data in generated
250 % \changes{v1.0c}{2015/03/25}{Include MD5 sums for sources in generated
252 % \changes{v1.0f}{2015/03/26}{Include dates for sources in generated
254 % \begin{macro}{\parseunicodedata}
255 % \begin{macro}{\parseunicodedata@auxi}
256 % \begin{macro}{\parseunicodedata@auxii}
257 % \begin{macro}{\mdfiveinfo}
258 % To ensure that there is a full audit trail for the data, we record
259 % both the reported file version (if available) and the checksum for each
260 % of the source files. This is done by reading the first line of each file
261 % and parsing for the version string and if found reading the second line
262 % for a date/time, and then `catching' the entire files inside a macro to
263 % work out the checksums.
% Record provenance for data file #1: check the file exists, read its first
% line(s) for a version string and date, and log an MD5 checksum of the
% whole file to the output.
265 \def\parseunicodedata#1{%
266 \openin\unicoderead=#1.txt %
268 \errmessage{Data file missing: #1.txt}%
270 \immediate\write\unicodewrite{%
% \string\% plus \gobble writes a literal catcode-12 "%" character.
271 \expandafter\gobble\string\%\expandafter\gobble\string\%
274 \readline\unicoderead to \unicodedataline
% Append detokenized "-.txt" so the delimited parse below always has its
% delimiters available even when the line carries no version information.
275 \edef\unicodedataline{\unicodedataline\detokenize{-.txt}}%
276 \expandafter\parseunicodedata@auxi\unicodedataline\relax{#1}%
% NOTE(review): the brace at the end of the next line looks unbalanced in
% this view; interleaved lines are elided, verify against the full source.
283 \def\parseunicodedata@auxi#1-#2.TXT#3\relax#4}%
% The second line of the data file is expected to hold the date/time.
287 \readline\unicoderead to \unicodedataline
288 \expandafter\parseunicodedata@auxii\unicodedataline\relax
% Catch the entire file inside a macro (\everyeof suppresses trouble at
% end-of-file) so \pdfmdfivesum can checksum its contents.
292 \everyeof{\noexpand}%
294 \edef\mdfiveinfo{\input#4.txt\space}%
295 \expandafter\endgroup
296 \expandafter\def\expandafter\mdfiveinfo\expandafter{\mdfiveinfo}%
297 \immediate\write\unicodewrite{%
298 \expandafter\gobble\string\%\expandafter\gobble\string\%
302 Version #2 dated \temp^^J%
303 \expandafter\gobble\string\%\expandafter\gobble\string\%
306 MD5 sum \pdfmdfivesum\expandafter{\mdfiveinfo}%
% Presumably splits a "...: <date>, <time> <tz> ..." style second line;
% confirm against the headers of the actual data files.
309 \def\parseunicodedata@auxii#1: #2, #3 #4\relax{%
% Audit information for each of the three Unicode Consortium source files.
312 \parseunicodedata{UnicodeData}
313 \parseunicodedata{EastAsianWidth}
314 \parseunicodedata{LineBreak}
323 %% which are maintained by the Unicode Consortium.
328 % Automatically include the current date.
% Write the generation date as a "%%" comment line, zero-padding
% single-digit month and day numbers.
330 \immediate\write\unicodewrite{%
331 \expandafter\gobble\string\%\expandafter\gobble\string\%
332 Generated on \the\year
333 -\ifnum\month>9 \else 0\fi \the\month
334 -\ifnum\day>9 \else 0\fi \the\day.
338 % Back to simple text copying
342 %% Copyright 2014-2015
343 %% The LaTeX3 Project and any individual authors listed elsewhere
346 %% This file is part of the LaTeX base system.
347 %% -------------------------------------------
349 %% It may be distributed and/or modified under the
350 %% conditions of the LaTeX Project Public License, either version 1.3c
351 %% of this license or (at your option) any later version.
352 %% The latest version of this license is in
353 %% http://www.latex-project.org/lppl.txt
354 %% and version 1.3c or later is part of all distributions of LaTeX
355 %% version 2005/12/01 or later.
357 %% This file has the LPPL maintenance status "maintained".
359 %% The list of all files belonging to the LaTeX base distribution is
360 %% given in the file `manifest.txt'. See also `legal.txt' for additional
365 % \section{Unicode character data}
367 % \changes{v1.0e}{2015/03/26}{Correctly parse ranges in
368 % \texttt{UnicodeData.txt}}
369 % \begin{macro}{\parseunicodedata}
370 % \begin{macro}{\parseunicodedata@auxi}
371 % \begin{macro}{\parseunicodedata@auxii}
372 % \begin{macro}{\parseunicodedata@auxiii}
373 % \begin{macro}{\parseunicodedata@auxiv}
374 % \begin{macro}{\parseunicodedata@auxv}
375 % \begin{macro}{\parseunicodedata@auxvi}
376 % The first step of parsing a line of data is to check that it's not come
377 % from a blank in the source, which will have been tokenized as \cs{par}.
378 % Assuming that is not the case, there are lots of data items separated by
379 % |;|. Of those, only a few are needed so they are picked out and everything
380 % else is dropped. There is one complication: there are a few cases in the
381 % data file of ranges which are marked by the descriptor |First| and a
382 % matching |Last|. A separate routine is used to handle these cases.
% Redefined parser for UnicodeData.txt (semicolon-separated fields).
% Blank source lines arrive tokenized as \par and are filtered out in
% lines not shown in this view.
384 \def\parseunicodedata#1{%
387 \expandafter\parseunicodedata@auxi#1\relax
% Grab the first nine ;-separated fields and re-order to
% code;category;name so the " First>" range test below can see the name.
390 \def\parseunicodedata@auxi#1;#2;#3;#4;#5;#6;#7;#8;#9;{%
391 \parseunicodedata@auxii#1;#3;#2 First>\relax
% If " First>" occurs in the name this line opens a range: branch to the
% range handler (auxv); otherwise continue with the single code point.
393 \def\parseunicodedata@auxii#1;#2;#3 First>#4\relax{%
395 \expandafter\parseunicodedata@auxiii
397 \expandafter\parseunicodedata@auxv
% Keep only the code point, general category and the two case mappings;
% all other fields are dropped.
401 \def\parseunicodedata@auxiii#1;#2;#3;#4;#5;#6;#7;#8\relax{%
402 \parseunicodedata@auxiv{#1}{#2}{#6}{#7}%
405 % At this stage we have only four pieces of data
407 % \item The code value
408 % \item The general class
409 % \item The uppercase mapping
410 % \item The lowercase mapping
412 % where one or both of the last two may be empty. Everything here could
413 % be done in a single conditional within a \cs{write}, but that would be
414 % tricky to follow. Instead, a series of defined auxiliaries are used to
415 % show the flow. Notice that combining marks are treated as letters here
416 % (the second `letter' test).
% Dispatch on the general category (#2): categories starting L (letters)
% and M (combining marks, the second `letter' test) are written as
% letters, everything else as a potentially cased non-letter.
418 \def\parseunicodedata@auxiv#1#2#3{%
419 \if L\firsttoken#2?\relax
420 \expandafter\unicodeletter
422 \if M\firsttoken#2?\relax
423 \expandafter\expandafter\expandafter\unicodeletter
425 \expandafter\expandafter\expandafter\unicodenonletter
431 % \changes{v1.0i}{2015/06/20}{Include first code point in a range in output}
432 % In the case where the first code point for a range was found, we
433 % assume the next line is the last code point (it always is). It's then
434 % a question of checking if the range is a set of letters or not, and if
435 % so going though them all and adding to the data file.
% Range handling: #1 is the first code point of a "First"/"Last" pair; the
% next line of the file is assumed to hold the matching last code point.
437 \def\parseunicodedata@auxv#1;#2;#3\relax{%
438 \read\unicoderead to \unicodedataline
439 \expandafter\parseunicodedata@auxvi\unicodedataline\relax#1;#2\relax
% Here #1 is the last code point and #3;#4 the first code point with its
% general category.
441 \def\parseunicodedata@auxvi#1;#2\relax#3;#4\relax{%
442 \if L\firsttoken#4?\relax
% Loop (driven by lines not shown here) while \count@ has not passed "#1,
% writing each code point as a caseless letter (upper = lower = itself).
446 \unless\ifnum\count@>"#1 %
447 \sethex\temp{\count@}%
448 \unicodeletter\temp\temp\temp
463 % \changes{v1.0g}{2015/03/26}{Add missing \cs{global} in definition of \cs{C}}
464 % \changes{v1.0j}{2015/08/05}{Compress data for caseless letters}
465 % \changes{v1.0j}{2015/08/05}{Save some space by dropping end-of-line
467 % \begin{macro}{\unicodeletter, \unicodenonletter}
468 % \begin{macro}{\writeunicodedatafull}
469 % \begin{macro}{\writeunicodedatacompact}
470 % For `letters', we always want to write the data to file, and the only
471 % question here is if the character has case mappings or these point back
472 % to the character itself. If there are no mappings or the mappings are
473 % all equivalent to the same code point then use a shorter version of
% Write one letter: #1 code point, #2 uppercase, #3 lowercase mapping.
% If both mappings are absent or map back to #1 itself, the compact
% caseless form \l is used, otherwise the full \L entry.
476 \def\unicodeletter#1#2#3{%
478 \writeunicodedatacompact\l{#1}%
% NOTE(review): an empty #2/#3 makes \ifx\relax...\relax true, in which
% case the code point itself stands in for the mapping; interleaved lines
% are elided here, so verify the branch structure against the full source.
481 \ifnum"#1="\ifx\relax#2\relax#1 \else#2 \fi\else1\fi
482 \ifnum"#1="\ifx\relax#3\relax#1 \else#3 \fi\else1\fi
484 \writeunicodedatafull\L{#1}{#2}{#3}%
486 \writeunicodedatacompact\l{#1}%
491 % Cased non-letters can also exist: they can be detected as they have at
492 % least one case mapping. Write these in much the same way as letters,
493 % but always with a full mapping (must be the case to require the entry
% Cased non-letters (those with at least one case mapping) are written
% with the full \C form; the detection tests live in lines not shown here.
496 \def\unicodenonletter#1#2#3{%
499 \writeunicodedatafull\C{#1}{#2}{#3}%
503 % Actually write the data. In all cases both upper- and lower-case mappings
504 % are given, so there is a need to test that both were actually available and
505 % if not set up to do nothing. Cases where both mappings are no-ops will
506 % already have been filtered out and are written in a shorter form: this
507 % saves a significant amount of space in the file.
% Write a full entry: #1 is the runtime macro (\L or \C), #2 the code
% point, #3/#4 the upper/lower mappings (defaults handled in elided lines).
509 \def\writeunicodedatafull#1#2#3#4{%
510 \immediate\write\unicodewrite{%
% Compact entry for caseless letters: just the macro (\l) and code point,
% saving significant space in the generated file.
526 \def\writeunicodedatacompact#1#2{%
527 \immediate\write\unicodewrite{%
538 % There is now a lead-in section which creates the macros which take the
539 % processed data and do the code assignments. Everything is done within a
540 % group so that there is no need to worry about names.
546 % Cased non-letters simply need to have the case mappings set.
547 % For letters, there are a few things to sort out. First, the case mappings are
548 % defined as for non-letters. Category code is then set to $11$ before a check
549 % to see if this is an upper case letter. If it is then the \cs{sfcode} is set
550 % to $999$. Finally there is a need to deal with Unicode math codes, where base
551 % plane letters are class $7$ but supplementary plane letters are class~$1$.
552 % Older versions of Xe\TeX{} used a different name here: easy to pick up as
553 % we know that this primitive must be defined in some way. There is also an issue
554 % with the supplementary plane and older Xe\TeX{} versions, which is dealt with
555 % using a check at run time.
% Runtime code (copied into unicode-letters.def): the body of \L sets the
% case mappings, letter catcode, the \sfcode 999 for uppercase letters,
% and the Unicode math code (class 7 for base-plane letters; the class
% used in the other branch differs -- surrounding conditional lines are
% not shown here).
560 \global\uccode"#1="#2 %
561 \global\lccode"#1="#3 %
565 \global\catcode"#1=11 %
568 \global\sfcode"#1=999 %
571 \global\Umathcode"#1="7"01"#1 %
573 \global\Umathcode"#1="0"01"#1 %
% Caseless letter: expand to the full form with identical case mappings.
576 \def\l#1 {\L#1 #1 #1 }
% Older XeTeX releases provided \XeTeXmathcode instead of \Umathcode.
577 \ifx\Umathcode\undefined
578 \let\Umathcode\XeTeXmathcode
581 \ifx\XeTeXversion\undefined
% Parse the decimal part of \XeTeXrevision: sufficiently new revisions
% (> 996 in this scheme) are fine, otherwise skip the rest of the file by
% gobbling up to \endgroup.
583 \def\XeTeXcheck.#1.#2-#3\relax{#1}
584 \ifnum\expandafter\XeTeXcheck\XeTeXrevision.-\relax>996 %
589 \long\def\XeTeXcheck##1\endgroup{\endgroup}
590 \expandafter\XeTeXcheck
597 % Read the data and write the resulting code assignments to the file.
% Feed UnicodeData.txt line by line through the parser defined above.
599 \openin\unicoderead=UnicodeData.txt %
600 \loop\unless\ifeof\unicoderead
601 \read\unicoderead to \unicodedataline
602 \parseunicodedata\unicodedataline
605 % End the group for setting character codes and assign a couple of special
% Right single/double quotation marks (U+2019, U+201D) get \sfcode 0 so
% they do not alter the current space factor.
610 \global\sfcode"2019=0 %
611 \global\sfcode"201D=0 %
615 % \section{Xe\TeX{} Character classes}
617 % The Xe\TeX{} engine includes the concept of character classes, which allow
618 % insertion of tokens into the input stream at defined boundaries. Setting
619 % up this data requires a two-part process as the information is split over
622 % \begin{macro}{\parseunicodedata}
623 % \begin{macro}{\parseunicodedata@auxi}
624 % \begin{macro}{\parseunicodedata@auxii}
625 % The parsing system is redefined to parse a detokenized input line which
626 % may be a comment starting with |#|. Assuming that is not the case, the
627 % data line will start with a code point potentially forming part of a range.
628 % The range is extracted and the width stored for each code point.
% Parser for EastAsianWidth.txt: skip comment lines (leading #, compared
% via a catcode-12 # from \string\#), then process data lines.
630 \def\parseunicodedata#1{%
633 \if\expandafter\gobble\string\#\expandafter\firsttoken#1?\relax
635 \expandafter\parseunicodedata@auxi#1\relax
% Split off the width class (#2); append "...." so the ".." range split
% below always matches, even for a single code point.
639 \def\parseunicodedata@auxi#1;#2 #3\relax{%
640 \parseunicodedata@auxii#1....\relax{#2}%
% Store the width class under EAW@<hex> for the first code point, then (in
% lines not shown) loop over the remainder of the range doing the same.
642 \def\parseunicodedata@auxii#1..#2..#3\relax#4{%
643 \expandafter\gdef\csname EAW@#1\endcsname{#4}%
651 \sethex\temp{\count@}%
652 \expandafter\gdef\csname EAW@\temp\endcsname{#4}%
662 % With the right parser in place, read the data file.
% Read the width data with \readline so #, ; and the like arrive with
% catcode 12 ("other") and are safe to parse.
664 \openin\unicoderead=EastAsianWidth.txt %
665 \loop\unless\ifeof\unicoderead
666 \readline\unicoderead to \unicodedataline
667 \parseunicodedata\unicodedataline
671 % \changes{v1.0j}{2015/08/05}{Compress East Asian width data by recording
672 % ranges for code points of type \texttt{ID}}
673 % \begin{macro}{\parseunicodedata@auxii}
674 % \begin{macro}{\parseunicodedata@auxiii}
675 % \begin{macro}{\parseunicodedata@auxiv}
676 % \begin{macro}{\parseunicodedata@auxv}
684 % The final file to read, |LineBreak.txt|, uses the same format as
685 % |EastAsianWidth.txt|. As such, only the final parts of the parser have to be
686 % redefined. The first stage here is to check if the line breaking class
687 % is known, and if so if it is equal to |ID| (class one).
% LineBreak.txt second stage: #4 is the line breaking class. Only classes
% with a macro defined here matter; class ID (value \@ne) ranges go to
% auxiii, other known classes to auxiv, everything else is gobbled.
689 \def\parseunicodedata@auxii#1..#2..#3\relax#4{%
690 \ifcsname #4\endcsname
691 \ifnum\csname #4\endcsname=\@ne
692 \expandafter\expandafter\expandafter\parseunicodedata@auxiii
694 \expandafter\expandafter\expandafter\parseunicodedata@auxiv
697 \expandafter\gobblethree
702 % For ranges of class \texttt{ID}, the entire range is written to the data
703 % file as a single block: no need to check on the width data.
% Whole ID ranges are written as one entry: the class macro name followed
% by both range ends (for a single code point, the end repeats the start).
705 \def\parseunicodedata@auxiii#1#2#3{%
706 \immediate\write\unicodewrite{%
708 \expandafter\string\csname #3\endcsname
710 #1 \ifx\relax#2\relax#1\else#2\fi
714 % For other cases, loop over each code point separately. If the
715 % code point is of width |F|, |H| or |W| then the line breaking
716 % property is written to the data file. The earlier check means that
717 % this only happens for characters of classes \texttt{OP} (opener),
718 % \texttt{CL} (closer), \texttt{EX} (exclamation), \texttt{IS} (infix sep),
719 % \texttt{NS} (non-starter) and \texttt{CM} (combining marks) characters
720 % (the latter need to be transparent to the mechanism).
% For other classes, handle each code point of the range individually.
722 \def\parseunicodedata@auxiv#1#2#3{%
723 \parseunicodedata@auxv{#1}{#3}%
731 \sethex\temp{\count@}%
732 \expandafter\parseunicodedata@auxv\expandafter{\temp}{#3}%
% Only write an entry when the code point's East Asian width is F (full),
% H (half) or W (wide): each matching \if contributes a "1" to a test
% whose remainder lives in lines not shown here.
737 \def\parseunicodedata@auxv#1#2{%
739 \if F\csname EAW@#1\endcsname 1\fi
740 \if H\csname EAW@#1\endcsname 1\fi
741 \if W\csname EAW@#1\endcsname 1\fi
743 \immediate\write\unicodewrite{%
745 \expandafter\string\csname #2\endcsname
752 % The East Asian width class mappings.
774 % \changes{v1.0h}{2015/05/11}{Apply category codes to East Asian
775 % chars in all cases}
776 % \changes{v1.0j}{2015/08/05}{Move the stop point for Lua\TeX{}}
777 % \changes{v1.0j}{2015/08/05}{Only set \cs{XeTeXcharclass} from
778 % East Asian char data}
779 % \changes{v1.0k}{2015/08/06}{Add missing \cs{endgroup}}
780 % \changes{v1.0l}{2015/08/10}{Add missing \cs{endgroup}}
781 % Before actually reading the line breaking data file, the appropriate
782 % temporary code is added to the output. As described above, only a limited
783 % number of classes need to be covered: they are hard-coded as classes
784 % $1$, $2$ and $3$ following the convention adopted by plain Xe\TeX{}.
% Runtime code for the generated file: if the character class mechanism is
% unavailable (both control sequences undefined, hence \ifx-equal), stop
% reading the remainder of the file.
788 \ifx\XeTeXchartoks\XeTeXcharclass
789 \endgroup\expandafter\endinput
% Assign class #3 to every code point from #1 up to #2, recursing with
% #1 + 1; the termination test is in lines not shown here.
791 \def\setclass#1#2#3{%
795 \expandafter\firstofone
798 \global\XeTeXcharclass#1=#3 %
799 \expandafter\setclass\expandafter
800 {\number\numexpr#1+1\relax}{#2}{#3}%
804 \def\firstofone#1{#1}
% Data-entry macros: ID = class 1 (takes a range), OP = class 2 (openers),
% CL/EX/IS/NS = class 3, CM = class 256 (transparent combining marks).
805 \def\ID#1 #2 {\setclass{"#1}{"#2}{1}}
806 \def\OP#1 {\setclass{"#1}{"#1}{2}}
807 \def\CL#1 {\setclass{"#1}{"#1}{3}}
808 \def\EX#1 {\setclass{"#1}{"#1}{3}}
809 \def\IS#1 {\setclass{"#1}{"#1}{3}}
810 \def\NS#1 {\setclass{"#1}{"#1}{3}}
811 \def\CM#1 {\setclass{"#1}{"#1}{256}}
816 % Read the line breaking data and save to the output.
% Feed LineBreak.txt through the parser defined above.
818 \openin\unicoderead=LineBreak.txt %
819 \loop\unless\ifeof\unicoderead
820 \readline\unicoderead to \unicodedataline
821 \parseunicodedata\unicodedataline
825 % \changes{v1.0a}{2015/03/25}{Use \cs{hskip} rather than \cs{hspace}
827 % Set up material to be inserted between character classes, matching
828 % that provided by plain Xe\TeX{}. Using \cs{hskip} here means the code will
829 % work with plain as well as \LaTeXe{}.
% Runtime inter-class material: stretch-only glue between CJK characters
% and a breakable inter-word-like space next to class 0 (everything else).
833 \gdef\xtxHanGlue{\hskip0pt plus 0.1em\relax}
834 \gdef\xtxHanSpace{\hskip0.2em plus 0.2em minus 0.1em\relax}
% Class 0 boundary cases; \nobreak forbids a break where one would be
% typographically wrong (before closers, after openers).
835 \global\XeTeXinterchartoks 0 1 = {\xtxHanSpace}
836 \global\XeTeXinterchartoks 0 2 = {\xtxHanSpace}
837 \global\XeTeXinterchartoks 0 3 = {\nobreak\xtxHanSpace}
838 \global\XeTeXinterchartoks 1 0 = {\xtxHanSpace}
839 \global\XeTeXinterchartoks 2 0 = {\nobreak\xtxHanSpace}
840 \global\XeTeXinterchartoks 3 0 = {\xtxHanSpace}
% CJK-to-CJK pairs: glue only, again with \nobreak guarding the forbidden
% break positions.
841 \global\XeTeXinterchartoks 1 1 = {\xtxHanGlue}
842 \global\XeTeXinterchartoks 1 2 = {\xtxHanGlue}
843 \global\XeTeXinterchartoks 1 3 = {\nobreak\xtxHanGlue}
844 \global\XeTeXinterchartoks 2 1 = {\nobreak\xtxHanGlue}
845 \global\XeTeXinterchartoks 2 2 = {\nobreak\xtxHanGlue}
846 \global\XeTeXinterchartoks 2 3 = {\xtxHanGlue}
847 \global\XeTeXinterchartoks 3 1 = {\xtxHanGlue}
848 \global\XeTeXinterchartoks 3 2 = {\xtxHanGlue}
849 \global\XeTeXinterchartoks 3 3 = {\nobreak\xtxHanGlue}
853 % Done: end the script.