Update LuaTeX testfiles for ^@ change
[latex2e.git] / latex2e-20170101 / doc / encguide.tex
blob96ed7edf6f48f5de2292fd66c0977fa5abb704a4
1 % \iffalse meta-comment
3 % Copyright 1993-2016
4 % The LaTeX3 Project and any individual authors listed elsewhere
5 % in this file.
6 %
7 % This file is part of the LaTeX base system.
8 % -------------------------------------------
9 %
10 % It may be distributed and/or modified under the
11 % conditions of the LaTeX Project Public License, either version 1.3c
12 % of this license or (at your option) any later version.
13 % The latest version of this license is in
14 % http://www.latex-project.org/lppl.txt
15 % and version 1.3c or later is part of all distributions of LaTeX
16 % version 2005/12/01 or later.
18 % This file has the LPPL maintenance status "maintained".
20 % The list of all files belonging to the LaTeX base distribution is
21 % given in the file `manifest.txt'. See also `legal.txt' for additional
22 % information.
24 % The list of derived (unpacked) files belonging to the distribution
25 % and covered by LPPL is defined by the unpacking scripts (with
26 % extension .ins) which are part of the distribution.
28 % \fi
32 \NeedsTeXFormat{LaTeX2e}[1995/12/01]
34 \documentclass{ltxguide}[1994/11/20]
36 \usepackage[T1]{fontenc}
37 \IfFileExists{lmodern.sty}{\usepackage{lmodern}}{}
38 \usepackage{textcomp}
39 \usepackage{url}
40 \usepackage{mflogo}
42 \addtolength\textheight{6\baselineskip}
43 \addtolength\topmargin{-2\baselineskip}
% \ttverb<token>: apply \string to the (first token of the) argument and set
% the result in typewriter type.
46 \newcommand\ttverb[1]{\texttt{\string#1}}
49 % \Enc{<name>}: typeset the name of an encoding in typewriter type
50 \providecommand{\Enc}[1]{\texttt{#1}}
52 % \Pkg{<name>}: typeset the name of a package in sans serif type
53 \providecommand{\Pkg}[1]{%
54 \textsf{#1}}
56 % \File{<name>}: typeset a file name in typewriter type
57 \providecommand{\File}[1]{%
58 \texttt{#1}}
60 % \meta{<text>}: typeset a meta value as emphasized <text> between angle brackets
61 \providecommand{\meta}[1]{%
62 \ensuremath{\langle}\emph{#1}\ensuremath{\rangle}}
64 \usepackage{tabularx}
66 % An environment for presenting the details of an encoding.
68 % Arguments:
69 % #1: name in LaTeX (e.g., OT1)
70 % #2: name of the encoding (e.g., TeX text)
71 % #3: name of the author (e.g., Don Knuth)
72 % #4: range of glyph indices (slots) used
73 % #5: variable positions
74 % #6: example font (if empty, a dash is printed instead)
75 % #7: reference
77 % XXX add code to handle more than a single font example (e.g., larm1000,
78 % lbrm1000, and lcrm1000).
80 \newenvironment{encodinginfo}[7]%
81 {\noindent
82 \begin{tabularx}{\linewidth}{@{}l>{\raggedright\let\\\tabularnewline}X}%
83 \LaTeX{} name: & \texttt{#1}\\%
84 Public name: & #2\\%
85 Author: & #3\\%
86 Glyph slots used: & #4\\%
87 Variable slots: & #5\\%
88 Font example: & \def\@tempa{#6}\ifx\@tempa\@empty---%
89 \else\texttt{#6}\referenceftable{#6}\fi\\%
90 Further reference: & #7%
91 \end{tabularx}%
92 \par\nobreak
93 \vspace*{3pt}%
94 \quote
96 {\endquote
97 \vspace{6pt}}
99 \makeatletter
% \referenceftable{<font>}: if the label fonttable:<font> is known (i.e.,
% \r@fonttable:<font> is defined), add a reference to the page that carries the
% encoding table of <font>; otherwise add nothing (\relax).
100 \def\referenceftable#1{
101 \@ifundefined{r@fonttable:#1}%
102 \relax
103 {;\space encoding table on page~\pageref{fonttable:#1}}%
106 % font table macros mainly lifted from manmac.tex
% \oct{<digits>}: label box holding a roman \'{} mark followed by <digits> in italic
% \hex{<digits>}: label box holding a roman \H{} mark followed by <digits> in typewriter
107 \def\oct#1{\hbox{\rm\'{}\kern-.2em\it#1\/\kern.05em}}
108 \def\hex#1{\hbox{\rm\H{}\tt#1}}
% \oddline{<d>}: end the current row, rule off the first 19 columns, and put
% the label \hex{<d>x} (lowered by 2.3pt and smashed) in the following column.
110 \def\oddline#1{\cr\noalign{\nointerlineskip}
111 \multispan{19}\hrulefill&
112 \setbox0=\hbox{\lower 2.3pt\hbox{\hex{#1x}}}\smash{\box0}\cr
113 \noalign{\nointerlineskip}}
% \evenline: end the current row and draw a rule below it.
114 \def\evenline{\cr\noalign{\hrule}}
% \chartstrut: empty box of height 14pt lowered by 4.5pt; it is placed in the
% first column of the \halign template set up by \beginchart.
115 \def\chartstrut{\lower4.5pt\vbox to14pt{}}
% \beginchart#1#2: open a display, reset \count@ globally to 0, execute #1
% (\ftable passes the font selector \X here), and start an \halign spread to
% \hsize whose header row shows #2 followed by the column labels \oct0 ... \oct7.
116 \def\beginchart#1#2{$$\global\count@=0 #1
117 \halign to\hsize\bgroup
118 \chartstrut##\tabskip0pt plus10pt&
119 &\hfil##\hfil&\vrule##\cr
120 \lower6.5pt\null
121 &#2&&\oct0&&\oct1&&\oct2&&\oct3&&\oct4&&\oct5&&\oct6&&\oct7&\evenline}
% \endchart: add the bottom row of column labels \hex 8 ... \hex F, then close
% the \halign and the display opened by \beginchart.
122 \def\endchart{\raise11.5pt\null&&&\hex 8&&\hex 9&&\hex A&&\hex B&
123 &\hex C&&\hex D&&\hex E&&\hex F&\cr\egroup$$}
% \: boxes the character in slot \count@ of the current font (between
% \noboundary primitives), calls \reposition if the box is taller than 7.5pt
% or deeper than 2.5pt, outputs the box, and globally advances \count@ by one.
124 \def\:{\setbox0=\hbox{\noboundary\char\count@\noboundary}%
125 \ifdim\ht0>7.5pt\reposition
126 \else\ifdim\dp0>2.5pt\reposition\fi\fi
127 \box0\global\advance\count@ by1 }
% \reposition: re-box box0 inside a \vcenter with 2pt kerns above and below.
128 \def\reposition{\setbox0=\hbox{$\vcenter{\kern2pt\box0\kern2pt}$}}
% \normalchart: the sixteen chart rows labelled octal 00x--17x, each with eight
% \: cells, followed by \top (which \ftable lets to \tophalf or \notophalf).
129 \def\normalchart{%
130 &\oct{00x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline0
131 &\oct{01x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
132 &\oct{02x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline1
133 &\oct{03x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
134 &\oct{04x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline2
135 &\oct{05x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
136 &\oct{06x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline3
137 &\oct{07x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
138 &\oct{10x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline4
139 &\oct{11x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
140 &\oct{12x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline5
141 &\oct{13x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
142 &\oct{14x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline6
143 &\oct{15x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
144 &\oct{16x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline7
145 &\oct{17x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
146 \top}
% \notophalf: empty; the value \ftable gives to \top when the upper-half
% slots of the font produce no width.
148 \def\notophalf{}
% \tophalf: the sixteen chart rows labelled octal 20x--37x, each with eight
% \: cells; the value \ftable gives to \top otherwise.
149 \def\tophalf{%
150 %\noalign{\vskip 5pt\hrule}
151 &\oct{20x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline8
152 &\oct{21x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
153 &\oct{22x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline9
154 &\oct{23x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
155 &\oct{24x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline A
156 &\oct{25x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
157 &\oct{26x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline B
158 &\oct{27x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
159 &\oct{30x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline C
160 &\oct{31x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
161 &\oct{32x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline D
162 &\oct{33x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
163 &\oct{34x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline E
164 &\oct{35x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
165 &\oct{36x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline F
166 &\oct{37x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline}
% \ftable{<font>}{<text>}: print the glyph chart of font #1.
% The font is loaded as \X while in \batchmode (presumably so that a missing
% font does not stop the run); if \X equals \nullfont a warning is issued and
% no table is produced.  Otherwise the characters in slots "80 to "FF are set
% in box0: if that box has positive width \top becomes \tophalf (upper-half
% rows are printed), else \notophalf.  The chart title shows #1 followed by
% #2 in \large and carries \label{fonttable:#1}, the label that
% \referenceftable looks up.
168 \def\ftable#1#2{%
169 \batchmode
170 \font\X=#1%
171 \errorstopmode
172 \ifx\X\nullfont
173 \@warning{Font #1 not found, table omitted}
174 \else
175 \count@="80
176 \setbox0=\hbox{\X
177 \loop\char\count@\advance\count@ by1 \ifnum\count@<"100
178 \repeat}%
179 \ifdim\wd0>0pt \let\top\tophalf\else\let\top\notophalf\fi
180 \beginchart\X{\hfill\llap{\textbf{#1, \large#2}\label{fonttable:#1}}}\normalchart
181 \endchart\par\vfill
182 \fi}
183 \makeatother
186 \setcounter{tocdepth}{3}
188 \title{\LaTeX{} font encodings}
190 \author{Frank Mittelbach \and Robin
191 Fairbairns \and Werner Lemberg \and \LaTeX3 Project Team.}
193 \date{\copyright~Copyright 1995--2016 \\[5pt] 18 February 2016}
195 \begin{document}
197 \maketitle
199 \tableofcontents
201 \section{Introduction}
203 This document explains the ideas that underpin \LaTeX{} font
204 encodings and the constraints that apply when defining a new encoding; it
205 also lists the encodings that have already been defined.
207 \subsection{Encodings in \TeX{}}
209 \TeX{} (the program) implicitly recognises three sorts of encoding,
210 and all are (in a sense) discussed in the \TeX{}book~\cite{A-W:DKn86}:
211 \begin{itemize}
212 \item[1.] The input encoding, which specifies the meanings of characters
213 in files presented to \TeX{} for processing. The \TeX{}book
214 suggests that `your version of \TeX{} will recognise the characters
215 you type on your keyboard' (\TeX{} the program has provision for
216 static translations of input characters).
217 \end{itemize}
218 Such direct use of \TeX{}'s facilities is not the way modern
219 \LaTeX{} (or indeed any other \TeX{} macro package) is likely to deal
220 with input encodings. This document does not address the topic of
221 input encodings; the interested reader should examine the \LaTeX{}
222 base package \Pkg{inputenc} \cite[sec.~7.5.2, p.~357]{A-W:MG2004}.
223 \begin{itemize}
224 \item[2.] The token stream that \TeX{} processes internally. This stream
225 of \TeX{}'s consciousness is discussed in great detail in the
226 \TeX{}book.
227 \end{itemize}
228 Again, this document does not address the topic. \LaTeX's internal
229 character representation (\textsc{licr}) is well discussed in
230 \cite[sec.~7.11.2, p.~442]{A-W:MG2004}.
231 \begin{itemize}
232 \item[3.] The font encoding---i.e., the mapping of character codes to
233 glyphs in the fonts that are used to typeset \TeX{}'s output.
234 Again, a set of font encodings is enumerated in the \TeX{}book, but
235 that set has proved inadequate to the needs of modern multilingual
236 use of \LaTeX.
237 \end{itemize}
238 This document explains \emph{why} Knuth's original set of encodings is
239 inadequate to modern conditions, and discusses the issues that
240 surround the design and definition of new font encodings.
242 Font encodings are important for more than their r\^{o}le in mapping the
243 glyphs of the fonts to be used for typesetting: their glyph tables are
244 also the context in which \TeX{}'s hyphenation algorithm operates.
245 There are constraints imposed by \TeX{} that affect the way in which
246 new font encodings, for use in a multi-lingual environment, may be
247 structured (see section~\ref{sec:restrictions} for details).
249 \subsection{The history of \TeX{} font encodings}
251 Little attention was paid to font encodings prior to the arrival of
252 \TeX{}\,3. Up to that time, one used Donald Knuth's fonts (the
253 Computer Modern family, using the encodings we now refer to as \Enc{OT1} and
254 the \Enc{OM} series), or one was on one's own.
256 The Computer Modern text encoding raises problems in unmodified
257 \TeX{}, because hyphenation cannot break words containing
258 \verb"\accent" commands. Even in those Western European languages for
259 which the \Enc{OT1} encoding has symbols for the necessary
260 \verb"\accent"-based diacritics, this shortcoming ruins typesetting of
261 running text.
263 With the advent of \TeX{}\,3, with its ability to switch between
264 hyphenation pattern sets, it was clear that the situation could not
265 continue. Thus a group at the TUG Annual General Meeting in Cork,
266 Ireland, specified a uniform encoding for 256-glyph fonts, that
267 contains accented letters and non-\textsc{ascii} letters necessary to
268 express most Western European languages (and some Eastern European ones)
269 without recourse to the \verb"\accent" command.
271 This ``Cork'' encoding has since been realised in a series of fonts
272 designed with Metafont, in at least one font series that is available
273 both in Adobe Type 1 format and in OpenType format, % viz., Latin Modern
274 and in a number of virtual-font mappings of other font series.
276 Since the time of the Cork meeting, much effort has been devoted to
277 the design of encodings for text fonts to use with \TeX{}, and the
278 Cork encoding influenced the design of many such encodings.
280 Encodings for mathematical fonts have, in contrast, changed little
281 since Knuth's contributions. A TUG Technical Working Group was
282 established at the Cork meeting, whose aim was to define a set of
283 256-glyph encodings to regularise and extend Knuth's originals, using
284 ideas from several other fonts that had appeared since, and from the
285 known needs of researchers in mathematics and the mathematical sciences.
287 Independently, a first proposal (the so-called \emph{Aston proposal}) was worked
288 out by Justin Ziegler together with Frank Mittelbach and other members of the
289 \LaTeX3 project team~\cite{ziegler}. A first implementation of
290 this proposal was realized by Matthias Clasen and Ulrik
291 Vieth~\cite{clasen,clasen-vieth}.
293 However, the slow progress of these Mathematical encodings has been
294 overtaken by the addition (in the last decade or so) of a large number
295 of mathematical symbols to Unicode~\cite{beeton}; one can expect
296 further changes so that new public mathematical font encodings will
297 most likely be delayed still further.
301 \subsection{Further information}
303 For a general introduction to \LaTeX, including the new features of
304 \LaTeXe, you should read \emph{\LaTeXbook},
305 Leslie Lamport, Addison Wesley, 2nd~ed, 1994.
307 A more detailed description of the new features of \LaTeX, including an
308 overview of more than 200 packages and nearly 1000 ready to run examples, is
309 to be found in \emph{\LaTeXcomp{} second edition} by Frank Mittelbach and
310 Michel Goossens~\cite{A-W:MG2004}.
312 The \LaTeX{} project sponsored a report on Mathematical % spelt out in full
313 font encodings, which
314 is worth reading for its insight into the problems of defining the way
315 in which math is used: see~\cite{ziegler,clasen,clasen-vieth}.
317 The \LaTeX{} font selection scheme is based on \TeX, which is described
318 by its developer in \emph{The \TeX book}, Donald E.~Knuth, Addison
319 Wesley, 1986, revised in 1991 to include the features of \TeX~3.
321 For more information about \TeX{} and \LaTeX, please contact your local
322 \TeX{} Users Group, or the international \TeX{} Users Group
323 (\url{http://www.tug.org}).
327 \section{Existing font encodings}
329 This section lists the encodings currently assigned; for each
330 encoding, we list the registered (\LaTeX{}) name, the assigned purpose
331 of the encoding, and the author. Further details may list the code
332 positions used in the encoding, the \emph{variable slots} (see below),
333 an example font (for which a listing will be provided later in the
334 document if the relevant fonts are present), and a source for further
335 reference.
337 While the characteristic feature of an encoding is that each font
338 encoded according to the encoding should have the same glyph set,
339 there are some encodings (notably \Enc{OT1} and its descendants) in
340 which a few glyph code slots differ in their contents in different
341 fonts.
343 \subsection{Naming conventions}
345 Names for encoding schemes are strings of up to three letters (all
346 upper case) plus digits.
348 The \LaTeX3 project reserves the use of encoding names starting with the
349 following letters: |T| (standard 256-long text encodings), |TS|
350 (symbols that are designed to extend a corresponding |T| encoding),
351 |X| (text encodings that do not conform to the strict requirements for
352 |T| encodings), |M| (standard 256-long mathematical encodings), |S| (other
353 symbol encodings), |A| (other special applications), |OT| (standard
354 128-long text encodings), and |OM| (standard 128-long mathematical encodings).
356 Please do not use the above starting letters for non-portable
357 encodings. If new standard encodings emerge then we shall add them in
358 a later release of \LaTeX.
360 Encoding schemes which are local to a site or a system should start
361 with |L|, experimental encodings intended for wide distribution will
362 start with |E|, whilst |U| is for Unknown or Unclassified encodings.
364 \begin{quote}
365 \itshape We recommend that new encoding names should not be
366 introduced unless careful consideration and discussion in the user
367 community has confirmed the need for the encoding. If encodings have to
368 change from font to font, a number of problems arise, so it is best to
369 develop encodings that can be used with a large number of fonts in parallel.
370 This allows documents to be typeset using different fonts without problems.
372 The \Enc{TS1} encoding is a good example of a \emph{bad} encoding (even
373 though it was developed with the best intentions) as a huge number of fonts
374 can only implement parts of it. Similarly, the fact that the few sets of
375 available mathematical fonts (besides Computer Modern Math) nearly
376 all implement slightly different encodings is a huge source of
377 problems. Don't add to this if possible!
378 \end{quote}
381 \subsection{128$^+$ glyph encodings (text)}
383 The `OT' series of font encodings starts with Donald Knuth's original
384 text encoding, that used for the text fonts in the earliest releases
385 of \TeX{} itself. The `O' of the encoding designator may be taken as
386 signifying `original', or just `old'.
388 \begin{encodinginfo}{OT1}
389 {\TeX{} text}
390 {Donald Ervin Knuth}
391 {0x00--0x7F}
392 {0x0B--0x0F, 0x24, 0x3C, 0x3E, 0x5C, 0x7B--0x7D}
393 % {0X--'177}
394 % {'13--'17, '44, '74, '76, '134, '173--'175}
395 {cmr10}
396 {\cite[p.427]{A-W:DKn86}}
398 Donald Knuth designed his font encoding (and hence his fonts) in a
399 very different environment from that which now pervades the \TeX{}
400 world: his (mainframe) computer had very little memory, there was
401 little experience in (or demand for) multilingual technical
402 typesetting, and as a result it was appropriate to sacrifice
403 uniformity for efficiency.
405 Thus Knuth's original fonts differ slightly in some encoded slots:
406 for example, the glyphs \texttt{\string<}, \texttt{\string>},
407 \verb=\=, \verb={=, and \verb=}= are only available in the
408 typewriter fonts and the \textdollar{} and \textsterling{} signs
409 share the same position (in different font shapes).
411 This means that direct selection of these slots can produce
412 unpredictable results, e.g., typing \texttt{\string<} or
413 \verb=\symbol{'74}= in a document can yield `\textquestiondown'.
414 \end{encodinginfo}
417 \begin{encodinginfo}{OT2}
418 {UW cyrillic encoding}
419 {University of Washington}
420 {0x00--0x7F}
421 {---}
422 {wnr10}
423 {\cite{Beeton:TB6-3-124}}
424 Support for this encoding is available in the Cyrillic bundle although for
425 all practical purposes it is better to use one of the \Enc{T2} encodings.
426 \end{encodinginfo}
429 \begin{encodinginfo}{OT3}
430 {UW IPA encoding}
431 {University of Washington}
432 {0x00--0x7F}
433 {---}
434 {wsuipa10}
435 {\cite[p.149]{CorkGW:91}}
436 The \Enc{OT3} encoding was never really used with \LaTeXe{}
437 following the introduction of the TIPA system which offers much
438 better support for IPA. In particular, no \File{ot3enc.def}
439 file was ever produced.
440 \end{encodinginfo}
443 \begin{encodinginfo}{OT4}
444 {Polish text encoding}
445 {B.~Jackowski and M.~Ry\'cko} %% ? Marcin Woli\'nski
446 {0x00--0x7F, 0x81, 0x82, 0x86, 0x8A, 0x8B, 0x91, 0x99, 0x9B, 0xA1,
447 0xA2, 0xA6, 0xAA, 0xAB, 0xAE, 0xAF, 0xB1, 0xB9, 0xBB, 0xD3, 0xF3,
448 0xFF}
449 {0x0B--0x0F, 0x24, 0x3C, 0x3E, 0x5C, 0x7B--0x7D}
450 {plr10}
451 {---}
453 While Knuth included the means of typesetting the `lost L' (\L) in
454 his \Enc{OT1} encoding, he omitted the ogonek (\,\,\k{}), a diacritic
455 mark that is also needed in Polish text; hence the appearance, well
456 before the \Enc{T1} encoding, of fonts using this encoding.
457 \end{encodinginfo}
459 \begin{encodinginfo}{OT5}
460 {Not currently allocated}
461 {---}
462 {---}
463 {---}
465 {---}
467 \end{encodinginfo}
471 \begin{encodinginfo}{OT6}
472 {Armenian text encoding}
473 {Serguei Dachian}
474 {0x03--0x0F, 0x13--0x7F}
475 {---}
476 {artmr10}
477 {---}
479 This encoding was allocated to permit use of Dachian's
480 Armenian fonts in a standard \LaTeX{} environment.
482 Because of license issues the \texttt{artmr} fonts are not necessarily
483 included in distributed \TeX{} installations (and for this reason the
484 corresponding encoding table is not shown below). However, the fonts
485 and the support macros can be found on the CTAN archives (look for
486 \texttt{armtex}).
488 \end{encodinginfo}
492 \subsection{256 glyph encodings (text)}
494 \begin{encodinginfo}{T1}
495 {Cork encoding}
496 {Euro \TeX{} conference at Cork}
497 {0x00--0xFF}
498 {---}
499 {ecrm1000}
500 {\cite[p.514]{tub:MFe90}, \cite[p.99]{Knappen:TB17-2-96}}
502 The Cork encoding was developed so that advantage could be taken of
503 the (then) new facilities of \TeX{}\,3, allowing hyphenation of
504 most Western European (and some Eastern European) languages in an
505 unmodified version of \TeX{}.
507 The encoding was developed in the absence of any extant effort at
508 font design, but instances written in Metafont (the `EC' fonts), and
509 more recently Adobe Type 1 instances of the same fonts have become
510 available.
512 Substantial (but incomplete) instances have also been developed,
513 which use virtual fonts. These latter instances map either Knuth's
514 original (OT1-encoded) fonts, or commercial fonts that contain the
515 Adobe `standard' set of 224 glyphs.
516 \end{encodinginfo}
518 \begin{encodinginfo}
519 {T2A, T2B, T2C}
520 {Cyrillic encodings}
521 {The CyrTUG font team}
522 {0x00--0xFF}
523 {--- (within each encoding)}
524 {larm1000}
525 {\cite{Berdnikov:eurotex-98}}
527 There are too many glyphs in the full Cyrillic complement of
528 languages for all of them to be covered by a single
529 \LaTeX{}-compliant encoding (the lower half of each
530 \Enc{T2}~encoding is identical to that of \Enc{T1}, in order that
531 each should be a conforming \LaTeX{} encoding~--- see
532 section~\ref{sec:restrictions}). The approach taken is
533 therefore to develop a single encoding, \Enc{X2} (see section~\ref{sec:extendedenc})
534 which contains all the glyphs needed for the full set of
535 languages, and then to derive the three \LaTeX{}-compliant
536 \Enc{T2}-family encodings using the \Enc{X2} set together with that of
537 \Enc{T1}.
539 \end{encodinginfo}
543 \begin{encodinginfo}{T3}
544 {IPA encoding}
545 {FUKUI Rei, University of Tokyo}
546 {0x00--0xFF}
547 {---}
548 {tipa10}
549 {\cite[p.102]{Rei:TB17-2-102}}
552 The \Enc{T3} encoding (and associated macros) provides the glyphs required
553 in phonetic description according to current International Phonetic
554 Association recommendations \cite{ipa}.
556 The \Enc{T3} encoding does \emph{not fulfil} the requirements for \Enc{T}
557 encodings---the name is a historical accident. The correct name would be
558 \Enc{X3}, but due to the fact that this font family has been used under its
559 current encoding name for a long time, the name will not change for
560 compatibility reasons.
562 \end{encodinginfo}
566 \begin{encodinginfo}{T4}
567 {African Latin (fc)} % public name
568 {J\"org Knappen} % author name
569 {0x00--0xFF} % range(s) of slots used for glyphs
570 {0x24} % range(s) of slots with variable glyphs if any
571 {fcr10} % name of an example font
572 {\cite{tub:JKn93}}
574 The African Latin fonts contain in their lower half (0--127) the same
575 characters as the European Latin (T1-encoded) Fonts, while in their
576 upper half (128--255) they
577 contain letters and symbols for African languages that use extended
578 Latin alphabets.
579 Due to lack of space, J\"org had to play the unfortunate trick of
580 assigning \verb=\textdollar= and \verb=\textsterling=
581 the same position; users should take these characters
582 from the text companion font, if they are needed. Instead of defining
583 a lot of new control sequences for the single letters, there are three
584 accent-like control sequences with general purpose:
585 \verb=\m= (Modified-1),
586 \verb=\M= (Modified-2) and
587 \verb=\B= (Barred).
588 Most standard \LaTeX{} encoding-dependent commands
589 work. However, the Icelandic special letters are not available and `best
590 replacements' for \verb=\TH=, \verb=\th=, and \verb=\dh=
591 are used (barred T and d resp.).
592 \end{encodinginfo}
595 \begin{encodinginfo}{T5}
596 {Vietnamese encoding}
597 {Werner Lemberg and
598 Vladimir Volovich}
599 {0x00--0xFF}
600 {---}
601 {vnr10}
602 {\cite{vnr}}
604 The \Enc{T5} encoding was developed for Vietnamese. Again, this encoding
605 \emph{does not} conform to the requirements for a \Enc{T}-encoding
606 because its large number of accented letters prevent the \verb=\lccode= and
607 \verb=\uccode= mapping requirements for \Enc{T} encodings from being
608 fulfilled. However, since the Vietnamese language does not
609 use word division in typesetting, this requirement is
610 actually not important for this particular language.
611 Since every glyph used in Vietnamese text is internally
612 represented as \textsc{licr} macros, the commands \verb=\MakeUppercase= and
613 \verb=\MakeLowercase= still work as expected (as they change the case of the
614 \textsc{ascii} characters in \textsc{licr} definitions).
616 \end{encodinginfo}
618 \begin{encodinginfo}
619 {T6}
620 {Armenian}
621 {---}
622 {---}
623 {---}
625 {---}
627 This encoding is reserved to permit future expansion of Armenian
628 \TeX{} to use 256-character (hyphenatable) fonts.
629 \end{encodinginfo}
631 \begin{encodinginfo}{T7}
632 {Greek encoding}
633 {---}
634 {---}
635 {---}
637 {---}
639 The name is already reserved for a 256 glyph Greek encoding. The encoding
640 itself hasn't been defined so far.
642 \end{encodinginfo}
646 \subsection{256$^-$ glyph encodings (text symbols)}
648 \begin{encodinginfo}{TS1}
649 {Text Companion encoding (Cork)}
650 {J\"org Knappen}
651 {0x00--0x0D, 0x12, 0x15, 0x16, 0x18--0x1D, 0x20, 0x24, 0x27, 0x2A,
652 0x2C--0x3A, 0x3C--0x3E, 0x4D, 0x4F, 0x57, 0x5B, 0x5D--0x60,
653 0x62--0x64, 0x6C--0x6E, 0x7E--0xBF, 0xD6, 0xF6}
654 {---}
655 {tcrm1000}
656 {\cite{Knappen:TB17-2-96}}
658 The text symbol encoding offers access to symbolic glyphs that are
659 commonly used in text (for a variety of reasons), and whose style
660 should vary with the text that surrounds them.
662 Unfortunately, the \Enc{TS1} encoding was developed without
663 reference to the glyphs available in existing commercial fonts.
664 As a result, only font families
665 explicitly developed for \TeX{} (i.e., typically originating with
666 \MF{}) actually contain all glyphs required by the \Enc{TS1}
667 encoding. Most other font families (whether free or commercial)
668 often only provide half of the set%
670 %% don't show the comment if the tables are not generated
672 \expandafter\ifx\csname r@fonttable:tcrm1000\endcsname\relax
673 \else
674 \expandafter\ifx\csname r@fonttable:ptmr8c\endcsname\relax
675 \else
676 \space (compare the two tables for \Enc{TS1} on
677 pages~\pageref{fonttable:tcrm1000}
678 and~\pageref{fonttable:ptmr8c})%
680 \fi.
681 To improve this situation somewhat, NFSS provides a way to define encoding
682 subsets on a per family basis in the \Pkg{textcomp} package (which
683 package offers support for the \Enc{TS1} encoding).
684 \end{encodinginfo}
687 \begin{encodinginfo}{TS3}
688 {IPA symbol encoding}
689 {FUKUI Rei, University of Tokyo}
690 {0x00--0x0A, 0x20--0x49, 0x50--0x56, 0x70--0x7B}
691 {---}
692 {tipx10}
693 {\cite{Rei:TB17-2-102}}
695 The \Enc{TS3} encoding (together with the \Enc{T3} encoding) provides the
696 glyphs for typesetting phonetic transcriptions following the
697 guidelines of the International Phonetic Association \cite{ipa}. Support
698 is offered through the \Pkg{tipa} package.
699 \end{encodinginfo}
704 \subsection{256 glyph encodings (text extended)}
705 \label{sec:extendedenc}
707 \begin{encodinginfo}
708 {X2}
709 {Cyrillic glyph container}
710 {The CyrTUG font team}
711 {0x00--0xFF}
712 {---}
713 {rxrm1000}
714 {\cite{Berdnikov:eurotex-98}}
716 This encoding specifies the glyph container for Cyrillic characters,
717 which is used in specifying the \Enc{T2A}, \Enc{T2B} and \Enc{T2C} encodings.
718 \end{encodinginfo}
723 \subsection{128$^+$ glyph encodings (mathematics)}
726 \begin{encodinginfo}{OML}
727 {\TeX{} math italic}
728 {Donald Ervin Knuth}
729 {0x00--0x7F}
730 {---}
731 {cmmi10}
732 {\cite[p.430]{A-W:DKn86}}
734 The \Enc{OML} encoding contains italic Latin and Greek letters for
735 use in mathematical formulas (typically used for variables) together
736 with some symbols.
738 \end{encodinginfo}
740 \begin{encodinginfo}{OMS}
741 {\TeX{} math symbol}
742 {Donald Ervin Knuth}
743 {0x00--0x7F}
744 {---}
745 {cmsy10}
746 {\cite[p.431]{A-W:DKn86}}
748 The \Enc{OMS} encoding contains basic mathematical symbols,
749 together with an uppercase ``calligraphic'' Latin alphabet.
750 \end{encodinginfo}
753 \begin{encodinginfo}{OMX}
754 {\TeX{} math extension}
755 {Donald Ervin Knuth}
756 {0x00--0x7F}
757 {---}
758 {cmex10}
759 {\cite[p.432]{A-W:DKn86}}
761 \Enc{OMX} encodes mathematical symbols with variable sizes, such as
762 the $\sum$ sign, which changes its size if used in displayed
763 formulas, and the construction parts for
764 brackets, braces and radicals, etc., which can stretch to accommodate
765 the thing they're enclosing.
767 \end{encodinginfo}
772 \subsection{256 glyph encodings (mathematics)}
774 So far there are no 256 glyph mathematical encodings. A proposal is
775 given in \cite{ziegler}.
778 \subsection{Other encodings}
780 \begin{encodinginfo}
781 {C..}
782 {CJK encodings}
783 {Werner Lemberg}
784 {0x00--0xFF}
785 {---}
786 {} % no font, of course
787 {\cite{CJK}}
789 The \Pkg{CJK} package defines a number of encodings which access Chinese,
790 Japanese and Korean fonts.
792 \end{encodinginfo}
794 \begin{encodinginfo}
795 {E..}
796 {Experimental encodings}
797 {---}
798 {0x00--0xFF}
799 {all}
801 {\cite[p.416]{A-W:MG2004}}
803 As the name indicates, encodings starting with the letter \Enc{E} are
804 intended for experimental encodings, that are still likely to change.
805 \end{encodinginfo}
807 \begin{encodinginfo}{L..}
808 {Local encoding (site dependent)}
809 {---}
810 {0x00--0xFF}
811 {all}
813 {\cite[p.416]{A-W:MG2004}}
815 `Local' encodings provide the means to develop representation
816 techniques that are suited to a particular \TeX{} environment. While
817 the developer has freedom to specify their encoding as he or she
818 pleases, there is a strong incentive to obey the \LaTeX{} rules for
819 encodings, since it will otherwise be difficult to compose text using
820 the encoding.
822 At least it was the intention that \Enc{L..} encodings are local and
823 site dependent. However, a number of such encodings became generally
824 used without ever getting a different name allocated.
826 \end{encodinginfo}
830 \begin{encodinginfo}{LY1}
831 {Y\&Y 256 glyph encoding}
832 {Berthold Horn}
833 {0x00--0x08, 0x0C, 0x10, 0x12--0xFF}
834 {\emph{believed none}}
835 {ptmr8y}
836 {\cite[p.416]{A-W:MG2004}}
838 This is an alternative to the \Enc{T1} encoding developed by Y\&Y and
839 used in their commercial \TeX{} implementation.
841 \end{encodinginfo}
844 \begin{encodinginfo}{LV1}
845 {MicroPress encoding}
846 {Michael Vulis}
847 {\emph{unknown}}
848 {\emph{unknown}}
850 {\cite[p.416]{A-W:MG2004}}
852 This is an encoding developed by MicroPress and used for some of their
853 fonts.
855 \end{encodinginfo}
858 \begin{encodinginfo}{LGR}
859 {Greek 256 glyph encoding}
860 {\emph{unknown}}
861 {0x00--0xFF}
862 {\emph{believed none}}
863 {grmn1000}
864 {\cite[p.575]{A-W:MG2004}}
866 Currently the main encoding in use for the Greek language.
868 This encoding doesn't conform to the restrictions for
869 \Enc{T}-encodings described in section~\ref{sec:restrictions} on
870 page~\pageref{sec:restrictions} as it doesn't have \textsc{ascii}
871 glyphs at all.
873 \end{encodinginfo}
876 \begin{encodinginfo}
877 {PD1}
878 {PDF DocEncoding}
879 {Adobe}
880 {0x08--0x0A, 0x0C, 0x0D, 0x18--0x7E, 0x80--0x9E, 0xA0--0xAE, 0xB0--0xFF}
881 {---}
883 {\cite{Adobe:PDF-1.6}, \cite{hyperref}}
885 The \Enc{PD1} encoding is a virtual encoding with 256 glyphs needed to
886 produce bookmarks and similar text in PDF documents generated with pdf\LaTeX.
887 The encoding is ``virtual'' because by design there are no \TeX{}
888 fonts that cover \Enc{PD1}. Details can be found in appendix D.1
889 of~\cite{Adobe:PDF-1.6}.
890 \end{encodinginfo}
892 \begin{encodinginfo}
893 {PU}
894 {PDF Unicode Encoding}
895 {Adobe}
896 {---}
897 {---}
899 {\cite{Adobe:PDF-1.6}, \cite{hyperref}}
901 Another virtual encoding (with more than 600 characters) for
902 Unicode-encoded bookmarks in PDF documents.
903 \end{encodinginfo}
905 \begin{encodinginfo}{U}
906 {Unknown encoding}
907 {---}
908 {potentially 0x00--0xFF}
909 {all}
910 {wasy10}
911 {\cite[p.416]{A-W:MG2004}}
913 This encoding should be used for fonts that resist classification,
914 e.g., when it is clear that there will never be more than one font
915 using the same encoding.
917 \end{encodinginfo}
921 \section{Restrictions}
922 \label{sec:restrictions}
925 \subsection{Required glyphs for general text encodings}
927 Encodings that are supposed to be used with \LaTeX{} for `general
928 purpose text fonts' need to have certain fixed glyphs in certain
929 encoding slots. A `general purpose text font' is one intended for
930 arbitrary natural language text and not just within special
931 environments (such as the phonetic alphabet) or just for typesetting
932 individual symbols (e.g., the text companion font with encoding
933 \Enc{TS1}).
935 This is the case for the following glyphs that have to be in their
936 \textsc{ascii} positions for general purpose text encodings:
937 \begin{center}
938 \begin{tabular}[t]{cc}
939 Glyph & Position \\ \hline
940 ! & \number`\! \\
941 ' & \number`\' \\
942 ( & \number`\( \\
943 ) & \number`\) \\
944 \relax* & \number`\* \\
945 + & \number`\+ \\
946 , & \number`\, \\
947 - & \number`\- \\
948 . & \number`\. \\
949 / & \number`\/ \\
950 0 \ldots\ 9 & \number`\0\ to \number`\9 \\
951 \end{tabular}
952 \quad
953 \begin{tabular}[t]{cc}
954 Glyph & Position \\ \hline
955 : & \number`\: \\
956 ; & \number`\; \\
957 = & \number`\= \\
958 ? & \number`\? \\
959 @ & \number`\@ \\
960 A \ldots\ Z & \number`\A\ to \number`\Z \\
961 \relax[ & \number`\[ \\
962 ] & \number`\] \\
963 ` & \number`\` \\
964 a \ldots\ z & \number`\a\ to \number`\z \\
965 \end{tabular}
966 \quad
967 \begin{tabular}[t]{cc}
968 Glyph\footnotemark & Position \\ \hline
969 < & \number`\< \\
970 > & \number`\> \\
971 \string| & \number`\| \\
972 \end{tabular}\footnotetext{The requirement for these three glyphs is
973 violated in the Latin alphabet \Enc{OT} encodings.}
974 \end{center}
975 In addition the following glyphs have to be present
976 somewhere\footnote{The position in this case is not important as they
977 are generated from ligature programs.} in the encoding together with
978 corresponding ligature programs to generate them:
979 \begin{center}
980 \begin{tabular}[t]{cc}
981 Glyph & Ligature program \\ \hline
982 `` & \texttt{`\/`} \\
983 '' & \texttt{'\/'} \\
984 -- & \texttt{-\/-} \\
985 --- & \texttt{-\/-\/-} \\
986 \end{tabular}
987 \end{center}
989 This is $33 + 2 * 26 = 85$ positions ``required'', which leaves 171
990 positions free.
992 If there are free slots available then adding all or some of the
993 diacritics would be the best way to fill them.
995 If there are insufficient slots for the characters needed, a possible
996 technique is to create a subsidiary encoding, and to move non-letter
997 characters to it. Since only ``letters'' take part in the hyphenation
998 algorithm, this technique doesn't affect the appearance of the typeset
999 result.
1001 \subsection{The constraints on upper/lower case tables}
1003 Due to some technical restrictions of \TeX{} related to hyphenation it
1004 is not possible in \LaTeX{} to use more than one \verb=\lccode= or
1005 \verb=\uccode= table. Therefore all encodings need to share these two
1006 tables which are defined to be those of the \Enc{T1} encoding.
1008 The \Enc{T1} encoding has some nasty peculiarities which make certain slot
1009 positions more or less unusable for other encodings if this
1010 restriction is to be obeyed. This is unfortunate but since \Enc{T1} is well
1011 established and the basis for a large number of languages it seemed
1012 better to live with this situation instead of trying to replace \Enc{T1} with a
1013 slightly better standard (with the result that for a long time
1014 different \LaTeX{} installations would not be able to communicate with
1015 each other because of incompatible font sets).
1017 The positions that are problematic are as follows.
1018 \begin{center}
1019 \begin{tabular}{lp{.8\linewidth}}
1020 25 (\char 25) & uppercase maps strangely (same as for 105, \char 105)\\
1021 26 (\char 26) & uppercase maps strangely (same as for 106, \char 106)\\
1022 27 (\char 27) & lowercase maps to itself which makes this slot subject
1023 to hyphenation (used to support \Enc{OT1} encoding) \\
1024 157 (\char 157) & lowercase maps strangely (same as for 73, \char 73) \\
1025 158 (\char 158) & uppercase maps strangely (same as for 240, \char 240) \\
1026 \end{tabular}
1027 \end{center}
1028 One way to use such slots is to fill them with ligature glyphs as
1029 \TeX{} will not consult these tables for glyphs constructed through
1030 ligature programs but instead uses the entries for the individual
1031 glyphs used to produce the ligature.
1033 A complete listing of the uppercase/lowercase mapping tables is to be
1034 found in section~\ref{sec:uclc-tab} (page \pageref{sec:uclc-tab}).
% Scratch counters for the case-table macros that follow: \temp is the slot
% currently being printed; \tempL and \tempU receive its \lccode and \uccode.
1036 \newcount\temp \newcount\tempL \newcount\tempU
% \nextstep: emit one table row for the slot held in \temp.
% It copies \lccode\temp and \uccode\temp into \tempL and \tempU, prints the
% slot number (preceded by \lctablenumbersize, which the caller has to
% define), the two code values and the glyph sample produced by
% \printlowerupper, then advances \temp by one and returns to \stepprint.
% NOTE(review): the assignments are \global presumably because every tabular
% cell is a group of its own, so local values would be lost at & and \\.
1038 \def\nextstep{\global\tempL=\lccode\temp
1039 \global\tempU=\uccode\temp
1040 \lctablenumbersize\the\temp &
1041 \the\tempL&
1042 \the\tempU&\printlowerupper{\the\temp}{\the\tempL}{\the\tempU}\\
1043 \global\advance\temp by 1
1044 \stepprint}
% \printlowerupper{<pos>}{<lc>}{<uc>}: typeset the glyph in slot <pos>
% followed by "(<glyph in slot lc>/<glyph in slot uc>)".
% A code value of 0 is printed as "--" instead of a glyph.
1046 \def\printlowerupper#1#2#3{\char#1\relax
1047 (\ifnum#2=0\relax--\else\char#2\fi
1048 /\ifnum#3=0\relax--\else\char#3\fi)}
% \stepprint: loop control for the case-table rows.
% While \temp is smaller than \endval, \next is set to \nextstep (which
% prints one row and calls \stepprint again); otherwise \next becomes \relax
% and the loop ends.
% The conditional must be closed with \fi before \next is executed, so that
% the recursion does not take place inside the \ifnum...\fi.
1050 \def\stepprint{\relax\ifnum\temp<\endval
1051 \let\next=\nextstep
1052 \else
1053 \let\next=\relax
1054 \fi
1055 \next}
% \dolctable{<first>}{<end>}: typeset a four-column table (pos, lc, uc,
% glyphs) with one row for each slot from <first> up to, but not including,
% <end>; the rows are produced by \stepprint.
% The whole body is wrapped in a brace group, so the definition of \endval
% and the reduced \tabcolsep do not leak out of the table.
1057 \def\dolctable#1#2{{\temp=#1\relax
1058 \def\endval{#2}%
1059 \setlength\tabcolsep{1.5pt}%
1060 \begin{tabular}[t]{@{}cccc@{}}
1061 pos&lc&uc&glyphs\\\hline
1062 \stepprint
1063 \end{tabular}}}
% Disabled alternative layout: the complete case table in five columns at
% \tiny size.  Everything between \iffalse and the matching \fi is skipped.
1065 \iffalse
1066 \begin{center}
1067 \tiny\let\lctablenumbersize\tiny
1068 \mbox{\dolctable{0}{52}\vrule
1069 \dolctable{52}{104}\vrule
1070 \dolctable{104}{156}\vrule
1071 \dolctable{156}{208}\vrule
1072 \dolctable{208}{256}}
1073 \end{center}
1074 \fi
% Disabled alternative layout: the complete case table in four columns at
% \tiny size.  Everything between \iffalse and the matching \fi is skipped.
1076 \iffalse
1077 \begin{center}\tiny
1078 \mbox{\dolctable{0}{65}\vrule
1079 \dolctable{65}{128}\vrule
1080 \dolctable{128}{193}\vrule
1081 \dolctable{193}{256}}
1082 \end{center}
1083 \fi
1087 \section{Encoding specific commands}
1089 An encoding specific command is one that generates a glyph (or
1090 glyphs), to produce a graphic effect that may be implemented
1091 differently in different encodings. The encoding specific command
1092 automatically changes its implementation when the encoding changes in
1093 the course of the document. Encoding specific commands figure in
1094 \LaTeX's internal character representation (\textsc{licr}) and are also
1095 discussed in \cite[sec.~7.11.2, p.~442]{A-W:MG2004}.
1097 The following table only covers the encoding specific commands from
1098 the \Enc{OT1} and \Enc{T1} encodings. Other encodings may specify
1099 additional encoding specific commands. In the table, the first 15
1100 commands are `accent-like' and need as an argument the character to be
1101 accented. For example, |\v{c}| is the \textsc{licr} for `\v{c}'.
1103 \begin{tabbing}
1104 \ttverb\textvisiblespace\quad\=bbbbbbbbbbbbbb\=b'b'\=ccccccccccc\kill
1105 \ttverb\`{} \>OT1,T1\> \a`{}\> (grave) \\
1106 \ttverb\'{} \>OT1,T1\> \a'{}\> (acute) \\
1107 \ttverb\^{} \>OT1,T1\> \^{}\> (circumflex) \\
1108 \ttverb\~{} \>OT1,T1\> \~{}\> (tilde) \\
1109 \ttverb\"{} \>OT1,T1\> \"{}\> (umlaut) \\
1110 \ttverb\H{} \>OT1,T1\> \H{}\> (Hungarian umlaut) \\
1111 \ttverb\r{} \>OT1,T1\> \r{}\> (ring) \\
1112 \ttverb\v{} \>OT1,T1\> \v{}\> (ha\v{c}ek) \\
1113 \ttverb\u{} \>OT1,T1\> \u{}\> (breve) \\
1114 \ttverb\t{} \>OT1,T1\> \t{}\> (tie) \\
1115 \ttverb\={} \>OT1,T1\> \a={}\> (macron) \\
1116 \ttverb\.{} \>OT1,T1\> \.{}\> (dot) \\
1117 \ttverb\b{} \>OT1,T1\> \b{}\> (underbar) \\
1118 \ttverb\c{} \>OT1,T1\> \c{}\> (cedilla) \\
1119 \ttverb\d{} \>OT1,T1\> \d{}\> (dot under) \\
1120 \ttverb\k{} \>T1 \> \k{}\> (ogonek) \\
1121 % \ttverb\AA \>OT1,T1\> \AA \> \\ % no longer
1122 \ttverb\AE \>OT1,T1\> \AE \> \\
1123 \ttverb\DH \>T1 \> \DH \> \\
1124 \ttverb\DJ \>T1 \> \DJ \> \\
1125 \ttverb\L \>OT1,T1\> \L \> \\
1126 \ttverb\NG \>T1 \> \NG \> \\
1127 \ttverb\OE \>OT1,T1\> \OE \> \\
1128 \ttverb\O \>OT1,T1\> \O \> \\
1129 \ttverb\SS \>OT1,T1\> \SS \> \\
1130 \ttverb\TH \>T1 \> \TH \> \\
1131 % \ttverb\aa \>OT1,T1\> \aa \> \\ no-longer
1132 \ttverb\ae \>OT1,T1\> \ae \> \\
1133 \ttverb\dh \>T1 \> \dh \> \\
1134 \ttverb\dj \>T1 \> \dj \> \\
1135 \ttverb\guillemotleft \>T1 \> \guillemotleft \> (guillemet) \\
1136 \ttverb\guillemotright \>T1 \> \guillemotright \> (guillemet) \\
1137 \ttverb\guilsinglleft \>T1 \> \guilsinglleft \> (guillemet) \\
1138 \ttverb\guilsinglright \>T1 \> \guilsinglright \> (guillemet) \\
1139 \ttverb\i \>OT1,T1\> \i \> \\
1140 \ttverb\j \>OT1,T1\> \j \> \\
1141 \ttverb\l \>OT1,T1\> \l \> \\
1142 \ttverb\ng \>T1 \> \ng \> \\
1143 \ttverb\oe \>OT1,T1\> \oe \> \\
1144 \ttverb\o \>OT1,T1\> \o \> \\
1145 \ttverb\quotedblbase \>T1 \> \quotedblbase \> \\
1146 \ttverb\quotesinglbase \>T1 \> \quotesinglbase \> \\
1147 \ttverb\ss \>OT1,T1\> \ss \> \\
1148 \ttverb\textasciicircum \>OT1,T1\> \textasciicircum \> \\
1149 \ttverb\textasciitilde \>OT1,T1\> \textasciitilde \> \\
1150 \ttverb\textbackslash \>OT1,T1\> \textbackslash \> \\
1151 \ttverb\textbar \>OT1,T1\> \textbar \> \\
1152 \ttverb\textbraceleft \>OT1,T1\> \textbraceleft \> \\
1153 \ttverb\textbraceright \>OT1,T1\> \textbraceright \> \\
1154 \ttverb\textcompwordmark \>OT1,T1\> \textcompwordmark\> (invisible) \\
1155 \ttverb\textdollar \>OT1,T1\> \textdollar \> \\
1156 \ttverb\textemdash \>OT1,T1\> \textemdash \> \\
1157 \ttverb\textendash \>OT1,T1\> \textendash \> \\
1158 \ttverb\textexclamdown \>OT1,T1\> \textexclamdown \> \\
1159 \ttverb\textgreater \>OT1,T1\> \textgreater \> \\
1160 \ttverb\textless \>OT1,T1\> \textless \> \\
1161 \ttverb\textquestiondown \>OT1,T1\> \textquestiondown\> \\
1162 \ttverb\textquotedbl \>T1 \> \textquotedbl \> \\
1163 \ttverb\textquotedblleft \>OT1,T1\> \textquotedblleft\> \\
1164 \ttverb\textquotedblright \>OT1,T1\> \textquotedblright\> \\
1165 \ttverb\textquoteleft \>OT1,T1\> \textquoteleft \> \\
1166 \ttverb\textquoteright \>OT1,T1\> \textquoteright \> \\
1167 \ttverb\textregistered \>OT1,T1\> \textregistered \> \\
1168 \ttverb\textsection \>OT1,T1\> \textsection \> \\
1169 \ttverb\textsterling \>OT1,T1\> \textsterling \> \\
1170 \ttverb\texttrademark \>OT1,T1\> \texttrademark \> \\
1171 \ttverb\textunderscore \>OT1,T1\> \textunderscore \> \\
1172 \ttverb\textvisiblespace \>OT1,T1\> \textvisiblespace\> \\
1173 \ttverb\th \>T1 \> \th \>
1174 \end{tabbing}
1176 \section{Encodings for Unicode based \TeX\ systems}
1177 \label{sec:unicode}
1179 The preceding text has assumed a classic \TeX{} system that is
1180 restricted to the use of fonts with at most 256 characters. In order
1181 to accommodate all the characters needed for different languages and
1182 mathematics it is necessary to have multiple encodings as described
1183 above, and \LaTeX\ needs to be aware of the encoding used for each
1184 font.
1186 Unicode aims to provide a single encoding that removes most of the
1187 need to switch encodings, apart from very specialist use for non-standard characters. Rather than assign codes in the range 0--255 (hex
1188 FF) Unicode codes are in the range 0--1,114,111 (hex 10FFFF), although
1189 not all slots are available for distinct characters for technical
1190 reasons. Unicode offers the possibility to use a single input encoding
1191 (usually UTF-8) for all documents and to use essentially the same
1192 Unicode encoding for all fonts, so removing the need to switch
1193 encodings in different contexts.
1195 Omega was perhaps the first widely used \TeX\ extension that
1196 supported Unicode. Currently the two actively supported systems that are
1197 present in most modern \TeX\ distributions are Xe\TeX\ and Lua\TeX.
1199 When used with these extended \TeX\ engines, \LaTeX's font system can
1200 refer to Unicode fonts (typically OpenType fonts installed system-wide
1201 on your operating system rather than fonts specifically encoded/installed for
1202 \TeX). Currently the usual method of accessing these fonts is through
1203 the contributed \Pkg{fontspec} package. This uses as encoding \Enc{TU}:
1204 ``\TeX{} Unicode'' (historically two experimental encodings \Enc{EU1}
1205 and \Enc{EU2}
1206 were used, depending on the engine, but these are deprecated).
1207 The exact rules for \LaTeX\ encodings
1208 for Unicode engines have not yet been finalised in terms of the (usual)
1209 requirement that each slot should be defined. (This is not realistic for
1210 a Unicode font, as almost all fonts address subsets of the full range.)
1211 It is rare to need to specify the \Enc{TU} encoding in a document as the
1212 \Pkg{fontspec} package sets up the correct encoding when loaded.
1214 The restrictions described in section~\ref{sec:restrictions} do not
1215 apply, or need to be modified, in a Unicode based engine. Clearly the
1216 lowercase table (and hyphenation patterns) cannot be restricted to
1217 the values used for \Enc{T1}, which only refer to the first 256
1218 characters.
1220 When the \LaTeX\ format is made, \LaTeX\ sets up the lowercase table
1221 and classifies characters as letters or non-letters based on \Enc{T1} if
1222 a classic \TeX\ or pdf\TeX\ is being used. If a Unicode based \TeX\ is
1223 detected, the values are instead based on the classification and
1224 lower-case mappings provided by the Unicode Character Database
1225 \cite{ucd}. The \LaTeX{} team have written a generic loader bundle,
1226 \Pkg{unicode-data}, which provides the mechanism to load this information
1227 directly from the Unicode Character Database data files and which is read
1228 when a Unicode-compliant engine is detected during format-building.
1230 Similarly in the default configuration files used by modern \TeX\
1231 distributions, the hyphenation files for each supported language are
1232 written in UTF-8 encoding, using Unicode code points for all letters,
1233 then if a classic \TeX\ system is detected, some additional macros are
1234 loaded to convert these files to 256-character encodings where
1235 possible, and assuming the \Enc{T1} lowercase table. For Unicode engines
1236 no conversion takes place. (The hyphenation patterns for a small number of
1237 languages require that some punctuation characters have non-zero
1238 \verb=\lccode= values. These are set during pattern reading, and may at some
1239 stage in the future use the e-\TeX{} \verb=\savinghyphcodes= mechanism to
1240 avoid any need to manipulate \verb=\lccode= in the document.)
1245 \begin{thebibliography}{99}
1246 \addcontentsline{toc}{section}{\numberline{\relax}\refname}
1249 \bibitem{Adobe:PDF-1.6} \emph{\textsc{PDF} reference}:
1250 Adobe portable document format version~1.6. Adobe Systems
1251 Incorporated, 2005. % why \textsuperscript{3}?
1252 \url{http://partners.adobe.com/public/developer/en/pdf/PDFReference16.pdf}.
1254 \bibitem{Beeton:TB6-3-124} Barbara Beeton:
1255 \emph{Mathematical symbols and cyrillic fonts ready for
1256 distribution}. In: TUGBoat, 6\#3, 1985.
1257 \url{http://tug.org/TUGboat/Articles/tb06-3/tb13beetcyr.pdf}.
1259 \bibitem{beeton} Barbara Beeton: \emph{Unicode
1260 and math, a combination whose time has come -- Finally!}. In:
1261 TUGBoat, 21\#3, 2000.
1262 \url{http://www.tug.org/TUGboat/Articles/tb21-3/tb68beet.pdf}.
1265 \bibitem{Berdnikov:eurotex-98} A.\@ Berdnikov, O.\@
1266 Lapko, M.\@ Kolodin, A.\@ Janishevsky and
1267 A.\@ Burykin: \emph{The Encoding Paradigm in
1268 \LaTeXe{} and the Projected X2 Encoding for Cyrillic Texts}.
1269 Euro\TeX~98.
1270 \url{http://www.gutenberg.eu.org/pub/GUTenberg/publicationsPDF/28-29-berdnikova.pdf}.
1272 \bibitem{CJK} \emph{The \Pkg{CJK} package}:
1273 \url{http://cjk.ffii.org}.
1275 \bibitem{clasen} Matthias Clasen: \emph{A new
1276 implementation of \LaTeX{} math}, 1997-98.
1277 \url{http://www.tug.org/twg/mfg/papers/current/newmath.ps.gz}.
1279 \bibitem{clasen-vieth} Matthias Clasen and Ulrik
1280 Vieth: \emph{Towards a new Math Font Encoding
1281 for (La)\TeX}. March 1998,
1282 \url{http://www.tug.org/twg/mfg/papers/current/mfg-euro-all.ps.gz}.
1284 \bibitem{CorkGW:91}
1285 Dean Guenther and Janene Winter.
1286 \newblock An international phonetic alphabet.
1287 \newblock In Guenther \cite{proc:MGu91}, pages 149--156.
1288 \newblock Published as {TUG}boat 12\#1.
1290 \bibitem{proc:MGu91}
1291 Mary Guenther, editor.
1292 \newblock {\em {\TeX} 90 Conference Proceedings}, March 1991.
1293 \newblock Published as {TUG}boat 12\#1.
1295 \bibitem{tub:MFe90}
1296 Michael~J. Ferguson.
1297 \newblock Report on multilingual activities.
1298 \newblock {\em {TUG}boat}, 11(4):514--516, 1990.
1300 \bibitem{fontinst} \emph{The \Pkg{fontinst} package}:
1301 \textlangle CTAN\textrangle\url{/fonts/utilities/fontinst}.
1303 \bibitem{Rei:TB17-2-102} Fukui Rei:
1304 \emph{\textsl{TIPA}: A system for processing phonetic
1305 symbols in \LaTeX}. In: TUGBoat, 17\#2, 1996.
1306 \url{http://www.tug.org/TUGboat/Articles/tb17-2/tb51rei.pdf}.
1308 \bibitem{hyperref} \emph{The \Pkg{hyperref} package}:
1309 \url{http://www.tug.org/applications/hyperref}.
1311 \bibitem{tub:JKn93}
1312 J\"org Knappen.
1313 \newblock Fonts for Africa: The fc Fonts.
1314 \newblock {\em {TUG}boat}, 14(2):104, 1993.
1316 \bibitem{Knappen:TB17-2-96} J\"org Knappen:
1317 \emph{The \Pkg{dc} fonts~1.3: Move towards stability
1318 and completeness}. In: TUGBoat 17\#2, 1996.
1319 \url{http://www.tug.org/TUGboat/Articles/tb17-2/tb51knap.pdf}.
1321 \bibitem{A-W:DKn86}
1322 Donald~E. Knuth.
1323 \newblock {\em The {\TeX}book}.
1324 \newblock Volume~A of {\em Computers \& {T}ypesetting\/},
1325 May 1989.
1326 \newblock Eighth printing.
1328 \bibitem{vnr} \emph{The \Pkg{vnr} font family}, developed by
1329 the author of pdf\TeX, {H\`an Th\^e\protect\llap{\raise 0.5ex\hbox{\'{\relax}}} Th\`anh}.
1330 \url{http://vntex.org/download/vntex}.
1332 \bibitem{ipa} Home page of the International Phonetic Association.
1333 \url{http://www.arts.gla.ac.uk/IPA/ipa.html}
1335 \bibitem{A-W:LLa94}
1336 Leslie Lamport.
1337 \newblock {\em {\LaTeX:} A Document Preparation System}.
1338 \newblock Addison-Wesley, Reading, Massachusetts, second edition, 1994.
1340 \bibitem{LH-Fonts} \emph{The \Pkg{lh}-Fonts for Cyrillic}:
1341 \textlangle CTAN\textrangle\url{/fonts/cyrillic/lh}.
1343 \bibitem{A-W:MG2004}
1344 Frank Mittelbach and Michel Goossens.
1345 \newblock {\em The {\LaTeX} Companion second edition}.
1346 \newblock With Johannes Braams, David Carlisle, and Chris Rowley.
1347 \newblock Addison-Wesley, Reading, Massachusetts, 2004.
1349 \bibitem{Unicode} \emph{The Unicode Standard}.
1350 \url{http://unicode.org}.
1352 \bibitem{ucd} \emph{The Unicode Character Database}.
1353 \url{http://unicode.org/ucd}.
1355 \bibitem{ziegler} Justin Ziegler, \emph{Technical
1356 Report on Math Font Encodings}, June 1994,
1357 \url{http://www.tug.org/twg/mfg/papers/ltx3pub/l3d007.ps.gz}.
1359 \end{thebibliography}
1361 \clearpage\appendix
1362 \begin{center}
1363 \Large\bfseries Appendices
1364 \end{center}
1366 \section{Example code tables}
1368 This appendix contains a table of each font mentioned as an ``example''
1369 font above, providing that the font was available when the document
1370 was processed with \LaTeX{}. (\LaTeX{} generates a warning message
1371 for each font it fails to find.)
1373 \subsection{Text encodings}
1375 \ftable{cmr10}{OT1}
1377 \ftable{wnr10}{OT2}
1379 \ftable{wsuipa10}{OT3}
1381 \ftable{plr10}{OT4}
1383 %\ftable{artmr10}{OT6}
1385 \ftable{ecrm1000}{T1}
1387 \ftable{larm1000}{T2A}
1389 \ftable{lbrm1000}{T2B}
1391 \ftable{lcrm1000}{T2C}
1393 \ftable{tipa10}{T3}
1395 \ftable{fcr10}{T4}
1397 \ftable{vnr10}{T5}
1400 \subsection{Text symbol encodings}
1402 The full table for \Enc{TS1} as provided by the European Computer Modern family:
1403 \ftable{tcrm1000}{TS1}
1405 \pagebreak
1407 In contrast typical PostScript fonts usually have incomplete implementations
1408 of \Enc{TS1} sometimes missing more than half of the glyphs:
1410 \ftable{ptmr8c}{TS1}
1412 \ftable{tipx10}{TS3}
1416 \subsection{Extended text encodings}
1418 \ftable{rxrm1000}{X2}
1421 \subsection{Mathematical encodings}
1423 \ftable{cmmi10}{OML}
1425 \ftable{cmsy10}{OMS}
1427 \ftable{cmex10}{OMX}
1430 \subsection{Other encodings}
1432 \ftable{ptmr8y}{LY1}
1434 %%\ftable{????}{LV1}
1436 \ftable{grmn1000}{LGR}
1438 \ftable{wasy10}{U}
1439 \ftable{logo10}{U}
1441 \clearpage
1442 \section{Uppercase and lowercase tables}
1443 \label{sec:uclc-tab}
1445 The following two sets of tables list the \verb"\uppercase" and
1446 \verb"\lowercase" values for each position in the \LaTeX{} standard
1447 256-character tables.
1449 Each row of each table lists:
1450 \begin{quote}
1451 \begin{tabular}{lp{0.7\textwidth}}
1452 pos & The position in the table (0--255) \\
1453 lc & The value in the \verb"\lowercase" table at the position \\
1454 & (note that value 0 here means that \verb"\lowercase" is
1455 ineffective for this character, and hyphenation does not apply
1456 to it) \\
1457 uc & The value in the \verb"\uppercase" table at the position \\
1458 & (note that value 0 here means that \verb"\uppercase" is
1459 ineffective for this character) \\
1460 glyphs & The glyphs specified for the T1 encoding for this
1461 position, laid out as \meta{glyph}\textbf{(}\meta{lowercase
1462 glyph}\textbf{/}\meta{uppercase glyph}\textbf{)}
1463 \end{tabular}
1464 \end{quote}
1466 \begin{center}
1467 \let\lctablenumbersize\footnotesize
1468 \makebox[\textwidth]{\hss
1469 \dolctable{0}{32}\quad\dolctable{32}{64}\quad
1470 \dolctable{64}{96}\quad\dolctable{96}{128}%
1471 \hss}
1473 \makebox[\textwidth]{\hss
1474 \dolctable{128}{160}\quad\dolctable{160}{192}\quad
1475 \dolctable{192}{224}\quad\dolctable{224}{256}%
1476 \hss}
1477 \end{center}
1478 \end{document}
1481 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%