Missed one line
[latex2e.git] / trunk / doc / encguide.tex
blob3a59efa581bba2d1b72fd2977d995e0c683b864f
1 % \iffalse meta-comment
3 % Copyright 1993-2014
4 % The LaTeX3 Project and any individual authors listed elsewhere
5 % in this file.
6 %
7 % This file is part of the LaTeX base system.
8 % -------------------------------------------
9 %
10 % It may be distributed and/or modified under the
11 % conditions of the LaTeX Project Public License, either version 1.3c
12 % of this license or (at your option) any later version.
13 % The latest version of this license is in
14 % http://www.latex-project.org/lppl.txt
15 % and version 1.3c or later is part of all distributions of LaTeX
16 % version 2005/12/01 or later.
18 % This file has the LPPL maintenance status "maintained".
20 % The list of all files belonging to the LaTeX base distribution is
21 % given in the file `manifest.txt'. See also `legal.txt' for additional
22 % information.
24 % The list of derived (unpacked) files belonging to the distribution
25 % and covered by LPPL is defined by the unpacking scripts (with
26 % extension .ins) which are part of the distribution.
28 % \fi
29 % $Id: encguide.tex 5713 2006-01-18 23:29:23Z robin $
33 \NeedsTeXFormat{LaTeX2e}[1995/12/01]
35 \documentclass{ltxguide}[1994/11/20]
37 \usepackage[T1]{fontenc}
38 \IfFileExists{lmodern.sty}{\usepackage{lmodern}}{}
39 \usepackage{textcomp}
40 \usepackage{url}
41 \usepackage{mflogo}
43 \addtolength{\textheight}{6\baselineskip}% text block six lines taller
44 \addtolength{\topmargin}{-2\baselineskip}% and moved up by two lines
47 \newcommand{\ttverb}[1]{\texttt{\string #1}}% \string form of #1 in typewriter type
50 % \Enc{<name>}: the name of a font encoding, set in typewriter type
51 \providecommand\Enc[1]{\texttt{#1}}
53 % \Pkg{<name>}: the name of a package, set in sans serif
54 \providecommand\Pkg[1]{\textsf{#1}}
57 % \File{<name>}: a file name, set in typewriter type
58 \providecommand\File[1]{\texttt{#1}}
61 % \meta{<text>}: a meta-variable, emphasised between math angle brackets
62 \providecommand\meta[1]{%
63 \ensuremath\langle\emph{#1}\ensuremath\rangle}
65 \usepackage{tabularx}
67 % An environment for presenting the description of one encoding.
69 % Arguments:
70 % #1: name in LaTeX (e.g., OT1)
71 % #2: public name of the encoding (e.g., TeX text)
72 % #3: name of the author (e.g., Don Knuth)
73 % #4: range of glyph slots used
74 % #5: variable slots
75 % #6: example font (may be empty, in which case `---' is printed)
76 % #7: reference
78 % XXX add code to handle more than a single font example (e.g., larm1000,
79 % lbrm1000, and lcrm1000).
80 \makeatletter % \@tempa and \@empty below must be read with @ as a letter
81 \newenvironment{encodinginfo}[7]%
82 {\noindent
83 \begin{tabularx}{\linewidth}{@{}l>{\raggedright\let\\\tabularnewline}X}%
84 \LaTeX{} name: & \texttt{#1}\\%
85 Public name: & #2\\%
86 Author: & #3\\%
87 Glyph slots used: & #4\\%
88 Variable slots: & #5\\%
89 Font example: & \def\@tempa{#6}\ifx\@tempa\@empty---%
90 \else\texttt{#6}\referenceftable{#6}\fi\\%
91 Further reference: & #7%
92 \end{tabularx}%
93 \par\nobreak
94 \vspace*{3pt}%
95 \quote
96 }
97 {\endquote
98 \vspace{6pt}}
100 \makeatletter
101 \def\referenceftable#1{% adds a page reference if label fonttable:#1 is known; the % avoids a stray space before the `;'
102 \@ifundefined{r@fonttable:#1}%
103 \relax
104 {;\space encoding table on page~\pageref{fonttable:#1}}%
105 }
107 % font table macros mainly lifted from manmac.tex
% \oct{<digits>}: an hbox holding an upright acute accent over nothing
% (used as the octal mark), pulled back by .2em, then <digits> in italic.
108 \def\oct#1{\hbox{\rm\'{}\kern-.2em\it#1\/\kern.05em}}
% \hex{<digits>}: an hbox holding an upright \H{} accent over nothing
% (used as the hexadecimal mark), then <digits> in typewriter type.
109 \def\hex#1{\hbox{\rm\H{}\tt#1}}
% \oddline<digit>: ends the current row, fills the next 19 columns with a
% rule and puts a lowered, smashed \hex{<digit>x} label in the column after.
111 \def\oddline#1{\cr\noalign{\nointerlineskip}
112 \multispan{19}\hrulefill&
113 \setbox0=\hbox{\lower 2.3pt\hbox{\hex{#1x}}}\smash{\box0}\cr
114 \noalign{\nointerlineskip}}
% \evenline: ends the current row and draws a rule below it.
115 \def\evenline{\cr\noalign{\hrule}}
% \chartstrut: an empty vbox 14pt tall, lowered by 4.5pt; it is placed in
% the first column of the alignment template in \beginchart.
116 \def\chartstrut{\lower4.5pt\vbox to14pt{}}
% \beginchart<#1>{<#2>}: opens a display, globally resets \count@ to 0,
% executes #1 (\ftable passes the font switch here) and starts an \halign
% as wide as \hsize.  #2 goes into the header row ahead of the column
% heads \oct0 ... \oct7.
117 \def\beginchart#1#2{$$\global\count@=0 #1
118 \halign to\hsize\bgroup
119 \chartstrut##\tabskip0pt plus10pt&
120 &\hfil##\hfil&\vrule##\cr
121 \lower6.5pt\null
122 &#2&&\oct0&&\oct1&&\oct2&&\oct3&&\oct4&&\oct5&&\oct6&&\oct7&\evenline}
% \endchart: bottom row carrying the labels \hex 8 ... \hex F, then closes
% the alignment and the display opened by \beginchart.
123 \def\endchart{\raise11.5pt\null&&&\hex 8&&\hex 9&&\hex A&&\hex B&
124 &\hex C&&\hex D&&\hex E&&\hex F&\cr\egroup$$}
% \: typesets the character whose code is the current value of \count@.
% A box taller than 7.5pt or deeper than 2.5pt is recentred with
% \reposition first; afterwards \count@ is globally advanced by one.
125 \def\:{\setbox0=\hbox{\noboundary\char\count@\noboundary}%
126 \ifdim\ht0>7.5pt\reposition
127 \else\ifdim\dp0>2.5pt\reposition\fi\fi
128 \box0\global\advance\count@ by1 }
% \reposition: re-boxes box 0 as a \vcenter with 2pt kerns above and below.
129 \def\reposition{\setbox0=\hbox{$\vcenter{\kern2pt\box0\kern2pt}$}}
% \normalchart: the sixteen chart rows labelled '00x to '17x.  Each row
% starts with its \oct label and holds eight \: cells; rows end alternately
% with \oddline<hex digit> (rule plus hexadecimal label) and \evenline.
% The final \top is \let by \ftable to either \tophalf or \notophalf.
130 \def\normalchart{%
131 &\oct{00x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline0
132 &\oct{01x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
133 &\oct{02x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline1
134 &\oct{03x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
135 &\oct{04x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline2
136 &\oct{05x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
137 &\oct{06x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline3
138 &\oct{07x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
139 &\oct{10x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline4
140 &\oct{11x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
141 &\oct{12x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline5
142 &\oct{13x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
143 &\oct{14x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline6
144 &\oct{15x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
145 &\oct{16x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline7
146 &\oct{17x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
147 \top}
% \notophalf: expands to nothing; \ftable selects it as \top when the
% characters "80 to "FF of the font produce no width.
149 \def\notophalf{}
% \tophalf: the sixteen chart rows labelled '20x to '37x, built exactly
% like the rows of \normalchart, with \oddline labels 8 to F.
150 \def\tophalf{%
151 %\noalign{\vskip 5pt\hrule}
152 &\oct{20x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline8
153 &\oct{21x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
154 &\oct{22x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline9
155 &\oct{23x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
156 &\oct{24x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline A
157 &\oct{25x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
158 &\oct{26x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline B
159 &\oct{27x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
160 &\oct{30x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline C
161 &\oct{31x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
162 &\oct{32x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline D
163 &\oct{33x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
164 &\oct{34x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline E
165 &\oct{35x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
166 &\oct{36x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline F
167 &\oct{37x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline}
% \ftable{<font>}{<text>}: typesets the glyph chart of font #1.
%  - The font is loaded as \X between \batchmode and \errorstopmode
%    (presumably so that a missing font does not stop the run for input).
%  - If \X is then \nullfont, a warning is issued and no chart is produced.
%  - Otherwise the characters "80 to "FF are set in box 0; a box of
%    positive width makes \top mean \tophalf, otherwise \notophalf, so the
%    upper half of the chart only appears when it has content.
%  - The chart is headed by `#1, #2' and carries the label fonttable:#1,
%    which is the label \referenceftable looks up; \par\vfill follows it.
169 \def\ftable#1#2{%
170 \batchmode
171 \font\X=#1%
172 \errorstopmode
173 \ifx\X\nullfont
174 \@warning{Font #1 not found, table omitted}
175 \else
176 \count@="80
177 \setbox0=\hbox{\X
178 \loop\char\count@\advance\count@ by1 \ifnum\count@<"100
179 \repeat}%
180 \ifdim\wd0>0pt \let\top\tophalf\else\let\top\notophalf\fi
181 \beginchart\X{\hfill\llap{\textbf{#1, \large#2}\label{fonttable:#1}}}\normalchart
182 \endchart\par\vfill
183 \fi}
184 \makeatother
187 \setcounter{tocdepth}{3}
189 \title{\LaTeX{} font encodings}
191 \author{Frank Mittelbach \and Robin
192 Fairbairns \and Werner Lemberg \and \LaTeX3 Project Team.}
194 \date{\copyright~Copyright 1995--2014 \\[5pt] 29 October 2014}
196 \begin{document}
198 \maketitle
200 \tableofcontents
202 \section{Introduction}
204 This document explains the ideas that underpin \LaTeX{} font
205 encodings and the constraints that apply when defining a new encoding; it
206 also lists the encodings that have already been defined.
208 \subsection{Encodings in \TeX{}}
210 \TeX{} (the program) implicitly recognises three sorts of encoding,
211 and all are (in a sense) discussed in the \TeX{}book~\cite{A-W:DKn86}:
212 \begin{itemize}
213 \item[1.] The input encoding, which specifies the meanings of characters
214 in files presented to \TeX{} for processing. The \TeX{}book
215 suggests that `your version of \TeX{} will recognise the characters
216 you type on your keyboard' (\TeX{} the program has provision for
217 static translations of input characters).
218 \end{itemize}
219 Such direct use of \TeX{}'s facilities is not the way modern
220 \LaTeX{} (or indeed any other \TeX{} macro package) is likely to deal
221 with input encodings. This document does not address the topic of
222 input encodings; the interested reader should examine the \LaTeX{}
223 base package \Pkg{inputenc} \cite[sec.~7.5.2, p.~357]{A-W:MG2004}.
224 \begin{itemize}
225 \item[2.] The token stream that \TeX{} processes internally. This stream
226 of \TeX{}'s consciousness is discussed in great detail in the
227 \TeX{}book.
228 \end{itemize}
229 Again, this document does not address the topic. \LaTeX's internal
230 character representation (\textsc{licr}) is well discussed in
231 \cite[sec.~7.11.2, p.~442]{A-W:MG2004}.
232 \begin{itemize}
233 \item[3.] The font encoding---i.e., the mapping of character codes to
234 glyphs in the fonts that are used to typeset \TeX{}'s output.
235 Again, a set of font encodings is enumerated in the \TeX{}book, but
236 that set has proved inadequate to the needs of modern multilingual
237 use of \LaTeX.
238 \end{itemize}
239 This document explains \emph{why} Knuth's original set of encodings is
240 inadequate to modern conditions, and discusses the issues that
241 surround the design and definition of new font encodings.
243 Font encodings are important for more than their r\^{o}le in mapping the
244 glyphs of the fonts to be used for typesetting: their glyph tables are
245 also the context in which \TeX{}'s hyphenation algorithm operates.
246 There are constraints imposed by \TeX{} that affect the way in which
247 new font encodings, for use in a multi-lingual environment, may be
248 structured (see section~\ref{sec:restrictions} for details).
250 \subsection{The history of \TeX{} font encodings}
252 Little attention was paid to font encodings prior to the arrival of
253 \TeX{}\,3. Up to that time, one used Donald Knuth's fonts (the
254 Computer Modern family, using the encodings we now refer to as \Enc{OT1} and
255 the \Enc{OM} series), or one was on one's own.
257 The Computer Modern text encoding raises problems in unmodified
258 \TeX{}, because hyphenation cannot break words containing
259 \verb"\accent" commands. Even in those Western European languages for
260 which the \Enc{OT1} encoding has symbols for the necessary
261 \verb"\accent"-based diacritics, this shortcoming ruins typesetting of
262 running text.
264 With the advent of \TeX{}\,3, with its ability to switch between
265 hyphenation pattern sets, it was clear that the situation could not
266 continue. Thus a group at the TUG Annual General Meeting in Cork,
267 Ireland, specified a uniform encoding for 256-glyph fonts, that
268 contains accented letters and non-\textsc{ascii} letters necessary to
269 express most Western European languages (and some Eastern European ones)
270 without recourse to the \verb"\accent" command.
272 This ``Cork'' encoding has since been realised in a series of fonts
273 designed with Metafont, in at least one font series that is available
274 both in Adobe Type 1 format and in OpenType format, % viz., Latin Modern
275 and in a number of virtual-font mappings of other font series.
277 Since the time of the Cork meeting, much effort has been devoted to
278 the design of encodings for text fonts to use with \TeX{}, and the
279 Cork encoding influenced the design of many such encodings.
281 Encodings for mathematical fonts have, in contrast, changed little
282 since Knuth's contributions. A TUG Technical Working Group was
283 established at the Cork meeting, whose aim was to define a set of
284 256-glyph encodings to regularise and extend Knuth's originals, using
285 ideas from several other fonts that had appeared since, and from the
286 known needs of researchers in mathematics and the mathematical sciences.
288 Independently, a first proposal (the so-called \emph{Aston proposal}) was worked
289 out by Justin Ziegler together with Frank Mittelbach and other members of the
290 \LaTeX3 project team~\cite{ziegler}. A first implementation of
291 this proposal was realized by Matthias Clasen and Ulrik
292 Vieth~\cite{clasen,clasen-vieth}.
294 However, the slow progress of these Mathematical encodings has been
295 overtaken by the addition (in the last decade or so) of a large number
296 of mathematical symbols to Unicode~\cite{beeton}; one can expect
297 further changes so that new public mathematical font encodings will
298 most likely be delayed still further.
302 \subsection{Further information}
304 For a general introduction to \LaTeX, including the new features of
305 \LaTeXe, you should read \emph{\LaTeXbook},
306 Leslie Lamport, Addison Wesley, 2nd~ed, 1994.
308 A more detailed description of the new features of \LaTeX, including an
309 overview of more than 200 packages and nearly 1000 ready to run examples, is
310 to be found in \emph{\LaTeXcomp{} second edition} by Frank Mittelbach and
311 Michel Goossens~\cite{A-W:MG2004}.
313 The \LaTeX{} project sponsored a report on Mathematical % spelt out in full
314 font encodings, which
315 is worth reading for its insight into the problems of defining the way
316 in which math is used: see~\cite{ziegler,clasen,clasen-vieth}.
318 The \LaTeX{} font selection scheme is based on \TeX, which is described
319 by its developer in \emph{The \TeX book}, Donald E.~Knuth, Addison
320 Wesley, 1986, revised in 1991 to include the features of \TeX~3.
322 For more information about \TeX{} and \LaTeX, please contact your local
323 \TeX{} Users Group, or the international \TeX{} Users Group
324 (\url{http://www.tug.org}).
328 \section{Existing font encodings}
330 This section lists the encodings currently assigned; for each
331 encoding, we list the registered (\LaTeX{}) name, the assigned purpose
332 of the encoding, and the author. Further details may list the code
333 positions used in the encoding, the \emph{variable slots} (see below),
334 an example font (for which a listing will be provided later in the
335 document if the relevant fonts are present), and a source for further
336 reference.
338 While the characteristic feature of an encoding is that each font
339 encoded according to the encoding should have the same glyph set,
340 there are some encodings (notably \Enc{OT1} and its descendants) in
341 which a few glyph code slots differ in their contents in different
342 fonts.
344 \subsection{Naming conventions}
346 Names for encoding schemes are strings of up to three letters (all
347 upper case) plus digits.
349 The \LaTeX3 project reserves the use of encoding names starting with the
350 following letters: |T| (standard 256-long text encodings), |TS|
351 (symbols that are designed to extend a corresponding |T| encoding),
352 |X| (text encodings that do not conform to the strict requirements for
353 |T| encodings), |M| (standard 256-long mathematical encodings), |S| (other
354 symbol encodings), |A| (other special applications), |OT| (standard
355 128-long text encodings), and |OM| (standard 128-long mathematical encodings).
357 Please do not use the above starting letters for non-portable
358 encodings. If new standard encodings emerge then we shall add them in
359 a later release of \LaTeX.
361 Encoding schemes which are local to a site or a system should start
362 with |L|, experimental encodings intended for wide distribution will
363 start with |E|, whilst |U| is for Unknown or Unclassified encodings.
365 \begin{quote}
366 \itshape We recommend that new encoding names should not be
367 introduced unless careful consideration and discussion in the user
368 community has confirmed the need for the encoding. If encodings have to
369 change from font to font, a number of problems arise, so it is best to
370 develop encodings that can be used with a large number of fonts in parallel.
371 This allows documents to be typeset using different fonts without problems.
373 The \Enc{TS1} encoding is a good example of a \emph{bad} encoding (even
374 though it was developed with the best intentions) as a huge number of fonts
375 can only implement parts of it. Similarly, the fact that the few sets of
376 available mathematical fonts (beside Computer Modern Math) nearly
377 all implement slightly different encodings is a huge source of
378 problems. Don't add to this if possible!
379 \end{quote}
382 \subsection{128$^+$ glyph encodings (text)}
384 The `OT' series of font encodings start with Donald Knuth's original
385 text encoding, that used for the text fonts in the earliest releases
386 of \TeX{} itself. The `O' of the encoding designator may be taken as
387 signifying `original', or just `old'.
389 \begin{encodinginfo}{OT1}
390 {\TeX{} text}
391 {Donald Ervin Knuth}
392 {0x00--0x7F}
393 {0x0B--0x0F, 0x24, 0x3C, 0x3E, 0x5C, 0x7B--0x7D}
394 % {0X--'177}
395 % {'13--'17, '44, '74, '76, '134, '173--'175}
396 {cmr10}
397 {\cite[p.427]{A-W:DKn86}}
399 Donald Knuth designed his font encoding (and hence his fonts) in a
400 very different environment from that which now pervades the \TeX{}
401 world: his (mainframe) computer had very little memory, there was
402 little experience in (or demand for) multilingual technical
403 typesetting, and as a result it was appropriate to sacrifice
404 uniformity for efficiency.
406 Thus Knuth's original fonts differ slightly in some encoded slots:
407 for example, the glyphs \texttt{\string<}, \texttt{\string>},
408 \verb=\=, \verb={=, and \verb=}= are only available in the
409 typewriter fonts and the \textdollar{} and \textsterling{} signs
410 share the same position (in different font shapes).
412 This means that direct selection of these slots can produce
413 unpredictable results, e.g., typing \texttt{\string<} or
414 \verb=\symbol{'74}= in a document can yield `\textquestiondown'.
415 \end{encodinginfo}
418 \begin{encodinginfo}{OT2}
419 {UW cyrillic encoding}
420 {University of Washington}
421 {0x00--0x7F}
422 {---}
423 {wnr10}
424 {\cite{Beeton:TB6-3-124}}
425 Support for this encoding is available in the Cyrillic bundle although for
426 all practical purposes it is better to use one of the \Enc{T2} encodings.
427 \end{encodinginfo}
430 \begin{encodinginfo}{OT3}
431 {UW IPA encoding}
432 {University of Washington}
433 {0x00--0x7F}
434 {---}
435 {wsuipa10}
436 {\cite[p.149]{CorkGW:91}}
437 The \Enc{OT3} encoding was never really used with \LaTeXe{}
438 following the introduction of the TIPA system which offers much
439 better support for IPA. In particular, no \File{ot3enc.def}
440 file was ever produced.
441 \end{encodinginfo}
444 \begin{encodinginfo}{OT4}
445 {Polish text encoding}
446 {B.~Jackowski and M.~Ry\'cko} %% ? Marcin Woli\'nski
447 {0x00--0x7F, 0x81, 0x82, 0x86, 0x8A, 0x8B, 0x91, 0x99, 0x9B, 0xA1,
448 0xA2, 0xA6, 0xAA, 0xAB, 0xAE, 0xAF, 0xB1, 0xB9, 0xBB, 0xD3, 0xF3,
449 0xFF}
450 {0x0B--0x0F, 0x24, 0x3C, 0x3E, 0x5C, 0x7B--0x7D}
451 {plr10}
452 {---}
454 While Knuth included the means of typesetting the `lost L' (\L) in
455 his \Enc{OT1} encoding, he omitted the ogonek (\,\,\k{}), a diacritic
456 mark that is also needed in Polish text; hence the appearance, well
457 before the \Enc{T1} encoding, of fonts using this encoding.
458 \end{encodinginfo}
460 \begin{encodinginfo}{OT5}
461 {Not currently allocated}
462 {---}
463 {---}
464 {---}
465 {}
466 {---}
468 \end{encodinginfo}
472 \begin{encodinginfo}{OT6}
473 {Armenian text encoding}
474 {Serguei Dachian}
475 {0x03--0x0F, 0x13--0x7F}
476 {---}
477 {artmr10}
478 {---}
480 This encoding was allocated to permit use of Dachian's
481 Armenian fonts in a standard \LaTeX{} environment.
483 Because of license issues the \texttt{artmr} fonts are not necessarily
484 included in distributed \TeX{} installations (and for this reason the
485 corresponding encoding table is not shown below). However, the fonts
486 and the support macros can be found on the CTAN archives (look for
487 \texttt{armtex}).
489 \end{encodinginfo}
493 \subsection{256 glyph encodings (text)}
495 \begin{encodinginfo}{T1}
496 {Cork encoding}
497 {Euro \TeX{} conference at Cork}
498 {0x00--0xFF}
499 {---}
500 {ecrm1000}
501 {\cite[p.514]{tub:MFe90}, \cite[p.99]{Knappen:TB17-2-96}}
503 The Cork encoding was developed so that advantage could be taken of
504 the (then) new facilities of \TeX{}\,3, allowing hyphenation of
505 most Western European (and some Eastern European) languages in an
506 unmodified version of \TeX{}.
508 The encoding was developed in the absence of any extant effort at
509 font design, but instances written in Metafont (the `EC' fonts), and
510 more recently Adobe Type 1 instances of the same fonts have become
511 available.
513 Substantial (but incomplete) instances have also been developed,
514 which use virtual fonts. These latter instances map either Knuth's
515 original (OT1-encoded) fonts, or commercial fonts that contain the
516 Adobe `standard' set of 224 glyphs.
517 \end{encodinginfo}
519 \begin{encodinginfo}
520 {T2A, T2B, T2C}
521 {Cyrillic encodings}
522 {The CyrTUG font team}
523 {0x00--0xFF}
524 {--- (within each encoding)}
525 {larm1000}
526 {\cite{Berdnikov:eurotex-98}}
528 There are too many glyphs in the full Cyrillic complement of
529 languages for all of them to be covered by a single
530 \LaTeX{}-compliant encoding (the lower half of each
531 \Enc{T2}~encoding is identical to that of \Enc{T1}, in order that
532 each should be a conforming \LaTeX{} encoding~--- see
533 section~\ref{sec:restrictions}). The approach taken is
534 therefore to develop a single encoding, \Enc{X2} (see section~\ref{sec:extendedenc})
535 which contains all the glyphs needed for the full set of
536 languages, and then to derive the three \LaTeX{}-compliant
537 \Enc{T2}-family encodings using the \Enc{X2} set together with that of
538 \Enc{T1}.
540 \end{encodinginfo}
544 \begin{encodinginfo}{T3}
545 {IPA encoding}
546 {FUKUI Rei, University of Tokyo}
547 {0x00--0xFF}
548 {---}
549 {tipa10}
550 {\cite[p.102]{Rei:TB17-2-102}}
553 The \Enc{T3} encoding (and associated macros) provides the glyphs required
554 in phonetic description according to current International Phonetic
555 Association recommendations \cite{ipa}.
557 The \Enc{T3} encoding does \emph{not fulfil} the requirements for \Enc{T}
558 encodings---the name is a historical accident. The correct name would be
559 \Enc{X3}, but due to the fact that this font family has been used under its
560 current encoding name for a long time, the name will not change for
561 compatibility reasons.
563 \end{encodinginfo}
567 \begin{encodinginfo}{T4}
568 {African Latin (fc)} % public name
569 {J\"org Knappen} % author name
570 {0x00--0xFF} % range(s) of slots used for glyphs
571 {0x24} % range(s) of slots with variable glyphs if any
572 {fcr10} % name of an example font
573 {\cite{tub:JKn93}}
575 The African Latin fonts contain in their lower half (0--127) the same
576 characters as the European Latin (T1-encoded) Fonts, while in their
577 upper half (128--255) they
578 contain letters and symbols for African languages that use extended
579 Latin alphabets.
580 Due to lack of space, J\"org had to play the unfortunate trick of
581 assigning \verb=\textdollar= and \verb=\textsterling=
582 the same position; users should take these characters
583 from the text companion font, if they are needed. Instead of defining
584 a lot of new control sequences for the single letters, there are three
585 accent-like control sequences with general purpose:
586 \verb=\m= (Modified-1),
587 \verb=\M= (Modified-2) and
588 \verb=\B= (Barred).
589 Most standard \LaTeX{} encoding-dependent commands
590 work. However, the Icelandic special letters are not available and `best
591 replacements' for \verb=\Th=, \verb=\th=, and \verb=\dh=
592 are used (barred T and d resp.).
593 \end{encodinginfo}
596 \begin{encodinginfo}{T5}
597 {Vietnamese encoding}
598 {Werner Lemberg and
599 Vladimir Volovich}
600 {0x00--0xFF}
601 {---}
602 {vnr10}
603 {\cite{vnr}}
605 The \Enc{T5} encoding was developed for Vietnamese. Again, this encoding
606 \emph{does not} conform to the requirements for a \Enc{T}-encoding
607 because its large number of accented letters prevent the \verb=\lccode= and
608 \verb=\uccode= mapping requirements for \Enc{T} encodings from being
609 fulfilled. However, since the Vietnamese language does not
610 use word division in typesetting, this requirement is
611 actually not important for this particular language.
612 Since every glyph used in Vietnamese text is internally
613 represented as \textsc{licr} macros, the commands \verb=\MakeUppercase= and
614 \verb=\MakeLowercase= still work as expected (as they change the case of the
615 \textsc{ascii} characters in \textsc{licr} definitions).
617 \end{encodinginfo}
619 \begin{encodinginfo}
620 {T6}
621 {Armenian}
622 {---}
623 {---}
624 {---}
625 {}
626 {---}
628 This encoding is reserved to permit future expansion of Armenian
629 \TeX{} to use 256-character (hyphenatable) fonts.
630 \end{encodinginfo}
632 \begin{encodinginfo}{T7}
633 {Greek encoding}
634 {---}
635 {---}
636 {---}
637 {}
638 {---}
640 The name is already reserved for a 256 glyph Greek encoding. The encoding
641 itself hasn't been defined so far.
643 \end{encodinginfo}
647 \subsection{256$^-$ glyph encodings (text symbols)}
649 \begin{encodinginfo}{TS1}
650 {Text Companion encoding (Cork)}
651 {J\"org Knappen}
652 {0x00--0x0D, 0x12, 0x15, 0x16, 0x18--0x1D, 0x20, 0x24, 0x27, 0x2A,
653 0x2C--0x3A, 0x3C--0x3E, 0x4D, 0x4F, 0x57, 0x5B, 0x5D--0x60,
654 0x62--0x64, 0x6C--0x6E, 0x7E--0xBF, 0xD6, 0xF6}
655 {---}
656 {tcrm1000}
657 {\cite{Knappen:TB17-2-96}}
659 The text symbol encoding offers access to symbolic glyphs that are
660 commonly used in text (for a variety of reasons), and whose style
661 should vary with the text that surrounds them.
663 Unfortunately, the \Enc{TS1} encoding was developed without
664 reference to the glyphs available in existing commercial fonts.
665 As a result, only font families
666 explicitly developed for \TeX{} (i.e., typically originating with
667 \MF{}) actually contain all glyphs required by the \Enc{TS1}
668 encoding. Most other font families (whether free or commercial)
669 often only provide half of the set%
671 %% don't show the comment if the tables are not generated
673 \expandafter\ifx\csname r@fonttable:tcrm1000\endcsname\relax
674 \else
675 \expandafter\ifx\csname r@fonttable:ptmr8c\endcsname\relax
676 \else
677 \space (compare the two tables for \Enc{TS1} on
678 pages~\pageref{fonttable:tcrm1000}
679 and~\pageref{fonttable:ptmr8c})%
680 \fi
681 \fi.
682 To improve this situation somewhat, NFSS provides a way to define encoding
683 subsets on a per family basis in the \Pkg{textcomp} package (which
684 package offers support for the \Enc{TS1} encoding).
685 \end{encodinginfo}
688 \begin{encodinginfo}{TS3}
689 {IPA symbol encoding}
690 {FUKUI Rei, University of Tokyo}
691 {0x00--0x0A, 0x20--0x49, 0x50--0x56, 0x70--0x7B}
692 {---}
693 {tipx10}
694 {\cite{Rei:TB17-2-102}}
696 The \Enc{TS3} encoding (together with the \Enc{T3} encoding) provides the
697 glyphs for typesetting phonetic transcriptions following the
698 guidelines of the International Phonetic Association \cite{ipa}. Support
699 is offered through the \Pkg{tipa} package.
700 \end{encodinginfo}
705 \subsection{256 glyph encodings (text extended)}
706 \label{sec:extendedenc}
708 \begin{encodinginfo}
709 {X2}
710 {Cyrillic glyph container}
711 {The CyrTUG font team}
712 {0x00--0xFF}
713 {---}
714 {rxrm1000}
715 {\cite{Berdnikov:eurotex-98}}
717 This encoding specifies the glyph container for Cyrillic characters,
718 which is used in specifying the \Enc{T2A}, \Enc{T2B} and \Enc{T2C} encodings.
719 \end{encodinginfo}
724 \subsection{128$^+$ glyph encodings (mathematics)}
727 \begin{encodinginfo}{OML}
728 {\TeX{} math italic}
729 {Donald Ervin Knuth}
730 {0x00--0x7F}
731 {---}
732 {cmmi10}
733 {\cite[p.430]{A-W:DKn86}}
735 The \Enc{OML} encoding contains italic Latin and Greek letters for
736 use in mathematical formulas (typically used for variables) together
737 with some symbols.
739 \end{encodinginfo}
741 \begin{encodinginfo}{OMS}
742 {\TeX{} math symbol}
743 {Donald Ervin Knuth}
744 {0x00--0x7F}
745 {---}
746 {cmsy10}
747 {\cite[p.431]{A-W:DKn86}}
749 The \Enc{OMS} encoding contains basic mathematical symbols,
750 together with an uppercase ``calligraphic'' Latin alphabet.
751 \end{encodinginfo}
754 \begin{encodinginfo}{OMX}
755 {\TeX{} math extension}
756 {Donald Ervin Knuth}
757 {0x00--0x7F}
758 {---}
759 {cmex10}
760 {\cite[p.432]{A-W:DKn86}}
762 \Enc{OMX} encodes mathematical symbols with variable sizes, such as
763 the $\sum$ sign, which changes its size if used in displayed
764 formulas, and the construction parts for
765 brackets, braces and radicals, etc., which can stretch to accommodate
766 the thing they're enclosing.
768 \end{encodinginfo}
773 \subsection{256 glyph encodings (mathematics)}
775 So far there are no 256 glyph mathematical encodings. A proposal is
776 given in \cite{ziegler}.
779 \subsection{Other encodings}
781 \begin{encodinginfo}
782 {C..}
783 {CJK encodings}
784 {Werner Lemberg}
785 {0x00--0xFF}
786 {---}
787 {} % no font, of course
788 {\cite{CJK}}
790 The \Pkg{CJK} package defines a number of encodings which access Chinese,
791 Japanese and Korean fonts.
793 \end{encodinginfo}
795 \begin{encodinginfo}
796 {E..}
797 {Experimental encodings}
798 {---}
799 {0x00--0xFF}
800 {all}
801 {}
802 {\cite[p.416]{A-W:MG2004}}
804 As the name indicates, encodings starting with the letter \Enc{E} are
805 intended for experimental encodings, that are still likely to change.
806 \end{encodinginfo}
808 \begin{encodinginfo}{L..}
809 {Local encoding (site dependent)}
810 {---}
811 {0x00--0xFF}
812 {all}
813 {}
814 {\cite[p.416]{A-W:MG2004}}
816 `Local' encodings provide the means to develop representation
817 techniques that are suited to a particular \TeX{} environment. While
818 the developer has freedom to specify their encoding as he or she
819 pleases, there is a strong incentive to obey the \LaTeX{} rules for
820 encodings, since it will otherwise be difficult to compose text using
821 the encoding.
823 At least it was the intention that \Enc{L..} encodings are local and
824 site dependent. However, a number of such encodings became generally
825 used without ever getting a different name allocated.
827 \end{encodinginfo}
831 \begin{encodinginfo}{LY1}
832 {Y\&Y 256 glyph encoding}
833 {Berthold Horn}
834 {0x00--0x08, 0x0C, 0x10, 0x12--0xFF}
835 {\emph{believed none}}
836 {ptmr8y}
837 {\cite[p.416]{A-W:MG2004}}
839 This is an alternative to the \Enc{T1} encoding developed by Y\&Y and
840 used in their commercial \TeX{} implementation.
842 \end{encodinginfo}
845 \begin{encodinginfo}{LV1}
846 {MicroPress encoding}
847 {Michael Vulis}
848 {\emph{unknown}}
849 {\emph{unknown}}
850 {}
851 {\cite[p.416]{A-W:MG2004}}
853 This is an encoding developed by MicroPress and used for some of their
854 fonts.
856 \end{encodinginfo}
859 \begin{encodinginfo}{LGR}
860 {Greek 256 glyph encoding}
861 {\emph{unknown}}
862 {0x00--0xFF}
863 {\emph{believed none}}
864 {grmn1000}
865 {\cite[p.575]{A-W:MG2004}}
867 Currently the main encoding in use for the Greek language.
869 This encoding doesn't conform to the restrictions for
870 \Enc{T}-encodings described in section~\ref{sec:restrictions} on
871 page~\pageref{sec:restrictions} as it doesn't have \textsc{ascii}
872 glyphs at all.
874 \end{encodinginfo}
877 \begin{encodinginfo}
878 {PD1}
879 {PDF DocEncoding}
880 {Adobe}
881 {0x08--0x0A, 0x0C, 0x0D, 0x18--0x7E, 0x80--0x9E, 0xA0--0xAE, 0xB0--0xFF}
882 {---}
883 {}
884 {\cite{Adobe:PDF-1.6}, \cite{hyperref}}
886 The \Enc{PD1} encoding is a virtual encoding with 256 glyphs needed to
887 produce bookmarks and similar text in PDF documents generated with pdf\LaTeX.
888 The encoding is ``virtual'' because by design there are no \TeX{}
889 fonts that cover \Enc{PD1}. Details can be found in appendix D.1
890 of~\cite{Adobe:PDF-1.6}.
891 \end{encodinginfo}
893 \begin{encodinginfo}
894 {PU}
895 {PDF Unicode Encoding}
896 {Adobe}
897 {---}
898 {---}
899 {}
900 {\cite{Adobe:PDF-1.6}, \cite{hyperref}}
902 Another virtual encoding (with more than 600 characters) for
903 Unicode-encoded bookmarks in PDF documents.
904 \end{encodinginfo}
906 \begin{encodinginfo}{U}
907 {Unknown encoding}
908 {---}
909 {potentially 0x00--0xFF}
910 {all}
911 {wasy10}
912 {\cite[p.416]{A-W:MG2004}}
914 This encoding should be used for fonts that resist classification,
915 e.g., when it is clear that there will never be more than one font
916 using the same encoding.
918 \end{encodinginfo}
922 \section{Restrictions}
923 \label{sec:restrictions}
926 \subsection{Required glyphs for general text encodings}
928 Encodings that are supposed to be used with \LaTeX{} for `general
929 purpose text fonts' need to have certain fixed glyphs in certain
930 encoding slots. A `general purpose text font' is one intended for
931 arbitrary natural language text and not just within special
932 environments (such as the phonetic alphabet) or just for typesetting
933 individual symbols (e.g., the text companion font with encoding
934 \Enc{TS1}).
936 This is the case for the following glyphs that have to be in their
937 \textsc{ascii} positions for general purpose text encodings:
938 \begin{center}
939 \begin{tabular}[t]{cc}
940 Glyph & Position \\ \hline
941 ! & \number`\! \\
942 ' & \number`\' \\
943 ( & \number`\( \\
944 ) & \number`\) \\
945 \relax* & \number`\* \\
946 + & \number`\+ \\
947 , & \number`\, \\
948 - & \number`\- \\
949 . & \number`\. \\
950 / & \number`\/ \\
951 0 \ldots\ 9 & \number`\0\ to \number`\9 \\
952 \end{tabular}
953 \quad
954 \begin{tabular}[t]{cc}
955 Glyph & Position \\ \hline
956 : & \number`\: \\
957 ; & \number`\; \\
958 = & \number`\= \\
959 ? & \number`\? \\
960 @ & \number`\@ \\
961 A \ldots\ Z & \number`\A\ to \number`\Z \\
962 \relax[ & \number`\[ \\
963 ] & \number`\] \\
964 ` & \number`\` \\
965 a \ldots\ z & \number`\a\ to \number`\z \\
966 \end{tabular}
967 \quad
968 \begin{tabular}[t]{cc}
969 Glyph\footnotemark & Position \\ \hline
970 < & \number`\< \\
971 > & \number`\> \\
972 \string| & \number`\| \\
973 \end{tabular}\footnotetext{The requirement for these three glyphs is
974 violated in the Latin alphabet \Enc{OT} encodings.}
975 \end{center}
976 In addition the following glyphs have to be present
977 somewhere\footnote{The position in this case is not important as they
978 are generated from ligature programs.} in the encoding together with
979 corresponding ligature programs to generate them:
980 \begin{center}
981 \begin{tabular}[t]{cc}
982 Glyph & Ligature program \\ \hline
983 `` & \texttt{`\/`} \\
984 '' & \texttt{'\/'} \\
985 -- & \texttt{-\/-} \\
986 --- & \texttt{-\/-\/-} \\
987 \end{tabular}
988 \end{center}
990 This is $33 + 2 \times 26 = 85$ positions ``required'', which leaves 171
991 positions free.
993 If there are free slots available then adding all or some of the
994 diacritics would be the best way to fill them.
996 If there are insufficient slots for the characters needed, a possible
997 technique is to create a subsidiary encoding, and to move non-letter
998 characters to it. Since only ``letters'' take part in the hyphenation
999 algorithm, this technique doesn't affect the appearance of the typeset
1000 result.
1002 \subsection{The constraints on upper/lower case tables}
1004 Due to some technical restrictions of \TeX{} related to hyphenation it
1005 is not possible in \LaTeX{} to use more than one \verb=\lccode= or
1006 \verb=\uccode= table. Therefore all encodings need to share these two
1007 tables which are defined to be those of the \Enc{T1} encoding.
1009 The \Enc{T1} encoding has some nasty peculiarities which make certain slot
1010 positions more or less unusable for other encodings if this
1011 restriction is to be obeyed. This is unfortunate but since \Enc{T1} is well
1012 established and the basis for a large number of languages it seemed
1013 better to live with this situation instead of trying to replace \Enc{T1} with a
1014 slightly better standard (with the result that for a long time
1015 different \LaTeX{} installations would not be able to communicate with
1016 each other because of incompatible font sets).
1018 The positions that are problematic are as follows.
1019 \begin{center}
1020 \begin{tabular}{lp{.8\linewidth}}
1021 25 (\char 25) & uppercase maps strangely (same as for 105, \char 105)\\
1022 26 (\char 26) & uppercase maps strangely (same as for 106, \char 106)\\
1023 27 (\char 27) & lowercase maps to itself which makes this slot subject
1024 to hyphenation (used to support \Enc{OT1} encoding) \\
1025 157 (\char 157) & lowercase maps strangely (same as for 73, \char 73) \\
1026 158 (\char 158) & uppercase maps strangely (same as for 240, \char 240) \\
1027 \end{tabular}
1028 \end{center}
1029 One way to use such slots is to fill them with ligature glyphs as
1030 \TeX{} will not consult these tables for glyphs constructed through
1031 ligature programs but instead uses the entries for the individual
1032 glyphs used to produce the ligature.
1034 A complete listing of the uppercase/lowercase mapping tables is to be
1035 found in section~\ref{sec:uclc-tab} (page \pageref{sec:uclc-tab}).
% Scratch count registers for the case-table macros: \temp holds the slot
% being processed; \tempL and \tempU receive its \lccode and \uccode.
1037 \newcount\temp \newcount\tempL \newcount\tempU
% \nextstep: typesets one table row for the slot held in \temp --
% position, \lccode, \uccode and a glyph sample via \printlowerupper --
% then increments \temp and calls \stepprint to decide whether to continue.
% The assignments are \global so that they survive the cell boundaries
% of the surrounding tabular.
% \lctablenumbersize is not defined here; it has to be set by the caller
% (e.g. with \let) before the table is built.
1039 \def\nextstep{\global\tempL=\lccode\temp
1040 \global\tempU=\uccode\temp
1041 \lctablenumbersize\the\temp &
1042 \the\tempL&
1043 \the\tempU&\printlowerupper{\the\temp}{\the\tempL}{\the\tempU}\\
1044 \global\advance\temp by 1
1045 \stepprint}
% \printlowerupper{<slot>}{<lccode>}{<uccode>}: prints the glyph in <slot>
% followed by "(<lowercase glyph>/<uppercase glyph>)"; a code of 0 is
% shown as -- instead of a glyph.
1047 \def\printlowerupper#1#2#3{\char#1\relax
1048 (\ifnum#2=0\relax--\else\char#2\fi
1049 /\ifnum#3=0\relax--\else\char#3\fi)}
% \stepprint: loop test for the table rows. While \temp is still below
% \endval it selects \nextstep (which prints a row and calls \stepprint
% again); otherwise it selects \relax, which ends the loop. \next is
% invoked only after the conditional has been closed with \fi, so
% \nextstep never runs inside an open \ifnum.
1051 \def\stepprint{\relax\ifnum\temp<\endval
1052 \let\next=\nextstep
1053 \else
1054 \let\next=\relax
1055 \fi
1056 \next}
% \dolctable{<first>}{<end>}: typesets one four-column table
% (pos, lc, uc, glyphs) with a row for every slot from <first> up to,
% but not including, <end> (the loop in \stepprint runs while \temp<\endval).
% The outer brace pair confines the \endval definition and the reduced
% \tabcolsep to this table.
1058 \def\dolctable#1#2{{\temp=#1\relax
1059 \def\endval{#2}%
1060 \setlength\tabcolsep{1.5pt}%
1061 \begin{tabular}[t]{@{}cccc@{}}
1062 pos&lc&uc&glyphs\\\hline
1063 \stepprint
1064 \end{tabular}}}
% Disabled variant of the case tables: five tables side by side in \tiny
% size, separated by vertical rules. The conditional below must be closed
% by its own \fi, otherwise TeX keeps skipping the rest of the document.
1066 \iffalse
1067 \begin{center}
1068 \tiny\let\lctablenumbersize\tiny
1069 \mbox{\dolctable{0}{52}\vrule
1070 \dolctable{52}{104}\vrule
1071 \dolctable{104}{156}\vrule
1072 \dolctable{156}{208}\vrule
1073 \dolctable{208}{256}}
1074 \end{center}
1075 \fi
% Disabled variant of the case tables: four tables side by side in \tiny
% size, separated by vertical rules. The conditional below must be closed
% by its own \fi, otherwise TeX keeps skipping the rest of the document.
1077 \iffalse
1078 \begin{center}\tiny
1079 \mbox{\dolctable{0}{65}\vrule
1080 \dolctable{65}{128}\vrule
1081 \dolctable{128}{193}\vrule
1082 \dolctable{193}{256}}
1083 \end{center}
1084 \fi
1088 \section{Encoding specific commands}
1090 An encoding specific command is one that generates a glyph (or
1091 glyphs), to produce a graphic effect that may be implemented
1092 differently in different encodings. The encoding specific command
1093 automatically changes its implementation when the encoding changes in
1094 the course of the document. Encoding specific commands figure in
1095 \LaTeX's internal character representation (\textsc{licr}) and are also
1096 discussed in \cite[sec.~7.11.2, p.~442]{A-W:MG2004}.
1098 The following table only covers the encoding specific commands from
1099 the \Enc{OT1} and \Enc{T1} encodings. Other encodings may specify
1100 additional encoding specific commands. In the table, the first 16
1101 commands are `accent-like' and need as an argument the character to be
1102 accented. For example, |\v{c}| is the \textsc{licr} for `\v{c}'.
1104 \begin{tabbing}
1105 \ttverb\textvisiblespace\quad\=bbbbbbbbbbbbbb\=b'b'\=ccccccccccc\kill
1106 \ttverb\`{} \>OT1,T1\> \a`{}\> (grave) \\
1107 \ttverb\'{} \>OT1,T1\> \a'{}\> (acute) \\
1108 \ttverb\^{} \>OT1,T1\> \^{}\> (circumflex) \\
1109 \ttverb\~{} \>OT1,T1\> \~{}\> (tilde) \\
1110 \ttverb\"{} \>OT1,T1\> \"{}\> (umlaut) \\
1111 \ttverb\H{} \>OT1,T1\> \H{}\> (Hungarian umlaut) \\
1112 \ttverb\r{} \>OT1,T1\> \r{}\> (ring) \\
1113 \ttverb\v{} \>OT1,T1\> \v{}\> (ha\v{c}ek) \\
1114 \ttverb\u{} \>OT1,T1\> \u{}\> (breve) \\
1115 \ttverb\t{} \>OT1,T1\> \t{}\> (tie) \\
1116 \ttverb\={} \>OT1,T1\> \a={}\> (macron) \\
1117 \ttverb\.{} \>OT1,T1\> \.{}\> (dot) \\
1118 \ttverb\b{} \>OT1,T1\> \b{}\> (underbar) \\
1119 \ttverb\c{} \>OT1,T1\> \c{}\> (cedilla) \\
1120 \ttverb\d{} \>OT1,T1\> \d{}\> (dot under) \\
1121 \ttverb\k{} \>T1 \> \k{}\> (ogonek) \\
1122 % \ttverb\AA \>OT1,T1\> \AA \> \\ % no longer
1123 \ttverb\AE \>OT1,T1\> \AE \> \\
1124 \ttverb\DH \>T1 \> \DH \> \\
1125 \ttverb\DJ \>T1 \> \DJ \> \\
1126 \ttverb\L \>OT1,T1\> \L \> \\
1127 \ttverb\NG \>T1 \> \NG \> \\
1128 \ttverb\OE \>OT1,T1\> \OE \> \\
1129 \ttverb\O \>OT1,T1\> \O \> \\
1130 \ttverb\SS \>OT1,T1\> \SS \> \\
1131 \ttverb\TH \>T1 \> \TH \> \\
1132 % \ttverb\aa \>OT1,T1\> \aa \> \\ no-longer
1133 \ttverb\ae \>OT1,T1\> \ae \> \\
1134 \ttverb\dh \>T1 \> \dh \> \\
1135 \ttverb\dj \>T1 \> \dj \> \\
1136 \ttverb\guillemotleft \>T1 \> \guillemotleft \> (guillemet) \\
1137 \ttverb\guillemotright \>T1 \> \guillemotright \> (guillemet) \\
1138 \ttverb\guilsinglleft \>T1 \> \guilsinglleft \> (guillemet) \\
1139 \ttverb\guilsinglright \>T1 \> \guilsinglright \> (guillemet) \\
1140 \ttverb\i \>OT1,T1\> \i \> \\
1141 \ttverb\j \>OT1,T1\> \j \> \\
1142 \ttverb\l \>OT1,T1\> \l \> \\
1143 \ttverb\ng \>T1 \> \ng \> \\
1144 \ttverb\oe \>OT1,T1\> \oe \> \\
1145 \ttverb\o \>OT1,T1\> \o \> \\
1146 \ttverb\quotedblbase \>T1 \> \quotedblbase \> \\
1147 \ttverb\quotesinglbase \>T1 \> \quotesinglbase \> \\
1148 \ttverb\ss \>OT1,T1\> \ss \> \\
1149 \ttverb\textasciicircum \>OT1,T1\> \textasciicircum \> \\
1150 \ttverb\textasciitilde \>OT1,T1\> \textasciitilde \> \\
1151 \ttverb\textbackslash \>OT1,T1\> \textbackslash \> \\
1152 \ttverb\textbar \>OT1,T1\> \textbar \> \\
1153 \ttverb\textbraceleft \>OT1,T1\> \textbraceleft \> \\
1154 \ttverb\textbraceright \>OT1,T1\> \textbraceright \> \\
1155 \ttverb\textcompwordmark \>OT1,T1\> \textcompwordmark\> (invisible) \\
1156 \ttverb\textdollar \>OT1,T1\> \textdollar \> \\
1157 \ttverb\textemdash \>OT1,T1\> \textemdash \> \\
1158 \ttverb\textendash \>OT1,T1\> \textendash \> \\
1159 \ttverb\textexclamdown \>OT1,T1\> \textexclamdown \> \\
1160 \ttverb\textgreater \>OT1,T1\> \textgreater \> \\
1161 \ttverb\textless \>OT1,T1\> \textless \> \\
1162 \ttverb\textquestiondown \>OT1,T1\> \textquestiondown\> \\
1163 \ttverb\textquotedbl \>T1 \> \textquotedbl \> \\
1164 \ttverb\textquotedblleft \>OT1,T1\> \textquotedblleft\> \\
1165 \ttverb\textquotedblright \>OT1,T1\> \textquotedblright\> \\
1166 \ttverb\textquoteleft \>OT1,T1\> \textquoteleft \> \\
1167 \ttverb\textquoteright \>OT1,T1\> \textquoteright \> \\
1168 \ttverb\textregistered \>OT1,T1\> \textregistered \> \\
1169 \ttverb\textsection \>OT1,T1\> \textsection \> \\
1170 \ttverb\textsterling \>OT1,T1\> \textsterling \> \\
1171 \ttverb\texttrademark \>OT1,T1\> \texttrademark \> \\
1172 \ttverb\textunderscore \>OT1,T1\> \textunderscore \> \\
1173 \ttverb\textvisiblespace \>OT1,T1\> \textvisiblespace\> \\
1174 \ttverb\th \>T1 \> \th \>
1175 \end{tabbing}
1177 \section{Encodings for Unicode based \TeX\ systems}
1178 \label{sec:unicode}
1180 The preceding text has assumed a classic \TeX\ system that is
1181 restricted to the use of fonts with at most 256 characters. In order
1182 to accommodate all the characters needed for different languages and
1183 mathematics it is necessary to have multiple encodings as described
1184 above, and \LaTeX\ needs to be aware of the encoding used for each
1185 font.
1187 Unicode aims to provide a single encoding that removes most of the
1188 need to switch encodings, apart from very specialist use for
1189 non-standard characters. Rather than assign codes in the range 0--255 (hex
1190 FF), Unicode codes are in the range 0--1,114,111 (hex 10FFFF), although
1191 not all slots are available for distinct characters for technical
1192 reasons. Unicode offers the possibility to use a single input encoding
1193 (usually UTF-8) for all documents and to use essentially the same
1194 Unicode encoding for all fonts, so removing the need to switch
1195 encodings in different contexts.
1197 Omega was perhaps the first widely used \TeX\ extension that
1198 supported Unicode. Currently the two actively supported systems that are
1199 present in most modern \TeX\ distributions are xe\TeX\ and lua\TeX.
1201 When used with these extended \TeX\ engines, \LaTeX's font system can
1202 refer to Unicode fonts (typically OpenType fonts installed system wide
1203 on your operating system rather than fonts specifically encoded for
1204 \TeX). Currently the usual method of accessing these fonts is through
1205 the contributed \Pkg{fontspec} package. This uses the two
1206 \emph{Experimental} encodings \Enc{EU1} (on xe\TeX) and \Enc{EU2} (on
1207 lua\TeX). Technically these two are the same encoding in terms of
1208 allocating characters to numbered positions, but two encodings have
1209 been specified due to some internal differences in font handling in
1210 the two extended \TeX\ engines. The exact rules for \LaTeX\ encodings
1211 for Unicode engines have not yet been finalised; however, it is
1212 possible that a single unified format can be used and so a single
1213 standardised name such as \Enc{UC} may be used. However, at the present
1214 time \Enc{EU1} and \Enc{EU2} should be used, although it is rare to
1215 need to specify these explicitly in a document as the \Pkg{fontspec}
1216 package sets up the correct encoding based on the engine in use.
1218 The restrictions described in section~\ref{sec:restrictions} do not
1219 apply, or need to be modified, in a Unicode based engine. Clearly the
1220 lowercase table (and hyphenation patterns) cannot be restricted to
1221 the values used for \Enc{T1}, which only refer to the first 256
1222 characters.
1224 When the \LaTeX\ format is made, \LaTeX\ sets up the lowercase table
1225 and classifies characters as letter or non-letter based on \Enc{T1} if
1226 a classic \TeX\ or pdf\TeX\ is being used. If a Unicode based \TeX\ is
1227 detected, the values are instead based on the classification and
1228 lower-case mappings provided by the Unicode Character Database
1229 \cite{ucd}. The relevant parts of these tables are converted to \TeX\
1230 syntax as \ttverb{ltunicode.ltx} as part of the \LaTeX\ distribution.
1232 Similarly in the default configuration files used by modern \TeX\
1233 distributions, the hyphenation files for each supported language are
1234 written in UTF-8 encoding, using Unicode code points for all letters;
1235 then, if a classic \TeX\ system is detected, some additional macros are
1236 loaded to convert these files to 256-character encodings where
1237 possible, and assuming the \Enc{T1} lowercase table.
1242 \begin{thebibliography}{99}
1243 \addcontentsline{toc}{section}{\numberline{\relax}\refname}
1246 \bibitem{Adobe:PDF-1.6} \emph{\textsc{PDF} reference}:
1247 Adobe portable document format version~1.6. Adobe Systems
1248 Incorporated, 2005. % why \textsuperscript{3}?
1249 \url{http://partners.adobe.com/public/developer/en/pdf/PDFReference16.pdf}.
1251 \bibitem{Beeton:TB6-3-124} Barbara Beeton:
1252 \emph{Mathematical symbols and cyrillic fonts ready for
1253 distribution}. In: TUGBoat, 6\#3, 1985.
1254 \url{http://tug.org/TUGboat/Articles/tb06-3/tb13beetcyr.pdf}.
1256 \bibitem{beeton} Barbara Beeton: \emph{Unicode
1257 and math, a combination whose time has come -- Finally!}. In:
1258 TUGBoat, 21\#3, 2000.
1259 \url{http://www.tug.org/TUGboat/Articles/tb21-3/tb68beet.pdf}.
1262 \bibitem{Berdnikov:eurotex-98} A.\@ Berdnikov, O.\@
1263 Lapko, M.\@ Kolodin, A.\@ Janishevsky and
1264 A.\@ Burykin: \emph{The Encoding Paradigm in
1265 \LaTeXe{} and the Projected X2 Encoding for Cyrillic Texts}.
1266 Euro\TeX~98.
1267 \url{http://www.gutenberg.eu.org/pub/GUTenberg/publicationsPDF/28-29-berdnikova.pdf}.
1269 \bibitem{CJK} \emph{The \Pkg{CJK} package}:
1270 \url{http://cjk.ffii.org}.
1272 \bibitem{clasen} Matthias Clasen: \emph{A new
1273 implementation of \LaTeX{} math}, 1997-98.
1274 \url{http://www.tug.org/twg/mfg/papers/current/newmath.ps.gz}.
1276 \bibitem{clasen-vieth} Matthias Clasen and Ulrik
1277 Vieth: \emph{Towards a new Math Font Encoding
1278 for (La)\TeX}. March 1998,
1279 \url{http://www.tug.org/twg/mfg/papers/current/mfg-euro-all.ps.gz}.
1281 \bibitem{CorkGW:91}
1282 Dean Guenther and Janene Winter.
1283 \newblock An international phonetic alphabet.
1284 \newblock In Guenther \cite{proc:MGu91}, pages 149--156.
1285 \newblock Published as {TUG}boat 12\#1.
1287 \bibitem{proc:MGu91}
1288 Mary Guenther, editor.
1289 \newblock {\em {\TeX} 90 Conference Proceedings}, March 1991.
1290 \newblock Published as {TUG}boat 12\#1.
1292 \bibitem{tub:MFe90}
1293 Michael~J. Ferguson.
1294 \newblock Report on multilingual activities.
1295 \newblock {\em {TUG}boat}, 11(4):514--516, 1990.
1297 \bibitem{fontinst} \emph{The \Pkg{fontinst} package}:
1298 \textlangle CTAN\textrangle\url{/fonts/utilities/fontinst}.
1300 \bibitem{Rei:TB17-2-102} Fukui Rei:
1301 \emph{\textsl{TIPA}: A system for processing phonetic
1302 symbols in \LaTeX}. In: TUGBoat, 17\#2, 1996.
1303 \url{http://www.tug.org/TUGboat/Articles/tb17-2/tb51rei.pdf}.
1305 \bibitem{hyperref} \emph{The \Pkg{hyperref} package}:
1306 \url{http://www.tug.org/applications/hyperref}.
1308 \bibitem{tub:JKn93}
1309 J\"org Knappen.
1310 \newblock Fonts for Africa: The fc Fonts.
1311 \newblock {\em {TUG}boat}, 14(2):104, 1993.
1313 \bibitem{Knappen:TB17-2-96} J\"org Knappen:
1314 \emph{The \Pkg{dc} fonts~1.3: Move towards stability
1315 and completeness}. In: TUGBoat 17\#2, 1996.
1316 \url{http://www.tug.org/TUGboat/Articles/tb17-2/tb51knap.pdf}.
1318 \bibitem{A-W:DKn86}
1319 Donald~E. Knuth.
1320 \newblock {\em The {\TeX}book}.
1321 \newblock Volume~A of {\em Computers \& {T}ypesetting\/},
1322 May 1989.
1323 \newblock Eighth printing.
1325 \bibitem{vnr} \emph{The \Pkg{vnr} font family}, developed by
1326 the author of pdf\TeX, {H\`an Th\^e\protect\llap{\raise 0.5ex\hbox{\'{\relax}}} Th\`anh}.
1327 \url{http://vntex.org/download/vntex}.
1329 \bibitem{ipa} Home page of the International Phonetic Association.
1330 \url{http://www.arts.gla.ac.uk/IPA/ipa.html}.
1332 \bibitem{A-W:LLa94}
1333 Leslie Lamport.
1334 \newblock {\em {\LaTeX:} A Document Preparation System}.
1335 \newblock Addison-Wesley, Reading, Massachusetts, second edition, 1994.
1337 \bibitem{LH-Fonts} \emph{The \Pkg{lh}-Fonts for Cyrillic}:
1338 \textlangle CTAN\textrangle\url{/fonts/cyrillic/lh}.
1340 \bibitem{A-W:MG2004}
1341 Frank Mittelbach and Michel Goossens.
1342 \newblock {\em The {\LaTeX} Companion second edition}.
1343 \newblock With Johannes Braams, David Carlisle, and Chris Rowley.
1344 \newblock Addison-Wesley, Reading, Massachusetts, 2004.
1346 \bibitem{Unicode} \emph{The Unicode Standard}.
1347 \url{http://unicode.org}.
1349 \bibitem{ucd} \emph{The Unicode Character Database}.
1350 \url{http://unicode.org/ucd}.
1352 \bibitem{ziegler} Justin Ziegler, \emph{Technical
1353 Report on Math Font Encodings}, June 1994,
1354 \url{http://www.tug.org/twg/mfg/papers/ltx3pub/l3d007.ps.gz}.
1356 \end{thebibliography}
1358 \clearpage\appendix
1359 \begin{center}
1360 \Large\bfseries Appendices
1361 \end{center}
1363 \section{Example code tables}
1365 This appendix contains a table of each font mentioned as an ``example''
1366 font above, providing that the font was available when the document
1367 was processed with \LaTeX{}. (\LaTeX{} generates a warning message
1368 for each font it fails to find.)
1370 \subsection{Text encodings}
1372 \ftable{cmr10}{OT1}
1374 \ftable{wnr10}{OT2}
1376 \ftable{wsuipa10}{OT3}
1378 \ftable{plr10}{OT4}
1380 %\ftable{artmr10}{OT6}
1382 \ftable{ecrm1000}{T1}
1384 \ftable{larm1000}{T2A}
1386 \ftable{lbrm1000}{T2B}
1388 \ftable{lcrm1000}{T2C}
1390 \ftable{tipa10}{T3}
1392 \ftable{fcr10}{T4}
1394 \ftable{vnr10}{T5}
1397 \subsection{Text symbol encodings}
1399 The full table for \Enc{TS1} as provided by the European Computer Modern family:
1400 \ftable{tcrm1000}{TS1}
1402 \pagebreak
1404 In contrast, typical PostScript fonts usually have incomplete implementations
1405 of \Enc{TS1}, sometimes missing more than half of the glyphs:
1407 \ftable{ptmr8c}{TS1}
1409 \ftable{tipx10}{TS3}
1413 \subsection{Extended text encodings}
1415 \ftable{rxrm1000}{X2}
1418 \subsection{Mathematical encodings}
1420 \ftable{cmmi10}{OML}
1422 \ftable{cmsy10}{OMS}
1424 \ftable{cmex10}{OMX}
1427 \subsection{Other encodings}
1429 \ftable{ptmr8y}{LY1}
1431 %%\ftable{????}{LV1}
1433 \ftable{grmn1000}{LGR}
1435 \ftable{wasy10}{U}
1436 \ftable{logo10}{U}
1438 \clearpage
1439 \section{Uppercase and lowercase tables}
1440 \label{sec:uclc-tab}
1442 The following two sets of tables list the \verb"\uppercase" and
1443 \verb"\lowercase" values for each position in the \LaTeX{} standard
1444 256-character tables.
1446 Each row of each table lists:
1447 \begin{quote}
1448 \begin{tabular}{lp{0.7\textwidth}}
1449 pos & The position in the table (0--255) \\
1450 lc & The value in the \verb"\lowercase" table at the position \\
1451 & (note that value 0 here means that \verb"\lowercase" is
1452 ineffective for this character, and hyphenation does not apply
1453 to it) \\
1454 uc & The value in the \verb"\uppercase" table at the position \\
1455 & (note that value 0 here means that \verb"\uppercase" is
1456 ineffective for this character) \\
1457 glyphs & The glyphs specified for the T1 encoding for this
1458 position, laid out as \meta{glyph}\textbf{(}\meta{lowercase
1459 glyph}\textbf{/}\meta{uppercase glyph}\textbf{)}
1460 \end{tabular}
1461 \end{quote}
1463 \begin{center}
1464 \let\lctablenumbersize\footnotesize
1465 \makebox[\textwidth]{\hss
1466 \dolctable{0}{32}\quad\dolctable{32}{64}\quad
1467 \dolctable{64}{96}\quad\dolctable{96}{128}%
1468 \hss}
1470 \makebox[\textwidth]{\hss
1471 \dolctable{128}{160}\quad\dolctable{160}{192}\quad
1472 \dolctable{192}{224}\quad\dolctable{224}{256}%
1473 \hss}
1474 \end{center}
1475 \end{document}
1478 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%