latex2e-20151001/doc/encguide.tex

   1 % \iffalse meta-comment
   2 %
   3 % Copyright 1993-2014
   4 % The LaTeX3 Project and any individual authors listed elsewhere
   5 % in this file.
   6 %
   7 % This file is part of the LaTeX base system.
   8 % -------------------------------------------
   9 %
  10 % It may be distributed and/or modified under the
  11 % conditions of the LaTeX Project Public License, either version 1.3c
  12 % of this license or (at your option) any later version.
  13 % The latest version of this license is in
  14 %    http://www.latex-project.org/lppl.txt
  15 % and version 1.3c or later is part of all distributions of LaTeX
  16 % version 2005/12/01 or later.
  17 %
  18 % This file has the LPPL maintenance status "maintained".
  19 %
  20 % The list of all files belonging to the LaTeX base distribution is
  21 % given in the file `manifest.txt'. See also `legal.txt' for additional
  22 % information.
  23 %
  24 % The list of derived (unpacked) files belonging to the distribution
  25 % and covered by LPPL is defined by the unpacking scripts (with
  26 % extension .ins) which are part of the distribution.
  27 %
  28 % \fi
  29 % $Id: encguide.tex 5713 2006-01-18 23:29:23Z robin $
  30 %
  31
  32
  33 \NeedsTeXFormat{LaTeX2e}[1995/12/01]
  34
  35 \documentclass{ltxguide}[1994/11/20]
  36
  37 \usepackage[T1]{fontenc}
  38 \IfFileExists{lmodern.sty}{\usepackage{lmodern}}{}
  39 \usepackage{textcomp}
  40 \usepackage{url}
  41 \usepackage{mflogo}
  42
  43 \addtolength\textheight{6\baselineskip}
  44 \addtolength\topmargin{-2\baselineskip}
  45
   46 % \ttverb{<token>}: print <token> literally (via \string) in typewriter type.
   47 \newcommand\ttverb[1]{\texttt{\string#1}}
   48
   49
   50 % \Enc{<name>}: markup for encoding names, set in typewriter type.
   51 \providecommand{\Enc}[1]{\texttt{#1}}
   52
   53 % \Pkg{<name>}: markup for package names, set in sans serif.
   54 \providecommand{\Pkg}[1]{%
   55   \textsf{#1}}
   56
   57 % \File{<name>}: markup for file names, set in typewriter type.
   58 \providecommand{\File}[1]{%
   59   \texttt{#1}}
   60
   61 % \meta{<text>}: a meta value, i.e. emphasized <text> between angle brackets.
   62 % (\providecommand: each of these is defined only if not already defined.)
   63 \providecommand{\meta}[1]{%
   64   \ensuremath{\langle}\emph{#1}\ensuremath{\rangle}}
  64
  65 \usepackage{tabularx}
  66
  67 % eine Umgebung zur Darstellung von Kodierungen
  68 %
  69 % Argumente:
  70 %  #1: Name in LaTeX (z.B. OT1)
  71 %  #2: Name der Kodierung (z.B. TeX text)
  72 %  #3: Name des Autors (z.B. Don Knuth)
  73 %  #4: Bereich der benützten Glyphindizes
  74 %  #5: variable Positionen
  75 %  #6: Beispielzeichensatz
  76 %  #7: Referenz
  77 %
  78 % XXX add code to handle more than a single font example (e.g., larm1000,
  79 % lbrm1000, and lcrm1000).
  80 %
  81 \newenvironment{encodinginfo}[7]%
  82   {\noindent
  83    \begin{tabularx}{\linewidth}{@{}l>{\raggedright\let\\\tabularnewline}X}%
  84      \LaTeX{} name:          & \texttt{#1}\\%
  85      Public name:          & #2\\%
  86      Author:                   & #3\\%
  87      Glyph slots used: & #4\\%
  88      Variable slots:     & #5\\%
  89      Font example:     & \def\@tempa{#6}\ifx\@tempa\@empty---%
  90                             \else\texttt{#6}\referenceftable{#6}\fi\\%
  91      Further reference:                & #7%
  92    \end{tabularx}%
  93    \par\nobreak
  94    \vspace*{3pt}%
  95    \quote
  96   }%
  97   {\endquote
  98    \vspace{6pt}}
  99
 100 \makeatletter
 101 \def\referenceftable#1{
 102   \@ifundefined{r@fonttable:#1}%
 103   \relax
 104   {;\space encoding table on page~\pageref{fonttable:#1}}%
 105 }
 106
 107 % font table macros mainly lifted from manmac.tex
 108 \def\oct#1{\hbox{\rm\'{}\kern-.2em\it#1\/\kern.05em}}
 109 \def\hex#1{\hbox{\rm\H{}\tt#1}}
  110 % \oddline{<hex digit>}: end the current row, draw a rule across 19 spanned
      % columns, and put the label \hex{<hex digit>x} (lowered by 2.3pt and
      % smashed) into the next column; interline glue is suppressed around it.
  111 \def\oddline#1{\cr\noalign{\nointerlineskip}
  112   \multispan{19}\hrulefill&
  113   \setbox0=\hbox{\lower 2.3pt\hbox{\hex{#1x}}}\smash{\box0}\cr
  114   \noalign{\nointerlineskip}}
      % \evenline: end the current row and draw a rule below it.
  115 \def\evenline{\cr\noalign{\hrule}}
      % \chartstrut: an empty box 14pt high, lowered by 4.5pt; it starts every
      % row through the first column template of \beginchart.
  116 \def\chartstrut{\lower4.5pt\vbox to14pt{}}
      % \beginchart{<font switch>}{<title>}: open a display, globally reset
      % \count@ to 0, execute <font switch>, and start an \halign of width
      % \hsize.  After the first column the template alternates a centred cell
      % with a \vrule cell.  The header row holds <title> and the octal column
      % digits 0-7.
  117 \def\beginchart#1#2{$$\global\count@=0 #1
  118   \halign to\hsize\bgroup
  119     \chartstrut##\tabskip0pt plus10pt&
  120     &\hfil##\hfil&\vrule##\cr
  121     \lower6.5pt\null
  122   &#2&&\oct0&&\oct1&&\oct2&&\oct3&&\oct4&&\oct5&&\oct6&&\oct7&\evenline}
      % \endchart: footer row with the hex column digits 8-F, then close the
      % \halign and the display opened by \beginchart.
  123 \def\endchart{\raise11.5pt\null&&&\hex 8&&\hex 9&&\hex A&&\hex B&
  124   &\hex C&&\hex D&&\hex E&&\hex F&\cr\egroup$$}
      % \: typesets the character whose code is \count@ in the current font
      % (between \noboundary's), recentres it with \reposition if it is higher
      % than 7.5pt or deeper than 2.5pt, and then globally advances \count@.
  125 \def\:{\setbox0=\hbox{\noboundary\char\count@\noboundary}%
  126   \ifdim\ht0>7.5pt\reposition
  127   \else\ifdim\dp0>2.5pt\reposition\fi\fi
  128   \box0\global\advance\count@ by1 }
      % \reposition: replace box 0 by itself, vertically centred (\vcenter)
      % with 2pt of extra space above and below.
  129 \def\reposition{\setbox0=\hbox{$\vcenter{\kern2pt\box0\kern2pt}$}}
  130 \def\normalchart{%
      % Body of a font chart: sixteen rows labelled 00x-17x (octal), each with
      % eight \: cells, i.e. 128 consecutive character codes counted by \count@.
      % Rows end alternately with \oddline<hex digit 0-7> and \evenline.
      % The closing \top is set by \ftable to \tophalf or \notophalf.
  131   &\oct{00x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline0
  132   &\oct{01x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  133   &\oct{02x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline1
  134   &\oct{03x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  135   &\oct{04x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline2
  136   &\oct{05x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  137   &\oct{06x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline3
  138   &\oct{07x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  139   &\oct{10x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline4
  140   &\oct{11x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  141   &\oct{12x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline5
  142   &\oct{13x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  143   &\oct{14x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline6
  144   &\oct{15x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  145   &\oct{16x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline7
  146   &\oct{17x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  147   \top}
  148 % \notophalf: empty continuation of \normalchart (no upper-half rows).
  149 \def\notophalf{}
      % \tophalf: continuation of \normalchart with sixteen more rows, labelled
      % 20x-37x (octal) and hex digits 8-F, built exactly like the rows of
      % \normalchart; \count@ simply keeps counting.
  150 \def\tophalf{%
  151 %\noalign{\vskip 5pt\hrule}
  152   &\oct{20x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline8
  153   &\oct{21x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  154   &\oct{22x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline9
  155   &\oct{23x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  156   &\oct{24x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline A
  157   &\oct{25x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  158   &\oct{26x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline B
  159   &\oct{27x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  160   &\oct{30x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline C
  161   &\oct{31x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  162   &\oct{32x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline D
  163   &\oct{33x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  164   &\oct{34x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline E
  165   &\oct{35x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
  166   &\oct{36x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline F
  167   &\oct{37x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline}
  168 % \ftable{<font>}{<encoding text>}: print the glyph chart of font <font>.
  169 \def\ftable#1#2{%
      % The font is loaded as \X in \batchmode, so that a missing font does not
      % stop the run; afterwards the mode is set to \errorstopmode.
  170      \batchmode
  171      \font\X=#1%
  172      \errorstopmode
      % If loading failed, \X equals \nullfont: only a warning is issued.
  173      \ifx\X\nullfont
  174        \@warning{Font #1 not found, table omitted}
  175      \else
      % Typeset the characters "80-"FF into box 0; a non-zero width means the
      % upper half is populated, so \top becomes \tophalf, else \notophalf.
      % NOTE(review): \top is \let here without a visible group, which
      % overwrites any earlier meaning of \top -- confirm that is intended.
  176        \count@="80
  177        \setbox0=\hbox{\X
  178         \loop\char\count@\advance\count@ by1 \ifnum\count@<"100
  179         \repeat}%
  180   \ifdim\wd0>0pt \let\top\tophalf\else\let\top\notophalf\fi
      % The chart title also carries \label{fonttable:<font>}, the label that
      % \referenceftable and the \pageref's in the text refer to.
  181      \beginchart\X{\hfill\llap{\textbf{#1, \large#2}\label{fonttable:#1}}}\normalchart
  182      \endchart\par\vfill
  183     \fi}
 184 \makeatother
 185
 186
 187 \setcounter{tocdepth}{3}
 188
 189 \title{\LaTeX{} font encodings}
 190
 191 \author{Frank Mittelbach \and Robin
 192    Fairbairns \and Werner Lemberg \and \LaTeX3 Project Team.}
 193
 194 \date{\copyright~Copyright 1995--2014 \\[5pt] 29 October 2014}
 195
 196 \begin{document}
 197
 198 \maketitle
 199
 200 \tableofcontents
 201
 202 \section{Introduction}
 203
 204 This document explains the ideas that underpin \LaTeX{} font
 205 encodings and the constraints that apply when defining a new encoding; it
 206 also lists the encodings that have already been defined.
 207
 208 \subsection{Encodings in \TeX{}}
 209
 210 \TeX{} (the program) implicitly recognises three sorts of encoding,
 211 and all are (in a sense) discussed in the \TeX{}book~\cite{A-W:DKn86}:
 212 \begin{itemize}
 213 \item[1.] The input encoding, which specifies the meanings of characters
 214   in files presented to \TeX{} for processing.  The \TeX{}book
 215   suggests that `your version of \TeX{} will recognise the characters
 216   you type on your keyboard' (\TeX{} the program has provision for
 217   static translations of input characters).
 218 \end{itemize}
 219 Such direct use of \TeX{}'s facilities is not the way modern
 220 \LaTeX{} (or indeed any other \TeX{} macro package) is likely to deal
 221 with input encodings.   This document does not address the topic of
 222 input encodings; the interested reader should examine the \LaTeX{}
 223 base package \Pkg{inputenc} \cite[sec.~7.5.2, p.~357]{A-W:MG2004}.
 224 \begin{itemize}
 225 \item[2.] The token stream that \TeX{} processes internally.  This stream
 226   of \TeX{}'s consciousness is discussed in great detail in the
 227   \TeX{}book.
 228 \end{itemize}
 229 Again, this document does not address the topic.  \LaTeX's internal
 230 character representation (\textsc{licr}) is well discussed in
 231 \cite[sec.~7.11.2, p.~442]{A-W:MG2004}.
 232 \begin{itemize}
 233 \item[3.] The font encoding---i.e., the mapping of character codes to
 234   glyphs in the fonts that are used to typeset \TeX{}'s output.
 235   Again, a set of font encodings is enumerated in the \TeX{}book, but
 236   that set has proved inadequate to the needs of modern multilingual
 237   use of \LaTeX.
 238 \end{itemize}
 239 This document explains \emph{why} Knuth's original set of encodings is
 240 inadequate to modern conditions, and discusses the issues that
 241 surround the design and definition of new font encodings.
 242
 243 Font encodings are important for more than their r\^{o}le in mapping the
 244 glyphs of the fonts to be used for typesetting: their glyph tables are
 245 also the context in which \TeX{}'s hyphenation algorithm operates.
 246 There are constraints imposed by \TeX{} that affect the way in which
 247 new font encodings, for use in a multi-lingual environment, may be
 248 structured (see section~\ref{sec:restrictions} for details).
 249
 250 \subsection{The history of \TeX{} font encodings}
 251
 252 Little attention was paid to font encodings prior to the arrival of
 253 \TeX{}\,3.  Up to that time, one used Donald Knuth's fonts (the
 254 Computer Modern family, using the encodings we now refer to as \Enc{OT1} and
 255 the \Enc{OM} series), or one was on one's own.
 256
 257 The Computer Modern text encoding raises problems in unmodified
 258 \TeX{}, because hyphenation cannot break words containing
 259 \verb"\accent" commands.  Even in those Western European languages for
 260 which the \Enc{OT1} encoding has symbols for the necessary
 261 \verb"\accent"-based diacritics, this shortcoming ruins typesetting of
 262 running text.
 263
 264 With the advent of \TeX{}\,3, with its ability to switch between
 265 hyphenation pattern sets, it was clear that the situation could not
 266 continue.  Thus a group at the TUG Annual General Meeting in Cork,
 267 Ireland, specified a uniform encoding for 256-glyph fonts, that
 268 contains accented letters and non-\textsc{ascii} letters necessary to
 269 express most Western European languages (and some Eastern European ones)
 270 without recourse to the \verb"\accent" command.
 271
 272 This ``Cork'' encoding has since been realised in a series of fonts
 273 designed with Metafont, in at least one font series that is available
 274 both in Adobe Type 1 format and in OpenType format, % viz., Latin Modern
 275 and in a number of virtual-font mappings of other font series.
 276
 277 Since the time of the Cork meeting, much effort has been devoted to
 278 the design of encodings for text fonts to use with \TeX{}, and the
 279 Cork encoding influenced the design of many such encodings.
 280
 281 Encodings for mathematical fonts have, in contrast, changed little
 282 since Knuth's contributions.  A TUG Technical Working Group was
 283 established at the Cork meeting, whose aim was to define a set of
 284 256-glyph encodings to regularise and extend Knuth's originals, using
 285 ideas from several other fonts that had appeared since, and from the
 286 known needs of researchers in mathematics and the mathematical sciences.
 287
 288 Independently, a first proposal (the so-called \emph{Aston proposal}) was worked
 289 out by Justin Ziegler together with Frank Mittelbach and other members of the
 290 \LaTeX3 project team~\cite{ziegler}. A first implementation of
  291 this proposal was realized by Matthias Clasen and Ulrik
 292 Vieth~\cite{clasen,clasen-vieth}.
 293
 294 However, the slow progress of these Mathematical encodings has been
 295 overtaken by the addition (in the last decade or so) of a large number
 296 of mathematical symbols to Unicode~\cite{beeton}; one can expect
 297 further changes so that new public mathematical font encodings will
 298 most likely be delayed still further.
 299
 300
 301
 302 \subsection{Further information}
 303
 304 For a general introduction to \LaTeX, including the new features of
 305 \LaTeXe, you should read \emph{\LaTeXbook},
 306 Leslie Lamport, Addison Wesley, 2nd~ed, 1994.
 307
 308 A more detailed description of the new features of \LaTeX, including an
 309 overview of more than 200 packages and nearly 1000 ready to run examples, is
 310 to be found in \emph{\LaTeXcomp{} second edition} by Frank Mittelbach and
 311 Michel Goossens~\cite{A-W:MG2004}.
 312
 313 The \LaTeX{} project sponsored a report on Mathematical % spelt out in full
 314 font encodings, which
 315 is worth reading for its insight into the problems of defining the way
 316 in which math is used: see~\cite{ziegler,clasen,clasen-vieth}.
 317
 318 The \LaTeX{} font selection scheme is based on \TeX, which is described
 319 by its developer in \emph{The \TeX book}, Donald E.~Knuth, Addison
 320 Wesley, 1986, revised in 1991 to include the features of \TeX~3.
 321
 322 For more information about \TeX{} and \LaTeX, please contact your local
 323 \TeX{} Users Group, or the international \TeX{} Users Group
 324 (\url{http://www.tug.org}).
 325
 326
 327
 328 \section{Existing font encodings}
 329
 330 This section lists the encodings currently assigned; for each
 331 encoding, we list the registered (\LaTeX{}) name, the assigned purpose
 332 of the encoding, and the author.  Further details may list the code
 333 positions used in the encoding, the \emph{variable slots} (see below),
 334 an example font (for which a listing will be provided later in the
 335 document if the relevant fonts are present), and a source for further
 336 reference.
 337
 338 While the characteristic feature of an encoding is that each font
 339 encoded according to the encoding should have the same glyph set,
 340 there are some encodings (notably \Enc{OT1} and its descendants) in
 341 which a few glyph code slots differ in their contents in different
 342 fonts.
 343
 344 \subsection{Naming conventions}
 345
 346 Names for encoding schemes are strings of up to three letters (all
 347 upper case) plus digits.
 348
 349 The \LaTeX3 project reserves the use of encoding names starting with the
 350 following letters: |T| (standard 256-long text encodings), |TS|
 351 (symbols that are designed to extend a corresponding |T| encoding),
 352 |X| (text encodings that do not conform to the strict requirements for
 353 |T| encodings), |M| (standard 256-long mathematical encodings), |S| (other
 354 symbol encodings), |A| (other special applications), |OT| (standard
 355 128-long text encodings), and |OM| (standard 128-long mathematical encodings).
 356
 357 Please do not use the above starting letters for non-portable
 358 encodings.  If new standard encodings emerge then we shall add them in
 359 a later release of \LaTeX.
 360
 361 Encoding schemes which are local to a site or a system should start
 362 with |L|, experimental encodings intended for wide distribution will
 363 start with |E|, whilst |U| is for Unknown or Unclassified encodings.
 364
 365 \begin{quote}
 366   \itshape We recommend that new encoding names should not be
 367   introduced unless careful consideration and discussion in the user
 368   community has confirmed the need for the encoding. If encodings have to
 369   change from font to font, a number of problems arise, so it is best to
 370   develop encodings that can be used with a large number of fonts in parallel.
 371   This allows documents to be typeset using different fonts without problems.
 372
 373   The \Enc{TS1} encoding is a good example of a \emph{bad} encoding (even
 374   though it was developed with the best intentions) as a huge number of fonts
 375   can only implement parts of it. Similarly, the fact that the few sets of
 376   available mathematical fonts (beside Computer Modern Math) nearly
 377   all implement slightly different encodings is a huge source of
 378   problems. Don't add to this if possible!
 379 \end{quote}
 380
 381
 382 \subsection{128$^+$ glyph encodings (text)}
 383
  384 The `OT' series of font encodings starts with Donald Knuth's original
 385 text encoding, that used for the text fonts in the earliest releases
 386 of \TeX{} itself.  The `O' of the encoding designator may be taken as
 387 signifying `original', or just `old'.
 388
 389 \begin{encodinginfo}{OT1}
 390         {\TeX{} text}
 391         {Donald Ervin Knuth}
 392         {0x00--0x7F}
 393         {0x0B--0x0F, 0x24, 0x3C, 0x3E, 0x5C, 0x7B--0x7D}
 394 % {0X--'177}
 395 % {'13--'17, '44, '74, '76, '134, '173--'175}
 396         {cmr10}
 397         {\cite[p.427]{A-W:DKn86}}
 398
 399   Donald Knuth designed his font encoding (and hence his fonts) in a
 400   very different environment from that which now pervades the \TeX{}
 401   world: his (mainframe) computer had very little memory, there was
  402   little experience in (or demand for) multilingual technical
 403   typesetting, and as a result it was appropriate to sacrifice
 404   uniformity for efficiency.
 405
 406   Thus Knuth's original fonts differ slightly in some encoded slots:
 407   for example, the glyphs \texttt{\string<}, \texttt{\string>},
 408   \verb=\=, \verb={=, and \verb=}= are only available in the
 409   typewriter fonts and the \textdollar{} and \textsterling{} signs
 410   share the same position (in different font shapes).
 411
 412   This means that direct selection of these slots can produce
 413   unpredictable results, e.g., typing \texttt{\string<} or
  414   \verb=\symbol{'74}= in a document can yield `\textexclamdown'.
 415 \end{encodinginfo}
 416
 417
 418 \begin{encodinginfo}{OT2}
 419         {UW cyrillic encoding}
 420         {University of Washington}
 421         {0x00--0x7F}
 422         {---}
 423         {wnr10}
 424         {\cite{Beeton:TB6-3-124}}
 425   Support for this encoding is available in the Cyrillic bundle although for
 426   all practical purposes it is better to use one of the \Enc{T2} encodings.
 427 \end{encodinginfo}
 428
 429
 430 \begin{encodinginfo}{OT3}
 431         {UW IPA encoding}
 432         {University of Washington}
  433         {0x00--0x7F}
 434         {---}
 435         {wsuipa10}
 436         {\cite[p.149]{CorkGW:91}}
 437   The \Enc{OT3} encoding was never really used with \LaTeXe{}
 438   following the introduction of the TIPA system which offers much
 439   better support for IPA. In particular, no \File{ot3enc.def}
 440   file was ever produced.
 441 \end{encodinginfo}
 442
 443
 444 \begin{encodinginfo}{OT4}
 445         {Polish text encoding}
 446         {B.~Jackowski and M.~Ry\'cko} %% ?  Marcin Woli\'nski
 447   {0x00--0x7F, 0x81, 0x82, 0x86, 0x8A, 0x8B, 0x91, 0x99, 0x9B, 0xA1,
 448    0xA2, 0xA6, 0xAA, 0xAB, 0xAE, 0xAF, 0xB1, 0xB9, 0xBB, 0xD3, 0xF3,
 449    0xFF}
 450   {0x0B--0x0F, 0x24, 0x3C, 0x3E, 0x5C, 0x7B--0x7D}
 451         {plr10}
 452         {---}
 453
 454    While Knuth included the means of typesetting the `lost L' (\L) in
 455   his \Enc{OT1} encoding, he omitted the ogonek (\,\,\k{}), a diacritic
 456   mark that is also needed in Polish text; hence the appearance, well
 457   before the \Enc{T1} encoding, of fonts using this encoding.
 458 \end{encodinginfo}
 459
 460 \begin{encodinginfo}{OT5}
 461         {Not currently allocated}
 462         {---}
 463         {---}
 464         {---}
 465         {}
 466         {---}
 467
 468 \end{encodinginfo}
 469
 470
 471
 472 \begin{encodinginfo}{OT6}
 473         {Armenian text encoding}
 474         {Serguei Dachian}
 475         {0x03--0x0F, 0x13--0x7F}
 476         {---}
 477         {artmr10}
 478         {---}
 479
 480   This encoding was allocated to permit use of Dachian's
 481   Armenian fonts in a standard \LaTeX{} environment.
 482
 483   Because of license issues the \texttt{artmr} fonts are not necessarily
 484   included in distributed \TeX{} installations (and for this reason the
 485   corresponding encoding table is not shown below). However, the fonts
 486   and the support macros can be found on the CTAN archives (look for
 487   \texttt{armtex}).
 488
 489 \end{encodinginfo}
 490
 491
 492
 493 \subsection{256 glyph encodings (text)}
 494
 495 \begin{encodinginfo}{T1}
 496         {Cork encoding}
 497         {Euro \TeX{} conference at Cork}
 498         {0x00--0xFF}
 499         {---}
 500         {ecrm1000}
 501         {\cite[p.514]{tub:MFe90}, \cite[p.99]{Knappen:TB17-2-96}}
 502
 503   The Cork encoding was developed so that advantage could be taken of
 504   the (then) new facilities of \TeX{}\,3, allowing hyphenation of
 505   most Western European (and some Eastern European) languages in an
 506   unmodified version of \TeX{}.
 507
 508   The encoding was developed in the absence of any extant effort at
 509   font design, but instances written in Metafont (the `EC' fonts), and
 510   more recently Adobe Type 1 instances of the same fonts have become
 511   available.
 512
 513   Substantial (but incomplete) instances have also been developed,
 514   which use virtual fonts.  These latter instances map either Knuth's
 515   original (OT1-encoded) fonts, or commercial fonts that contain the
 516   Adobe `standard' set of 224 glyphs.
 517 \end{encodinginfo}
 518
 519 \begin{encodinginfo}
 520   {T2A, T2B, T2C}
 521   {Cyrillic encodings}
 522   {The CyrTUG font team}
 523   {0x00--0xFF}
 524   {--- (within each encoding)}
 525   {larm1000}
 526   {\cite{Berdnikov:eurotex-98}}
 527
 528   There are too many glyphs in the full Cyrillic complement of
 529   languages for all of them to be covered by a single
 530   \LaTeX{}-compliant encoding (the lower half of each
 531   \Enc{T2}~encoding is identical to that of \Enc{T1}, in order that
 532   each should be a conforming \LaTeX{} encoding~--- see
 533   section~\ref{sec:restrictions}).  The approach taken is
  534   therefore to develop a single encoding, \Enc{X2} (see section~\ref{sec:extendedenc}),
  535   which contains all the glyphs needed for the full set of
  536   languages, and then to derive the three \LaTeX{}-compliant
 537   \Enc{T2}-family encodings using the \Enc{X2} set together with that of
 538   \Enc{T1}.
 539
 540 \end{encodinginfo}
 541
 542
 543
 544 \begin{encodinginfo}{T3}
 545         {IPA encoding}
 546         {FUKUI Rei, University of Tokyo}
 547         {0x00--0xFF}
 548         {---}
 549         {tipa10}
 550         {\cite[p.102]{Rei:TB17-2-102}}
 551
 552
 553     The \Enc{T3} encoding (and associated macros) provides the glyphs required
 554   in phonetic description according to current International Phonetic
 555   Association recommendations \cite{ipa}.
 556
 557   The \Enc{T3} encoding does \emph{not fulfil} the requirements for \Enc{T}
 558   encodings---the name is a historical accident. The correct name would be
 559   \Enc{X3}, but due to the fact that this font family has been used under its
 560   current encoding name for a long time, the name will not change for
 561   compatibility reasons.
 562
 563 \end{encodinginfo}
 564
 565
 566
 567 \begin{encodinginfo}{T4}
 568         {African Latin (fc)}              % public name
 569         {J\"org Knappen}              % author name
 570         {0x00--0xFF}              % range(s) of slots used for glyphs
 571         {0x24}         % range(s) of slots with variable glyphs if any
 572         {fcr10}              % name of an example font
 573         {\cite{tub:JKn93}}
 574
 575 The African Latin fonts contain in their lower half (0--127) the same
  576 characters as the European Latin (T1-encoded) fonts, while in their
 577 upper half (128--255) they
 578 contain letters and symbols for African languages that use extended
 579 Latin alphabets.
 580 Due to lack of space, J\"org had to play the unfortunate trick of
 581 assigning \verb=\textdollar= and \verb=\textsterling=
 582 the same position; users should take these characters
 583 from the text companion font, if they are needed.  Instead of defining
 584 a lot of new control sequences for the single letters, there are three
 585 accent-like control sequences with general purpose:
 586 \verb=\m= (Modified-1),
 587 \verb=\M= (Modified-2) and
 588 \verb=\B= (Barred).
 589 Most standard \LaTeX{} encoding-dependent commands
 590 work.  However, the Icelandic special letters are not available and `best
 591 replacements' for \verb=\Th=, \verb=\th=, and \verb=\dh=
 592 are used (barred T and d resp.).
 593 \end{encodinginfo}
 594
 595
 596 \begin{encodinginfo}{T5}
 597         {Vietnamese encoding}
 598         {Werner Lemberg and
 599          Vladimir Volovich}
 600         {0x00--0xFF}
 601         {---}
 602         {vnr10}
 603         {\cite{vnr}}
 604
 605   The \Enc{T5} encoding was developed for Vietnamese. Again, this encoding
 606   \emph{does not} conform to the requirements for a \Enc{T}-encoding
  607   because its large number of accented letters prevents the \verb=\lccode= and
  608   \verb=\uccode= mapping requirements for \Enc{T} encodings from being
  609   fulfilled.  However, since the Vietnamese language does not
  610   use word division in typesetting, this requirement is
  611   actually not important for this particular language.
 612   Since every glyph used in Vietnamese text is internally
 613   represented as \textsc{licr} macros, the commands  \verb=\MakeUppercase= and
 614   \verb=\MakeLowercase= still work as expected (as they change the case of the
 615   \textsc{ascii} characters in \textsc{licr} definitions).
 616
 617 \end{encodinginfo}
 618
 619 \begin{encodinginfo}
 620   {T6}
 621   {Armenian}
 622   {---}
 623   {---}
 624   {---}
 625   {}
 626   {---}
 627
 628     This encoding is reserved to permit future expansion of Armenian
 629   \TeX{} to use 256-character (hyphenatable) fonts.
 630 \end{encodinginfo}
 631
 632 \begin{encodinginfo}{T7}
 633         {Greek encoding}
 634    {---}
 635    {---}
 636    {---}
 637    {}
 638    {---}
 639
  640 The name is already reserved for a 256 glyph Greek encoding. The encoding
 641 itself hasn't been defined so far.
 642
 643 \end{encodinginfo}
 644
 645
 646
 647 \subsection{256$^-$ glyph encodings (text symbols)}
 648
 649 \begin{encodinginfo}{TS1}
 650         {Text Companion encoding (Cork)}
 651         {J\"org Knappen}
 652   {0x00--0x0D, 0x12, 0x15, 0x16, 0x18--0x1D, 0x20, 0x24, 0x27, 0x2A,
 653    0x2C--0x3A, 0x3C--0x3E, 0x4D, 0x4F, 0x57, 0x5B, 0x5D--0x60,
 654    0x62--0x64, 0x6C--0x6E, 0x7E--0xBF, 0xD6, 0xF6}
 655   {---}
 656         {tcrm1000}
 657         {\cite{Knappen:TB17-2-96}}
 658
 659    The text symbol encoding offers access to symbolic glyphs that are
 660   commonly used in text (for a variety of reasons), and whose style
 661   should vary with the text that surrounds them.
 662
 663   Unfortunately, the \Enc{TS1} encoding was developed without
 664   reference to the glyphs available in existing commercial fonts.
 665   As a result, only font families
 666   explicitly developed for \TeX{} (i.e., typically originating with
 667   \MF{}) actually contain all glyphs required by the \Enc{TS1}
 668   encoding.  Most other font families (whether free or commercial)
 669   often only provide half of the set%
 670 %%
 671 %% don't show the comment if the tables are not generated
 672 %%
 673 \expandafter\ifx\csname r@fonttable:tcrm1000\endcsname\relax
 674 \else
 675   \expandafter\ifx\csname r@fonttable:ptmr8c\endcsname\relax
 676   \else
 677     \space (compare the two tables for \Enc{TS1} on
 678      pages~\pageref{fonttable:tcrm1000}
 679      and~\pageref{fonttable:ptmr8c})%
 680   \fi
 681 \fi.
 682   To improve this situation somewhat, NFSS provides a way to define encoding
 683   subsets on a per family basis in the \Pkg{textcomp} package (which
 684   package offers support for the \Enc{TS1} encoding).
 685 \end{encodinginfo}
 686
 687
 688 \begin{encodinginfo}{TS3}
 689         {IPA symbol encoding}
 690         {FUKUI Rei, University of Tokyo}
 691         {0x00--0x0A, 0x20--0x49, 0x50--0x56, 0x70--0x7B}
 692         {---}
 693         {tipx10}
 694         {\cite{Rei:TB17-2-102}}
 695
 696   The \Enc{TS3} encoding (together with the \Enc{T3} encoding) provides the
 697   glyphs for typesetting phonetic transcriptions following the
 698   guidelines of the International Phonetic Association \cite{ipa}.  Support
 699   is offered through the \Pkg{tipa} package.
 700 \end{encodinginfo}
 701
 702
 703
 704
 705 \subsection{256 glyph encodings (text extended)}
 706 \label{sec:extendedenc}
 707
 708 \begin{encodinginfo}
 709   {X2}
 710   {Cyrillic glyph container}
 711   {The CyrTUG font team}
 712   {0x00--0xFF}
 713   {---}
 714   {rxrm1000}
 715   {\cite{Berdnikov:eurotex-98}}
 716
 717   This encoding specifies the glyph container for Cyrillic characters,
 718   which is used in specifying the \Enc{T2A}, \Enc{T2B} and \Enc{T2C} encodings.
 719 \end{encodinginfo}
 720
 721
 722
 723
 724 \subsection{128$^+$ glyph encodings (mathematics)}
 725
 726
 727 \begin{encodinginfo}{OML}
 728         {\TeX{} math italic}
 729         {Donald Ervin Knuth}
 730         {0x00--0x7F}
 731         {---}
 732         {cmmi10}
 733         {\cite[p.430]{A-W:DKn86}}
 734
 735   The \Enc{OML} encoding contains italic Latin and Greek letters for
 736   use in mathematical formulas (typically used for variables) together
 737   with some symbols.
 738
 739 \end{encodinginfo}
 740
 741 \begin{encodinginfo}{OMS}
 742         {\TeX{} math symbol}
 743         {Donald Ervin Knuth}
 744         {0x00--0x7F}
 745         {---}
 746         {cmsy10}
 747         {\cite[p.431]{A-W:DKn86}}
 748
 749   The  \Enc{OMS} encoding contains basic mathematical symbols,
 750   together with an uppercase ``calligraphic'' Latin alphabet.
 751 \end{encodinginfo}
 752
 753
 754 \begin{encodinginfo}{OMX}
 755         {\TeX{} math extension}
 756         {Donald Ervin Knuth}
 757         {0x00--0x7F}
 758         {---}
 759         {cmex10}
 760         {\cite[p.432]{A-W:DKn86}}
 761
  762   \Enc{OMX} encodes mathematical symbols with variable sizes, such as
 763   the $\sum$ sign, which changes its size if used in displayed
 764   formulas, and the construction parts for
 765   brackets, braces and radicals, etc., which can stretch to accommodate
 766   the thing they're enclosing.
 767
 768 \end{encodinginfo}
 769
 770
 771
 772
 773 \subsection{256 glyph encodings (mathematics)}
 774
 775 So far there are no 256 glyph mathematical encodings. A proposal is
 776 given in \cite{ziegler}.
 777
 778
 779 \subsection{Other encodings}
 780
 781 \begin{encodinginfo}
 782   {C..}
 783   {CJK encodings}
 784   {Werner Lemberg}
 785   {0x00--0xFF}
 786   {---}
 787   {} % no font, of course
 788   {\cite{CJK}}
 789
 790   The \Pkg{CJK} package defines a number of encodings which access Chinese,
 791   Japanese and Korean fonts.
 792
 793 \end{encodinginfo}
 794
 795 \begin{encodinginfo}
 796   {E..}
 797   {Experimental encodings}
 798   {---}
 799   {0x00--0xFF}
 800   {all}
 801   {}
 802   {\cite[p.416]{A-W:MG2004}}
 803
 804   As the name indicates, encodings starting with the letter \Enc{E} are
 805   intended for experimental encodings, that are still likely to change.
 806 \end{encodinginfo}
 807
 808 \begin{encodinginfo}{L..}
 809         {Local encoding (site dependent)}
 810         {---}
 811         {0x00--0xFF}
 812         {all}
 813         {}
 814         {\cite[p.416]{A-W:MG2004}}
 815
 816         `Local' encodings provide the means to develop representation
 817         techniques that are suited to a particular \TeX{} environment.  While
 818         the developer has freedom to specify their encoding as he or she
 819         pleases, there is a strong incentive to obey the \LaTeX{} rules for
 820         encodings, since it will otherwise be difficult to compose text using
 821         the encoding.
 822
 823         At least it was the intention that \Enc{L..} encodings are local and
 824         site dependent. However, a number of such encodings became generally
 825         used without ever getting a different name allocated.
 826
 827 \end{encodinginfo}
 828
 829
 830
 831 \begin{encodinginfo}{LY1}
 832         {Y\&Y 256 glyph encoding}
 833         {Berthold Horn}
 834         {0x00--0x08, 0x0C, 0x10, 0x12--0xFF}
 835         {\emph{believed none}}
 836         {ptmr8y}
 837         {\cite[p.416]{A-W:MG2004}}
 838
 839         This is an alternative to the \Enc{T1} encoding developed by Y\&Y and
 840         used in their commercial \TeX{} implementation.
 841
 842 \end{encodinginfo}
 843
 844
 845 \begin{encodinginfo}{LV1}
 846         {MicroPress encoding}
 847         {Michael Vulis}
 848         {\emph{unknown}}
 849         {\emph{unknown}}
 850         {}
 851         {\cite[p.416]{A-W:MG2004}}
 852
 853         This is an encoding developed by MicroPress and used for some of their
 854         fonts.
 855
 856 \end{encodinginfo}
 857
 858
 859 \begin{encodinginfo}{LGR}
 860         {Greek 256 glyph encoding}
 861         {\emph{unknown}}
 862         {0x00--0xFF}
 863         {\emph{believed none}}
 864         {grmn1000}
 865         {\cite[p.575]{A-W:MG2004}}
 866
 867         Currently the main encoding in use for the Greek language.
 868
 869         This encoding doesn't conform to the restrictions for
 870         \Enc{T}-encodings described in section~\ref{sec:restrictions} on
 871         page~\pageref{sec:restrictions} as it doesn't have \textsc{ascii}
 872         glyphs at all.
 873
 874 \end{encodinginfo}
 875
 876
 877 \begin{encodinginfo}
 878   {PD1}
 879   {PDF DocEncoding}
 880   {Adobe}
 881   {0x08--0x0A, 0x0C, 0x0D, 0x18--0x7E, 0x80--0x9E, 0xA0--0xAE, 0xB0--0xFF}
 882   {---}
 883   {}
 884   {\cite{Adobe:PDF-1.6}, \cite{hyperref}}
 885
 886   The \Enc{PD1} encoding is a virtual encoding with 256 glyphs needed to
 887   produce bookmarks and similar text in PDF documents generated with pdf\LaTeX.
 888   The encoding is ``virtual'' because by design there are no \TeX{}
 889   fonts that cover \Enc{PD1}. Details can be found in appendix D.1
 890   of~\cite{Adobe:PDF-1.6}.
 891 \end{encodinginfo}
 892
 893 \begin{encodinginfo}
 894   {PU}
 895   {PDF Unicode Encoding}
 896   {Adobe}
 897   {---}
 898   {---}
 899   {}
 900   {\cite{Adobe:PDF-1.6}, \cite{hyperref}}
 901
 902   Another virtual encoding (with more than 600 characters) for
 903   Unicode-encoded bookmarks in PDF documents.
 904 \end{encodinginfo}
 905
 906 \begin{encodinginfo}{U}
 907         {Unknown encoding}
 908         {---}
 909         {potentially 0x00--0xFF}
 910         {all}
 911         {wasy10}
 912         {\cite[p.416]{A-W:MG2004}}
 913
 914   This encoding should be used for fonts that resist classification,
 915   e.g., when it is clear that there will never be more than one font
 916   using the same encoding.
 917
 918 \end{encodinginfo}
 919
 920
 921
 922 \section{Restrictions}
 923 \label{sec:restrictions}
 924
 925
 926 \subsection{Required glyphs for general text encodings}
 927
 928 Encodings that are supposed to be used with \LaTeX{} for `general
 929 purpose text fonts' need to have certain fixed glyphs in certain
 930 encoding slots.  A `general purpose text font' is one intended for
 931 arbitrary natural language text and not just within special
 932 environments (such as the phonetic alphabet) or just for typesetting
 933 individual symbols (e.g., the text companion font with encoding
 934 \Enc{TS1}).
 935
 936 This is the case for the following glyphs that have to be in their
 937 \textsc{ascii} positions for general purpose text encodings:
 938 \begin{center}
 939 \begin{tabular}[t]{cc}
 940   Glyph & Position \\ \hline
 941   !     & \number`\!    \\
 942   '     & \number`\'    \\
 943   (     & \number`\(    \\
 944   )     & \number`\)    \\
 945   \relax*       & \number`\*    \\
 946   +     & \number`\+    \\
 947   ,     & \number`\,    \\
 948   -     & \number`\-    \\
 949   .     & \number`\.    \\
 950   /     & \number`\/    \\
 951   0 \ldots\ 9   & \number`\0\ to \number`\9     \\
 952   \end{tabular}
 953   \quad
 954   \begin{tabular}[t]{cc}
 955   Glyph & Position \\ \hline
 956   :     & \number`\:    \\
 957   ;     & \number`\;    \\
 958   =     & \number`\=    \\
 959   ?     & \number`\?    \\
 960   @     & \number`\@    \\
 961   A \ldots\ Z   & \number`\A\ to \number`\Z     \\
 962   \relax[       & \number`\[    \\
 963   ]     & \number`\]    \\
 964   `     & \number`\`    \\
 965   a \ldots\ z   & \number`\a\ to \number`\z     \\
 966 \end{tabular}
 967 \quad
 968 \begin{tabular}[t]{cc}
 969 Glyph\footnotemark      & Position \\ \hline
 970 <       & \number`\<    \\
 971 >       & \number`\>    \\
 972 \string|        & \number`\|    \\
 973 \end{tabular}\footnotetext{The requirement for these three glyphs is
 974   violated in the Latin alphabet \Enc{OT} encodings.}
 975 \end{center}
 976 In addition the following glyphs have to be present
 977 somewhere\footnote{The position in this case is not important as they
 978 are generated from ligature programs.} in the encoding together with
 979 corresponding ligature programs to generate them:
 980 \begin{center}
 981 \begin{tabular}[t]{cc}
 982 Glyph   & Ligature program \\ \hline
 983  ``     & \texttt{`\/`} \\
 984  ''     & \texttt{'\/'} \\
 985  --     & \texttt{-\/-} \\
 986  ---    & \texttt{-\/-\/-} \\
 987 \end{tabular}
 988 \end{center}
 989
 990 This is $33 + 2 * 26 = 85$ positions ``required'', which leaves 171
 991 positions free.
 992
 993 If there are free slots available then adding all or some of the
 994 diacritics would be the best way to fill them.
 995
 996 If there are insufficient slots for the characters needed, a possible
 997 technique is to create a subsidiary encoding, and to move non-letter
 998 characters to it.  Since only ``letters'' take part in the hyphenation
 999 algorithm, this technique doesn't affect the appearance of the typeset
1000 result.
1001
1002 \subsection{The constraints on upper/lower case tables}
1003
1004 Due to some technical restrictions of \TeX{} related to hyphenation it
1005 is not possible in \LaTeX{} to use more than one \verb=\lccode= or
1006 \verb=\uccode= table. Therefore all encodings need to share these two
1007 tables which are defined to be those of the \Enc{T1} encoding.
1008
1009 The \Enc{T1} encoding has some nasty peculiarities which make certain slot
1010 positions more or less unusable for other encodings if this
1011 restriction is to be obeyed. This is unfortunate but since \Enc{T1} is well
1012 established and the basis for a large number of languages it seemed
1013 better to live with this situation instead of trying to replace \Enc{T1} with a
1014 slightly better standard (with the result that for a long time
1015 different \LaTeX{} installations would not be able to communicate with
1016 each other because of incompatible font sets).
1017
1018 The positions that are problematic are as follows.
1019 \begin{center}
1020 \begin{tabular}{lp{.8\linewidth}}
1021 25 (\char 25) & uppercase maps strangely (same as for 105, \char 105)\\
1022 26 (\char 26) & uppercase maps strangely (same as for 106, \char 106)\\
1023 27 (\char 27) & lowercase maps to itself which makes this slot subject
1024                 to hyphenation (used to support \Enc{OT1} encoding) \\
1025 157 (\char 157) & lowercase maps strangely (same as for 73, \char 73) \\
1026 158 (\char 158) & uppercase maps strangely (same as for 240, \char 240) \\
1027 \end{tabular}
1028 \end{center}
1029 One way to use such slots is to fill them with ligature glyphs as
1030 \TeX{} will not consult these tables for glyphs constructed through
1031 ligature programs but instead uses the entries for the individual
1032 glyphs used to produce the ligature.
1033
1034 A complete listing of the uppercase/lowercase mapping tables is to be
1035 found in section~\ref{sec:uclc-tab} (page \pageref{sec:uclc-tab}).
1036
1037 \newcount\temp \newcount\tempL \newcount\tempU % count registers: slot being tabulated (\temp) and its \lccode (\tempL) / \uccode (\tempU)
1038
1039 \def\nextstep{\global\tempL=\lccode\temp % one row of the lc/uc table for slot \temp, then back to \stepprint
1040               \global\tempU=\uccode\temp % \tempL and \tempU are set \global-ly, like the \advance of \temp below
1041               \lctablenumbersize\the\temp &
1042               \the\tempL&
1043               \the\tempU&\printlowerupper{\the\temp}{\the\tempL}{\the\tempU}\\
1044                \global\advance\temp by 1
1045                \stepprint}
1046
1047 \def\printlowerupper#1#2#3{\char#1\relax % glyph in slot #1, followed by "(lc/uc)" built from codes #2 and #3
1048    (\ifnum#2=0\relax--\else\char#2\fi % a code of 0 is printed as "--" instead of a glyph
1049    /\ifnum#3=0\relax--\else\char#3\fi)}
1050
1051 \def\stepprint{\relax\ifnum\temp<\endval % loop test: is slot \temp still below the end value \endval?
1052                     \let\next=\nextstep % yes: typeset another row
1053                \else
1054                      \let\next=\relax % no: stop the loop
1055                \fi % the action chosen above only runs once the conditional is closed
1056                \next}
1057
1058 \def\dolctable#1#2{{\temp=#1\relax % tabulates slots #1 up to, but not including, #2
1059 \def\endval{#2}% end value tested by \stepprint
1060 \setlength\tabcolsep{1.5pt}% tighter columns; the inner brace group confines this setting and \endval
1061 \begin{tabular}[t]{@{}cccc@{}}
1062 pos&lc&uc&glyphs\\\hline
1063 \stepprint % the rows come from the \stepprint / \nextstep loop
1064 \end{tabular}}}
1065
1066 \iffalse
1067 \begin{center}
1068 \tiny\let\lctablenumbersize\tiny
1069 \mbox{\dolctable{0}{52}\vrule
1070 \dolctable{52}{104}\vrule
1071 \dolctable{104}{156}\vrule
1072 \dolctable{156}{208}\vrule
1073 \dolctable{208}{256}}
1074 \end{center}
1075 \fi
1076
1077 \iffalse
1078 \begin{center}\tiny
1079 \mbox{\dolctable{0}{65}\vrule
1080 \dolctable{65}{128}\vrule
1081 \dolctable{128}{193}\vrule
1082 \dolctable{193}{256}}
1083 \end{center}
1084 \fi
1085
1086
1087
1088 \section{Encoding specific commands}
1089
1090 An encoding specific command is one that generates a glyph (or
1091 glyphs), to produce a graphic effect that may be implemented
1092 differently in different encodings.  The encoding specific command
1093 automatically changes its implementation when the encoding changes in
1094 the course of the document.  Encoding specific commands figure in
1095 \LaTeX's internal character representation (\textsc{licr}) and are also
1096 discussed in \cite[sec.~7.11.2, p.~442]{A-W:MG2004}.
1097
1098 The following table only covers the encoding specific commands from
1099 the \Enc{OT1} and \Enc{T1} encodings. Other encodings may specify
1100 additional encoding specific commands.  In the table, the first 16
1101 commands are `accent-like' and need as an argument the character to be
1102 accented.  For example, |\v{c}| is the \textsc{licr} for `\v{c}'.
1103
1104 \begin{tabbing}
1105 \ttverb\textvisiblespace\quad\=bbbbbbbbbbbbbb\=b'b'\=ccccccccccc\kill
1106 \ttverb\`{}               \>OT1,T1\>   \a`{}\> (grave)      \\
1107 \ttverb\'{}               \>OT1,T1\>   \a'{}\> (acute)      \\
1108 \ttverb\^{}               \>OT1,T1\>   \^{}\>  (circumflex) \\
1109 \ttverb\~{}               \>OT1,T1\>   \~{}\>  (tilde)      \\
1110 \ttverb\"{}               \>OT1,T1\>   \"{}\>  (umlaut)     \\
1111 \ttverb\H{}               \>OT1,T1\>   \H{}\>  (Hungarian umlaut) \\
1112 \ttverb\r{}               \>OT1,T1\>   \r{}\>  (ring)       \\
1113 \ttverb\v{}               \>OT1,T1\>   \v{}\>  (ha\v{c}ek)  \\
1114 \ttverb\u{}               \>OT1,T1\>   \u{}\>  (breve)      \\
1115 \ttverb\t{}               \>OT1,T1\>   \t{}\>  (tie)        \\
1116 \ttverb\={}               \>OT1,T1\>   \a={}\> (macron)     \\
1117 \ttverb\.{}               \>OT1,T1\>   \.{}\>  (dot)        \\
1118 \ttverb\b{}               \>OT1,T1\>   \b{}\>  (underbar)   \\
1119 \ttverb\c{}               \>OT1,T1\>   \c{}\>  (cedilla)    \\
1120 \ttverb\d{}               \>OT1,T1\>   \d{}\>  (dot under)  \\
1121 \ttverb\k{}               \>T1    \>   \k{}\>  (ogonek)     \\
1122 % \ttverb\AA              \>OT1,T1\>   \AA \>               \\ % no longer
1123 \ttverb\AE                \>OT1,T1\>   \AE \>               \\
1124 \ttverb\DH                \>T1    \>   \DH \>               \\
1125 \ttverb\DJ                \>T1    \>   \DJ \>               \\
1126 \ttverb\L                 \>OT1,T1\>   \L  \>               \\
1127 \ttverb\NG                \>T1    \>   \NG \>               \\
1128 \ttverb\OE                \>OT1,T1\>   \OE \>               \\
1129 \ttverb\O                 \>OT1,T1\>   \O  \>               \\
1130 \ttverb\SS                \>OT1,T1\>   \SS \>               \\
1131 \ttverb\TH                \>T1    \>   \TH \>               \\
1132 % \ttverb\aa              \>OT1,T1\>   \aa \>               \\ no-longer
1133 \ttverb\ae                \>OT1,T1\>   \ae \>               \\
1134 \ttverb\dh                \>T1    \>   \dh \>               \\
1135 \ttverb\dj                \>T1    \>   \dj \>               \\
1136 \ttverb\guillemotleft     \>T1    \>   \guillemotleft  \> (guillemet) \\
1137 \ttverb\guillemotright    \>T1    \>   \guillemotright \> (guillemet) \\
1138 \ttverb\guilsinglleft     \>T1    \>   \guilsinglleft  \> (guillemet) \\
1139 \ttverb\guilsinglright    \>T1    \>   \guilsinglright \> (guillemet) \\
1140 \ttverb\i                 \>OT1,T1\>   \i  \>               \\
1141 \ttverb\j                 \>OT1,T1\>   \j  \>               \\
1142 \ttverb\l                 \>OT1,T1\>   \l  \>               \\
1143 \ttverb\ng                \>T1    \>   \ng \>               \\
1144 \ttverb\oe                \>OT1,T1\>   \oe \>               \\
1145 \ttverb\o                 \>OT1,T1\>   \o  \>               \\
1146 \ttverb\quotedblbase      \>T1    \>   \quotedblbase   \>   \\
1147 \ttverb\quotesinglbase    \>T1    \>   \quotesinglbase \>   \\
1148 \ttverb\ss                \>OT1,T1\>   \ss \>               \\
1149 \ttverb\textasciicircum   \>OT1,T1\>   \textasciicircum \>  \\
1150 \ttverb\textasciitilde    \>OT1,T1\>   \textasciitilde  \>  \\
1151 \ttverb\textbackslash     \>OT1,T1\>   \textbackslash   \>  \\
1152 \ttverb\textbar           \>OT1,T1\>   \textbar         \>  \\
1153 \ttverb\textbraceleft     \>OT1,T1\>   \textbraceleft   \>  \\
1154 \ttverb\textbraceright    \>OT1,T1\>   \textbraceright  \>  \\
1155 \ttverb\textcompwordmark  \>OT1,T1\>   \textcompwordmark\> (invisible) \\
1156 \ttverb\textdollar        \>OT1,T1\>   \textdollar      \>  \\
1157 \ttverb\textemdash        \>OT1,T1\>   \textemdash      \>  \\
1158 \ttverb\textendash        \>OT1,T1\>   \textendash      \>  \\
1159 \ttverb\textexclamdown    \>OT1,T1\>   \textexclamdown  \>  \\
1160 \ttverb\textgreater       \>OT1,T1\>   \textgreater     \>  \\
1161 \ttverb\textless          \>OT1,T1\>   \textless        \>  \\
1162 \ttverb\textquestiondown  \>OT1,T1\>   \textquestiondown\>  \\
1163 \ttverb\textquotedbl      \>T1    \>   \textquotedbl    \>  \\
1164 \ttverb\textquotedblleft  \>OT1,T1\>   \textquotedblleft\>  \\
1165 \ttverb\textquotedblright \>OT1,T1\>   \textquotedblright\> \\
1166 \ttverb\textquoteleft     \>OT1,T1\>   \textquoteleft   \>  \\
1167 \ttverb\textquoteright    \>OT1,T1\>   \textquoteright  \>  \\
1168 \ttverb\textregistered    \>OT1,T1\>   \textregistered  \>  \\
1169 \ttverb\textsection       \>OT1,T1\>   \textsection     \>  \\
1170 \ttverb\textsterling      \>OT1,T1\>   \textsterling    \>  \\
1171 \ttverb\texttrademark     \>OT1,T1\>   \texttrademark   \>  \\
1172 \ttverb\textunderscore    \>OT1,T1\>   \textunderscore  \>  \\
1173 \ttverb\textvisiblespace  \>OT1,T1\>   \textvisiblespace\>  \\
1174 \ttverb\th                \>T1    \>   \th              \>
1175 \end{tabbing}
1176
1177 \section{Encodings for Unicode based \TeX\ systems}
1178 \label{sec:unicode}
1179
1180 The preceding text has assumed a classic \TeX\ system that is
1181 restricted to the use of fonts with at most 256 characters. In order
1182 to accommodate all the characters needed for different languages and
1183 mathematics it is necessary to have multiple encodings as described
1184 above, and \LaTeX\ needs to be aware of the encoding used for each
1185 font.
1186
1187 Unicode aims to provide a single encoding that removes most of the
1188 need to switch encodings, apart from very specialist use for non
1189 standard characters. Rather than assign codes in the range 0--255 (hex
1190 FF) Unicode codes are in the range 0--1,114,111 (hex 10FFFF), although
1191 not all slots are available for distinct characters for technical
1192 reasons. Unicode offers the possibility to use a single input encoding
1193 (usually UTF-8) for all documents and to use essentially the same
1194 Unicode encoding for all fonts, so removing the need to switch
1195 encodings in different contexts.
1196
1197 Omega was perhaps the first widely used \TeX\ extension that
1198 supported Unicode. Currently the two actively supported systems that are
1199 present in most modern \TeX\ distributions are xe\TeX\ and lua\TeX.
1200
1201 When used with these extended \TeX\ engines, \LaTeX's font system can
1202 refer to Unicode fonts (typically OpenType fonts installed system wide
1203 on your operating system rather than fonts specifically encoded for
1204 \TeX). Currently the usual method of accessing these fonts is through
1205 the contributed \Pkg{fontspec} package. This uses the two
1206 \emph{Experimental} encodings \Enc{EU1} (on xe\TeX) and \Enc{EU2} (on
1207 lua\TeX). Technically these two are the same encoding in terms of
1208 allocating characters to numbered positions, but two encodings have
1209 been specified due to some internal differences in font handling in
1210 the two extended \TeX\ engines. The exact rules for \LaTeX\ encodings
1211 for Unicode engines have not yet been finalised, however it is
1212 possible that a single unified format can be used and so a single
1213 standardised name such as \Enc{UC} may be used. However at the present
1214 time \Enc{EU1} and \Enc{EU2} should be used, although it is rare to
1215 need to specify these explicitly in a document as the \Pkg{fontspec}
1216 package sets up the correct encoding based on the engine in use.
1217
1218 The restrictions described in section~\ref{sec:restrictions} do not
1219 apply, or need to be modified in a Unicode based engine. Clearly the
1220 lowercase table (and hyphenation patterns) can not be restricted to
1221 the values used for \Enc{T1} and do only refer to the first 256
1222 characters.
1223
1224 When the \LaTeX\ format is made, \LaTeX\ sets up the lowercase table
1225 and classifies characters as letter or non letter based on \Enc{T1} if
1226 a classic \TeX\ or pdf\TeX\ is being used. If a Unicode based \TeX\ is
1227 detected, the values are instead based on the classification and
1228 lower-case mappings provided by the Unicode Character Database
1229 \cite{ucd}. The relevant parts of these tables are converted to \TeX\
1230 syntax as \File{ltunicode.ltx} as part of the \LaTeX\ distribution.
1231
1232 Similarly, in the default configuration files used by modern \TeX\
1233 distributions, the hyphenation files for each supported language are
1234 written in UTF-8 encoding, using Unicode code points for all letters;
1235 then, if a classic \TeX\ system is detected, some additional macros are
1236 loaded to convert these files to 256-character encodings where
1237 possible, and assuming the \Enc{T1} lowercase table.
1238
1239
1240
1241
1242 \begin{thebibliography}{99}
1243 \addcontentsline{toc}{section}{\numberline{\relax}\refname}
1244
1245
1246 \bibitem{Adobe:PDF-1.6} \emph{\textsc{PDF} reference}:
1247     Adobe portable document format version~1.6.  Adobe Systems
1248     Incorporated, 2005. % why \textsuperscript{3}?
1249   \url{http://partners.adobe.com/public/developer/en/pdf/PDFReference16.pdf}.
1250
1251 \bibitem{Beeton:TB6-3-124} Barbara Beeton:
1252   \emph{Mathematical symbols and cyrillic fonts ready for
1253       distribution}.  In: TUGBoat, 6\#3, 1985.
1254   \url{http://tug.org/TUGboat/Articles/tb06-3/tb13beetcyr.pdf}.
1255
1256 \bibitem{beeton} Barbara Beeton: \emph{Unicode
1257       and math, a combination whose time has come -- Finally!}.  In:
1258   TUGBoat, 21\#3, 2000.
1259   \url{http://www.tug.org/TUGboat/Articles/tb21-3/tb68beet.pdf}.
1260
1261
1262 \bibitem{Berdnikov:eurotex-98} A.\@ Berdnikov, O.\@
1263   Lapko, M.\@ Kolodin, A.\@ Janishevsky and
1264   A.\@ Burykin: \emph{The Encoding Paradigm in
1265       \LaTeXe{} and the Projected X2 Encoding for Cyrillic Texts}.
1266   Euro\TeX~98.
1267   \url{http://www.gutenberg.eu.org/pub/GUTenberg/publicationsPDF/28-29-berdnikova.pdf}.
1268
1269 \bibitem{CJK} \emph{The \Pkg{CJK} package}:
1270   \url{http://cjk.ffii.org}.
1271
1272 \bibitem{clasen} Matthias Clasen: \emph{A new
1273       implementation of \LaTeX{} math}, 1997-98.
1274   \url{http://www.tug.org/twg/mfg/papers/current/newmath.ps.gz}.
1275
1276 \bibitem{clasen-vieth} Matthias Clasen and Ulrik
1277   Vieth: \emph{Towards a new Math Font Encoding
1278       for (La)\TeX}.  March 1998,
1279   \url{http://www.tug.org/twg/mfg/papers/current/mfg-euro-all.ps.gz}.
1280
1281 \bibitem{CorkGW:91}
1282 Dean Guenther and Janene Winter.
1283 \newblock An international phonetic alphabet.
1284 \newblock In Guenther \cite{proc:MGu91}, pages 149--156.
1285 \newblock Published as {TUG}boat 12\#1.
1286
1287 \bibitem{proc:MGu91}
1288 Mary Guenther, editor.
1289 \newblock {\em {\TeX} 90 Conference Proceedings}, March 1991.
1290 \newblock Published as {TUG}boat 12\#1.
1291
1292 \bibitem{tub:MFe90}
1293 Michael~J. Ferguson.
1294 \newblock Report on multilingual activities.
1295 \newblock {\em {TUG}boat}, 11(4):514--516, 1990.
1296
1297 \bibitem{fontinst} \emph{The \Pkg{fontinst} package}:
1298   \textlangle CTAN\textrangle\url{/fonts/utilities/fontinst}.
1299
1300 \bibitem{Rei:TB17-2-102} Fukui Rei:
1301   \emph{\textsl{TIPA}: A system for processing phonetic
1302       symbols in \LaTeX}.  In: TUGBoat, 17\#2, 1996.
1303   \url{http://www.tug.org/TUGboat/Articles/tb17-2/tb51rei.pdf}.
1304
1305 \bibitem{hyperref} \emph{The \Pkg{hyperref} package}:
1306   \url{http://www.tug.org/applications/hyperref}.
1307
1308 \bibitem{tub:JKn93}
1309 J\"org Knappen.
1310 \newblock Fonts for Africa: The fc Fonts.
1311 \newblock {\em {TUG}boat}, 14(2):104, 1993.
1312
1313 \bibitem{Knappen:TB17-2-96} J\"org Knappen:
1314   \emph{The \Pkg{dc} fonts~1.3: Move towards stability
1315       and completeness}.  In: TUGBoat 17\#2, 1996.
1316   \url{http://www.tug.org/TUGboat/Articles/tb17-2/tb51knap.pdf}.
1317
1318 \bibitem{A-W:DKn86}
1319 Donald~E. Knuth.
1320 \newblock {\em The {\TeX}book}.
1321 \newblock Volume~A of {\em Computers \& {T}ypesetting\/},
1322   May 1989.
1323 \newblock Eighth printing.
1324
1325 \bibitem{vnr} \emph{The \Pkg{vnr} font family}, developed by
1326    the author of pdf\TeX, {H\`an Th\^e\protect\llap{\raise 0.5ex\hbox{\'{\relax}}} Th\`anh}.
1327    \url{http://vntex.org/download/vntex}.
1328
1329  \bibitem{ipa} Home page of the International Phonetic Association.
1330    \url{http://www.arts.gla.ac.uk/IPA/ipa.html}.
1331
1332 \bibitem{A-W:LLa94}
1333 Leslie Lamport.
1334 \newblock {\em {\LaTeX:} A Document Preparation System}.
1335 \newblock Addison-Wesley, Reading, Massachusetts, second edition, 1994.
1336
1337 \bibitem{LH-Fonts} \emph{The \Pkg{lh}-Fonts for Cyrillic}:
1338   \textlangle CTAN\textrangle\url{/fonts/cyrillic/lh}.
1339
1340 \bibitem{A-W:MG2004}
1341 Frank Mittelbach and Michel Goossens.
1342 \newblock {\em The {\LaTeX} Companion second edition}.
1343 \newblock With Johannes Braams, David Carlisle, and Chris Rowley.
1344 \newblock Addison-Wesley, Reading, Massachusetts, 2004.
1345
1346 \bibitem{Unicode} \emph{The Unicode Standard}.
1347   \url{http://unicode.org}.
1348
1349 \bibitem{ucd} \emph{The Unicode Character Database}.
1350   \url{http://unicode.org/ucd}.
1351
1352 \bibitem{ziegler} Justin Ziegler, \emph{Technical
1353     Report on Math Font Encodings}, June 1994,
1354   \url{http://www.tug.org/twg/mfg/papers/ltx3pub/l3d007.ps.gz}.
1355
1356 \end{thebibliography}
1357
1358 \clearpage\appendix
1359 \begin{center}
1360   \Large\bfseries Appendices
1361 \end{center}
1362
1363 \section{Example code tables}
1364
1365 This appendix contains a table of each font mentioned as an ``example''
1366 font above, providing that the font was available when the document
1367 was processed with \LaTeX{}.  (\LaTeX{} generates a warning message
1368 for each font it fails to find.)
1369
1370 \subsection{Text encodings}
1371
1372 \ftable{cmr10}{OT1}
1373
1374 \ftable{wnr10}{OT2}
1375
1376 \ftable{wsuipa10}{OT3}
1377
1378 \ftable{plr10}{OT4}
1379
1380 %\ftable{artmr10}{OT6}
1381
1382 \ftable{ecrm1000}{T1}
1383
1384 \ftable{larm1000}{T2A}
1385
1386 \ftable{lbrm1000}{T2B}
1387
1388 \ftable{lcrm1000}{T2C}
1389
1390 \ftable{tipa10}{T3}
1391
1392 \ftable{fcr10}{T4}
1393
1394 \ftable{vnr10}{T5}
1395
1396
1397 \subsection{Text symbol encodings}
1398
1399 The full table for \Enc{TS1} as provided by the European Computer Modern family:
1400 \ftable{tcrm1000}{TS1}
1401
1402 \pagebreak
1403
1404 In contrast typical PostScript fonts usually have incomplete implementations
1405 of \Enc{TS1} sometimes missing more than half of the glyphs:
1406
1407 \ftable{ptmr8c}{TS1}
1408
1409 \ftable{tipx10}{TS3}
1410
1411
1412
1413 \subsection{Extended text encodings}
1414
1415 \ftable{rxrm1000}{X2}
1416
1417
1418 \subsection{Mathematical encodings}
1419
1420 \ftable{cmmi10}{OML}
1421
1422 \ftable{cmsy10}{OMS}
1423
1424 \ftable{cmex10}{OMX}
1425
1426
1427 \subsection{Other encodings}
1428
1429 \ftable{ptmr8y}{LY1}
1430
1431 %%\ftable{????}{LV1}
1432
1433 \ftable{grmn1000}{LGR}
1434
1435 \ftable{wasy10}{U}
1436 \ftable{logo10}{U}
1437
1438 \clearpage
1439 \section{Uppercase and lowercase tables}
1440 \label{sec:uclc-tab}
1441
1442 The following two sets of tables list the \verb"\uppercase" and
1443 \verb"\lowercase" values for each position in the \LaTeX{} standard
1444 256-character tables.
1445
1446 Each row of each table lists:
1447 \begin{quote}
1448   \begin{tabular}{lp{0.7\textwidth}}
1449     pos & The position in the table (0--255) \\
1450     lc  & The value in the \verb"\lowercase" table at the position \\
1451         & (note that value 0 here means that \verb"\lowercase" is
1452           ineffective for this character, and hyphenation does not apply
1453           to it) \\
1454     uc  & The value in the \verb"\uppercase" table at the position \\
1455         & (note that value 0 here means that \verb"\uppercase" is
1456           ineffective for this character) \\
1457     glyphs & The glyphs specified for the T1 encoding for this
1458              position, laid out as \meta{glyph}\textbf{(}\meta{lowercase
1459              glyph}\textbf{/}\meta{uppercase glyph}\textbf{)}
1460   \end{tabular}
1461 \end{quote}
1462
1463 \begin{center}
1464   \let\lctablenumbersize\footnotesize
1465   \makebox[\textwidth]{\hss
1466     \dolctable{0}{32}\quad\dolctable{32}{64}\quad
1467     \dolctable{64}{96}\quad\dolctable{96}{128}%
1468   \hss}
1469
1470   \makebox[\textwidth]{\hss
1471     \dolctable{128}{160}\quad\dolctable{160}{192}\quad
1472     \dolctable{192}{224}\quad\dolctable{224}{256}%
1473   \hss}
1474 \end{center}
1475 \end{document}
1476
1477
1478 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%