trunk/doc/encguide.tex

   1 % \iffalse meta-comment
   2 %
   3 % Copyright 1993-2016
   4 % The LaTeX3 Project and any individual authors listed elsewhere
   5 % in this file.
   6 %
   7 % This file is part of the LaTeX base system.
   8 % -------------------------------------------
   9 %
  10 % It may be distributed and/or modified under the
  11 % conditions of the LaTeX Project Public License, either version 1.3c
  12 % of this license or (at your option) any later version.
  13 % The latest version of this license is in
  14 %    http://www.latex-project.org/lppl.txt
  15 % and version 1.3c or later is part of all distributions of LaTeX
  16 % version 2005/12/01 or later.
  17 %
  18 % This file has the LPPL maintenance status "maintained".
  19 %
  20 % The list of all files belonging to the LaTeX base distribution is
  21 % given in the file `manifest.txt'. See also `legal.txt' for additional
  22 % information.
  23 %
  24 % The list of derived (unpacked) files belonging to the distribution
  25 % and covered by LPPL is defined by the unpacking scripts (with
  26 % extension .ins) which are part of the distribution.
  27 %
  28 % \fi
  29 %
  30
  31
  32 \NeedsTeXFormat{LaTeX2e}[1995/12/01]
  33
  34 \documentclass{ltxguide}[1994/11/20]
  35
  36 \usepackage[T1]{fontenc}
  37 \IfFileExists{lmodern.sty}{\usepackage{lmodern}}{}
  38 \usepackage{textcomp}
  39 \usepackage{url}
  40 \usepackage{mflogo}
  41
  42 \addtolength\textheight{6\baselineskip}
  43 \addtolength\topmargin{-2\baselineskip}
  44
  45
  46 \newcommand{\ttverb}[1]{\texttt{\string #1}}
  47 % (\ttverb prints its argument, passed through \string, in typewriter type)
  48
  49 % Markup for the name of a font encoding
  50 \providecommand{\Enc}[1]{\texttt{#1}}
  51
  52 % Markup for the name of a package
  53 \providecommand{\Pkg}[1]{\textsf{#1}}
  54
  55 % Markup for a file name
  56 \providecommand{\File}[1]{\texttt{#1}}
  57
  58 % Markup for a meta value: emphasised text between angle brackets
  59 \providecommand{\meta}[1]{%
  60   \ensuremath{\langle}%
  61   \emph{#1}%
  62   \ensuremath{\rangle}}
  63
  64 \usepackage{tabularx}
  65
  66 % An environment for presenting the description of an encoding
  67 %
  68 % Arguments:
  69 %  #1: name in LaTeX (e.g., OT1)
  70 %  #2: public name of the encoding (e.g., TeX text)
  71 %  #3: name of the author (e.g., Don Knuth)
  72 %  #4: range of the glyph slots used
  73 %  #5: variable slots
  74 %  #6: example font (may be empty, in which case `---' is printed)
  75 %  #7: reference
  76 %
  77 % XXX add code to handle more than a single font example (e.g., larm1000,
  78 % lbrm1000, and lcrm1000).
  79 \makeatletter % `@' must be a letter here: the body uses \@tempa and \@empty
  80 \newenvironment{encodinginfo}[7]%
  81   {\noindent
  82    \begin{tabularx}{\linewidth}{@{}l>{\raggedright\let\\\tabularnewline}X}%
  83      \LaTeX{} name:          & \texttt{#1}\\%
  84      Public name:          & #2\\%
  85      Author:                   & #3\\%
  86      Glyph slots used: & #4\\%
  87      Variable slots:     & #5\\%
  88      Font example:     & \def\@tempa{#6}\ifx\@tempa\@empty---%
  89                             \else\texttt{#6}\referenceftable{#6}\fi\\%
  90      Further reference:                & #7%
  91    \end{tabularx}%
  92    \par\nobreak
  93    \vspace*{3pt}%
  94    \quote
  95   }%
  96   {\endquote
  97    \vspace{6pt}}
  98
  99 \makeatletter
 100 \def\referenceftable#1{% the `%' keeps the line end from adding a space before `;'
     % #1 is a font name.  If the label `fonttable:#1' is known (that is,
     % \r@fonttable:#1 is defined), append a page reference to that font
     % table; otherwise add nothing.
 101   \@ifundefined{r@fonttable:#1}%
 102   \relax
 103   {;\space encoding table on page~\pageref{fonttable:#1}}%
 104 }
 105
 106 % font table macros mainly lifted from manmac.tex
     %
     % \oct{<digits>}: an hbox holding a roman \'{} followed by the digits in
     % italic -- used for the octal row and column labels.
 107 \def\oct#1{\hbox{\rm\'{}\kern-.2em\it#1\/\kern.05em}}
     % \hex{<digits>}: an hbox holding a roman \H{} followed by the digits in
     % typewriter -- used for the hexadecimal labels.
 108 \def\hex#1{\hbox{\rm\H{}\tt#1}}
 109
     % \oddline{<hex digit>}: ends the current row, puts a rule across 19
     % columns with no interline glue, and places the lowered, smashed label
     % \hex{<hex digit>x} in the column after the rule.
 110 \def\oddline#1{\cr\noalign{\nointerlineskip}
 111   \multispan{19}\hrulefill&
 112   \setbox0=\hbox{\lower 2.3pt\hbox{\hex{#1x}}}\smash{\box0}\cr
 113   \noalign{\nointerlineskip}}
     % \evenline: ends the current row and draws an \hrule below it.
 114 \def\evenline{\cr\noalign{\hrule}}
     % \chartstrut: an empty vbox 14pt high, lowered by 4.5pt; it is the
     % first item of every row (see the template in \beginchart).
 115 \def\chartstrut{\lower4.5pt\vbox to14pt{}}
     % \beginchart{<setup>}{<heading>}: opens a display, globally resets
     % \count@ to 0, executes <setup> (\ftable passes the font switch \X),
     % and starts an \halign to \hsize.  The first row holds <heading> and
     % the octal column labels 0--7.
 116 \def\beginchart#1#2{$$\global\count@=0 #1
 117   \halign to\hsize\bgroup
 118     \chartstrut##\tabskip0pt plus10pt&
 119     &\hfil##\hfil&\vrule##\cr
 120     \lower6.5pt\null
 121   &#2&&\oct0&&\oct1&&\oct2&&\oct3&&\oct4&&\oct5&&\oct6&&\oct7&\evenline}
     % \endchart: final row with the hexadecimal column labels 8--F; closes
     % the \halign and the display opened by \beginchart.
 122 \def\endchart{\raise11.5pt\null&&&\hex 8&&\hex 9&&\hex A&&\hex B&
 123   &\hex C&&\hex D&&\hex E&&\hex F&\cr\egroup$$}
     % \: typesets the character whose code is \count@ (in the current font,
     % between \noboundary's) into box 0, re-centres the box with \reposition
     % if it is taller than 7.5pt or deeper than 2.5pt, outputs it, and
     % globally advances \count@ by one.
 124 \def\:{\setbox0=\hbox{\noboundary\char\count@\noboundary}%
 125   \ifdim\ht0>7.5pt\reposition
 126   \else\ifdim\dp0>2.5pt\reposition\fi\fi
 127   \box0\global\advance\count@ by1 }
     % \reposition: replaces box 0 by a \vcenter of itself padded with 2pt
     % kerns above and below.
 128 \def\reposition{\setbox0=\hbox{$\vcenter{\kern2pt\box0\kern2pt}$}}
 129 \def\normalchart{% rows '00x--'17x of the table, then whatever \top stands for
     % Each row has an octal row label and eight cells produced by \: (which
     % advances \count@).  Rows are closed alternately by \oddline, which
     % carries a hexadecimal label, and by \evenline.  \top is \let to
     % \tophalf or \notophalf by \ftable before this macro is used.
 130   &\oct{00x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline0
 131   &\oct{01x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 132   &\oct{02x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline1
 133   &\oct{03x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 134   &\oct{04x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline2
 135   &\oct{05x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 136   &\oct{06x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline3
 137   &\oct{07x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 138   &\oct{10x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline4
 139   &\oct{11x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 140   &\oct{12x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline5
 141   &\oct{13x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 142   &\oct{14x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline6
 143   &\oct{15x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 144   &\oct{16x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline7
 145   &\oct{17x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 146   \top}
 147
 148 \def\notophalf{}
     % \notophalf (above) is the empty value of \top, chosen by \ftable when
     % the box of characters "80--"FF has no width.  \tophalf (below) is the
     % other value: it continues the table with rows '20x--'37x, built in the
     % same way as the rows of \normalchart.
 149 \def\tophalf{%
 150 %\noalign{\vskip 5pt\hrule}
 151   &\oct{20x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline8
 152   &\oct{21x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 153   &\oct{22x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline9
 154   &\oct{23x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 155   &\oct{24x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline A
 156   &\oct{25x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 157   &\oct{26x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline B
 158   &\oct{27x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 159   &\oct{30x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline C
 160   &\oct{31x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 161   &\oct{32x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline D
 162   &\oct{33x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 163   &\oct{34x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline E
 164   &\oct{35x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline
 165   &\oct{36x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\oddline F
 166   &\oct{37x}&&\:&&\:&&\:&&\:&&\:&&\:&&\:&&\:&\evenline}
 167
 168 \def\ftable#1#2{% #1: name of the font to show; #2: text printed after it in the heading
     % The font is loaded as \X between \batchmode and \errorstopmode --
     % presumably so that a missing font does not stop the run (TODO confirm).
     % Note that the interaction mode is always left at \errorstopmode.
 169      \batchmode
 170      \font\X=#1%
 171      \errorstopmode
     % \X equal to \nullfont means the font could not be loaded.
 172      \ifx\X\nullfont
 173        \@warning{Font #1 not found, table omitted}
 174      \else
     % Put the characters "80--"FF of the font into box 0; the width of that
     % box decides below whether the upper half of the table is printed.
 175        \count@="80
 176        \setbox0=\hbox{\X
 177         \loop\char\count@\advance\count@ by1 \ifnum\count@<"100
 178         \repeat}%
 179   \ifdim\wd0>0pt \let\top\tophalf\else\let\top\notophalf\fi
     % The heading carries the label `fonttable:#1', which \referenceftable
     % and the text of the document refer to.
 180      \beginchart\X{\hfill\llap{\textbf{#1, \large#2}\label{fonttable:#1}}}\normalchart
 181      \endchart\par\vfill
 182     \fi}
 183 \makeatother
 184
 185
 186 \setcounter{tocdepth}{3}
 187
 188 \title{\LaTeX{} font encodings}
 189
 190 \author{Frank Mittelbach \and Robin
 191    Fairbairns \and Werner Lemberg \and \LaTeX3 Project Team.}
 192
 193 \date{\copyright~Copyright 1995--2016 \\[5pt] 18 February 2016}
 194
 195 \begin{document}
 196
 197 \maketitle
 198
 199 \tableofcontents
 200
 201 \section{Introduction}
 202
 203 This document explains the ideas that underpin \LaTeX{} font
 204 encodings and the constraints that apply when defining a new encoding; it
 205 also lists the encodings that have already been defined.
 206
 207 \subsection{Encodings in \TeX{}}
 208
 209 \TeX{} (the program) implicitly recognises three sorts of encoding,
 210 and all are (in a sense) discussed in the \TeX{}book~\cite{A-W:DKn86}:
 211 \begin{itemize}
 212 \item[1.] The input encoding, which specifies the meanings of characters
 213   in files presented to \TeX{} for processing.  The \TeX{}book
 214   suggests that `your version of \TeX{} will recognise the characters
 215   you type on your keyboard' (\TeX{} the program has provision for
 216   static translations of input characters).
 217 \end{itemize}
 218 Such direct use of \TeX{}'s facilities is not the way modern
 219 \LaTeX{} (or indeed any other \TeX{} macro package) is likely to deal
 220 with input encodings.   This document does not address the topic of
 221 input encodings; the interested reader should examine the \LaTeX{}
 222 base package \Pkg{inputenc} \cite[sec.~7.5.2, p.~357]{A-W:MG2004}.
 223 \begin{itemize}
 224 \item[2.] The token stream that \TeX{} processes internally.  This stream
 225   of \TeX{}'s consciousness is discussed in great detail in the
 226   \TeX{}book.
 227 \end{itemize}
 228 Again, this document does not address the topic.  \LaTeX's internal
 229 character representation (\textsc{licr}) is well discussed in
 230 \cite[sec.~7.11.2, p.~442]{A-W:MG2004}.
 231 \begin{itemize}
 232 \item[3.] The font encoding---i.e., the mapping of character codes to
 233   glyphs in the fonts that are used to typeset \TeX{}'s output.
 234   Again, a set of font encodings is enumerated in the \TeX{}book, but
 235   that set has proved inadequate to the needs of modern multilingual
 236   use of \LaTeX.
 237 \end{itemize}
 238 This document explains \emph{why} Knuth's original set of encodings is
 239 inadequate to modern conditions, and discusses the issues that
 240 surround the design and definition of new font encodings.
 241
 242 Font encodings are important for more than their r\^{o}le in mapping the
 243 glyphs of the fonts to be used for typesetting: their glyph tables are
 244 also the context in which \TeX{}'s hyphenation algorithm operates.
 245 There are constraints imposed by \TeX{} that affect the way in which
 246 new font encodings, for use in a multi-lingual environment, may be
 247 structured (see section~\ref{sec:restrictions} for details).
 248
 249 \subsection{The history of \TeX{} font encodings}
 250
 251 Little attention was paid to font encodings prior to the arrival of
 252 \TeX{}\,3.  Up to that time, one used Donald Knuth's fonts (the
 253 Computer Modern family, using the encodings we now refer to as \Enc{OT1} and
 254 the \Enc{OM} series), or one was on one's own.
 255
 256 The Computer Modern text encoding raises problems in unmodified
 257 \TeX{}, because hyphenation cannot break words containing
 258 \verb"\accent" commands.  Even in those Western European languages for
 259 which the \Enc{OT1} encoding has symbols for the necessary
 260 \verb"\accent"-based diacritics, this shortcoming ruins typesetting of
 261 running text.
 262
 263 With the advent of \TeX{}\,3, with its ability to switch between
 264 hyphenation pattern sets, it was clear that the situation could not
 265 continue.  Thus a group at the TUG Annual General Meeting in Cork,
 266 Ireland, specified a uniform encoding for 256-glyph fonts, that
 267 contains accented letters and non-\textsc{ascii} letters necessary to
 268 express most Western European languages (and some Eastern European ones)
 269 without recourse to the \verb"\accent" command.
 270
 271 This ``Cork'' encoding has since been realised in a series of fonts
 272 designed with Metafont, in at least one font series that is available
 273 both in Adobe Type 1 format and in OpenType format, % viz., Latin Modern
 274 and in a number of virtual-font mappings of other font series.
 275
 276 Since the time of the Cork meeting, much effort has been devoted to
 277 the design of encodings for text fonts to use with \TeX{}, and the
 278 Cork encoding influenced the design of many such encodings.
 279
 280 Encodings for mathematical fonts have, in contrast, changed little
 281 since Knuth's contributions.  A TUG Technical Working Group was
 282 established at the Cork meeting, whose aim was to define a set of
 283 256-glyph encodings to regularise and extend Knuth's originals, using
 284 ideas from several other fonts that had appeared since, and from the
 285 known needs of researchers in mathematics and the mathematical sciences.
 286
 287 Independently, a first proposal (the so-called \emph{Aston proposal}) was worked
 288 out by Justin Ziegler together with Frank Mittelbach and other members of the
 289 \LaTeX3 project team~\cite{ziegler}. A first implementation of
 290 this proposal was realized by Matthias Clasen and Ulrik
 291 Vieth~\cite{clasen,clasen-vieth}.
 292
 293 However, the slow progress of these Mathematical encodings has been
 294 overtaken by the addition (in the last decade or so) of a large number
 295 of mathematical symbols to Unicode~\cite{beeton}; one can expect
 296 further changes so that new public mathematical font encodings will
 297 most likely be delayed still further.
 298
 299
 300
 301 \subsection{Further information}
 302
 303 For a general introduction to \LaTeX, including the new features of
 304 \LaTeXe, you should read \emph{\LaTeXbook},
 305 Leslie Lamport, Addison Wesley, 2nd~ed, 1994.
 306
 307 A more detailed description of the new features of \LaTeX, including an
 308 overview of more than 200 packages and nearly 1000 ready to run examples, is
 309 to be found in \emph{\LaTeXcomp{} second edition} by Frank Mittelbach and
 310 Michel Goossens~\cite{A-W:MG2004}.
 311
 312 The \LaTeX{} project sponsored a report on Mathematical % spelt out in full
 313 font encodings, which
 314 is worth reading for its insight into the problems of defining the way
 315 in which math is used: see~\cite{ziegler,clasen,clasen-vieth}.
 316
 317 The \LaTeX{} font selection scheme is based on \TeX, which is described
 318 by its developer in \emph{The \TeX book}, Donald E.~Knuth, Addison
 319 Wesley, 1986, revised in 1991 to include the features of \TeX~3.
 320
 321 For more information about \TeX{} and \LaTeX, please contact your local
 322 \TeX{} Users Group, or the international \TeX{} Users Group
 323 (\url{http://www.tug.org}).
 324
 325
 326
 327 \section{Existing font encodings}
 328
 329 This section lists the encodings currently assigned; for each
 330 encoding, we list the registered (\LaTeX{}) name, the assigned purpose
 331 of the encoding, and the author.  Further details may list the code
 332 positions used in the encoding, the \emph{variable slots} (see below),
 333 an example font (for which a listing will be provided later in the
 334 document if the relevant fonts are present), and a source for further
 335 reference.
 336
 337 While the characteristic feature of an encoding is that each font
 338 encoded according to the encoding should have the same glyph set,
 339 there are some encodings (notably \Enc{OT1} and its descendants) in
 340 which a few glyph code slots differ in their contents in different
 341 fonts.
 342
 343 \subsection{Naming conventions}
 344
 345 Names for encoding schemes are strings of up to three letters (all
 346 upper case) plus digits.
 347
 348 The \LaTeX3 project reserves the use of encoding names starting with the
 349 following letters: |T| (standard 256-long text encodings), |TS|
 350 (symbols that are designed to extend a corresponding |T| encoding),
 351 |X| (text encodings that do not conform to the strict requirements for
 352 |T| encodings), |M| (standard 256-long mathematical encodings), |S| (other
 353 symbol encodings), |A| (other special applications), |OT| (standard
 354 128-long text encodings), and |OM| (standard 128-long mathematical encodings).
 355
 356 Please do not use the above starting letters for non-portable
 357 encodings.  If new standard encodings emerge then we shall add them in
 358 a later release of \LaTeX.
 359
 360 Encoding schemes which are local to a site or a system should start
 361 with |L|, experimental encodings intended for wide distribution will
 362 start with |E|, whilst |U| is for Unknown or Unclassified encodings.
 363
 364 \begin{quote}
 365   \itshape We recommend that new encoding names should not be
 366   introduced unless careful consideration and discussion in the user
 367   community has confirmed the need for the encoding. If encodings have to
 368   change from font to font, a number of problems arise, so it is best to
 369   develop encodings that can be used with a large number of fonts in parallel.
 370   This allows documents to be typeset using different fonts without problems.
 371
 372   The \Enc{TS1} encoding is a good example of a \emph{bad} encoding (even
 373   though it was developed with the best intentions) as a huge number of fonts
 374   can only implement parts of it. Similarly, the fact that the few sets of
 375   available mathematical fonts (besides Computer Modern Math) nearly
 376   all implement slightly different encodings is a huge source of
 377   problems. Don't add to this if possible!
 378 \end{quote}
 379
 380
 381 \subsection{128$^+$ glyph encodings (text)}
 382
 383 The `OT' series of font encodings starts with Donald Knuth's original
 384 text encoding, that used for the text fonts in the earliest releases
 385 of \TeX{} itself.  The `O' of the encoding designator may be taken as
 386 signifying `original', or just `old'.
 387
 388 \begin{encodinginfo}{OT1}
 389         {\TeX{} text}
 390         {Donald Ervin Knuth}
 391         {0x00--0x7F}
 392         {0x0B--0x0F, 0x24, 0x3C, 0x3E, 0x5C, 0x7B--0x7D}
 393 % {0X--'177}
 394 % {'13--'17, '44, '74, '76, '134, '173--'175}
 395         {cmr10}
 396         {\cite[p.427]{A-W:DKn86}}
 397
 398   Donald Knuth designed his font encoding (and hence his fonts) in a
 399   very different environment from that which now pervades the \TeX{}
 400   world: his (mainframe) computer had very little memory, there was
 401   little experience in (or demand for) multilingual technical
 402   typesetting, and as a result it was appropriate to sacrifice
 403   uniformity for efficiency.
 404
 405   Thus Knuth's original fonts differ slightly in some encoded slots:
 406   for example, the glyphs \texttt{\string<}, \texttt{\string>},
 407   \verb=\=, \verb={=, and \verb=}= are only available in the
 408   typewriter fonts and the \textdollar{} and \textsterling{} signs
 409   share the same position (in different font shapes).
 410
 411   This means that direct selection of these slots can produce
 412   unpredictable results, e.g., typing \texttt{\string<} or
 413   \verb=\symbol{'74}= in a document can yield `\textquestiondown'.
 414 \end{encodinginfo}
 415
 416
 417 \begin{encodinginfo}{OT2}
 418         {UW cyrillic encoding}
 419         {University of Washington}
 420         {0x00--0x7F}
 421         {---}
 422         {wnr10}
 423         {\cite{Beeton:TB6-3-124}}
 424   Support for this encoding is available in the Cyrillic bundle although for
 425   all practical purposes it is better to use one of the \Enc{T2} encodings.
 426 \end{encodinginfo}
 427
 428
 429 \begin{encodinginfo}{OT3}
 430         {UW IPA encoding}
 431         {University of Washington}
 432         {0x00--0x7f}
 433         {---}
 434         {wsuipa10}
 435         {\cite[p.149]{CorkGW:91}}
 436   The \Enc{OT3} encoding was never really used with \LaTeXe{}
 437   following the introduction of the TIPA system which offers much
 438   better support for IPA. In particular, no \File{ot3enc.def}
 439   file was ever produced.
 440 \end{encodinginfo}
 441
 442
 443 \begin{encodinginfo}{OT4}
 444         {Polish text encoding}
 445         {B.~Jackowski and M.~Ry\'cko} %% ?  Marcin Woli\'nski
 446   {0x00--0x7F, 0x81, 0x82, 0x86, 0x8A, 0x8B, 0x91, 0x99, 0x9B, 0xA1,
 447    0xA2, 0xA6, 0xAA, 0xAB, 0xAE, 0xAF, 0xB1, 0xB9, 0xBB, 0xD3, 0xF3,
 448    0xFF}
 449   {0x0B--0x0F, 0x24, 0x3C, 0x3E, 0x5C, 0x7B--0x7D}
 450         {plr10}
 451         {---}
 452
 453    While Knuth included the means of typesetting the `lost L' (\L) in
 454   his \Enc{OT1} encoding, he omitted the ogonek (\,\,\k{}), a diacritic
 455   mark that is also needed in Polish text; hence the appearance, well
 456   before the \Enc{T1} encoding, of fonts using this encoding.
 457 \end{encodinginfo}
 458
 459 \begin{encodinginfo}{OT5}
 460         {Not currently allocated}
 461         {---}
 462         {---}
 463         {---}
 464         {}
 465         {---}
 466
 467 \end{encodinginfo}
 468
 469
 470
 471 \begin{encodinginfo}{OT6}
 472         {Armenian text encoding}
 473         {Serguei Dachian}
 474         {0x03--0x0F, 0x13--0x7F}
 475         {---}
 476         {artmr10}
 477         {---}
 478
 479   This encoding was allocated to permit use of Dachian's
 480   Armenian fonts in a standard \LaTeX{} environment.
 481
 482   Because of license issues the \texttt{artmr} fonts are not necessarily
 483   included in distributed \TeX{} installations (and for this reason the
 484   corresponding encoding table is not shown below). However, the fonts
 485   and the support macros can be found on the CTAN archives (look for
 486   \texttt{armtex}).
 487
 488 \end{encodinginfo}
 489
 490
 491
 492 \subsection{256 glyph encodings (text)}
 493
 494 \begin{encodinginfo}{T1}
 495         {Cork encoding}
 496         {Euro \TeX{} conference at Cork}
 497         {0x00--0xFF}
 498         {---}
 499         {ecrm1000}
 500         {\cite[p.514]{tub:MFe90}, \cite[p.99]{Knappen:TB17-2-96}}
 501
 502   The Cork encoding was developed so that advantage could be taken of
 503   the (then) new facilities of \TeX{}\,3, allowing hyphenation of
 504   most Western European (and some Eastern European) languages in an
 505   unmodified version of \TeX{}.
 506
 507   The encoding was developed in the absence of any extant effort at
 508   font design, but instances written in Metafont (the `EC' fonts), and
 509   more recently Adobe Type 1 instances of the same fonts have become
 510   available.
 511
 512   Substantial (but incomplete) instances have also been developed,
 513   which use virtual fonts.  These latter instances map either Knuth's
 514   original (OT1-encoded) fonts, or commercial fonts that contain the
 515   Adobe `standard' set of 224 glyphs.
 516 \end{encodinginfo}
 517
 518 \begin{encodinginfo}
 519   {T2A, T2B, T2C}
 520   {Cyrillic encodings}
 521   {The CyrTUG font team}
 522   {0x00--0xFF}
 523   {--- (within each encoding)}
 524   {larm1000}
 525   {\cite{Berdnikov:eurotex-98}}
 526
 527   There are too many glyphs in the full Cyrillic complement of
 528   languages for all of them to be covered by a single
 529   \LaTeX{}-compliant encoding (the lower half of each
 530   \Enc{T2}~encoding is identical to that of \Enc{T1}, in order that
 531   each should be a conforming \LaTeX{} encoding~--- see
 532   section~\ref{sec:restrictions}).  The approach taken is
 533   therefore to develop a single encoding, \Enc{X2} (see \ref{sec:extendedenc})
 534   which contains all the glyphs needed for the full set of
 535   languages, and then to derive the three \LaTeX{}-compliant
 536   \Enc{T2}-family encodings using the \Enc{X2} set together with that of
 537   \Enc{T1}.
 538
 539 \end{encodinginfo}
 540
 541
 542
 543 \begin{encodinginfo}{T3}
 544         {IPA encoding}
 545         {FUKUI Rei, University of Tokyo}
 546         {0x00--0xFF}
 547         {---}
 548         {tipa10}
 549         {\cite[p.102]{Rei:TB17-2-102}}
 550
 551
 552     The \Enc{T3} encoding (and associated macros) provides the glyphs required
 553   in phonetic description according to current International Phonetic
 554   Association recommendations \cite{ipa}.
 555
 556   The \Enc{T3} encoding does \emph{not fulfil} the requirements for \Enc{T}
 557   encodings---the name is a historical accident. The correct name would be
 558   \Enc{X3}, but due to the fact that this font family has been used under its
 559   current encoding name for a long time, the name will not change for
 560   compatibility reasons.
 561
 562 \end{encodinginfo}
 563
 564
 565
 566 \begin{encodinginfo}{T4}
 567         {African Latin (fc)}              % public name
 568         {J\"org Knappen}              % author name
 569         {0x00--0xFF}              % range(s) of slots used for glyphs
 570         {0x24}         % range(s) of slots with variable glyphs if any
 571         {fcr10}              % name of an example font
 572         {\cite{tub:JKn93}}
 573
 574 The African Latin fonts contain in their lower half (0--127) the same
 575 characters as the European Latin (T1-encoded) fonts, while in their
 576 upper half (128--255) they
 577 contain letters and symbols for African languages that use extended
 578 Latin alphabets.
 579 Due to lack of space, J\"org had to play the unfortunate trick of
 580 assigning \verb=\textdollar= and \verb=\textsterling=
 581 the same position; users should take these characters
 582 from the text companion font, if they are needed.  Instead of defining
 583 a lot of new control sequences for the single letters, there are three
 584 accent-like control sequences with general purpose:
 585 \verb=\m= (Modified-1),
 586 \verb=\M= (Modified-2) and
 587 \verb=\B= (Barred).
 588 Most standard \LaTeX{} encoding-dependent commands
 589 work.  However, the Icelandic special letters are not available and `best
 590 replacements' for \verb=\Th=, \verb=\th=, and \verb=\dh=
 591 are used (barred T and d resp.).
 592 \end{encodinginfo}
 593
 594
 595 \begin{encodinginfo}{T5}
 596         {Vietnamese encoding}
 597         {Werner Lemberg and
 598          Vladimir Volovich}
 599         {0x00--0xFF}
 600         {---}
 601         {vnr10}
 602         {\cite{vnr}}
 603
 604   The \Enc{T5} encoding was developed for Vietnamese. Again, this encoding
 605   \emph{does not} conform to the requirements for a \Enc{T}-encoding
 606   because its large number of accented letters prevent the \verb=\lccode= and
 607   \verb=\uccode= mapping requirements for \Enc{T} encodings from being
 608   fulfilled.  However, the Vietnamese language does not
 609   use word division in typesetting, so this requirement is
 610   actually not important for this particular language.
 611   Since every glyph used in Vietnamese text is internally
 612   represented as \textsc{licr} macros, the commands  \verb=\MakeUppercase= and
 613   \verb=\MakeLowercase= still work as expected (as they change the case of the
 614   \textsc{ascii} characters in \textsc{licr} definitions).
 615
 616 \end{encodinginfo}
 617
 618 \begin{encodinginfo}
 619   {T6}
 620   {Armenian}
 621   {---}
 622   {---}
 623   {---}
 624   {}
 625   {---}
 626
 627     This encoding is reserved to permit future expansion of Armenian
 628   \TeX{} to use 256-character (hyphenatable) fonts.
 629 \end{encodinginfo}
 630
 631 \begin{encodinginfo}{T7}
 632         {Greek encoding}
 633    {---}
 634    {---}
 635    {---}
 636    {}
 637    {---}
 638
 639 The name is already reserved for a 256 glyph Greek encoding. The encoding
 640 itself hasn't been defined so far.
 641
 642 \end{encodinginfo}
 643
 644
 645
 646 \subsection{256$^-$ glyph encodings (text symbols)}
 647
 648 \begin{encodinginfo}{TS1}
 649         {Text Companion encoding (Cork)}
 650         {J\"org Knappen}
 651   {0x00--0x0D, 0x12, 0x15, 0x16, 0x18--0x1D, 0x20, 0x24, 0x27, 0x2A,
 652    0x2C--0x3A, 0x3C--0x3E, 0x4D, 0x4F, 0x57, 0x5B, 0x5D--0x60,
 653    0x62--0x64, 0x6C--0x6E, 0x7E--0xBF, 0xD6, 0xF6}
 654   {---}
 655         {tcrm1000}
 656         {\cite{Knappen:TB17-2-96}}
 657
 658    The text symbol encoding offers access to symbolic glyphs that are
 659   commonly used in text (for a variety of reasons), and whose style
 660   should vary with the text that surrounds them.
 661
 662   Unfortunately, the \Enc{TS1} encoding was developed without
 663   reference to the glyphs available in existing commercial fonts.
 664   As a result, only font families
 665   explicitly developed for \TeX{} (i.e., typically originating with
 666   \MF{}) actually contain all glyphs required by the \Enc{TS1}
 667   encoding.  Most other font families (whether free or commercial)
 668   often only provide half of the set%
 669 %%
 670 %% don't show the comment if the tables are not generated
 671 %%
 672 \expandafter\ifx\csname r@fonttable:tcrm1000\endcsname\relax
 673 \else
 674   \expandafter\ifx\csname r@fonttable:ptmr8c\endcsname\relax
 675   \else
 676     \space (compare the two tables for \Enc{TS1} on
 677      pages~\pageref{fonttable:tcrm1000}
 678      and~\pageref{fonttable:ptmr8c})%
 679   \fi
 680 \fi.
 681   To improve this situation somewhat, NFSS provides a way to define encoding
 682   subsets on a per family basis in the \Pkg{textcomp} package (which
 683   package offers support for the \Enc{TS1} encoding).
 684 \end{encodinginfo}
 685
 686
 687 \begin{encodinginfo}{TS3}
 688         {IPA symbol encoding}
 689         {FUKUI Rei, University of Tokyo}
 690         {0x00--0x0A, 0x20--0x49, 0x50--0x56, 0x70--0x7B}
 691         {---}
 692         {tipx10}
 693         {\cite{Rei:TB17-2-102}}
 694
 695   The \Enc{TS3} encoding (together with the \Enc{T3} encoding) provides the
 696   glyphs for typesetting phonetic transcriptions following the
 697   guidelines of the International Phonetic Association \cite{ipa}.  Support
 698   is offered through the \Pkg{tipa} package.
 699 \end{encodinginfo}
 700
 701
 702
 703
 704 \subsection{256 glyph encodings (text extended)}
 705 \label{sec:extendedenc}
 706
 707 \begin{encodinginfo}
 708   {X2}
 709   {Cyrillic glyph container}
 710   {The CyrTUG font team}
 711   {0x00--0xFF}
 712   {---}
 713   {rxrm1000}
 714   {\cite{Berdnikov:eurotex-98}}
 715
 716   This encoding specifies the glyph container for Cyrillic characters,
 717   which is used in specifying the \Enc{T2A}, \Enc{T2B} and \Enc{T2C} encodings.
 718 \end{encodinginfo}
 719
 720
 721
 722
 723 \subsection{128$^+$ glyph encodings (mathematics)}
 724
 725
 726 \begin{encodinginfo}{OML}
 727         {\TeX{} math italic}
 728         {Donald Ervin Knuth}
 729         {0x00--0x7F}
 730         {---}
 731         {cmmi10}
 732         {\cite[p.430]{A-W:DKn86}}
 733
 734   The \Enc{OML} encoding contains italic Latin and Greek letters for
 735   use in mathematical formulas (typically used for variables) together
 736   with some symbols.
 737
 738 \end{encodinginfo}
 739
 740 \begin{encodinginfo}{OMS}
 741         {\TeX{} math symbol}
 742         {Donald Ervin Knuth}
 743         {0x00--0x7F}
 744         {---}
 745         {cmsy10}
 746         {\cite[p.431]{A-W:DKn86}}
 747
 748   The  \Enc{OMS} encoding contains basic mathematical symbols,
 749   together with an uppercase ``calligraphic'' Latin alphabet.
 750 \end{encodinginfo}
 751
 752
 753 \begin{encodinginfo}{OMX}
 754         {\TeX{} math extension}
 755         {Donald Ervin Knuth}
 756         {0x00--0x7F}
 757         {---}
 758         {cmex10}
 759         {\cite[p.432]{A-W:DKn86}}
 760
 761   \Enc{OMX} encodes mathematical symbols with variable sizes, such as
 762   the $\sum$ sign, which changes its size if used in displayed
 763   formulas, and the construction parts for
 764   brackets, braces and radicals, etc., which can stretch to accommodate
 765   the thing they're enclosing.
 766
 767 \end{encodinginfo}
 768
 769
 770
 771
 772 \subsection{256 glyph encodings (mathematics)}
 773
 774 So far there are no 256 glyph mathematical encodings. A proposal is
 775 given in \cite{ziegler}.
 776
 777
 778 \subsection{Other encodings}
 779
 780 \begin{encodinginfo}
 781   {C..}
 782   {CJK encodings}
 783   {Werner Lemberg}
 784   {0x00--0xFF}
 785   {---}
 786   {} % no font, of course
 787   {\cite{CJK}}
 788
 789   The \Pkg{CJK} package defines a number of encodings which access Chinese,
 790   Japanese and Korean fonts.
 791
 792 \end{encodinginfo}
 793
 794 \begin{encodinginfo}
 795   {E..}
 796   {Experimental encodings}
 797   {---}
 798   {0x00--0xFF}
 799   {all}
 800   {}
 801   {\cite[p.416]{A-W:MG2004}}
 802
 803   As the name indicates, encodings starting with the letter \Enc{E} are
 804   intended for experimental encodings, that are still likely to change.
 805 \end{encodinginfo}
 806
 807 \begin{encodinginfo}{L..}
 808         {Local encoding (site dependent)}
 809         {---}
 810         {0x00--0xFF}
 811         {all}
 812         {}
 813         {\cite[p.416]{A-W:MG2004}}
 814
 815         `Local' encodings provide the means to develop representation
 816         techniques that are suited to a particular \TeX{} environment.  While
 817         the developer has freedom to specify their encoding as he or she
 818         pleases, there is a strong incentive to obey the \LaTeX{} rules for
 819         encodings, since it will otherwise be difficult to compose text using
 820         the encoding.
 821
 822         At least it was the intention that \Enc{L..} encodings are local and
 823         site dependent. However, a number of such encodings became generally
 824         used without ever getting a different name allocated.
 825
 826 \end{encodinginfo}
 827
 828
 829
 830 \begin{encodinginfo}{LY1}
 831         {Y\&Y 256 glyph encoding}
 832         {Berthold Horn}
 833         {0x00--0x08, 0x0C, 0x10, 0x12--0xFF}
 834         {\emph{believed none}}
 835         {ptmr8y}
 836         {\cite[p.416]{A-W:MG2004}}
 837
 838         This is an alternative to the \Enc{T1} encoding developed by Y\&Y and
 839         used in their commercial \TeX{} implementation.
 840
 841 \end{encodinginfo}
 842
 843
 844 \begin{encodinginfo}{LV1}
 845         {MicroPress encoding}
 846         {Michael Vulis}
 847         {\emph{unknown}}
 848         {\emph{unknown}}
 849         {}
 850         {\cite[p.416]{A-W:MG2004}}
 851
 852         This is an encoding developed by MicroPress and used for some of their
 853         fonts.
 854
 855 \end{encodinginfo}
 856
 857
 858 \begin{encodinginfo}{LGR}
 859         {Greek 256 glyph encoding}
 860         {\emph{unknown}}
 861         {0x00--0xFF}
 862         {\emph{believed none}}
 863         {grmn1000}
 864         {\cite[p.575]{A-W:MG2004}}
 865
 866         Currently the main encoding in use for the Greek language.
 867
 868         This encoding doesn't conform to the restrictions for
 869         \Enc{T}-encodings described in section~\ref{sec:restrictions} on
 870         page~\pageref{sec:restrictions} as it doesn't have \textsc{ascii}
 871         glyphs at all.
 872
 873 \end{encodinginfo}
 874
 875
 876 \begin{encodinginfo}
 877   {PD1}
 878   {PDF DocEncoding}
 879   {Adobe}
 880   {0x08--0x0A, 0x0C, 0x0D, 0x18--0x7E, 0x80--0x9E, 0xA0--0xAE, 0xB0--0xFF}
 881   {---}
 882   {}
 883   {\cite{Adobe:PDF-1.6}, \cite{hyperref}}
 884
 885   The \Enc{PD1} encoding is a virtual encoding with 256 glyphs needed to
 886   produce bookmarks and similar text in PDF documents generated with pdf\LaTeX.
 887   The encoding is ``virtual'' because by design there are no \TeX{}
 888   fonts that cover \Enc{PD1}. Details can be found in appendix D.1
 889   of~\cite{Adobe:PDF-1.6}.
 890 \end{encodinginfo}
 891
 892 \begin{encodinginfo}
 893   {PU}
 894   {PDF Unicode Encoding}
 895   {Adobe}
 896   {---}
 897   {---}
 898   {}
 899   {\cite{Adobe:PDF-1.6}, \cite{hyperref}}
 900
 901   Another virtual encoding (with more than 600 characters) for
 902   Unicode-encoded bookmarks in PDF documents.
 903 \end{encodinginfo}
 904
 905 \begin{encodinginfo}{U}
 906         {Unknown encoding}
 907         {---}
 908         {potentially 0x00--0xFF}
 909         {all}
 910         {wasy10}
 911         {\cite[p.416]{A-W:MG2004}}
 912
 913   This encoding should be used for fonts that resist classification,
 914   e.g., when it is clear that there will never be more than one font
 915   using the same encoding.
 916
 917 \end{encodinginfo}
 918
 919
 920
 921 \section{Restrictions}
 922 \label{sec:restrictions}
 923
 924
 925 \subsection{Required glyphs for general text encodings}
 926
 927 Encodings that are supposed to be used with \LaTeX{} for `general
 928 purpose text fonts' need to have certain fixed glyphs in certain
 929 encoding slots.  A `general purpose text font' is one intended for
 930 arbitrary natural language text and not just within special
 931 environments (such as the phonetic alphabet) or just for typesetting
 932 individual symbols (e.g., the text companion font with encoding
 933 \Enc{TS1}).
 934
 935 This is the case for the following glyphs that have to be in their
 936 \textsc{ascii} positions for general purpose text encodings:
 937 \begin{center}
 938 \begin{tabular}[t]{cc}
 939   Glyph & Position \\ \hline
 940   !     & \number`\!    \\
 941   '     & \number`\'    \\
 942   (     & \number`\(    \\
 943   )     & \number`\)    \\
 944   \relax*       & \number`\*    \\
 945   +     & \number`\+    \\
 946   ,     & \number`\,    \\
 947   -     & \number`\-    \\
 948   .     & \number`\.    \\
 949   /     & \number`\/    \\
 950   0 \ldots\ 9   & \number`\0\ to \number`\9     \\
 951   \end{tabular}
 952   \quad
 953   \begin{tabular}[t]{cc}
 954   Glyph & Position \\ \hline
 955   :     & \number`\:    \\
 956   ;     & \number`\;    \\
 957   =     & \number`\=    \\
 958   ?     & \number`\?    \\
 959   @     & \number`\@    \\
 960   A \ldots\ Z   & \number`\A\ to \number`\Z     \\
 961   \relax[       & \number`\[    \\
 962   ]     & \number`\]    \\
 963   `     & \number`\`    \\
 964   a \ldots\ z   & \number`\a\ to \number`\z     \\
 965 \end{tabular}
 966 \quad
 967 \begin{tabular}[t]{cc}
 968 Glyph\footnotemark      & Position \\ \hline
 969 <       & \number`\<    \\
 970 >       & \number`\>    \\
 971 \string|        & \number`\|    \\
 972 \end{tabular}\footnotetext{The requirement for these three glyphs is
 973   violated in the Latin alphabet \Enc{OT} encodings.}
 974 \end{center}
 975 In addition the following glyphs have to be present
 976 somewhere\footnote{The position in this case is not important as they
 977 are generated from ligature programs.} in the encoding together with
 978 corresponding ligature programs to generate them:
 979 \begin{center}
 980 \begin{tabular}[t]{cc}
 981 Glyph   & Ligature program \\ \hline
 982  ``     & \texttt{`\/`} \\
 983  ''     & \texttt{'\/'} \\
 984  --     & \texttt{-\/-} \\
 985  ---    & \texttt{-\/-\/-} \\
 986 \end{tabular}
 987 \end{center}
 988
 989 This is $33 + 2 \times 26 = 85$ positions ``required'', which leaves 171
 990 positions free.
 991
 992 If there are free slots available then adding all or some of the
 993 diacritics would be the best way to fill them.
 994
 995 If there are insufficient slots for the characters needed, a possible
 996 technique is to create a subsidiary encoding, and to move non-letter
 997 characters to it.  Since only ``letters'' take part in the hyphenation
 998 algorithm, this technique doesn't affect the appearance of the typeset
 999 result.
1000
1001 \subsection{The constraints on upper/lower case tables}
1002
1003 Due to some technical restrictions of \TeX{} related to hyphenation it
1004 is not possible in \LaTeX{} to use more than one \verb=\lccode= or
1005 \verb=\uccode= table. Therefore all encodings need to share these two
1006 tables which are defined to be those of the \Enc{T1} encoding.
1007
1008 The \Enc{T1} encoding has some nasty peculiarities which make certain slot
1009 positions more or less unusable for other encodings if this
1010 restriction is to be obeyed. This is unfortunate but since \Enc{T1} is well
1011 established and the basis for a large number of languages it seemed
1012 better to live with this situation instead of trying to replace \Enc{T1} with a
1013 slightly better standard (with the result that for a long time
1014 different \LaTeX{} installations would not be able to communicate with
1015 each other because of incompatible font sets).
1016
1017 The positions that are problematic are as follows.
1018 \begin{center}
1019 \begin{tabular}{lp{.8\linewidth}}
1020 25 (\char 25) & uppercase maps strangely (same as for 105, \char 105)\\
1021 26 (\char 26) & uppercase maps strangely (same as for 106, \char 106)\\
1022 27 (\char 27) & lowercase maps to itself which makes this slot subject
1023                 to hyphenation (used to support \Enc{OT1} encoding) \\
1024 157 (\char 157) & lowercase maps strangely (same as for 73, \char 73) \\
1025 158 (\char 158) & uppercase maps strangely (same as for 240, \char 240) \\
1026 \end{tabular}
1027 \end{center}
1028 One way to use such slots is to fill them with ligature glyphs as
1029 \TeX{} will not consult these tables for glyphs constructed through
1030 ligature programs but instead uses the entries for the individual
1031 glyphs used to produce the ligature.
1032
1033 A complete listing of the uppercase/lowercase mapping tables is to be
1034 found in section~\ref{sec:uclc-tab} (page \pageref{sec:uclc-tab}).
1035
1036 \newcount\temp \newcount\tempL \newcount\tempU
     % Scratch counters for the \lccode/\uccode listing macros below:
     % \temp is the slot currently being listed, \tempL and \tempU receive
     % that slot's \lccode and \uccode values (assigned in \nextstep).
1037
1038 \def\nextstep{\global\tempL=\lccode\temp
     % \nextstep: emit one tabular row for slot \temp and continue the loop.
     % It stores \lccode\temp and \uccode\temp in \tempL and \tempU, then
     % typesets four cells: the position (preceded by \lctablenumbersize,
     % which the caller must define), the lc value, the uc value, and the
     % glyph cell produced by \printlowerupper.  After the row end (\\) it
     % increments \temp and hands control back to \stepprint.
     % All counter assignments are \global, presumably because the code
     % runs inside alignment cells, whose grouping would otherwise discard
     % them -- confirm before changing.
1039               \global\tempU=\uccode\temp
1040               \lctablenumbersize\the\temp &
1041               \the\tempL&
1042               \the\tempU&\printlowerupper{\the\temp}{\the\tempL}{\the\tempU}\\
1043                \global\advance\temp by 1
1044                \stepprint}
1045
1046 \def\printlowerupper#1#2#3{\char#1\relax
     % \printlowerupper{<pos>}{<lc>}{<uc>}: typeset the glyph in slot #1
     % followed by ``(<lowercase glyph>/<uppercase glyph>)''.
     % #2 and #3 are character codes; a code of 0 is shown as ``--''
     % instead of a glyph.  Each \char number is ended by the token that
     % follows it, so keep the line layout as it is.
1047    (\ifnum#2=0\relax--\else\char#2\fi
1048    /\ifnum#3=0\relax--\else\char#3\fi)}
1049
1050 \def\stepprint{\relax\ifnum\temp<\endval
     % \stepprint: loop test for the table rows.  While \temp is below
     % \endval (set by \dolctable) \next becomes \nextstep, which prints a
     % row and calls \stepprint again; otherwise \next becomes \relax and
     % the loop stops.  \next is executed only after the closing \fi.
1051                     \let\next=\nextstep
1052                \else
1053                      \let\next=\relax
1054                \fi
1055                \next}
1056
1057 \def\dolctable#1#2{{\temp=#1\relax
     % \dolctable{<first>}{<end>}: typeset a top-aligned four-column
     % tabular (pos, lc, uc, glyphs) listing the slots from <first> up to,
     % but not including, <end>.  Inside an extra brace group it sets
     % \temp to #1, \endval to #2 and \tabcolsep to 1.5pt, prints the
     % header row followed by \hline, and lets \stepprint generate the rows.
     % \lctablenumbersize (used by \nextstep) must be defined by the caller.
1058 \def\endval{#2}%
1059 \setlength\tabcolsep{1.5pt}%
1060 \begin{tabular}[t]{@{}cccc@{}}
1061 pos&lc&uc&glyphs\\\hline
1062 \stepprint
1063 \end{tabular}}}
1064
1065 \iffalse
1066 \begin{center}
1067 \tiny\let\lctablenumbersize\tiny
1068 \mbox{\dolctable{0}{52}\vrule
1069 \dolctable{52}{104}\vrule
1070 \dolctable{104}{156}\vrule
1071 \dolctable{156}{208}\vrule
1072 \dolctable{208}{256}}
1073 \end{center}
1074 \fi
1075
1076 \iffalse
1077 \begin{center}\tiny
1078 \mbox{\dolctable{0}{65}\vrule
1079 \dolctable{65}{128}\vrule
1080 \dolctable{128}{193}\vrule
1081 \dolctable{193}{256}}
1082 \end{center}
1083 \fi
1084
1085
1086
1087 \section{Encoding specific commands}
1088
1089 An encoding specific command is one that generates a glyph (or
1090 glyphs), to produce a graphic effect that may be implemented
1091 differently in different encodings.  The encoding specific command
1092 automatically changes its implementation when the encoding changes in
1093 the course of the document.  Encoding specific commands figure in
1094 \LaTeX's internal character representation (\textsc{licr}) and are also
1095 discussed in \cite[sec.~7.11.2, p.~442]{A-W:MG2004}.
1096
1097 The following table only covers the encoding specific commands from
1098 the \Enc{OT1} and \Enc{T1} encodings. Other encodings may specify
1099 additional encoding specific commands.  In the table, the first 16
1100 commands are `accent-like' and need as an argument the character to be
1101 accented.  For example, |\v{c}| is the \textsc{licr} for `\v{c}'.
1102
1103 \begin{tabbing}
1104 \ttverb\textvisiblespace\quad\=bbbbbbbbbbbbbb\=b'b'\=ccccccccccc\kill
1105 \ttverb\`{}               \>OT1,T1\>   \a`{}\> (grave)      \\
1106 \ttverb\'{}               \>OT1,T1\>   \a'{}\> (acute)      \\
1107 \ttverb\^{}               \>OT1,T1\>   \^{}\>  (circumflex) \\
1108 \ttverb\~{}               \>OT1,T1\>   \~{}\>  (tilde)      \\
1109 \ttverb\"{}               \>OT1,T1\>   \"{}\>  (umlaut)     \\
1110 \ttverb\H{}               \>OT1,T1\>   \H{}\>  (Hungarian umlaut) \\
1111 \ttverb\r{}               \>OT1,T1\>   \r{}\>  (ring)       \\
1112 \ttverb\v{}               \>OT1,T1\>   \v{}\>  (ha\v{c}ek)  \\
1113 \ttverb\u{}               \>OT1,T1\>   \u{}\>  (breve)      \\
1114 \ttverb\t{}               \>OT1,T1\>   \t{}\>  (tie)        \\
1115 \ttverb\={}               \>OT1,T1\>   \a={}\> (macron)     \\
1116 \ttverb\.{}               \>OT1,T1\>   \.{}\>  (dot)        \\
1117 \ttverb\b{}               \>OT1,T1\>   \b{}\>  (underbar)   \\
1118 \ttverb\c{}               \>OT1,T1\>   \c{}\>  (cedilla)    \\
1119 \ttverb\d{}               \>OT1,T1\>   \d{}\>  (dot under)  \\
1120 \ttverb\k{}               \>T1    \>   \k{}\>  (ogonek)     \\
1121 % \ttverb\AA              \>OT1,T1\>   \AA \>               \\ % no longer
1122 \ttverb\AE                \>OT1,T1\>   \AE \>               \\
1123 \ttverb\DH                \>T1    \>   \DH \>               \\
1124 \ttverb\DJ                \>T1    \>   \DJ \>               \\
1125 \ttverb\L                 \>OT1,T1\>   \L  \>               \\
1126 \ttverb\NG                \>T1    \>   \NG \>               \\
1127 \ttverb\OE                \>OT1,T1\>   \OE \>               \\
1128 \ttverb\O                 \>OT1,T1\>   \O  \>               \\
1129 \ttverb\SS                \>OT1,T1\>   \SS \>               \\
1130 \ttverb\TH                \>T1    \>   \TH \>               \\
1131 % \ttverb\aa              \>OT1,T1\>   \aa \>               \\ no-longer
1132 \ttverb\ae                \>OT1,T1\>   \ae \>               \\
1133 \ttverb\dh                \>T1    \>   \dh \>               \\
1134 \ttverb\dj                \>T1    \>   \dj \>               \\
1135 \ttverb\guillemotleft     \>T1    \>   \guillemotleft  \> (guillemet) \\
1136 \ttverb\guillemotright    \>T1    \>   \guillemotright \> (guillemet) \\
1137 \ttverb\guilsinglleft     \>T1    \>   \guilsinglleft  \> (guillemet) \\
1138 \ttverb\guilsinglright    \>T1    \>   \guilsinglright \> (guillemet) \\
1139 \ttverb\i                 \>OT1,T1\>   \i  \>               \\
1140 \ttverb\j                 \>OT1,T1\>   \j  \>               \\
1141 \ttverb\l                 \>OT1,T1\>   \l  \>               \\
1142 \ttverb\ng                \>T1    \>   \ng \>               \\
1143 \ttverb\oe                \>OT1,T1\>   \oe \>               \\
1144 \ttverb\o                 \>OT1,T1\>   \o  \>               \\
1145 \ttverb\quotedblbase      \>T1    \>   \quotedblbase   \>   \\
1146 \ttverb\quotesinglbase    \>T1    \>   \quotesinglbase \>   \\
1147 \ttverb\ss                \>OT1,T1\>   \ss \>               \\
1148 \ttverb\textasciicircum   \>OT1,T1\>   \textasciicircum \>  \\
1149 \ttverb\textasciitilde    \>OT1,T1\>   \textasciitilde  \>  \\
1150 \ttverb\textbackslash     \>OT1,T1\>   \textbackslash   \>  \\
1151 \ttverb\textbar           \>OT1,T1\>   \textbar         \>  \\
1152 \ttverb\textbraceleft     \>OT1,T1\>   \textbraceleft   \>  \\
1153 \ttverb\textbraceright    \>OT1,T1\>   \textbraceright  \>  \\
1154 \ttverb\textcompwordmark  \>OT1,T1\>   \textcompwordmark\> (invisible) \\
1155 \ttverb\textdollar        \>OT1,T1\>   \textdollar      \>  \\
1156 \ttverb\textemdash        \>OT1,T1\>   \textemdash      \>  \\
1157 \ttverb\textendash        \>OT1,T1\>   \textendash      \>  \\
1158 \ttverb\textexclamdown    \>OT1,T1\>   \textexclamdown  \>  \\
1159 \ttverb\textgreater       \>OT1,T1\>   \textgreater     \>  \\
1160 \ttverb\textless          \>OT1,T1\>   \textless        \>  \\
1161 \ttverb\textquestiondown  \>OT1,T1\>   \textquestiondown\>  \\
1162 \ttverb\textquotedbl      \>T1    \>   \textquotedbl    \>  \\
1163 \ttverb\textquotedblleft  \>OT1,T1\>   \textquotedblleft\>  \\
1164 \ttverb\textquotedblright \>OT1,T1\>   \textquotedblright\> \\
1165 \ttverb\textquoteleft     \>OT1,T1\>   \textquoteleft   \>  \\
1166 \ttverb\textquoteright    \>OT1,T1\>   \textquoteright  \>  \\
1167 \ttverb\textregistered    \>OT1,T1\>   \textregistered  \>  \\
1168 \ttverb\textsection       \>OT1,T1\>   \textsection     \>  \\
1169 \ttverb\textsterling      \>OT1,T1\>   \textsterling    \>  \\
1170 \ttverb\texttrademark     \>OT1,T1\>   \texttrademark   \>  \\
1171 \ttverb\textunderscore    \>OT1,T1\>   \textunderscore  \>  \\
1172 \ttverb\textvisiblespace  \>OT1,T1\>   \textvisiblespace\>  \\
1173 \ttverb\th                \>T1    \>   \th              \>
1174 \end{tabbing}
1175
1176 \section{Encodings for Unicode based \TeX\ systems}
1177 \label{sec:unicode}
1178
1179 The preceding text has assumed a classic \TeX\ system that is
1180 restricted to the use of fonts with at most 256 characters. In order
1181 to accommodate all the characters needed for different languages and
1182 mathematics it is necessary to have multiple encodings as described
1183 above, and \LaTeX\ needs to be aware of the encoding used for each
1184 font.
1185
1186 Unicode aims to provide a single encoding that removes most of the
1187 need to switch encodings, apart from very specialist use for non-standard characters. Rather than assign codes in the range 0--255 (hex
1188 FF), Unicode codes are in the range 0--1,114,111 (hex 10FFFF), although
1189 not all slots are available for distinct characters for technical
1190 reasons. Unicode offers the possibility to use a single input encoding
1191 (usually UTF-8) for all documents and to use essentially the same
1192 Unicode encoding for all fonts, so removing the need to switch
1193 encodings in different contexts.
1194
1195 Omega was perhaps the first widely used \TeX\ extension that
1196 supported Unicode. Currently the two actively supported systems that are
1197 present in most modern \TeX\ distributions are Xe\TeX\ and Lua\TeX.
1198
1199 When used with these extended \TeX\ engines, \LaTeX's font system can
1200 refer to Unicode fonts (typically OpenType fonts installed system-wide
1201 on your operating system rather than fonts specifically encoded/installed for
1202 \TeX). Currently the usual method of accessing these fonts is through
1203 the contributed \Pkg{fontspec} package. This uses as encoding \Enc{TU}:
1204 ``\TeX{} Unicode'' (historically two experimental encodings \Enc{EU1}
1205 and \Enc{EU2}
1206 were used, depending on the engine, but these are deprecated).
1207 The exact rules for \LaTeX\ encodings
1208 for Unicode engines have not yet been finalised in terms of the (usual)
1209 requirement that each slot should be defined. (This is not realistic for
1210 a Unicode font, as almost all fonts address subsets of the full range.)
1211 It is rare to need to specify the \Enc{TU} encoding in a document as the
1212 \Pkg{fontspec} package sets up the correct encoding when loaded.
1213
1214 The restrictions described in section~\ref{sec:restrictions} do not
1215 apply, or need to be modified, in a Unicode based engine. Clearly the
1216 lowercase table (and hyphenation patterns) cannot be restricted to
1217 the values used for \Enc{T1}, which only refer to the first 256
1218 characters.
1219
1220 When the \LaTeX\ format is made, \LaTeX\ sets up the lowercase table
1221 and classifies characters as letter or non-letter based on \Enc{T1} if
1222 a classic \TeX\ or pdf\TeX\ is being used. If a Unicode based \TeX\ is
1223 detected, the values are instead based on the classification and
1224 lower-case mappings provided by the Unicode Character Database
1225 \cite{ucd}. The \LaTeX{} team have written a generic loader bundle,
1226 \Pkg{unicode-data}, which provides the mechanism to load this information
1227 directly from the Unicode Character Database data files and which is read
1228 when a Unicode-compliant engine is detected during format-building.
1229
1230 Similarly, in the default configuration files used by modern \TeX\
1231 distributions, the hyphenation files for each supported language are
1232 written in UTF-8 encoding, using Unicode code points for all letters.
1233 If a classic \TeX\ system is detected, some additional macros are
1234 loaded to convert these files to 256-character encodings where
1235 possible, assuming the \Enc{T1} lowercase table. For Unicode engines
1236 no conversion takes place. (The hyphenation patterns for a small number of
1237 languages require that some punctuation characters have non-zero
1238 \verb=\lccode= values. These are set during pattern reading, and may at some
1239 stage in the future use the e-\TeX{} \verb=\savinghyphcodes= mechanism to
1240 avoid any need to manipulate \verb=\lccode= in the document.)
1241
1242
1243
1244
1245 \begin{thebibliography}{99}
1246 \addcontentsline{toc}{section}{\numberline{\relax}\refname}
1247
1248
1249 \bibitem{Adobe:PDF-1.6} \emph{\textsc{PDF} reference}:
1250     Adobe portable document format version~1.6.  Adobe Systems
1251     Incorporated, 2005. % why \textsuperscript{3}?
1252   \url{http://partners.adobe.com/public/developer/en/pdf/PDFReference16.pdf}.
1253
1254 \bibitem{Beeton:TB6-3-124} Barbara Beeton:
1255   \emph{Mathematical symbols and cyrillic fonts ready for
1256       distribution}.  In: TUGBoat, 6\#3, 1985.
1257   \url{http://tug.org/TUGboat/Articles/tb06-3/tb13beetcyr.pdf}.
1258
1259 \bibitem{beeton} Barbara Beeton: \emph{Unicode
1260       and math, a combination whose time has come -- Finally!}.  In:
1261   TUGBoat, 21\#3, 2000.
1262   \url{http://www.tug.org/TUGboat/Articles/tb21-3/tb68beet.pdf}.
1263
1264
1265 \bibitem{Berdnikov:eurotex-98} A.\@ Berdnikov, O.\@
1266   Lapko, M.\@ Kolodin, A.\@ Janishevsky and
1267   A.\@ Burykin: \emph{The Encoding Paradigm in
1268       \LaTeXe{} and the Projected X2 Encoding for Cyrillic Texts}.
1269   Euro\TeX~98.
1270   \url{http://www.gutenberg.eu.org/pub/GUTenberg/publicationsPDF/28-29-berdnikova.pdf}.
1271
1272 \bibitem{CJK} \emph{The \Pkg{CJK} package}:
1273   \url{http://cjk.ffii.org}.
1274
1275 \bibitem{clasen} Matthias Clasen: \emph{A new
1276       implementation of \LaTeX{} math}, 1997-98.
1277   \url{http://www.tug.org/twg/mfg/papers/current/newmath.ps.gz}.
1278
1279 \bibitem{clasen-vieth} Matthias Clasen and Ulrik
1280   Vieth: \emph{Towards a new Math Font Encoding
1281       for (La)\TeX}.  March 1998,
1282   \url{http://www.tug.org/twg/mfg/papers/current/mfg-euro-all.ps.gz}.
1283
1284 \bibitem{CorkGW:91}
1285 Dean Guenther and Janene Winter.
1286 \newblock An international phonetic alphabet.
1287 \newblock In Guenther \cite{proc:MGu91}, pages 149--156.
1288 \newblock Published as {TUG}boat 12\#1.
1289
1290 \bibitem{proc:MGu91}
1291 Mary Guenther, editor.
1292 \newblock {\em {\TeX} 90 Conference Proceedings}, March 1991.
1293 \newblock Published as {TUG}boat 12\#1.
1294
1295 \bibitem{tub:MFe90}
1296 Michael~J. Ferguson.
1297 \newblock Report on multilingual activities.
1298 \newblock {\em {TUG}boat}, 11(4):514--516, 1990.
1299
1300 \bibitem{fontinst} \emph{The \Pkg{fontinst} package}:
1301   \textlangle CTAN\textrangle\url{/fonts/utilities/fontinst}.
1302
1303 \bibitem{Rei:TB17-2-102} Fukui Rei:
1304   \emph{\textsl{TIPA}: A system for processing phonetic
1305       symbols in \LaTeX}.  In: TUGBoat, 17\#2, 1996.
1306   \url{http://www.tug.org/TUGboat/Articles/tb17-2/tb51rei.pdf}.
1307
1308 \bibitem{hyperref} \emph{The \Pkg{hyperref} package}:
1309   \url{http://www.tug.org/applications/hyperref}.
1310
1311 \bibitem{tub:JKn93}
1312 J\"org Knappen.
1313 \newblock Fonts for Africa: The fc Fonts.
1314 \newblock {\em {TUG}boat}, 14(2):104, 1993.
1315
1316 \bibitem{Knappen:TB17-2-96} J\"org Knappen:
1317   \emph{The \Pkg{dc} fonts~1.3: Move towards stability
1318       and completeness}.  In: TUGBoat 17\#2, 1996.
1319   \url{http://www.tug.org/TUGboat/Articles/tb17-2/tb51knap.pdf}.
1320
1321 \bibitem{A-W:DKn86}
1322 Donald~E. Knuth.
1323 \newblock {\em The {\TeX}book}.
1324 \newblock Volume~A of {\em Computers \& {T}ypesetting\/},
1325   May 1989.
1326 \newblock Eighth printing.
1327
1328 \bibitem{vnr} \emph{The \Pkg{vnr} font family}, developed by
1329    the author of pdf\TeX, {H\`an Th\^e\protect\llap{\raise 0.5ex\hbox{\'{\relax}}} Th\`anh}.
1330    \url{http://vntex.org/download/vntex}.
1331
1332  \bibitem{ipa} Home page of the International Phonetic Association.
1333    \url{http://www.arts.gla.ac.uk/IPA/ipa.html}.
1334
1335 \bibitem{A-W:LLa94}
1336 Leslie Lamport.
1337 \newblock {\em {\LaTeX:} A Document Preparation System}.
1338 \newblock Addison-Wesley, Reading, Massachusetts, second edition, 1994.
1339
1340 \bibitem{LH-Fonts} \emph{The \Pkg{lh}-Fonts for Cyrillic}:
1341   \textlangle CTAN\textrangle\url{/fonts/cyrillic/lh}.
1342
1343 \bibitem{A-W:MG2004}
1344 Frank Mittelbach and Michel Goossens.
1345 \newblock {\em The {\LaTeX} Companion second edition}.
1346 \newblock With Johannes Braams, David Carlisle, and Chris Rowley.
1347 \newblock Addison-Wesley, Reading, Massachusetts, 2004.
1348
1349 \bibitem{Unicode} \emph{The Unicode Standard}.
1350   \url{http://unicode.org}.
1351
1352 \bibitem{ucd} \emph{The Unicode Character Database}.
1353   \url{http://unicode.org/ucd}.
1354
1355 \bibitem{ziegler} Justin Ziegler, \emph{Technical
1356     Report on Math Font Encodings}, June 1994,
1357   \url{http://www.tug.org/twg/mfg/papers/ltx3pub/l3d007.ps.gz}.
1358
1359 \end{thebibliography}
1360
1361 \clearpage\appendix
1362 \begin{center}
1363   \Large\bfseries Appendices
1364 \end{center}
1365
1366 \section{Example code tables}
1367
1368 This appendix contains a table of each font mentioned as an ``example''
1369 font above, providing that the font was available when the document
1370 was processed with \LaTeX{}.  (\LaTeX{} generates a warning message
1371 for each font it fails to find.)
1372
1373 \subsection{Text encodings}
1374
1375 \ftable{cmr10}{OT1}
1376
1377 \ftable{wnr10}{OT2}
1378
1379 \ftable{wsuipa10}{OT3}
1380
1381 \ftable{plr10}{OT4}
1382
1383 %\ftable{artmr10}{OT6}
1384
1385 \ftable{ecrm1000}{T1}
1386
1387 \ftable{larm1000}{T2A}
1388
1389 \ftable{lbrm1000}{T2B}
1390
1391 \ftable{lcrm1000}{T2C}
1392
1393 \ftable{tipa10}{T3}
1394
1395 \ftable{fcr10}{T4}
1396
1397 \ftable{vnr10}{T5}
1398
1399
1400 \subsection{Text symbol encodings}
1401
1402 The full table for \Enc{TS1} as provided by the European Computer Modern family:
1403 \ftable{tcrm1000}{TS1}
1404
1405 \pagebreak
1406
1407 In contrast typical PostScript fonts usually have incomplete implementations
1408 of \Enc{TS1} sometimes missing more than half of the glyphs:
1409
1410 \ftable{ptmr8c}{TS1}
1411
1412 \ftable{tipx10}{TS3}
1413
1414
1415
1416 \subsection{Extended text encodings}
1417
1418 \ftable{rxrm1000}{X2}
1419
1420
1421 \subsection{Mathematical encodings}
1422
1423 \ftable{cmmi10}{OML}
1424
1425 \ftable{cmsy10}{OMS}
1426
1427 \ftable{cmex10}{OMX}
1428
1429
1430 \subsection{Other encodings}
1431
1432 \ftable{ptmr8y}{LY1}
1433
1434 %%\ftable{????}{LV1}
1435
1436 \ftable{grmn1000}{LGR}
1437
1438 \ftable{wasy10}{U}
1439 \ftable{logo10}{U}
1440
1441 \clearpage
1442 \section{Uppercase and lowercase tables}
1443 \label{sec:uclc-tab}
1444
1445 The following two sets of tables list the \verb"\uppercase" and
1446 \verb"\lowercase" values for each position in the \LaTeX{} standard
1447 256-character tables.
1448
1449 Each row of each table lists:
1450 \begin{quote}
1451   \begin{tabular}{lp{0.7\textwidth}}
1452     pos & The position in the table (0--255) \\
1453     lc  & The value in the \verb"\lowercase" table at the position \\
1454         & (note that value 0 here means that \verb"\lowercase" is
1455           ineffective for this character, and hyphenation does not apply
1456           to it) \\
1457     uc  & The value in the \verb"\uppercase" table at the position \\
1458         & (note that value 0 here means that \verb"\uppercase" is
1459           ineffective for this character) \\
1460     glyphs & The glyphs specified for the T1 encoding for this
1461              position, laid out as \meta{glyph}\textbf{(}\meta{lowercase
1462              glyph}\textbf{/}\meta{uppercase glyph}\textbf{)}
1463   \end{tabular}
1464 \end{quote}
1465
1466 \begin{center}
1467   \let\lctablenumbersize\footnotesize
1468   \makebox[\textwidth]{\hss
1469     \dolctable{0}{32}\quad\dolctable{32}{64}\quad
1470     \dolctable{64}{96}\quad\dolctable{96}{128}%
1471   \hss}
1472
1473   \makebox[\textwidth]{\hss
1474     \dolctable{128}{160}\quad\dolctable{160}{192}\quad
1475     \dolctable{192}{224}\quad\dolctable{224}{256}%
1476   \hss}
1477 \end{center}
1478 \end{document}
1479
1480
1481 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%