latex2e-20150101/base/utf8ienc.dtx

   1 % \iffalse meta-comment
   2 %
   3 % Copyright 1993-2015
   4 % The LaTeX3 Project and any individual authors listed elsewhere
   5 % in this file.
   6 %
   7 % This file is part of the LaTeX base system.
   8 % -------------------------------------------
   9 %
  10 % It may be distributed and/or modified under the
  11 % conditions of the LaTeX Project Public License, either version 1.3c
  12 % of this license or (at your option) any later version.
  13 % The latest version of this license is in
  14 %    http://www.latex-project.org/lppl.txt
  15 % and version 1.3c or later is part of all distributions of LaTeX
  16 % version 2005/12/01 or later.
  17 %
  18 % This file has the LPPL maintenance status "maintained".
  19 %
  20 % The list of all files belonging to the LaTeX base distribution is
  21 % given in the file `manifest.txt'. See also `legal.txt' for additional
  22 % information.
  23 %
  24 % The list of derived (unpacked) files belonging to the distribution
  25 % and covered by LPPL is defined by the unpacking scripts (with
  26 % extension .ins) which are part of the distribution.
  27 %
  28 % \fi
  29 %
  30 % \iffalse
  31 %<*driver>
  32 \documentclass{ltxdoc}
  33 \GetFileInfo{utf8.def} % presumably supplies \fileversion and \filedate (used in \date below) -- confirm against the doc package
  34 \title{Providing some UTF-8 support via \texttt{inputenc}}
  35 \date{\fileversion\space\filedate{} printed \today}
  36  \author{%
  37   Frank Mittelbach \and Chris Rowley\thanks{Borrowing heavily from
  38       code by David Carlisle and tables by Sebastian Rahtz; some table
  39       and code cleanup by Javier Bezos}}
  40 \usepackage[utf8]{inputenc}
  41 \begin{document}
  42  \MaintainedByLaTeXTeam{latex}
  43  \maketitle
  44  \tableofcontents
  45  \DocInput{utf8ienc.dtx} % typeset this very file as its own documentation
  46 \end{document}
  47 %</driver>
  48 % \fi
  49 %
  50 % \CheckSum{1232}
  51 %
  52 % \newpage
  53 %
  54 % \section{Introduction}
  55 %
  56 % [The whole section is rather unfinished \ldots\ just like the code, sorry!]
  57 %
  58 % \subsection{Background and general stuff}
  59 %
  60 % For many reasons what this package provides is a long way from any
  61 % type of `Unicode compliance'.
  62 %
  63 % In stark contrast to 8-bit character sets, with 16 or more bits it can
  64 % easily be very inefficient to support the full range.\footnote{In
  65 % fact, \LaTeX's current 8-bit support does not go so far as to make
  66 % all 8-bit characters into valid input.}  Moreover, useful support of
  67 % character input by a typesetting system overwhelmingly means finding
  68 % an acceptable visual representation of a sequence of characters and
  69 % this, for \LaTeX{}, means having available a suitably encoded 8-bit
  70 % font.
  71 %
  72 % Unfortunately it is not possible to predict exactly what valid UTF-8
  73 % octet sequences will appear in a particular file so it is best to
  74 % make all the unsupported but valid sequences produce a reasonably
  75 % clear and noticeable error message.
  76 %
  77 % There are two directions from which to approach the question of what
  78 % to load.  One is to specify the ranges of Unicode characters that will
  79 % result in some sensible typesetting; this requires the provider to
  80 % ensure that suitable fonts are loaded and that these input characters
  81 % generate the correct typesetting via the encodings of those fonts.  The
  82 % other is to inspect the font encodings to be used and use these to
  83 % define which input Unicode characters should be supported.
  84 %
  85 % For Western European languages, at least, going in either direction
  86 % leads to many straightforward decisions and a few that are more
  87 % subjective.  In both cases some of the specifications are \TeX{}
  88 % specific whilst most are independent of the particular typesetting
  89 % software in use.
  90 %
  91 % As we have argued elsewhere, \LaTeX{} needs to refer to characters via
  92 % `seven-bit-text' names and, so far, these have been chosen by
  93 % reference to historical sources such as Plain \TeX{} or Adobe encoding
  94 % descriptions.  It is unclear whether this ad hoc naming structure should
  95 % simply be extended or whether it would be useful to
  96 % supplement it with standardised internal Unicode character names such as
  97 % one or more of the following:\footnote{Burkhard und Holger Mittelbach
  98 %   spielen mit mir!  Sie haben etwas hier geschrieben.}
  99 %
 100 % \begin{verbatim}
 101 %   \ltxutwochar <4 hex digits>
 102 %
 103 %   \ltxuchar {<hex digits>}
 104 %     B H U R R R
 105 %
 106 %   \ltxueightchartwo   <2 utf8 octets as 8-bit char tokens>
 107 %   \ltxueightcharthree <3 utf8 octets ...>
 108 %   \ltxueightcharfour  <4 utf8 octets ...>
 109 % \end{verbatim}
 110 %
 111 %
 112 % \subsection{More specific stuff}
 113 %
 114 % In addition to setting up the mechanism for reading UTF-8 characters
 115 % and specifying the \LaTeX-level support available, this package
 116 % contains support for some default historically expected \TeX-related
 117 % characters and some example `Unicode definition files' for standard
 118 % font encodings.
 119 %
 120 %
 121 % \subsection{Notes}
 122 %
 123 % This package does not support Unicode combining characters as \TeX{}
 124 % is not really equipped to make this possible.
 125 %
 126 %  No attempt is made to be useful beyond Latin, and maybe Cyrillic,
 127 %  for European languages (as of now).
 128 %
 129 %
 130 % \subsection{Basic operation of the code}
 131 %
 132 % The \texttt{inputenc} package makes the upper 8-bit characters active and
 133 %    assigns to all of them an error message. It then waits for the
 134 %    input encoding files to change this set-up.  Similarly, whenever
 135 %    |\inputencoding| is encountered in a document, first the upper
 136 %    8-bit characters are set back to produce an error and then the
 137 %    definitions for the new input encoding are loaded, changing some of the
 138 %    previous settings.
 139 %
 140 %    The 8-bit input encodings currently supported by \texttt{inputenc}
 141 %    all use declarations such as |\DeclareInputText| and the like to map an
 142 %    8-bit number to some \LaTeX{} internal form, e.g.~to |\"a|.
 143 %
 144 %    The situation when supporting UTF-8 as the input encoding is
 145 %    different, however. Here we only have to set up the actions of
 146 %    those 8-bit numbers that can be the first octet in a UTF-8
 147 %    representation of a Unicode character.  But we cannot simply set
 148 %    this to some internal \LaTeX{} form since the Unicode character
 149 %    consists of more than one octet; instead we have to define this
 150 %    starting octet to parse the right number of further octets that
 151 %    together form the UTF-8 representation of some Unicode character.
 152 %
 153 %    Therefore when switching to \texttt{utf8} within the
 154 %    \texttt{inputenc} framework the characters with numbers (hex)
 155 %    from \texttt{"C2} to \texttt{"DF} are defined to parse for a
 156 %    second octet following, the characters from \texttt{"E0} to
 157 %    \texttt{"EF} are defined to parse for two more octets and finally
 158 %    the characters from \texttt{"F0} to \texttt{"F3} are defined to
 159 %    parse for three additional octets.  These additional octets are
 160 %    always in the range \texttt{"80} to \texttt{"BF}.
 161 %
 162 %    Thus, when such a character is encountered in the document (so
 163 %    long as expansion is not prohibited) a defined number of
 164 %    additional octets (8-bit characters) are read and from them a
 165 %    unique control sequence name is immediately constructed.
 166 %
 167 %    This control sequence is either defined (good) or undefined
 168 %    (likely); in the latter case the user gets an error message
 169 %    saying that this UTF-8 sequence (or, better, Unicode character)
 170 %    is not supported.
 171 %
 172 %    If the control sequence is set up to do something useful then it will
 173 %    expand to a \LaTeX{} internal form: e.g.~for the utf8 sequence of
 174 %    two octets \texttt{"C3 "A4} we get |\"a| as the
 175 %    internal form which then, depending on the font encoding,
 176 %    eventually resolves to the single glyph `latin-a-umlaut' or to
 177 %    the composite glyph `latin-a with an umlaut accent'.
 178 %
 179 %    These mappings from (UTF-8 encoded) Unicode characters to \LaTeX{}
 180 %    internal forms are made indirectly.  The code below provides a
 181 %    declaration |\DeclareUnicodeCharacter| which maps Unicode numbers
 182 %    (as hexadecimal) to \LaTeX{} internal forms.
 183 %
 184 %    This mapping needs to be set up only once so it is done when
 185 %    \texttt{utf8.def} is read in the preamble, by looking at the font
 186 %    encodings declared so far (later ones are handled as they are declared)
 187 %    and providing mappings related to those font encodings whenever these
 188 %    are available. Thus at most only those Unicode characters that can be
 189 %    represented by the glyphs available in these encodings will be defined.
 190 %
 191 %    Technically this is done by loading one file per encoding,
 192 %    if available, that is supposed to provide the necessary mapping
 193 %    information.
 194 %
 195 %
 196 % \StopEventually{}
 197 %
 198 %
 199 %
 200 %
 201 % \section{Coding}
 202 %
 203 % \subsection{Housekeeping}
 204 %
 205 %    The usual introductory bits and pieces:
 206 %
 207 %    \begin{macrocode}
 208 %<utf8>\ProvidesFile{utf8.def}
 209 %<test>\ProvidesFile{utf8-test.tex}
 210 %<+lcy> \ProvidesFile{lcyenc.dfu}
 211 %<+ly1> \ProvidesFile{ly1enc.dfu}
 212 %<+oms> \ProvidesFile{omsenc.dfu}
 213 %<+ot1> \ProvidesFile{ot1enc.dfu}
 214 %<+ot2> \ProvidesFile{ot2enc.dfu}
 215 %<+t1>  \ProvidesFile{t1enc.dfu}
 216 %<+t2a> \ProvidesFile{t2aenc.dfu}
 217 %<+t2b> \ProvidesFile{t2benc.dfu}
 218 %<+t2c> \ProvidesFile{t2cenc.dfu}
 219 %<+ts1> \ProvidesFile{ts1enc.dfu}
 220 %<+x2>  \ProvidesFile{x2enc.dfu}
 221 %<+all> \ProvidesFile{utf8enc.dfu}
 222    [2015/06/27 v1.1n UTF-8 support for inputenc]
 223 %    \end{macrocode}
 224 %
 225 %    \begin{macrocode}
 226 %<*utf8>
 227 \makeatletter
 228 %    \end{macrocode}
 229 %    We restore the |\catcode| of space (which is set to ignore in
 230 %    \texttt{inputenc}) while reading \texttt{.def} files. Otherwise
 231 %    we would need to explicitly use |\space| all over the place in
 232 %    error and log messages.
 233 % \changes{v1.1d}{2004/05/08}{Explicitly set catcode of space}
 234 %    \begin{macrocode}
 235 \catcode`\ \saved@space@catcode
 236 %    \end{macrocode}
 237 %
 238 %
 239 %
 240 % \subsection{Parsing UTF-8 input}
 241 %
 242 % \begin{macro}{\UTFviii@two@octets}
 243 % \begin{macro}{\UTFviii@three@octets}
 244 % \begin{macro}{\UTFviii@four@octets}
 245 %    A UTF-8 char (that is not actually a 7-bit char, i.e.~a single
 246 %    octet) is parsed as follows: each starting octet is an active
 247 %    \TeX{} character token; each of these is defined below to be a
 248 %    macro with one to three arguments nominally (depending on the
 249 %    starting octet). It calls one of |\UTFviii@two@octets|,
 250 %    |\UTFviii@three@octets|, or |\UTFviii@four@octets| which then
 251 %    actually picks up the argument(s).
 252 %
 253 %    From the arguments a control sequence with a name of the form
 254 %    \verb=u8:#1#2...= is constructed where the |#i| ($i>1$) are the
 255 %    arguments and |#1| is the starting octet (as a \TeX{} character
 256 %    token).  Since some or even all of these characters are active
 257 %    (when inputenc is loaded) we need to use |\string| when building
 258 %    the csname.
 259 %
 260 %    The csname thus constructed can of course be undefined but to
 261 %    avoid producing an unhelpful low-level undefined command error we
 262 %    pass it to |\UTFviii@defined| which is responsible for producing
 263 %    a more sensible error message (not yet done!!).  If, however, it is
 264 %    defined we simply execute the thing (which should then expand to
 265 %    an encoding specific internal \LaTeX{} form).
 266 %    \begin{macrocode}
 267 \def\UTFviii@two@octets#1#2{% #1: starting octet, #2: second octet (\string in case it is active)
 268   \expandafter\UTFviii@defined\csname u8:#1\string#2\endcsname}
 269 %    \end{macrocode}
 270 % \end{macro}
 271 %
 272 %    \begin{macrocode}
 273 \def\UTFviii@three@octets#1#2#3{% #1: starting octet, #2 #3: following octets (\string in case they are active)
 274   \expandafter\UTFviii@defined\csname u8:#1\string#2\string#3\endcsname}
 275 %    \end{macrocode}
 276 % \end{macro}
 277 %
 278 %    \begin{macrocode}
 279 \def\UTFviii@four@octets#1#2#3#4{% #1: starting octet, #2 #3 #4: following octets (\string in case they are active)
 280   \expandafter\UTFviii@defined\csname u8:#1\string#2\string#3\string#4\endcsname}
 281 %    \end{macrocode}
 282 % \end{macro}
 283 %
 284 % \begin{macro}{\UTFviii@defined}
 285 %    This tests whether its argument is different from |\relax|: if it
 286 %    is not, an \texttt{inputenc} package error is raised; otherwise it gets
 287 %    the |\fi| out of the way (in case the command has arguments) and
 288 %    executes it.
 289 %    \begin{macrocode}
 290 \def\UTFviii@defined#1{% #1: the single u8:... control sequence built by the octet parsers
 291   \ifx#1\relax % equal to \relax: no mapping has been declared for this sequence
 292 %    \end{macrocode}
 293 %    The endline character has a special definition within the
 294 %    inputenc package (it is gobbling spaces). For this reason we
 295 %    can't produce multiline strings without some precaution.
 296 % \changes{v1.1b}{2004/02/09}{No newlines allowed in error messages}
 297 % \changes{v1.1g}{2005/09/27}{Real spaces do not show up so use \cs{space}}
 298 %    \begin{macrocode}
 299       \PackageError{inputenc}{Unicode\space char\space \string#1\space
 300                               not\space set\space up\space
 301                               for\space use\space with\space LaTeX}\@eha
 302   \else\expandafter % remove the \fi first so that #1 can pick up arguments
 303     #1% execute the mapping
 304   \fi
 305 }
 306 %    \end{macrocode}
 307 % \end{macro}
 308 %
 309 % \begin{macro}{\UTFviii@loop}
 310 %    This wonderful bit of code from Dr Carlisle defines the starting
 311 %    octets to call |\UTFviii@two@octets| etc as appropriate. The starting
 312 %    octet itself is passed directly as the first argument, the others
 313 %    are picked up later en route.
 314 %
 315 %    The |\UTFviii@loop| loops through the numbers starting at
 316 %    |\count@| and ending at |\@tempcnta|${} - 1$, each time executing
 317 %    the code in |\UTFviii@tmp|.
 318 %
 319 %    All this is done in a group so that temporary catcode changes
 320 %    etc.~vanish after everything is set up.
 321 %
 322 %    It may be a good idea to add code to deal with `illegal utf8 octets':
 323 %    at present these will be handled by whatever code was in use for 8-bit
 324 %    input before this code is executed.
 325 %
 326 %    \begin{macrocode}
 327 \begingroup
 328 \catcode`\~13
 329 \catcode`\"12
 330 %    \end{macrocode}
 331 %    |\uppercase| maps |~| to the active character numbered |\count@|.
 332 %    \begin{macrocode}
 333 \def\UTFviii@loop{% run \UTFviii@tmp for each value from \count@ up to \@tempcnta-1
 334   \uccode`\~\count@
 335   \uppercase\expandafter{\UTFviii@tmp}%
 336   \advance\count@\@ne
 337   \ifnum\count@<\@tempcnta
 338   \expandafter\UTFviii@loop
 339   \fi}
 340 %    \end{macrocode}
 341 %
 342 %    Setting up 2-byte UTF-8 (starting octets \texttt{"C2} to \texttt{"DF}):
 343 %    \begin{macrocode}
 344     \count@"C2
 345     \@tempcnta"E0
 346     \def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@two@octets\string~}}
 347 \UTFviii@loop
 348 %    \end{macrocode}
 349 %    Setting up 3-byte UTF-8 (starting octets \texttt{"E0} to \texttt{"EF}):
 350 %    \begin{macrocode}
 351     \count@"E0
 352     \@tempcnta"F0
 353     \def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@three@octets\string~}}
 354 \UTFviii@loop
 355 %    \end{macrocode}
 356 %
 357 %    Setting up 4-byte UTF-8 (starting octets \texttt{"F0} to \texttt{"F3}):
 358 %    \begin{macrocode}
 359     \count@"F0
 360     \@tempcnta"F4
 361     \def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@four@octets\string~}}
 362 \UTFviii@loop
 363 \endgroup
 364 %    \end{macrocode}
 365 % \end{macro}
 366 %
 367 %    For this case we must disable the warning generated by
 368 %    \texttt{inputenc} if it doesn't see any new |\DeclareInputText|
 369 %    commands.
 370 %    \begin{macrocode}
 371 \@inpenc@test
 372 %    \end{macrocode}
 373 %
 374 %
 375 %    If this file (\texttt{utf8.def}) is not being read while setting
 376 %    up \texttt{inputenc}, i.e.~in the preamble, but when
 377 %    |\inputencoding| is called somewhere within the document, we do not
 378 %    need to input the specific Unicode mappings again. We therefore
 379 %    stop reading the file at this point.
 380 %    \begin{macrocode}
 381 \ifx\@begindocumenthook\@undefined
 382   \makeatother
 383 %    \end{macrocode}
 384 %    The |\fi| must be on the same line as |\endinput| or else it will
 385 %    never be seen!
 386 %    \begin{macrocode}
 387   \endinput \fi
 388 %    \end{macrocode}
 389 %
 390 %
 391 % \subsection{Mapping Unicode codes to \LaTeX{} internal forms}
 392 %
 393 %
 394 % \begin{macro}{\DeclareUnicodeCharacter}
 395 %    The |\DeclareUnicodeCharacter| declaration defines a mapping from
 396 %    a Unicode character code point to a \LaTeX{} internal form. The first
 397 %    argument is the Unicode number as hexadecimal digits and the second is
 398 %    the actual \LaTeX{} internal form.
 399 %
 400 %    We start by making sure that some characters have the right
 401 %    |\catcode| when they are used in the definitions below.
 402 %    \begin{macrocode}
 403 \begingroup
 404 \catcode`\"=12
 405 \catcode`\<=12
 406 \catcode`\.=12
 407 \catcode`\,=12
 408 \catcode`\;=12
 409 \catcode`\!=12
 410 \catcode`\~=13
 411 %    \end{macrocode}
 412 %
 413 %    \begin{macrocode}
 414 \gdef\DeclareUnicodeCharacter#1#2{% #1: Unicode number as hex digits, #2: LaTeX internal form
 415   \count@"#1\relax
 416   \wlog{ \space\space defining Unicode char U+#1 (decimal \the\count@)}%
 417   \begingroup
 418 %    \end{macrocode}
 419 %    Next we do the parsing of the number stored in |\count@| and assign the
 420 %    result to |\UTFviii@tmp|. Actually all this could be done in-line,
 421 %    the macro |\parse@XML@charref| is only there to extend this code
 422 %    to parsing Unicode numbers in other contexts one day (perhaps).
 423 %    \begin{macrocode}
 424     \parse@XML@charref
 425 %    \end{macrocode}
 426 %
 427 %    Here is an example of what is happening, for the pair \texttt{"C2 "A3}
 428 %    (which is the utf8 representation for the character \textsterling{}).
 429 %    After |\parse@XML@charref| we have, stored in |\UTFviii@tmp|, a
 430 %    single command with two character tokens as arguments:
 431 %    \begin{quote}
 432 %      [$t_{\rm C2}$ and $t_{\rm A3}$ are the characters corresponding to these
 433 %      two octets]\\
 434 %      |\UTFviii@two@octets| $t_{\rm C2}t_{\rm A3}$
 435 %    \end{quote}
 436 %    what we actually need to produce is a definition of the form
 437 %    \begin{quote}
 438 %    |\def\u8:|$t_{\rm C2}$$t_{\rm A3}$ |{|\textit{\LaTeX{} internal form}|}|\,.
 439 %    \end{quote}
 440 %    So here we temporarily redefine the prefix commands
 441 %    |\UTFviii@two@octets|, etc.~to
 442 %    generate the csname that we wish to define; the |\string|s are
 443 %    added in case these tokens are still active.
 444 %    \begin{macrocode}
 445     \def\UTFviii@two@octets##1##2{\csname u8:##1\string##2\endcsname}%
 446     \def\UTFviii@three@octets##1##2##3{\csname u8:##1%
 447                                      \string##2\string##3\endcsname}%
 448     \def\UTFviii@four@octets##1##2##3##4{\csname u8:##1%
 449                            \string##2\string##3\string##4\endcsname}%
 450 %    \end{macrocode}
 451 %    Now we simply:-) need to use the right number of |\expandafter|s to
 452 %    finally construct the definition: expanding |\UTFviii@tmp| once to get
 453 %    its contents, a second time to replace the prefix command by its
 454 %    |\csname| expansion, and a third time to turn the expansion into
 455 %    a csname after which the |\gdef| finally gets applied.
 456 %    We add an irrelevant |\IeC| and braces around the definition, in
 457 %    order to avoid any space after the command being gobbled up
 458 %    when the text is written out to an auxiliary file (see
 459 %    \texttt{inputenc} for further details).
 460 %    \begin{macrocode}
 461     \expandafter\expandafter\expandafter
 462     \expandafter\expandafter\expandafter
 463     \expandafter
 464      \gdef\UTFviii@tmp{\IeC{#2}}%
 465    \endgroup
 466 }
 467 %    \end{macrocode}
 468 % \end{macro}
 469 %
 470 %
 471 % \begin{macro}{\parse@XML@charref}
 472 %    This macro parses a Unicode number (decimal) and returns its
 473 %    UTF-8 representation as a sequence of non-active \TeX{} character
 474 %    tokens. In the
 475 %    original code it had two arguments delimited by \texttt{;} here,
 476 %    however, we supply the Unicode number implicitly.
 477 %    \begin{macrocode}
 478 \gdef\parse@XML@charref{% input: Unicode number in \count@; output: \UTFviii@tmp (set globally by \parse@UTFviii@b)
 479 %    \end{macrocode}
 480 %    We need to keep a few things local, mainly the |\uccode|'s that
 481 %    are set up below. However, the group originally used here is
 482 %    actually unnecessary since we call this macro only within another
 483 %    group; but it will be important to restore the group if this
 484 %    macro gets used for other purposes.
 485 %    \begin{macrocode}
 486 %  \begingroup
 487 %    \end{macrocode}
 488 %    The original code from David supported the convention that a
 489 %    Unicode slot number could be given either as a decimal or as a
 490 %    hexadecimal (by starting with \texttt{x}).  We do not do this so
 491 %    this code is also removed.  This could be reactivated if one
 492 %    wants to support document commands that accept Unicode numbers
 493 %    (but then the first case needs to be changed from an error
 494 %    message back to something more useful again).
 495 %    \begin{macrocode}
 496 %  \uppercase{\count@\if x\noexpand#1"\else#1\fi#2}\relax
 497 %    \end{macrocode}
 498 %    As |\count@| already contains the right value we make
 499 %    |\parse@XML@charref| work without arguments.
 500 % \changes{v1.1g}{2005/09/27}{Real spaces do not show up so use \cs{space}}
 501 %    \begin{macrocode}
 502   \ifnum\count@<"A0\relax
 503      \PackageError{inputenc}{Cannot\space define\space Unicode\space
 504                              char\space value\space <\space 00A0}\@eha
 505 %    \end{macrocode}
 506 %    Do not ask us to provide an explanation for the code below, it is
 507 %    borrowed straight from \texttt{xmltex} by David and we trust him
 508 %    totally (and we are too lazy to reread the Unicode book to see if
 509 %    this is the correct algorithm).\footnote{We were hoping to also
 510 %    find in his work the \TeX{} code for going the other way: from
 511 %    UTF-8 octets to Unicode slot number, but no luck!}
 512 %    \begin{macrocode}
 513   \else\ifnum\count@<"800\relax
 514      \parse@UTFviii@a,% "," will uppercase to the last octet (128 + low six bits)
 515      \parse@UTFviii@b C\UTFviii@two@octets.,% "." will uppercase to the starting octet ("C0 + remaining bits)
 516   \else\ifnum\count@<"10000\relax
 517      \parse@UTFviii@a;% ";" will uppercase to the last octet
 518      \parse@UTFviii@a,% "," will uppercase to the middle octet
 519      \parse@UTFviii@b E\UTFviii@three@octets.{,;}% "." will uppercase to the starting octet ("E0 + remaining bits)
 520    \else
 521      \parse@UTFviii@a;% ";" will uppercase to the last octet
 522      \parse@UTFviii@a,% "," will uppercase to the third octet
 523      \parse@UTFviii@a!% "!" will uppercase to the second octet
 524      \parse@UTFviii@b F\UTFviii@four@octets.{!,;}% "." will uppercase to the starting octet ("F0 + remaining bits)
 525     \fi
 526     \fi
 527   \fi
 528 %  \endgroup
 529 }
 530 %    \end{macrocode}
 531 % \end{macro}
 532 %
 533 % \begin{macro}{\parse@UTFviii@a}
 534 %    \ldots so somebody else can document this part :-) \ldots~David?:-))))!
 535 % \changes{v1.1b}{2004/02/09}{Space in the wrong place \cs{count @64}}
 536 %    \begin{macrocode}
 537 \gdef\parse@UTFviii@a#1{% sets \uccode of character #1 to 128 + (\count@ mod 64) and leaves \count@ div 64 in \count@
 538      \@tempcnta\count@ % copy of the incoming value
 539      \divide\count@ 64 % \count@ := value div 64
 540      \@tempcntb\count@ % remember the quotient
 541      \multiply\count@ 64 % \count@ := 64 * quotient
 542      \advance\@tempcnta-\count@ % \@tempcnta := value mod 64
 543      \advance\@tempcnta 128 % \@tempcnta := 128 + (value mod 64)
 544      \uccode`#1\@tempcnta % so \uppercase later turns #1 into that octet
 545      \count@\@tempcntb} % restore the quotient for the next step
 546 %    \end{macrocode}
 547 % \end{macro}
 548 %
 549 % \begin{macro}{\parse@UTFviii@b}
 550 %    \ldots same here
 551 %    \begin{macrocode}
 552 \gdef\parse@UTFviii@b#1#2#3#4{% #1: hex digit C, E or F; #2: octet-parser macro; #3: placeholder for the starting octet; #4: remaining placeholders
 553      \advance\count@ "#10\relax
 554      \uccode`#3\count@ % placeholder #3 will uppercase to the starting octet
 555      \uppercase{\gdef\UTFviii@tmp{#2#3#4}}} % \UTFviii@tmp := #2 followed by the uppercased placeholders
 556 %    \end{macrocode}
 557 %
 558 %    \begin{macrocode}
 559 \endgroup
 560 %    \end{macrocode}
 561 % \end{macro}
 562 %
 563 %    \begin{macrocode}
 564 \@onlypreamble\DeclareUnicodeCharacter
 565 %    \end{macrocode}
 566 %    These are preamble only as long as we don't support Unicode
 567 %    charrefs in documents.
 568 %    \begin{macrocode}
 569 \@onlypreamble\parse@XML@charref
 570 \@onlypreamble\parse@UTFviii@a
 571 \@onlypreamble\parse@UTFviii@b
 572 %    \end{macrocode}
 573 %
 574 %
 575 % \subsection{Loading Unicode mappings at begin document}
 576 %
 577 % The original plan was to set up the UTF-8 support at
 578 % |\begin{document}|; but then any text characters used in the preamble
 579 % (as people do even though advised against it) would fail in one way or
 580 % the other.
 581 % So the implementation was changed and the Unicode definition files
 582 % for already defined encodings are loaded here.
 583 %
 584 %    We loop through all defined font encodings
 585 %    (stored in |\cdp@list|) and for each load a file
 586 %    \textit{name}\texttt{enc.dfu} if it exists. That file is then
 587 %    supposed to contain |\DeclareUnicodeCharacter| declarations.
 588 %    \begin{macrocode}
 589 \begingroup
 590   \def\cdp@elt#1#2#3#4{% #1: font encoding name; #2, #3 and #4 are not used here
 591     \wlog{Now handling font encoding #1 ...}%
 592     \lowercase{% the file name is built from the lower-cased encoding name
 593         \InputIfFileExists{#1enc.dfu}}%
 594            {\wlog{... processing UTF-8 mapping file for font %
 595                      encoding #1}%
 596 %    \end{macrocode}
 597 % \changes{v1.1m}{2008/04/05}{Ensure we don't lose spaces in the log}
 598 %    The previous line is written to the log with the newline char being
 599 %    ignored (thus not producing a space). Therefore either everything has to
 600 %    be on a single input line or some special care must be taken.  From this
 601 %    point on we ignore spaces again, i.e., while we are reading the
 602 %    \texttt{.dfu} file. The |\endgroup| below will restore it again.
 603 % \changes{v1.1d}{2004/05/08}{Explicitly set catcode of space}
 604 % \changes{v1.1g}{2005/09/27}{We lost the ``false'' case}
 605 %    \begin{macrocode}
 606             \catcode`\ 9\relax}%
 607           {\wlog{... no UTF-8 mapping file for font encoding #1}}%
 608   }
 609   \cdp@list
 610 \endgroup
 611 %    \end{macrocode}
 612 % However, we don't know if there are font encodings still to be
 613 % loaded (either with \texttt{fontenc} or directly with |\input| by
 614 % some package). Font encoding files are loaded only if the
 615 % corresponding encoding has not been loaded yet, and they always
 616 % begin with |\DeclareFontEncoding|. We now redefine the internal
 617 % kernel version of the latter to load the Unicode file if available.
 618 %
 619 %    \begin{macrocode}
 620 \def\DeclareFontEncoding@#1#2#3{% #1: encoding name; #2 is stored in \T@<#1>, #3 (after \default@M) in \M@<#1>
 621   \expandafter
 622   \ifx\csname T@#1\endcsname\relax
 623     \def\cdp@elt{\noexpand\cdp@elt}% keeps \cdp@elt unexpanded inside the \xdef below
 624     \xdef\cdp@list{\cdp@list\cdp@elt{#1}%
 625                     {\default@family}{\default@series}%
 626                     {\default@shape}}%
 627     \expandafter\let\csname#1-cmd\endcsname\@changed@cmd
 628     \begingroup
 629       \wlog{Now handling font encoding #1 ...}%
 630       \lowercase{% the file name is built from the lower-cased encoding name
 631         \InputIfFileExists{#1enc.dfu}}%
 632            {\wlog{... processing UTF-8 mapping file for font %
 633                       encoding #1}}%
 634            {\wlog{... no UTF-8 mapping file for font encoding #1}}%
 635     \endgroup
 636   \else
 637      \@font@info{Redeclaring font encoding #1}%
 638   \fi
 639   \global\@namedef{T@#1}{#2}%
 640   \global\@namedef{M@#1}{\default@M#3}%
 641   \xdef\LastDeclaredEncoding{#1}%
 642   }
 643 %</utf8>
 644 %    \end{macrocode}
 645 %
 646 %
 647 %
 648 % \section{Mapping characters ---\newline based on font (glyph) encodings}
 649 %
 650 % This section is a first attempt to provide Unicode definitions for
 651 % characters whose standard glyphs are currently provided by the
 652 % standard \LaTeX{} font-encodings |T1|, |OT1|, etc. They are by
 653 % no means completed and need checking.
 654 %
 655 % For example, one should check the already existing input encodings
 656 %    for glyphs that may in fact be available and required,
 657 %    e.g.~\texttt{latin4} has a number of glyphs with the |\=|
 658 %    accent. Since the |T1| encoding does not provide such glyphs,
 659 %    these characters are not listed below (yet).
 660 %
 661 % The list below was generated by looking at the current \LaTeX{} font
 662 %    encoding files, e.g., \texttt{t1enc.def} and using the work by
 663 %    Sebastian Rahtz (in \texttt{ucharacters.sty}) with a few
 664 %    modifications. In combinations such as |\^\i| the preferred form
 665 %    is that and not |\^i|.
 666 %
 667 % This list has been built from several sources, obviously including
 668 % the Unicode Standard itself. These sources include Passive \TeX{} by
 669 % Sebastian Rahtz, the \texttt{unicode}
 670 % package by Dominique P. G. Unruh (mainly for Latin encodings) and
 671 % \texttt{text4ht} by Eitan Gurari (for Cyrillic ones).
 672 %
 673 % Note that it strictly follows the Mittelbach principles for
 674 % input character encodings: thus it offers no support for using utf8
 675 % representations of math symbols such as $\times$ or $\div$ (in math mode).
 676 %
 677 %
 678 % \subsection{About the table itself}
 679 %
 680 % In addition to generating individual files, the table below is, at present,
 681 % a one-one (we think) partial relationship between the (ill-defined) set
 682 % of LICRs and the Unicode slots "0080 to "FFFF.  At present these entries
 683 % are used only to define a collection of partial mappings from Unicode
 684 % slots to LICRs; each of these mappings becomes full if we add an exception
 685 % value (`not defined') to the set of LICRs.
 686 %
 687 % It is probably not essential for the relationship in the full table to be
 688 % one-one; this raises questions such as: the exact role of LICRs;
 689 % the formal relationships on the set of LICRs; the (non-mathematical)
 690 % relationship between
 691 % LICRs and Unicode (which has its own somewhat fuzzy equivalences);
 692 % and ultimately what a character is and what a character representation
 693 % and/or name is.
 694 %
 695 % Viewed this way, the result has, perhaps puzzlingly, just two (we think)
 696 % gaps in the second 128 `Unicode slots' (00A0 and 00AD): neither of these
 697 % is really a character, of course.
 698 %
 699 % It is unclear the extent to which entries in this table should
 700 % resemble the closely related ones in the 8-bit \texttt{inputenc} files.
 701 % The Unicode standard claims that the first 256 slots `are' ASCII and
 702 % Latin-1.
 703 %
 704 % Of course, \TeX{} itself typically does not treat even many perfectly
 705 % `normal text' 7-bit slots as text characters, so it is unclear
 706 % whether \LaTeX{} should even attempt to deal in any consistent way with
 707 % those Unicode slots that are not definitive text characters.
 708 %
 709 %
 710 % \subsection{The mapping table}
 711 %
 712 % Note that the first argument must be a 4-hex-digit number not less
 713 % than \texttt{00A0}.
 714 %
 715 % There are a few notes about inconsistencies etc at the end of the table.
 716 %
 717 %    \begin{macrocode}
 718 %<all,t1,ot1,ly1>\DeclareUnicodeCharacter{00A1}{\textexclamdown} % INVERTED EXCLAMATION MARK
 719 %<all,ts1,ly1>\DeclareUnicodeCharacter{00A2}{\textcent} % CENT SIGN
 720 %<all,ts1,t1,ot1,ly1>\DeclareUnicodeCharacter{00A3}{\textsterling} % POUND SIGN
 721 %<all,x2,ts1,t2c,t2b,t2a,ly1,lcy>\DeclareUnicodeCharacter{00A4}{\textcurrency} % CURRENCY SIGN
 722 %<all,ts1,ly1>\DeclareUnicodeCharacter{00A5}{\textyen} % YEN SIGN
 723 %<all,ts1,ly1>\DeclareUnicodeCharacter{00A6}{\textbrokenbar} % BROKEN BAR
 724 %<all,x2,ts1,t2c,t2b,t2a,oms,ly1>\DeclareUnicodeCharacter{00A7}{\textsection} % SECTION SIGN
 725 %<all,ts1>\DeclareUnicodeCharacter{00A8}{\textasciidieresis} % DIAERESIS
 726 %<all,ts1,utf8>\DeclareUnicodeCharacter{00A9}{\textcopyright} % COPYRIGHT SIGN
 727 %<all,ts1,ly1,utf8>\DeclareUnicodeCharacter{00AA}{\textordfeminine} % FEMININE ORDINAL INDICATOR
 728 %<*all,x2,t2c,t2b,t2a,t1,ot2,ly1,lcy>
 729 \DeclareUnicodeCharacter{00AB}{\guillemotleft} % LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
 730 %</all,x2,t2c,t2b,t2a,t1,ot2,ly1,lcy>
 731 %<all,ts1>\DeclareUnicodeCharacter{00AC}{\textlnot} % NOT SIGN
 732 %<all,ts1,ly1,utf8>\DeclareUnicodeCharacter{00AE}{\textregistered} % REGISTERED SIGN
 733 %<all,ts1>\DeclareUnicodeCharacter{00AF}{\textasciimacron} % MACRON
 734 %<all,ts1,ly1>\DeclareUnicodeCharacter{00B0}{\textdegree} % DEGREE SIGN
 735 %<all,ts1>\DeclareUnicodeCharacter{00B1}{\textpm} % PLUS-MINUS SIGN
 736 %<all,ts1>\DeclareUnicodeCharacter{00B2}{\texttwosuperior} % SUPERSCRIPT TWO
 737 %<all,ts1>\DeclareUnicodeCharacter{00B3}{\textthreesuperior} % SUPERSCRIPT THREE
 738 %<all,ts1>\DeclareUnicodeCharacter{00B4}{\textasciiacute} % ACUTE ACCENT
 739 %<all,ts1,ly1>\DeclareUnicodeCharacter{00B5}{\textmu} % MICRO SIGN
 740 %<all,ts1,oms,ly1>\DeclareUnicodeCharacter{00B6}{\textparagraph} % PILCROW SIGN
 741 %<all,ts1,oms,ly1>\DeclareUnicodeCharacter{00B7}{\textperiodcentered} % MIDDLE DOT
 742 %<all,ot1>\DeclareUnicodeCharacter{00B8}{\c\ } % CEDILLA
 743 %<all,ts1>\DeclareUnicodeCharacter{00B9}{\textonesuperior} % SUPERSCRIPT ONE
 744 %<all,ts1,ly1,utf8>\DeclareUnicodeCharacter{00BA}{\textordmasculine} % MASCULINE ORDINAL INDICATOR
 745 %<*all,x2,t2c,t2b,t2a,t1,ot2,ly1,lcy>
 746 \DeclareUnicodeCharacter{00BB}{\guillemotright} % RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
 747 %</all,x2,t2c,t2b,t2a,t1,ot2,ly1,lcy>
 748 %<all,ts1,ly1>\DeclareUnicodeCharacter{00BC}{\textonequarter} % VULGAR FRACTION ONE QUARTER
 749 %<all,ts1,ly1>\DeclareUnicodeCharacter{00BD}{\textonehalf} % VULGAR FRACTION ONE HALF
 750 %<all,ts1,ly1>\DeclareUnicodeCharacter{00BE}{\textthreequarters} % VULGAR FRACTION THREE QUARTERS
 751 %<all,t1,ot1,ly1>\DeclareUnicodeCharacter{00BF}{\textquestiondown} % INVERTED QUESTION MARK
 752 %<all,t1,ly1>\DeclareUnicodeCharacter{00C0}{\@tabacckludge`A}
 753 %<all,t1,ly1>\DeclareUnicodeCharacter{00C1}{\@tabacckludge'A}
 754 %<all,t1,ly1>\DeclareUnicodeCharacter{00C2}{\^A}
 755 %<all,t1,ly1>\DeclareUnicodeCharacter{00C3}{\~A}
 756 %<all,t1,ly1>\DeclareUnicodeCharacter{00C4}{\"A}
 757 %<all,t1,ot1,ly1>\DeclareUnicodeCharacter{00C5}{\r A}
 758 %<all,t1,ot1,ly1,lcy>\DeclareUnicodeCharacter{00C6}{\AE}
 759 %<all,t1,ly1>\DeclareUnicodeCharacter{00C7}{\c C}
 760 %<all,t1,ly1>\DeclareUnicodeCharacter{00C8}{\@tabacckludge`E}
 761 %<all,t1,ly1>\DeclareUnicodeCharacter{00C9}{\@tabacckludge'E}
 762 %<all,t1,ly1>\DeclareUnicodeCharacter{00CA}{\^E}
 763 %<all,t1,ly1>\DeclareUnicodeCharacter{00CB}{\"E}
 764 %<all,t1,ly1>\DeclareUnicodeCharacter{00CC}{\@tabacckludge`I}
 765 %<all,t1,ly1>\DeclareUnicodeCharacter{00CD}{\@tabacckludge'I}
 766 %<all,t1,ly1>\DeclareUnicodeCharacter{00CE}{\^I}
 767 %<all,t1,ly1>\DeclareUnicodeCharacter{00CF}{\"I}
 768 %<all,t1,ly1>\DeclareUnicodeCharacter{00D0}{\DH}
 769 %<all,t1,ly1>\DeclareUnicodeCharacter{00D1}{\~N}
 770 %<all,t1,ly1>\DeclareUnicodeCharacter{00D2}{\@tabacckludge`O}
 771 %<all,t1,ly1>\DeclareUnicodeCharacter{00D3}{\@tabacckludge'O}
 772 %<all,t1,ly1>\DeclareUnicodeCharacter{00D4}{\^O}
 773 %<all,t1,ly1>\DeclareUnicodeCharacter{00D5}{\~O}
 774 %<all,t1,ly1>\DeclareUnicodeCharacter{00D6}{\"O}
 775 %<all,ts1>\DeclareUnicodeCharacter{00D7}{\texttimes}
 776 %<all,t1,ot1,ly1,lcy>\DeclareUnicodeCharacter{00D8}{\O}
 777 %<all,t1,ly1>\DeclareUnicodeCharacter{00D9}{\@tabacckludge`U}
 778 %<all,t1,ly1>\DeclareUnicodeCharacter{00DA}{\@tabacckludge'U}
 779 %<all,t1,ly1>\DeclareUnicodeCharacter{00DB}{\^U}
 780 %<all,t1,ly1>\DeclareUnicodeCharacter{00DC}{\"U}
 781 %<all,t1,ly1>\DeclareUnicodeCharacter{00DD}{\@tabacckludge'Y}
 782 %<all,t1,ly1>\DeclareUnicodeCharacter{00DE}{\TH}
 783 %<all,t1,ot1,ly1,lcy>\DeclareUnicodeCharacter{00DF}{\ss}
 784 %<all,t1,ly1>\DeclareUnicodeCharacter{00E0}{\@tabacckludge`a}
 785 %<all,t1,ly1>\DeclareUnicodeCharacter{00E1}{\@tabacckludge'a}
 786 %<all,t1,ly1>\DeclareUnicodeCharacter{00E2}{\^a}
 787 %<all,t1,ly1>\DeclareUnicodeCharacter{00E3}{\~a}
 788 %<all,t1,ly1>\DeclareUnicodeCharacter{00E4}{\"a}
 789 %<all,t1,ly1>\DeclareUnicodeCharacter{00E5}{\r a}
 790 %<all,t1,ot1,ly1,lcy>\DeclareUnicodeCharacter{00E6}{\ae}
 791 %<all,t1,ly1>\DeclareUnicodeCharacter{00E7}{\c c}
 792 %<all,t1,ly1>\DeclareUnicodeCharacter{00E8}{\@tabacckludge`e}
 793 %<all,t1,ly1>\DeclareUnicodeCharacter{00E9}{\@tabacckludge'e}
 794 %<all,t1,ly1>\DeclareUnicodeCharacter{00EA}{\^e}
 795 %<all,t1,ly1>\DeclareUnicodeCharacter{00EB}{\"e}
 796 %<all,t1,ot1,ly1>\DeclareUnicodeCharacter{00EC}{\@tabacckludge`\i}
 797 %<all,t1,ot1,ly1>\DeclareUnicodeCharacter{00ED}{\@tabacckludge'\i}
 798 %<all,t1,ot1,ly1>\DeclareUnicodeCharacter{00EE}{\^\i}
 799 %<all,t1,ot1,ly1>\DeclareUnicodeCharacter{00EF}{\"\i}
 800 %<all,t1,ly1>\DeclareUnicodeCharacter{00F0}{\dh}
 801 %<all,t1,ly1>\DeclareUnicodeCharacter{00F1}{\~n}
 802 %<all,t1,ly1>\DeclareUnicodeCharacter{00F2}{\@tabacckludge`o}
 803 %<all,t1,ly1>\DeclareUnicodeCharacter{00F3}{\@tabacckludge'o}
 804 %<all,t1,ly1>\DeclareUnicodeCharacter{00F4}{\^o}
 805 %<all,t1,ly1>\DeclareUnicodeCharacter{00F5}{\~o}
 806 %<all,t1,ly1>\DeclareUnicodeCharacter{00F6}{\"o}
 807 %<all,ts1>\DeclareUnicodeCharacter{00F7}{\textdiv}
 808 %<all,t1,ot1,ly1,lcy>\DeclareUnicodeCharacter{00F8}{\o}
 809 %<all,t1,ly1>\DeclareUnicodeCharacter{00F9}{\@tabacckludge`u}
 810 %<all,t1,ly1>\DeclareUnicodeCharacter{00FA}{\@tabacckludge'u}
 811 %<all,t1,ly1>\DeclareUnicodeCharacter{00FB}{\^u}
 812 %<all,t1,ly1>\DeclareUnicodeCharacter{00FC}{\"u}
 813 %<all,t1,ly1>\DeclareUnicodeCharacter{00FD}{\@tabacckludge'y}
 814 %<all,t1,ly1>\DeclareUnicodeCharacter{00FE}{\th}
 815 %<all,t1,ly1>\DeclareUnicodeCharacter{00FF}{\"y}
 816 %<all,t1>\DeclareUnicodeCharacter{0102}{\u A}
 817 %<all,t1>\DeclareUnicodeCharacter{0103}{\u a}
 818 %<all,t1>\DeclareUnicodeCharacter{0104}{\k A}
 819 %<all,t1>\DeclareUnicodeCharacter{0105}{\k a}
 820 %<all,t1>\DeclareUnicodeCharacter{0106}{\@tabacckludge'C}
 821 %<all,t1>\DeclareUnicodeCharacter{0107}{\@tabacckludge'c}
 822 %<all,t1>\DeclareUnicodeCharacter{010C}{\v C}
 823 %<all,t1>\DeclareUnicodeCharacter{010D}{\v c}
 824 %<all,t1>\DeclareUnicodeCharacter{010E}{\v D}
 825 %<all,t1>\DeclareUnicodeCharacter{010F}{\v d}
 826 %<all,t1>\DeclareUnicodeCharacter{0110}{\DJ}
 827 %<all,t1>\DeclareUnicodeCharacter{0111}{\dj}
 828 %<all,t1>\DeclareUnicodeCharacter{0118}{\k E}
 829 %<all,t1>\DeclareUnicodeCharacter{0119}{\k e}
 830 %<all,t1>\DeclareUnicodeCharacter{011A}{\v E}
 831 %<all,t1>\DeclareUnicodeCharacter{011B}{\v e}
 832 %<all,t1>\DeclareUnicodeCharacter{011E}{\u G}
 833 %<all,t1>\DeclareUnicodeCharacter{011F}{\u g}
 834 %<all,t1>\DeclareUnicodeCharacter{0130}{\.I}
 835 %<all,t2c,t2b,t2a,t1,ot2,ot1,ly1,lcy>\DeclareUnicodeCharacter{0131}{\i}
 836 %<all,t1>\DeclareUnicodeCharacter{0132}{\IJ}
 837 %<all,t1>\DeclareUnicodeCharacter{0133}{\ij}
 838 %<all,t1>\DeclareUnicodeCharacter{0139}{\@tabacckludge'L}
 839 %<all,t1>\DeclareUnicodeCharacter{013A}{\@tabacckludge'l}
 840 %<all,t1>\DeclareUnicodeCharacter{013D}{\v L}
 841 %<all,t1>\DeclareUnicodeCharacter{013E}{\v l}
 842 %<all,t1,ot1,ly1>\DeclareUnicodeCharacter{0141}{\L}
 843 %<all,t1,ot1,ly1>\DeclareUnicodeCharacter{0142}{\l}
 844 %<all,t1>\DeclareUnicodeCharacter{0143}{\@tabacckludge'N}
 845 %<all,t1>\DeclareUnicodeCharacter{0144}{\@tabacckludge'n}
 846 %<all,t1>\DeclareUnicodeCharacter{0147}{\v N}
 847 %<all,t1>\DeclareUnicodeCharacter{0148}{\v n}
 848 %<all,t1>\DeclareUnicodeCharacter{014A}{\NG}
 849 %<all,t1>\DeclareUnicodeCharacter{014B}{\ng}
 850 %<all,t1>\DeclareUnicodeCharacter{0150}{\H O}
 851 %<all,t1>\DeclareUnicodeCharacter{0151}{\H o}
 852 %<all,t1,ot1,ly1,lcy>\DeclareUnicodeCharacter{0152}{\OE}
 853 %<all,t1,ot1,ly1,lcy>\DeclareUnicodeCharacter{0153}{\oe}
 854 %<all,t1>\DeclareUnicodeCharacter{0154}{\@tabacckludge'R}
 855 %<all,t1>\DeclareUnicodeCharacter{0155}{\@tabacckludge'r}
 856 %<all,t1>\DeclareUnicodeCharacter{0158}{\v R}
 857 %<all,t1>\DeclareUnicodeCharacter{0159}{\v r}
 858 %<all,t1>\DeclareUnicodeCharacter{015A}{\@tabacckludge'S}
 859 %<all,t1>\DeclareUnicodeCharacter{015B}{\@tabacckludge's}
 860 %<all,t1>\DeclareUnicodeCharacter{015E}{\c S}
 861 %<all,t1>\DeclareUnicodeCharacter{015F}{\c s}
 862 %<all,t1,ly1>\DeclareUnicodeCharacter{0160}{\v S}
 863 %<all,t1,ly1>\DeclareUnicodeCharacter{0161}{\v s}
 864 %<all,t1>\DeclareUnicodeCharacter{0162}{\c T}
 865 %<all,t1>\DeclareUnicodeCharacter{0163}{\c t}
 866 %<all,t1>\DeclareUnicodeCharacter{0164}{\v T}
 867 %<all,t1>\DeclareUnicodeCharacter{0165}{\v t}
 868 %<all,t1>\DeclareUnicodeCharacter{016E}{\r U}
 869 %<all,t1>\DeclareUnicodeCharacter{016F}{\r u}
 870 %<all,t1>\DeclareUnicodeCharacter{0170}{\H U}
 871 %<all,t1>\DeclareUnicodeCharacter{0171}{\H u}
 872 %<all,t1,ly1>\DeclareUnicodeCharacter{0178}{\"Y}
 873 %<all,t1>\DeclareUnicodeCharacter{0179}{\@tabacckludge'Z}
 874 %<all,t1>\DeclareUnicodeCharacter{017A}{\@tabacckludge'z}
 875 %<all,t1>\DeclareUnicodeCharacter{017B}{\.Z}
 876 %<all,t1>\DeclareUnicodeCharacter{017C}{\.z}
 877 %<all,t1,ly1>\DeclareUnicodeCharacter{017D}{\v Z}
 878 %<all,t1,ly1>\DeclareUnicodeCharacter{017E}{\v z}
 879 %<all,ts1,ly1>\DeclareUnicodeCharacter{0192}{\textflorin} % LATIN SMALL LETTER F WITH HOOK
 880 %<all,ly1,utf8>\DeclareUnicodeCharacter{02C6}{\textasciicircum} % MODIFIER LETTER CIRCUMFLEX ACCENT
 881 %<all,ts1>\DeclareUnicodeCharacter{02C7}{\textasciicaron} % CARON
 882 %<all,ts1>\DeclareUnicodeCharacter{02D8}{\textasciibreve} % BREVE
 883 %<all,ly1,utf8>\DeclareUnicodeCharacter{02DC}{\textasciitilde} % SMALL TILDE
 884 %<all,ts1>\DeclareUnicodeCharacter{02DD}{\textacutedbl} % DOUBLE ACUTE ACCENT
 885 %    \end{macrocode}
 886 %    The Cyrillic code points have been recently checked (2007) and extended
 887 %    and corrected by Matthias Noe (\verb=a9931078@unet.univie.ac.at=) --- thanks.
 888 % \changes{v1.1j}{2007/11/09}{Added a few new unicode decls in cyrillic (pr/3988)}
 889 % \changes{v1.1k}{2007/11/11}{Added further unicode decls in cyrillic}
 890 % \changes{v1.1n}{2015/06/27}{correct accent http://tex.stackexchange.com/q/252521}
 891 %    \begin{macrocode}
 892 %<*all,x2,t2c,t2b,t2a,ot2,lcy>
 893 \DeclareUnicodeCharacter{0400}{\@tabacckludge`\CYRE}
 894 %</all,x2,t2c,t2b,t2a,ot2,lcy>
 895 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{0401}{\CYRYO}
 896 %<all,x2,t2a,ot2>\DeclareUnicodeCharacter{0402}{\CYRDJE}
 897 %<*all,x2,t2c,t2b,t2a,ot2,lcy>
 898 \DeclareUnicodeCharacter{0403}{\@tabacckludge'\CYRG}
 899 %</all,x2,t2c,t2b,t2a,ot2,lcy>
 900 %<all,x2,t2a,ot2,lcy>\DeclareUnicodeCharacter{0404}{\CYRIE}
 901 %<all,x2,t2c,t2b,t2a,ot2>\DeclareUnicodeCharacter{0405}{\CYRDZE}
 902 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{0406}{\CYRII}
 903 %<all,x2,t2a,lcy>\DeclareUnicodeCharacter{0407}{\CYRYI}
 904 %<all,x2,t2c,t2b,t2a,ot2>\DeclareUnicodeCharacter{0408}{\CYRJE}
 905 %<all,x2,t2b,t2a,ot2>\DeclareUnicodeCharacter{0409}{\CYRLJE}
 906 %<all,x2,t2b,t2a,ot2>\DeclareUnicodeCharacter{040A}{\CYRNJE}
 907 %<all,x2,t2a,ot2>\DeclareUnicodeCharacter{040B}{\CYRTSHE}
 908 %<*all,x2,t2c,t2b,t2a,ot2,lcy>
 909 \DeclareUnicodeCharacter{040C}{\@tabacckludge'\CYRK}
 910 \DeclareUnicodeCharacter{040D}{\@tabacckludge`\CYRI}
 911 %</all,x2,t2c,t2b,t2a,ot2,lcy>
 912 %<all,x2,t2b,t2a,lcy>\DeclareUnicodeCharacter{040E}{\CYRUSHRT}
 913 %<all,x2,t2c,t2a,ot2>\DeclareUnicodeCharacter{040F}{\CYRDZHE}
 914 %<*all,x2,t2c,t2b,t2a,ot2,lcy>
 915 \DeclareUnicodeCharacter{0410}{\CYRA}
 916 \DeclareUnicodeCharacter{0411}{\CYRB}
 917 \DeclareUnicodeCharacter{0412}{\CYRV}
 918 \DeclareUnicodeCharacter{0413}{\CYRG}
 919 \DeclareUnicodeCharacter{0414}{\CYRD}
 920 \DeclareUnicodeCharacter{0415}{\CYRE}
 921 \DeclareUnicodeCharacter{0416}{\CYRZH}
 922 \DeclareUnicodeCharacter{0417}{\CYRZ}
 923 \DeclareUnicodeCharacter{0418}{\CYRI}
 924 \DeclareUnicodeCharacter{0419}{\CYRISHRT}
 925 \DeclareUnicodeCharacter{041A}{\CYRK}
 926 \DeclareUnicodeCharacter{041B}{\CYRL}
 927 \DeclareUnicodeCharacter{041C}{\CYRM}
 928 \DeclareUnicodeCharacter{041D}{\CYRN}
 929 \DeclareUnicodeCharacter{041E}{\CYRO}
 930 \DeclareUnicodeCharacter{041F}{\CYRP}
 931 \DeclareUnicodeCharacter{0420}{\CYRR}
 932 \DeclareUnicodeCharacter{0421}{\CYRS}
 933 \DeclareUnicodeCharacter{0422}{\CYRT}
 934 \DeclareUnicodeCharacter{0423}{\CYRU}
 935 \DeclareUnicodeCharacter{0424}{\CYRF}
 936 \DeclareUnicodeCharacter{0425}{\CYRH}
 937 \DeclareUnicodeCharacter{0426}{\CYRC}
 938 \DeclareUnicodeCharacter{0427}{\CYRCH}
 939 \DeclareUnicodeCharacter{0428}{\CYRSH}
 940 \DeclareUnicodeCharacter{0429}{\CYRSHCH}
 941 \DeclareUnicodeCharacter{042A}{\CYRHRDSN}
 942 \DeclareUnicodeCharacter{042B}{\CYRERY}
 943 \DeclareUnicodeCharacter{042C}{\CYRSFTSN}
 944 \DeclareUnicodeCharacter{042D}{\CYREREV}
 945 \DeclareUnicodeCharacter{042E}{\CYRYU}
 946 \DeclareUnicodeCharacter{042F}{\CYRYA}
 947 \DeclareUnicodeCharacter{0430}{\cyra}
 948 \DeclareUnicodeCharacter{0431}{\cyrb}
 949 \DeclareUnicodeCharacter{0432}{\cyrv}
 950 \DeclareUnicodeCharacter{0433}{\cyrg}
 951 \DeclareUnicodeCharacter{0434}{\cyrd}
 952 \DeclareUnicodeCharacter{0435}{\cyre}
 953 \DeclareUnicodeCharacter{0436}{\cyrzh}
 954 \DeclareUnicodeCharacter{0437}{\cyrz}
 955 \DeclareUnicodeCharacter{0438}{\cyri}
 956 \DeclareUnicodeCharacter{0439}{\cyrishrt}
 957 \DeclareUnicodeCharacter{043A}{\cyrk}
 958 \DeclareUnicodeCharacter{043B}{\cyrl}
 959 \DeclareUnicodeCharacter{043C}{\cyrm}
 960 \DeclareUnicodeCharacter{043D}{\cyrn}
 961 \DeclareUnicodeCharacter{043E}{\cyro}
 962 \DeclareUnicodeCharacter{043F}{\cyrp}
 963 \DeclareUnicodeCharacter{0440}{\cyrr}
 964 \DeclareUnicodeCharacter{0441}{\cyrs}
 965 \DeclareUnicodeCharacter{0442}{\cyrt}
 966 \DeclareUnicodeCharacter{0443}{\cyru}
 967 \DeclareUnicodeCharacter{0444}{\cyrf}
 968 \DeclareUnicodeCharacter{0445}{\cyrh}
 969 \DeclareUnicodeCharacter{0446}{\cyrc}
 970 \DeclareUnicodeCharacter{0447}{\cyrch}
 971 \DeclareUnicodeCharacter{0448}{\cyrsh}
 972 \DeclareUnicodeCharacter{0449}{\cyrshch}
 973 \DeclareUnicodeCharacter{044A}{\cyrhrdsn}
 974 \DeclareUnicodeCharacter{044B}{\cyrery}
 975 \DeclareUnicodeCharacter{044C}{\cyrsftsn}
 976 \DeclareUnicodeCharacter{044D}{\cyrerev}
 977 \DeclareUnicodeCharacter{044E}{\cyryu}
 978 \DeclareUnicodeCharacter{044F}{\cyrya}
 979 \DeclareUnicodeCharacter{0450}{\@tabacckludge`\cyre}
 980 \DeclareUnicodeCharacter{0451}{\cyryo}
 981 %</all,x2,t2c,t2b,t2a,ot2,lcy>
 982 %<all,x2,t2a,ot2>\DeclareUnicodeCharacter{0452}{\cyrdje}
 983 %<*all,x2,t2c,t2b,t2a,ot2,lcy>
 984 \DeclareUnicodeCharacter{0453}{\@tabacckludge'\cyrg}
 985 %</all,x2,t2c,t2b,t2a,ot2,lcy>
 986 %<all,x2,t2a,ot2,lcy>\DeclareUnicodeCharacter{0454}{\cyrie}
 987 %<all,x2,t2c,t2b,t2a,ot2>\DeclareUnicodeCharacter{0455}{\cyrdze}
 988 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{0456}{\cyrii}
 989 %<all,x2,t2a,lcy>\DeclareUnicodeCharacter{0457}{\cyryi}
 990 %<all,x2,t2c,t2b,t2a,ot2>\DeclareUnicodeCharacter{0458}{\cyrje}
 991 %<all,x2,t2b,t2a,ot2>\DeclareUnicodeCharacter{0459}{\cyrlje}
 992 %<all,x2,t2b,t2a,ot2>\DeclareUnicodeCharacter{045A}{\cyrnje}
 993 %<all,x2,t2a,ot2>\DeclareUnicodeCharacter{045B}{\cyrtshe}
 994 %<*all,x2,t2c,t2b,t2a,ot2,lcy>
 995 \DeclareUnicodeCharacter{045C}{\@tabacckludge'\cyrk}
 996 \DeclareUnicodeCharacter{045D}{\@tabacckludge`\cyri}
 997 %</all,x2,t2c,t2b,t2a,ot2,lcy>
 998 %<all,x2,t2b,t2a,lcy>\DeclareUnicodeCharacter{045E}{\cyrushrt}
 999 %<all,x2,t2c,t2a,ot2>\DeclareUnicodeCharacter{045F}{\cyrdzhe}
1000 %<all,x2,ot2>\DeclareUnicodeCharacter{0462}{\CYRYAT}
1001 %<all,x2,ot2>\DeclareUnicodeCharacter{0463}{\cyryat}
1002 %<all,x2>\DeclareUnicodeCharacter{046A}{\CYRBYUS}
1003 %<all,x2>\DeclareUnicodeCharacter{046B}{\cyrbyus}
1004 %    \end{macrocode}
1005 %    The next two declarations are questionable, the encoding definition
1006 %    should probably contain |\CYROTLD| and |\cyrotld|. Or alternatively, if
1007 %    the characters in the X2 encodings are really meant to represent the
1008 %    historical characters in Ux0472 and Ux0473 (they look like them) then
1009 %    they would need to change instead.
1010 %
1011 %    However, their looks are probably a font designer's decision and the next
1012 %    two mappings are wrong or rather the names in OT2 should change for
1013 %    consistency.
1014 %
1015 %    On the other hand the names |\CYROTLD| are somewhat questionable as the
1016 %    Unicode standard only describes ``Cyrillic barred O'' while |TLD| refers
1017 %    to a tilde (which is more or less what the ``Cyrillic FITA'' looks like
1018 %    according to the Unicode book).
1019 %    \begin{macrocode}
1020 %<all,ot2>\DeclareUnicodeCharacter{0472}{\CYRFITA} % CYRILLIC CAPITAL LETTER FITA
1021 %<all,ot2>\DeclareUnicodeCharacter{0473}{\cyrfita} % CYRILLIC SMALL LETTER FITA
1022 %    \end{macrocode}
1023 %
1024 %    \begin{macrocode}
1025 %<all,x2,ot2>\DeclareUnicodeCharacter{0474}{\CYRIZH} % CYRILLIC CAPITAL LETTER IZHITSA
1026 %<all,x2,ot2>\DeclareUnicodeCharacter{0475}{\cyrizh} % CYRILLIC SMALL LETTER IZHITSA
1027 %    \end{macrocode}
1028 %    While the double grave accent seems to exist in X2, T2A, T2B and T2C
1029 %    encoding, the letter izhitsa exists only in X2 and OT2. Therefore,
1030 %    izhitsa with double grave seems to be possible only using X2.
1031 %    \begin{macrocode}
1032 %<all,x2>\DeclareUnicodeCharacter{0476}{\C\CYRIZH} % CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT
1033 %<all,x2>\DeclareUnicodeCharacter{0477}{\C\cyrizh} % CYRILLIC SMALL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT
1034 %    \end{macrocode}
1035 %
1036 %    \begin{macrocode}
1037 %<all,t2c>\DeclareUnicodeCharacter{048C}{\CYRSEMISFTSN}
1038 %<all,t2c>\DeclareUnicodeCharacter{048D}{\cyrsemisftsn}
1039 %<all,t2c>\DeclareUnicodeCharacter{048E}{\CYRRTICK}
1040 %<all,t2c>\DeclareUnicodeCharacter{048F}{\cyrrtick}
1041 %<all,x2,t2a,lcy>\DeclareUnicodeCharacter{0490}{\CYRGUP}
1042 %<all,x2,t2a,lcy>\DeclareUnicodeCharacter{0491}{\cyrgup}
1043 %<all,x2,t2b,t2a>\DeclareUnicodeCharacter{0492}{\CYRGHCRS}
1044 %<all,x2,t2b,t2a>\DeclareUnicodeCharacter{0493}{\cyrghcrs}
1045 %<all,x2,t2c,t2b>\DeclareUnicodeCharacter{0494}{\CYRGHK}
1046 %<all,x2,t2c,t2b>\DeclareUnicodeCharacter{0495}{\cyrghk}
1047 %<all,x2,t2b,t2a>\DeclareUnicodeCharacter{0496}{\CYRZHDSC}
1048 %<all,x2,t2b,t2a>\DeclareUnicodeCharacter{0497}{\cyrzhdsc}
1049 %<all,x2,t2a>\DeclareUnicodeCharacter{0498}{\CYRZDSC}
1050 %<all,x2,t2a>\DeclareUnicodeCharacter{0499}{\cyrzdsc}
1051 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{049A}{\CYRKDSC}
1052 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{049B}{\cyrkdsc}
1053 %<all,x2,t2a>\DeclareUnicodeCharacter{049C}{\CYRKVCRS}
1054 %<all,x2,t2a>\DeclareUnicodeCharacter{049D}{\cyrkvcrs}
1055 %<all,x2,t2c>\DeclareUnicodeCharacter{049E}{\CYRKHCRS}
1056 %<all,x2,t2c>\DeclareUnicodeCharacter{049F}{\cyrkhcrs}
1057 %<all,x2,t2a>\DeclareUnicodeCharacter{04A0}{\CYRKBEAK}
1058 %<all,x2,t2a>\DeclareUnicodeCharacter{04A1}{\cyrkbeak}
1059 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04A2}{\CYRNDSC}
1060 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04A3}{\cyrndsc}
1061 %<all,x2,t2b,t2a>\DeclareUnicodeCharacter{04A4}{\CYRNG}
1062 %<all,x2,t2b,t2a>\DeclareUnicodeCharacter{04A5}{\cyrng}
1063 %<all,x2,t2c>\DeclareUnicodeCharacter{04A6}{\CYRPHK}
1064 %<all,x2,t2c>\DeclareUnicodeCharacter{04A7}{\cyrphk}
1065 %<all,x2,t2c>\DeclareUnicodeCharacter{04A8}{\CYRABHHA}
1066 %<all,x2,t2c>\DeclareUnicodeCharacter{04A9}{\cyrabhha}
1067 %<all,x2,t2a>\DeclareUnicodeCharacter{04AA}{\CYRSDSC}
1068 %<all,x2,t2a>\DeclareUnicodeCharacter{04AB}{\cyrsdsc}
1069 %<all,x2,t2c>\DeclareUnicodeCharacter{04AC}{\CYRTDSC}
1070 %<all,x2,t2c>\DeclareUnicodeCharacter{04AD}{\cyrtdsc}
1071 %<all,x2,t2b,t2a>\DeclareUnicodeCharacter{04AE}{\CYRY}
1072 %<all,x2,t2b,t2a>\DeclareUnicodeCharacter{04AF}{\cyry}
1073 %<all,x2,t2a>\DeclareUnicodeCharacter{04B0}{\CYRYHCRS}
1074 %<all,x2,t2a>\DeclareUnicodeCharacter{04B1}{\cyryhcrs}
1075 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04B2}{\CYRHDSC}
1076 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04B3}{\cyrhdsc}
1077 %<all,x2,t2c>\DeclareUnicodeCharacter{04B4}{\CYRTETSE}
1078 %<all,x2,t2c>\DeclareUnicodeCharacter{04B5}{\cyrtetse}
1079 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04B6}{\CYRCHRDSC}
1080 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04B7}{\cyrchrdsc}
1081 %<all,x2,t2a>\DeclareUnicodeCharacter{04B8}{\CYRCHVCRS}
1082 %<all,x2,t2a>\DeclareUnicodeCharacter{04B9}{\cyrchvcrs}
1083 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04BA}{\CYRSHHA}
1084 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04BB}{\cyrshha}
1085 %<all,x2,t2c>\DeclareUnicodeCharacter{04BC}{\CYRABHCH}
1086 %<all,x2,t2c>\DeclareUnicodeCharacter{04BD}{\cyrabhch}
1087 %<all,x2,t2c>\DeclareUnicodeCharacter{04BE}{\CYRABHCHDSC}
1088 %<all,x2,t2c>\DeclareUnicodeCharacter{04BF}{\cyrabhchdsc}
1089 %    \end{macrocode}
1090 %    The character |\CYRpalochka| is not defined by OT2 and LCY. However, it
1091 %    looks identical to |\CYRII| and the Unicode standard explicitly refers
1092 %    to that (and to Latin I). So perhaps those encodings could get an alias?
1093 %    On the other hand, why are there two distinct slots in the T2 encodings
1094 %    even though they are so pressed for space? Perhaps they don't always look
1095 %    alike.
1096 %    \begin{macrocode}
1097 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04C0}{\CYRpalochka} % CYRILLIC LETTER PALOCHKA
1098 %    \end{macrocode}
1099 %
1100 %    \begin{macrocode}
1101 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04C1}{\U\CYRZH} % CYRILLIC CAPITAL LETTER ZHE WITH BREVE
1102 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04C2}{\U\cyrzh} % CYRILLIC SMALL LETTER ZHE WITH BREVE
1103 %<all,x2,t2b>\DeclareUnicodeCharacter{04C3}{\CYRKHK} % CYRILLIC CAPITAL LETTER KA WITH HOOK
1104 %<all,x2,t2b>\DeclareUnicodeCharacter{04C4}{\cyrkhk} % CYRILLIC SMALL LETTER KA WITH HOOK
1105 %    \end{macrocode}
1106 %    According to the Unicode standard Ux04C5 should be an L with ``tail'' not
1107 %    with descender (which also exists as Ux04A2) but it looks as if the char
1108 %    names do not make this distinction. Should they?
1109 %    \begin{macrocode}
1110 %<all,x2,t2c,t2b>\DeclareUnicodeCharacter{04C5}{\CYRLDSC} % CYRILLIC CAPITAL LETTER EL WITH TAIL
1111 %<all,x2,t2c,t2b>\DeclareUnicodeCharacter{04C6}{\cyrldsc} % CYRILLIC SMALL LETTER EL WITH TAIL
1112 %    \end{macrocode}
1113 %
1114 %    \begin{macrocode}
1115 %<all,x2,t2c,t2b>\DeclareUnicodeCharacter{04C7}{\CYRNHK} % CYRILLIC CAPITAL LETTER EN WITH HOOK
1116 %<all,x2,t2c,t2b>\DeclareUnicodeCharacter{04C8}{\cyrnhk} % CYRILLIC SMALL LETTER EN WITH HOOK
1117 %<all,x2,t2b>\DeclareUnicodeCharacter{04CB}{\CYRCHLDSC} % CYRILLIC CAPITAL LETTER KHAKASSIAN CHE
1118 %<all,x2,t2b>\DeclareUnicodeCharacter{04CC}{\cyrchldsc} % CYRILLIC SMALL LETTER KHAKASSIAN CHE
1119 %    \end{macrocode}
1120 %    According to the Unicode standard Ux04CD should be an M with ``tail'' not
1121 %    with descender. However this time there is no M with descender in the
1122 %    Unicode standard.
1123 %    \begin{macrocode}
1124 %<all,x2,t2c>\DeclareUnicodeCharacter{04CD}{\CYRMDSC} % CYRILLIC CAPITAL LETTER EM WITH TAIL
1125 %<all,x2,t2c>\DeclareUnicodeCharacter{04CE}{\cyrmdsc} % CYRILLIC SMALL LETTER EM WITH TAIL
1126 %    \end{macrocode}
1127 %
1128 %    \begin{macrocode}
1129 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04D0}{\U\CYRA}
1130 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04D1}{\U\cyra}
1131 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04D2}{\"\CYRA}
1132 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04D3}{\"\cyra}
1133 %<all,x2,t2a>\DeclareUnicodeCharacter{04D4}{\CYRAE}
1134 %<all,x2,t2a>\DeclareUnicodeCharacter{04D5}{\cyrae}
1135 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04D6}{\U\CYRE}
1136 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04D7}{\U\cyre}
1137 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04D8}{\CYRSCHWA}
1138 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04D9}{\cyrschwa}
1139 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04DA}{\"\CYRSCHWA}
1140 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04DB}{\"\cyrschwa}
1141 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04DC}{\"\CYRZH}
1142 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04DD}{\"\cyrzh}
1143 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04DE}{\"\CYRZ}
1144 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04DF}{\"\cyrz}
1145 %<all,x2,t2c,t2b>\DeclareUnicodeCharacter{04E0}{\CYRABHDZE}
1146 %<all,x2,t2c,t2b>\DeclareUnicodeCharacter{04E1}{\cyrabhdze}
1147 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04E2}{\=\CYRI}
1148 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04E3}{\=\cyri}
1149 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04E4}{\"\CYRI}
1150 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04E5}{\"\cyri}
1151 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04E6}{\"\CYRO}
1152 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04E7}{\"\cyro}
1153 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04E8}{\CYROTLD}
1154 %<all,x2,t2c,t2b,t2a>\DeclareUnicodeCharacter{04E9}{\cyrotld}
1155 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04EC}{\"\CYREREV}
1156 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04ED}{\"\cyrerev}
1157 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04EE}{\=\CYRU}
1158 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04EF}{\=\cyru}
1159 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04F0}{\"\CYRU}
1160 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04F1}{\"\cyru}
1161 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04F2}{\H\CYRU}
1162 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04F3}{\H\cyru}
1163 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04F4}{\"\CYRCH}
1164 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04F5}{\"\cyrch}
1165 %<all,x2,t2b>\DeclareUnicodeCharacter{04F6}{\CYRGDSC}
1166 %<all,x2,t2b>\DeclareUnicodeCharacter{04F7}{\cyrgdsc}
1167 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04F8}{\"\CYRERY}
1168 %<all,x2,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{04F9}{\"\cyrery}
1169 %<all,t2b>\DeclareUnicodeCharacter{04FA}{\CYRGDSCHCRS}
1170 %<all,t2b>\DeclareUnicodeCharacter{04FB}{\cyrgdschcrs}
1171 %<all,x2,t2b>\DeclareUnicodeCharacter{04FC}{\CYRHHK}
1172 %<all,x2,t2b>\DeclareUnicodeCharacter{04FD}{\cyrhhk}
1173 %<all,t2b>\DeclareUnicodeCharacter{04FE}{\CYRHHCRS}
1174 %<all,t2b>\DeclareUnicodeCharacter{04FF}{\cyrhhcrs}
1175 %<all,ts1>\DeclareUnicodeCharacter{0E3F}{\textbaht} % THAI CURRENCY SYMBOL BAHT
1176 %<all,x2,t2c,t2b,t2a,t1,utf8>\DeclareUnicodeCharacter{200C}{\textcompwordmark} % ZERO WIDTH NON-JOINER
1177 %<*all,x2,t2c,t2b,t2a,t1,ot2,ot1,ly1,lcy>
1178 \DeclareUnicodeCharacter{2013}{\textendash} % EN DASH
1179 \DeclareUnicodeCharacter{2014}{\textemdash} % EM DASH
1180 %</all,x2,t2c,t2b,t2a,t1,ot2,ot1,ly1,lcy>
1181 %<all,ts1>\DeclareUnicodeCharacter{2016}{\textbardbl} % DOUBLE VERTICAL LINE
1182 %<*all,x2,t2c,t2b,t2a,t1,ot2,ot1,lcy>
1183 \DeclareUnicodeCharacter{2018}{\textquoteleft} % LEFT SINGLE QUOTATION MARK
1184 \DeclareUnicodeCharacter{2019}{\textquoteright} % RIGHT SINGLE QUOTATION MARK
1185 %</all,x2,t2c,t2b,t2a,t1,ot2,ot1,lcy>
1186 %<all,t1>\DeclareUnicodeCharacter{201A}{\quotesinglbase} % SINGLE LOW-9 QUOTATION MARK
1187 %<*all,x2,t2c,t2b,t2a,t1,ot2,ot1,ly1,lcy>
1188 \DeclareUnicodeCharacter{201C}{\textquotedblleft} % LEFT DOUBLE QUOTATION MARK
1189 \DeclareUnicodeCharacter{201D}{\textquotedblright} % RIGHT DOUBLE QUOTATION MARK
1190 %</all,x2,t2c,t2b,t2a,t1,ot2,ot1,ly1,lcy>
1191 %<all,x2,t2c,t2b,t2a,t1,lcy>\DeclareUnicodeCharacter{201E}{\quotedblbase} % DOUBLE LOW-9 QUOTATION MARK
1192 %<all,ts1,oms,ly1>\DeclareUnicodeCharacter{2020}{\textdagger} % DAGGER
1193 %<all,ts1,oms,ly1>\DeclareUnicodeCharacter{2021}{\textdaggerdbl} % DOUBLE DAGGER
1194 %<all,ts1,oms,ly1>\DeclareUnicodeCharacter{2022}{\textbullet} % BULLET
1195 %<all,ly1,utf8>\DeclareUnicodeCharacter{2026}{\textellipsis} % HORIZONTAL ELLIPSIS
1196 %<*all,x2,ts1,t2c,t2b,t2a,t1,ly1>
1197 \DeclareUnicodeCharacter{2030}{\textperthousand} % PER MILLE SIGN
1198 %</all,x2,ts1,t2c,t2b,t2a,t1,ly1>
1199 %<*all,x2,ts1,t2c,t2b,t2a,t1>
1200 \DeclareUnicodeCharacter{2031}{\textpertenthousand} % PER TEN THOUSAND SIGN
1201 %</all,x2,ts1,t2c,t2b,t2a,t1>
1202 %<all,t1,ly1>\DeclareUnicodeCharacter{2039}{\guilsinglleft} % SINGLE LEFT-POINTING ANGLE QUOTATION MARK
1203 %<all,t1,ly1>\DeclareUnicodeCharacter{203A}{\guilsinglright} % SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
1204 %<all,ts1>\DeclareUnicodeCharacter{203B}{\textreferencemark} % REFERENCE MARK
1205 %<all,ts1>\DeclareUnicodeCharacter{203D}{\textinterrobang} % INTERROBANG
1206 %<all,ts1>\DeclareUnicodeCharacter{2044}{\textfractionsolidus} % FRACTION SLASH
1207 %<all,ts1>\DeclareUnicodeCharacter{204E}{\textasteriskcentered} % LOW ASTERISK
1208 %<all,ts1>\DeclareUnicodeCharacter{2052}{\textdiscount} % COMMERCIAL MINUS SIGN
1209 %<all,ts1>\DeclareUnicodeCharacter{20A1}{\textcolonmonetary} % COLON SIGN
1210 %<all,ts1>\DeclareUnicodeCharacter{20A4}{\textlira} % LIRA SIGN
1211 %<all,ts1>\DeclareUnicodeCharacter{20A6}{\textnaira} % NAIRA SIGN
1212 %<all,ts1>\DeclareUnicodeCharacter{20A9}{\textwon} % WON SIGN
1213 %<all,ts1>\DeclareUnicodeCharacter{20AB}{\textdong} % DONG SIGN
1214 %<all,ts1>\DeclareUnicodeCharacter{20AC}{\texteuro} % EURO SIGN
1215 %<all,ts1>\DeclareUnicodeCharacter{20B1}{\textpeso} % PESO SIGN
1216 %<all,ts1>\DeclareUnicodeCharacter{2103}{\textcelsius} % DEGREE CELSIUS
1217 %<all,x2,ts1,t2c,t2b,t2a,ot2,lcy>\DeclareUnicodeCharacter{2116}{\textnumero} % NUMERO SIGN
1218 %<all,ts1>\DeclareUnicodeCharacter{2117}{\textcircledP} % SOUND RECORDING COPYRIGHT
1219 %<all,ts1>\DeclareUnicodeCharacter{211E}{\textrecipe} % PRESCRIPTION TAKE
1220 %<all,ts1>\DeclareUnicodeCharacter{2120}{\textservicemark} % SERVICE MARK
1221 %<all,ts1,ly1,utf8>\DeclareUnicodeCharacter{2122}{\texttrademark} % TRADE MARK SIGN
1222 %<all,ts1>\DeclareUnicodeCharacter{2126}{\textohm} % OHM SIGN
1223 %<all,ts1>\DeclareUnicodeCharacter{2127}{\textmho} % INVERTED OHM SIGN
1224 %<all,ts1>\DeclareUnicodeCharacter{212E}{\textestimated} % ESTIMATED SYMBOL
1225 %<all,ts1>\DeclareUnicodeCharacter{2190}{\textleftarrow} % LEFTWARDS ARROW
1226 %<all,ts1>\DeclareUnicodeCharacter{2191}{\textuparrow} % UPWARDS ARROW
1227 %<all,ts1>\DeclareUnicodeCharacter{2192}{\textrightarrow} % RIGHTWARDS ARROW
1228 %<all,ts1>\DeclareUnicodeCharacter{2193}{\textdownarrow} % DOWNWARDS ARROW
1229 %<all,x2,ts1,t2c,t2b,t2a>\DeclareUnicodeCharacter{2329}{\textlangle} % LEFT-POINTING ANGLE BRACKET
1230 %<all,x2,ts1,t2c,t2b,t2a>\DeclareUnicodeCharacter{232A}{\textrangle} % RIGHT-POINTING ANGLE BRACKET
1231 %<all,ts1>\DeclareUnicodeCharacter{2422}{\textblank} % BLANK SYMBOL
1232 %<all,x2,t2c,t2b,t2a,t1,utf8>\DeclareUnicodeCharacter{2423}{\textvisiblespace} % OPEN BOX
1233 %<all,ts1>\DeclareUnicodeCharacter{25E6}{\textopenbullet} % WHITE BULLET
1234 %<all,ts1>\DeclareUnicodeCharacter{25EF}{\textbigcircle} % LARGE CIRCLE
1235 %<all,ts1>\DeclareUnicodeCharacter{266A}{\textmusicalnote} % EIGHTH NOTE
1236 %    \end{macrocode}
1237 %
1238 % \subsection{Notes}
1239 %
1240 % \changes{v1.1e}{2004/05/22}{Added notes on inconsistency with `8-bit files'.}
1241 % The following inputs are inconsistent with the 8-bit inputenc files
1242 % since they will always only produce the `text character'.  This is an
1243 % area where inputenc is notoriously confused.
1244 % \begin{verbatim}
1245 % %<all,ts1,t1,ot1,ly1>\DeclareUnicodeCharacter{00A3}{\textsterling}
1246 % %<*all,x2,ts1,t2c,t2b,t2a,oms,ly1>
1247 % \DeclareUnicodeCharacter{00A7}{\textsection}
1248 % %</all,x2,ts1,t2c,t2b,t2a,oms,ly1>
1249 % %<all,ts1,utf8>\DeclareUnicodeCharacter{00A9}{\textcopyright}
1250 % %<all,ts1>\DeclareUnicodeCharacter{00B1}{\textpm}
1251 % %<all,ts1,oms,ly1>\DeclareUnicodeCharacter{00B6}{\textparagraph}
1252 % %<all,ts1,oms,ly1>\DeclareUnicodeCharacter{2020}{\textdagger}
1253 % %<all,ts1,oms,ly1>\DeclareUnicodeCharacter{2021}{\textdaggerdbl}
1254 % %<all,ly1,utf8>\DeclareUnicodeCharacter{2026}{\textellipsis}
1255 % \end{verbatim}
1256 %
1257 %    The following definitions are in an encoding file but have no
1258 %    direct equivalent in Unicode, or they simply do not make sense in that
1259 %    context (or we have not yet found anything or \ldots :-).  For
1260 %    example, the non-combining accent characters are certainly
1261 %    available somewhere but these are not equivalent to a \TeX{}
1262 %    accent command.
1263 %\begin{verbatim}
1264 %\DeclareTextSymbol{\j}{OT1}{17}
1265 %\DeclareTextSymbol{\SS}{T1}{223}
1266 %\DeclareTextSymbol{\textcompwordmark}{T1}{23}
1267 %
1268 %\DeclareTextAccent{\"}{OT1}{127}
1269 %\DeclareTextAccent{\'}{OT1}{19}
1270 %\DeclareTextAccent{\.}{OT1}{95}
1271 %\DeclareTextAccent{\=}{OT1}{22}
1272 %\DeclareTextAccent{\H}{OT1}{125}
1273 %\DeclareTextAccent{\^}{OT1}{94}
1274 %\DeclareTextAccent{\`}{OT1}{18}
1275 %\DeclareTextAccent{\r}{OT1}{23}
1276 %\DeclareTextAccent{\u}{OT1}{21}
1277 %\DeclareTextAccent{\v}{OT1}{20}
1278 %\DeclareTextAccent{\~}{OT1}{126}
1279 %\DeclareTextCommand{\b}{OT1}[1]
1280 %\DeclareTextCommand{\c}{OT1}[1]
1281 %\DeclareTextCommand{\d}{OT1}[1]
1282 %\DeclareTextCommand{\k}{T1}[1]
1283 %\end{verbatim}
1284 %
1285 %
1286 %
1287 % \subsection{Mappings for OT1 glyphs}
1288 %
1289 %    This is even more incomplete as again it covers only the single
1290 %    glyphs from |OT1| plus some that have been explicitly defined for
1291 %    this encoding. Everything that is provided in |T1|, and that
1292 %    could be provided as composite glyphs via |OT1|, could and
1293 %    probably should be set up as well.  Which leaves the many things
1294 %    that are not provided in |T1| but can be provided in |OT1| (and
1295 %    in |T1|) by composite glyphs.
1296 %
1297 %    \begin{macrocode}
1298 %    \end{macrocode}
1299 % Stuff not mapped (note that |\j| ($\jmath$) is not equivalent to any
1300 % Unicode character):
1301 %\begin{verbatim}
1302 %\DeclareTextSymbol{\j}{OT1}{17}
1303 %\DeclareTextAccent{\"}{OT1}{127}
1304 %\DeclareTextAccent{\'}{OT1}{19}
1305 %\DeclareTextAccent{\.}{OT1}{95}
1306 %\DeclareTextAccent{\=}{OT1}{22}
1307 %\DeclareTextAccent{\^}{OT1}{94}
1308 %\DeclareTextAccent{\`}{OT1}{18}
1309 %\DeclareTextAccent{\~}{OT1}{126}
1310 %\DeclareTextAccent{\H}{OT1}{125}
1311 %\DeclareTextAccent{\u}{OT1}{21}
1312 %\DeclareTextAccent{\v}{OT1}{20}
1313 %\DeclareTextAccent{\r}{OT1}{23}
1314 %\DeclareTextCommand{\b}{OT1}[1]
1315 %\DeclareTextCommand{\c}{OT1}[1]
1316 %\DeclareTextCommand{\d}{OT1}[1]
1317 %\end{verbatim}
1318 %
1319 %
1320 %
1321 % \subsection{Mappings for OMS glyphs}
1322 %
1323 % Characters like |\textbackslash| are not mapped as they are (primarily)
1324 %    only in the 7-bit range (the lower 128 code points), and the code here
1325 %    only sets up mappings for UTF-8 characters that are at least 2 octets long.
1326 %\begin{verbatim}
1327 %\DeclareTextSymbol{\textbackslash}{OMS}{110}        % "6E
1328 %\DeclareTextSymbol{\textbar}{OMS}{106}              % "6A
1329 %\DeclareTextSymbol{\textbraceleft}{OMS}{102}        % "66
1330 %\DeclareTextSymbol{\textbraceright}{OMS}{103}       % "67
1331 %\end{verbatim}
1332 %
1333 % But the following (and some others) might actually lurk in Unicode
1334 %    somewhere\ldots
1335 %\begin{verbatim}
1336 %\DeclareTextSymbol{\textasteriskcentered}{OMS}{3}   % "03
1337 %\DeclareTextCommand{\textcircled}{OMS}
1338 %\end{verbatim}
1339 %
1340 %
1341 %
1342 %
1343 % \subsection{Mappings for TS1 glyphs}
1344 %
1345 % Exercise for somebody else.
1346 %
1347 %
1348 % \subsection{Mappings for \texttt{latex.ltx} glyphs}
1349 %
1350 % There is also a collection of characters already set up in the kernel,
1351 % one way or the other. Since these do not clearly relate to any
1352 %    particular font encoding they are mapped when the
1353 % \texttt{utf8} support is first set up.
1354 %
1355 % Also there are a number of |\providecommand|s in the various input
1356 % encoding files which may or may not go into this part.
1357 % \changes{v1.1b}{2004/02/09}{Added commands already defined in the kernel}
1358 %    \begin{macrocode}
1359 %<*utf8>
1360 % This space is intentionally empty ...
1361 %</utf8>
1362 %    \end{macrocode}
1363 %
1364 %
1365 % \section{A test document}
1366 %
1367 %    Here is a very small test document, which may or may not survive
1368 %    intact if the current document is transferred from one place to
1369 %    another.
1370 %    \begin{macrocode}
1371 %<*test>
1372 \documentclass{article}
1373
1374 \usepackage[latin1,utf8]{inputenc}  % load both so \inputencoding can switch below; text starts out as utf8
1375 \usepackage[T1]{fontenc}  % T1 output encoding; it does not provide \textcurrency (see below)
1376 \usepackage{trace}  % NOTE(review): loaded, but nothing from it is invoked below
1377
1378 \scrollmode  % do not stop at the (intentional) errors triggered below
1379
1380 \begin{document}
1381
1382  German umlauts in UTF-8: ^^c3^^a4^^c3^^b6^^c3^^bc  %%% äöü
1383
1384 \inputencoding{latin1}  % switch to latin1
1385
1386  German umlauts in UTF-8 but read by latin1 (and will produce one
1387  error since \verb=\textcurrency= is not provided):
1388  ^^c3^^a4^^c3^^b6^^c3^^bc  % same six bytes, now six single-byte chars; ^^a4 is the currency sign
1389
1390 \inputencoding{utf8}    % switch back to utf8
1391
1392  German umlauts in UTF-8: ^^c3^^a4^^c3^^b6^^c3^^bc
1393
1394
1395 Some codes that should produce errors as nothing is set up
1396 for them: ^^c3F ^^e1^^a4^^b6
1397
1398 And some that are not legal utf8 sequences: ^^c3X ^^e1XY
1399
1400 \showoutput  % log the contents of the shipped-out page
1401 \tracingstats=2  % report memory usage statistics in the log
1402 \stop  % end the run here; no \end{document} follows
1403 %</test>
1404 %    \end{macrocode}
1405 %
1406 % \Finale
1407 %
1408 \endinput