gnu/dist/gettext/gettext-tools/src/po-lex.c

   1 /* GNU gettext - internationalization aids
   2    Copyright (C) 1995-1999, 2000-2004 Free Software Foundation, Inc.
   3
   4    This file was written by Peter Miller <millerp@canb.auug.org.au>.
   5    Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program; if not, write to the Free Software Foundation,
  19    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  20
  21
  22 #ifdef HAVE_CONFIG_H
  23 # include "config.h"
  24 #endif
  25
  26 /* Specification.  */
  27 #include "po-lex.h"
  28
  29 #include <errno.h>
  30 #include <limits.h>
  31 #include <stdio.h>
  32 #include <stdlib.h>
  33 #include <string.h>
  34 #include <stdarg.h>
  35
  36 #if HAVE_ICONV
  37 # include <iconv.h>
  38 #endif
  39
  40 #include "c-ctype.h"
  41 #include "linebreak.h"
  42 #include "vasprintf.h"
  43 #include "gettext.h"
  44 #include "po-charset.h"
  45 #include "xalloc.h"
  46 #include "exit.h"
  47 #include "error.h"
  48 #include "error-progname.h"
  49 #include "pos.h"
  50 #include "str-list.h"
  51 #include "po-gram-gen2.h"
  52
  53 #define _(str) gettext(str)
  54
  55 #if HAVE_ICONV
  56 # include "utf8-ucs4.h"
  57 #endif
  58
  59 #if HAVE_DECL_GETC_UNLOCKED
  60 # undef getc
  61 # define getc getc_unlocked
  62 #endif
  63
  64
  65 /* Current position within the PO file.  */
  66 lex_pos_ty gram_pos;
  67 int gram_pos_column;
  68
  69
  70 /* Error handling during the parsing of a PO file.
  71    These functions can access gram_pos and gram_pos_column.  */
  72
  73 #if !(__STDC__ && \
  74       ((defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L && !defined __DECC) \
  75        || (defined __GNUC__ && __GNUC__ >= 2 && !defined __APPLE_CC__)))
  76
  77 /* CAUTION: If you change this function, you must also make identical
  78    changes to the macro of the same name in src/po-lex.h  */
  79
  80 /* VARARGS1 */
  81 void
  82 po_gram_error (const char *fmt, ...)
  83 {
  84   va_list ap;
  85   char *buffer;
  86
  87   va_start (ap, fmt);
  88   if (vasprintf (&buffer, fmt, ap) < 0)
  89     error (EXIT_FAILURE, 0, _("memory exhausted"));
  90   va_end (ap);
  91   error_with_progname = false;
  92   po_error (0, 0, "%s:%lu:%d: %s", gram_pos.file_name,
  93             (unsigned long) gram_pos.line_number, gram_pos_column + 1, buffer);
  94   error_with_progname = true;
  95   free (buffer);
  96
  97   /* Some messages need more than one line.  Continuation lines are
  98      indicated by using "..." at the start of the string.  We don't
  99      increment the error counter for these continuation lines.  */
 100   if (*fmt == '.')
 101     --error_message_count;
 102   else if (error_message_count >= gram_max_allowed_errors)
 103     po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
 104 }
 105
 106 /* CAUTION: If you change this function, you must also make identical
 107    changes to the macro of the same name in src/po-lex.h  */
 108
 109 /* VARARGS2 */
 110 void
 111 po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...)
 112 {
 113   va_list ap;
 114   char *buffer;
 115
 116   va_start (ap, fmt);
 117   if (vasprintf (&buffer, fmt, ap) < 0)
 118     error (EXIT_FAILURE, 0, _("memory exhausted"));
 119   va_end (ap);
 120   error_with_progname = false;
 121   po_error_at_line (0, 0, pp->file_name, pp->line_number, "%s", buffer);
 122   error_with_progname = true;
 123   free (buffer);
 124
 125   /* Some messages need more than one line, or more than one location.
 126      Continuation lines are indicated by using "..." at the start of the
 127      string.  We don't increment the error counter for these
 128      continuation lines.  */
 129   if (*fmt == '.')
 130     --error_message_count;
 131   else if (error_message_count >= gram_max_allowed_errors)
 132     po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
 133 }
 134
 135 #endif
 136
 137
 138 /* The lowest level of PO file parsing converts bytes to multibyte characters.
 139    This is needed
 140    1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
 141       translation phase maps bytes to characters.
 142    2. to keep track of the current column, for the sake of precise error
 143       location. Emacs compile.el interprets the column in error messages
 144       by default as a screen column number, not as character number.
 145    3. to avoid skipping backslash-newline in the midst of a multibyte
 146       character. If XY is a multibyte character,  X \ newline Y  is invalid.
 147  */
 148
 149 /* Multibyte character data type.  */
 150 /* Note this depends on po_lex_charset and po_lex_iconv, which get set
 151    while the file is being parsed.  */
 152
 153 #define MBCHAR_BUF_SIZE 24
 154
/* One multibyte character as read from the input: its raw bytes and,
   when built with HAVE_ICONV and uc_valid is true, its Unicode value.  */
struct mbchar
{
  size_t bytes;         /* number of bytes of current character, > 0;
                           0 is used to represent EOF (see mb_iseof) */
#if HAVE_ICONV
  bool uc_valid;        /* true if uc is a valid Unicode character */
  unsigned int uc;      /* if uc_valid: the current character */
#endif
  char buf[MBCHAR_BUF_SIZE]; /* room for the bytes; only the first
                                'bytes' entries are meaningful */
};
 164
 165 /* We want to pass multibyte characters by reference automatically,
 166    therefore we use an array type.  */
 167 typedef struct mbchar mbchar_t[1];
 168
 169 /* A version of memcpy optimized for the case n <= 1.  */
 170 static inline void
 171 memcpy_small (void *dst, const void *src, size_t n)
 172 {
 173   if (n > 0)
 174     {
 175       char *q = (char *) dst;
 176       const char *p = (const char *) src;
 177
 178       *q = *p;
 179       if (--n > 0)
 180         do *++q = *++p; while (--n > 0);
 181     }
 182 }
 183
 184 /* EOF (not a real character) is represented with bytes = 0 and
 185    uc_valid = false.  */
 186 static inline bool
 187 mb_iseof (const mbchar_t mbc)
 188 {
 189   return (mbc->bytes == 0);
 190 }
 191
 192 /* Access the current character.  */
 193 static inline const char *
 194 mb_ptr (const mbchar_t mbc)
 195 {
 196   return mbc->buf;
 197 }
 198 static inline size_t
 199 mb_len (const mbchar_t mbc)
 200 {
 201   return mbc->bytes;
 202 }
 203
 204 /* Comparison of characters.  */
 205
 206 static inline bool
 207 mb_iseq (const mbchar_t mbc, char sc)
 208 {
 209   /* Note: It is wrong to compare only mbc->uc, because when the encoding is
 210      SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
 211      want to treat it as an escape character, although it looks like a Yen
 212      sign.  */
 213 #if HAVE_ICONV && 0
 214   if (mbc->uc_valid)
 215     return (mbc->uc == sc); /* wrong! */
 216   else
 217 #endif
 218     return (mbc->bytes == 1 && mbc->buf[0] == sc);
 219 }
 220
 221 static inline bool
 222 mb_isnul (const mbchar_t mbc)
 223 {
 224 #if HAVE_ICONV
 225   if (mbc->uc_valid)
 226     return (mbc->uc == 0);
 227   else
 228 #endif
 229     return (mbc->bytes == 1 && mbc->buf[0] == 0);
 230 }
 231
 232 static inline int
 233 mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
 234 {
 235 #if HAVE_ICONV
 236   if (mbc1->uc_valid && mbc2->uc_valid)
 237     return (int) mbc1->uc - (int) mbc2->uc;
 238   else
 239 #endif
 240     return (mbc1->bytes == mbc2->bytes
 241             ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes)
 242             : mbc1->bytes < mbc2->bytes
 243               ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1)
 244               : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1));
 245 }
 246
 247 static inline bool
 248 mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
 249 {
 250 #if HAVE_ICONV
 251   if (mbc1->uc_valid && mbc2->uc_valid)
 252     return mbc1->uc == mbc2->uc;
 253   else
 254 #endif
 255     return (mbc1->bytes == mbc2->bytes
 256             && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0);
 257 }
 258
 259 /* <ctype.h>, <wctype.h> classification.  */
 260
 261 static inline bool
 262 mb_isascii (const mbchar_t mbc)
 263 {
 264 #if HAVE_ICONV
 265   if (mbc->uc_valid)
 266     return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F);
 267   else
 268 #endif
 269     return mbc->bytes == 1 && (mbc->buf[0] & 0x80) == 0;
 270 }
 271
 272 /* Extra <wchar.h> function.  */
 273
 274 /* Unprintable characters appear as a small box of width 1.  */
 275 #define MB_UNPRINTABLE_WIDTH 1
 276
 277 static int
 278 mb_width (const mbchar_t mbc)
 279 {
 280 #if HAVE_ICONV
 281   if (mbc->uc_valid)
 282     {
 283       unsigned int uc = mbc->uc;
 284       const char *encoding =
 285         (po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : "");
 286       int w = uc_width (uc, encoding);
 287       /* For unprintable characters, arbitrarily return 0 for control
 288          characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise.  */
 289       if (w >= 0)
 290         return w;
 291       if (uc >= 0x0000 && uc <= 0x001F)
 292         {
 293           if (uc == 0x0009)
 294             return 8 - (gram_pos_column & 7);
 295           return 0;
 296         }
 297       if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029))
 298         return 0;
 299       return MB_UNPRINTABLE_WIDTH;
 300     }
 301   else
 302 #endif
 303     {
 304       if (mbc->bytes == 1)
 305         {
 306           if ((mbc->buf[0] & 0x80) == 0 && mbc->buf[0] <= 0x1F)
 307             {
 308               if (mbc->buf[0] == 0x09)
 309                 return 8 - (gram_pos_column & 7);
 310               return 0;
 311             }
 312           if (mbc->buf[0] == 0x7F)
 313             return 0;
 314         }
 315       return MB_UNPRINTABLE_WIDTH;
 316     }
 317 }
 318
 319 /* Output.  */
 320 static inline void
 321 mb_putc (const mbchar_t mbc, FILE *stream)
 322 {
 323   fwrite (mbc->buf, 1, mbc->bytes, stream);
 324 }
 325
 326 /* Assignment.  */
 327 static inline void
 328 mb_setascii (mbchar_t mbc, char sc)
 329 {
 330   mbc->bytes = 1;
 331 #if HAVE_ICONV
 332   mbc->uc_valid = 1;
 333   mbc->uc = sc;
 334 #endif
 335   mbc->buf[0] = sc;
 336 }
 337
 338 /* Copying a character.  */
 339 static inline void
 340 mb_copy (mbchar_t new, const mbchar_t old)
 341 {
 342   memcpy_small (&new->buf[0], &old->buf[0], old->bytes);
 343   new->bytes = old->bytes;
 344 #if HAVE_ICONV
 345   if ((new->uc_valid = old->uc_valid))
 346     new->uc = old->uc;
 347 #endif
 348 }
 349
 350
 351 /* Multibyte character input.  */
 352
 353 /* Number of characters that can be pushed back.
 354    We need 1 for lex_getc, plus 1 for lex_ungetc.  */
 355 #define NPUSHBACK 2
 356
/* Data type of a multibyte character input stream.
   It wraps a stdio stream and adds a small buffer of bytes that were
   read but not yet returned as a character, plus a stack of characters
   that were pushed back.  */
struct mbfile
{
  FILE *fp;                     /* underlying stdio stream */
  bool eof_seen;                /* set once getc (fp) has returned EOF */
  int have_pushback;            /* number of valid entries in pushback[] */
  unsigned int bufcount;        /* number of bytes currently held in buf[] */
  char buf[MBCHAR_BUF_SIZE];    /* bytes read from fp, not yet returned */
  struct mbchar pushback[NPUSHBACK]; /* pushed-back characters; the one
                                        pushed last is returned first */
};
 367
 368 /* We want to pass multibyte streams by reference automatically,
 369    therefore we use an array type.  */
 370 typedef struct mbfile mbfile_t[1];
 371
 372 /* Whether invalid multibyte sequences in the input shall be signalled
 373    or silently tolerated.  */
 374 static bool signal_eilseq;
 375
 376 static inline void
 377 mbfile_init (mbfile_t mbf, FILE *stream)
 378 {
 379   mbf->fp = stream;
 380   mbf->eof_seen = false;
 381   mbf->have_pushback = 0;
 382   mbf->bufcount = 0;
 383 }
 384
 385 /* Read the next multibyte character from mbf and put it into mbc.
 386    If a read error occurs, errno is set and ferror (mbf->fp) becomes true.  */
 387 static void
 388 mbfile_getc (mbchar_t mbc, mbfile_t mbf)
 389 {
 390   size_t bytes;
 391
 392   /* If EOF has already been seen, don't use getc.  This matters if
 393      mbf->fp is connected to an interactive tty.  */
 394   if (mbf->eof_seen)
 395     goto eof;
 396
 397   /* Return character pushed back, if there is one.  */
 398   if (mbf->have_pushback > 0)
 399     {
 400       mbf->have_pushback--;
 401       mb_copy (mbc, &mbf->pushback[mbf->have_pushback]);
 402       return;
 403     }
 404
 405   /* Before using iconv, we need at least one byte.  */
 406   if (mbf->bufcount == 0)
 407     {
 408       int c = getc (mbf->fp);
 409       if (c == EOF)
 410         {
 411           mbf->eof_seen = true;
 412           goto eof;
 413         }
 414       mbf->buf[0] = (unsigned char) c;
 415       mbf->bufcount++;
 416     }
 417
 418 #if HAVE_ICONV
 419   if (po_lex_iconv != (iconv_t)(-1))
 420     {
 421       /* Use iconv on an increasing number of bytes.  Read only as many
 422          bytes from mbf->fp as needed.  This is needed to give reasonable
 423          interactive behaviour when mbf->fp is connected to an interactive
 424          tty.  */
 425       for (;;)
 426         {
 427           unsigned char scratchbuf[64];
 428           const char *inptr = &mbf->buf[0];
 429           size_t insize = mbf->bufcount;
 430           char *outptr = (char *) &scratchbuf[0];
 431           size_t outsize = sizeof (scratchbuf);
 432
 433           size_t res = iconv (po_lex_iconv,
 434                               (ICONV_CONST char **) &inptr, &insize,
 435                               &outptr, &outsize);
 436           /* We expect that a character has been produced if and only if
 437              some input bytes have been consumed.  */
 438           if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
 439             abort ();
 440           if (outsize == sizeof (scratchbuf))
 441             {
 442               /* No character has been produced.  Must be an error.  */
 443               if (res != (size_t)(-1))
 444                 abort ();
 445
 446               if (errno == EILSEQ)
 447                 {
 448                   /* An invalid multibyte sequence was encountered.  */
 449                   /* Return a single byte.  */
 450                   if (signal_eilseq)
 451                     po_gram_error (_("invalid multibyte sequence"));
 452                   bytes = 1;
 453                   mbc->uc_valid = false;
 454                   break;
 455                 }
 456               else if (errno == EINVAL)
 457                 {
 458                   /* An incomplete multibyte character.  */
 459                   int c;
 460
 461                   if (mbf->bufcount == MBCHAR_BUF_SIZE)
 462                     {
 463                       /* An overlong incomplete multibyte sequence was
 464                          encountered.  */
 465                       /* Return a single byte.  */
 466                       bytes = 1;
 467                       mbc->uc_valid = false;
 468                       break;
 469                     }
 470
 471                   /* Read one more byte and retry iconv.  */
 472                   c = getc (mbf->fp);
 473                   if (c == EOF)
 474                     {
 475                       mbf->eof_seen = true;
 476                       if (ferror (mbf->fp))
 477                         goto eof;
 478                       if (signal_eilseq)
 479                         po_gram_error (_("\
 480 incomplete multibyte sequence at end of file"));
 481                       bytes = mbf->bufcount;
 482                       mbc->uc_valid = false;
 483                       break;
 484                     }
 485                   mbf->buf[mbf->bufcount++] = (unsigned char) c;
 486                   if (c == '\n')
 487                     {
 488                       if (signal_eilseq)
 489                         po_gram_error (_("\
 490 incomplete multibyte sequence at end of line"));
 491                       bytes = mbf->bufcount - 1;
 492                       mbc->uc_valid = false;
 493                       break;
 494                     }
 495                 }
 496               else
 497                 po_error (EXIT_FAILURE, errno, _("iconv failure"));
 498             }
 499           else
 500             {
 501               size_t outbytes = sizeof (scratchbuf) - outsize;
 502               bytes = mbf->bufcount - insize;
 503
 504               /* We expect that one character has been produced.  */
 505               if (bytes == 0)
 506                 abort ();
 507               if (outbytes == 0)
 508                 abort ();
 509               /* Convert it from UTF-8 to UCS-4.  */
 510               if (u8_mbtouc (&mbc->uc, scratchbuf, outbytes) < outbytes)
 511                 {
 512                   /* scratchbuf contains an out-of-range Unicode character
 513                      (> 0x10ffff).  */
 514                   if (signal_eilseq)
 515                     po_gram_error (_("invalid multibyte sequence"));
 516                   mbc->uc_valid = false;
 517                   break;
 518                 }
 519               mbc->uc_valid = true;
 520               break;
 521             }
 522         }
 523     }
 524   else
 525 #endif
 526     {
 527       if (po_lex_weird_cjk
 528           /* Special handling of encodings with CJK structure.  */
 529           && (unsigned char) mbf->buf[0] >= 0x80)
 530         {
 531           if (mbf->bufcount == 1)
 532             {
 533               /* Read one more byte.  */
 534               int c = getc (mbf->fp);
 535               if (c == EOF)
 536                 {
 537                   if (ferror (mbf->fp))
 538                     {
 539                       mbf->eof_seen = true;
 540                       goto eof;
 541                     }
 542                 }
 543               else
 544                 {
 545                   mbf->buf[1] = (unsigned char) c;
 546                   mbf->bufcount++;
 547                 }
 548             }
 549           if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
 550             /* Return a double byte.  */
 551             bytes = 2;
 552           else
 553             /* Return a single byte.  */
 554             bytes = 1;
 555         }
 556       else
 557         {
 558           /* Return a single byte.  */
 559           bytes = 1;
 560         }
 561 #if HAVE_ICONV
 562       mbc->uc_valid = false;
 563 #endif
 564     }
 565
 566   /* Return the multibyte sequence mbf->buf[0..bytes-1].  */
 567   memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
 568   mbc->bytes = bytes;
 569
 570   mbf->bufcount -= bytes;
 571   if (mbf->bufcount > 0)
 572     {
 573       /* It's not worth calling memmove() for so few bytes.  */
 574       unsigned int count = mbf->bufcount;
 575       char *p = &mbf->buf[0];
 576
 577       do
 578         {
 579           *p = *(p + bytes);
 580           p++;
 581         }
 582       while (--count > 0);
 583     }
 584   return;
 585
 586 eof:
 587   /* An mbchar_t with bytes == 0 is used to indicate EOF.  */
 588   mbc->bytes = 0;
 589 #if HAVE_ICONV
 590   mbc->uc_valid = false;
 591 #endif
 592   return;
 593 }
 594
 595 static void
 596 mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
 597 {
 598   if (mbf->have_pushback >= NPUSHBACK)
 599     abort ();
 600   mb_copy (&mbf->pushback[mbf->have_pushback], mbc);
 601   mbf->have_pushback++;
 602 }
 603
 604
 605 /* Lexer variables.  */
 606
 607 static mbfile_t mbf;
 608 unsigned int gram_max_allowed_errors = 20;
 609 static bool po_lex_obsolete;
 610 static bool pass_comments = false;
 611 bool pass_obsolete_entries = false;
 612
 613
 614 /* Prepare lexical analysis.  */
 615 void
 616 lex_start (FILE *fp, const char *real_filename, const char *logical_filename)
 617 {
 618   /* Ignore the logical_filename, because PO file entries already have
 619      their file names attached.  But use real_filename for error messages.  */
 620   gram_pos.file_name = xstrdup (real_filename);
 621
 622   mbfile_init (mbf, fp);
 623
 624   gram_pos.line_number = 1;
 625   gram_pos_column = 0;
 626   signal_eilseq = true;
 627   po_lex_obsolete = false;
 628   po_lex_charset_init ();
 629 }
 630
 631 /* Terminate lexical analysis.  */
 632 void
 633 lex_end ()
 634 {
 635   mbf->fp = NULL;
 636   gram_pos.file_name = NULL;
 637   gram_pos.line_number = 0;
 638   gram_pos_column = 0;
 639   signal_eilseq = false;
 640   po_lex_obsolete = false;
 641   po_lex_charset_close ();
 642 }
 643
 644
 645 /* Read a single character, dealing with backslash-newline.
 646    Also keep track of the current line number and column number.  */
 647 static void
 648 lex_getc (mbchar_t mbc)
 649 {
 650   for (;;)
 651     {
 652       mbfile_getc (mbc, mbf);
 653
 654       if (mb_iseof (mbc))
 655         {
 656           if (ferror (mbf->fp))
 657             {
 658             bomb:
 659               po_error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
 660                         gram_pos.file_name);
 661             }
 662           break;
 663         }
 664
 665       if (mb_iseq (mbc, '\n'))
 666         {
 667           gram_pos.line_number++;
 668           gram_pos_column = 0;
 669           break;
 670         }
 671
 672       gram_pos_column += mb_width (mbc);
 673
 674       if (mb_iseq (mbc, '\\'))
 675         {
 676           mbchar_t mbc2;
 677
 678           mbfile_getc (mbc2, mbf);
 679
 680           if (mb_iseof (mbc2))
 681             {
 682               if (ferror (mbf->fp))
 683                 goto bomb;
 684               break;
 685             }
 686
 687           if (!mb_iseq (mbc2, '\n'))
 688             {
 689               mbfile_ungetc (mbc2, mbf);
 690               break;
 691             }
 692
 693           gram_pos.line_number++;
 694           gram_pos_column = 0;
 695         }
 696       else
 697         break;
 698     }
 699 }
 700
 701
 702 static void
 703 lex_ungetc (const mbchar_t mbc)
 704 {
 705   if (!mb_iseof (mbc))
 706     {
 707       if (mb_iseq (mbc, '\n'))
 708         /* Decrement the line number, but don't care about the column.  */
 709         gram_pos.line_number--;
 710       else
 711         /* Decrement the column number.  Also works well enough for tabs.  */
 712         gram_pos_column -= mb_width (mbc);
 713
 714       mbfile_ungetc (mbc, mbf);
 715     }
 716 }
 717
 718
 719 static int
 720 keyword_p (const char *s)
 721 {
 722   if (!strcmp (s, "domain"))
 723     return DOMAIN;
 724   if (!strcmp (s, "msgid"))
 725     return MSGID;
 726   if (!strcmp (s, "msgid_plural"))
 727     return MSGID_PLURAL;
 728   if (!strcmp (s, "msgstr"))
 729     return MSGSTR;
 730   po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s);
 731   return NAME;
 732 }
 733
 734
 735 static int
 736 control_sequence ()
 737 {
 738   mbchar_t mbc;
 739   int val;
 740   int max;
 741
 742   lex_getc (mbc);
 743   if (mb_len (mbc) == 1)
 744     switch (mb_ptr (mbc) [0])
 745       {
 746       case 'n':
 747         return '\n';
 748
 749       case 't':
 750         return '\t';
 751
 752       case 'b':
 753         return '\b';
 754
 755       case 'r':
 756         return '\r';
 757
 758       case 'f':
 759         return '\f';
 760
 761       case 'v':
 762         return '\v';
 763
 764       case 'a':
 765         return '\a';
 766
 767       case '\\':
 768       case '"':
 769         return mb_ptr (mbc) [0];
 770
 771       case '0': case '1': case '2': case '3':
 772       case '4': case '5': case '6': case '7':
 773         val = 0;
 774         max = 0;
 775         for (;;)
 776           {
 777             char c = mb_ptr (mbc) [0];
 778             /* Warning: not portable, can't depend on '0'..'7' ordering.  */
 779             val = val * 8 + (c - '0');
 780             if (++max == 3)
 781               break;
 782             lex_getc (mbc);
 783             if (mb_len (mbc) == 1)
 784               switch (mb_ptr (mbc) [0])
 785                 {
 786                 case '0': case '1': case '2': case '3':
 787                 case '4': case '5': case '6': case '7':
 788                   continue;
 789
 790                 default:
 791                   break;
 792                 }
 793             lex_ungetc (mbc);
 794             break;
 795           }
 796         return val;
 797
 798       case 'x':
 799         lex_getc (mbc);
 800         if (mb_iseof (mbc) || mb_len (mbc) != 1
 801             || !c_isxdigit (mb_ptr (mbc) [0]))
 802           break;
 803
 804         val = 0;
 805         for (;;)
 806           {
 807             char c = mb_ptr (mbc) [0];
 808             val *= 16;
 809             if (c_isdigit (c))
 810               /* Warning: not portable, can't depend on '0'..'9' ordering */
 811               val += c - '0';
 812             else if (c_isupper (c))
 813               /* Warning: not portable, can't depend on 'A'..'F' ordering */
 814               val += c - 'A' + 10;
 815             else
 816               /* Warning: not portable, can't depend on 'a'..'f' ordering */
 817               val += c - 'a' + 10;
 818
 819             lex_getc (mbc);
 820             if (mb_len (mbc) == 1)
 821               switch (mb_ptr (mbc) [0])
 822                 {
 823                 case '0': case '1': case '2': case '3': case '4':
 824                 case '5': case '6': case '7': case '8': case '9':
 825                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 826                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 827                   continue;
 828
 829                 default:
 830                   break;
 831                 }
 832             lex_ungetc (mbc);
 833             break;
 834           }
 835         return val;
 836
 837       /* FIXME: \u and \U are not handled.  */
 838       }
 839   lex_ungetc (mbc);
 840   po_gram_error (_("invalid control sequence"));
 841   return ' ';
 842 }
 843
 844
 845 /* Return the next token in the PO file.  The return codes are defined
 846    in "po-gram-gen2.h".  Associated data is put in 'po_gram_lval'.  */
 847 int
 848 po_gram_lex ()
 849 {
 850   static char *buf;
 851   static size_t bufmax;
 852   mbchar_t mbc;
 853   size_t bufpos;
 854
 855   for (;;)
 856     {
 857       lex_getc (mbc);
 858
 859       if (mb_iseof (mbc))
 860         /* Yacc want this for end of file.  */
 861         return 0;
 862
 863       if (mb_len (mbc) == 1)
 864         switch (mb_ptr (mbc) [0])
 865           {
 866           case '\n':
 867             po_lex_obsolete = false;
 868             /* Ignore whitespace, not relevant for the grammar.  */
 869             break;
 870
 871           case ' ':
 872           case '\t':
 873           case '\r':
 874           case '\f':
 875           case '\v':
 876             /* Ignore whitespace, not relevant for the grammar.  */
 877             break;
 878
 879           case '#':
 880             lex_getc (mbc);
 881             if (mb_iseq (mbc, '~'))
 882               /* A pseudo-comment beginning with #~ is found.  This is
 883                  not a comment.  It is the format for obsolete entries.
 884                  We simply discard the "#~" prefix.  The following
 885                  characters are expected to be well formed.  */
 886               {
 887                 po_lex_obsolete = true;
 888                 break;
 889               }
 890
 891             /* Accumulate comments into a buffer.  If we have been asked
 892                to pass comments, generate a COMMENT token, otherwise
 893                discard it.  */
 894             signal_eilseq = false;
 895             if (pass_comments)
 896               {
 897                 bufpos = 0;
 898                 for (;;)
 899                   {
 900                     while (bufpos + mb_len (mbc) >= bufmax)
 901                       {
 902                         bufmax += 100;
 903                         buf = xrealloc (buf, bufmax);
 904                       }
 905                     if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
 906                       break;
 907
 908                     memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
 909                     bufpos += mb_len (mbc);
 910
 911                     lex_getc (mbc);
 912                   }
 913                 buf[bufpos] = '\0';
 914
 915                 po_gram_lval.string.string = buf;
 916                 po_gram_lval.string.pos = gram_pos;
 917                 po_gram_lval.string.obsolete = po_lex_obsolete;
 918                 po_lex_obsolete = false;
 919                 signal_eilseq = true;
 920                 return COMMENT;
 921               }
 922             else
 923               {
 924                 /* We do this in separate loop because collecting large
 925                    comments while they get not passed to the upper layers
 926                    is not very effective.  */
 927                 while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
 928                   lex_getc (mbc);
 929                 po_lex_obsolete = false;
 930                 signal_eilseq = true;
 931               }
 932             break;
 933
 934           case '"':
 935             /* Accumulate a string.  */
 936             bufpos = 0;
 937             for (;;)
 938               {
 939                 lex_getc (mbc);
 940                 while (bufpos + mb_len (mbc) >= bufmax)
 941                   {
 942                     bufmax += 100;
 943                     buf = xrealloc (buf, bufmax);
 944                   }
 945                 if (mb_iseof (mbc))
 946                   {
 947                     po_gram_error_at_line (&gram_pos,
 948                                            _("end-of-file within string"));
 949                     break;
 950                   }
 951                 if (mb_iseq (mbc, '\n'))
 952                   {
 953                     po_gram_error_at_line (&gram_pos,
 954                                            _("end-of-line within string"));
 955                     break;
 956                   }
 957                 if (mb_iseq (mbc, '"'))
 958                   break;
 959                 if (mb_iseq (mbc, '\\'))
 960                   {
 961                     buf[bufpos++] = control_sequence ();
 962                     continue;
 963                   }
 964
 965                 /* Add mbc to the accumulator.  */
 966                 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
 967                 bufpos += mb_len (mbc);
 968               }
 969             buf[bufpos] = '\0';
 970
 971             /* FIXME: Treatment of embedded \000 chars is incorrect.  */
 972             po_gram_lval.string.string = xstrdup (buf);
 973             po_gram_lval.string.pos = gram_pos;
 974             po_gram_lval.string.obsolete = po_lex_obsolete;
 975             return STRING;
 976
 977           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 978           case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 979           case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 980           case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 981           case 'y': case 'z':
 982           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 983           case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 984           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 985           case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 986           case 'Y': case 'Z':
 987           case '_': case '$':
 988             bufpos = 0;
 989             for (;;)
 990               {
 991                 char c = mb_ptr (mbc) [0];
 992                 if (bufpos + 1 >= bufmax)
 993                   {
 994                     bufmax += 100;
 995                     buf = xrealloc (buf, bufmax);
 996                   }
 997                 buf[bufpos++] = c;
 998                 lex_getc (mbc);
 999                 if (mb_len (mbc) == 1)
1000                   switch (mb_ptr (mbc) [0])
1001                     {
1002                     default:
1003                       break;
1004                     case 'a': case 'b': case 'c': case 'd': case 'e':
1005                     case 'f': case 'g': case 'h': case 'i': case 'j':
1006                     case 'k': case 'l': case 'm': case 'n': case 'o':
1007                     case 'p': case 'q': case 'r': case 's': case 't':
1008                     case 'u': case 'v': case 'w': case 'x': case 'y':
1009                     case 'z':
1010                     case 'A': case 'B': case 'C': case 'D': case 'E':
1011                     case 'F': case 'G': case 'H': case 'I': case 'J':
1012                     case 'K': case 'L': case 'M': case 'N': case 'O':
1013                     case 'P': case 'Q': case 'R': case 'S': case 'T':
1014                     case 'U': case 'V': case 'W': case 'X': case 'Y':
1015                     case 'Z':
1016                     case '_': case '$':
1017                     case '0': case '1': case '2': case '3': case '4':
1018                     case '5': case '6': case '7': case '8': case '9':
1019                       continue;
1020                     }
1021                 break;
1022               }
1023             lex_ungetc (mbc);
1024
1025             buf[bufpos] = '\0';
1026
1027             {
1028               int k = keyword_p (buf);
1029               if (k == NAME)
1030                 {
1031                   po_gram_lval.string.string = xstrdup (buf);
1032                   po_gram_lval.string.pos = gram_pos;
1033                   po_gram_lval.string.obsolete = po_lex_obsolete;
1034                 }
1035               else
1036                 {
1037                   po_gram_lval.pos.pos = gram_pos;
1038                   po_gram_lval.pos.obsolete = po_lex_obsolete;
1039                 }
1040               return k;
1041             }
1042
1043           case '0': case '1': case '2': case '3': case '4':
1044           case '5': case '6': case '7': case '8': case '9':
1045             bufpos = 0;
1046             for (;;)
1047               {
1048                 char c = mb_ptr (mbc) [0];
1049                 if (bufpos + 1 >= bufmax)
1050                   {
1051                     bufmax += 100;
1052                     buf = xrealloc (buf, bufmax + 1);
1053                   }
1054                 buf[bufpos++] = c;
1055                 lex_getc (mbc);
1056                 if (mb_len (mbc) == 1)
1057                   switch (mb_ptr (mbc) [0])
1058                     {
1059                     default:
1060                       break;
1061
1062                     case '0': case '1': case '2': case '3': case '4':
1063                     case '5': case '6': case '7': case '8': case '9':
1064                       continue;
1065                     }
1066                 break;
1067               }
1068             lex_ungetc (mbc);
1069
1070             buf[bufpos] = '\0';
1071
1072             po_gram_lval.number.number = atol (buf);
1073             po_gram_lval.number.pos = gram_pos;
1074             po_gram_lval.number.obsolete = po_lex_obsolete;
1075             return NUMBER;
1076
1077           case '[':
1078             po_gram_lval.pos.pos = gram_pos;
1079             po_gram_lval.pos.obsolete = po_lex_obsolete;
1080             return '[';
1081
1082           case ']':
1083             po_gram_lval.pos.pos = gram_pos;
1084             po_gram_lval.pos.obsolete = po_lex_obsolete;
1085             return ']';
1086
1087           default:
1088             /* This will cause a syntax error.  */
1089             return JUNK;
1090           }
1091       else
1092         /* This will cause a syntax error.  */
1093         return JUNK;
1094     }
1095 }
1096
1097
1098 /* po_gram_lex() can return comments as COMMENT.  Switch this on or off.  */
1099 void
1100 po_lex_pass_comments (bool flag)
1101 {
1102   pass_comments = flag;
1103 }
1104
1105
1106 /* po_gram_lex() can return obsolete entries as if they were normal entries.
1107    Switch this on or off.  */
1108 void
1109 po_lex_pass_obsolete_entries (bool flag)
1110 {
1111   pass_obsolete_entries = flag;
1112 }