gnu/dist/gettext/gettext-tools/src/x-ycp.c

   1 /* xgettext YCP backend.
   2    Copyright (C) 2001-2003 Free Software Foundation, Inc.
   3
   4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software Foundation,
  18    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  19
  20 #ifdef HAVE_CONFIG_H
  21 # include "config.h"
  22 #endif
  23
  24 #include <ctype.h>
  25 #include <errno.h>
  26 #include <limits.h>
  27 #include <stdbool.h>
  28 #include <stdio.h>
  29 #include <stdlib.h>
  30
  31 #include "message.h"
  32 #include "xgettext.h"
  33 #include "x-ycp.h"
  34 #include "error.h"
  35 #include "xalloc.h"
  36 #include "exit.h"
  37 #include "gettext.h"
  38
  39 #define _(s) gettext(s)
  40
  41 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  42
  43
  44 /* The YCP syntax is defined in libycp/doc/syntax.html.
  45    See also libycp/src/scanner.ll.  */
  46
  47
  48 void
  49 init_flag_table_ycp ()
  50 {
  51   xgettext_record_flag ("sformat:1:ycp-format");
  52   xgettext_record_flag ("y2debug:1:ycp-format");
  53   xgettext_record_flag ("y2milestone:1:ycp-format");
  54   xgettext_record_flag ("y2warning:1:ycp-format");
  55   xgettext_record_flag ("y2error:1:ycp-format");
  56   xgettext_record_flag ("y2security:1:ycp-format");
  57   xgettext_record_flag ("y2internal:1:ycp-format");
  58 }
  59
  60
  61 /* ======================== Reading of characters.  ======================== */
  62
  63
  64 /* Real filename, used in error messages about the input file.  */
  65 static const char *real_file_name;
  66
  67 /* Logical filename and line number, used to label the extracted messages.  */
  68 static char *logical_file_name;
  69 static int line_number;
  70 static int char_in_line;
  71
  72 /* The input file stream.  */
  73 static FILE *fp;
  74
  75 /* These are for tracking whether comments count as immediately before
  76    keyword.  */
  77 static int last_comment_line;
  78 static int last_non_comment_line;
  79
  80
  81 /* 1. line_number handling.  */
  82
  83 static int
  84 phase1_getc ()
  85 {
  86   int c = getc (fp);
  87
  88   if (c == EOF)
  89     {
  90       if (ferror (fp))
  91         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
  92                real_file_name);
  93       return EOF;
  94     }
  95
  96   if (c == '\n')
  97     {
  98       line_number++;
  99       char_in_line = 0;
 100     }
 101   else
 102     char_in_line++;
 103
 104   return c;
 105 }
 106
 107 /* Supports only one pushback character.  */
 108 static void
 109 phase1_ungetc (int c)
 110 {
 111   if (c != EOF)
 112     {
 113       if (c == '\n')
 114         {
 115           --line_number;
 116           char_in_line = INT_MAX;
 117         }
 118       else
 119         --char_in_line;
 120
 121       ungetc (c, fp);
 122     }
 123 }
 124
 125
 126 /* 2. Replace each comment that is not inside a character constant or
 127    string literal with a space character.  We need to remember the
 128    comment for later, because it may be attached to a keyword string.
 129    YCP comments can be in C comment syntax, C++ comment syntax or sh
 130    comment syntax.  */
 131
 132 static unsigned char phase2_pushback[1];
 133 static int phase2_pushback_length;
 134
 135 static int
 136 phase2_getc ()
 137 {
 138   static char *buffer;
 139   static size_t bufmax;
 140   size_t buflen;
 141   int lineno;
 142   int c;
 143   bool last_was_star;
 144
 145   if (phase2_pushback_length)
 146     return phase2_pushback[--phase2_pushback_length];
 147
 148   if (char_in_line == 0)
 149     {
 150       /* Eat whitespace, to recognize ^[\t ]*# pattern.  */
 151       do
 152         c = phase1_getc ();
 153       while (c == '\t' || c == ' ');
 154
 155       if (c == '#')
 156         {
 157           /* sh comment.  */
 158           buflen = 0;
 159           lineno = line_number;
 160           for (;;)
 161             {
 162               c = phase1_getc ();
 163               if (c == '\n' || c == EOF)
 164                 break;
 165               /* We skip all leading white space, but not EOLs.  */
 166               if (!(buflen == 0 && (c == ' ' || c == '\t')))
 167                 {
 168                   if (buflen >= bufmax)
 169                     {
 170                       bufmax = 2 * bufmax + 10;
 171                       buffer = xrealloc (buffer, bufmax);
 172                     }
 173                   buffer[buflen++] = c;
 174                 }
 175             }
 176           if (buflen >= bufmax)
 177             {
 178               bufmax = 2 * bufmax + 10;
 179               buffer = xrealloc (buffer, bufmax);
 180             }
 181           buffer[buflen] = '\0';
 182           xgettext_comment_add (buffer);
 183           last_comment_line = lineno;
 184           return '\n';
 185         }
 186     }
 187   else
 188     c = phase1_getc ();
 189
 190   if (c == '/')
 191     {
 192       c = phase1_getc ();
 193
 194       switch (c)
 195         {
 196         default:
 197           phase1_ungetc (c);
 198           return '/';
 199
 200         case '*':
 201           /* C comment.  */
 202           buflen = 0;
 203           lineno = line_number;
 204           last_was_star = false;
 205           for (;;)
 206             {
 207               c = phase1_getc ();
 208               if (c == EOF)
 209                 break;
 210               /* We skip all leading white space, but not EOLs.  */
 211               if (buflen == 0 && (c == ' ' || c == '\t'))
 212                 continue;
 213               if (buflen >= bufmax)
 214                 {
 215                   bufmax = 2 * bufmax + 10;
 216                   buffer = xrealloc (buffer, bufmax);
 217                 }
 218               buffer[buflen++] = c;
 219               switch (c)
 220                 {
 221                 case '\n':
 222                   --buflen;
 223                   while (buflen >= 1
 224                          && (buffer[buflen - 1] == ' '
 225                              || buffer[buflen - 1] == '\t'))
 226                     --buflen;
 227                   buffer[buflen] = '\0';
 228                   xgettext_comment_add (buffer);
 229                   buflen = 0;
 230                   lineno = line_number;
 231                   last_was_star = false;
 232                   continue;
 233
 234                 case '*':
 235                   last_was_star = true;
 236                   continue;
 237
 238                 case '/':
 239                   if (last_was_star)
 240                     {
 241                       buflen -= 2;
 242                       while (buflen >= 1
 243                              && (buffer[buflen - 1] == ' '
 244                                  || buffer[buflen - 1] == '\t'))
 245                         --buflen;
 246                       buffer[buflen] = '\0';
 247                       xgettext_comment_add (buffer);
 248                       break;
 249                     }
 250                   /* FALLTHROUGH */
 251
 252                 default:
 253                   last_was_star = false;
 254                   continue;
 255                 }
 256               break;
 257             }
 258           last_comment_line = lineno;
 259           return ' ';
 260
 261         case '/':
 262           /* C++ comment.  */
 263           buflen = 0;
 264           lineno = line_number;
 265           for (;;)
 266             {
 267               c = phase1_getc ();
 268               if (c == '\n' || c == EOF)
 269                 break;
 270               /* We skip all leading white space, but not EOLs.  */
 271               if (!(buflen == 0 && (c == ' ' || c == '\t')))
 272                 {
 273                   if (buflen >= bufmax)
 274                     {
 275                       bufmax = 2 * bufmax + 10;
 276                       buffer = xrealloc (buffer, bufmax);
 277                     }
 278                   buffer[buflen++] = c;
 279                 }
 280             }
 281           if (buflen >= bufmax)
 282             {
 283               bufmax = 2 * bufmax + 10;
 284               buffer = xrealloc (buffer, bufmax);
 285             }
 286           buffer[buflen] = '\0';
 287           xgettext_comment_add (buffer);
 288           last_comment_line = lineno;
 289           return '\n';
 290         }
 291     }
 292   else
 293     return c;
 294 }
 295
 296 /* Supports only one pushback character.  */
 297 static void
 298 phase2_ungetc (int c)
 299 {
 300   if (c != EOF)
 301     {
 302       if (phase2_pushback_length == SIZEOF (phase2_pushback))
 303         abort ();
 304       phase2_pushback[phase2_pushback_length++] = c;
 305     }
 306 }
 307
 308
 309 /* ========================== Reading of tokens.  ========================== */
 310
 311
 312 enum token_type_ty
 313 {
 314   token_type_eof,
 315   token_type_lparen,            /* ( */
 316   token_type_rparen,            /* ) */
 317   token_type_comma,             /* , */
 318   token_type_i18n,              /* _( */
 319   token_type_string_literal,    /* "abc" */
 320   token_type_symbol,            /* symbol, number */
 321   token_type_other              /* misc. operator */
 322 };
 323 typedef enum token_type_ty token_type_ty;
 324
 325 typedef struct token_ty token_ty;
 326 struct token_ty
 327 {
 328   token_type_ty type;
 329   char *string;         /* for token_type_string_literal, token_type_symbol */
 330   int line_number;
 331 };
 332
 333
 334 /* 7. Replace escape sequences within character strings with their
 335    single character equivalents.  */
 336
 337 #define P7_QUOTES (1000 + '"')
 338
 339 static int
 340 phase7_getc ()
 341 {
 342   int c;
 343
 344   for (;;)
 345     {
 346       /* Use phase 1, because phase 2 elides comments.  */
 347       c = phase1_getc ();
 348
 349       if (c == '"')
 350         return P7_QUOTES;
 351       if (c != '\\')
 352         return c;
 353       c = phase1_getc ();
 354       if (c != '\n')
 355         switch (c)
 356           {
 357           case 'b':
 358             return '\b';
 359           case 'f':
 360             return '\f';
 361           case 'n':
 362             return '\n';
 363           case 'r':
 364             return '\r';
 365           case 't':
 366             return '\t';
 367
 368           /* FIXME: What is the octal escape syntax?
 369              syntax.html says: [0] [0-7]+
 370              scanner.ll says:  [0-7] [0-7] [0-7]
 371            */
 372 #if 0
 373           case '0': case '1': case '2': case '3':
 374           case '4': case '5': case '6': case '7':
 375             {
 376               int n, j;
 377
 378               n = 0;
 379               for (j = 0; j < 3; ++j)
 380                 {
 381                   n = n * 8 + c - '0';
 382                   c = phase1_getc ();
 383                   switch (c)
 384                     {
 385                     default:
 386                       break;
 387
 388                     case '0': case '1': case '2': case '3':
 389                     case '4': case '5': case '6': case '7':
 390                       continue;
 391                     }
 392                   break;
 393                 }
 394               phase1_ungetc (c);
 395               return n;
 396             }
 397 #endif
 398
 399           default:
 400             return c;
 401           }
 402     }
 403 }
 404
 405
 406 /* Combine characters into tokens.  Discard whitespace.  */
 407
 408 static void
 409 x_ycp_lex (token_ty *tp)
 410 {
 411   static char *buffer;
 412   static int bufmax;
 413   int bufpos;
 414   int c;
 415
 416   for (;;)
 417     {
 418       tp->line_number = line_number;
 419       c = phase2_getc ();
 420
 421       switch (c)
 422         {
 423         case EOF:
 424           tp->type = token_type_eof;
 425           return;
 426
 427         case '\n':
 428           if (last_non_comment_line > last_comment_line)
 429             xgettext_comment_reset ();
 430           /* FALLTHROUGH */
 431         case '\r':
 432         case '\t':
 433         case ' ':
 434           /* Ignore whitespace and comments.  */
 435           continue;
 436         }
 437
 438       last_non_comment_line = tp->line_number;
 439
 440       switch (c)
 441         {
 442         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 443         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 444         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 445         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 446         case 'Y': case 'Z':
 447         case '_':
 448         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 449         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 450         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 451         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 452         case 'y': case 'z':
 453         case '0': case '1': case '2': case '3': case '4':
 454         case '5': case '6': case '7': case '8': case '9':
 455           /* Symbol, or part of a number.  */
 456           bufpos = 0;
 457           for (;;)
 458             {
 459               if (bufpos >= bufmax)
 460                 {
 461                   bufmax = 2 * bufmax + 10;
 462                   buffer = xrealloc (buffer, bufmax);
 463                 }
 464               buffer[bufpos++] = c;
 465               c = phase2_getc ();
 466               switch (c)
 467                 {
 468                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
 469                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
 470                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
 471                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
 472                 case 'Y': case 'Z':
 473                 case '_':
 474                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 475                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
 476                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
 477                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
 478                 case 'y': case 'z':
 479                 case '0': case '1': case '2': case '3': case '4':
 480                 case '5': case '6': case '7': case '8': case '9':
 481                   continue;
 482                 default:
 483                   if (bufpos == 1 && buffer[0] == '_' && c == '(')
 484                     {
 485                       tp->type = token_type_i18n;
 486                       return;
 487                     }
 488                   phase2_ungetc (c);
 489                   break;
 490                 }
 491               break;
 492             }
 493           if (bufpos >= bufmax)
 494             {
 495               bufmax = 2 * bufmax + 10;
 496               buffer = xrealloc (buffer, bufmax);
 497             }
 498           buffer[bufpos] = '\0';
 499           tp->string = xstrdup (buffer);
 500           tp->type = token_type_symbol;
 501           return;
 502
 503         case '"':
 504           bufpos = 0;
 505           for (;;)
 506             {
 507               c = phase7_getc ();
 508               if (c == EOF || c == P7_QUOTES)
 509                 break;
 510               if (bufpos >= bufmax)
 511                 {
 512                   bufmax = 2 * bufmax + 10;
 513                   buffer = xrealloc (buffer, bufmax);
 514                 }
 515               buffer[bufpos++] = c;
 516             }
 517           if (bufpos >= bufmax)
 518             {
 519               bufmax = 2 * bufmax + 10;
 520               buffer = xrealloc (buffer, bufmax);
 521             }
 522           buffer[bufpos] = '\0';
 523           tp->string = xstrdup (buffer);
 524           tp->type = token_type_string_literal;
 525           return;
 526
 527         case '(':
 528           tp->type = token_type_lparen;
 529           return;
 530
 531         case ')':
 532           tp->type = token_type_rparen;
 533           return;
 534
 535         case ',':
 536           tp->type = token_type_comma;
 537           return;
 538
 539         default:
 540           /* We could carefully recognize each of the 2 and 3 character
 541              operators, but it is not necessary, as we only need to recognize
 542              gettext invocations.  Don't bother.  */
 543           tp->type = token_type_other;
 544           return;
 545         }
 546     }
 547 }
 548
 549
 550 /* ========================= Extracting strings.  ========================== */
 551
 552
 553 /* Context lookup table.  */
 554 static flag_context_list_table_ty *flag_context_list_table;
 555
 556
 557 /* The file is broken into tokens.
 558
 559      Normal handling: Look for
 560        [A] _( [B] msgid ... )
 561      Plural handling: Look for
 562        [A] _( [B] msgid [C] , [D] msgid_plural ... )
 563      At point [A]: state == 0.
 564      At point [B]: state == 1, plural_mp == NULL.
 565      At point [C]: state == 2, plural_mp != NULL.
 566      At point [D]: state == 1, plural_mp != NULL.
 567
 568    We use recursion because we have to set the context according to the given
 569    flags.  */
 570
 571
 572 /* Extract messages until the next balanced closing parenthesis.
 573    Extracted messages are added to MLP.
 574    Return true upon eof, false upon closing parenthesis.  */
 575 static bool
 576 extract_parenthesized (message_list_ty *mlp,
 577                        flag_context_ty outer_context,
 578                        flag_context_list_iterator_ty context_iter,
 579                        bool in_i18n)
 580 {
 581   int state; /* 1 or 2 inside _( ... ), otherwise 0 */
 582   message_ty *plural_mp = NULL; /* defined only when in states 1 and 2 */
 583   /* Context iterator that will be used if the next token is a '('.  */
 584   flag_context_list_iterator_ty next_context_iter =
 585     passthrough_context_list_iterator;
 586   /* Current context.  */
 587   flag_context_ty inner_context =
 588     inherited_context (outer_context,
 589                        flag_context_list_iterator_advance (&context_iter));
 590
 591   /* Start state is 0 or 1.  */
 592   state = (in_i18n ? 1 : 0);
 593
 594   for (;;)
 595     {
 596       token_ty token;
 597
 598       x_ycp_lex (&token);
 599       switch (token.type)
 600         {
 601         case token_type_i18n:
 602           if (extract_parenthesized (mlp, inner_context, next_context_iter,
 603                                      true))
 604             return true;
 605           next_context_iter = null_context_list_iterator;
 606           state = 0;
 607           continue;
 608
 609         case token_type_string_literal:
 610           if (state == 1)
 611             {
 612               lex_pos_ty pos;
 613               pos.file_name = logical_file_name;
 614               pos.line_number = token.line_number;
 615
 616               if (plural_mp == NULL)
 617                 {
 618                   /* Seen an msgid.  */
 619                   plural_mp = remember_a_message (mlp, token.string,
 620                                                   inner_context, &pos);
 621                   state = 2;
 622                 }
 623               else
 624                 {
 625                   /* Seen an msgid_plural.  */
 626                   remember_a_message_plural (plural_mp, token.string,
 627                                              inner_context, &pos);
 628                   state = 0;
 629                 }
 630             }
 631           else
 632             {
 633               free (token.string);
 634               state = 0;
 635             }
 636           next_context_iter = null_context_list_iterator;
 637           continue;
 638
 639         case token_type_symbol:
 640           next_context_iter =
 641             flag_context_list_iterator (
 642               flag_context_list_table_lookup (
 643                 flag_context_list_table,
 644                 token.string, strlen (token.string)));
 645           free (token.string);
 646           state = 0;
 647           continue;
 648
 649         case token_type_lparen:
 650           if (extract_parenthesized (mlp, inner_context, next_context_iter,
 651                                      false))
 652             return true;
 653           next_context_iter = null_context_list_iterator;
 654           state = 0;
 655           continue;
 656
 657         case token_type_rparen:
 658           return false;
 659
 660         case token_type_comma:
 661           if (state == 2)
 662             state = 1;
 663           else
 664             state = 0;
 665           inner_context =
 666             inherited_context (outer_context,
 667                                flag_context_list_iterator_advance (
 668                                  &context_iter));
 669           next_context_iter = passthrough_context_list_iterator;
 670           continue;
 671
 672         case token_type_other:
 673           next_context_iter = null_context_list_iterator;
 674           state = 0;
 675           continue;
 676
 677         case token_type_eof:
 678           return true;
 679
 680         default:
 681           abort ();
 682         }
 683     }
 684 }
 685
 686
 687 void
 688 extract_ycp (FILE *f,
 689              const char *real_filename, const char *logical_filename,
 690              flag_context_list_table_ty *flag_table,
 691              msgdomain_list_ty *mdlp)
 692 {
 693   message_list_ty *mlp = mdlp->item[0]->messages;
 694
 695   fp = f;
 696   real_file_name = real_filename;
 697   logical_file_name = xstrdup (logical_filename);
 698   line_number = 1;
 699   char_in_line = 0;
 700
 701   last_comment_line = -1;
 702   last_non_comment_line = -1;
 703
 704   flag_context_list_table = flag_table;
 705
 706   /* Eat tokens until eof is seen.  When extract_parenthesized returns
 707      due to an unbalanced closing parenthesis, just restart it.  */
 708   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
 709                                  false))
 710     ;
 711
 712   fp = NULL;
 713   real_file_name = NULL;
 714   logical_file_name = NULL;
 715   line_number = 0;
 716   char_in_line = 0;
 717 }