gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
   3    1999, 2000
   4    Free Software Foundation, Inc.
   5
   6    This file is part of GAS, the GNU Assembler.
   7
   8    GAS is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 2, or (at your option)
  11    any later version.
  12
  13    GAS is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with GAS; see the file COPYING.  If not, write to the Free
  20    Software Foundation, 59 Temple Place - Suite 330, Boston, MA
  21    02111-1307, USA.  */
  22
  23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90 */
  24 /* App, the assembler pre-processor.  This pre-processor strips out excess
  25    spaces, turns single-quoted characters into a decimal constant, and turns
  26    # <number> <filename> <garbage> into a .line <number>\n.file <filename>
  27    pair.  This needs better error-handling.  */
  28
  29 #include <stdio.h>
  30 #include "as.h"                 /* For BAD_CASE() only */
  31
  32 #if (__STDC__ != 1)            /* Compiler does not claim ISO C conformance.  */
  33 #ifndef const
  34 #define const  /* empty */
  35 #endif
  36 #endif
  37
  38 #ifdef TC_M68K
  39 /* Whether we are scrubbing in m68k MRI mode.  This is different from
  40    flag_m68k_mri, because the two flags will be affected by the .mri
  41    pseudo-op at different times.  It is set by do_scrub_begin.  */
  42 static int scrub_m68k_mri;
  43 #else
  44 #define scrub_m68k_mri 0          /* Not m68k: every MRI-mode test is a constant false.  */
  45 #endif
  46
  47 /* The pseudo-op which switches in and out of MRI mode.  See the
  48    comment in do_scrub_chars.  The recognizer there also accepts '1'
  49    where this string has '0', and any whitespace where it has ' '.  */
  49 static const char mri_pseudo[] = ".mri 0";
  50
  51 #if defined TC_ARM && defined OBJ_ELF
  52 /* The pseudo-op for which we need to special-case `@' characters.
  53    See the comment in do_scrub_chars.  */
  54 static const char   symver_pseudo[] = ".symver";
  55 static const char * symver_state;   /* Next character of symver_pseudo to match, or NULL when no match is in progress.  */
  56 #endif
  57
  58 static char lex[256];             /* Lexical class (LEX_IS_*) of each byte value; 0 means unclassified.  Filled in by do_scrub_begin.  */
  59 static const char symbol_chars[] =    /* Characters that are always symbol components.  */
  60 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  61 /* Lexical classes stored in lex[].  The value 7 is not assigned in this list.  */
  62 #define LEX_IS_SYMBOL_COMPONENT         1     /* symbol_chars, bytes 128..255, tc_symbol_chars */
  63 #define LEX_IS_WHITESPACE               2     /* space, tab, carriage return */
  64 #define LEX_IS_LINE_SEPARATOR           3     /* line_separator_chars */
  65 #define LEX_IS_COMMENT_START            4     /* tc_comment_chars */
  66 #define LEX_IS_LINE_COMMENT_START       5     /* line_comment_chars */
  67 #define LEX_IS_TWOCHAR_COMMENT_1ST      6     /* '/', when it has no other class */
  68 #define LEX_IS_STRINGQUOTE              8     /* '"'; also '\'' in MRI mode or with SINGLE_QUOTE_STRINGS */
  69 #define LEX_IS_COLON                    9     /* ':' */
  70 #define LEX_IS_NEWLINE                  10    /* '\n' */
  71 #define LEX_IS_ONECHAR_QUOTE            11    /* '\'', except on TC_HPPA and TC_I370 */
  72 #ifdef TC_V850
  73 #define LEX_IS_DOUBLEDASH_1ST           12    /* '-' */
  74 #endif
  75 #ifdef TC_M32R
  76 #define DOUBLEBAR_PARALLEL
  77 #endif
  78 #ifdef DOUBLEBAR_PARALLEL
  79 #define LEX_IS_DOUBLEBAR_1ST            13    /* '|' */
  80 #endif
  81 #define LEX_IS_PARALLEL_SEPARATOR       14    /* tc_parallel_separator_chars */
  82 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)   /* These index lex[] directly: C must be 0..255, never EOF.  */
  83 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
  84 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
  85 #define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
  86 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
  87 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
  88 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
  89 /* Translate the character following a backslash; defined after app_pop.  */
  90 static int process_escape PARAMS ((int));
  91
  92 /* FIXME-soon: The entire lexer/parser thingy should be
  93    built statically at compile time rather than dynamically
  94    each and every time the assembler is run.  xoxorich.  */
  95
  96 void
  97 do_scrub_begin (m68k_mri)
  98      int m68k_mri ATTRIBUTE_UNUSED;
  99 {
 100   const char *p;
 101   int c;
 102
 103   lex[' '] = LEX_IS_WHITESPACE;
 104   lex['\t'] = LEX_IS_WHITESPACE;
 105   lex['\r'] = LEX_IS_WHITESPACE;
 106   lex['\n'] = LEX_IS_NEWLINE;
 107   lex[':'] = LEX_IS_COLON;
 108
 109 #ifdef TC_M68K
 110   scrub_m68k_mri = m68k_mri;
 111
 112   if (! m68k_mri)
 113 #endif
 114     {
 115       lex['"'] = LEX_IS_STRINGQUOTE;
 116
 117 #if ! defined (TC_HPPA) && ! defined (TC_I370)
 118       /* I370 uses single-quotes to delimit integer, float constants */
 119       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 120 #endif
 121
 122 #ifdef SINGLE_QUOTE_STRINGS
 123       lex['\''] = LEX_IS_STRINGQUOTE;
 124 #endif
 125     }
 126
 127   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 128      in state 5 of do_scrub_chars must be changed.  */
 129
 130   /* Note that these override the previous defaults, e.g. if ';' is a
 131      comment char, then it isn't a line separator.  */
 132   for (p = symbol_chars; *p; ++p)
 133     {
 134       lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 135     }                           /* declare symbol characters */
 136
 137   for (c = 128; c < 256; ++c)
 138     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 139
 140 #ifdef tc_symbol_chars
 141   /* This macro permits the processor to specify all characters which
 142      may appears in an operand.  This will prevent the scrubber from
 143      discarding meaningful whitespace in certain cases.  The i386
 144      backend uses this to support prefixes, which can confuse the
 145      scrubber as to whether it is parsing operands or opcodes.  */
 146   for (p = tc_symbol_chars; *p; ++p)
 147     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 148 #endif
 149
 150   /* The m68k backend wants to be able to change comment_chars.  */
 151 #ifndef tc_comment_chars
 152 #define tc_comment_chars comment_chars
 153 #endif
 154   for (p = tc_comment_chars; *p; p++)
 155     {
 156       lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 157     }                           /* declare comment chars */
 158
 159   for (p = line_comment_chars; *p; p++)
 160     {
 161       lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 162     }                           /* declare line comment chars */
 163
 164   for (p = line_separator_chars; *p; p++)
 165     {
 166       lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 167     }                           /* declare line separators */
 168
 169 #ifdef tc_parallel_separator_chars
 170   /* This macro permits the processor to specify all characters which
 171      separate parallel insns on the same line.  */
 172   for (p = tc_parallel_separator_chars; *p; p++)
 173     {
 174       lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 175     }                           /* declare parallel separators */
 176 #endif
 177
 178   /* Only allow slash-star comments if slash is not in use.
 179      FIXME: This isn't right.  We should always permit them.  */
 180   if (lex['/'] == 0)
 181     {
 182       lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 183     }
 184
 185 #ifdef TC_M68K
 186   if (m68k_mri)
 187     {
 188       lex['\''] = LEX_IS_STRINGQUOTE;
 189       lex[';'] = LEX_IS_COMMENT_START;
 190       lex['*'] = LEX_IS_LINE_COMMENT_START;
 191       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 192          then it can't be used in an expression.  */
 193       lex['!'] = LEX_IS_LINE_COMMENT_START;
 194     }
 195 #endif
 196
 197 #ifdef TC_V850
 198   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 199 #endif
 200 #ifdef DOUBLEBAR_PARALLEL
 201   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 202 #endif
 203 #ifdef TC_D30V
 204   /* must do this is we want VLIW instruction with "->" or "<-" */
 205   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 206 #endif
 207 }                               /* do_scrub_begin() */
 208
 209 /* Saved state of the scrubber.  do_scrub_chars can return at any point, so everything it needs to resume lives here.  */
 210 static int state;                 /* Current state of the do_scrub_chars state machine.  */
 211 static int old_state;             /* State to return to after states -1, -2 and 5 finish.  */
 212 static char *out_string;          /* Next character of the pending literal output emitted in state -1.  */
 213 static char out_buf[20];          /* Scratch buffer saved and restored with the rest; NOTE(review): presumably out_string may point here -- its users are outside this view.  */
 214 static int add_newlines;          /* Bumped for backslash-newline in a string and for newlines inside a slash-star comment; presumably re-emitted later -- confirm.  */
 215 static char *saved_input;         /* If non-NULL, input that do_scrub_chars resumes from instead of calling GET.  */
 216 static int saved_input_len;       /* Number of bytes at saved_input.  */
 217 static char input_buffer[32 * 1024];  /* Buffer handed to the GET callback.  */
 218 static const char *mri_state;     /* Next character of mri_pseudo to match, or NULL when no match is in progress.  */
 219 static char mri_last_ch;          /* Last character matched by the .mri recognizer; '1' selects MRI mode.  */
 220
 221 /* Data structure for saving the state of app across #include's.  Note that
 222    app is called asynchronously to the parsing of the .include's, so our
 223    state at the time .include is interpreted is completely unrelated.
 224    That's why we have to save it all.  Each member mirrors the file-scope
 225    variable of the same name; app_push fills one in and app_pop consumes it.  */
 226 struct app_save {
 227   int          state;
 228   int          old_state;
 229   char *       out_string;
 230   char         out_buf[sizeof (out_buf)];   /* Sized from the file-scope out_buf.  */
 231   int          add_newlines;
 232   char *       saved_input;        /* Heap copy made by app_push and freed by app_pop, or NULL.  */
 233   int          saved_input_len;    /* Only meaningful when saved_input is not NULL.  */
 234 #ifdef TC_M68K
 235   int          scrub_m68k_mri;
 236 #endif
 237   const char * mri_state;
 238   char         mri_last_ch;
 239 #if defined TC_ARM && defined OBJ_ELF
 240   const char * symver_state;
 241 #endif
 242 };
 243
 244 char *
 245 app_push ()
 246 {
 247   register struct app_save *saved;
 248
 249   saved = (struct app_save *) xmalloc (sizeof (*saved));
 250   saved->state = state;
 251   saved->old_state = old_state;
 252   saved->out_string = out_string;
 253   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 254   saved->add_newlines = add_newlines;
 255   if (saved_input == NULL)
 256     saved->saved_input = NULL;
 257   else
 258     {
 259       saved->saved_input = xmalloc (saved_input_len);
 260       memcpy (saved->saved_input, saved_input, saved_input_len);
 261       saved->saved_input_len = saved_input_len;
 262     }
 263 #ifdef TC_M68K
 264   saved->scrub_m68k_mri = scrub_m68k_mri;
 265 #endif
 266   saved->mri_state = mri_state;
 267   saved->mri_last_ch = mri_last_ch;
 268 #if defined TC_ARM && defined OBJ_ELF
 269   saved->symver_state = symver_state;
 270 #endif
 271
 272   /* do_scrub_begin() is not useful, just wastes time.  */
 273
 274   state = 0;
 275   saved_input = NULL;
 276
 277   return (char *) saved;
 278 }
 279
 280 void
 281 app_pop (arg)
 282      char *arg;
 283 {
 284   register struct app_save *saved = (struct app_save *) arg;
 285
 286   /* There is no do_scrub_end ().  */
 287   state = saved->state;
 288   old_state = saved->old_state;
 289   out_string = saved->out_string;
 290   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 291   add_newlines = saved->add_newlines;
 292   if (saved->saved_input == NULL)
 293     saved_input = NULL;
 294   else
 295     {
 296       assert (saved->saved_input_len <= (int) (sizeof input_buffer));
 297       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 298       saved_input = input_buffer;
 299       saved_input_len = saved->saved_input_len;
 300       free (saved->saved_input);
 301     }
 302 #ifdef TC_M68K
 303   scrub_m68k_mri = saved->scrub_m68k_mri;
 304 #endif
 305   mri_state = saved->mri_state;
 306   mri_last_ch = saved->mri_last_ch;
 307 #if defined TC_ARM && defined OBJ_ELF
 308   symver_state = saved->symver_state;
 309 #endif
 310
 311   free (arg);
 312 }                               /* app_pop() */
 313
 314 /* @@ This assumes that \n &c are the same on host and target.  This is not
 315    necessarily true.  */
 316 static int
 317 process_escape (ch)
 318      int ch;
 319 {
 320   switch (ch)
 321     {
 322     case 'b':
 323       return '\b';
 324     case 'f':
 325       return '\f';
 326     case 'n':
 327       return '\n';
 328     case 'r':
 329       return '\r';
 330     case 't':
 331       return '\t';
 332     case '\'':
 333       return '\'';
 334     case '"':
 335       return '\"';
 336     default:
 337       return ch;
 338     }
 339 }
 340
 341 /* This function is called to process input characters.  The GET
 342    parameter is used to retrieve more input characters.  GET should
 343    set its parameter to point to a buffer, and return the length of
 344    the buffer; it should return 0 at end of file.  The scrubbed output
 345    characters are put into the buffer starting at TOSTART; the TOSTART
 346    buffer is TOLEN bytes in length.  The function returns the number
 347    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 348    end of file was seen.  This function is arranged as a state
 349    machine, and saves its state so that it may return at any point.
 350    This is the way the old code used to work.  */
 351
 352 int
 353 do_scrub_chars (get, tostart, tolen)
 354      int (*get) PARAMS ((char *, int));
 355      char *tostart;
 356      int tolen;
 357 {
 358   char *to = tostart;
 359   char *toend = tostart + tolen;
 360   char *from;
 361   char *fromend;
 362   int fromlen;
 363   register int ch, ch2 = 0;
 364
 365   /*State 0: beginning of normal line
 366           1: After first whitespace on line (flush more white)
 367           2: After first non-white (opcode) on line (keep 1white)
 368           3: after second white on line (into operands) (flush white)
 369           4: after putting out a .line, put out digits
 370           5: parsing a string, then go to old-state
 371           6: putting out \ escape in a "d string.
 372           7: After putting out a .appfile, put out string.
 373           8: After putting out a .appfile string, flush until newline.
 374           9: After seeing symbol char in state 3 (keep 1white after symchar)
 375          10: After seeing whitespace in state 9 (keep white before symchar)
 376          11: After seeing a symbol character in state 0 (eg a label definition)
 377          -1: output string in out_string and go to the state in old_state
 378          -2: flush text until a '*' '/' is seen, then go to state old_state
 379 #ifdef TC_V850
 380          12: After seeing a dash, looking for a second dash as a start of comment.
 381 #endif
 382 #ifdef DOUBLEBAR_PARALLEL
 383          13: After seeing a vertical bar, looking for a second vertical bar as a parallel expression separator.
 384 #endif
 385           */
 386
 387   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 388      constructs like ``.loc 1 20''.  This was turning into ``.loc
 389      120''.  States 9 and 10 ensure that a space is never dropped in
 390      between characters which could appear in an identifier.  Ian
 391      Taylor, ian@cygnus.com.
 392
 393      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 394      correctly on the PA (and any other target where colons are optional).
 395      Jeff Law, law@cs.utah.edu.
 396
 397      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 398      get squashed into "cmp r1,r2||trap#1", with the all important space
 399      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 400
 401   /* This macro gets the next input character.  */
 402
 403 #define GET()                                                   \
 404   (from < fromend                                               \
 405    ? * (unsigned char *) (from++)                               \
 406    : (saved_input = NULL,                                       \
 407       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 408       from = input_buffer,                                      \
 409       fromend = from + fromlen,                                 \
 410       (fromlen == 0                                             \
 411        ? EOF                                                    \
 412        : * (unsigned char *) (from++))))
 413
 414   /* This macro pushes a character back on the input stream.  */
 415
 416 #define UNGET(uch) (*--from = (uch))
 417
 418   /* This macro puts a character into the output buffer.  If this
 419      character fills the output buffer, this macro jumps to the label
 420      TOFULL.  We use this rather ugly approach because we need to
 421      handle two different termination conditions: EOF on the input
 422      stream, and a full output buffer.  It would be simpler if we
 423      always read in the entire input stream before processing it, but
 424      I don't want to make such a significant change to the assembler's
 425      memory usage.  */
 426
 427 #define PUT(pch)                        \
 428   do                                    \
 429     {                                   \
 430       *to++ = (pch);                    \
 431       if (to >= toend)                  \
 432         goto tofull;                    \
 433     }                                   \
 434   while (0)
 435
 436   if (saved_input != NULL)
 437     {
 438       from = saved_input;
 439       fromend = from + saved_input_len;
 440     }
 441   else
 442     {
 443       fromlen = (*get) (input_buffer, sizeof input_buffer);
 444       if (fromlen == 0)
 445         return 0;
 446       from = input_buffer;
 447       fromend = from + fromlen;
 448     }
 449
 450   while (1)
 451     {
 452       /* The cases in this switch end with continue, in order to
 453          branch back to the top of this while loop and generate the
 454          next output character in the appropriate state.  */
 455       switch (state)
 456         {
 457         case -1:
 458           ch = *out_string++;
 459           if (*out_string == '\0')
 460             {
 461               state = old_state;
 462               old_state = 3;
 463             }
 464           PUT (ch);
 465           continue;
 466
 467         case -2:
 468           for (;;)
 469             {
 470               do
 471                 {
 472                   ch = GET ();
 473
 474                   if (ch == EOF)
 475                     {
 476                       as_warn (_("end of file in comment"));
 477                       goto fromeof;
 478                     }
 479
 480                   if (ch == '\n')
 481                     PUT ('\n');
 482                 }
 483               while (ch != '*');
 484
 485               while ((ch = GET ()) == '*')
 486                 ;
 487
 488               if (ch == EOF)
 489                 {
 490                   as_warn (_("end of file in comment"));
 491                   goto fromeof;
 492                 }
 493
 494               if (ch == '/')
 495                 break;
 496
 497               UNGET (ch);
 498             }
 499
 500           state = old_state;
 501           UNGET (' ');
 502           continue;
 503
 504         case 4:
 505           ch = GET ();
 506           if (ch == EOF)
 507             goto fromeof;
 508           else if (ch >= '0' && ch <= '9')
 509             PUT (ch);
 510           else
 511             {
 512               while (ch != EOF && IS_WHITESPACE (ch))
 513                 ch = GET ();
 514               if (ch == '"')
 515                 {
 516                   UNGET (ch);
 517                   if (scrub_m68k_mri)
 518                     out_string = "\n\tappfile ";
 519                   else
 520                     out_string = "\n\t.appfile ";
 521                   old_state = 7;
 522                   state = -1;
 523                   PUT (*out_string++);
 524                 }
 525               else
 526                 {
 527                   while (ch != EOF && ch != '\n')
 528                     ch = GET ();
 529                   state = 0;
 530                   PUT (ch);
 531                 }
 532             }
 533           continue;
 534
 535         case 5:
 536           /* We are going to copy everything up to a quote character,
 537              with special handling for a backslash.  We try to
 538              optimize the copying in the simple case without using the
 539              GET and PUT macros.  */
 540           {
 541             char *s;
 542             int len;
 543
 544             for (s = from; s < fromend; s++)
 545               {
 546                 ch = *s;
 547                 /* This condition must be changed if the type of any
 548                    other character can be LEX_IS_STRINGQUOTE.  */
 549                 if (ch == '\\'
 550                     || ch == '"'
 551                     || ch == '\''
 552                     || ch == '\n')
 553                   break;
 554               }
 555             len = s - from;
 556             if (len > toend - to)
 557               len = toend - to;
 558             if (len > 0)
 559               {
 560                 memcpy (to, from, len);
 561                 to += len;
 562                 from += len;
 563               }
 564           }
 565
 566           ch = GET ();
 567           if (ch == EOF)
 568             {
 569               as_warn (_("end of file in string; inserted '\"'"));
 570               state = old_state;
 571               UNGET ('\n');
 572               PUT ('"');
 573             }
 574           else if (lex[ch] == LEX_IS_STRINGQUOTE)
 575             {
 576               state = old_state;
 577               PUT (ch);
 578             }
 579 #ifndef NO_STRING_ESCAPES
 580           else if (ch == '\\')
 581             {
 582               state = 6;
 583               PUT (ch);
 584             }
 585 #endif
 586           else if (scrub_m68k_mri && ch == '\n')
 587             {
 588               /* Just quietly terminate the string.  This permits lines like
 589                    bne  label   loop if we haven't reach end yet
 590                  */
 591               state = old_state;
 592               UNGET (ch);
 593               PUT ('\'');
 594             }
 595           else
 596             {
 597               PUT (ch);
 598             }
 599           continue;
 600
 601         case 6:
 602           state = 5;
 603           ch = GET ();
 604           switch (ch)
 605             {
 606               /* Handle strings broken across lines, by turning '\n' into
 607                  '\\' and 'n'.  */
 608             case '\n':
 609               UNGET ('n');
 610               add_newlines++;
 611               PUT ('\\');
 612               continue;
 613
 614             case '"':
 615             case '\\':
 616             case 'b':
 617             case 'f':
 618             case 'n':
 619             case 'r':
 620             case 't':
 621             case 'v':
 622             case 'x':
 623             case 'X':
 624             case '0':
 625             case '1':
 626             case '2':
 627             case '3':
 628             case '4':
 629             case '5':
 630             case '6':
 631             case '7':
 632               break;
 633 #if defined(IGNORE_NONSTANDARD_ESCAPES) | defined(ONLY_STANDARD_ESCAPES)
 634             default:
 635               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 636               break;
 637 #else  /* ONLY_STANDARD_ESCAPES */
 638             default:
 639               /* Accept \x as x for any x */
 640               break;
 641 #endif /* ONLY_STANDARD_ESCAPES */
 642
 643             case EOF:
 644               as_warn (_("end of file in string; '\"' inserted"));
 645               PUT ('"');
 646               continue;
 647             }
 648           PUT (ch);
 649           continue;
 650
 651         case 7:
 652           ch = GET ();
 653           state = 5;
 654           old_state = 8;
 655           if (ch == EOF)
 656             goto fromeof;
 657           PUT (ch);
 658           continue;
 659
 660         case 8:
 661           do
 662             ch = GET ();
 663           while (ch != '\n' && ch != EOF);
 664           if (ch == EOF)
 665             goto fromeof;
 666           state = 0;
 667           PUT (ch);
 668           continue;
 669         }
 670
 671       /* OK, we are somewhere in states 0 through 4 or 9 through 11 */
 672
 673       /* flushchar: */
 674       ch = GET ();
 675
 676     recycle:
 677
 678 #if defined TC_ARM && defined OBJ_ELF
 679       /* We need to watch out for .symver directives.  See the comment later
 680          in this function.  */
 681       if (symver_state == NULL)
 682         {
 683           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 684             symver_state = symver_pseudo + 1;
 685         }
 686       else
 687         {
 688           /* We advance to the next state if we find the right
 689              character.  */
 690           if (ch != '\0' && (*symver_state == ch))
 691             ++symver_state;
 692           else if (*symver_state != '\0')
 693             /* We did not get the expected character, or we didn't
 694                get a valid terminating character after seeing the
 695                entire pseudo-op, so we must go back to the beginning.  */
 696             symver_state = NULL;
 697           else
 698             {
 699               /* We've read the entire pseudo-op.  If this is the end
 700                  of the line, go back to the beginning.  */
 701               if (IS_NEWLINE (ch))
 702                 symver_state = NULL;
 703             }
 704         }
 705 #endif /* TC_ARM && OBJ_ELF */
 706
 707 #ifdef TC_M68K
 708       /* We want to have pseudo-ops which control whether we are in
 709          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 710          the scrubber, that means that we need a special purpose
 711          recognizer here.  */
 712       if (mri_state == NULL)
 713         {
 714           if ((state == 0 || state == 1)
 715               && ch == mri_pseudo[0])
 716             mri_state = mri_pseudo + 1;
 717         }
 718       else
 719         {
 720           /* We advance to the next state if we find the right
 721              character, or if we need a space character and we get any
 722              whitespace character, or if we need a '0' and we get a
 723              '1' (this is so that we only need one state to handle
 724              ``.mri 0'' and ``.mri 1'').  */
 725           if (ch != '\0'
 726               && (*mri_state == ch
 727                   || (*mri_state == ' '
 728                       && lex[ch] == LEX_IS_WHITESPACE)
 729                   || (*mri_state == '0'
 730                       && ch == '1')))
 731             {
 732               mri_last_ch = ch;
 733               ++mri_state;
 734             }
 735           else if (*mri_state != '\0'
 736                    || (lex[ch] != LEX_IS_WHITESPACE
 737                        && lex[ch] != LEX_IS_NEWLINE))
 738             {
 739               /* We did not get the expected character, or we didn't
 740                  get a valid terminating character after seeing the
 741                  entire pseudo-op, so we must go back to the
 742                  beginning.  */
 743               mri_state = NULL;
 744             }
 745           else
 746             {
 747               /* We've read the entire pseudo-op.  mri_last_ch is
 748                  either '0' or '1', indicating whether to leave or
 749                  enter MRI mode, respectively.  */
 750               do_scrub_begin (mri_last_ch == '1');
 751               mri_state = NULL;
 752
 753               /* We continue handling the character as usual.  The
 754                  main gas reader must also handle the .mri pseudo-op
 755                  to control expression parsing and the like.  */
 756             }
 757         }
 758 #endif
 759
 760       if (ch == EOF)
 761         {
 762           if (state != 0)
 763             {
 764               as_warn (_("end of file not at end of a line; newline inserted"));
 765               state = 0;
 766               PUT ('\n');
 767             }
 768           goto fromeof;
 769         }
 770
 771       switch (lex[ch])
 772         {
 773         case LEX_IS_WHITESPACE:
 774           do
 775             {
 776               ch = GET ();
 777             }
 778           while (ch != EOF && IS_WHITESPACE (ch));
 779           if (ch == EOF)
 780             goto fromeof;
 781
 782           if (state == 0)
 783             {
 784               /* Preserve a single whitespace character at the
 785                  beginning of a line.  */
 786               state = 1;
 787               UNGET (ch);
 788               PUT (' ');
 789               break;
 790             }
 791
 792 #ifdef KEEP_WHITE_AROUND_COLON
 793           if (lex[ch] == LEX_IS_COLON)
 794             {
 795               /* Only keep this white if there's no white *after* the
 796                  colon.  */
 797               ch2 = GET ();
 798               UNGET (ch2);
 799               if (!IS_WHITESPACE (ch2))
 800                 {
 801                   state = 9;
 802                   UNGET (ch);
 803                   PUT (' ');
 804                   break;
 805                 }
 806             }
 807 #endif
 808           if (IS_COMMENT (ch)
 809               || ch == '/'
 810               || IS_LINE_SEPARATOR (ch)
 811               || IS_PARALLEL_SEPARATOR (ch))
 812             {
 813               if (scrub_m68k_mri)
 814                 {
 815                   /* In MRI mode, we keep these spaces.  */
 816                   UNGET (ch);
 817                   PUT (' ');
 818                   break;
 819                 }
 820               goto recycle;
 821             }
 822
 823           /* If we're in state 2 or 11, we've seen a non-white
 824              character followed by whitespace.  If the next character
 825              is ':', this is whitespace after a label name which we
 826              normally must ignore.  In MRI mode, though, spaces are
 827              not permitted between the label and the colon.  */
 828           if ((state == 2 || state == 11)
 829               && lex[ch] == LEX_IS_COLON
 830               && ! scrub_m68k_mri)
 831             {
 832               state = 1;
 833               PUT (ch);
 834               break;
 835             }
 836
 837           switch (state)
 838             {
 839             case 0:
 840               state++;
 841               goto recycle;     /* Punted leading sp */
 842             case 1:
 843               /* We can arrive here if we leave a leading whitespace
 844                  character at the beginning of a line.  */
 845               goto recycle;
 846             case 2:
 847               state = 3;
 848               if (to + 1 < toend)
 849                 {
 850                   /* Optimize common case by skipping UNGET/GET.  */
 851                   PUT (' ');    /* Sp after opco */
 852                   goto recycle;
 853                 }
 854               UNGET (ch);
 855               PUT (' ');
 856               break;
 857             case 3:
 858               if (scrub_m68k_mri)
 859                 {
 860                   /* In MRI mode, we keep these spaces.  */
 861                   UNGET (ch);
 862                   PUT (' ');
 863                   break;
 864                 }
 865               goto recycle;     /* Sp in operands */
 866             case 9:
 867             case 10:
 868               if (scrub_m68k_mri)
 869                 {
 870                   /* In MRI mode, we keep these spaces.  */
 871                   state = 3;
 872                   UNGET (ch);
 873                   PUT (' ');
 874                   break;
 875                 }
 876               state = 10;       /* Sp after symbol char */
 877               goto recycle;
 878             case 11:
 879               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 880                 state = 1;
 881               else
 882                 {
 883                   /* We know that ch is not ':', since we tested that
 884                      case above.  Therefore this is not a label, so it
 885                      must be the opcode, and we've just seen the
 886                      whitespace after it.  */
 887                   state = 3;
 888                 }
 889               UNGET (ch);
 890               PUT (' ');        /* Sp after label definition.  */
 891               break;
 892             default:
 893               BAD_CASE (state);
 894             }
 895           break;
 896
 897         case LEX_IS_TWOCHAR_COMMENT_1ST:
 898           ch2 = GET ();
 899           if (ch2 == '*')
 900             {
 901               for (;;)
 902                 {
 903                   do
 904                     {
 905                       ch2 = GET ();
 906                       if (ch2 != EOF && IS_NEWLINE (ch2))
 907                         add_newlines++;
 908                     }
 909                   while (ch2 != EOF && ch2 != '*');
 910
 911                   while (ch2 == '*')
 912                     ch2 = GET ();
 913
 914                   if (ch2 == EOF || ch2 == '/')
 915                     break;
 916
 917                   /* This UNGET will ensure that we count newlines
 918                      correctly.  */
 919                   UNGET (ch2);
 920                 }
 921
 922               if (ch2 == EOF)
 923                 as_warn (_("end of file in multiline comment"));
 924
 925               ch = ' ';
 926               goto recycle;
 927             }
 928 #ifdef DOUBLESLASH_LINE_COMMENTS
 929           else if (ch2 == '/')
 930             {
 931               do
 932                 {
 933                   ch = GET ();
 934                 }
 935               while (ch != EOF && !IS_NEWLINE (ch));
 936               if (ch == EOF)
 937                 as_warn ("end of file in comment; newline inserted");
 938               state = 0;
 939               PUT ('\n');
 940               break;
 941             }
 942 #endif
 943           else
 944             {
 945               if (ch2 != EOF)
 946                 UNGET (ch2);
 947               if (state == 9 || state == 10)
 948                 state = 3;
 949               PUT (ch);
 950             }
 951           break;
 952
 953         case LEX_IS_STRINGQUOTE:
 954           if (state == 10)
 955             {
 956               /* Preserve the whitespace in foo "bar" */
 957               UNGET (ch);
 958               state = 3;
 959               PUT (' ');
 960
 961               /* PUT didn't jump out.  We could just break, but we
 962                  know what will happen, so optimize a bit.  */
 963               ch = GET ();
 964               old_state = 3;
 965             }
 966           else if (state == 9)
 967             old_state = 3;
 968           else
 969             old_state = state;
 970           state = 5;
 971           PUT (ch);
 972           break;
 973
 974 #ifndef IEEE_STYLE
 975         case LEX_IS_ONECHAR_QUOTE:
 976           if (state == 10)
 977             {
 978               /* Preserve the whitespace in foo 'b' */
 979               UNGET (ch);
 980               state = 3;
 981               PUT (' ');
 982               break;
 983             }
 984           ch = GET ();
 985           if (ch == EOF)
 986             {
 987               as_warn (_("end of file after a one-character quote; \\0 inserted"));
 988               ch = 0;
 989             }
 990           if (ch == '\\')
 991             {
 992               ch = GET ();
 993               if (ch == EOF)
 994                 {
 995                   as_warn (_("end of file in escape character"));
 996                   ch = '\\';
 997                 }
 998               else
 999                 ch = process_escape (ch);
1000             }
1001           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1002
1003           /* None of these 'x constants for us.  We want 'x'.  */
1004           if ((ch = GET ()) != '\'')
1005             {
1006 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1007               as_warn (_("missing close quote; (assumed)"));
1008 #else
1009               if (ch != EOF)
1010                 UNGET (ch);
1011 #endif
1012             }
1013           if (strlen (out_buf) == 1)
1014             {
1015               PUT (out_buf[0]);
1016               break;
1017             }
1018           if (state == 9)
1019             old_state = 3;
1020           else
1021             old_state = state;
1022           state = -1;
1023           out_string = out_buf;
1024           PUT (*out_string++);
1025           break;
1026 #endif
1027
1028         case LEX_IS_COLON:
1029 #ifdef KEEP_WHITE_AROUND_COLON
1030           state = 9;
1031 #else
1032           if (state == 9 || state == 10)
1033             state = 3;
1034           else if (state != 3)
1035             state = 1;
1036 #endif
1037           PUT (ch);
1038           break;
1039
1040         case LEX_IS_NEWLINE:
1041           /* Roll out a bunch of newlines from inside comments, etc.  */
1042           if (add_newlines)
1043             {
1044               --add_newlines;
1045               UNGET (ch);
1046             }
1047           /* Fall through.  */
1048
1049         case LEX_IS_LINE_SEPARATOR:
1050           state = 0;
1051           PUT (ch);
1052           break;
1053
1054         case LEX_IS_PARALLEL_SEPARATOR:
1055           state = 1;
1056           PUT (ch);
1057           break;
1058
1059 #ifdef TC_V850
1060         case LEX_IS_DOUBLEDASH_1ST:
1061           ch2 = GET ();
1062           if (ch2 != '-')
1063             {
1064               UNGET (ch2);
1065               goto de_fault;
1066             }
1067           /* Read and skip to end of line.  */
1068           do
1069             {
1070               ch = GET ();
1071             }
1072           while (ch != EOF && ch != '\n');
1073           if (ch == EOF)
1074             {
1075               as_warn (_("end of file in comment; newline inserted"));
1076             }
1077           state = 0;
1078           PUT ('\n');
1079           break;
1080 #endif
1081 #ifdef DOUBLEBAR_PARALLEL
1082         case LEX_IS_DOUBLEBAR_1ST:
1083           ch2 = GET ();
1084           if (ch2 != '|')
1085             {
1086               UNGET (ch2);
1087               goto de_fault;
1088             }
1089           /* Reset back to state 1 and pretend that we are parsing a line from
1090              just after the first white space.  */
1091           state = 1;
1092           PUT ('|');
1093           PUT ('|');
1094           break;
1095 #endif
1096         case LEX_IS_LINE_COMMENT_START:
1097           /* FIXME-someday: The two character comment stuff was badly
1098              thought out.  On i386, we want '/' as line comment start
1099              AND we want C style comments.  Hence this hack.  The
1100              whole lexical process should be reworked.  xoxorich.  */
1101           if (ch == '/')
1102             {
1103               ch2 = GET ();
1104               if (ch2 == '*')
1105                 {
1106                   old_state = 3;
1107                   state = -2;
1108                   break;
1109                 }
1110               else
1111                 {
1112                   UNGET (ch2);
1113                 }
1114             } /* bad hack */
1115
1116           if (state == 0 || state == 1) /* Only comment at start of line.  */
1117             {
1118               int startch;
1119
1120               startch = ch;
1121
1122               do
1123                 {
1124                   ch = GET ();
1125                 }
1126               while (ch != EOF && IS_WHITESPACE (ch));
1127               if (ch == EOF)
1128                 {
1129                   as_warn (_("end of file in comment; newline inserted"));
1130                   PUT ('\n');
1131                   break;
1132                 }
1133               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1134                 {
1135                   /* Not a cpp line.  */
1136                   while (ch != EOF && !IS_NEWLINE (ch))
1137                     ch = GET ();
1138                   if (ch == EOF)
1139                     as_warn (_("end of file in comment; newline inserted"));
1140                   state = 0;
1141                   PUT ('\n');
1142                   break;
1143                 }
1144               /* Looks like `# 123 "filename"' from cpp.  */
1145               UNGET (ch);
1146               old_state = 4;
1147               state = -1;
1148               if (scrub_m68k_mri)
1149                 out_string = "\tappline ";
1150               else
1151                 out_string = "\t.appline ";
1152               PUT (*out_string++);
1153               break;
1154             }
1155
1156 #ifdef TC_D10V
1157           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1158              Trap is the only short insn that has a first operand that is
1159              neither register nor label.
1160              We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1.
1161              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1162              already LEX_IS_LINE_COMMENT_START.  However, it is the
1163              only character in line_comment_chars for d10v, hence we
1164              can recognize it as such.  */
1165           /* An alternative approach would be to reset the state to 1 when
1166              we see '||', '<-' or '->', but that seems to be overkill.  */
1167           if (state == 10)
1168             PUT (' ');
1169 #endif
1170           /* We have a line comment character which is not at the
1171              start of a line.  If this is also a normal comment
1172              character, fall through.  Otherwise treat it as a default
1173              character.  */
1174           if (strchr (tc_comment_chars, ch) == NULL
1175               && (! scrub_m68k_mri
1176                   || (ch != '!' && ch != '*')))
1177             goto de_fault;
1178           if (scrub_m68k_mri
1179               && (ch == '!' || ch == '*' || ch == '#')
1180               && state != 1
1181               && state != 10)
1182             goto de_fault;
1183           /* Fall through.  */
1184         case LEX_IS_COMMENT_START:
1185 #if defined TC_ARM && defined OBJ_ELF
1186           /* On the ARM, `@' is the comment character.
1187              Unfortunately this is also a special character in ELF .symver
1188              directives (and .type, though we deal with those another way).
1189              So we check if this line is such a directive, and treat
1190              the character as default if so.  This is a hack.  */
1191           if ((symver_state != NULL) && (*symver_state == 0))
1192             goto de_fault;
1193 #endif
1194 #ifdef WARN_COMMENTS
1195           if (!found_comment)
1196             as_where (&found_comment_file, &found_comment);
1197 #endif
1198           do
1199             {
1200               ch = GET ();
1201             }
1202           while (ch != EOF && !IS_NEWLINE (ch));
1203           if (ch == EOF)
1204             as_warn (_("end of file in comment; newline inserted"));
1205           state = 0;
1206           PUT ('\n');
1207           break;
1208
1209         case LEX_IS_SYMBOL_COMPONENT:
1210           if (state == 10)
1211             {
1212               /* This is a symbol character following another symbol
1213                  character, with whitespace in between.  We skipped
1214                  the whitespace earlier, so output it now.  */
1215               UNGET (ch);
1216               state = 3;
1217               PUT (' ');
1218               break;
1219             }
1220
1221           if (state == 3)
1222             state = 9;
1223
1224           /* This is a common case.  Quickly copy CH and all the
1225              following symbol component or normal characters.  */
1226           if (to + 1 < toend
1227               && mri_state == NULL
1228 #if defined TC_ARM && defined OBJ_ELF
1229               && symver_state == NULL
1230 #endif
1231               )
1232             {
1233               char *s;
1234               int len;
1235
1236               for (s = from; s < fromend; s++)
1237                 {
1238                   int type;
1239
1240                   ch2 = *(unsigned char *) s;
1241                   type = lex[ch2];
1242                   if (type != 0
1243                       && type != LEX_IS_SYMBOL_COMPONENT)
1244                     break;
1245                 }
1246               if (s > from)
1247                 {
1248                   /* Handle the last character normally, for
1249                      simplicity.  */
1250                   --s;
1251                 }
1252               len = s - from;
1253               if (len > (toend - to) - 1)
1254                 len = (toend - to) - 1;
1255               if (len > 0)
1256                 {
1257                   PUT (ch);
1258                   if (len > 8)
1259                     {
1260                       memcpy (to, from, len);
1261                       to += len;
1262                       from += len;
1263                     }
1264                   else
1265                     {
1266                       switch (len)
1267                         {
1268                         case 8: *to++ = *from++;
1269                         case 7: *to++ = *from++;
1270                         case 6: *to++ = *from++;
1271                         case 5: *to++ = *from++;
1272                         case 4: *to++ = *from++;
1273                         case 3: *to++ = *from++;
1274                         case 2: *to++ = *from++;
1275                         case 1: *to++ = *from++;
1276                         }
1277                     }
1278                   ch = GET ();
1279                 }
1280             }
1281
1282           /* Fall through.  */
1283         default:
1284         de_fault:
1285           /* Some relatively `normal' character.  */
1286           if (state == 0)
1287             {
1288               state = 11;       /* Now seeing label definition */
1289             }
1290           else if (state == 1)
1291             {
1292               state = 2;        /* Ditto */
1293             }
1294           else if (state == 9)
1295             {
1296               if (lex[ch] != LEX_IS_SYMBOL_COMPONENT)
1297                 state = 3;
1298             }
1299           else if (state == 10)
1300             {
1301               if (ch == '\\')
1302                 {
1303                   /* Special handling for backslash: a backslash may
1304                      be the beginning of a formal parameter (of a
1305                      macro) following another symbol character, with
1306                      whitespace in between.  If that is the case, we
1307                      output a space before the parameter.  Strictly
1308                      speaking, correct handling depends upon what the
1309                      macro parameter expands into; if the parameter
1310                      expands into something which does not start with
1311                      an operand character, then we don't want to keep
1312                      the space.  We don't have enough information to
1313                      make the right choice, so here we are making the
1314                      choice which is more likely to be correct.  */
1315                   PUT (' ');
1316                 }
1317
1318               state = 3;
1319             }
1320           PUT (ch);
1321           break;
1322         }
1323     }
1324
1325   /*NOTREACHED*/
1326
1327  fromeof:
1328   /* We have reached the end of the input.  */
1329   return to - tostart;
1330
1331  tofull:
1332   /* The output buffer is full.  Save any input we have not yet
1333      processed.  */
1334   if (fromend > from)
1335     {
1336       saved_input = from;
1337       saved_input_len = fromend - from;
1338     }
1339   else
1340     saved_input = NULL;
1341
1342   return to - tostart;
1343 }
1344
1345 /* end of app.c */