gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
   3    1999, 2000, 2001, 2002, 2003, 2006, 2007
   4    Free Software Foundation, Inc.
   5
   6    This file is part of GAS, the GNU Assembler.
   7
   8    GAS is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 3, or (at your option)
  11    any later version.
  12
  13    GAS is distributed in the hope that it will be useful, but WITHOUT
  14    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
  16    License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with GAS; see the file COPYING.  If not, write to the Free
  20    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
  21    02110-1301, USA.  */
  22
  23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
  24 /* App, the assembler pre-processor.  This pre-processor strips out
  25    excess spaces, turns single-quoted characters into a decimal
  26    constant, and turns the # in # <number> <filename> <garbage> into a
  27    .linefile.  This needs better error-handling.  */
  28
  29 #include "as.h"
  30
  31 #if (__STDC__ != 1)  /* Compiler does not claim ISO C conformance.  */
  32 #ifndef const
  33 #define const  /* empty */
  34 #endif
  35 #endif
  36
  37 #ifdef H_TICK_HEX
  38 int enable_h_tick_hex = 0;  /* Nonzero makes do_scrub_begin classify 'h' and 'H' as LEX_IS_H.  */
  39 #endif
  40
  41 #ifdef TC_M68K
  42 /* Whether we are scrubbing in m68k MRI mode.  This is different from
  43    flag_m68k_mri, because the two flags will be affected by the .mri
  44    pseudo-op at different times.  */
  45 static int scrub_m68k_mri;
  46
  47 /* The pseudo-op which switches in and out of MRI mode.  See the
  48    comment in do_scrub_chars.  */
  49 static const char mri_pseudo[] = ".mri 0";
  50 #else
  51 #define scrub_m68k_mri 0  /* Without TC_M68K every test of this name sees the constant 0.  */
  52 #endif
  53
  54 #if defined TC_ARM && defined OBJ_ELF
  55 /* The pseudo-op for which we need to special-case `@' characters.
  56    See the comment in do_scrub_chars.  */
  57 static const char   symver_pseudo[] = ".symver";
  58 static const char * symver_state;  /* NULL, or the next character of symver_pseudo still to be matched.  */
  59 #endif
  60
  61 static char lex[256];  /* Lexical class (a LEX_IS_* value, or 0 for none) of each byte value; filled in by do_scrub_begin.  */
  62 static const char symbol_chars[] =
  63 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  64
  65 #define LEX_IS_SYMBOL_COMPONENT         1
  66 #define LEX_IS_WHITESPACE               2
  67 #define LEX_IS_LINE_SEPARATOR           3
  68 #define LEX_IS_COMMENT_START            4
  69 #define LEX_IS_LINE_COMMENT_START       5
  70 #define LEX_IS_TWOCHAR_COMMENT_1ST      6
  71 #define LEX_IS_STRINGQUOTE              8  /* The value 7 is not assigned in this list.  */
  72 #define LEX_IS_COLON                    9
  73 #define LEX_IS_NEWLINE                  10
  74 #define LEX_IS_ONECHAR_QUOTE            11
  75 #ifdef TC_V850
  76 #define LEX_IS_DOUBLEDASH_1ST           12
  77 #endif
  78 #ifdef TC_M32R
  79 #define DOUBLEBAR_PARALLEL
  80 #endif
  81 #ifdef DOUBLEBAR_PARALLEL
  82 #define LEX_IS_DOUBLEBAR_1ST            13
  83 #endif
  84 #define LEX_IS_PARALLEL_SEPARATOR       14
  85 #ifdef H_TICK_HEX
  86 #define LEX_IS_H                        15
  87 #endif
  88 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)  /* These predicates index lex[] with C directly, so C must be a byte value (0..255), never EOF.  */
  89 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
  90 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
  91 #define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
  92 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
  93 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
  94 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
  95
  96 static int process_escape (int);
  97
  98 /* FIXME-soon: The entire lexer/parser thingy should be
  99    built statically at compile time rather than dynamically
 100    each and every time the assembler is run.  xoxorich.  */
 101
 102 void
 103 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
 104 {
 105   const char *p;
 106   int c;
 107
 108   lex[' '] = LEX_IS_WHITESPACE;
 109   lex['\t'] = LEX_IS_WHITESPACE;
 110   lex['\r'] = LEX_IS_WHITESPACE;
 111   lex['\n'] = LEX_IS_NEWLINE;
 112   lex[':'] = LEX_IS_COLON;
 113
 114 #ifdef TC_M68K
 115   scrub_m68k_mri = m68k_mri;
 116
 117   if (! m68k_mri)
 118 #endif
 119     {
 120       lex['"'] = LEX_IS_STRINGQUOTE;
 121
 122 #if ! defined (TC_HPPA) && ! defined (TC_I370)
 123       /* I370 uses single-quotes to delimit integer, float constants.  */
 124       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 125 #endif
 126
 127 #ifdef SINGLE_QUOTE_STRINGS
 128       lex['\''] = LEX_IS_STRINGQUOTE;
 129 #endif
 130     }
 131
 132   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 133      in state 5 of do_scrub_chars must be changed.  */
 134
 135   /* Note that these override the previous defaults, e.g. if ';' is a
 136      comment char, then it isn't a line separator.  */
 137   for (p = symbol_chars; *p; ++p)
 138     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 139
 140   for (c = 128; c < 256; ++c)
 141     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 142
 143 #ifdef tc_symbol_chars
 144   /* This macro permits the processor to specify all characters which
 145      may appears in an operand.  This will prevent the scrubber from
 146      discarding meaningful whitespace in certain cases.  The i386
 147      backend uses this to support prefixes, which can confuse the
 148      scrubber as to whether it is parsing operands or opcodes.  */
 149   for (p = tc_symbol_chars; *p; ++p)
 150     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 151 #endif
 152
 153   /* The m68k backend wants to be able to change comment_chars.  */
 154 #ifndef tc_comment_chars
 155 #define tc_comment_chars comment_chars
 156 #endif
 157   for (p = tc_comment_chars; *p; p++)
 158     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 159
 160   for (p = line_comment_chars; *p; p++)
 161     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 162
 163   for (p = line_separator_chars; *p; p++)
 164     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 165
 166 #ifdef tc_parallel_separator_chars
 167   /* This macro permits the processor to specify all characters which
 168      separate parallel insns on the same line.  */
 169   for (p = tc_parallel_separator_chars; *p; p++)
 170     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 171 #endif
 172
 173   /* Only allow slash-star comments if slash is not in use.
 174      FIXME: This isn't right.  We should always permit them.  */
 175   if (lex['/'] == 0)
 176     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 177
 178 #ifdef TC_M68K
 179   if (m68k_mri)
 180     {
 181       lex['\''] = LEX_IS_STRINGQUOTE;
 182       lex[';'] = LEX_IS_COMMENT_START;
 183       lex['*'] = LEX_IS_LINE_COMMENT_START;
 184       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 185          then it can't be used in an expression.  */
 186       lex['!'] = LEX_IS_LINE_COMMENT_START;
 187     }
 188 #endif
 189
 190 #ifdef TC_V850
 191   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 192 #endif
 193 #ifdef DOUBLEBAR_PARALLEL
 194   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 195 #endif
 196 #ifdef TC_D30V
 197   /* Must do this is we want VLIW instruction with "->" or "<-".  */
 198   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 199 #endif
 200
 201 #ifdef H_TICK_HEX
 202   if (enable_h_tick_hex)
 203     {
 204       lex['h'] = LEX_IS_H;
 205       lex['H'] = LEX_IS_H;
 206     }
 207 #endif
 208 }
 209
 210 /* Saved state of the scrubber.  */
 211 static int state;  /* Current state of the do_scrub_chars state machine.  */
 212 static int old_state;  /* State to resume once the current special state (-1, -2, 5) finishes.  */
 213 static char *out_string;  /* Text still to be emitted while in state -1.  */
 214 static char out_buf[20];  /* NOTE(review): presumably the storage out_string points into; confirm.  */
 215 static int add_newlines;  /* Count of newlines consumed without being output (inside comments and continued strings).  */
 216 static char *saved_input;  /* Unconsumed input to resume from on the next do_scrub_chars call, or NULL.  */
 217 static int saved_input_len;  /* Number of bytes available at saved_input.  */
 218 static char input_buffer[32 * 1024];  /* Buffer the `get' callback of do_scrub_chars is asked to fill.  */
 219 static const char *mri_state;  /* NULL, or the next character of mri_pseudo still to be matched.  */
 220 static char mri_last_ch;  /* Last character matched against mri_pseudo.  */
 221
 222 /* Data structure for saving the state of app across #include's.  Note that
 223    app is called asynchronously to the parsing of the .include's, so our
 224    state at the time .include is interpreted is completely unrelated.
 225    That's why we have to save it all.  */
 226
 227 struct app_save  /* Snapshot of the file-scope scrubber variables, filled in by app_push.  */
 228 {
 229   int          state;
 230   int          old_state;
 231   char *       out_string;
 232   char         out_buf[sizeof (out_buf)];
 233   int          add_newlines;
 234   char *       saved_input;  /* Heap copy made by app_push and freed by app_pop, or NULL.  */
 235   int          saved_input_len;  /* Length of saved_input; only meaningful when that is non-NULL.  */
 236 #ifdef TC_M68K
 237   int          scrub_m68k_mri;
 238 #endif
 239   const char * mri_state;
 240   char         mri_last_ch;
 241 #if defined TC_ARM && defined OBJ_ELF
 242   const char * symver_state;
 243 #endif
 244 };
 245
 246 char *
 247 app_push (void)
 248 {
 249   register struct app_save *saved;
 250
 251   saved = (struct app_save *) xmalloc (sizeof (*saved));
 252   saved->state = state;
 253   saved->old_state = old_state;
 254   saved->out_string = out_string;
 255   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 256   saved->add_newlines = add_newlines;
 257   if (saved_input == NULL)
 258     saved->saved_input = NULL;
 259   else
 260     {
 261       saved->saved_input = xmalloc (saved_input_len);
 262       memcpy (saved->saved_input, saved_input, saved_input_len);
 263       saved->saved_input_len = saved_input_len;
 264     }
 265 #ifdef TC_M68K
 266   saved->scrub_m68k_mri = scrub_m68k_mri;
 267 #endif
 268   saved->mri_state = mri_state;
 269   saved->mri_last_ch = mri_last_ch;
 270 #if defined TC_ARM && defined OBJ_ELF
 271   saved->symver_state = symver_state;
 272 #endif
 273
 274   /* do_scrub_begin() is not useful, just wastes time.  */
 275
 276   state = 0;
 277   saved_input = NULL;
 278
 279   return (char *) saved;
 280 }
 281
 282 void
 283 app_pop (char *arg)
 284 {
 285   register struct app_save *saved = (struct app_save *) arg;
 286
 287   /* There is no do_scrub_end ().  */
 288   state = saved->state;
 289   old_state = saved->old_state;
 290   out_string = saved->out_string;
 291   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 292   add_newlines = saved->add_newlines;
 293   if (saved->saved_input == NULL)
 294     saved_input = NULL;
 295   else
 296     {
 297       assert (saved->saved_input_len <= (int) (sizeof input_buffer));
 298       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 299       saved_input = input_buffer;
 300       saved_input_len = saved->saved_input_len;
 301       free (saved->saved_input);
 302     }
 303 #ifdef TC_M68K
 304   scrub_m68k_mri = saved->scrub_m68k_mri;
 305 #endif
 306   mri_state = saved->mri_state;
 307   mri_last_ch = saved->mri_last_ch;
 308 #if defined TC_ARM && defined OBJ_ELF
 309   symver_state = saved->symver_state;
 310 #endif
 311
 312   free (arg);
 313 }
 314
 315 /* @@ This assumes that \n &c are the same on host and target.  This is not
 316    necessarily true.  */
 317
 318 static int
 319 process_escape (int ch)
 320 {
 321   switch (ch)
 322     {
 323     case 'b':
 324       return '\b';
 325     case 'f':
 326       return '\f';
 327     case 'n':
 328       return '\n';
 329     case 'r':
 330       return '\r';
 331     case 't':
 332       return '\t';
 333     case '\'':
 334       return '\'';
 335     case '"':
 336       return '\"';
 337     default:
 338       return ch;
 339     }
 340 }
 341
 342 /* This function is called to process input characters.  The GET
 343    parameter is used to retrieve more input characters.  GET should
 344    set its parameter to point to a buffer, and return the length of
 345    the buffer; it should return 0 at end of file.  The scrubbed output
 346    characters are put into the buffer starting at TOSTART; the TOSTART
 347    buffer is TOLEN bytes in length.  The function returns the number
 348    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 349    end of file was seen.  This function is arranged as a state
 350    machine, and saves its state so that it may return at any point.
 351    This is the way the old code used to work.  */
 352
 353 int
 354 do_scrub_chars (int (*get) (char *, int), char *tostart, int tolen)
 355 {
 356   char *to = tostart;
 357   char *toend = tostart + tolen;
 358   char *from;
 359   char *fromend;
 360   int fromlen;
 361   register int ch, ch2 = 0;
 362   /* Character that started the string we're working on.  */
 363   static char quotechar;
 364
 365   /*State 0: beginning of normal line
 366           1: After first whitespace on line (flush more white)
 367           2: After first non-white (opcode) on line (keep 1white)
 368           3: after second white on line (into operands) (flush white)
 369           4: after putting out a .linefile, put out digits
 370           5: parsing a string, then go to old-state
 371           6: putting out \ escape in a "d string.
 372           7: no longer used
 373           8: no longer used
 374           9: After seeing symbol char in state 3 (keep 1white after symchar)
 375          10: After seeing whitespace in state 9 (keep white before symchar)
 376          11: After seeing a symbol character in state 0 (eg a label definition)
 377          -1: output string in out_string and go to the state in old_state
 378          -2: flush text until a '*' '/' is seen, then go to state old_state
 379 #ifdef TC_V850
 380          12: After seeing a dash, looking for a second dash as a start
 381              of comment.
 382 #endif
 383 #ifdef DOUBLEBAR_PARALLEL
 384          13: After seeing a vertical bar, looking for a second
 385              vertical bar as a parallel expression separator.
 386 #endif
 387 #ifdef TC_IA64
 388          14: After seeing a `(' at state 0, looking for a `)' as
 389              predicate.
 390          15: After seeing a `(' at state 1, looking for a `)' as
 391              predicate.
 392 #endif
 393 #ifdef TC_Z80
 394          16: After seeing an 'a' or an 'A' at the start of a symbol
 395          17: After seeing an 'f' or an 'F' in state 16
 396 #endif
 397           */
 398
 399   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 400      constructs like ``.loc 1 20''.  This was turning into ``.loc
 401      120''.  States 9 and 10 ensure that a space is never dropped in
 402      between characters which could appear in an identifier.  Ian
 403      Taylor, ian@cygnus.com.
 404
 405      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 406      correctly on the PA (and any other target where colons are optional).
 407      Jeff Law, law@cs.utah.edu.
 408
 409      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 410      get squashed into "cmp r1,r2||trap#1", with the all important space
 411      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 412
 413   /* This macro gets the next input character.  */
 414
 415 #define GET()                                                   \
 416   (from < fromend                                               \
 417    ? * (unsigned char *) (from++)                               \
 418    : (saved_input = NULL,                                       \
 419       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 420       from = input_buffer,                                      \
 421       fromend = from + fromlen,                                 \
 422       (fromlen == 0                                             \
 423        ? EOF                                                    \
 424        : * (unsigned char *) (from++))))
 425
 426   /* This macro pushes a character back on the input stream.  */
 427
 428 #define UNGET(uch) (*--from = (uch))
 429
 430   /* This macro puts a character into the output buffer.  If this
 431      character fills the output buffer, this macro jumps to the label
 432      TOFULL.  We use this rather ugly approach because we need to
 433      handle two different termination conditions: EOF on the input
 434      stream, and a full output buffer.  It would be simpler if we
 435      always read in the entire input stream before processing it, but
 436      I don't want to make such a significant change to the assembler's
 437      memory usage.  */
 438
 439 #define PUT(pch)                                \
 440   do                                            \
 441     {                                           \
 442       *to++ = (pch);                            \
 443       if (to >= toend)                          \
 444         goto tofull;                            \
 445     }                                           \
 446   while (0)
 447
 448   if (saved_input != NULL)
 449     {
 450       from = saved_input;
 451       fromend = from + saved_input_len;
 452     }
 453   else
 454     {
 455       fromlen = (*get) (input_buffer, sizeof input_buffer);
 456       if (fromlen == 0)
 457         return 0;
 458       from = input_buffer;
 459       fromend = from + fromlen;
 460     }
 461
 462   while (1)
 463     {
 464       /* The cases in this switch end with continue, in order to
 465          branch back to the top of this while loop and generate the
 466          next output character in the appropriate state.  */
 467       switch (state)
 468         {
 469         case -1:
 470           ch = *out_string++;
 471           if (*out_string == '\0')
 472             {
 473               state = old_state;
 474               old_state = 3;
 475             }
 476           PUT (ch);
 477           continue;
 478
 479         case -2:
 480           for (;;)
 481             {
 482               do
 483                 {
 484                   ch = GET ();
 485
 486                   if (ch == EOF)
 487                     {
 488                       as_warn (_("end of file in comment"));
 489                       goto fromeof;
 490                     }
 491
 492                   if (ch == '\n')
 493                     PUT ('\n');
 494                 }
 495               while (ch != '*');
 496
 497               while ((ch = GET ()) == '*')
 498                 ;
 499
 500               if (ch == EOF)
 501                 {
 502                   as_warn (_("end of file in comment"));
 503                   goto fromeof;
 504                 }
 505
 506               if (ch == '/')
 507                 break;
 508
 509               UNGET (ch);
 510             }
 511
 512           state = old_state;
 513           UNGET (' ');
 514           continue;
 515
 516         case 4:
 517           ch = GET ();
 518           if (ch == EOF)
 519             goto fromeof;
 520           else if (ch >= '0' && ch <= '9')
 521             PUT (ch);
 522           else
 523             {
 524               while (ch != EOF && IS_WHITESPACE (ch))
 525                 ch = GET ();
 526               if (ch == '"')
 527                 {
 528                   quotechar = ch;
 529                   state = 5;
 530                   old_state = 3;
 531                   PUT (ch);
 532                 }
 533               else
 534                 {
 535                   while (ch != EOF && ch != '\n')
 536                     ch = GET ();
 537                   state = 0;
 538                   PUT (ch);
 539                 }
 540             }
 541           continue;
 542
 543         case 5:
 544           /* We are going to copy everything up to a quote character,
 545              with special handling for a backslash.  We try to
 546              optimize the copying in the simple case without using the
 547              GET and PUT macros.  */
 548           {
 549             char *s;
 550             int len;
 551
 552             for (s = from; s < fromend; s++)
 553               {
 554                 ch = *s;
 555                 if (ch == '\\'
 556                     || ch == quotechar
 557                     || ch == '\n')
 558                   break;
 559               }
 560             len = s - from;
 561             if (len > toend - to)
 562               len = toend - to;
 563             if (len > 0)
 564               {
 565                 memcpy (to, from, len);
 566                 to += len;
 567                 from += len;
 568                 if (to >= toend)
 569                   goto tofull;
 570               }
 571           }
 572
 573           ch = GET ();
 574           if (ch == EOF)
 575             {
 576               /* This buffer is here specifically so
 577                  that the UNGET below will work.  */
 578               static char one_char_buf[1];
 579
 580               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 581               state = old_state;
 582               from = fromend = one_char_buf + 1;
 583               fromlen = 1;
 584               UNGET ('\n');
 585               PUT (quotechar);
 586             }
 587           else if (ch == quotechar)
 588             {
 589               state = old_state;
 590               PUT (ch);
 591             }
 592 #ifndef NO_STRING_ESCAPES
 593           else if (ch == '\\')
 594             {
 595               state = 6;
 596               PUT (ch);
 597             }
 598 #endif
 599           else if (scrub_m68k_mri && ch == '\n')
 600             {
 601               /* Just quietly terminate the string.  This permits lines like
 602                    bne  label   loop if we haven't reach end yet.  */
 603               state = old_state;
 604               UNGET (ch);
 605               PUT ('\'');
 606             }
 607           else
 608             {
 609               PUT (ch);
 610             }
 611           continue;
 612
 613         case 6:
 614           state = 5;
 615           ch = GET ();
 616           switch (ch)
 617             {
 618               /* Handle strings broken across lines, by turning '\n' into
 619                  '\\' and 'n'.  */
 620             case '\n':
 621               UNGET ('n');
 622               add_newlines++;
 623               PUT ('\\');
 624               continue;
 625
 626             case EOF:
 627               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 628               PUT (quotechar);
 629               continue;
 630
 631             case '"':
 632             case '\\':
 633             case 'b':
 634             case 'f':
 635             case 'n':
 636             case 'r':
 637             case 't':
 638             case 'v':
 639             case 'x':
 640             case 'X':
 641             case '0':
 642             case '1':
 643             case '2':
 644             case '3':
 645             case '4':
 646             case '5':
 647             case '6':
 648             case '7':
 649               break;
 650
 651             default:
 652 #ifdef ONLY_STANDARD_ESCAPES
 653               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 654 #endif
 655               break;
 656             }
 657           PUT (ch);
 658           continue;
 659
 660 #ifdef DOUBLEBAR_PARALLEL
 661         case 13:
 662           ch = GET ();
 663           if (ch != '|')
 664             abort ();
 665
 666           /* Reset back to state 1 and pretend that we are parsing a
 667              line from just after the first white space.  */
 668           state = 1;
 669           PUT ('|');
 670           continue;
 671 #endif
 672 #ifdef TC_Z80
 673         case 16:
 674           /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
 675           ch = GET ();
 676           if (ch == 'f' || ch == 'F')
 677             {
 678               state = 17;
 679               PUT (ch);
 680             }
 681           else
 682             {
 683               state = 9;
 684               break;
 685             }
 686         case 17:
 687           /* We have seen "af" at the start of a symbol,
 688              a ' here is a part of that symbol.  */
 689           ch = GET ();
 690           state = 9;
 691           if (ch == '\'')
 692             /* Change to avoid warning about unclosed string.  */
 693             PUT ('`');
 694           else if (ch != EOF)
 695             UNGET (ch);
 696           break;
 697 #endif
 698         }
 699
 700       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
 701
 702       /* flushchar: */
 703       ch = GET ();
 704
 705 #ifdef TC_IA64
 706       if (ch == '(' && (state == 0 || state == 1))
 707         {
 708           state += 14;
 709           PUT (ch);
 710           continue;
 711         }
 712       else if (state == 14 || state == 15)
 713         {
 714           if (ch == ')')
 715             {
 716               state -= 14;
 717               PUT (ch);
 718               ch = GET ();
 719             }
 720           else
 721             {
 722               PUT (ch);
 723               continue;
 724             }
 725         }
 726 #endif
 727
 728     recycle:
 729
 730 #if defined TC_ARM && defined OBJ_ELF
 731       /* We need to watch out for .symver directives.  See the comment later
 732          in this function.  */
 733       if (symver_state == NULL)
 734         {
 735           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 736             symver_state = symver_pseudo + 1;
 737         }
 738       else
 739         {
 740           /* We advance to the next state if we find the right
 741              character.  */
 742           if (ch != '\0' && (*symver_state == ch))
 743             ++symver_state;
 744           else if (*symver_state != '\0')
 745             /* We did not get the expected character, or we didn't
 746                get a valid terminating character after seeing the
 747                entire pseudo-op, so we must go back to the beginning.  */
 748             symver_state = NULL;
 749           else
 750             {
 751               /* We've read the entire pseudo-op.  If this is the end
 752                  of the line, go back to the beginning.  */
 753               if (IS_NEWLINE (ch))
 754                 symver_state = NULL;
 755             }
 756         }
 757 #endif /* TC_ARM && OBJ_ELF */
 758
 759 #ifdef TC_M68K
 760       /* We want to have pseudo-ops which control whether we are in
 761          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 762          the scrubber, that means that we need a special purpose
 763          recognizer here.  */
 764       if (mri_state == NULL)
 765         {
 766           if ((state == 0 || state == 1)
 767               && ch == mri_pseudo[0])
 768             mri_state = mri_pseudo + 1;
 769         }
 770       else
 771         {
 772           /* We advance to the next state if we find the right
 773              character, or if we need a space character and we get any
 774              whitespace character, or if we need a '0' and we get a
 775              '1' (this is so that we only need one state to handle
 776              ``.mri 0'' and ``.mri 1'').  */
 777           if (ch != '\0'
 778               && (*mri_state == ch
 779                   || (*mri_state == ' '
 780                       && lex[ch] == LEX_IS_WHITESPACE)
 781                   || (*mri_state == '0'
 782                       && ch == '1')))
 783             {
 784               mri_last_ch = ch;
 785               ++mri_state;
 786             }
 787           else if (*mri_state != '\0'
 788                    || (lex[ch] != LEX_IS_WHITESPACE
 789                        && lex[ch] != LEX_IS_NEWLINE))
 790             {
 791               /* We did not get the expected character, or we didn't
 792                  get a valid terminating character after seeing the
 793                  entire pseudo-op, so we must go back to the
 794                  beginning.  */
 795               mri_state = NULL;
 796             }
 797           else
 798             {
 799               /* We've read the entire pseudo-op.  mri_last_ch is
 800                  either '0' or '1' indicating whether to enter or
 801                  leave MRI mode.  */
 802               do_scrub_begin (mri_last_ch == '1');
 803               mri_state = NULL;
 804
 805               /* We continue handling the character as usual.  The
 806                  main gas reader must also handle the .mri pseudo-op
 807                  to control expression parsing and the like.  */
 808             }
 809         }
 810 #endif
 811
 812       if (ch == EOF)
 813         {
 814           if (state != 0)
 815             {
 816               as_warn (_("end of file not at end of a line; newline inserted"));
 817               state = 0;
 818               PUT ('\n');
 819             }
 820           goto fromeof;
 821         }
 822
 823       switch (lex[ch])
 824         {
 825         case LEX_IS_WHITESPACE:
 826           do
 827             {
 828               ch = GET ();
 829             }
 830           while (ch != EOF && IS_WHITESPACE (ch));
 831           if (ch == EOF)
 832             goto fromeof;
 833
 834           if (state == 0)
 835             {
 836               /* Preserve a single whitespace character at the
 837                  beginning of a line.  */
 838               state = 1;
 839               UNGET (ch);
 840               PUT (' ');
 841               break;
 842             }
 843
 844 #ifdef KEEP_WHITE_AROUND_COLON
 845           if (lex[ch] == LEX_IS_COLON)
 846             {
 847               /* Only keep this white if there's no white *after* the
 848                  colon.  */
 849               ch2 = GET ();
 850               if (ch2 != EOF)
 851                 UNGET (ch2);
 852               if (!IS_WHITESPACE (ch2))
 853                 {
 854                   state = 9;
 855                   UNGET (ch);
 856                   PUT (' ');
 857                   break;
 858                 }
 859             }
 860 #endif
 861           if (IS_COMMENT (ch)
 862               || ch == '/'
 863               || IS_LINE_SEPARATOR (ch)
 864               || IS_PARALLEL_SEPARATOR (ch))
 865             {
 866               if (scrub_m68k_mri)
 867                 {
 868                   /* In MRI mode, we keep these spaces.  */
 869                   UNGET (ch);
 870                   PUT (' ');
 871                   break;
 872                 }
 873               goto recycle;
 874             }
 875
 876           /* If we're in state 2 or 11, we've seen a non-white
 877              character followed by whitespace.  If the next character
 878              is ':', this is whitespace after a label name which we
 879              normally must ignore.  In MRI mode, though, spaces are
 880              not permitted between the label and the colon.  */
 881           if ((state == 2 || state == 11)
 882               && lex[ch] == LEX_IS_COLON
 883               && ! scrub_m68k_mri)
 884             {
 885               state = 1;
 886               PUT (ch);
 887               break;
 888             }
 889
 890           switch (state)
 891             {
 892             case 1:
 893               /* We can arrive here if we leave a leading whitespace
 894                  character at the beginning of a line.  */
 895               goto recycle;
 896             case 2:
 897               state = 3;
 898               if (to + 1 < toend)
 899                 {
 900                   /* Optimize common case by skipping UNGET/GET.  */
 901                   PUT (' ');    /* Sp after opco */
 902                   goto recycle;
 903                 }
 904               UNGET (ch);
 905               PUT (' ');
 906               break;
 907             case 3:
 908               if (scrub_m68k_mri)
 909                 {
 910                   /* In MRI mode, we keep these spaces.  */
 911                   UNGET (ch);
 912                   PUT (' ');
 913                   break;
 914                 }
 915               goto recycle;     /* Sp in operands */
 916             case 9:
 917             case 10:
 918               if (scrub_m68k_mri)
 919                 {
 920                   /* In MRI mode, we keep these spaces.  */
 921                   state = 3;
 922                   UNGET (ch);
 923                   PUT (' ');
 924                   break;
 925                 }
 926               state = 10;       /* Sp after symbol char */
 927               goto recycle;
 928             case 11:
 929               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 930                 state = 1;
 931               else
 932                 {
 933                   /* We know that ch is not ':', since we tested that
 934                      case above.  Therefore this is not a label, so it
 935                      must be the opcode, and we've just seen the
 936                      whitespace after it.  */
 937                   state = 3;
 938                 }
 939               UNGET (ch);
 940               PUT (' ');        /* Sp after label definition.  */
 941               break;
 942             default:
 943               BAD_CASE (state);
 944             }
 945           break;
 946
 947         case LEX_IS_TWOCHAR_COMMENT_1ST:
 948           ch2 = GET ();
 949           if (ch2 == '*')
 950             {
 951               for (;;)
 952                 {
 953                   do
 954                     {
 955                       ch2 = GET ();
 956                       if (ch2 != EOF && IS_NEWLINE (ch2))
 957                         add_newlines++;
 958                     }
 959                   while (ch2 != EOF && ch2 != '*');
 960
 961                   while (ch2 == '*')
 962                     ch2 = GET ();
 963
 964                   if (ch2 == EOF || ch2 == '/')
 965                     break;
 966
 967                   /* This UNGET will ensure that we count newlines
 968                      correctly.  */
 969                   UNGET (ch2);
 970                 }
 971
 972               if (ch2 == EOF)
 973                 as_warn (_("end of file in multiline comment"));
 974
 975               ch = ' ';
 976               goto recycle;
 977             }
 978 #ifdef DOUBLESLASH_LINE_COMMENTS
 979           else if (ch2 == '/')
 980             {
 981               do
 982                 {
 983                   ch = GET ();
 984                 }
 985               while (ch != EOF && !IS_NEWLINE (ch));
 986               if (ch == EOF)
 987                 as_warn ("end of file in comment; newline inserted");
 988               state = 0;
 989               PUT ('\n');
 990               break;
 991             }
 992 #endif
 993           else
 994             {
 995               if (ch2 != EOF)
 996                 UNGET (ch2);
 997               if (state == 9 || state == 10)
 998                 state = 3;
 999               PUT (ch);
1000             }
1001           break;
1002
1003         case LEX_IS_STRINGQUOTE:
1004           quotechar = ch;
1005           if (state == 10)
1006             {
1007               /* Preserve the whitespace in foo "bar".  */
1008               UNGET (ch);
1009               state = 3;
1010               PUT (' ');
1011
1012               /* PUT didn't jump out.  We could just break, but we
1013                  know what will happen, so optimize a bit.  */
1014               ch = GET ();
1015               old_state = 3;
1016             }
1017           else if (state == 9)
1018             old_state = 3;
1019           else
1020             old_state = state;
1021           state = 5;
1022           PUT (ch);
1023           break;
1024
1025 #ifndef IEEE_STYLE
1026         case LEX_IS_ONECHAR_QUOTE:
1027 #ifdef H_TICK_HEX
1028           if (state == 9 && enable_h_tick_hex)
1029             {
1030               char c;
1031
1032               c = GET ();
1033               as_warn ("'%c found after symbol", c);
1034               UNGET (c);
1035             }
1036 #endif
1037           if (state == 10)
1038             {
1039               /* Preserve the whitespace in foo 'b'.  */
1040               UNGET (ch);
1041               state = 3;
1042               PUT (' ');
1043               break;
1044             }
1045           ch = GET ();
1046           if (ch == EOF)
1047             {
1048               as_warn (_("end of file after a one-character quote; \\0 inserted"));
1049               ch = 0;
1050             }
1051           if (ch == '\\')
1052             {
1053               ch = GET ();
1054               if (ch == EOF)
1055                 {
1056                   as_warn (_("end of file in escape character"));
1057                   ch = '\\';
1058                 }
1059               else
1060                 ch = process_escape (ch);
1061             }
1062           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1063
1064           /* None of these 'x constants for us.  We want 'x'.  */
1065           if ((ch = GET ()) != '\'')
1066             {
1067 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1068               as_warn (_("missing close quote; (assumed)"));
1069 #else
1070               if (ch != EOF)
1071                 UNGET (ch);
1072 #endif
1073             }
1074           if (strlen (out_buf) == 1)
1075             {
1076               PUT (out_buf[0]);
1077               break;
1078             }
1079           if (state == 9)
1080             old_state = 3;
1081           else
1082             old_state = state;
1083           state = -1;
1084           out_string = out_buf;
1085           PUT (*out_string++);
1086           break;
1087 #endif
1088
1089         case LEX_IS_COLON:
1090 #ifdef KEEP_WHITE_AROUND_COLON
1091           state = 9;
1092 #else
1093           if (state == 9 || state == 10)
1094             state = 3;
1095           else if (state != 3)
1096             state = 1;
1097 #endif
1098           PUT (ch);
1099           break;
1100
1101         case LEX_IS_NEWLINE:
1102           /* Roll out a bunch of newlines from inside comments, etc.  */
1103           if (add_newlines)
1104             {
1105               --add_newlines;
1106               UNGET (ch);
1107             }
1108           /* Fall through.  */
1109
1110         case LEX_IS_LINE_SEPARATOR:
1111           state = 0;
1112           PUT (ch);
1113           break;
1114
1115         case LEX_IS_PARALLEL_SEPARATOR:
1116           state = 1;
1117           PUT (ch);
1118           break;
1119
1120 #ifdef TC_V850
1121         case LEX_IS_DOUBLEDASH_1ST:
1122           ch2 = GET ();
1123           if (ch2 != '-')
1124             {
1125               if (ch2 != EOF)
1126                 UNGET (ch2);
1127               goto de_fault;
1128             }
1129           /* Read and skip to end of line.  */
1130           do
1131             {
1132               ch = GET ();
1133             }
1134           while (ch != EOF && ch != '\n');
1135
1136           if (ch == EOF)
1137             as_warn (_("end of file in comment; newline inserted"));
1138
1139           state = 0;
1140           PUT ('\n');
1141           break;
1142 #endif
1143 #ifdef DOUBLEBAR_PARALLEL
1144         case LEX_IS_DOUBLEBAR_1ST:
1145           ch2 = GET ();
1146           if (ch2 != EOF)
1147             UNGET (ch2);
1148           if (ch2 != '|')
1149             goto de_fault;
1150
1151           /* Handle '||' in two states as invoking PUT twice might
1152              result in the first one jumping out of this loop.  We'd
1153              then lose track of the state and one '|' char.  */
1154           state = 13;
1155           PUT ('|');
1156           break;
1157 #endif
1158         case LEX_IS_LINE_COMMENT_START:
1159           /* FIXME-someday: The two character comment stuff was badly
1160              thought out.  On i386, we want '/' as line comment start
1161              AND we want C style comments.  hence this hack.  The
1162              whole lexical process should be reworked.  xoxorich.  */
1163           if (ch == '/')
1164             {
1165               ch2 = GET ();
1166               if (ch2 == '*')
1167                 {
1168                   old_state = 3;
1169                   state = -2;
1170                   break;
1171                 }
1172               else
1173                 {
1174                   UNGET (ch2);
1175                 }
1176             }
1177
1178           if (state == 0 || state == 1) /* Only comment at start of line.  */
1179             {
1180               int startch;
1181
1182               startch = ch;
1183
1184               do
1185                 {
1186                   ch = GET ();
1187                 }
1188               while (ch != EOF && IS_WHITESPACE (ch));
1189
1190               if (ch == EOF)
1191                 {
1192                   as_warn (_("end of file in comment; newline inserted"));
1193                   PUT ('\n');
1194                   break;
1195                 }
1196
1197               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1198                 {
1199                   /* Not a cpp line.  */
1200                   while (ch != EOF && !IS_NEWLINE (ch))
1201                     ch = GET ();
1202                   if (ch == EOF)
1203                     as_warn (_("end of file in comment; newline inserted"));
1204                   state = 0;
1205                   PUT ('\n');
1206                   break;
1207                 }
1208               /* Looks like `# 123 "filename"' from cpp.  */
1209               UNGET (ch);
1210               old_state = 4;
1211               state = -1;
1212               if (scrub_m68k_mri)
1213                 out_string = "\tlinefile ";
1214               else
1215                 out_string = "\t.linefile ";
1216               PUT (*out_string++);
1217               break;
1218             }
1219
1220 #ifdef TC_D10V
1221           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1222              Trap is the only short insn that has a first operand that is
1223              neither register nor label.
1224              We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1 .
1225              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1226              already LEX_IS_LINE_COMMENT_START.  However, it is the
1227              only character in line_comment_chars for d10v, hence we
1228              can recognize it as such.  */
1229           /* An alternative approach would be to reset the state to 1 when
1230              we see '||', '<-' or '->', but that seems to be overkill.  */
1231           if (state == 10)
1232             PUT (' ');
1233 #endif
1234           /* We have a line comment character which is not at the
1235              start of a line.  If this is also a normal comment
1236              character, fall through.  Otherwise treat it as a default
1237              character.  */
1238           if (strchr (tc_comment_chars, ch) == NULL
1239               && (! scrub_m68k_mri
1240                   || (ch != '!' && ch != '*')))
1241             goto de_fault;
1242           if (scrub_m68k_mri
1243               && (ch == '!' || ch == '*' || ch == '#')
1244               && state != 1
1245               && state != 10)
1246             goto de_fault;
1247           /* Fall through.  */
1248         case LEX_IS_COMMENT_START:
1249 #if defined TC_ARM && defined OBJ_ELF
1250           /* On the ARM, `@' is the comment character.
1251              Unfortunately this is also a special character in ELF .symver
1252              directives (and .type, though we deal with those another way).
1253              So we check if this line is such a directive, and treat
1254              the character as default if so.  This is a hack.  */
1255           if ((symver_state != NULL) && (*symver_state == 0))
1256             goto de_fault;
1257 #endif
1258
1259 #ifdef TC_ARM
1260           /* For the ARM, care is needed not to damage occurrences of \@
1261              by stripping the @ onwards.  Yuck.  */
1262           if (to > tostart && *(to - 1) == '\\')
1263             /* Do not treat the @ as a start-of-comment.  */
1264             goto de_fault;
1265 #endif
1266
1267 #ifdef WARN_COMMENTS
1268           if (!found_comment)
1269             as_where (&found_comment_file, &found_comment);
1270 #endif
1271           do
1272             {
1273               ch = GET ();
1274             }
1275           while (ch != EOF && !IS_NEWLINE (ch));
1276           if (ch == EOF)
1277             as_warn (_("end of file in comment; newline inserted"));
1278           state = 0;
1279           PUT ('\n');
1280           break;
1281
1282 #ifdef H_TICK_HEX
1283         case LEX_IS_H:
1284           /* Look for strings like H'[0-9A-Fa-f] and if found, replace
1285              the H' with 0x to make them gas-style hex characters.  */
1286           if (enable_h_tick_hex)
1287             {
1288               char quot;
1289
1290               quot = GET ();
1291               if (quot == '\'')
1292                 {
1293                   UNGET ('x');
1294                   ch = '0';
1295                 }
1296               else
1297                 UNGET (quot);
1298             }
1299           /* FALL THROUGH */
1300 #endif
1301
1302         case LEX_IS_SYMBOL_COMPONENT:
1303           if (state == 10)
1304             {
1305               /* This is a symbol character following another symbol
1306                  character, with whitespace in between.  We skipped
1307                  the whitespace earlier, so output it now.  */
1308               UNGET (ch);
1309               state = 3;
1310               PUT (' ');
1311               break;
1312             }
1313
1314 #ifdef TC_Z80
1315           /* "af'" is a symbol containing '\''.  */
1316           if (state == 3 && (ch == 'a' || ch == 'A'))
1317             {
1318               state = 16;
1319               PUT (ch);
1320               ch = GET ();
1321               if (ch == 'f' || ch == 'F')
1322                 {
1323                   state = 17;
1324                   PUT (ch);
1325                   break;
1326                 }
1327               else
1328                 {
1329                   state = 9;
1330                   if (!IS_SYMBOL_COMPONENT (ch))
1331                     {
1332                       if (ch != EOF)
1333                         UNGET (ch);
1334                       break;
1335                     }
1336                 }
1337             }
1338 #endif
1339           if (state == 3)
1340             state = 9;
1341
1342           /* This is a common case.  Quickly copy CH and all the
1343              following symbol component or normal characters.  */
1344           if (to + 1 < toend
1345               && mri_state == NULL
1346 #if defined TC_ARM && defined OBJ_ELF
1347               && symver_state == NULL
1348 #endif
1349               )
1350             {
1351               char *s;
1352               int len;
1353
1354               for (s = from; s < fromend; s++)
1355                 {
1356                   int type;
1357
1358                   ch2 = *(unsigned char *) s;
1359                   type = lex[ch2];
1360                   if (type != 0
1361                       && type != LEX_IS_SYMBOL_COMPONENT)
1362                     break;
1363                 }
1364
1365               if (s > from)
1366                 /* Handle the last character normally, for
1367                    simplicity.  */
1368                 --s;
1369
1370               len = s - from;
1371
1372               if (len > (toend - to) - 1)
1373                 len = (toend - to) - 1;
1374
1375               if (len > 0)
1376                 {
1377                   PUT (ch);
1378                   memcpy (to, from, len);
1379                   to += len;
1380                   from += len;
1381                   if (to >= toend)
1382                     goto tofull;
1383                   ch = GET ();
1384                 }
1385             }
1386
1387           /* Fall through.  */
1388         default:
1389         de_fault:
1390           /* Some relatively `normal' character.  */
1391           if (state == 0)
1392             {
1393               state = 11;       /* Now seeing label definition.  */
1394             }
1395           else if (state == 1)
1396             {
1397               state = 2;        /* Ditto.  */
1398             }
1399           else if (state == 9)
1400             {
1401               if (!IS_SYMBOL_COMPONENT (ch))
1402                 state = 3;
1403             }
1404           else if (state == 10)
1405             {
1406               if (ch == '\\')
1407                 {
1408                   /* Special handling for backslash: a backslash may
1409                      be the beginning of a formal parameter (of a
1410                      macro) following another symbol character, with
1411                      whitespace in between.  If that is the case, we
1412                      output a space before the parameter.  Strictly
1413                      speaking, correct handling depends upon what the
1414                      macro parameter expands into; if the parameter
1415                      expands into something which does not start with
1416                      an operand character, then we don't want to keep
1417                      the space.  We don't have enough information to
1418                      make the right choice, so here we are making the
1419                      choice which is more likely to be correct.  */
1420                   if (to + 1 >= toend)
1421                     {
1422                       /* If we're near the end of the buffer, save the
1423                          character for the next time round.  Otherwise
1424                          we'll lose our state.  */
1425                       UNGET (ch);
1426                       goto tofull;
1427                     }
1428                   *to++ = ' ';
1429                 }
1430
1431               state = 3;
1432             }
1433           PUT (ch);
1434           break;
1435         }
1436     }
1437
1438   /*NOTREACHED*/
1439
1440  fromeof:
1441   /* We have reached the end of the input.  */
1442   return to - tostart;
1443
1444  tofull:
1445   /* The output buffer is full.  Save any input we have not yet
1446      processed.  */
1447   if (fromend > from)
1448     {
1449       saved_input = from;
1450       saved_input_len = fromend - from;
1451     }
1452   else
1453     saved_input = NULL;
1454
1455   return to - tostart;
1456 }