gas/app.c

   1 /* This is the Assembler Pre-Processor
   2    Copyright 1987, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
   3    1999, 2000, 2001, 2002, 2003, 2006, 2007
   4    Free Software Foundation, Inc.
   5
   6    This file is part of GAS, the GNU Assembler.
   7
   8    GAS is free software; you can redistribute it and/or modify
   9    it under the terms of the GNU General Public License as published by
  10    the Free Software Foundation; either version 2, or (at your option)
  11    any later version.
  12
  13    GAS is distributed in the hope that it will be useful,
  14    but WITHOUT ANY WARRANTY; without even the implied warranty of
  15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16    GNU General Public License for more details.
  17
  18    You should have received a copy of the GNU General Public License
  19    along with GAS; see the file COPYING.  If not, write to the Free
  20    Software Foundation, 51 Franklin Street - Fifth Floor, Boston, MA
  21    02110-1301, USA.  */
  22
  23 /* Modified by Allen Wirfs-Brock, Instantiations Inc 2/90.  */
  24 /* App, the assembler pre-processor.  This pre-processor strips out
  25    excess spaces, turns single-quoted characters into a decimal
  26    constant, and turns the # in # <number> <filename> <garbage> into a
  27    .linefile.  This needs better error-handling.  */
  28
  29 #include "as.h"
  30
  31 #if (__STDC__ != 1)
     /* The compiler does not define __STDC__ as 1: make `const' expand to
        nothing, unless something has already defined it as a macro.  */
  32 #ifndef const
  33 #define const  /* empty */
  34 #endif
  35 #endif
  36
  37 #ifdef TC_M68K
  38 /* Whether we are scrubbing in m68k MRI mode.  This is different from
  39    flag_m68k_mri, because the two flags will be affected by the .mri
  40    pseudo-op at different times.  */
  41 static int scrub_m68k_mri;
  42
  43 /* The pseudo-op which switches in and out of MRI mode.  See the
  44    comment in do_scrub_chars.  */
     /* do_scrub_chars matches its input against this string one character
        at a time.  The space matches any whitespace character and the
        final 0 also matches a 1, so the one string covers both forms of
        the pseudo-op.  */
  45 static const char mri_pseudo[] = ".mri 0";
  46 #else
     /* Not an m68k assembler: scrub_m68k_mri is the constant 0, so every
        test of it in this file is false.  */
  47 #define scrub_m68k_mri 0
  48 #endif
  49
  50 #if defined TC_ARM && defined OBJ_ELF
  51 /* The pseudo-op for which we need to special-case `@' characters.
  52    See the comment in do_scrub_chars.  */
  53 static const char   symver_pseudo[] = ".symver";
     /* Progress of do_scrub_chars in matching symver_pseudo: NULL when no
        match is in progress, otherwise a pointer to the next character of
        symver_pseudo that is expected.  It points at the terminating NUL
        once the whole pseudo-op has been seen.  */
  54 static const char * symver_state;
  55 #endif
  56
     /* lex[] gives the LEX_IS_* class of each character, indexed by the
        character's value as an unsigned char.  It is filled in by
        do_scrub_begin; an entry that is still 0 has no special class.  */
  57 static char lex[256];
     /* Characters that do_scrub_begin classes as LEX_IS_SYMBOL_COMPONENT
        before any target-specific settings are applied.  */
  58 static const char symbol_chars[] =
  59 "$._ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
  60
     /* Character classes stored in lex[].  The value 7 is not used.  */
  61 #define LEX_IS_SYMBOL_COMPONENT         1
  62 #define LEX_IS_WHITESPACE               2
  63 #define LEX_IS_LINE_SEPARATOR           3
  64 #define LEX_IS_COMMENT_START            4
  65 #define LEX_IS_LINE_COMMENT_START       5
  66 #define LEX_IS_TWOCHAR_COMMENT_1ST      6
  67 #define LEX_IS_STRINGQUOTE              8
  68 #define LEX_IS_COLON                    9
  69 #define LEX_IS_NEWLINE                  10
  70 #define LEX_IS_ONECHAR_QUOTE            11
  71 #ifdef TC_V850
  72 #define LEX_IS_DOUBLEDASH_1ST           12
  73 #endif
  74 #ifdef TC_M32R
     /* TC_M32R turns on DOUBLEBAR_PARALLEL unconditionally.  */
  75 #define DOUBLEBAR_PARALLEL
  76 #endif
  77 #ifdef DOUBLEBAR_PARALLEL
  78 #define LEX_IS_DOUBLEBAR_1ST            13
  79 #endif
  80 #define LEX_IS_PARALLEL_SEPARATOR       14
     /* Class tests.  Each one indexes lex[] directly with C, so C must be
        a valid index (0..255); callers have to rule out EOF first.  */
  81 #define IS_SYMBOL_COMPONENT(c)          (lex[c] == LEX_IS_SYMBOL_COMPONENT)
  82 #define IS_WHITESPACE(c)                (lex[c] == LEX_IS_WHITESPACE)
  83 #define IS_LINE_SEPARATOR(c)            (lex[c] == LEX_IS_LINE_SEPARATOR)
  84 #define IS_PARALLEL_SEPARATOR(c)        (lex[c] == LEX_IS_PARALLEL_SEPARATOR)
  85 #define IS_COMMENT(c)                   (lex[c] == LEX_IS_COMMENT_START)
  86 #define IS_LINE_COMMENT(c)              (lex[c] == LEX_IS_LINE_COMMENT_START)
  87 #define IS_NEWLINE(c)                   (lex[c] == LEX_IS_NEWLINE)
  88
     /* Forward declaration; the definition is further down in this file.  */
  89 static int process_escape (int);
  90
  91 /* FIXME-soon: The entire lexer/parser thingy should be
  92    built statically at compile time rather than dynamically
  93    each and every time the assembler is run.  xoxorich.  */
  94
     /* Fill in the lex[] character-class table used by do_scrub_chars.

        M68K_MRI is only looked at when TC_M68K is defined (hence
        ATTRIBUTE_UNUSED); a nonzero value selects the m68k MRI
        classifications.

        The assignments below are order-dependent: when a character is
        named by more than one source, the last assignment wins.  Nothing
        here clears lex[] first, so when do_scrub_chars calls this again
        on a .mri pseudo-op, classes set by the earlier call stay in
        effect unless one of the assignments below overwrites them.  */
  95 void
  96 do_scrub_begin (int m68k_mri ATTRIBUTE_UNUSED)
  97 {
  98   const char *p;
  99   int c;
 100
       /* Classes that are the same for every target.  */
 101   lex[' '] = LEX_IS_WHITESPACE;
 102   lex['\t'] = LEX_IS_WHITESPACE;
 103   lex['\r'] = LEX_IS_WHITESPACE;
 104   lex['\n'] = LEX_IS_NEWLINE;
 105   lex[':'] = LEX_IS_COLON;
 106
       /* In m68k MRI mode the block below is skipped; the single quote is
          then made a string quote at the end of this function and the
          double quote is not assigned here at all.  When TC_M68K is not
          defined the test is compiled out and the block always runs.  */
 107 #ifdef TC_M68K
 108   scrub_m68k_mri = m68k_mri;
 109
 110   if (! m68k_mri)
 111 #endif
 112     {
 113       lex['"'] = LEX_IS_STRINGQUOTE;
 114
 115 #if ! defined (TC_HPPA) && ! defined (TC_I370)
 116       /* I370 uses single-quotes to delimit integer, float constants.  */
 117       lex['\''] = LEX_IS_ONECHAR_QUOTE;
 118 #endif
 119
           /* When defined, this overrides the one-character-quote class
              assigned just above.  */
 120 #ifdef SINGLE_QUOTE_STRINGS
 121       lex['\''] = LEX_IS_STRINGQUOTE;
 122 #endif
 123     }
 124
 125   /* Note: if any other character can be LEX_IS_STRINGQUOTE, the loop
 126      in state 5 of do_scrub_chars must be changed.  */
 127
 128   /* Note that these override the previous defaults, e.g. if ';' is a
 129      comment char, then it isn't a line separator.  */
       /* NOTE(review): because later loops overwrite earlier ones, a
          character listed both in tc_comment_chars and in
          line_separator_chars ends up as LEX_IS_LINE_SEPARATOR, which
          looks like the opposite of the example above -- confirm which
          is intended.  */
 130   for (p = symbol_chars; *p; ++p)
 131     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 132
       /* Every byte with the high bit set counts as part of a symbol.  */
 133   for (c = 128; c < 256; ++c)
 134     lex[c] = LEX_IS_SYMBOL_COMPONENT;
 135
 136 #ifdef tc_symbol_chars
 137   /* This macro permits the processor to specify all characters which
 138      may appear in an operand.  This will prevent the scrubber from
 139      discarding meaningful whitespace in certain cases.  The i386
 140      backend uses this to support prefixes, which can confuse the
 141      scrubber as to whether it is parsing operands or opcodes.  */
 142   for (p = tc_symbol_chars; *p; ++p)
 143     lex[(unsigned char) *p] = LEX_IS_SYMBOL_COMPONENT;
 144 #endif
 145
 146   /* The m68k backend wants to be able to change comment_chars.  */
 147 #ifndef tc_comment_chars
 148 #define tc_comment_chars comment_chars
 149 #endif
 150   for (p = tc_comment_chars; *p; p++)
 151     lex[(unsigned char) *p] = LEX_IS_COMMENT_START;
 152
 153   for (p = line_comment_chars; *p; p++)
 154     lex[(unsigned char) *p] = LEX_IS_LINE_COMMENT_START;
 155
 156   for (p = line_separator_chars; *p; p++)
 157     lex[(unsigned char) *p] = LEX_IS_LINE_SEPARATOR;
 158
 159 #ifdef tc_parallel_separator_chars
 160   /* This macro permits the processor to specify all characters which
 161      separate parallel insns on the same line.  */
 162   for (p = tc_parallel_separator_chars; *p; p++)
 163     lex[(unsigned char) *p] = LEX_IS_PARALLEL_SEPARATOR;
 164 #endif
 165
 166   /* Only allow slash-star comments if slash is not in use.
 167      FIXME: This isn't right.  We should always permit them.  */
 168   if (lex['/'] == 0)
 169     lex['/'] = LEX_IS_TWOCHAR_COMMENT_1ST;
 170
       /* MRI mode settings come after the generic loops so that they
          take precedence over them.  */
 171 #ifdef TC_M68K
 172   if (m68k_mri)
 173     {
 174       lex['\''] = LEX_IS_STRINGQUOTE;
 175       lex[';'] = LEX_IS_COMMENT_START;
 176       lex['*'] = LEX_IS_LINE_COMMENT_START;
 177       /* The MRI documentation says '!' is LEX_IS_COMMENT_START, but
 178          then it can't be used in an expression.  */
 179       lex['!'] = LEX_IS_LINE_COMMENT_START;
 180     }
 181 #endif
 182
       /* Target-specific overrides; being last, these win over everything
          assigned above.  */
 183 #ifdef TC_V850
 184   lex['-'] = LEX_IS_DOUBLEDASH_1ST;
 185 #endif
 186 #ifdef DOUBLEBAR_PARALLEL
 187   lex['|'] = LEX_IS_DOUBLEBAR_1ST;
 188 #endif
 189 #ifdef TC_D30V
 190   /* Must do this if we want VLIW instruction with "->" or "<-".  */
 191   lex['-'] = LEX_IS_SYMBOL_COMPONENT;
 192 #endif
 193 }
 194
 195 /* Saved state of the scrubber.  */
     /* Current state of the do_scrub_chars state machine; the states are
        listed in the comment at the top of that function.  */
 196 static int state;
     /* State that do_scrub_chars goes back to when state -1, -2 or 5
        finishes.  */
 197 static int old_state;
     /* Next character to be emitted while in state -1.  */
 198 static char *out_string;
     /* Saved and restored together with out_string by app_push and
        app_pop.  NOTE(review): presumably the storage that out_string
        points into; the code that fills it is not in this part of the
        file.  */
 199 static char out_buf[20];
     /* Count of newlines consumed inside multi-line comments and strings
        continued across lines.  NOTE(review): presumably emitted later by
        code outside this part of the file -- confirm.  */
 200 static int add_newlines;
     /* When saved_input is not NULL, do_scrub_chars starts reading from
        it (saved_input_len bytes) instead of asking for new input.
        NOTE(review): presumably set when a call returns with input still
        unread; that code is not in this part of the file.  */
 201 static char *saved_input;
 202 static int saved_input_len;
     /* Buffer that do_scrub_chars hands to its GET callback to fill.  */
 203 static char input_buffer[32 * 1024];
     /* Progress in matching mri_pseudo: NULL when no match is in
        progress, otherwise the next expected character.  The code that
        examines it in do_scrub_chars is under TC_M68K.  */
 204 static const char *mri_state;
     /* Last character accepted while matching mri_pseudo; do_scrub_chars
        compares it with '1' to decide whether MRI mode is switched on.  */
 205 static char mri_last_ch;
 206
 207 /* Data structure for saving the state of app across #include's.  Note that
 208    app is called asynchronously to the parsing of the .include's, so our
 209    state at the time .include is interpreted is completely unrelated.
 210    That's why we have to save it all.  */
 211
     /* Each member holds the value of the file-scope variable of the same
        name.  app_push allocates and fills one of these; app_pop copies
        it back and frees it.  */
 212 struct app_save
 213 {
 214   int          state;
 215   int          old_state;
 216   char *       out_string;
 217   char         out_buf[sizeof (out_buf)];
 218   int          add_newlines;
       /* Heap copy of the pending input made by app_push, or NULL if
          there was none.  saved_input_len is its length and is only
          meaningful when saved_input is not NULL.  */
 219   char *       saved_input;
 220   int          saved_input_len;
 221 #ifdef TC_M68K
 222   int          scrub_m68k_mri;
 223 #endif
 224   const char * mri_state;
 225   char         mri_last_ch;
 226 #if defined TC_ARM && defined OBJ_ELF
 227   const char * symver_state;
 228 #endif
 229 };
 230
     /* Save the complete scrubber state and reset the scrubber so that a
        new input file can be processed from a clean start.

        Returns a pointer to a freshly allocated struct app_save, typed as
        char * for the caller.  Pass it to app_pop to restore the state
        and release the memory.  Pending input is copied to the heap
        because app_pop reloads it into input_buffer, which is refilled
        by do_scrub_chars in the meantime.  */
 231 char *
 232 app_push (void)
 233 {
 234   register struct app_save *saved;
 235
 236   saved = (struct app_save *) xmalloc (sizeof (*saved));
 237   saved->state = state;
 238   saved->old_state = old_state;
 239   saved->out_string = out_string;
 240   memcpy (saved->out_buf, out_buf, sizeof (out_buf));
 241   saved->add_newlines = add_newlines;
       /* Start from "no pending input" so that both members are always
          initialised, then take a private copy if there is any.  */
 242   saved->saved_input = NULL;
 243   saved->saved_input_len = 0;
 244   if (saved_input != NULL)
 245     {
 246       saved->saved_input = xmalloc (saved_input_len);
 247       memcpy (saved->saved_input, saved_input, saved_input_len);
 248       saved->saved_input_len = saved_input_len;
 249     }
 250 #ifdef TC_M68K
 251   saved->scrub_m68k_mri = scrub_m68k_mri;
 252 #endif
 253   saved->mri_state = mri_state;
 254   saved->mri_last_ch = mri_last_ch;
 255 #if defined TC_ARM && defined OBJ_ELF
 256   saved->symver_state = symver_state;
 257 #endif
 258
 259   /* do_scrub_begin() is not useful, just wastes time.  */
 260   state = 0;
 261   saved_input = NULL;
       /* Newlines still owed to the outer file are kept in SAVED and come
          back in app_pop; clear the live count so they are not also
          emitted into the output of the nested file.  */
 262   add_newlines = 0;
 263
 264   return (char *) saved;
 265 }
 266
     /* Restore the scrubber state saved by app_push.  ARG is the pointer
        that app_push returned.  It is freed here, together with the saved
        copy of the pending input, so it must not be used again after this
        call.  */
 267 void
 268 app_pop (char *arg)
 269 {
 270   register struct app_save *saved = (struct app_save *) arg;
 271
 272   /* There is no do_scrub_end ().  */
 273   state = saved->state;
 274   old_state = saved->old_state;
 275   out_string = saved->out_string;
 276   memcpy (out_buf, saved->out_buf, sizeof (out_buf));
 277   add_newlines = saved->add_newlines;
 278   if (saved->saved_input == NULL)
 279     saved_input = NULL;
 280   else
 281     {
           /* The pending input is reloaded into input_buffer, so the
              saved copy has to fit in it.  saved_input_len is left
              untouched in the other branch.  */
 282       assert (saved->saved_input_len <= (int) (sizeof input_buffer));
 283       memcpy (input_buffer, saved->saved_input, saved->saved_input_len);
 284       saved_input = input_buffer;
 285       saved_input_len = saved->saved_input_len;
 286       free (saved->saved_input);
 287     }
 288 #ifdef TC_M68K
 289   scrub_m68k_mri = saved->scrub_m68k_mri;
 290 #endif
 291   mri_state = saved->mri_state;
 292   mri_last_ch = saved->mri_last_ch;
 293 #if defined TC_ARM && defined OBJ_ELF
 294   symver_state = saved->symver_state;
 295 #endif
 296
 297   free (arg);
 298 }
 299
 300 /* @@ This assumes that \n &c are the same on host and target.  This is not
 301    necessarily true.  */
 302
     /* Translate escape letter CH into the character it stands for and
        return it.  NOTE(review): CH is presumably the character that
        followed a backslash; the callers are outside this part of the
        file.

        b, f, n, r and t map to the corresponding C control characters.
        Every other value, including the two quote characters, is
        returned unchanged; in particular v and numeric escapes are not
        interpreted here.  */
 303 static int
 304 process_escape (int ch)
 305 {
 306   switch (ch)
 307     {
 308     case 'b':
 309       return '\b';
 310     case 'f':
 311       return '\f';
 312     case 'n':
 313       return '\n';
 314     case 'r':
 315       return '\r';
 316     case 't':
 317       return '\t';
         /* The two quote cases return CH itself, exactly as the default
            case does.  */
 318     case '\'':
 319       return '\'';
 320     case '"':
 321       return '\"';
 322     default:
 323       return ch;
 324     }
 325 }
 326
 327 /* This function is called to process input characters.  The GET
 328    parameter is used to retrieve more input characters.  GET should
 329    set its parameter to point to a buffer, and return the length of
 330    the buffer; it should return 0 at end of file.  The scrubbed output
 331    characters are put into the buffer starting at TOSTART; the TOSTART
 332    buffer is TOLEN bytes in length.  The function returns the number
 333    of scrubbed characters put into TOSTART.  This will be TOLEN unless
 334    end of file was seen.  This function is arranged as a state
 335    machine, and saves its state so that it may return at any point.
 336    This is the way the old code used to work.  */
 337
 338 int
 339 do_scrub_chars (int (*get) (char *, int), char *tostart, int tolen)
 340 {
 341   char *to = tostart;
 342   char *toend = tostart + tolen;
 343   char *from;
 344   char *fromend;
 345   int fromlen;
 346   register int ch, ch2 = 0;
 347   /* Character that started the string we're working on.  */
 348   static char quotechar;
 349
 350   /*State 0: beginning of normal line
 351           1: After first whitespace on line (flush more white)
 352           2: After first non-white (opcode) on line (keep 1white)
 353           3: after second white on line (into operands) (flush white)
 354           4: after putting out a .linefile, put out digits
 355           5: parsing a string, then go to old-state
 356           6: putting out \ escape in a "d string.
 357           7: no longer used
 358           8: no longer used
 359           9: After seeing symbol char in state 3 (keep 1white after symchar)
 360          10: After seeing whitespace in state 9 (keep white before symchar)
 361          11: After seeing a symbol character in state 0 (eg a label definition)
 362          -1: output string in out_string and go to the state in old_state
 363          -2: flush text until a '*' '/' is seen, then go to state old_state
 364 #ifdef TC_V850
 365          12: After seeing a dash, looking for a second dash as a start
 366              of comment.
 367 #endif
 368 #ifdef DOUBLEBAR_PARALLEL
 369          13: After seeing a vertical bar, looking for a second
 370              vertical bar as a parallel expression separator.
 371 #endif
 372 #ifdef TC_IA64
 373          14: After seeing a `(' at state 0, looking for a `)' as
 374              predicate.
 375          15: After seeing a `(' at state 1, looking for a `)' as
 376              predicate.
 377 #endif
 378 #ifdef TC_Z80
 379          16: After seeing an 'a' or an 'A' at the start of a symbol
 380          17: After seeing an 'f' or an 'F' in state 16
 381 #endif
 382           */
 383
 384   /* I added states 9 and 10 because the MIPS ECOFF assembler uses
 385      constructs like ``.loc 1 20''.  This was turning into ``.loc
 386      120''.  States 9 and 10 ensure that a space is never dropped in
 387      between characters which could appear in an identifier.  Ian
 388      Taylor, ian@cygnus.com.
 389
 390      I added state 11 so that something like "Lfoo add %r25,%r26,%r27" works
 391      correctly on the PA (and any other target where colons are optional).
 392      Jeff Law, law@cs.utah.edu.
 393
 394      I added state 13 so that something like "cmp r1, r2 || trap #1" does not
 395      get squashed into "cmp r1,r2||trap#1", with the all important space
 396      between the 'trap' and the '#1' being eliminated.  nickc@cygnus.com  */
 397
 398   /* This macro gets the next input character.  */
 399
 400 #define GET()                                                   \
 401   (from < fromend                                               \
 402    ? * (unsigned char *) (from++)                               \
 403    : (saved_input = NULL,                                       \
 404       fromlen = (*get) (input_buffer, sizeof input_buffer),     \
 405       from = input_buffer,                                      \
 406       fromend = from + fromlen,                                 \
 407       (fromlen == 0                                             \
 408        ? EOF                                                    \
 409        : * (unsigned char *) (from++))))
 410
 411   /* This macro pushes a character back on the input stream.  */
 412
 413 #define UNGET(uch) (*--from = (uch))
 414
 415   /* This macro puts a character into the output buffer.  If this
 416      character fills the output buffer, this macro jumps to the label
 417      TOFULL.  We use this rather ugly approach because we need to
 418      handle two different termination conditions: EOF on the input
 419      stream, and a full output buffer.  It would be simpler if we
 420      always read in the entire input stream before processing it, but
 421      I don't want to make such a significant change to the assembler's
 422      memory usage.  */
 423
 424 #define PUT(pch)                                \
 425   do                                            \
 426     {                                           \
 427       *to++ = (pch);                            \
 428       if (to >= toend)                          \
 429         goto tofull;                            \
 430     }                                           \
 431   while (0)
 432
 433   if (saved_input != NULL)
 434     {
 435       from = saved_input;
 436       fromend = from + saved_input_len;
 437     }
 438   else
 439     {
 440       fromlen = (*get) (input_buffer, sizeof input_buffer);
 441       if (fromlen == 0)
 442         return 0;
 443       from = input_buffer;
 444       fromend = from + fromlen;
 445     }
 446
 447   while (1)
 448     {
 449       /* The cases in this switch end with continue, in order to
 450          branch back to the top of this while loop and generate the
 451          next output character in the appropriate state.  */
 452       switch (state)
 453         {
 454         case -1:
 455           ch = *out_string++;
 456           if (*out_string == '\0')
 457             {
 458               state = old_state;
 459               old_state = 3;
 460             }
 461           PUT (ch);
 462           continue;
 463
 464         case -2:
 465           for (;;)
 466             {
 467               do
 468                 {
 469                   ch = GET ();
 470
 471                   if (ch == EOF)
 472                     {
 473                       as_warn (_("end of file in comment"));
 474                       goto fromeof;
 475                     }
 476
 477                   if (ch == '\n')
 478                     PUT ('\n');
 479                 }
 480               while (ch != '*');
 481
 482               while ((ch = GET ()) == '*')
 483                 ;
 484
 485               if (ch == EOF)
 486                 {
 487                   as_warn (_("end of file in comment"));
 488                   goto fromeof;
 489                 }
 490
 491               if (ch == '/')
 492                 break;
 493
 494               UNGET (ch);
 495             }
 496
 497           state = old_state;
 498           UNGET (' ');
 499           continue;
 500
 501         case 4:
 502           ch = GET ();
 503           if (ch == EOF)
 504             goto fromeof;
 505           else if (ch >= '0' && ch <= '9')
 506             PUT (ch);
 507           else
 508             {
 509               while (ch != EOF && IS_WHITESPACE (ch))
 510                 ch = GET ();
 511               if (ch == '"')
 512                 {
 513                   quotechar = ch;
 514                   state = 5;
 515                   old_state = 3;
 516                   PUT (ch);
 517                 }
 518               else
 519                 {
 520                   while (ch != EOF && ch != '\n')
 521                     ch = GET ();
 522                   state = 0;
 523                   PUT (ch);
 524                 }
 525             }
 526           continue;
 527
 528         case 5:
 529           /* We are going to copy everything up to a quote character,
 530              with special handling for a backslash.  We try to
 531              optimize the copying in the simple case without using the
 532              GET and PUT macros.  */
 533           {
 534             char *s;
 535             int len;
 536
 537             for (s = from; s < fromend; s++)
 538               {
 539                 ch = *s;
 540                 if (ch == '\\'
 541                     || ch == quotechar
 542                     || ch == '\n')
 543                   break;
 544               }
 545             len = s - from;
 546             if (len > toend - to)
 547               len = toend - to;
 548             if (len > 0)
 549               {
 550                 memcpy (to, from, len);
 551                 to += len;
 552                 from += len;
 553               }
 554           }
 555
 556           ch = GET ();
 557           if (ch == EOF)
 558             {
 559               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 560               state = old_state;
 561               UNGET ('\n');
 562               PUT (quotechar);
 563             }
 564           else if (ch == quotechar)
 565             {
 566               state = old_state;
 567               PUT (ch);
 568             }
 569 #ifndef NO_STRING_ESCAPES
 570           else if (ch == '\\')
 571             {
 572               state = 6;
 573               PUT (ch);
 574             }
 575 #endif
 576           else if (scrub_m68k_mri && ch == '\n')
 577             {
 578               /* Just quietly terminate the string.  This permits lines like
 579                    bne  label   loop if we haven't reach end yet.  */
 580               state = old_state;
 581               UNGET (ch);
 582               PUT ('\'');
 583             }
 584           else
 585             {
 586               PUT (ch);
 587             }
 588           continue;
 589
 590         case 6:
 591           state = 5;
 592           ch = GET ();
 593           switch (ch)
 594             {
 595               /* Handle strings broken across lines, by turning '\n' into
 596                  '\\' and 'n'.  */
 597             case '\n':
 598               UNGET ('n');
 599               add_newlines++;
 600               PUT ('\\');
 601               continue;
 602
 603             case EOF:
 604               as_warn (_("end of file in string; '%c' inserted"), quotechar);
 605               PUT (quotechar);
 606               continue;
 607
 608             case '"':
 609             case '\\':
 610             case 'b':
 611             case 'f':
 612             case 'n':
 613             case 'r':
 614             case 't':
 615             case 'v':
 616             case 'x':
 617             case 'X':
 618             case '0':
 619             case '1':
 620             case '2':
 621             case '3':
 622             case '4':
 623             case '5':
 624             case '6':
 625             case '7':
 626               break;
 627
 628             default:
 629 #ifdef ONLY_STANDARD_ESCAPES
 630               as_warn (_("unknown escape '\\%c' in string; ignored"), ch);
 631 #endif
 632               break;
 633             }
 634           PUT (ch);
 635           continue;
 636
 637 #ifdef DOUBLEBAR_PARALLEL
 638         case 13:
 639           ch = GET ();
 640           if (ch != '|')
 641             abort ();
 642
 643           /* Reset back to state 1 and pretend that we are parsing a
 644              line from just after the first white space.  */
 645           state = 1;
 646           PUT ('|');
 647           continue;
 648 #endif
 649 #ifdef TC_Z80
 650         case 16:
 651           /* We have seen an 'a' at the start of a symbol, look for an 'f'.  */
 652           ch = GET ();
 653           if (ch == 'f' || ch == 'F')
 654             {
 655               state = 17;
 656               PUT (ch);
 657             }
 658           else
 659             {
 660               state = 9;
 661               break;
 662             }
 663         case 17:
 664           /* We have seen "af" at the start of a symbol,
 665              a ' here is a part of that symbol.  */
 666           ch = GET ();
 667           state = 9;
 668           if (ch == '\'')
 669             /* Change to avoid warning about unclosed string.  */
 670             PUT ('`');
 671           else
 672             UNGET (ch);
 673           break;
 674 #endif
 675         }
 676
 677       /* OK, we are somewhere in states 0 through 4 or 9 through 11.  */
 678
 679       /* flushchar: */
 680       ch = GET ();
 681
 682 #ifdef TC_IA64
 683       if (ch == '(' && (state == 0 || state == 1))
 684         {
 685           state += 14;
 686           PUT (ch);
 687           continue;
 688         }
 689       else if (state == 14 || state == 15)
 690         {
 691           if (ch == ')')
 692             {
 693               state -= 14;
 694               PUT (ch);
 695               ch = GET ();
 696             }
 697           else
 698             {
 699               PUT (ch);
 700               continue;
 701             }
 702         }
 703 #endif
 704
 705     recycle:
 706
 707 #if defined TC_ARM && defined OBJ_ELF
 708       /* We need to watch out for .symver directives.  See the comment later
 709          in this function.  */
 710       if (symver_state == NULL)
 711         {
 712           if ((state == 0 || state == 1) && ch == symver_pseudo[0])
 713             symver_state = symver_pseudo + 1;
 714         }
 715       else
 716         {
 717           /* We advance to the next state if we find the right
 718              character.  */
 719           if (ch != '\0' && (*symver_state == ch))
 720             ++symver_state;
 721           else if (*symver_state != '\0')
 722             /* We did not get the expected character, or we didn't
 723                get a valid terminating character after seeing the
 724                entire pseudo-op, so we must go back to the beginning.  */
 725             symver_state = NULL;
 726           else
 727             {
 728               /* We've read the entire pseudo-op.  If this is the end
 729                  of the line, go back to the beginning.  */
 730               if (IS_NEWLINE (ch))
 731                 symver_state = NULL;
 732             }
 733         }
 734 #endif /* TC_ARM && OBJ_ELF */
 735
 736 #ifdef TC_M68K
 737       /* We want to have pseudo-ops which control whether we are in
 738          MRI mode or not.  Unfortunately, since m68k MRI mode affects
 739          the scrubber, that means that we need a special purpose
 740          recognizer here.  */
 741       if (mri_state == NULL)
 742         {
 743           if ((state == 0 || state == 1)
 744               && ch == mri_pseudo[0])
 745             mri_state = mri_pseudo + 1;
 746         }
 747       else
 748         {
 749           /* We advance to the next state if we find the right
 750              character, or if we need a space character and we get any
 751              whitespace character, or if we need a '0' and we get a
 752              '1' (this is so that we only need one state to handle
 753              ``.mri 0'' and ``.mri 1'').  */
 754           if (ch != '\0'
 755               && (*mri_state == ch
 756                   || (*mri_state == ' '
 757                       && lex[ch] == LEX_IS_WHITESPACE)
 758                   || (*mri_state == '0'
 759                       && ch == '1')))
 760             {
 761               mri_last_ch = ch;
 762               ++mri_state;
 763             }
 764           else if (*mri_state != '\0'
 765                    || (lex[ch] != LEX_IS_WHITESPACE
 766                        && lex[ch] != LEX_IS_NEWLINE))
 767             {
 768               /* We did not get the expected character, or we didn't
 769                  get a valid terminating character after seeing the
 770                  entire pseudo-op, so we must go back to the
 771                  beginning.  */
 772               mri_state = NULL;
 773             }
 774           else
 775             {
 776               /* We've read the entire pseudo-op.  mri_last_ch is
 777                  either '0' or '1', indicating whether to leave or
 778                  enter MRI mode.  */
 779               do_scrub_begin (mri_last_ch == '1');
 780               mri_state = NULL;
 781
 782               /* We continue handling the character as usual.  The
 783                  main gas reader must also handle the .mri pseudo-op
 784                  to control expression parsing and the like.  */
 785             }
 786         }
 787 #endif
 788
 789       if (ch == EOF)
 790         {
 791           if (state != 0)
 792             {
 793               as_warn (_("end of file not at end of a line; newline inserted"));
 794               state = 0;
 795               PUT ('\n');
 796             }
 797           goto fromeof;
 798         }
 799
 800       switch (lex[ch])
 801         {
 802         case LEX_IS_WHITESPACE:
 803           do
 804             {
 805               ch = GET ();
 806             }
 807           while (ch != EOF && IS_WHITESPACE (ch));
 808           if (ch == EOF)
 809             goto fromeof;
 810
 811           if (state == 0)
 812             {
 813               /* Preserve a single whitespace character at the
 814                  beginning of a line.  */
 815               state = 1;
 816               UNGET (ch);
 817               PUT (' ');
 818               break;
 819             }
 820
 821 #ifdef KEEP_WHITE_AROUND_COLON
 822           if (lex[ch] == LEX_IS_COLON)
 823             {
 824               /* Only keep this white if there's no white *after* the
 825                  colon.  */
 826               ch2 = GET ();
 827               UNGET (ch2);
 828               if (!IS_WHITESPACE (ch2))
 829                 {
 830                   state = 9;
 831                   UNGET (ch);
 832                   PUT (' ');
 833                   break;
 834                 }
 835             }
 836 #endif
 837           if (IS_COMMENT (ch)
 838               || ch == '/'
 839               || IS_LINE_SEPARATOR (ch)
 840               || IS_PARALLEL_SEPARATOR (ch))
 841             {
 842               if (scrub_m68k_mri)
 843                 {
 844                   /* In MRI mode, we keep these spaces.  */
 845                   UNGET (ch);
 846                   PUT (' ');
 847                   break;
 848                 }
 849               goto recycle;
 850             }
 851
 852           /* If we're in state 2 or 11, we've seen a non-white
 853              character followed by whitespace.  If the next character
 854              is ':', this is whitespace after a label name which we
 855              normally must ignore.  In MRI mode, though, spaces are
 856              not permitted between the label and the colon.  */
 857           if ((state == 2 || state == 11)
 858               && lex[ch] == LEX_IS_COLON
 859               && ! scrub_m68k_mri)
 860             {
 861               state = 1;
 862               PUT (ch);
 863               break;
 864             }
 865
 866           switch (state)
 867             {
 868             case 1:
 869               /* We can arrive here if we leave a leading whitespace
 870                  character at the beginning of a line.  */
 871               goto recycle;
 872             case 2:
 873               state = 3;
 874               if (to + 1 < toend)
 875                 {
 876                   /* Optimize common case by skipping UNGET/GET.  */
 877                   PUT (' ');    /* Sp after opco */
 878                   goto recycle;
 879                 }
 880               UNGET (ch);
 881               PUT (' ');
 882               break;
 883             case 3:
 884               if (scrub_m68k_mri)
 885                 {
 886                   /* In MRI mode, we keep these spaces.  */
 887                   UNGET (ch);
 888                   PUT (' ');
 889                   break;
 890                 }
 891               goto recycle;     /* Sp in operands */
 892             case 9:
 893             case 10:
 894               if (scrub_m68k_mri)
 895                 {
 896                   /* In MRI mode, we keep these spaces.  */
 897                   state = 3;
 898                   UNGET (ch);
 899                   PUT (' ');
 900                   break;
 901                 }
 902               state = 10;       /* Sp after symbol char */
 903               goto recycle;
 904             case 11:
 905               if (LABELS_WITHOUT_COLONS || flag_m68k_mri)
 906                 state = 1;
 907               else
 908                 {
 909                   /* We know that ch is not ':', since we tested that
 910                      case above.  Therefore this is not a label, so it
 911                      must be the opcode, and we've just seen the
 912                      whitespace after it.  */
 913                   state = 3;
 914                 }
 915               UNGET (ch);
 916               PUT (' ');        /* Sp after label definition.  */
 917               break;
 918             default:
 919               BAD_CASE (state);
 920             }
 921           break;
 922
 923         case LEX_IS_TWOCHAR_COMMENT_1ST:
 924           ch2 = GET ();
 925           if (ch2 == '*')
 926             {
 927               for (;;)
 928                 {
 929                   do
 930                     {
 931                       ch2 = GET ();
 932                       if (ch2 != EOF && IS_NEWLINE (ch2))
 933                         add_newlines++;
 934                     }
 935                   while (ch2 != EOF && ch2 != '*');
 936
 937                   while (ch2 == '*')
 938                     ch2 = GET ();
 939
 940                   if (ch2 == EOF || ch2 == '/')
 941                     break;
 942
 943                   /* This UNGET will ensure that we count newlines
 944                      correctly.  */
 945                   UNGET (ch2);
 946                 }
 947
 948               if (ch2 == EOF)
 949                 as_warn (_("end of file in multiline comment"));
 950
 951               ch = ' ';
 952               goto recycle;
 953             }
 954 #ifdef DOUBLESLASH_LINE_COMMENTS
 955           else if (ch2 == '/')
 956             {
 957               do
 958                 {
 959                   ch = GET ();
 960                 }
 961               while (ch != EOF && !IS_NEWLINE (ch));
 962               if (ch == EOF)
 963                 as_warn ("end of file in comment; newline inserted");
 964               state = 0;
 965               PUT ('\n');
 966               break;
 967             }
 968 #endif
 969           else
 970             {
 971               if (ch2 != EOF)
 972                 UNGET (ch2);
 973               if (state == 9 || state == 10)
 974                 state = 3;
 975               PUT (ch);
 976             }
 977           break;
 978
 979         case LEX_IS_STRINGQUOTE:
 980           quotechar = ch;
 981           if (state == 10)
 982             {
 983               /* Preserve the whitespace in foo "bar".  */
 984               UNGET (ch);
 985               state = 3;
 986               PUT (' ');
 987
 988               /* PUT didn't jump out.  We could just break, but we
 989                  know what will happen, so optimize a bit.  */
 990               ch = GET ();
 991               old_state = 3;
 992             }
 993           else if (state == 9)
 994             old_state = 3;
 995           else
 996             old_state = state;
 997           state = 5;
 998           PUT (ch);
 999           break;
1000
1001 #ifndef IEEE_STYLE
1002         case LEX_IS_ONECHAR_QUOTE:
1003           if (state == 10)
1004             {
1005               /* Preserve the whitespace in foo 'b'.  */
1006               UNGET (ch);
1007               state = 3;
1008               PUT (' ');
1009               break;
1010             }
1011           ch = GET ();
1012           if (ch == EOF)
1013             {
1014               as_warn (_("end of file after a one-character quote; \\0 inserted"));
1015               ch = 0;
1016             }
1017           if (ch == '\\')
1018             {
1019               ch = GET ();
1020               if (ch == EOF)
1021                 {
1022                   as_warn (_("end of file in escape character"));
1023                   ch = '\\';
1024                 }
1025               else
1026                 ch = process_escape (ch);
1027             }
1028           sprintf (out_buf, "%d", (int) (unsigned char) ch);
1029
1030           /* None of these 'x constants for us.  We want 'x'.  */
1031           if ((ch = GET ()) != '\'')
1032             {
1033 #ifdef REQUIRE_CHAR_CLOSE_QUOTE
1034               as_warn (_("missing close quote; (assumed)"));
1035 #else
1036               if (ch != EOF)
1037                 UNGET (ch);
1038 #endif
1039             }
1040           if (strlen (out_buf) == 1)
1041             {
1042               PUT (out_buf[0]);
1043               break;
1044             }
1045           if (state == 9)
1046             old_state = 3;
1047           else
1048             old_state = state;
1049           state = -1;
1050           out_string = out_buf;
1051           PUT (*out_string++);
1052           break;
1053 #endif
1054
1055         case LEX_IS_COLON:
1056 #ifdef KEEP_WHITE_AROUND_COLON
1057           state = 9;
1058 #else
1059           if (state == 9 || state == 10)
1060             state = 3;
1061           else if (state != 3)
1062             state = 1;
1063 #endif
1064           PUT (ch);
1065           break;
1066
1067         case LEX_IS_NEWLINE:
1068           /* Roll out a bunch of newlines from inside comments, etc.  */
1069           if (add_newlines)
1070             {
1071               --add_newlines;
1072               UNGET (ch);
1073             }
1074           /* Fall through.  */
1075
1076         case LEX_IS_LINE_SEPARATOR:
1077           state = 0;
1078           PUT (ch);
1079           break;
1080
1081         case LEX_IS_PARALLEL_SEPARATOR:
1082           state = 1;
1083           PUT (ch);
1084           break;
1085
1086 #ifdef TC_V850
1087         case LEX_IS_DOUBLEDASH_1ST:
1088           ch2 = GET ();
1089           if (ch2 != '-')
1090             {
1091               UNGET (ch2);
1092               goto de_fault;
1093             }
1094           /* Read and skip to end of line.  */
1095           do
1096             {
1097               ch = GET ();
1098             }
1099           while (ch != EOF && ch != '\n');
1100
1101           if (ch == EOF)
1102             as_warn (_("end of file in comment; newline inserted"));
1103
1104           state = 0;
1105           PUT ('\n');
1106           break;
1107 #endif
1108 #ifdef DOUBLEBAR_PARALLEL
1109         case LEX_IS_DOUBLEBAR_1ST:
1110           ch2 = GET ();
1111           UNGET (ch2);
1112           if (ch2 != '|')
1113             goto de_fault;
1114
1115           /* Handle '||' in two states as invoking PUT twice might
1116              result in the first one jumping out of this loop.  We'd
1117              then lose track of the state and one '|' char.  */
1118           state = 13;
1119           PUT ('|');
1120           break;
1121 #endif
1122         case LEX_IS_LINE_COMMENT_START:
1123           /* FIXME-someday: The two character comment stuff was badly
1124              thought out.  On i386, we want '/' as line comment start
1125              AND we want C style comments.  hence this hack.  The
1126              whole lexical process should be reworked.  xoxorich.  */
1127           if (ch == '/')
1128             {
1129               ch2 = GET ();
1130               if (ch2 == '*')
1131                 {
1132                   old_state = 3;
1133                   state = -2;
1134                   break;
1135                 }
1136               else
1137                 {
1138                   UNGET (ch2);
1139                 }
1140             }
1141
1142           if (state == 0 || state == 1) /* Only comment at start of line.  */
1143             {
1144               int startch;
1145
1146               startch = ch;
1147
1148               do
1149                 {
1150                   ch = GET ();
1151                 }
1152               while (ch != EOF && IS_WHITESPACE (ch));
1153
1154               if (ch == EOF)
1155                 {
1156                   as_warn (_("end of file in comment; newline inserted"));
1157                   PUT ('\n');
1158                   break;
1159                 }
1160
1161               if (ch < '0' || ch > '9' || state != 0 || startch != '#')
1162                 {
1163                   /* Not a cpp line.  */
1164                   while (ch != EOF && !IS_NEWLINE (ch))
1165                     ch = GET ();
1166                   if (ch == EOF)
1167                     as_warn (_("end of file in comment; newline inserted"));
1168                   state = 0;
1169                   PUT ('\n');
1170                   break;
1171                 }
1172               /* Looks like `# 123 "filename"' from cpp.  */
1173               UNGET (ch);
1174               old_state = 4;
1175               state = -1;
1176               if (scrub_m68k_mri)
1177                 out_string = "\tlinefile ";
1178               else
1179                 out_string = "\t.linefile ";
1180               PUT (*out_string++);
1181               break;
1182             }
1183
1184 #ifdef TC_D10V
1185           /* All insns end in a char for which LEX_IS_SYMBOL_COMPONENT is true.
1186              Trap is the only short insn that has a first operand that is
1187              neither register nor label.
1188              We must prevent exef0f ||trap #1 from degenerating to exef0f ||trap#1 .
1189              We can't make '#' LEX_IS_SYMBOL_COMPONENT because it is
1190              already LEX_IS_LINE_COMMENT_START.  However, it is the
1191              only character in line_comment_chars for d10v, hence we
1192              can recognize it as such.  */
1193           /* An alternative approach would be to reset the state to 1 when
1194              we see '||', '<-' or '->', but that seems to be overkill.  */
1195           if (state == 10)
1196             PUT (' ');
1197 #endif
1198           /* We have a line comment character which is not at the
1199              start of a line.  If this is also a normal comment
1200              character, fall through.  Otherwise treat it as a default
1201              character.  */
1202           if (strchr (tc_comment_chars, ch) == NULL
1203               && (! scrub_m68k_mri
1204                   || (ch != '!' && ch != '*')))
1205             goto de_fault;
1206           if (scrub_m68k_mri
1207               && (ch == '!' || ch == '*' || ch == '#')
1208               && state != 1
1209               && state != 10)
1210             goto de_fault;
1211           /* Fall through.  */
1212         case LEX_IS_COMMENT_START:
1213 #if defined TC_ARM && defined OBJ_ELF
1214           /* On the ARM, `@' is the comment character.
1215              Unfortunately this is also a special character in ELF .symver
1216              directives (and .type, though we deal with those another way).
1217              So we check if this line is such a directive, and treat
1218              the character as default if so.  This is a hack.  */
1219           if ((symver_state != NULL) && (*symver_state == 0))
1220             goto de_fault;
1221 #endif
1222 #ifdef WARN_COMMENTS
1223           if (!found_comment)
1224             as_where (&found_comment_file, &found_comment);
1225 #endif
1226           do
1227             {
1228               ch = GET ();
1229             }
1230           while (ch != EOF && !IS_NEWLINE (ch));
1231           if (ch == EOF)
1232             as_warn (_("end of file in comment; newline inserted"));
1233           state = 0;
1234           PUT ('\n');
1235           break;
1236
1237         case LEX_IS_SYMBOL_COMPONENT:
1238           if (state == 10)
1239             {
1240               /* This is a symbol character following another symbol
1241                  character, with whitespace in between.  We skipped
1242                  the whitespace earlier, so output it now.  */
1243               UNGET (ch);
1244               state = 3;
1245               PUT (' ');
1246               break;
1247             }
1248
1249 #ifdef TC_Z80
1250           /* "af'" is a symbol containing '\''.  */
1251           if (state == 3 && (ch == 'a' || ch == 'A'))
1252             {
1253               state = 16;
1254               PUT (ch);
1255               ch = GET ();
1256               if (ch == 'f' || ch == 'F')
1257                 {
1258                   state = 17;
1259                   PUT (ch);
1260                   break;
1261                 }
1262               else
1263                 {
1264                   state = 9;
1265                   if (!IS_SYMBOL_COMPONENT (ch))
1266                     {
1267                       UNGET (ch);
1268                       break;
1269                     }
1270                 }
1271             }
1272 #endif
1273           if (state == 3)
1274             state = 9;
1275
1276           /* This is a common case.  Quickly copy CH and all the
1277              following symbol component or normal characters.  */
1278           if (to + 1 < toend
1279               && mri_state == NULL
1280 #if defined TC_ARM && defined OBJ_ELF
1281               && symver_state == NULL
1282 #endif
1283               )
1284             {
1285               char *s;
1286               int len;
1287
1288               for (s = from; s < fromend; s++)
1289                 {
1290                   int type;
1291
1292                   ch2 = *(unsigned char *) s;
1293                   type = lex[ch2];
1294                   if (type != 0
1295                       && type != LEX_IS_SYMBOL_COMPONENT)
1296                     break;
1297                 }
1298
1299               if (s > from)
1300                 /* Handle the last character normally, for
1301                    simplicity.  */
1302                 --s;
1303
1304               len = s - from;
1305
1306               if (len > (toend - to) - 1)
1307                 len = (toend - to) - 1;
1308
1309               if (len > 0)
1310                 {
1311                   PUT (ch);
1312                   memcpy (to, from, len);
1313                   to += len;
1314                   from += len;
1315                   if (to >= toend)
1316                     goto tofull;
1317                   ch = GET ();
1318                 }
1319             }
1320
1321           /* Fall through.  */
1322         default:
1323         de_fault:
1324           /* Some relatively `normal' character.  */
1325           if (state == 0)
1326             {
1327               state = 11;       /* Now seeing label definition.  */
1328             }
1329           else if (state == 1)
1330             {
1331               state = 2;        /* Ditto.  */
1332             }
1333           else if (state == 9)
1334             {
1335               if (!IS_SYMBOL_COMPONENT (ch))
1336                 state = 3;
1337             }
1338           else if (state == 10)
1339             {
1340               if (ch == '\\')
1341                 {
1342                   /* Special handling for backslash: a backslash may
1343                      be the beginning of a formal parameter (of a
1344                      macro) following another symbol character, with
1345                      whitespace in between.  If that is the case, we
1346                      output a space before the parameter.  Strictly
1347                      speaking, correct handling depends upon what the
1348                      macro parameter expands into; if the parameter
1349                      expands into something which does not start with
1350                      an operand character, then we don't want to keep
1351                      the space.  We don't have enough information to
1352                      make the right choice, so here we are making the
1353                      choice which is more likely to be correct.  */
1354                   PUT (' ');
1355                 }
1356
1357               state = 3;
1358             }
1359           PUT (ch);
1360           break;
1361         }
1362     }
1363
1364   /*NOTREACHED*/
1365
1366  fromeof:
1367   /* We have reached the end of the input.  */
1368   return to - tostart;
1369
1370  tofull:
1371   /* The output buffer is full.  Save any input we have not yet
1372      processed.  */
1373   if (fromend > from)
1374     {
1375       saved_input = from;
1376       saved_input_len = fromend - from;
1377     }
1378   else
1379     saved_input = NULL;
1380
1381   return to - tostart;
1382 }
1383