toolchain/gcc/libjava/java/io/StreamTokenizer.java

   1 /* StreamTokenizer.java -- parses streams of characters into tokens
   2    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003  Free Software Foundation
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19 02111-1307 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38 package java.io;
  39
  40 /**
  41  * This class parses streams of characters into tokens.  There are a
  42  * million-zillion flags that can be set to control the parsing, as
  43  * described under the various method headings.
  44  *
  45  * @author Warren Levy <warrenl@cygnus.com>
  46  * @date October 25, 1998.
  47  */
  48 /* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
  49  * "The Java Language Specification", ISBN 0-201-63451-1
  50  * plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
  51  * Status:  Believed complete and correct.
  52  */
  53
  54 public class StreamTokenizer
  55 {
  56   /** A constant indicating that the end of the stream has been read. */
  57   public static final int TT_EOF = -1;
  58
  59   /** A constant indicating that the end of the line has been read. */
  60   public static final int TT_EOL = '\n';
  61
  62   /** A constant indicating that a number token has been read. */
  63   public static final int TT_NUMBER = -2;
  64
  65   /** A constant indicating that a word token has been read. */
  66   public static final int TT_WORD = -3;
  67
  68   /** A constant indicating that no tokens have been read yet. */
  69   private static final int TT_NONE = -4;
  70
  71   /**
  72    * Contains the type of the token read resulting from a call to nextToken
  73    * The rules are as follows:
  74    * <ul>
  75    * <li>For a token consisting of a single ordinary character, this is the
  76    *     value of that character.
  77    * <li>For a quoted string, this is the value of the quote character
  78    * <li>For a word, this is TT_WORD
  79    * <li>For a number, this is TT_NUMBER
  80    * <li>For the end of the line, this is TT_EOL
  81    * <li>For the end of the stream, this is TT_EOF
  82    * </ul>
  83    */
  84   public int ttype = TT_NONE;
  85
  86   /** The String associated with word and string tokens. */
  87   public String sval;
  88
  89   /** The numeric value associated with number tokens. */
  90   public double nval;
  91
  92   /* Indicates whether end-of-line is recognized as a token. */
  93   private boolean eolSignificant = false;
  94
  95   /* Indicates whether word tokens are automatically made lower case. */
  96   private boolean lowerCase = false;
  97
  98   /* Indicates whether C++ style comments are recognized and skipped. */
  99   private boolean slashSlash = false;
 100
 101   /* Indicates whether C style comments are recognized and skipped. */
 102   private boolean slashStar = false;
 103
 104   /* Attribute tables of each byte from 0x00 to 0xFF. */
 105   private boolean[] whitespace = new boolean[256];
 106   private boolean[] alphabetic = new boolean[256];
 107   private boolean[] numeric = new boolean[256];
 108   private boolean[] quote = new boolean[256];
 109   private boolean[] comment = new boolean[256];
 110
 111   /* The Reader associated with this class. */
 112   private PushbackReader in;
 113
 114   /* Indicates if a token has been pushed back. */
 115   private boolean pushedBack = false;
 116
 117   /* Contains the current line number of the reader. */
 118   private int lineNumber = 1;
 119
 120   /**
 121    * This method reads bytes from an <code>InputStream</code> and tokenizes
 122    * them.  For details on how this method operates by default, see
 123    * <code>StreamTokenizer(Reader)</code>.
 124    *
 125    * @param in The <code>InputStream</code> to read from
 126    *
 127    * @deprecated Since JDK 1.1.
 128    */
 129   public StreamTokenizer(InputStream is)
 130   {
 131     this(new InputStreamReader(is));
 132   }
 133
 134   /**
 135    * This method initializes a new <code>StreamTokenizer</code> to read
 136    * characters from a <code>Reader</code> and parse them.  The char values
 137    * have their hight bits masked so that the value is treated a character
 138    * in the range of 0x0000 to 0x00FF.
 139    * <p>
 140    * This constructor sets up the parsing table to parse the stream in the
 141    * following manner:
 142    * <ul>
 143    * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF
 144    *     are initialized as alphabetic
 145    * <li>The values 0x00 through 0x20 are initialized as whitespace
 146    * <li>The values '\'' and '"' are initialized as quote characters
 147    * <li>'/' is a comment character
 148    * <li>Numbers will be parsed
 149    * <li>EOL is not treated as significant
 150    * <li>C  and C++ (//) comments are not recognized
 151    * </ul>
 152    *
 153    * @param in The <code>Reader</code> to read chars from
 154    */
 155   public StreamTokenizer(Reader r)
 156   {
 157     in = new PushbackReader(r);
 158
 159     whitespaceChars(0x00, 0x20);
 160     wordChars('A', 'Z');
 161     wordChars('a', 'z');
 162     wordChars(0xA0, 0xFF);
 163     commentChar('/');
 164     quoteChar('\'');
 165     quoteChar('"');
 166     parseNumbers();
 167   }
 168
 169   /**
 170    * This method sets the comment attribute on the specified
 171    * character.  Other attributes for the character are cleared.
 172    *
 173    * @param c The character to set the comment attribute for, passed as an int
 174    */
 175   public void commentChar(int ch)
 176   {
 177     if (ch >= 0 && ch <= 255)
 178       {
 179         comment[ch] = true;
 180         whitespace[ch] = false;
 181         alphabetic[ch] = false;
 182         numeric[ch] = false;
 183         quote[ch] = false;
 184       }
 185   }
 186
 187   /**
 188    * This method sets a flag that indicates whether or not the end of line
 189    * sequence terminates and is a token.  The defaults to <code>false</code>
 190    *
 191    * @param flag <code>true</code> if EOF is significant, <code>false</code>
 192    *             otherwise
 193    */
 194   public void eolIsSignificant(boolean flag)
 195   {
 196     eolSignificant = flag;
 197   }
 198
 199   /**
 200    * This method returns the current line number.  Note that if the
 201    * <code>pushBack()</code> method is called, it has no effect on the
 202    * line number returned by this method.
 203    *
 204    * @return The current line number
 205    */
 206   public int lineno()
 207   {
 208     return lineNumber;
 209   }
 210
 211   /**
 212    * This method sets a flag that indicates whether or not alphabetic
 213    * tokens that are returned should be converted to lower case.
 214    *
 215    * @param flag <code>true</code> to convert to lower case,
 216    *             <code>false</code> otherwise
 217    */
 218   public void lowerCaseMode(boolean flag)
 219   {
 220     lowerCase = flag;
 221   }
 222
 223   private boolean isWhitespace(int ch)
 224   {
 225     return (ch >= 0 && ch <= 255 && whitespace[ch]);
 226   }
 227
 228   private boolean isAlphabetic(int ch)
 229   {
 230     return ((ch > 255) || (ch >= 0 && alphabetic[ch]));
 231   }
 232
 233   private boolean isNumeric(int ch)
 234   {
 235     return (ch >= 0 && ch <= 255 && numeric[ch]);
 236   }
 237
 238   private boolean isQuote(int ch)
 239   {
 240     return (ch >= 0 && ch <= 255 && quote[ch]);
 241   }
 242
 243   private boolean isComment(int ch)
 244   {
 245     return (ch >= 0 && ch <= 255 && comment[ch]);
 246   }
 247
 248   /**
 249    * This method reads the next token from the stream.  It sets the
 250    * <code>ttype</code> variable to the appropriate token type and
 251    * returns it.  It also can set <code>sval</code> or <code>nval</code>
 252    * as described below.  The parsing strategy is as follows:
 253    * <ul>
 254    * <li>Skip any whitespace characters.
 255    * <li>If a numeric character is encountered, attempt to parse a numeric
 256    * value.  Leading '-' characters indicate a numeric only if followed by
 257    * another non-'-' numeric.  The value of the numeric token is terminated
 258    * by either the first non-numeric encountered, or the second occurrence of
 259    * '-' or '.'.  The token type returned is TT_NUMBER and <code>nval</code>
 260    * is set to the value parsed.
 261    * <li>If an alphabetic character is parsed, all subsequent characters
 262    * are read until the first non-alphabetic or non-numeric character is
 263    * encountered.  The token type returned is TT_WORD and the value parsed
 264    * is stored in <code>sval</code>.  If lower case mode is set, the token
 265    * stored in <code>sval</code> is converted to lower case.  The end of line
 266    * sequence terminates a word only if EOL signficance has been turned on.
 267    * The start of a comment also terminates a word.  Any character with a
 268    * non-alphabetic and non-numeric attribute (such as white space, a quote,
 269    * or a commet) are treated as non-alphabetic and terminate the word.
 270    * <li>If a comment character is parsed, then all remaining characters on
 271    * the current line are skipped and another token is parsed.  Any EOL or
 272    * EOF's encountered are not discarded, but rather terminate the comment.
 273    * <li>If a quote character is parsed, then all characters up to the
 274    * second occurrence of the same quote character are parsed into a
 275    * <code>String</code>.  This <code>String</code> is stored as
 276    * <code>sval</code>, but is not converted to lower case, even if lower case
 277    * mode is enabled.  The token type returned is the value of the quote
 278    * character encountered.  Any escape sequences
 279    * (\b (backspace), \t (HTAB), \n (linefeed), \f (form feed), \r
 280    * (carriage return), \" (double quote), \' (single quote), \\
 281    * (backslash), \XXX (octal esacpe)) are converted to the appropriate
 282    * char values.  Invalid esacape sequences are left in untranslated.
 283    * Unicode characters like ('\ u0000') are not recognized.
 284    * <li>If the C++ comment sequence "//" is encountered, and the parser
 285    * is configured to handle that sequence, then the remainder of the line
 286    * is skipped and another token is read exactly as if a character with
 287    * the comment attribute was encountered.
 288    * <li>If the C comment sequence "/*" is encountered, and the parser
 289    * is configured to handle that sequence, then all characters up to and
 290    * including the comment terminator sequence are discarded and another
 291    * token is parsed.
 292    * <li>If all cases above are not met, then the character is an ordinary
 293    * character that is parsed as a token by itself.  The char encountered
 294    * is returned as the token type.
 295    * </ul>
 296    *
 297    * @return The token type
 298    * @exception IOException If an I/O error occurs
 299    */
 300   public int nextToken() throws IOException
 301   {
 302     if (pushedBack)
 303       {
 304         pushedBack = false;
 305         if (ttype != TT_NONE)
 306           return ttype;
 307       }
 308
 309     sval = null;
 310     int ch;
 311
 312     // Skip whitespace.  Deal with EOL along the way.
 313     while (isWhitespace(ch = in.read()))
 314       if (ch == '\n' || ch == '\r')
 315         {
 316           lineNumber++;
 317
 318           // Throw away \n if in combination with \r.
 319           if (ch == '\r' && (ch = in.read()) != '\n')
 320             {
 321               if (ch != TT_EOF)
 322                 in.unread(ch);
 323             }
 324           if (eolSignificant)
 325             return (ttype = TT_EOL);
 326         }
 327
 328     if (ch == '/')
 329       if ((ch = in.read()) == '/' && slashSlash)
 330         {
 331           while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF)
 332             ;
 333           if (ch != TT_EOF)
 334             in.unread(ch);
 335           return nextToken(); // Recursive, but not too deep in normal cases
 336         }
 337       else if (ch == '*' && slashStar)
 338         {
 339           while (true)
 340             {
 341               ch = in.read();
 342               if (ch == '*')
 343                 {
 344                   if ((ch = in.read()) == '/')
 345                     break;
 346                   else if (ch != TT_EOF)
 347                     in.unread(ch);
 348                 }
 349               else if (ch == '\n' || ch == '\r')
 350                 {
 351                   lineNumber++;
 352                   if (ch == '\r' && (ch = in.read()) != '\n')
 353                     {
 354                       if (ch != TT_EOF)
 355                         in.unread(ch);
 356                     }
 357                 }
 358               else if (ch == TT_EOF)
 359                 {
 360                   break;
 361                 }
 362             }
 363           return nextToken(); // Recursive, but not too deep in normal cases
 364         }
 365       else
 366         {
 367           if (ch != TT_EOF)
 368             in.unread(ch);
 369           ch = '/';
 370         }
 371
 372     if (ch == TT_EOF)
 373       ttype = TT_EOF;
 374     else if (isNumeric(ch))
 375       {
 376         boolean isNegative = false;
 377         if (ch == '-')
 378           {
 379             // Read ahead to see if this is an ordinary '-' rather than numeric.
 380             ch = in.read();
 381             if (isNumeric(ch) && ch != '-')
 382               {
 383                 isNegative = true;
 384               }
 385             else
 386               {
 387                 if (ch != TT_EOF)
 388                   in.unread(ch);
 389                 return (ttype = '-');
 390               }
 391           }
 392
 393         StringBuffer tokbuf = new StringBuffer();
 394         tokbuf.append((char) ch);
 395
 396         int decCount = 0;
 397         while (isNumeric(ch = in.read()) && ch != '-')
 398           if (ch == '.' && decCount++ > 0)
 399             break;
 400           else
 401             tokbuf.append((char) ch);
 402
 403         if (ch != TT_EOF)
 404           in.unread(ch);
 405         ttype = TT_NUMBER;
 406         try
 407           {
 408             nval = Double.valueOf(tokbuf.toString()).doubleValue();
 409           }
 410         catch (NumberFormatException _)
 411           {
 412             nval = 0.0;
 413           }
 414         if (isNegative)
 415           nval = -nval;
 416       }
 417     else if (isAlphabetic(ch))
 418       {
 419         StringBuffer tokbuf = new StringBuffer();
 420         tokbuf.append((char) ch);
 421         while (isAlphabetic(ch = in.read()) || isNumeric(ch))
 422           tokbuf.append((char) ch);
 423         if (ch != TT_EOF)
 424           in.unread(ch);
 425         ttype = TT_WORD;
 426         sval = tokbuf.toString();
 427         if (lowerCase)
 428           sval = sval.toLowerCase();
 429       }
 430     else if (isComment(ch))
 431       {
 432         while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF)
 433           ;
 434         if (ch != TT_EOF)
 435           in.unread(ch);
 436         return nextToken();     // Recursive, but not too deep in normal cases.
 437       }
 438     else if (isQuote(ch))
 439       {
 440         ttype = ch;
 441         StringBuffer tokbuf = new StringBuffer();
 442         while ((ch = in.read()) != ttype && ch != '\n' && ch != '\r' &&
 443                ch != TT_EOF)
 444           {
 445             if (ch == '\\')
 446               switch (ch = in.read())
 447                 {
 448                   case 'a':     ch = 0x7;
 449                     break;
 450                   case 'b':     ch = '\b';
 451                     break;
 452                   case 'f':     ch = 0xC;
 453                     break;
 454                   case 'n':     ch = '\n';
 455                     break;
 456                   case 'r':     ch = '\r';
 457                     break;
 458                   case 't':     ch = '\t';
 459                     break;
 460                   case 'v':     ch = 0xB;
 461                     break;
 462                   case '\n':    ch = '\n';
 463                     break;
 464                   case '\r':    ch = '\r';
 465                     break;
 466                   case '\"':
 467                   case '\'':
 468                   case '\\':
 469                     break;
 470                   default:
 471                     int ch1, nextch;
 472                     if ((nextch = ch1 = ch) >= '0' && ch <= '7')
 473                       {
 474                         ch -= '0';
 475                         if ((nextch = in.read()) >= '0' && nextch <= '7')
 476                           {
 477                             ch = ch * 8 + nextch - '0';
 478                             if ((nextch = in.read()) >= '0' && nextch <= '7' &&
 479                                 ch1 >= '0' && ch1 <= '3')
 480                               {
 481                                 ch = ch * 8 + nextch - '0';
 482                                 nextch = in.read();
 483                               }
 484                           }
 485                       }
 486
 487                     if (nextch != TT_EOF)
 488                       in.unread(nextch);
 489                 }
 490
 491             tokbuf.append((char) ch);
 492           }
 493
 494         // Throw away matching quote char.
 495         if (ch != ttype && ch != TT_EOF)
 496           in.unread(ch);
 497
 498         sval = tokbuf.toString();
 499       }
 500     else
 501       {
 502         ttype = ch;
 503       }
 504
 505     return ttype;
 506   }
 507
 508   private void resetChar(int ch)
 509   {
 510     whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] =
 511       false;
 512   }
 513
 514   /**
 515    * This method makes the specified character an ordinary character.  This
 516    * means that none of the attributes (whitespace, alphabetic, numeric,
 517    * quote, or comment) will be set on this character.  This character will
 518    * parse as its own token.
 519    *
 520    * @param c The character to make ordinary, passed as an int
 521    */
 522   public void ordinaryChar(int ch)
 523   {
 524     if (ch >= 0 && ch <= 255)
 525       resetChar(ch);
 526   }
 527
 528   /**
 529    * This method makes all the characters in the specified range, range
 530    * terminators included, ordinary.  This means the none of the attributes
 531    * (whitespace, alphabetic, numeric, quote, or comment) will be set on
 532    * any of the characters in the range.  This makes each character in this
 533    * range parse as its own token.
 534    *
 535    * @param low The low end of the range of values to set the whitespace
 536    *            attribute for
 537    * @param high The high end of the range of values to set the whitespace
 538    *            attribute for
 539    */
 540   public void ordinaryChars(int low, int hi)
 541   {
 542     if (low < 0)
 543       low = 0;
 544     if (hi > 255)
 545       hi = 255;
 546     for (int i = low; i <= hi; i++)
 547       resetChar(i);
 548   }
 549
 550   /**
 551    * This method sets the numeric attribute on the characters '0' - '9' and
 552    * the characters '.' and '-'.
 553    */
 554   public void parseNumbers()
 555   {
 556     for (int i = 0; i <= 9; i++)
 557       numeric['0' + i] = true;
 558
 559     numeric['.'] = true;
 560     numeric['-'] = true;
 561   }
 562
 563   /**
 564    * Puts the current token back into the StreamTokenizer so
 565    * <code>nextToken</code> will return the same value on the next call.
 566    * May cause the lineno method to return an incorrect value
 567    * if lineno is called before the next call to nextToken.
 568    */
 569   public void pushBack()
 570   {
 571     pushedBack = true;
 572   }
 573
 574   /**
 575    * This method sets the quote attribute on the specified character.
 576    * Other attributes for the character are cleared.
 577    *
 578    * @param c The character to set the quote attribute for, passed as an int.
 579    */
 580   public void quoteChar(int ch)
 581   {
 582     if (ch >= 0 && ch <= 255)
 583       {
 584         quote[ch] = true;
 585         comment[ch] = false;
 586         whitespace[ch] = false;
 587         alphabetic[ch] = false;
 588         numeric[ch] = false;
 589       }
 590   }
 591
 592   /**
 593    * This method removes all attributes (whitespace, alphabetic, numeric,
 594    * quote, and comment) from all characters.  It is equivalent to calling
 595    * <code>ordinaryChars(0x00, 0xFF)</code>.
 596    *
 597    * @see #ordinaryChars(int, int)
 598    */
 599   public void resetSyntax()
 600   {
 601     ordinaryChars(0x00, 0xFF);
 602   }
 603
 604   /**
 605    * This method sets a flag that indicates whether or not "C++" language style
 606    * comments ("//" comments through EOL ) are handled by the parser.
 607    * If this is <code>true</code> commented out sequences are skipped and
 608    * ignored by the parser.  This defaults to <code>false</code>.
 609    *
 610    * @param flag <code>true</code> to recognized and handle "C++" style
 611    *             comments, <code>false</code> otherwise
 612    */
 613   public void slashSlashComments(boolean flag)
 614   {
 615     slashSlash = flag;
 616   }
 617
 618   /**
 619    * This method sets a flag that indicates whether or not "C" language style
 620    * comments (with nesting not allowed) are handled by the parser.
 621    * If this is <code>true</code> commented out sequences are skipped and
 622    * ignored by the parser.  This defaults to <code>false</code>.
 623    *
 624    * @param flag <code>true</code> to recognized and handle "C" style comments,
 625    *             <code>false</code> otherwise
 626    */
 627   public void slashStarComments(boolean flag)
 628   {
 629     slashStar = flag;
 630   }
 631
 632   /**
 633    * This method returns the current token value as a <code>String</code> in
 634    * the form "Token[x], line n", where 'n' is the current line numbers and
 635    * 'x' is determined as follows.
 636    * <p>
 637    * <ul>
 638    * <li>If no token has been read, then 'x' is "NOTHING" and 'n' is 0
 639    * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"
 640    * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"
 641    * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code>
 642    * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where
 643    * 'strnval' is <code>String.valueOf(nval)</code>.
 644    * <li>If <code>ttype</code> is a quote character, then 'x' is
 645    * <code>sval</code>
 646    * <li>For all other cases, 'x' is <code>ttype</code>
 647    * </ul>
 648    */
 649   public String toString()
 650   {
 651     String tempstr;
 652     if (ttype == TT_EOF)
 653       tempstr = "EOF";
 654     else if (ttype == TT_EOL)
 655       tempstr = "EOL";
 656     else if (ttype == TT_WORD)
 657       tempstr = sval;
 658     else if (ttype == TT_NUMBER)
 659       tempstr = "n=" + nval;
 660     else if (ttype == TT_NONE)
 661       tempstr = "NOTHING";
 662     else // must be an ordinary char.
 663       tempstr = "\'" + (char) ttype + "\'";
 664
 665     return "Token[" + tempstr + "], line " + lineno();
 666   }
 667
 668   /**
 669    * This method sets the whitespace attribute for all characters in the
 670    * specified range, range terminators included.
 671    *
 672    * @param low The low end of the range of values to set the whitespace
 673    *            attribute for
 674    * @param high The high end of the range of values to set the whitespace
 675    *             attribute for
 676    */
 677   public void whitespaceChars(int low, int hi)
 678   {
 679     if (low < 0)
 680       low = 0;
 681     if (hi > 255)
 682       hi = 255;
 683     for (int i = low; i <= hi; i++)
 684       {
 685         resetChar(i);
 686         whitespace[i] = true;
 687       }
 688   }
 689
 690   /**
 691    * This method sets the alphabetic attribute for all characters in the
 692    * specified range, range terminators included.
 693    *
 694    * @param low The low end of the range of values to set the alphabetic
 695    *            attribute for
 696    * @param high The high end of the range of values to set the alphabetic
 697    *             attribute for
 698    */
 699   public void wordChars(int low, int hi)
 700   {
 701     if (low < 0)
 702       low = 0;
 703     if (hi > 255)
 704       hi = 255;
 705     for (int i = low; i <= hi; i++)
 706       alphabetic[i] = true;
 707   }
 708 }