src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering, meyering@cs.utexas.edu.  */
  19
  20 #include <config.h>
  21
  22 /* Get isblank from GNU libc.  */
  23 #define _GNU_SOURCE
  24
  25 #include <stdio.h>
  26 #define NDEBUG 1
  27 #include <assert.h>
  28 #include <errno.h>
  29 #include <sys/types.h>
  30 #include <getopt.h>
  31
  32 #if HAVE_LIMITS_H
  33 # include <limits.h>
  34 #endif
  35
  36 #include "system.h"
  37 #include "version.h"
  38 #include "error.h"
  39
  40 #ifndef ULONG_MAX
  41 #define ULONG_MAX ((unsigned long) ~(unsigned long) 0)
  42 #endif
  43
  44 #ifndef LONG_MAX
  45 #define LONG_MAX ((long int) (ULONG_MAX >> 1))
  46 #endif
  47
  48 #ifndef UINT_MAX
  49 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
  50 #endif
  51
  52 #ifndef INT_MAX
  53 # define INT_MAX ((int) (UINT_MAX >> 1))
  54 #endif
  55
  56 #ifndef UCHAR_MAX
  57 #define UCHAR_MAX 0xFF
  58 #endif
  59
  60 #define N_CHARS (UCHAR_MAX + 1)
  61
  62 /* A pointer to a function that returns an int.  */
  63 typedef int (*PFI) ();
  64
  65 /* Convert from character C to its index in the collating
  66    sequence array.  Just cast to an unsigned int to avoid
  67    problems with sign-extension.  */
  68 #define ORD(c) (unsigned int)(c)
  69
  70 /* The inverse of ORD.  */
  71 #define CHR(i) (unsigned char)(i)
  72
  73 /* The value for Spec_list->state that indicates to
  74    get_next that it should initialize the tail pointer.
  75    Its value should be as large as possible to avoid conflict
  76    a valid value for the state field -- and that may be as
  77    large as any valid repeat_count.  */
  78 #define BEGIN_STATE (INT_MAX - 1)
  79
  80 /* The value for Spec_list->state that indicates to
  81    get_next that the element pointed to by Spec_list->tail is
  82    being considered for the first time on this pass through the
  83    list -- it indicates that get_next should make any necessary
  84    initializations.  */
  85 #define NEW_ELEMENT (BEGIN_STATE + 1)
  86
  87 /* A value distinct from any character that may have been stored in a
  88    buffer as the result of a block-read in the function squeeze_filter.  */
  89 #define NOT_A_CHAR (unsigned int)(-1)
  90
  91 /* The following (but not CC_NO_CLASS) are indices into the array of
  92    valid character class strings.  */
  93 enum Char_class
  94   {
  95     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  96     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  97     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  98     CC_NO_CLASS = 9999
  99   };
 100
 101 /* Character class to which a character (returned by get_next) belonged;
 102    but it is set only if the construct from which the character was obtained
 103    was one of the character classes [:upper:] or [:lower:].  The value
 104    is used only when translating and then, only to make sure that upper
 105    and lower class constructs have the same relative positions in string1
 106    and string2.  */
 107 enum Upper_Lower_class
 108   {
 109     UL_LOWER = 0,
 110     UL_UPPER = 1,
 111     UL_NONE = 2
 112   };
 113
 114 /* A shortcut to ensure that when constructing the translation array,
 115    one of the values returned by paired calls to get_next (from s1 and s2)
 116    is from [:upper:] and the other is from [:lower:], or neither is from
 117    upper or lower.  In fact, no other character classes are allowed when
 118    translating, but that condition is tested elsewhere.  This array is
 119    indexed by values of type enum Upper_Lower_class.  */
 120 static int const class_ok[3][3] =
 121 {
 122   {0, 1, 0},
 123   {1, 0, 0},
 124   {0, 0, 1}
 125 };
 126
 127 /* The type of a List_element.  See build_spec_list for more details.  */
 128 enum Range_element_type
 129   {
 130     RE_NO_TYPE = 0,
 131     RE_NORMAL_CHAR,
 132     RE_RANGE,
 133     RE_CHAR_CLASS,
 134     RE_EQUIV_CLASS,
 135     RE_REPEATED_CHAR
 136   };
 137
 138 /* One construct in one of tr's argument strings.
 139    For example, consider the POSIX version of the classic tr command:
 140        tr -cs 'a-zA-Z_' '[\n*]'
 141    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 142    and a single normal character, `_'.  String2 has one construct.  */
 143 struct List_element
 144   {
 145     enum Range_element_type type;
 146     struct List_element *next;
 147     union
 148       {
 149         int normal_char;
 150         struct                  /* unnamed */
 151           {
 152             unsigned int first_char;
 153             unsigned int last_char;
 154           }
 155         range;
 156         enum Char_class char_class;
 157         int equiv_code;
 158         struct                  /* unnamed */
 159           {
 160             unsigned int the_repeated_char;
 161             size_t repeat_count;
 162           }
 163         repeated_char;
 164       }
 165     u;
 166   };
 167
 168 /* Each of tr's argument strings is parsed into a form that is easier
 169    to work with: a linked list of constructs (struct List_element).
 170    Each Spec_list structure also encapsulates various attributes of
 171    the corresponding argument string.  The attributes are used mainly
 172    to verify that the strings are valid in the context of any options
 173    specified (like -s, -d, or -c).  The main exception is the member
 174    `tail', which is first used to construct the list.  After construction,
 175    it is used by get_next to save its state when traversing the list.
 176    The member `state' serves a similar function.  */
 177 struct Spec_list
 178   {
 179     /* Points to the head of the list of range elements.
 180        The first struct is a dummy; its members are never used.  */
 181     struct List_element *head;
 182
 183     /* When appending, points to the last element.  When traversing via
 184        get_next(), points to the element to process next.  Setting
 185        Spec_list.state to the value BEGIN_STATE before calling get_next
 186        signals get_next to initialize tail to point to head->next.  */
 187     struct List_element *tail;
 188
 189     /* Used to save state between calls to get_next().  */
 190     unsigned int state;
 191
 192     /* Length, in the sense that length ('a-z[:digit:]123abc')
 193        is 42 ( = 26 + 10 + 6).  */
 194     size_t length;
 195
 196     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 197     int n_indefinite_repeats;
 198
 199     /* If n_indefinite_repeats is nonzero, this points to the List_element
 200        corresponding to the last [c*] or [c*0] construct encountered in
 201        this spec.  Otherwise it is undefined.  */
 202     struct List_element *indefinite_repeat_element;
 203
 204     /* Non-zero if this spec contains at least one equivalence
 205        class construct e.g. [=c=].  */
 206     int has_equiv_class;
 207
 208     /* Non-zero if this spec contains at least one of [:upper:] or
 209        [:lower:] class constructs.  */
 210     int has_upper_or_lower;
 211
 212     /* Non-zero if this spec contains at least one of the character class
 213        constructs (all but upper and lower) that aren't allowed in s2.  */
 214     int has_restricted_char_class;
 215   };
 216
 217 /* A representation for escaped string1 or string2.  As a string is parsed,
 218    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 219    etc.) are marked as such in this structure by setting the corresponding
 220    entry in the ESCAPED vector.  */
 221 struct E_string
 222 {
 223   unsigned char *s;
 224   int *escaped;
 225   size_t len;
 226 };
 227
 228 /* Return nonzero if the Ith character of escaped string ES matches C
 229    and is not escaped itself.  */
 230 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 231
 232
 233 char *xmalloc ();
 234 char *stpcpy ();
 235 int safe_read ();
 236
 237 /* The name by which this program was run.  */
 238 char *program_name;
 239
 240 /* When nonzero, each sequence in the input of a repeated character
 241    (call it c) is replaced (in the output) by a single occurrence of c
 242    for every c in the squeeze set.  */
 243 static int squeeze_repeats = 0;
 244
 245 /* When nonzero, removes characters in the delete set from input.  */
 246 static int delete = 0;
 247
 248 /* Use the complement of set1 in place of set1.  */
 249 static int complement = 0;
 250
 251 /* When nonzero, this flag causes GNU tr to provide strict
 252    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 253    says that when -d is used without -s, string2 (if present)
 254    must be ignored.  Silently ignoring arguments is a bad idea.
 255    The default GNU behavior is to give a usage message and exit.
 256    Additionally, when this flag is nonzero, tr prints warnings
 257    on stderr if it is being used in a manner that is not portable.
 258    Applicable warnings are given by default, but are suppressed
 259    if the environment variable `POSIXLY_CORRECT' is set, since
 260    being POSIX conformant means we can't issue such messages.
 261    Warnings on the following topics are suppressed when this
 262    variable is nonzero:
 263    1. Ambiguous octal escapes.  */
 264 static int posix_pedantic;
 265
 266 /* When tr is performing translation and string1 is longer than string2,
 267    POSIX says that the result is undefined.  That gives the implementor
 268    of a POSIX conforming version of tr two reasonable choices for the
 269    semantics of this case.
 270
 271    * The BSD tr pads string2 to the length of string1 by
 272    repeating the last character in string2.
 273
 274    * System V tr ignores characters in string1 that have no
 275    corresponding character in string2.  That is, string1 is effectively
 276    truncated to the length of string2.
 277
 278    When nonzero, this flag causes GNU tr to imitate the behavior
 279    of System V tr when translating with string1 longer than string2.
 280    The default is to emulate BSD tr.  This flag is ignored in modes where
 281    no translation is performed.  Emulating the System V tr
 282    in this exceptional case causes the relatively common BSD idiom:
 283
 284        tr -cs A-Za-z0-9 '\012'
 285
 286    to break (it would convert only zero bytes, rather than all
 287    non-alphanumerics, to newlines).
 288
 289    WARNING: This switch does not provide general BSD or System V
 290    compatibility.  For example, it doesn't disable the interpretation
 291    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 292    some unfortunate coincidence you use such constructs in scripts
 293    expecting to use some other version of tr, the scripts will break.  */
 294 static int truncate_set1 = 0;
 295
 296 /* An alias for (!delete && non_option_args == 2).
 297    It is set in main and used there and in validate().  */
 298 static int translating;
 299
 300 #ifndef BUFSIZ
 301 #define BUFSIZ 8192
 302 #endif
 303
 304 #define IO_BUF_SIZE BUFSIZ
 305 static unsigned char io_buf[IO_BUF_SIZE];
 306
 307 static char const *const char_class_name[] =
 308 {
 309   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 310   "lower", "print", "punct", "space", "upper", "xdigit"
 311 };
 312 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 313
 314 typedef char SET_TYPE;
 315
 316 /* Array of boolean values.  A character `c' is a member of the
 317    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 318    set is defined by the last (possibly, the only) string argument
 319    on the command line when the squeeze option is given.  */
 320 static SET_TYPE in_squeeze_set[N_CHARS];
 321
 322 /* Array of boolean values.  A character `c' is a member of the
 323    delete set if and only if in_delete_set[c] is true.  The delete
 324    set is defined by the first (or only) string argument on the
 325    command line when the delete option is given.  */
 326 static SET_TYPE in_delete_set[N_CHARS];
 327
 328 /* Array of character values defining the translation (if any) that
 329    tr is to perform.  Translation is performed only when there are
 330    two specification strings and the delete switch is not given.  */
 331 static char xlate[N_CHARS];
 332
 333 /* If nonzero, display usage information and exit.  */
 334 static int show_help;
 335
 336 /* If nonzero, print the version on standard output then exit.  */
 337 static int show_version;
 338
 339 static struct option const long_options[] =
 340 {
 341   {"complement", no_argument, NULL, 'c'},
 342   {"delete", no_argument, NULL, 'd'},
 343   {"squeeze-repeats", no_argument, NULL, 's'},
 344   {"truncate-set1", no_argument, NULL, 't'},
 345   {"help", no_argument, &show_help, 1},
 346   {"version", no_argument, &show_version, 1},
 347   {NULL, 0, NULL, 0}
 348 };
 349 \f
 350 static void
 351 usage (int status)
 352 {
 353   if (status != 0)
 354     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 355              program_name);
 356   else
 357     {
 358       printf (_("\
 359 Usage: %s [OPTION]... SET1 [SET2]\n\
 360 "),
 361               program_name);
 362       printf (_("\
 363 Translate, squeeze, and/or delete characters from standard input,\n\
 364 writing to standard output.\n\
 365 \n\
 366   -c, --complement        first complement SET1\n\
 367   -d, --delete            delete characters in SET1, do not translate\n\
 368   -s, --squeeze-repeats   replace sequence of characters with one\n\
 369   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 370       --help              display this help and exit\n\
 371       --version           output version information and exit\n\
 372 "));
 373       printf (_("\
 374 \n\
 375 SETs are specified as strings of characters.  Most represent themselves.\n\
 376 Interpreted sequences are:\n\
 377 \n\
 378   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 379   \\\\              backslash\n\
 380   \\a              audible BEL\n\
 381   \\b              backspace\n\
 382   \\f              form feed\n\
 383   \\n              new line\n\
 384   \\r              return\n\
 385   \\t              horizontal tab\n\
 386   \\v              vertical tab\n\
 387   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 388   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 389   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 390   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 391   [:alnum:]       all letters and digits\n\
 392   [:alpha:]       all letters\n\
 393   [:blank:]       all horizontal whitespace\n\
 394   [:cntrl:]       all control characters\n\
 395   [:digit:]       all digits\n\
 396   [:graph:]       all printable characters, not including space\n\
 397   [:lower:]       all lower case letters\n\
 398   [:print:]       all printable characters, including space\n\
 399   [:punct:]       all punctuation characters\n\
 400   [:space:]       all horizontal or vertical whitespace\n\
 401   [:upper:]       all upper case letters\n\
 402   [:xdigit:]      all hexadecimal digits\n\
 403   [=CHAR=]        all characters which are equivalent to CHAR\n\
 404 "));
 405       printf (_("\
 406 \n\
 407 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 408 -t may be used only when translating.  SET2 is extended to length of\n\
 409 SET1 by repeating its last character as necessary.  Excess characters\n\
 410 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 411 expand in ascending order; used in SET2 while translating, they may\n\
 412 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 413 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 414 translation or deletion.\n\
 415 "));
 416     }
 417   exit (status);
 418 }
 419
 420 /* Return nonzero if the character C is a member of the
 421    equivalence class containing the character EQUIV_CLASS.  */
 422
 423 static int
 424 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 425 {
 426   return (equiv_class == c);
 427 }
 428
 429 /* Return nonzero if the character C is a member of the
 430    character class CHAR_CLASS.  */
 431
 432 static int
 433 is_char_class_member (enum Char_class char_class, unsigned int c)
 434 {
 435   int result;
 436
 437   switch (char_class)
 438     {
 439     case CC_ALNUM:
 440       result = ISALNUM (c);
 441       break;
 442     case CC_ALPHA:
 443       result = ISALPHA (c);
 444       break;
 445     case CC_BLANK:
 446       result = ISBLANK (c);
 447       break;
 448     case CC_CNTRL:
 449       result = ISCNTRL (c);
 450       break;
 451     case CC_DIGIT:
 452       result = ISDIGIT (c);
 453       break;
 454     case CC_GRAPH:
 455       result = ISGRAPH (c);
 456       break;
 457     case CC_LOWER:
 458       result = ISLOWER (c);
 459       break;
 460     case CC_PRINT:
 461       result = ISPRINT (c);
 462       break;
 463     case CC_PUNCT:
 464       result = ISPUNCT (c);
 465       break;
 466     case CC_SPACE:
 467       result = ISSPACE (c);
 468       break;
 469     case CC_UPPER:
 470       result = ISUPPER (c);
 471       break;
 472     case CC_XDIGIT:
 473       result = ISXDIGIT (c);
 474       break;
 475     default:
 476       abort ();
 477       break;
 478     }
 479   return result;
 480 }
 481
 482 /* Perform the first pass over each range-spec argument S, converting all
 483    \c and \ddd escapes to their one-byte representations.  The conversion
 484    is done in-place, so S must point to writable storage.  If an invalid
 485    quote sequence is found print an error message and return nonzero.
 486    Otherwise set *LEN to the length of the resulting string and return
 487    zero.  The resulting array of characters may contain zero-bytes;
 488    however, on input, S is assumed to be null-terminated, and hence
 489    cannot contain actual (non-escaped) zero bytes.  */
 490
 491 static int
 492 unquote (const unsigned char *s, struct E_string *es)
 493 {
 494   size_t i, j;
 495   size_t len;
 496
 497   len = strlen ((char *) s);
 498
 499   es->s = (unsigned char *) xmalloc (len);
 500   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 501   for (i = 0; i < len; i++)
 502     es->escaped[i] = 0;
 503
 504   j = 0;
 505   for (i = 0; s[i]; i++)
 506     {
 507       switch (s[i])
 508         {
 509           int c;
 510         case '\\':
 511           switch (s[i + 1])
 512             {
 513               int oct_digit;
 514             case '\\':
 515               c = '\\';
 516               break;
 517             case 'a':
 518               c = '\007';
 519               break;
 520             case 'b':
 521               c = '\b';
 522               break;
 523             case 'f':
 524               c = '\f';
 525               break;
 526             case 'n':
 527               c = '\n';
 528               break;
 529             case 'r':
 530               c = '\r';
 531               break;
 532             case 't':
 533               c = '\t';
 534               break;
 535             case 'v':
 536               c = '\v';
 537               break;
 538             case '0':
 539             case '1':
 540             case '2':
 541             case '3':
 542             case '4':
 543             case '5':
 544             case '6':
 545             case '7':
 546               c = s[i + 1] - '0';
 547               oct_digit = s[i + 2] - '0';
 548               if (0 <= oct_digit && oct_digit <= 7)
 549                 {
 550                   c = 8 * c + oct_digit;
 551                   ++i;
 552                   oct_digit = s[i + 2] - '0';
 553                   if (0 <= oct_digit && oct_digit <= 7)
 554                     {
 555                       if (8 * c + oct_digit < N_CHARS)
 556                         {
 557                           c = 8 * c + oct_digit;
 558                           ++i;
 559                         }
 560                       else if (!posix_pedantic)
 561                         {
 562                           /* Any octal number larger than 0377 won't
 563                              fit in 8 bits.  So we stop when adding the
 564                              next digit would put us over the limit and
 565                              give a warning about the ambiguity.  POSIX
 566                              isn't clear on this, but one person has said
 567                              that in his interpretation, POSIX says tr
 568                              can't even give a warning.  */
 569                           error (0, 0, _("warning: the ambiguous octal escape \
 570 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 571                                  s[i], s[i + 1], s[i + 2],
 572                                  s[i], s[i + 1], s[i + 2]);
 573                         }
 574                     }
 575                 }
 576               break;
 577             case '\0':
 578               error (0, 0, _("invalid backslash escape at end of string"));
 579               return 1;
 580
 581             default:
 582               if (posix_pedantic)
 583                 {
 584                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 585                   return 1;
 586                 }
 587               else
 588                 {
 589                   c = s[i + 1];
 590                   es->escaped[j] = 1;
 591                 }
 592             }
 593           ++i;
 594           es->s[j++] = c;
 595           break;
 596         default:
 597           es->s[j++] = s[i];
 598           break;
 599         }
 600     }
 601   es->len = j;
 602   return 0;
 603 }
 604
 605 /* If CLASS_STR is a valid character class string, return its index
 606    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 607
 608 static enum Char_class
 609 look_up_char_class (const unsigned char *class_str, size_t len)
 610 {
 611   unsigned int i;
 612
 613   for (i = 0; i < N_CHAR_CLASSES; i++)
 614     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 615         && strlen (char_class_name[i]) == len)
 616       return (enum Char_class) i;
 617   return CC_NO_CLASS;
 618 }
 619
 620 /* Return a newly allocated string with a printable version of C.
 621    This function is used solely for formatting error messages.  */
 622
 623 static char *
 624 make_printable_char (unsigned int c)
 625 {
 626   char *buf = xmalloc (5);
 627
 628   assert (c < N_CHARS);
 629   if (ISPRINT (c))
 630     {
 631       buf[0] = c;
 632       buf[1] = '\0';
 633     }
 634   else
 635     {
 636       sprintf (buf, "\\%03o", c);
 637     }
 638   return buf;
 639 }
 640
 641 /* Return a newly allocated copy of S which is suitable for printing.
 642    LEN is the number of characters in S.  Most non-printing
 643    (isprint) characters are represented by a backslash followed by
 644    3 octal digits.  However, the characters represented by \c escapes
 645    where c is one of [abfnrtv] are represented by their 2-character \c
 646    sequences.  This function is used solely for printing error messages.  */
 647
 648 static char *
 649 make_printable_str (const unsigned char *s, size_t len)
 650 {
 651   /* Worst case is that every character expands to a backslash
 652      followed by a 3-character octal escape sequence.  */
 653   char *printable_buf = xmalloc (4 * len + 1);
 654   char *p = printable_buf;
 655   size_t i;
 656
 657   for (i = 0; i < len; i++)
 658     {
 659       char buf[5];
 660       char *tmp = NULL;
 661
 662       switch (s[i])
 663         {
 664         case '\\':
 665           tmp = "\\";
 666           break;
 667         case '\007':
 668           tmp = "\\a";
 669           break;
 670         case '\b':
 671           tmp = "\\b";
 672           break;
 673         case '\f':
 674           tmp = "\\f";
 675           break;
 676         case '\n':
 677           tmp = "\\n";
 678           break;
 679         case '\r':
 680           tmp = "\\r";
 681           break;
 682         case '\t':
 683           tmp = "\\t";
 684           break;
 685         case '\v':
 686           tmp = "\\v";
 687           break;
 688         default:
 689           if (ISPRINT (s[i]))
 690             {
 691               buf[0] = s[i];
 692               buf[1] = '\0';
 693             }
 694           else
 695             sprintf (buf, "\\%03o", s[i]);
 696           tmp = buf;
 697           break;
 698         }
 699       p = stpcpy (p, tmp);
 700     }
 701   return printable_buf;
 702 }
 703
 704 /* Append a newly allocated structure representing a
 705    character C to the specification list LIST.  */
 706
 707 static void
 708 append_normal_char (struct Spec_list *list, unsigned int c)
 709 {
 710   struct List_element *new;
 711
 712   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 713   new->next = NULL;
 714   new->type = RE_NORMAL_CHAR;
 715   new->u.normal_char = c;
 716   assert (list->tail);
 717   list->tail->next = new;
 718   list->tail = new;
 719 }
 720
 721 /* Append a newly allocated structure representing the range
 722    of characters from FIRST to LAST to the specification list LIST.
 723    Return nonzero if LAST precedes FIRST in the collating sequence,
 724    zero otherwise.  This means that '[c-c]' is acceptable.  */
 725
 726 static int
 727 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 728 {
 729   struct List_element *new;
 730
 731   if (ORD (first) > ORD (last))
 732     {
 733       char *tmp1 = make_printable_char (first);
 734       char *tmp2 = make_printable_char (last);
 735
 736       error (0, 0,
 737        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 738              tmp1, tmp2);
 739       free (tmp1);
 740       free (tmp2);
 741       return 1;
 742     }
 743   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 744   new->next = NULL;
 745   new->type = RE_RANGE;
 746   new->u.range.first_char = first;
 747   new->u.range.last_char = last;
 748   assert (list->tail);
 749   list->tail->next = new;
 750   list->tail = new;
 751   return 0;
 752 }
 753
 754 /* If CHAR_CLASS_STR is a valid character class string, append a
 755    newly allocated structure representing that character class to the end
 756    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 757    a valid string return nonzero.  */
 758
 759 static int
 760 append_char_class (struct Spec_list *list,
 761                    const unsigned char *char_class_str, size_t len)
 762 {
 763   enum Char_class char_class;
 764   struct List_element *new;
 765
 766   char_class = look_up_char_class (char_class_str, len);
 767   if (char_class == CC_NO_CLASS)
 768     return 1;
 769   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 770   new->next = NULL;
 771   new->type = RE_CHAR_CLASS;
 772   new->u.char_class = char_class;
 773   assert (list->tail);
 774   list->tail->next = new;
 775   list->tail = new;
 776   return 0;
 777 }
 778
 779 /* Append a newly allocated structure representing a [c*n]
 780    repeated character construct to the specification list LIST.
 781    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 782    is a non-negative repeat count.  */
 783
 784 static void
 785 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 786                       size_t repeat_count)
 787 {
 788   struct List_element *new;
 789
 790   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 791   new->next = NULL;
 792   new->type = RE_REPEATED_CHAR;
 793   new->u.repeated_char.the_repeated_char = the_char;
 794   new->u.repeated_char.repeat_count = repeat_count;
 795   assert (list->tail);
 796   list->tail->next = new;
 797   list->tail = new;
 798 }
 799
 800 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 801    the length of that string, LEN, if LEN is exactly one, append
 802    a newly allocated structure representing the specified
 803    equivalence class to the specification list, LIST and return zero.
 804    If LEN is not 1, return nonzero.  */
 805
 806 static int
 807 append_equiv_class (struct Spec_list *list,
 808                     const unsigned char *equiv_class_str, size_t len)
 809 {
 810   struct List_element *new;
 811
 812   if (len != 1)
 813     return 1;
 814   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 815   new->next = NULL;
 816   new->type = RE_EQUIV_CLASS;
 817   new->u.equiv_code = *equiv_class_str;
 818   assert (list->tail);
 819   list->tail->next = new;
 820   list->tail = new;
 821   return 0;
 822 }
 823
 824 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 825    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 826    NUL bytes, and is *not* NUL-terminated.  */
 827
 828 static unsigned char *
 829 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
 830 {
 831   size_t len;
 832   unsigned char *tmp;
 833
 834   assert (first_idx <= last_idx);
 835   len = last_idx - first_idx + 1;
 836   tmp = (unsigned char *) xmalloc (len);
 837
 838   assert (first_idx <= last_idx);
 839   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 840   memcpy (tmp, p + first_idx, len);
 841   return tmp;
 842 }
 843
 844 /* Search forward starting at START_IDX for the 2-char sequence
 845    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 846    a sequence is found, set *RESULT_IDX to the index of the first
 847    character and return nonzero. Otherwise return zero.  P may contain
 848    zero bytes.  */
 849
 850 static int
 851 find_closing_delim (const struct E_string *es, size_t start_idx,
 852                     unsigned int pre_bracket_char, size_t *result_idx)
 853 {
 854   size_t i;
 855
 856   for (i = start_idx; i < es->len - 1; i++)
 857     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 858         && !es->escaped[i] && !es->escaped[i + 1])
 859       {
 860         *result_idx = i;
 861         return 1;
 862       }
 863   return 0;
 864 }
 865
 866 /* Convert a string S with explicit length LEN, possibly
 867    containing embedded zero bytes, to a long integer value.
 868    If the string represents a negative value, a value larger
 869    than LONG_MAX, or if all LEN characters do not represent a
 870    valid integer, return nonzero and do not modify *VAL.
 871    Otherwise, return zero and set *VAL to the converted value.  */
 872
 873 static int
 874 non_neg_strtol (const unsigned char *s, size_t len, size_t *val)
 875 {
 876   size_t i;
 877   unsigned long sum = 0;
 878   unsigned int base;
 879
 880   if (len <= 0)
 881     return 1;
 882   if (s[0] == '0')
 883     base = 8;
 884   else if (ISDIGIT (s[0]))
 885     base = 10;
 886   else
 887     return 1;
 888
 889   for (i = 0; i < len; i++)
 890     {
 891       unsigned int c;
 892
 893       if (s[i] < '0')
 894         return 1;
 895
 896       c = s[i] - '0';
 897       if (c >= base)
 898         return 1;
 899
 900       if (sum > (LONG_MAX - c) / base)
 901         return 1;
 902       sum = sum * base + c;
 903     }
 904   *val = sum;
 905   return 0;
 906 }
 907
 908 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 909    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 910    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 911    and return zero. If the second character following
 912    the opening bracket is not `*' or if no closing bracket can be
 913    found, return -1.  If a closing bracket is found and the
 914    second char is `*', but the string between the `*' and `]' isn't
 915    empty, an octal number, or a decimal number, print an error message
 916    and return -2.  */
 917
 918 static int
 919 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 920                        unsigned int *char_to_repeat, size_t *repeat_count,
 921                        size_t *closing_bracket_idx)
 922 {
 923   size_t i;
 924
 925   assert (start_idx + 1 < es->len);
 926   if (!ES_MATCH (es, start_idx + 1, '*'))
 927     return -1;
 928
 929   for (i = start_idx + 2; i < es->len; i++)
 930     {
 931       if (ES_MATCH (es, i, ']'))
 932         {
 933           const unsigned char *digit_str;
 934           size_t digit_str_len = i - start_idx - 2;
 935
 936           *char_to_repeat = es->s[start_idx];
 937           if (digit_str_len == 0)
 938             {
 939               /* We've matched [c*] -- no explicit repeat count.  */
 940               *repeat_count = 0;
 941               *closing_bracket_idx = i;
 942               return 0;
 943             }
 944
 945           /* Here, we have found [c*s] where s should be a string
 946              of octal or decimal digits.  */
 947           digit_str = &es->s[start_idx + 2];
 948           if (non_neg_strtol (digit_str, digit_str_len, repeat_count)
 949               || *repeat_count > BEGIN_STATE)
 950             {
 951               char *tmp = make_printable_str (digit_str, digit_str_len);
 952               error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 953                      tmp);
 954               free (tmp);
 955               return -2;
 956             }
 957           *closing_bracket_idx = i;
 958           return 0;
 959         }
 960     }
 961   return -1;                    /* No bracket found.  */
 962 }
 963
 964 /* Return nonzero if the string at ES->s[IDX] matches the regular
 965    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 966    the `]' must not be escaped.  */
 967
 968 static int
 969 star_digits_closebracket (const struct E_string *es, size_t idx)
 970 {
 971   size_t i;
 972
 973   if (!ES_MATCH (es, idx, '*'))
 974     return 0;
 975
 976   for (i = idx + 1; i < es->len; i++)
 977     {
 978       if (!ISDIGIT (es->s[i]))
 979         {
 980           if (ES_MATCH (es, i, ']'))
 981             return 1;
 982           return 0;
 983         }
 984     }
 985   return 0;
 986 }
 987
 988 /* Convert string UNESACPED_STRING (which has been preprocessed to
 989    convert backslash-escape sequences) of length LEN characters into
 990    a linked list of the following 5 types of constructs:
 991       - [:str:] Character class where `str' is one of the 12 valid strings.
 992       - [=c=] Equivalence class where `c' is any single character.
 993       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 994           However, if `n' is present, it must be a non-negative octal or
 995           decimal integer.
 996       - r-s Range of characters from `r' to `s'.  The second endpoint must
 997           not precede the first in the current collating sequence.
 998       - c Any other character is interpreted as itself.  */
 999
1000 static int
1001 build_spec_list (const struct E_string *es, struct Spec_list *result)
1002 {
1003   const unsigned char *p;
1004   size_t i;
1005
1006   p = es->s;
1007
1008   /* The main for-loop below recognizes the 4 multi-character constructs.
1009      A character that matches (in its context) none of the multi-character
1010      constructs is classified as `normal'.  Since all multi-character
1011      constructs have at least 3 characters, any strings of length 2 or
1012      less are composed solely of normal characters.  Hence, the index of
1013      the outer for-loop runs only as far as LEN-2.  */
1014
1015   for (i = 0; i + 2 < es->len; /* empty */)
1016     {
1017       if (ES_MATCH (es, i, '['))
1018         {
1019           int matched_multi_char_construct;
1020           size_t closing_bracket_idx;
1021           unsigned int char_to_repeat;
1022           size_t repeat_count;
1023           int err;
1024
1025           matched_multi_char_construct = 1;
1026           if (ES_MATCH (es, i + 1, ':')
1027               || ES_MATCH (es, i + 1, '='))
1028             {
1029               size_t closing_delim_idx;
1030               int found;
1031
1032               found = find_closing_delim (es, i + 2, p[i + 1],
1033                                           &closing_delim_idx);
1034               if (found)
1035                 {
1036                   int parse_failed;
1037                   unsigned char *opnd_str = substr (p, i + 2,
1038                                                     closing_delim_idx - 1);
1039                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
1040
1041                   if (p[i + 1] == ':')
1042                     {
1043                       parse_failed = append_char_class (result, opnd_str,
1044                                                         opnd_str_len);
1045
1046                       /* FIXME: big comment.  */
1047                       if (parse_failed)
1048                         {
1049                           if (star_digits_closebracket (es, i + 2))
1050                             {
1051                               free (opnd_str);
1052                               goto try_bracketed_repeat;
1053                             }
1054                           else
1055                             {
1056                               char *tmp = make_printable_str (opnd_str,
1057                                                               opnd_str_len);
1058                               error (0, 0, _("invalid character class `%s'"),
1059                                      tmp);
1060                               free (tmp);
1061                               return 1;
1062                             }
1063                         }
1064                     }
1065                   else
1066                     {
1067                       parse_failed = append_equiv_class (result, opnd_str,
1068                                                          opnd_str_len);
1069
1070                       /* FIXME: big comment.  */
1071                       if (parse_failed)
1072                         {
1073                           if (star_digits_closebracket (es, i + 2))
1074                             {
1075                               free (opnd_str);
1076                               goto try_bracketed_repeat;
1077                             }
1078                           else
1079                             {
1080                               char *tmp = make_printable_str (opnd_str,
1081                                                               opnd_str_len);
1082                               error (0, 0,
1083                _("%s: equivalence class operand must be a single character"),
1084                                      tmp);
1085                               free (tmp);
1086                               return 1;
1087                             }
1088                         }
1089                     }
1090                   free (opnd_str);
1091
1092                   /* Return nonzero if append_*_class reports a problem.  */
1093                   if (parse_failed)
1094                     return 1;
1095                   else
1096                     i = closing_delim_idx + 2;
1097                   continue;
1098                 }
1099               /* Else fall through.  This could be [:*] or [=*].  */
1100             }
1101
1102         try_bracketed_repeat:
1103
1104           /* Determine whether this is a bracketed repeat range
1105              matching the RE \[.\*(dec_or_oct_number)?\].  */
1106           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1107                                        &repeat_count,
1108                                        &closing_bracket_idx);
1109           if (err == 0)
1110             {
1111               append_repeated_char (result, char_to_repeat, repeat_count);
1112               i = closing_bracket_idx + 1;
1113             }
1114           else if (err == -1)
1115             {
1116               matched_multi_char_construct = 0;
1117             }
1118           else
1119             {
1120               /* Found a string that looked like [c*n] but the
1121                  numeric part was invalid.  */
1122               return 1;
1123             }
1124
1125           if (matched_multi_char_construct)
1126             continue;
1127
1128           /* We reach this point if P does not match [:str:], [=c=],
1129              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1130              (from `[' to `c').  */
1131         }
1132
1133       /* Look ahead one char for ranges like a-z.  */
1134       if (ES_MATCH (es, i + 1, '-'))
1135         {
1136           if (append_range (result, p[i], p[i + 2]))
1137             return 1;
1138           i += 3;
1139         }
1140       else
1141         {
1142           append_normal_char (result, p[i]);
1143           ++i;
1144         }
1145     }
1146
1147   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1148   for (; i < es->len; i++)
1149     append_normal_char (result, p[i]);
1150
1151   return 0;
1152 }
1153
1154 /* Given a Spec_list S (with its saved state implicit in the values
1155    of its members `tail' and `state'), return the next single character
1156    in the expansion of S's constructs.  If the last character of S was
1157    returned on the previous call or if S was empty, this function
1158    returns -1.  For example, successive calls to get_next where S
1159    represents the spec-string 'a-d[y*3]' will return the sequence
1160    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1161    which the returned character comes is [:upper:] or [:lower:], the
1162    parameter CLASS is given a value to indicate which it was.  Otherwise
1163    CLASS is set to UL_NONE.  This value is used only when constructing
1164    the translation table to verify that any occurrences of upper and
1165    lower class constructs in the spec-strings appear in the same relative
1166    positions.  */
1167
1168 static int
1169 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1170 {
1171   struct List_element *p;
1172   int return_val;
1173   int i;
1174
1175   if (class)
1176     *class = UL_NONE;
1177
1178   if (s->state == BEGIN_STATE)
1179     {
1180       s->tail = s->head->next;
1181       s->state = NEW_ELEMENT;
1182     }
1183
1184   p = s->tail;
1185   if (p == NULL)
1186     return -1;
1187
1188   switch (p->type)
1189     {
1190     case RE_NORMAL_CHAR:
1191       return_val = p->u.normal_char;
1192       s->state = NEW_ELEMENT;
1193       s->tail = p->next;
1194       break;
1195
1196     case RE_RANGE:
1197       if (s->state == NEW_ELEMENT)
1198         s->state = ORD (p->u.range.first_char);
1199       else
1200         ++(s->state);
1201       return_val = CHR (s->state);
1202       if (s->state == ORD (p->u.range.last_char))
1203         {
1204           s->tail = p->next;
1205           s->state = NEW_ELEMENT;
1206         }
1207       break;
1208
1209     case RE_CHAR_CLASS:
1210       if (s->state == NEW_ELEMENT)
1211         {
1212           for (i = 0; i < N_CHARS; i++)
1213             if (is_char_class_member (p->u.char_class, i))
1214               break;
1215           assert (i < N_CHARS);
1216           s->state = i;
1217         }
1218       assert (is_char_class_member (p->u.char_class, s->state));
1219       return_val = CHR (s->state);
1220       for (i = s->state + 1; i < N_CHARS; i++)
1221         if (is_char_class_member (p->u.char_class, i))
1222           break;
1223       if (i < N_CHARS)
1224         s->state = i;
1225       else
1226         {
1227           s->tail = p->next;
1228           s->state = NEW_ELEMENT;
1229         }
1230       if (class)
1231         {
1232           switch (p->u.char_class)
1233             {
1234             case CC_LOWER:
1235               *class = UL_LOWER;
1236               break;
1237             case CC_UPPER:
1238               *class = UL_UPPER;
1239               break;
1240             default:
1241               /* empty */
1242               break;
1243             }
1244         }
1245       break;
1246
1247     case RE_EQUIV_CLASS:
1248       /* FIXME: this assumes that each character is alone in its own
1249          equivalence class (which appears to be correct for my
1250          LC_COLLATE.  But I don't know of any function that allows
1251          one to determine a character's equivalence class.  */
1252
1253       return_val = p->u.equiv_code;
1254       s->state = NEW_ELEMENT;
1255       s->tail = p->next;
1256       break;
1257
1258     case RE_REPEATED_CHAR:
1259       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1260       if (p->u.repeated_char.repeat_count == 0)
1261         {
1262           s->tail = p->next;
1263           s->state = NEW_ELEMENT;
1264           return_val = get_next (s, class);
1265         }
1266       else
1267         {
1268           if (s->state == NEW_ELEMENT)
1269             {
1270               s->state = 0;
1271             }
1272           ++(s->state);
1273           return_val = p->u.repeated_char.the_repeated_char;
1274           if (p->u.repeated_char.repeat_count > 0
1275               && s->state == p->u.repeated_char.repeat_count)
1276             {
1277               s->tail = p->next;
1278               s->state = NEW_ELEMENT;
1279             }
1280         }
1281       break;
1282
1283     case RE_NO_TYPE:
1284       abort ();
1285       break;
1286
1287     default:
1288       abort ();
1289       break;
1290     }
1291
1292   return return_val;
1293 }
1294
1295 /* This is a minor kludge.  This function is called from
1296    get_spec_stats to determine the cardinality of a set derived
1297    from a complemented string.  It's a kludge in that some of the
1298    same operations are (duplicated) performed in set_initialize.  */
1299
1300 static int
1301 card_of_complement (struct Spec_list *s)
1302 {
1303   int c;
1304   int cardinality = N_CHARS;
1305   SET_TYPE in_set[N_CHARS];
1306
1307   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1308   s->state = BEGIN_STATE;
1309   while ((c = get_next (s, NULL)) != -1)
1310     if (!in_set[c]++)
1311       --cardinality;
1312   return cardinality;
1313 }
1314
1315 /* Gather statistics about the spec-list S in preparation for the tests
1316    in validate that determine the consistency of the specs.  This function
1317    is called at most twice; once for string1, and again for any string2.
1318    LEN_S1 < 0 indicates that this is the first call and that S represents
1319    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1320    constructs in string1, and we can use its value to resolve any
1321    indefinite repeat construct in S (which represents string2).  Hence,
1322    this function has the side-effect that it converts a valid [c*]
1323    construct in string2 to [c*n] where n is large enough (or 0) to give
1324    string2 the same length as string1.  For example, with the command
1325    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1326    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1327
1328 static void
1329 get_spec_stats (struct Spec_list *s)
1330 {
1331   struct List_element *p;
1332   int len = 0;
1333
1334   s->n_indefinite_repeats = 0;
1335   s->has_equiv_class = 0;
1336   s->has_restricted_char_class = 0;
1337   s->has_upper_or_lower = 0;
1338   for (p = s->head->next; p; p = p->next)
1339     {
1340       switch (p->type)
1341         {
1342           int i;
1343         case RE_NORMAL_CHAR:
1344           ++len;
1345           break;
1346
1347         case RE_RANGE:
1348           assert (p->u.range.last_char >= p->u.range.first_char);
1349           len += p->u.range.last_char - p->u.range.first_char + 1;
1350           break;
1351
1352         case RE_CHAR_CLASS:
1353           for (i = 0; i < N_CHARS; i++)
1354             if (is_char_class_member (p->u.char_class, i))
1355               ++len;
1356           switch (p->u.char_class)
1357             {
1358             case CC_UPPER:
1359             case CC_LOWER:
1360               s->has_upper_or_lower = 1;
1361               break;
1362             default:
1363               s->has_restricted_char_class = 1;
1364               break;
1365             }
1366           break;
1367
1368         case RE_EQUIV_CLASS:
1369           for (i = 0; i < N_CHARS; i++)
1370             if (is_equiv_class_member (p->u.equiv_code, i))
1371               ++len;
1372           s->has_equiv_class = 1;
1373           break;
1374
1375         case RE_REPEATED_CHAR:
1376           if (p->u.repeated_char.repeat_count > 0)
1377             len += p->u.repeated_char.repeat_count;
1378           else if (p->u.repeated_char.repeat_count == 0)
1379             {
1380               s->indefinite_repeat_element = p;
1381               ++(s->n_indefinite_repeats);
1382             }
1383           break;
1384
1385         case RE_NO_TYPE:
1386           assert (0);
1387           break;
1388         }
1389     }
1390
1391   s->length = len;
1392 }
1393
1394 static void
1395 get_s1_spec_stats (struct Spec_list *s1)
1396 {
1397   get_spec_stats (s1);
1398   if (complement)
1399     s1->length = card_of_complement (s1);
1400 }
1401
1402 static void
1403 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1404 {
1405   get_spec_stats (s2);
1406   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1407     {
1408       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1409         len_s1 - s2->length;
1410       s2->length = len_s1;
1411     }
1412 }
1413
1414 static void
1415 spec_init (struct Spec_list *spec_list)
1416 {
1417   spec_list->head = spec_list->tail =
1418     (struct List_element *) xmalloc (sizeof (struct List_element));
1419   spec_list->head->next = NULL;
1420 }
1421
1422 /* This function makes two passes over the argument string S.  The first
1423    one converts all \c and \ddd escapes to their one-byte representations.
1424    The second constructs a linked specification list, SPEC_LIST, of the
1425    characters and constructs that comprise the argument string.  If either
1426    of these passes detects an error, this function returns nonzero.  */
1427
1428 static int
1429 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1430 {
1431   struct E_string es;
1432
1433   if (unquote (s, &es))
1434     return 1;
1435   if (build_spec_list (&es, spec_list))
1436     return 1;
1437   return 0;
1438 }
1439
1440 /* Given two specification lists, S1 and S2, and assuming that
1441    S1->length > S2->length, append a single [c*n] element to S2 where c
1442    is the last character in the expansion of S2 and n is the difference
1443    between the two lengths.
1444    Upon successful completion, S2->length is set to S1->length.  The only
1445    way this function can fail to make S2 as long as S1 is when S2 has
1446    zero-length, since in that case, there is no last character to repeat.
1447    So S2->length is required to be at least 1.
1448
1449    Providing this functionality allows the user to do some pretty
1450    non-BSD (and non-portable) things:  For example, the command
1451        tr -cs '[:upper:]0-9' '[:lower:]'
1452    is almost guaranteed to give results that depend on your collating
1453    sequence.  */
1454
1455 static void
1456 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1457 {
1458   struct List_element *p;
1459   int char_to_repeat;
1460   int i;
1461
1462   assert (translating);
1463   assert (s1->length > s2->length);
1464   assert (s2->length > 0);
1465
1466   p = s2->tail;
1467   switch (p->type)
1468     {
1469     case RE_NORMAL_CHAR:
1470       char_to_repeat = p->u.normal_char;
1471       break;
1472     case RE_RANGE:
1473       char_to_repeat = p->u.range.last_char;
1474       break;
1475     case RE_CHAR_CLASS:
1476       for (i = N_CHARS; i >= 0; i--)
1477         if (is_char_class_member (p->u.char_class, i))
1478           break;
1479       assert (i >= 0);
1480       char_to_repeat = CHR (i);
1481       break;
1482
1483     case RE_REPEATED_CHAR:
1484       char_to_repeat = p->u.repeated_char.the_repeated_char;
1485       break;
1486
1487     case RE_EQUIV_CLASS:
1488       /* This shouldn't happen, because validate exits with an error
1489          if it finds an equiv class in string2 when translating.  */
1490       abort ();
1491       break;
1492
1493     case RE_NO_TYPE:
1494       abort ();
1495       break;
1496
1497     default:
1498       abort ();
1499       break;
1500     }
1501
1502   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1503   s2->length = s1->length;
1504 }
1505
1506 /* Die with an error message if S1 and S2 describe strings that
1507    are not valid with the given command line switches.
1508    A side effect of this function is that if a valid [c*] or
1509    [c*0] construct appears in string2, it is converted to [c*n]
1510    with a value for n that makes s2->length == s1->length.  By
1511    the same token, if the --truncate-set1 option is not
1512    given, S2 may be extended.  */
1513
1514 static void
1515 validate (struct Spec_list *s1, struct Spec_list *s2)
1516 {
1517   get_s1_spec_stats (s1);
1518   if (s1->n_indefinite_repeats > 0)
1519     {
1520       error (1, 0, _("the [c*] repeat construct may not appear in string1"));
1521     }
1522
1523   /* FIXME: it isn't clear from the POSIX spec that this is invalid,
1524      but in the spirit of the other restrictions put on translation
1525      with character classes, this seems a logical interpretation.  */
1526   if (complement && s1->has_upper_or_lower)
1527     {
1528       error (1, 0,
1529              _("character classes may not be used when translating \
1530 and complementing"));
1531     }
1532
1533   if (s2)
1534     {
1535       get_s2_spec_stats (s2, s1->length);
1536       if (s2->has_restricted_char_class)
1537         {
1538           error (1, 0,
1539                  _("when translating, the only character classes that may \
1540 appear in\n\tstring2 are `upper' and `lower'"));
1541         }
1542
1543       if (s2->n_indefinite_repeats > 1)
1544         {
1545           error (1, 0,
1546                  _("only one [c*] repeat construct may appear in string2"));
1547         }
1548
1549       if (translating)
1550         {
1551           if (s2->has_equiv_class)
1552             {
1553               error (1, 0,
1554                      _("[=c=] expressions may not appear in string2 \
1555 when translating"));
1556             }
1557
1558           if (s1->length > s2->length)
1559             {
1560               if (!truncate_set1)
1561                 {
1562                   /* string2 must be non-empty unless --truncate-set1 is
1563                      given or string1 is empty.  */
1564
1565                   if (s2->length == 0)
1566                     error (1, 0,
1567                      _("when not truncating set1, string2 must be non-empty"));
1568                   string2_extend (s1, s2);
1569                 }
1570             }
1571
1572           if (complement && s2->has_upper_or_lower)
1573             error (1, 0,
1574                    _("character classes may not be used when translating \
1575 and complementing"));
1576         }
1577       else
1578         /* Not translating.  */
1579         {
1580           if (s2->n_indefinite_repeats > 0)
1581             error (1, 0,
1582                    _("the [c*] construct may appear in string2 only \
1583 when translating"));
1584         }
1585     }
1586 }
1587
1588 /* Read buffers of SIZE bytes via the function READER (if READER is
1589    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1590    read_and_delete or read_and_xlate.  After each buffer is read, it is
1591    processed and written to stdout.  The buffers are processed so that
1592    multiple consecutive occurrences of the same character in the input
1593    stream are replaced by a single occurrence of that character if the
1594    character is in the squeeze set.  */
1595
1596 static void
1597 squeeze_filter (unsigned char *buf, long int size, PFI reader)
1598 {
1599   unsigned int char_to_squeeze = NOT_A_CHAR;
1600   int i = 0;
1601   int nr = 0;
1602
1603   for (;;)
1604     {
1605       int begin;
1606
1607       if (i >= nr)
1608         {
1609           if (reader == NULL)
1610             nr = safe_read (0, (char *) buf, size);
1611           else
1612             nr = (*reader) (buf, size, NULL);
1613
1614           if (nr < 0)
1615             error (1, errno, _("read error"));
1616           if (nr == 0)
1617             break;
1618           i = 0;
1619         }
1620
1621       begin = i;
1622
1623       if (char_to_squeeze == NOT_A_CHAR)
1624         {
1625           int out_len;
1626           /* Here, by being a little tricky, we can get a significant
1627              performance increase in most cases when the input is
1628              reasonably large.  Since tr will modify the input only
1629              if two consecutive (and identical) input characters are
1630              in the squeeze set, we can step by two through the data
1631              when searching for a character in the squeeze set.  This
1632              means there may be a little more work in a few cases and
1633              perhaps twice as much work in the worst cases where most
1634              of the input is removed by squeezing repeats.  But most
1635              uses of this functionality seem to remove less than 20-30%
1636              of the input.  */
1637           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1638             ;                   /* empty */
1639
1640           /* There is a special case when i == nr and we've just
1641              skipped a character (the last one in buf) that is in
1642              the squeeze set.  */
1643           if (i == nr && in_squeeze_set[buf[i - 1]])
1644             --i;
1645
1646           if (i >= nr)
1647             out_len = nr - begin;
1648           else
1649             {
1650               char_to_squeeze = buf[i];
1651               /* We're about to output buf[begin..i].  */
1652               out_len = i - begin + 1;
1653
1654               /* But since we stepped by 2 in the loop above,
1655                  out_len may be one too large.  */
1656               if (i > 0 && buf[i - 1] == char_to_squeeze)
1657                 --out_len;
1658
1659               /* Advance i to the index of first character to be
1660                  considered when looking for a char different from
1661                  char_to_squeeze.  */
1662               ++i;
1663             }
1664           if (out_len > 0
1665               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1666             error (1, errno, _("write error"));
1667         }
1668
1669       if (char_to_squeeze != NOT_A_CHAR)
1670         {
1671           /* Advance i to index of first char != char_to_squeeze
1672              (or to nr if all the rest of the characters in this
1673              buffer are the same as char_to_squeeze).  */
1674           for (; i < nr && buf[i] == char_to_squeeze; i++)
1675             ;                   /* empty */
1676           if (i < nr)
1677             char_to_squeeze = NOT_A_CHAR;
1678           /* If (i >= nr) we've squeezed the last character in this buffer.
1679              So now we have to read a new buffer and continue comparing
1680              characters against char_to_squeeze.  */
1681         }
1682     }
1683 }
1684
1685 /* Read buffers of SIZE bytes from stdin until one is found that
1686    contains at least one character not in the delete set.  Store
1687    in the array BUF, all characters from that buffer that are not
1688    in the delete set, and return the number of characters saved
1689    or 0 upon EOF.  */
1690
1691 static long
1692 read_and_delete (unsigned char *buf, long int size, PFI not_used)
1693 {
1694   long n_saved;
1695   static int hit_eof = 0;
1696
1697   assert (not_used == NULL);
1698   assert (size > 0);
1699
1700   if (hit_eof)
1701     return 0;
1702
1703   /* This enclosing do-while loop is to make sure that
1704      we don't return zero (indicating EOF) when we've
1705      just deleted all the characters in a buffer.  */
1706   do
1707     {
1708       int i;
1709       int nr = safe_read (0, (char *) buf, size);
1710
1711       if (nr < 0)
1712         error (1, errno, _("read error"));
1713       if (nr == 0)
1714         {
1715           hit_eof = 1;
1716           return 0;
1717         }
1718
1719       /* This first loop may be a waste of code, but gives much
1720          better performance when no characters are deleted in
1721          the beginning of a buffer.  It just avoids the copying
1722          of buf[i] into buf[n_saved] when it would be a NOP.  */
1723
1724       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1725         /* empty */ ;
1726       n_saved = i;
1727
1728       for (++i; i < nr; i++)
1729         if (!in_delete_set[buf[i]])
1730           buf[n_saved++] = buf[i];
1731     }
1732   while (n_saved == 0);
1733
1734   return n_saved;
1735 }
1736
1737 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1738    perform the in-place and one-to-one mapping specified by the global
1739    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1740
1741 static long
1742 read_and_xlate (unsigned char *buf, long int size, PFI not_used)
1743 {
1744   long chars_read = 0;
1745   static int hit_eof = 0;
1746   int i;
1747
1748   assert (not_used == NULL);
1749   assert (size > 0);
1750
1751   if (hit_eof)
1752     return 0;
1753
1754   chars_read = safe_read (0, (char *) buf, size);
1755   if (chars_read < 0)
1756     error (1, errno, _("read error"));
1757   if (chars_read == 0)
1758     {
1759       hit_eof = 1;
1760       return 0;
1761     }
1762
1763   for (i = 0; i < chars_read; i++)
1764     buf[i] = xlate[buf[i]];
1765
1766   return chars_read;
1767 }
1768
1769 /* Initialize a boolean membership set IN_SET with the character
1770    values obtained by traversing the linked list of constructs S
1771    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1772    nonzero the resulting set is complemented.  */
1773
1774 static void
1775 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1776 {
1777   int c;
1778   int i;
1779
1780   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1781   s->state = BEGIN_STATE;
1782   while ((c = get_next (s, NULL)) != -1)
1783     in_set[c] = 1;
1784   if (complement_this_set)
1785     for (i = 0; i < N_CHARS; i++)
1786       in_set[i] = (!in_set[i]);
1787 }
1788
1789 void
1790 main (int argc, char **argv)
1791 {
1792   int c;
1793   int non_option_args;
1794   struct Spec_list buf1, buf2;
1795   struct Spec_list *s1 = &buf1;
1796   struct Spec_list *s2 = &buf2;
1797
1798   program_name = argv[0];
1799   setlocale (LC_ALL, "");
1800   bindtextdomain (PACKAGE, LOCALEDIR);
1801   textdomain (PACKAGE);
1802
1803   while ((c = getopt_long (argc, argv, "cdst", long_options,
1804                            (int *) 0)) != EOF)
1805     {
1806       switch (c)
1807         {
1808         case 0:
1809           break;
1810
1811         case 'c':
1812           complement = 1;
1813           break;
1814
1815         case 'd':
1816           delete = 1;
1817           break;
1818
1819         case 's':
1820           squeeze_repeats = 1;
1821           break;
1822
1823         case 't':
1824           truncate_set1 = 1;
1825           break;
1826
1827         default:
1828           usage (2);
1829           break;
1830         }
1831     }
1832
1833   if (show_version)
1834     {
1835       printf ("tr - %s\n", version_string);
1836       exit (0);
1837     }
1838
1839   if (show_help)
1840     usage (0);
1841
1842   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1843
1844   non_option_args = argc - optind;
1845   translating = (non_option_args == 2 && !delete);
1846
1847   /* Change this test if it is valid to give tr no options and
1848      no args at all.  POSIX doesn't specifically say anything
1849      either way, but it looks like they implied it's invalid
1850      by omission.  If you want to make tr do a slow imitation
1851      of `cat' use `tr a a'.  */
1852   if (non_option_args > 2)
1853     {
1854       error (0, 0, _("too many arguments"));
1855       usage (2);
1856     }
1857
1858   if (!delete && !squeeze_repeats && non_option_args != 2)
1859     error (1, 0, _("two strings must be given when translating"));
1860
1861   if (delete && squeeze_repeats && non_option_args != 2)
1862     error (1, 0, _("two strings must be given when both \
1863 deleting and squeezing repeats"));
1864
1865   /* If --delete is given without --squeeze-repeats, then
1866      only one string argument may be specified.  But POSIX
1867      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1868      is set, pretend we never saw string2.  But I think
1869      this deserves a fatal error, so that's the default.  */
1870   if ((delete && !squeeze_repeats) && non_option_args != 1)
1871     {
1872       if (posix_pedantic && non_option_args == 2)
1873         --non_option_args;
1874       else
1875         error (1, 0,
1876                _("only one string may be given when deleting \
1877 without squeezing repeats"));
1878     }
1879
1880   if (squeeze_repeats && non_option_args == 0)
1881     error (1, 0,
1882            _("at least one string must be given when squeezing repeats"));
1883
1884   spec_init (s1);
1885   if (parse_str ((unsigned char *) argv[optind], s1))
1886     exit (1);
1887
1888   if (non_option_args == 2)
1889     {
1890       spec_init (s2);
1891       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1892         exit (1);
1893     }
1894   else
1895     s2 = NULL;
1896
1897   validate (s1, s2);
1898
1899   if (squeeze_repeats && non_option_args == 1)
1900     {
1901       set_initialize (s1, complement, in_squeeze_set);
1902       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1903     }
1904   else if (delete && non_option_args == 1)
1905     {
1906       long nr;
1907
1908       set_initialize (s1, complement, in_delete_set);
1909       do
1910         {
1911           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1912           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1913             error (1, errno, _("write error"));
1914         }
1915       while (nr > 0);
1916     }
1917   else if (squeeze_repeats && delete && non_option_args == 2)
1918     {
1919       set_initialize (s1, complement, in_delete_set);
1920       set_initialize (s2, 0, in_squeeze_set);
1921       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1922     }
1923   else if (translating)
1924     {
1925       if (complement)
1926         {
1927           int i;
1928           SET_TYPE *in_s1 = in_delete_set;
1929
1930           set_initialize (s1, 0, in_s1);
1931           s2->state = BEGIN_STATE;
1932           for (i = 0; i < N_CHARS; i++)
1933             xlate[i] = i;
1934           for (i = 0; i < N_CHARS; i++)
1935             {
1936               if (!in_s1[i])
1937                 {
1938                   int ch = get_next (s2, NULL);
1939                   assert (ch != -1 || truncate_set1);
1940                   if (ch == -1)
1941                     {
1942                       /* This will happen when tr is invoked like e.g.
1943                          tr -cs A-Za-z0-9 '\012'.  */
1944                       break;
1945                     }
1946                   xlate[i] = ch;
1947                 }
1948             }
1949           assert (get_next (s2, NULL) == -1 || truncate_set1);
1950         }
1951       else
1952         {
1953           int c1, c2;
1954           int i;
1955           enum Upper_Lower_class class_s1;
1956           enum Upper_Lower_class class_s2;
1957
1958           for (i = 0; i < N_CHARS; i++)
1959             xlate[i] = i;
1960           s1->state = BEGIN_STATE;
1961           s2->state = BEGIN_STATE;
1962           for (;;)
1963             {
1964               c1 = get_next (s1, &class_s1);
1965               c2 = get_next (s2, &class_s2);
1966               if (!class_ok[(int) class_s1][(int) class_s2])
1967                 error (1, 0,
1968                      _("misaligned or mismatched upper and/or lower classes"));
1969               /* The following should have been checked by validate...  */
1970               if (c2 == -1)
1971                 break;
1972               xlate[c1] = c2;
1973             }
1974           assert (c1 == -1 || truncate_set1);
1975         }
1976       if (squeeze_repeats)
1977         {
1978           set_initialize (s2, 0, in_squeeze_set);
1979           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
1980         }
1981       else
1982         {
1983           long chars_read;
1984
1985           do
1986             {
1987               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
1988               if (chars_read > 0
1989                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
1990                 error (1, errno, _("write error"));
1991             }
1992           while (chars_read > 0);
1993         }
1994     }
1995
1996   if (fclose (stdout) == EOF)
1997     error (2, errno, _("write error"));
1998
1999   if (close (0) != 0)
2000     error (2, errno, _("standard input"));
2001
2002   exit (0);
2003 }