src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 95, 96, 1997, 1998, 1999 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <assert.h>
  24 #include <errno.h>
  25 #include <sys/types.h>
  26 #include <getopt.h>
  27
  28 #include "system.h"
  29 #include "error.h"
  30 #include "safe-read.h"
  31
  /* The number of distinct values an unsigned char can hold.  */
#define N_CHARS (UCHAR_MAX + 1)

/* A pointer to a function that returns an int.  */
typedef int (*PFI) ();

/* Convert from character C to its index in the collating
   sequence array.  Just cast to an unsigned int to avoid
   problems with sign-extension.  The expansion is parenthesized as a
   whole so that it stays a single operand wherever it is used
   (for example as the operand of sizeof).  */
#define ORD(c) ((unsigned int) (c))

/* The inverse of ORD.  Parenthesized as a whole for the same reason.  */
#define CHR(i) ((unsigned char) (i))

/* The value for Spec_list->state that indicates to
   get_next that it should initialize the tail pointer.
   Its value should be as large as possible to avoid conflict with
   a valid value for the state field -- and that may be as
   large as any valid repeat_count.  */
#define BEGIN_STATE (INT_MAX - 1)

/* The value for Spec_list->state that indicates to
   get_next that the element pointed to by Spec_list->tail is
   being considered for the first time on this pass through the
   list -- it indicates that get_next should make any necessary
   initializations.  */
#define NEW_ELEMENT (BEGIN_STATE + 1)

/* A value distinct from any character that may have been stored in a
   buffer as the result of a block-read in the function squeeze_filter.
   It is the largest unsigned int, written as a fully parenthesized
   cast so that the macro is a single operand.  */
#define NOT_A_CHAR ((unsigned int) (-1))
  62
  63 /* The following (but not CC_NO_CLASS) are indices into the array of
  64    valid character class strings, char_class_name.  look_up_char_class
        casts a matching array index directly to this type, so the values
        here must stay in step with the order of the names in that array.
        CC_NO_CLASS is what look_up_char_class returns when no name
        matches.  */
  65 enum Char_class
  66   {
  67     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  68     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  69     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  70     CC_NO_CLASS = 9999
  71   };
  72
  73 /* Character class to which a character (returned by get_next) belonged;
  74    but it is set only if the construct from which the character was obtained
  75    was one of the character classes [:upper:] or [:lower:].  The value
  76    is used only when translating and then, only to make sure that upper
  77    and lower class constructs have the same relative positions in string1
  78    and string2.  The values also serve as indices into class_ok, which
        has three rows and three columns, so they must remain 0, 1 and 2.  */
  79 enum Upper_Lower_class
  80   {
  81     UL_LOWER = 0,     /* the character came from [:lower:] */
  82     UL_UPPER = 1,     /* the character came from [:upper:] */
  83     UL_NONE = 2       /* the character came from any other construct */
  84   };
  85
  86 /* A shortcut to ensure that when constructing the translation array,
  87    one of the values returned by paired calls to get_next (from s1 and s2)
  88    is from [:upper:] and the other is from [:lower:], or neither is from
  89    upper or lower.  By default, GNU tr permits the identity mappings: from
  90    [:upper:] to [:upper:] and [:lower:] to [:lower:].  But when
  91    POSIXLY_CORRECT is set, those evoke diagnostics.  This array is indexed
  92    by values of type enum Upper_Lower_class; a nonzero entry means the
        pairing is acceptable.  The table is symmetric, so it does not
        matter which of the two values is used as the first index.  */
  93 static int const class_ok[3][3] =
  94 {
  95   {1, 1, 0},   /* UL_LOWER paired with UL_LOWER, UL_UPPER, UL_NONE */
  96   {1, 1, 0},   /* UL_UPPER paired with UL_LOWER, UL_UPPER, UL_NONE */
  97   {0, 0, 1}    /* UL_NONE paired with UL_LOWER, UL_UPPER, UL_NONE */
  98 };
  99
 100 /* The type of a List_element.  See build_spec_list for more details.
        Each value other than RE_NO_TYPE is stored by exactly one of the
        append_* functions, named next to it below.  */
 101 enum Range_element_type
 102   {
 103     RE_NO_TYPE = 0,
 104     RE_NORMAL_CHAR,      /* a single character: append_normal_char */
 105     RE_RANGE,            /* a first-last range: append_range */
 106     RE_CHAR_CLASS,       /* a character class: append_char_class */
 107     RE_EQUIV_CLASS,      /* an equivalence class: append_equiv_class */
 108     RE_REPEATED_CHAR     /* a [c*n] repeat: append_repeated_char */
 109   };
 110
 111 /* One construct in one of tr's argument strings.
 112    For example, consider the POSIX version of the classic tr command:
 113        tr -cs 'a-zA-Z_' '[\n*]'
 114    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 115    and a single normal character, `_'.  String2 has one construct.
        TYPE tells which member of the union U holds the construct's
        data; the append_* functions fill in the matching member.  */
 116 struct List_element
 117   {
 118     enum Range_element_type type;
 119     struct List_element *next;   /* NULL for the last element appended */
 120     union
 121       {
 122         int normal_char;            /* used when type is RE_NORMAL_CHAR */
 123         struct                  /* unnamed */
 124           {
 125             unsigned int first_char;
 126             unsigned int last_char;
 127           }
 128         range;                      /* used when type is RE_RANGE */
 129         enum Char_class char_class; /* used when type is RE_CHAR_CLASS */
 130         int equiv_code;             /* RE_EQUIV_CLASS: the C of [=C=] */
 131         struct                  /* unnamed */
 132           {
 133             unsigned int the_repeated_char;
 134             size_t repeat_count;
 135           }
 136         repeated_char;              /* used when type is RE_REPEATED_CHAR */
 137       }
 138     u;
 139   };
 140
 141 /* Each of tr's argument strings is parsed into a form that is easier
 142    to work with: a linked list of constructs (struct List_element).
 143    Each Spec_list structure also encapsulates various attributes of
 144    the corresponding argument string.  The attributes are used mainly
 145    to verify that the strings are valid in the context of any options
 146    specified (like -s, -d, or -c).  The main exception is the member
 147    `tail', which is first used to construct the list.  After construction,
 148    it is used by get_next to save its state when traversing the list.
 149    The member `state' serves a similar function.  */
 150 struct Spec_list
 151   {
 152     /* Points to the head of the list of range elements.
 153        The first struct is a dummy; its members are never used.  */
 154     struct List_element *head;
 155
 156     /* When appending, points to the last element.  When traversing via
 157        get_next(), points to the element to process next.  Setting
 158        Spec_list.state to the value BEGIN_STATE before calling get_next
 159        signals get_next to initialize tail to point to head->next.
            The append_* functions assert that this is non-NULL before
            linking a new element after it.  */
 160     struct List_element *tail;
 161
 162     /* Used to save state between calls to get_next().  BEGIN_STATE and
            NEW_ELEMENT are reserved marker values for this field.  */
 163     unsigned int state;
 164
 165     /* Length, in the sense that length ('a-z[:digit:]123abc')
 166        is 42 ( = 26 + 10 + 6).  */
 167     size_t length;
 168
 169     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 170     int n_indefinite_repeats;
 171
 172     /* If n_indefinite_repeats is nonzero, this points to the List_element
 173        corresponding to the last [c*] or [c*0] construct encountered in
 174        this spec.  Otherwise it is undefined.  */
 175     struct List_element *indefinite_repeat_element;
 176
 177     /* Non-zero if this spec contains at least one equivalence
 178        class construct e.g. [=c=].  */
 179     int has_equiv_class;
 180
 181     /* Non-zero if this spec contains at least one character class
 182        construct.  E.g. [:digit:].  */
 183     int has_char_class;
 184
 185     /* Non-zero if this spec contains at least one of the character class
 186        constructs (all but upper and lower) that aren't allowed in s2.  */
 187     int has_restricted_char_class;
 188   };
 189
 190 /* A representation for escaped string1 or string2.  As a string is parsed,
 191    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 192    etc.) are marked as such in this structure by setting the corresponding
 193    entry in the ESCAPED vector.  unquote allocates and fills both arrays;
        es_free releases them.  */
 194 struct E_string
 195 {
 196   unsigned char *s;    /* the unquoted bytes; may contain zero bytes and
                             is not NUL-terminated */
 197   int *escaped;        /* escaped[i] is nonzero if s[i] was written as a
                             backslash followed by an otherwise ordinary
                             character */
 198   size_t len;          /* number of bytes stored in S */
 199 };
 200
 201 /* Return nonzero if the Ith character of escaped string ES matches C
 202    and is not escaped itself.  ES is a pointer to a struct E_string.
        ES and I are each expanded twice, so the arguments should not have
        side effects; no bounds check is made on I.  */
 203 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 204
 205 /* The name by which this program was run.  */
 206 char *program_name;
 207
 208 /* When nonzero, for every character c in the squeeze set, each
 209    sequence of repeated c's in the input is replaced (in the output)
 210    by a single occurrence of c.  */
 211 static int squeeze_repeats = 0;
 212
 213 /* When nonzero, removes characters in the delete set from input.  */
 214 static int delete = 0;
 215
 216 /* When nonzero, use the complement of set1 in place of set1.  */
 217 static int complement = 0;
 218
 219 /* When nonzero, this flag causes GNU tr to provide strict
 220    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 221    says that when -d is used without -s, string2 (if present)
 222    must be ignored.  Silently ignoring arguments is a bad idea.
 223    The default GNU behavior is to give a usage message and exit.
 224    Additionally, when this flag is zero, tr prints warnings
 225    on stderr if it is being used in a manner that is not portable.
 226    Applicable warnings are given by default, but are suppressed
 227    if the environment variable `POSIXLY_CORRECT' is set, since
 228    being POSIX conformant means we can't issue such messages.
 229    Warnings on the following topics are suppressed when this
 230    variable is nonzero:
 231    1. Ambiguous octal escapes.
        In addition, unquote rejects a backslash followed by an
        otherwise ordinary character when this variable is nonzero,
        and accepts it (marking the character as escaped) when it is
        zero.  */
 232 static int posix_pedantic;
 233
 234 /* When tr is performing translation and string1 is longer than string2,
 235    POSIX says that the result is undefined.  That gives the implementor
 236    of a POSIX conforming version of tr two reasonable choices for the
 237    semantics of this case.
 238
 239    * The BSD tr pads string2 to the length of string1 by
 240    repeating the last character in string2.
 241
 242    * System V tr ignores characters in string1 that have no
 243    corresponding character in string2.  That is, string1 is effectively
 244    truncated to the length of string2.
 245
 246    When nonzero, this flag causes GNU tr to imitate the behavior
 247    of System V tr when translating with string1 longer than string2.
 248    The default is to emulate BSD tr.  This flag is ignored in modes where
 249    no translation is performed.  Emulating the System V tr
 250    in this exceptional case causes the relatively common BSD idiom:
 251
 252        tr -cs A-Za-z0-9 '\012'
 253
 254    to break (it would convert only zero bytes, rather than all
 255    non-alphanumerics, to newlines).
 256
 257    WARNING: This switch does not provide general BSD or System V
 258    compatibility.  For example, it doesn't disable the interpretation
 259    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 260    some unfortunate coincidence you use such constructs in scripts
 261    expecting to use some other version of tr, the scripts will break.  */
 262 static int truncate_set1 = 0;
 263
 264 /* An alias for (!delete && non_option_args == 2).
 265    It is set in main and used there and in validate().  */
 266 static int translating;
 267
     /* Provide a fallback buffer size in case no earlier header has
        defined BUFSIZ.  */
 268 #ifndef BUFSIZ
 269 # define BUFSIZ 8192
 270 #endif
 271
     /* The size of io_buf, and the buffer itself.  NOTE(review): the code
        that reads into and writes from io_buf lies outside this part of
        the file -- presumably the filter loops; confirm there.  */
 272 #define IO_BUF_SIZE BUFSIZ
 273 static unsigned char io_buf[IO_BUF_SIZE];
 274
 275 static char const *const char_class_name[] =   /* in enum Char_class order */
 276 {
 277   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 278   "lower", "print", "punct", "space", "upper", "xdigit"
 279 };
     /* The number of names in char_class_name.  */
 280 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 281
 282 typedef char SET_TYPE;   /* element type of the two membership tables below */
 283
 284 /* Array of boolean values.  A character `c' is a member of the
 285    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 286    set is defined by the last (possibly, the only) string argument
 287    on the command line when the squeeze option is given.  */
 288 static SET_TYPE in_squeeze_set[N_CHARS];
 289
 290 /* Array of boolean values.  A character `c' is a member of the
 291    delete set if and only if in_delete_set[c] is true.  The delete
 292    set is defined by the first (or only) string argument on the
 293    command line when the delete option is given.  */
 294 static SET_TYPE in_delete_set[N_CHARS];
 295
 296 /* Array of character values defining the translation (if any) that
 297    tr is to perform.  Translation is performed only when there are
 298    two specification strings and the delete switch is not given.
        The array has one entry for each possible unsigned char value.  */
 299 static char xlate[N_CHARS];
 300
 301 /* If nonzero, display usage information and exit.  */
 302 static int show_help;
 303
 304 /* If nonzero, print the version on standard output then exit.  */
 305 static int show_version;
 306
     /* The long options.  The first four map to the short option letter
        given in the last field; --help and --version instead name a flag
        variable and the value 1 to be stored in it.  NOTE(review): the
        option-parsing call that consumes this table is outside this part
        of the file -- presumably getopt_long in main; confirm there.  */
 307 static struct option const long_options[] =
 308 {
 309   {"complement", no_argument, NULL, 'c'},
 310   {"delete", no_argument, NULL, 'd'},
 311   {"squeeze-repeats", no_argument, NULL, 's'},
 312   {"truncate-set1", no_argument, NULL, 't'},
 313   {"help", no_argument, &show_help, 1},
 314   {"version", no_argument, &show_version, 1},
 315   {NULL, 0, NULL, 0}   /* all-zero entry ends the table */
 316 };
 317 \f
 318 void
 319 usage (int status)
 320 {
 321   if (status != 0)
 322     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 323              program_name);
 324   else
 325     {
 326       printf (_("\
 327 Usage: %s [OPTION]... SET1 [SET2]\n\
 328 "),
 329               program_name);
 330       printf (_("\
 331 Translate, squeeze, and/or delete characters from standard input,\n\
 332 writing to standard output.\n\
 333 \n\
 334   -c, --complement        first complement SET1\n\
 335   -d, --delete            delete characters in SET1, do not translate\n\
 336   -s, --squeeze-repeats   replace sequence of characters with one\n\
 337   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 338       --help              display this help and exit\n\
 339       --version           output version information and exit\n\
 340 "));
 341       printf (_("\
 342 \n\
 343 SETs are specified as strings of characters.  Most represent themselves.\n\
 344 Interpreted sequences are:\n\
 345 \n\
 346   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 347   \\\\              backslash\n\
 348   \\a              audible BEL\n\
 349   \\b              backspace\n\
 350   \\f              form feed\n\
 351   \\n              new line\n\
 352   \\r              return\n\
 353   \\t              horizontal tab\n\
 354   \\v              vertical tab\n\
 355   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 356   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 357   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 358   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 359   [:alnum:]       all letters and digits\n\
 360   [:alpha:]       all letters\n\
 361   [:blank:]       all horizontal whitespace\n\
 362   [:cntrl:]       all control characters\n\
 363   [:digit:]       all digits\n\
 364   [:graph:]       all printable characters, not including space\n\
 365   [:lower:]       all lower case letters\n\
 366   [:print:]       all printable characters, including space\n\
 367   [:punct:]       all punctuation characters\n\
 368   [:space:]       all horizontal or vertical whitespace\n\
 369   [:upper:]       all upper case letters\n\
 370   [:xdigit:]      all hexadecimal digits\n\
 371   [=CHAR=]        all characters which are equivalent to CHAR\n\
 372 "));
 373       printf (_("\
 374 \n\
 375 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 376 -t may be used only when translating.  SET2 is extended to length of\n\
 377 SET1 by repeating its last character as necessary.  Excess characters\n\
 378 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 379 expand in ascending order; used in SET2 while translating, they may\n\
 380 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 381 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 382 translation or deletion.\n\
 383 "));
 384       puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
 385     }
 386   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 387 }
 388
 /* Report whether the character C belongs to the equivalence class
    named by the character EQUIV_CLASS.  As implemented here a class
    contains only the character that names it, so the result is 1 when
    the two values are equal and 0 otherwise.  */

static int
is_equiv_class_member (unsigned int equiv_class, unsigned int c)
{
  if (c != equiv_class)
    return 0;
  return 1;
}
 397
 398 /* Return nonzero if the character C is a member of the
 399    character class CHAR_CLASS.  */
 400
 401 static int
 402 is_char_class_member (enum Char_class char_class, unsigned int c)
 403 {
 404   int result;
 405
 406   switch (char_class)
 407     {
 408     case CC_ALNUM:
 409       result = ISALNUM (c);
 410       break;
 411     case CC_ALPHA:
 412       result = ISALPHA (c);
 413       break;
 414     case CC_BLANK:
 415       result = ISBLANK (c);
 416       break;
 417     case CC_CNTRL:
 418       result = ISCNTRL (c);
 419       break;
 420     case CC_DIGIT:
 421       result = ISDIGIT_LOCALE (c);
 422       break;
 423     case CC_GRAPH:
 424       result = ISGRAPH (c);
 425       break;
 426     case CC_LOWER:
 427       result = ISLOWER (c);
 428       break;
 429     case CC_PRINT:
 430       result = ISPRINT (c);
 431       break;
 432     case CC_PUNCT:
 433       result = ISPUNCT (c);
 434       break;
 435     case CC_SPACE:
 436       result = ISSPACE (c);
 437       break;
 438     case CC_UPPER:
 439       result = ISUPPER (c);
 440       break;
 441     case CC_XDIGIT:
 442       result = ISXDIGIT (c);
 443       break;
 444     default:
 445       abort ();
 446       break;
 447     }
 448   return result;
 449 }
 450
 451 static void
 452 es_free (struct E_string *es)
 453 {
 454   free (es->s);
 455   free (es->escaped);
 456 }
 457
 458 /* Perform the first pass over each range-spec argument S, converting all
 459    \c and \ddd escapes to their one-byte representations.  The conversion
 460    is done in-place, so S must point to writable storage.  If an invalid
 461    quote sequence is found print an error message and return nonzero.
 462    Otherwise set *LEN to the length of the resulting string and return
 463    zero.  The resulting array of characters may contain zero-bytes;
 464    however, on input, S is assumed to be null-terminated, and hence
 465    cannot contain actual (non-escaped) zero bytes.  */
 466
 467 static int
 468 unquote (const unsigned char *s, struct E_string *es)
 469 {
 470   size_t i, j;
 471   size_t len;
 472
 473   len = strlen ((char *) s);
 474
 475   es->s = (unsigned char *) xmalloc (len);
 476   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 477   for (i = 0; i < len; i++)
 478     es->escaped[i] = 0;
 479
 480   j = 0;
 481   for (i = 0; s[i]; i++)
 482     {
 483       switch (s[i])
 484         {
 485           int c;
 486         case '\\':
 487           switch (s[i + 1])
 488             {
 489               int oct_digit;
 490             case '\\':
 491               c = '\\';
 492               break;
 493             case 'a':
 494               c = '\007';
 495               break;
 496             case 'b':
 497               c = '\b';
 498               break;
 499             case 'f':
 500               c = '\f';
 501               break;
 502             case 'n':
 503               c = '\n';
 504               break;
 505             case 'r':
 506               c = '\r';
 507               break;
 508             case 't':
 509               c = '\t';
 510               break;
 511             case 'v':
 512               c = '\v';
 513               break;
 514             case '0':
 515             case '1':
 516             case '2':
 517             case '3':
 518             case '4':
 519             case '5':
 520             case '6':
 521             case '7':
 522               c = s[i + 1] - '0';
 523               oct_digit = s[i + 2] - '0';
 524               if (0 <= oct_digit && oct_digit <= 7)
 525                 {
 526                   c = 8 * c + oct_digit;
 527                   ++i;
 528                   oct_digit = s[i + 2] - '0';
 529                   if (0 <= oct_digit && oct_digit <= 7)
 530                     {
 531                       if (8 * c + oct_digit < N_CHARS)
 532                         {
 533                           c = 8 * c + oct_digit;
 534                           ++i;
 535                         }
 536                       else if (!posix_pedantic)
 537                         {
 538                           /* Any octal number larger than 0377 won't
 539                              fit in 8 bits.  So we stop when adding the
 540                              next digit would put us over the limit and
 541                              give a warning about the ambiguity.  POSIX
 542                              isn't clear on this, but one person has said
 543                              that in his interpretation, POSIX says tr
 544                              can't even give a warning.  */
 545                           error (0, 0, _("warning: the ambiguous octal escape \
 546 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 547                                  s[i], s[i + 1], s[i + 2],
 548                                  s[i], s[i + 1], s[i + 2]);
 549                         }
 550                     }
 551                 }
 552               break;
 553             case '\0':
 554               error (0, 0, _("invalid backslash escape at end of string"));
 555               return 1;
 556
 557             default:
 558               if (posix_pedantic)
 559                 {
 560                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 561                   return 1;
 562                 }
 563               else
 564                 {
 565                   c = s[i + 1];
 566                   es->escaped[j] = 1;
 567                 }
 568             }
 569           ++i;
 570           es->s[j++] = c;
 571           break;
 572         default:
 573           es->s[j++] = s[i];
 574           break;
 575         }
 576     }
 577   es->len = j;
 578   return 0;
 579 }
 580
 581 /* If CLASS_STR is a valid character class string, return its index
 582    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 583
 584 static enum Char_class
 585 look_up_char_class (const unsigned char *class_str, size_t len)
 586 {
 587   unsigned int i;
 588
 589   for (i = 0; i < N_CHAR_CLASSES; i++)
 590     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 591         && strlen (char_class_name[i]) == len)
 592       return (enum Char_class) i;
 593   return CC_NO_CLASS;
 594 }
 595
 596 /* Return a newly allocated string with a printable version of C.
 597    This function is used solely for formatting error messages.  */
 598
 599 static char *
 600 make_printable_char (unsigned int c)
 601 {
 602   char *buf = xmalloc (5);
 603
 604   assert (c < N_CHARS);
 605   if (ISPRINT (c))
 606     {
 607       buf[0] = c;
 608       buf[1] = '\0';
 609     }
 610   else
 611     {
 612       sprintf (buf, "\\%03o", c);
 613     }
 614   return buf;
 615 }
 616
 /* Return a newly allocated, NUL-terminated copy of S which is suitable
    for printing.  LEN is the number of characters in S, which may
    include zero bytes.  Most non-printing (isprint) characters are
    represented by a backslash followed by 3 octal digits.  However, the
    characters represented by \c escapes where c is one of [abfnrtv] are
    represented by their 2-character \c sequences, and a backslash is
    copied as a single backslash.  When LEN is zero the result is the
    empty string.  The caller frees the result.  This function is used
    solely for printing error messages.  */

static char *
make_printable_str (const unsigned char *s, size_t len)
{
  /* Worst case is that every character expands to a backslash
     followed by a 3-character octal escape sequence.  */
  char *printable_buf = xmalloc (4 * len + 1);
  char *p = printable_buf;
  size_t i;

  /* Terminate the buffer up front: when LEN is zero the loop below
     never runs and nothing else would store the NUL.  */
  *p = '\0';

  for (i = 0; i < len; i++)
    {
      char buf[5];
      const char *tmp = NULL;

      switch (s[i])
        {
        case '\\':
          tmp = "\\";
          break;
        case '\007':
          tmp = "\\a";
          break;
        case '\b':
          tmp = "\\b";
          break;
        case '\f':
          tmp = "\\f";
          break;
        case '\n':
          tmp = "\\n";
          break;
        case '\r':
          tmp = "\\r";
          break;
        case '\t':
          tmp = "\\t";
          break;
        case '\v':
          tmp = "\\v";
          break;
        default:
          if (ISPRINT (s[i]))
            {
              buf[0] = s[i];
              buf[1] = '\0';
            }
          else
            sprintf (buf, "\\%03o", s[i]);
          tmp = buf;
          break;
        }
      /* stpcpy returns a pointer to the NUL it just stored, which is
         where the next piece goes.  */
      p = stpcpy (p, tmp);
    }
  return printable_buf;
}
 679
 680 /* Append a newly allocated structure representing a
 681    character C to the specification list LIST.  */
 682
 683 static void
 684 append_normal_char (struct Spec_list *list, unsigned int c)
 685 {
 686   struct List_element *new;
 687
 688   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 689   new->next = NULL;
 690   new->type = RE_NORMAL_CHAR;
 691   new->u.normal_char = c;
 692   assert (list->tail);
 693   list->tail->next = new;
 694   list->tail = new;
 695 }
 696
 697 /* Append a newly allocated structure representing the range
 698    of characters from FIRST to LAST to the specification list LIST.
 699    Return nonzero if LAST precedes FIRST in the collating sequence,
 700    zero otherwise.  This means that '[c-c]' is acceptable.  */
 701
 702 static int
 703 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 704 {
 705   struct List_element *new;
 706
 707   if (ORD (first) > ORD (last))
 708     {
 709       char *tmp1 = make_printable_char (first);
 710       char *tmp2 = make_printable_char (last);
 711
 712       error (0, 0,
 713        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 714              tmp1, tmp2);
 715       free (tmp1);
 716       free (tmp2);
 717       return 1;
 718     }
 719   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 720   new->next = NULL;
 721   new->type = RE_RANGE;
 722   new->u.range.first_char = first;
 723   new->u.range.last_char = last;
 724   assert (list->tail);
 725   list->tail->next = new;
 726   list->tail = new;
 727   return 0;
 728 }
 729
 730 /* If CHAR_CLASS_STR is a valid character class string, append a
 731    newly allocated structure representing that character class to the end
 732    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 733    a valid string return nonzero.  */
 734
 735 static int
 736 append_char_class (struct Spec_list *list,
 737                    const unsigned char *char_class_str, size_t len)
 738 {
 739   enum Char_class char_class;
 740   struct List_element *new;
 741
 742   char_class = look_up_char_class (char_class_str, len);
 743   if (char_class == CC_NO_CLASS)
 744     return 1;
 745   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 746   new->next = NULL;
 747   new->type = RE_CHAR_CLASS;
 748   new->u.char_class = char_class;
 749   assert (list->tail);
 750   list->tail->next = new;
 751   list->tail = new;
 752   return 0;
 753 }
 754
 755 /* Append a newly allocated structure representing a [c*n]
 756    repeated character construct to the specification list LIST.
 757    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 758    is a non-negative repeat count.  */
 759
 760 static void
 761 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 762                       size_t repeat_count)
 763 {
 764   struct List_element *new;
 765
 766   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 767   new->next = NULL;
 768   new->type = RE_REPEATED_CHAR;
 769   new->u.repeated_char.the_repeated_char = the_char;
 770   new->u.repeated_char.repeat_count = repeat_count;
 771   assert (list->tail);
 772   list->tail->next = new;
 773   list->tail = new;
 774 }
 775
 776 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 777    the length of that string, LEN, if LEN is exactly one, append
 778    a newly allocated structure representing the specified
 779    equivalence class to the specification list, LIST and return zero.
 780    If LEN is not 1, return nonzero.  */
 781
 782 static int
 783 append_equiv_class (struct Spec_list *list,
 784                     const unsigned char *equiv_class_str, size_t len)
 785 {
 786   struct List_element *new;
 787
 788   if (len != 1)
 789     return 1;
 790   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 791   new->next = NULL;
 792   new->type = RE_EQUIV_CLASS;
 793   new->u.equiv_code = *equiv_class_str;
 794   assert (list->tail);
 795   list->tail->next = new;
 796   list->tail = new;
 797   return 0;
 798 }
 799
 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX]
    (both indices inclusive; FIRST_IDX must not exceed LAST_IDX).
    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
    NUL bytes, and is *not* NUL-terminated.  The caller frees it.  */

static unsigned char *
substr (const unsigned char *p, size_t first_idx, size_t last_idx)
{
  size_t len;
  unsigned char *tmp;

  /* Checked once, before the subtraction below could wrap around.  */
  assert (first_idx <= last_idx);
  len = last_idx - first_idx + 1;
  tmp = (unsigned char *) xmalloc (len);

  /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
  memcpy (tmp, p + first_idx, len);
  return tmp;
}
 819
 820 /* Search forward starting at START_IDX for the 2-char sequence
 821    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 822    a sequence is found, set *RESULT_IDX to the index of the first
 823    character and return nonzero. Otherwise return zero.  P may contain
 824    zero bytes.  */
 825
 826 static int
 827 find_closing_delim (const struct E_string *es, size_t start_idx,
 828                     unsigned int pre_bracket_char, size_t *result_idx)
 829 {
 830   size_t i;
 831
 832   for (i = start_idx; i < es->len - 1; i++)
 833     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 834         && !es->escaped[i] && !es->escaped[i + 1])
 835       {
 836         *result_idx = i;
 837         return 1;
 838       }
 839   return 0;
 840 }
 841
 /* Convert the LEN bytes at S (an explicit-length string that need not
    be NUL-terminated) to a non-negative integer.  A leading `0' selects
    octal; otherwise the digits are read as decimal.  If LEN is zero, if
    any of the LEN bytes is not a digit of the selected base (which
    rules out a sign), or if the value would exceed LONG_MAX, return
    nonzero and do not modify *VAL.  Otherwise, return zero and set *VAL
    to the converted value.  */

static int
non_neg_strtol (const unsigned char *s, size_t len, size_t *val)
{
  const unsigned char *cur;
  const unsigned char *end;
  unsigned long total = 0;
  unsigned int radix;

  if (len == 0)
    return 1;

  /* No separate digit test is needed for the first byte: the loop
     below starts with it and rejects anything that is not a digit.  */
  radix = (s[0] == '0') ? 8 : 10;

  end = s + len;
  for (cur = s; cur < end; cur++)
    {
      unsigned int digit;

      if (*cur < '0')
        return 1;

      digit = *cur - '0';
      if (digit >= radix)
        return 1;

      /* Refuse the digit if appending it would exceed LONG_MAX.  */
      if (total > (LONG_MAX - digit) / radix)
        return 1;
      total = total * radix + digit;
    }
  *val = total;
  return 0;
}
 883
 884 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 885    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 886    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 887    and return zero. If the second character following
 888    the opening bracket is not `*' or if no closing bracket can be
 889    found, return -1.  If a closing bracket is found and the
 890    second char is `*', but the string between the `*' and `]' isn't
 891    empty, an octal number, or a decimal number, print an error message
 892    and return -2.  */
 893
 894 static int
 895 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 896                        unsigned int *char_to_repeat, size_t *repeat_count,
 897                        size_t *closing_bracket_idx)
 898 {
 899   size_t i;
 900
 901   assert (start_idx + 1 < es->len);
 902   if (!ES_MATCH (es, start_idx + 1, '*'))
 903     return -1;
 904
 905   for (i = start_idx + 2; i < es->len; i++)
 906     {
 907       if (ES_MATCH (es, i, ']'))
 908         {
 909           const unsigned char *digit_str;
 910           size_t digit_str_len = i - start_idx - 2;
 911
 912           *char_to_repeat = es->s[start_idx];
 913           if (digit_str_len == 0)
 914             {
 915               /* We've matched [c*] -- no explicit repeat count.  */
 916               *repeat_count = 0;
 917               *closing_bracket_idx = i;
 918               return 0;
 919             }
 920
 921           /* Here, we have found [c*s] where s should be a string
 922              of octal or decimal digits.  */
 923           digit_str = &es->s[start_idx + 2];
 924           if (non_neg_strtol (digit_str, digit_str_len, repeat_count)
 925               || *repeat_count > BEGIN_STATE)
 926             {
 927               char *tmp = make_printable_str (digit_str, digit_str_len);
 928               error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 929                      tmp);
 930               free (tmp);
 931               return -2;
 932             }
 933           *closing_bracket_idx = i;
 934           return 0;
 935         }
 936     }
 937   return -1;                    /* No bracket found.  */
 938 }
 939
 940 /* Return nonzero if the string at ES->s[IDX] matches the regular
 941    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 942    the `]' must not be escaped.  */
 943
 944 static int
 945 star_digits_closebracket (const struct E_string *es, size_t idx)
 946 {
 947   size_t i;
 948
 949   if (!ES_MATCH (es, idx, '*'))
 950     return 0;
 951
 952   for (i = idx + 1; i < es->len; i++)
 953     {
 954       if (!ISDIGIT (es->s[i]))
 955         {
 956           if (ES_MATCH (es, i, ']'))
 957             return 1;
 958           return 0;
 959         }
 960     }
 961   return 0;
 962 }
 963
 964 /* Convert string UNESACPED_STRING (which has been preprocessed to
 965    convert backslash-escape sequences) of length LEN characters into
 966    a linked list of the following 5 types of constructs:
 967       - [:str:] Character class where `str' is one of the 12 valid strings.
 968       - [=c=] Equivalence class where `c' is any single character.
 969       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 970           However, if `n' is present, it must be a non-negative octal or
 971           decimal integer.
 972       - r-s Range of characters from `r' to `s'.  The second endpoint must
 973           not precede the first in the current collating sequence.
 974       - c Any other character is interpreted as itself.  */
 975
 976 static int
 977 build_spec_list (const struct E_string *es, struct Spec_list *result)
 978 {
 979   const unsigned char *p;
 980   size_t i;
 981
 982   p = es->s;
 983
 984   /* The main for-loop below recognizes the 4 multi-character constructs.
 985      A character that matches (in its context) none of the multi-character
 986      constructs is classified as `normal'.  Since all multi-character
 987      constructs have at least 3 characters, any strings of length 2 or
 988      less are composed solely of normal characters.  Hence, the index of
 989      the outer for-loop runs only as far as LEN-2.  */
 990
 991   for (i = 0; i + 2 < es->len; /* empty */)
 992     {
 993       if (ES_MATCH (es, i, '['))
 994         {
 995           int matched_multi_char_construct;
 996           size_t closing_bracket_idx;
 997           unsigned int char_to_repeat;
 998           size_t repeat_count;
 999           int err;
1000
1001           matched_multi_char_construct = 1;
1002           if (ES_MATCH (es, i + 1, ':')
1003               || ES_MATCH (es, i + 1, '='))
1004             {
1005               size_t closing_delim_idx;
1006               int found;
1007
1008               found = find_closing_delim (es, i + 2, p[i + 1],
1009                                           &closing_delim_idx);
1010               if (found)
1011                 {
1012                   int parse_failed;
1013                   unsigned char *opnd_str = substr (p, i + 2,
1014                                                     closing_delim_idx - 1);
1015                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
1016
1017                   if (p[i + 1] == ':')
1018                     {
1019                       parse_failed = append_char_class (result, opnd_str,
1020                                                         opnd_str_len);
1021
1022                       /* FIXME: big comment.  */
1023                       if (parse_failed)
1024                         {
1025                           if (star_digits_closebracket (es, i + 2))
1026                             {
1027                               free (opnd_str);
1028                               goto try_bracketed_repeat;
1029                             }
1030                           else
1031                             {
1032                               char *tmp = make_printable_str (opnd_str,
1033                                                               opnd_str_len);
1034                               error (0, 0, _("invalid character class `%s'"),
1035                                      tmp);
1036                               free (tmp);
1037                               return 1;
1038                             }
1039                         }
1040                     }
1041                   else
1042                     {
1043                       parse_failed = append_equiv_class (result, opnd_str,
1044                                                          opnd_str_len);
1045
1046                       /* FIXME: big comment.  */
1047                       if (parse_failed)
1048                         {
1049                           if (star_digits_closebracket (es, i + 2))
1050                             {
1051                               free (opnd_str);
1052                               goto try_bracketed_repeat;
1053                             }
1054                           else
1055                             {
1056                               char *tmp = make_printable_str (opnd_str,
1057                                                               opnd_str_len);
1058                               error (0, 0,
1059                _("%s: equivalence class operand must be a single character"),
1060                                      tmp);
1061                               free (tmp);
1062                               return 1;
1063                             }
1064                         }
1065                     }
1066                   free (opnd_str);
1067
1068                   /* Return nonzero if append_*_class reports a problem.  */
1069                   if (parse_failed)
1070                     return 1;
1071                   else
1072                     i = closing_delim_idx + 2;
1073                   continue;
1074                 }
1075               /* Else fall through.  This could be [:*] or [=*].  */
1076             }
1077
1078         try_bracketed_repeat:
1079
1080           /* Determine whether this is a bracketed repeat range
1081              matching the RE \[.\*(dec_or_oct_number)?\].  */
1082           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1083                                        &repeat_count,
1084                                        &closing_bracket_idx);
1085           if (err == 0)
1086             {
1087               append_repeated_char (result, char_to_repeat, repeat_count);
1088               i = closing_bracket_idx + 1;
1089             }
1090           else if (err == -1)
1091             {
1092               matched_multi_char_construct = 0;
1093             }
1094           else
1095             {
1096               /* Found a string that looked like [c*n] but the
1097                  numeric part was invalid.  */
1098               return 1;
1099             }
1100
1101           if (matched_multi_char_construct)
1102             continue;
1103
1104           /* We reach this point if P does not match [:str:], [=c=],
1105              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1106              (from `[' to `c').  */
1107         }
1108
1109       /* Look ahead one char for ranges like a-z.  */
1110       if (ES_MATCH (es, i + 1, '-'))
1111         {
1112           if (append_range (result, p[i], p[i + 2]))
1113             return 1;
1114           i += 3;
1115         }
1116       else
1117         {
1118           append_normal_char (result, p[i]);
1119           ++i;
1120         }
1121     }
1122
1123   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1124   for (; i < es->len; i++)
1125     append_normal_char (result, p[i]);
1126
1127   return 0;
1128 }
1129
1130 /* Given a Spec_list S (with its saved state implicit in the values
1131    of its members `tail' and `state'), return the next single character
1132    in the expansion of S's constructs.  If the last character of S was
1133    returned on the previous call or if S was empty, this function
1134    returns -1.  For example, successive calls to get_next where S
1135    represents the spec-string 'a-d[y*3]' will return the sequence
1136    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1137    which the returned character comes is [:upper:] or [:lower:], the
1138    parameter CLASS is given a value to indicate which it was.  Otherwise
1139    CLASS is set to UL_NONE.  This value is used only when constructing
1140    the translation table to verify that any occurrences of upper and
1141    lower class constructs in the spec-strings appear in the same relative
1142    positions.  */
1143
1144 static int
1145 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1146 {
1147   struct List_element *p;
1148   int return_val;
1149   int i;
1150
1151   if (class)
1152     *class = UL_NONE;
1153
1154   if (s->state == BEGIN_STATE)
1155     {
1156       s->tail = s->head->next;
1157       s->state = NEW_ELEMENT;
1158     }
1159
1160   p = s->tail;
1161   if (p == NULL)
1162     return -1;
1163
1164   switch (p->type)
1165     {
1166     case RE_NORMAL_CHAR:
1167       return_val = p->u.normal_char;
1168       s->state = NEW_ELEMENT;
1169       s->tail = p->next;
1170       break;
1171
1172     case RE_RANGE:
1173       if (s->state == NEW_ELEMENT)
1174         s->state = ORD (p->u.range.first_char);
1175       else
1176         ++(s->state);
1177       return_val = CHR (s->state);
1178       if (s->state == ORD (p->u.range.last_char))
1179         {
1180           s->tail = p->next;
1181           s->state = NEW_ELEMENT;
1182         }
1183       break;
1184
1185     case RE_CHAR_CLASS:
1186       if (class)
1187         {
1188           int upper_or_lower;
1189           switch (p->u.char_class)
1190             {
1191             case CC_LOWER:
1192               *class = UL_LOWER;
1193               upper_or_lower = 1;
1194               break;
1195             case CC_UPPER:
1196               *class = UL_UPPER;
1197               upper_or_lower = 1;
1198               break;
1199             default:
1200               upper_or_lower = 0;
1201               break;
1202             }
1203
1204           if (upper_or_lower)
1205             {
1206               s->tail = p->next;
1207               s->state = NEW_ELEMENT;
1208               return_val = 0;
1209               break;
1210             }
1211         }
1212
1213       if (s->state == NEW_ELEMENT)
1214         {
1215           for (i = 0; i < N_CHARS; i++)
1216             if (is_char_class_member (p->u.char_class, i))
1217               break;
1218           assert (i < N_CHARS);
1219           s->state = i;
1220         }
1221       assert (is_char_class_member (p->u.char_class, s->state));
1222       return_val = CHR (s->state);
1223       for (i = s->state + 1; i < N_CHARS; i++)
1224         if (is_char_class_member (p->u.char_class, i))
1225           break;
1226       if (i < N_CHARS)
1227         s->state = i;
1228       else
1229         {
1230           s->tail = p->next;
1231           s->state = NEW_ELEMENT;
1232         }
1233       break;
1234
1235     case RE_EQUIV_CLASS:
1236       /* FIXME: this assumes that each character is alone in its own
1237          equivalence class (which appears to be correct for my
1238          LC_COLLATE.  But I don't know of any function that allows
1239          one to determine a character's equivalence class.  */
1240
1241       return_val = p->u.equiv_code;
1242       s->state = NEW_ELEMENT;
1243       s->tail = p->next;
1244       break;
1245
1246     case RE_REPEATED_CHAR:
1247       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1248       if (p->u.repeated_char.repeat_count == 0)
1249         {
1250           s->tail = p->next;
1251           s->state = NEW_ELEMENT;
1252           return_val = get_next (s, class);
1253         }
1254       else
1255         {
1256           if (s->state == NEW_ELEMENT)
1257             {
1258               s->state = 0;
1259             }
1260           ++(s->state);
1261           return_val = p->u.repeated_char.the_repeated_char;
1262           if (p->u.repeated_char.repeat_count > 0
1263               && s->state == p->u.repeated_char.repeat_count)
1264             {
1265               s->tail = p->next;
1266               s->state = NEW_ELEMENT;
1267             }
1268         }
1269       break;
1270
1271     case RE_NO_TYPE:
1272       abort ();
1273       break;
1274
1275     default:
1276       abort ();
1277       break;
1278     }
1279
1280   return return_val;
1281 }
1282
1283 /* This is a minor kludge.  This function is called from
1284    get_spec_stats to determine the cardinality of a set derived
1285    from a complemented string.  It's a kludge in that some of the
1286    same operations are (duplicated) performed in set_initialize.  */
1287
1288 static int
1289 card_of_complement (struct Spec_list *s)
1290 {
1291   int c;
1292   int cardinality = N_CHARS;
1293   SET_TYPE in_set[N_CHARS];
1294
1295   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1296   s->state = BEGIN_STATE;
1297   while ((c = get_next (s, NULL)) != -1)
1298     if (!in_set[c]++)
1299       --cardinality;
1300   return cardinality;
1301 }
1302
1303 /* Gather statistics about the spec-list S in preparation for the tests
1304    in validate that determine the consistency of the specs.  This function
1305    is called at most twice; once for string1, and again for any string2.
1306    LEN_S1 < 0 indicates that this is the first call and that S represents
1307    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1308    constructs in string1, and we can use its value to resolve any
1309    indefinite repeat construct in S (which represents string2).  Hence,
1310    this function has the side-effect that it converts a valid [c*]
1311    construct in string2 to [c*n] where n is large enough (or 0) to give
1312    string2 the same length as string1.  For example, with the command
1313    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1314    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1315
1316 static void
1317 get_spec_stats (struct Spec_list *s)
1318 {
1319   struct List_element *p;
1320   int len = 0;
1321
1322   s->n_indefinite_repeats = 0;
1323   s->has_equiv_class = 0;
1324   s->has_restricted_char_class = 0;
1325   s->has_char_class = 0;
1326   for (p = s->head->next; p; p = p->next)
1327     {
1328       switch (p->type)
1329         {
1330           int i;
1331         case RE_NORMAL_CHAR:
1332           ++len;
1333           break;
1334
1335         case RE_RANGE:
1336           assert (p->u.range.last_char >= p->u.range.first_char);
1337           len += p->u.range.last_char - p->u.range.first_char + 1;
1338           break;
1339
1340         case RE_CHAR_CLASS:
1341           s->has_char_class = 1;
1342           for (i = 0; i < N_CHARS; i++)
1343             if (is_char_class_member (p->u.char_class, i))
1344               ++len;
1345           switch (p->u.char_class)
1346             {
1347             case CC_UPPER:
1348             case CC_LOWER:
1349               break;
1350             default:
1351               s->has_restricted_char_class = 1;
1352               break;
1353             }
1354           break;
1355
1356         case RE_EQUIV_CLASS:
1357           for (i = 0; i < N_CHARS; i++)
1358             if (is_equiv_class_member (p->u.equiv_code, i))
1359               ++len;
1360           s->has_equiv_class = 1;
1361           break;
1362
1363         case RE_REPEATED_CHAR:
1364           if (p->u.repeated_char.repeat_count > 0)
1365             len += p->u.repeated_char.repeat_count;
1366           else if (p->u.repeated_char.repeat_count == 0)
1367             {
1368               s->indefinite_repeat_element = p;
1369               ++(s->n_indefinite_repeats);
1370             }
1371           break;
1372
1373         case RE_NO_TYPE:
1374           assert (0);
1375           break;
1376         }
1377     }
1378
1379   s->length = len;
1380 }
1381
1382 static void
1383 get_s1_spec_stats (struct Spec_list *s1)
1384 {
1385   get_spec_stats (s1);
1386   if (complement)
1387     s1->length = card_of_complement (s1);
1388 }
1389
1390 static void
1391 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1392 {
1393   get_spec_stats (s2);
1394   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1395     {
1396       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1397         len_s1 - s2->length;
1398       s2->length = len_s1;
1399     }
1400 }
1401
1402 static void
1403 spec_init (struct Spec_list *spec_list)
1404 {
1405   spec_list->head = spec_list->tail =
1406     (struct List_element *) xmalloc (sizeof (struct List_element));
1407   spec_list->head->next = NULL;
1408 }
1409
1410 /* This function makes two passes over the argument string S.  The first
1411    one converts all \c and \ddd escapes to their one-byte representations.
1412    The second constructs a linked specification list, SPEC_LIST, of the
1413    characters and constructs that comprise the argument string.  If either
1414    of these passes detects an error, this function returns nonzero.  */
1415
1416 static int
1417 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1418 {
1419   struct E_string es;
1420   int fail;
1421
1422   fail = unquote (s, &es);
1423   if (!fail)
1424     fail = build_spec_list (&es, spec_list);
1425   es_free (&es);
1426   return fail;
1427 }
1428
1429 /* Given two specification lists, S1 and S2, and assuming that
1430    S1->length > S2->length, append a single [c*n] element to S2 where c
1431    is the last character in the expansion of S2 and n is the difference
1432    between the two lengths.
1433    Upon successful completion, S2->length is set to S1->length.  The only
1434    way this function can fail to make S2 as long as S1 is when S2 has
1435    zero-length, since in that case, there is no last character to repeat.
1436    So S2->length is required to be at least 1.
1437
1438    Providing this functionality allows the user to do some pretty
1439    non-BSD (and non-portable) things:  For example, the command
1440        tr -cs '[:upper:]0-9' '[:lower:]'
1441    is almost guaranteed to give results that depend on your collating
1442    sequence.  */
1443
1444 static void
1445 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1446 {
1447   struct List_element *p;
1448   int char_to_repeat;
1449   int i;
1450
1451   assert (translating);
1452   assert (s1->length > s2->length);
1453   assert (s2->length > 0);
1454
1455   p = s2->tail;
1456   switch (p->type)
1457     {
1458     case RE_NORMAL_CHAR:
1459       char_to_repeat = p->u.normal_char;
1460       break;
1461     case RE_RANGE:
1462       char_to_repeat = p->u.range.last_char;
1463       break;
1464     case RE_CHAR_CLASS:
1465       for (i = N_CHARS; i >= 0; i--)
1466         if (is_char_class_member (p->u.char_class, i))
1467           break;
1468       assert (i >= 0);
1469       char_to_repeat = CHR (i);
1470       break;
1471
1472     case RE_REPEATED_CHAR:
1473       char_to_repeat = p->u.repeated_char.the_repeated_char;
1474       break;
1475
1476     case RE_EQUIV_CLASS:
1477       /* This shouldn't happen, because validate exits with an error
1478          if it finds an equiv class in string2 when translating.  */
1479       abort ();
1480       break;
1481
1482     case RE_NO_TYPE:
1483       abort ();
1484       break;
1485
1486     default:
1487       abort ();
1488       break;
1489     }
1490
1491   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1492   s2->length = s1->length;
1493 }
1494
1495 /* Return non-zero if S is a non-empty list in which exactly one
1496    character (but potentially, many instances of it) appears.
1497    E.g.  [X*] or xxxxxxxx.  */
1498
1499 static int
1500 homogeneous_spec_list (struct Spec_list *s)
1501 {
1502   int b, c;
1503
1504   s->state = BEGIN_STATE;
1505
1506   if ((b = get_next (s, NULL)) == -1)
1507     return 0;
1508
1509   while ((c = get_next (s, NULL)) != -1)
1510     if (c != b)
1511       return 0;
1512
1513   return 1;
1514 }
1515
1516 /* Die with an error message if S1 and S2 describe strings that
1517    are not valid with the given command line switches.
1518    A side effect of this function is that if a valid [c*] or
1519    [c*0] construct appears in string2, it is converted to [c*n]
1520    with a value for n that makes s2->length == s1->length.  By
1521    the same token, if the --truncate-set1 option is not
1522    given, S2 may be extended.  */
1523
1524 static void
1525 validate (struct Spec_list *s1, struct Spec_list *s2)
1526 {
1527   get_s1_spec_stats (s1);
1528   if (s1->n_indefinite_repeats > 0)
1529     {
1530       error (EXIT_FAILURE, 0,
1531              _("the [c*] repeat construct may not appear in string1"));
1532     }
1533
1534   if (s2)
1535     {
1536       get_s2_spec_stats (s2, s1->length);
1537
1538       if (s2->n_indefinite_repeats > 1)
1539         {
1540           error (EXIT_FAILURE, 0,
1541                  _("only one [c*] repeat construct may appear in string2"));
1542         }
1543
1544       if (translating)
1545         {
1546           if (s2->has_equiv_class)
1547             {
1548               error (EXIT_FAILURE, 0,
1549                      _("[=c=] expressions may not appear in string2 \
1550 when translating"));
1551             }
1552
1553           if (s1->length > s2->length)
1554             {
1555               if (!truncate_set1)
1556                 {
1557                   /* string2 must be non-empty unless --truncate-set1 is
1558                      given or string1 is empty.  */
1559
1560                   if (s2->length == 0)
1561                     error (EXIT_FAILURE, 0,
1562                      _("when not truncating set1, string2 must be non-empty"));
1563                   string2_extend (s1, s2);
1564                 }
1565             }
1566
1567           if (complement && s1->has_char_class
1568               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1569             {
1570               error (EXIT_FAILURE, 0,
1571                      _("when translating with complemented character classes,\
1572 \nstring2 must map all characters in the domain to one"));
1573             }
1574
1575           if (s2->has_restricted_char_class)
1576             {
1577               error (EXIT_FAILURE, 0,
1578                      _("when translating, the only character classes that may \
1579 appear in\nstring2 are `upper' and `lower'"));
1580             }
1581         }
1582       else
1583         /* Not translating.  */
1584         {
1585           if (s2->n_indefinite_repeats > 0)
1586             error (EXIT_FAILURE, 0,
1587                    _("the [c*] construct may appear in string2 only \
1588 when translating"));
1589         }
1590     }
1591 }
1592
1593 /* Read buffers of SIZE bytes via the function READER (if READER is
1594    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1595    read_and_delete or read_and_xlate.  After each buffer is read, it is
1596    processed and written to stdout.  The buffers are processed so that
1597    multiple consecutive occurrences of the same character in the input
1598    stream are replaced by a single occurrence of that character if the
1599    character is in the squeeze set.  */
1600
1601 static void
1602 squeeze_filter (unsigned char *buf, long int size, PFI reader)
1603 {
1604   unsigned int char_to_squeeze = NOT_A_CHAR;
1605   int i = 0;
1606   int nr = 0;
1607
1608   for (;;)
1609     {
1610       int begin;
1611
1612       if (i >= nr)
1613         {
1614           if (reader == NULL)
1615             nr = safe_read (0, (char *) buf, size);
1616           else
1617             nr = (*reader) (buf, size, NULL);
1618
1619           if (nr < 0)
1620             error (EXIT_FAILURE, errno, _("read error"));
1621           if (nr == 0)
1622             break;
1623           i = 0;
1624         }
1625
1626       begin = i;
1627
1628       if (char_to_squeeze == NOT_A_CHAR)
1629         {
1630           int out_len;
1631           /* Here, by being a little tricky, we can get a significant
1632              performance increase in most cases when the input is
1633              reasonably large.  Since tr will modify the input only
1634              if two consecutive (and identical) input characters are
1635              in the squeeze set, we can step by two through the data
1636              when searching for a character in the squeeze set.  This
1637              means there may be a little more work in a few cases and
1638              perhaps twice as much work in the worst cases where most
1639              of the input is removed by squeezing repeats.  But most
1640              uses of this functionality seem to remove less than 20-30%
1641              of the input.  */
1642           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1643             ;                   /* empty */
1644
1645           /* There is a special case when i == nr and we've just
1646              skipped a character (the last one in buf) that is in
1647              the squeeze set.  */
1648           if (i == nr && in_squeeze_set[buf[i - 1]])
1649             --i;
1650
1651           if (i >= nr)
1652             out_len = nr - begin;
1653           else
1654             {
1655               char_to_squeeze = buf[i];
1656               /* We're about to output buf[begin..i].  */
1657               out_len = i - begin + 1;
1658
1659               /* But since we stepped by 2 in the loop above,
1660                  out_len may be one too large.  */
1661               if (i > 0 && buf[i - 1] == char_to_squeeze)
1662                 --out_len;
1663
1664               /* Advance i to the index of first character to be
1665                  considered when looking for a char different from
1666                  char_to_squeeze.  */
1667               ++i;
1668             }
1669           if (out_len > 0
1670               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1671             error (EXIT_FAILURE, errno, _("write error"));
1672         }
1673
1674       if (char_to_squeeze != NOT_A_CHAR)
1675         {
1676           /* Advance i to index of first char != char_to_squeeze
1677              (or to nr if all the rest of the characters in this
1678              buffer are the same as char_to_squeeze).  */
1679           for (; i < nr && buf[i] == char_to_squeeze; i++)
1680             ;                   /* empty */
1681           if (i < nr)
1682             char_to_squeeze = NOT_A_CHAR;
1683           /* If (i >= nr) we've squeezed the last character in this buffer.
1684              So now we have to read a new buffer and continue comparing
1685              characters against char_to_squeeze.  */
1686         }
1687     }
1688 }
1689
1690 /* Read buffers of SIZE bytes from stdin until one is found that
1691    contains at least one character not in the delete set.  Store
1692    in the array BUF, all characters from that buffer that are not
1693    in the delete set, and return the number of characters saved
1694    or 0 upon EOF.  */
1695
1696 static long
1697 read_and_delete (unsigned char *buf, long int size, PFI not_used)
1698 {
1699   long n_saved;
1700   static int hit_eof = 0;
1701
1702   assert (not_used == NULL);
1703   assert (size > 0);
1704
1705   if (hit_eof)
1706     return 0;
1707
1708   /* This enclosing do-while loop is to make sure that
1709      we don't return zero (indicating EOF) when we've
1710      just deleted all the characters in a buffer.  */
1711   do
1712     {
1713       int i;
1714       int nr = safe_read (0, (char *) buf, size);
1715
1716       if (nr < 0)
1717         error (EXIT_FAILURE, errno, _("read error"));
1718       if (nr == 0)
1719         {
1720           hit_eof = 1;
1721           return 0;
1722         }
1723
1724       /* This first loop may be a waste of code, but gives much
1725          better performance when no characters are deleted in
1726          the beginning of a buffer.  It just avoids the copying
1727          of buf[i] into buf[n_saved] when it would be a NOP.  */
1728
1729       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1730         /* empty */ ;
1731       n_saved = i;
1732
1733       for (++i; i < nr; i++)
1734         if (!in_delete_set[buf[i]])
1735           buf[n_saved++] = buf[i];
1736     }
1737   while (n_saved == 0);
1738
1739   return n_saved;
1740 }
1741
1742 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1743    perform the in-place and one-to-one mapping specified by the global
1744    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1745
1746 static long
1747 read_and_xlate (unsigned char *buf, long int size, PFI not_used)
1748 {
1749   long chars_read = 0;
1750   static int hit_eof = 0;
1751   int i;
1752
1753   assert (not_used == NULL);
1754   assert (size > 0);
1755
1756   if (hit_eof)
1757     return 0;
1758
1759   chars_read = safe_read (0, (char *) buf, size);
1760   if (chars_read < 0)
1761     error (EXIT_FAILURE, errno, _("read error"));
1762   if (chars_read == 0)
1763     {
1764       hit_eof = 1;
1765       return 0;
1766     }
1767
1768   for (i = 0; i < chars_read; i++)
1769     buf[i] = xlate[buf[i]];
1770
1771   return chars_read;
1772 }
1773
1774 /* Initialize a boolean membership set IN_SET with the character
1775    values obtained by traversing the linked list of constructs S
1776    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1777    nonzero the resulting set is complemented.  */
1778
1779 static void
1780 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1781 {
1782   int c;
1783   int i;
1784
1785   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1786   s->state = BEGIN_STATE;
1787   while ((c = get_next (s, NULL)) != -1)
1788     in_set[c] = 1;
1789   if (complement_this_set)
1790     for (i = 0; i < N_CHARS; i++)
1791       in_set[i] = (!in_set[i]);
1792 }
1793
1794 int
1795 main (int argc, char **argv)
1796 {
1797   int c;
1798   int non_option_args;
1799   struct Spec_list buf1, buf2;
1800   struct Spec_list *s1 = &buf1;
1801   struct Spec_list *s2 = &buf2;
1802
1803   program_name = argv[0];
1804   setlocale (LC_ALL, "");
1805   bindtextdomain (PACKAGE, LOCALEDIR);
1806   textdomain (PACKAGE);
1807
1808   while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1809     {
1810       switch (c)
1811         {
1812         case 0:
1813           break;
1814
1815         case 'c':
1816           complement = 1;
1817           break;
1818
1819         case 'd':
1820           delete = 1;
1821           break;
1822
1823         case 's':
1824           squeeze_repeats = 1;
1825           break;
1826
1827         case 't':
1828           truncate_set1 = 1;
1829           break;
1830
1831         default:
1832           usage (2);
1833           break;
1834         }
1835     }
1836
1837   if (show_version)
1838     {
1839       printf ("tr (%s) %s\n", GNU_PACKAGE, VERSION);
1840       exit (EXIT_SUCCESS);
1841     }
1842
1843   if (show_help)
1844     usage (0);
1845
1846   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1847
1848   non_option_args = argc - optind;
1849   translating = (non_option_args == 2 && !delete);
1850
1851   /* Change this test if it is valid to give tr no options and
1852      no args at all.  POSIX doesn't specifically say anything
1853      either way, but it looks like they implied it's invalid
1854      by omission.  If you want to make tr do a slow imitation
1855      of `cat' use `tr a a'.  */
1856   if (non_option_args > 2)
1857     {
1858       error (0, 0, _("too many arguments"));
1859       usage (2);
1860     }
1861
1862   if (!delete && !squeeze_repeats && non_option_args != 2)
1863     error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1864
1865   if (delete && squeeze_repeats && non_option_args != 2)
1866     error (EXIT_FAILURE, 0, _("two strings must be given when both \
1867 deleting and squeezing repeats"));
1868
1869   /* If --delete is given without --squeeze-repeats, then
1870      only one string argument may be specified.  But POSIX
1871      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1872      is set, pretend we never saw string2.  But I think
1873      this deserves a fatal error, so that's the default.  */
1874   if ((delete && !squeeze_repeats) && non_option_args != 1)
1875     {
1876       if (posix_pedantic && non_option_args == 2)
1877         --non_option_args;
1878       else
1879         error (EXIT_FAILURE, 0,
1880                _("only one string may be given when deleting \
1881 without squeezing repeats"));
1882     }
1883
1884   if (squeeze_repeats && non_option_args == 0)
1885     error (EXIT_FAILURE, 0,
1886            _("at least one string must be given when squeezing repeats"));
1887
1888   spec_init (s1);
1889   if (parse_str ((unsigned char *) argv[optind], s1))
1890     exit (EXIT_FAILURE);
1891
1892   if (non_option_args == 2)
1893     {
1894       spec_init (s2);
1895       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1896         exit (EXIT_FAILURE);
1897     }
1898   else
1899     s2 = NULL;
1900
1901   validate (s1, s2);
1902
1903   /* Use binary I/O, since `tr' is sometimes used to transliterate
1904      non-printable characters, or characters which are stripped away
1905      by text-mode reads (like CR and ^Z).  */
1906   SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1907
1908   if (squeeze_repeats && non_option_args == 1)
1909     {
1910       set_initialize (s1, complement, in_squeeze_set);
1911       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1912     }
1913   else if (delete && non_option_args == 1)
1914     {
1915       long nr;
1916
1917       set_initialize (s1, complement, in_delete_set);
1918       do
1919         {
1920           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1921           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1922             error (EXIT_FAILURE, errno, _("write error"));
1923         }
1924       while (nr > 0);
1925     }
1926   else if (squeeze_repeats && delete && non_option_args == 2)
1927     {
1928       set_initialize (s1, complement, in_delete_set);
1929       set_initialize (s2, 0, in_squeeze_set);
1930       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1931     }
1932   else if (translating)
1933     {
1934       if (complement)
1935         {
1936           int i;
1937           SET_TYPE *in_s1 = in_delete_set;
1938
1939           set_initialize (s1, 0, in_s1);
1940           s2->state = BEGIN_STATE;
1941           for (i = 0; i < N_CHARS; i++)
1942             xlate[i] = i;
1943           for (i = 0; i < N_CHARS; i++)
1944             {
1945               if (!in_s1[i])
1946                 {
1947                   int ch = get_next (s2, NULL);
1948                   assert (ch != -1 || truncate_set1);
1949                   if (ch == -1)
1950                     {
1951                       /* This will happen when tr is invoked like e.g.
1952                          tr -cs A-Za-z0-9 '\012'.  */
1953                       break;
1954                     }
1955                   xlate[i] = ch;
1956                 }
1957             }
1958           assert (get_next (s2, NULL) == -1 || truncate_set1);
1959         }
1960       else
1961         {
1962           int c1, c2;
1963           int i;
1964           enum Upper_Lower_class class_s1;
1965           enum Upper_Lower_class class_s2;
1966
1967           for (i = 0; i < N_CHARS; i++)
1968             xlate[i] = i;
1969           s1->state = BEGIN_STATE;
1970           s2->state = BEGIN_STATE;
1971           for (;;)
1972             {
1973               c1 = get_next (s1, &class_s1);
1974               c2 = get_next (s2, &class_s2);
1975               if (!class_ok[(int) class_s1][(int) class_s2])
1976                 error (EXIT_FAILURE, 0,
1977                        _("misaligned [:upper:] and/or [:lower:] construct"));
1978
1979               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1980                 {
1981                   for (i = 0; i < N_CHARS; i++)
1982                     if (ISLOWER (i))
1983                       xlate[i] = toupper (i);
1984                 }
1985               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1986                 {
1987                   for (i = 0; i < N_CHARS; i++)
1988                     if (ISUPPER (i))
1989                       xlate[i] = tolower (i);
1990                 }
1991               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1992                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1993                 {
1994                   /* By default, GNU tr permits the identity mappings: from
1995                      [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
1996                      when POSIXLY_CORRECT is set, those evoke diagnostics.  */
1997                   if (posix_pedantic)
1998                     {
1999                       error (EXIT_FAILURE, 0,
2000                              _("\
2001 invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
2002 construct in string1 must be aligned with a corresponding construct\n\
2003 ([:upper:] or [:lower:], respectively) in string2"));
2004                     }
2005                 }
2006               else
2007                 {
2008                   /* The following should have been checked by validate...  */
2009                   if (c1 == -1 || c2 == -1)
2010                     break;
2011                   xlate[c1] = c2;
2012                 }
2013             }
2014           assert (c1 == -1 || truncate_set1);
2015         }
2016       if (squeeze_repeats)
2017         {
2018           set_initialize (s2, 0, in_squeeze_set);
2019           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
2020         }
2021       else
2022         {
2023           long chars_read;
2024
2025           do
2026             {
2027               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2028               if (chars_read > 0
2029                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2030                 error (EXIT_FAILURE, errno, _("write error"));
2031             }
2032           while (chars_read > 0);
2033         }
2034     }
2035
2036   if (fclose (stdout) == EOF)
2037     error (EXIT_FAILURE, errno, _("write error"));
2038
2039   if (close (0) != 0)
2040     error (EXIT_FAILURE, errno, _("standard input"));
2041
2042   exit (EXIT_SUCCESS);
2043 }