src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 1995-1998, 1999 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <assert.h>
  24 #include <errno.h>
  25 #include <sys/types.h>
  26 #include <getopt.h>
  27
  28 #include "system.h"
  29 #include "error.h"
  30 #include "safe-read.h"
  31
  32 /* The official name of this program (e.g., no `g' prefix).  */
  33 #define PROGRAM_NAME "tr"
  34
  35 #define AUTHORS "Jim Meyering"
  36
  37 #define N_CHARS (UCHAR_MAX + 1)
  38
  39 /* A pointer to a function that returns an int.  */
  40 typedef int (*PFI) ();
  41
  42 /* Convert from character C to its index in the collating
  43    sequence array.  Just cast to an unsigned int to avoid
  44    problems with sign-extension.  */
  45 #define ORD(c) (unsigned int)(c)
  46
  47 /* The inverse of ORD.  */
  48 #define CHR(i) (unsigned char)(i)
  49
  50 /* The value for Spec_list->state that indicates to
  51    get_next that it should initialize the tail pointer.
  52    Its value should be as large as possible to avoid conflict
  53    a valid value for the state field -- and that may be as
  54    large as any valid repeat_count.  */
  55 #define BEGIN_STATE (INT_MAX - 1)
  56
  57 /* The value for Spec_list->state that indicates to
  58    get_next that the element pointed to by Spec_list->tail is
  59    being considered for the first time on this pass through the
  60    list -- it indicates that get_next should make any necessary
  61    initializations.  */
  62 #define NEW_ELEMENT (BEGIN_STATE + 1)
  63
  64 /* A value distinct from any character that may have been stored in a
  65    buffer as the result of a block-read in the function squeeze_filter.  */
  66 #define NOT_A_CHAR (unsigned int)(-1)
  67
  68 /* The following (but not CC_NO_CLASS) are indices into the array of
  69    valid character class strings.  */
  70 enum Char_class
  71   {
  72     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  73     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  74     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  75     CC_NO_CLASS = 9999
  76   };
  77
  78 /* Character class to which a character (returned by get_next) belonged;
  79    but it is set only if the construct from which the character was obtained
  80    was one of the character classes [:upper:] or [:lower:].  The value
  81    is used only when translating and then, only to make sure that upper
  82    and lower class constructs have the same relative positions in string1
  83    and string2.  */
  84 enum Upper_Lower_class
  85   {
  86     UL_LOWER = 0,
  87     UL_UPPER = 1,
  88     UL_NONE = 2
  89   };
  90
  91 /* A shortcut to ensure that when constructing the translation array,
  92    one of the values returned by paired calls to get_next (from s1 and s2)
  93    is from [:upper:] and the other is from [:lower:], or neither is from
  94    upper or lower.  By default, GNU tr permits the identity mappings: from
  95    [:upper:] to [:upper:] and [:lower:] to [:lower:].  But when
  96    POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
  97    by values of type enum Upper_Lower_class.  */
  98 static int const class_ok[3][3] =
  99 {
 100   {1, 1, 0},
 101   {1, 1, 0},
 102   {0, 0, 1}
 103 };
 104
 105 /* The type of a List_element.  See build_spec_list for more details.  */
 106 enum Range_element_type
 107   {
 108     RE_NO_TYPE = 0,
 109     RE_NORMAL_CHAR,
 110     RE_RANGE,
 111     RE_CHAR_CLASS,
 112     RE_EQUIV_CLASS,
 113     RE_REPEATED_CHAR
 114   };
 115
 116 /* One construct in one of tr's argument strings.
 117    For example, consider the POSIX version of the classic tr command:
 118        tr -cs 'a-zA-Z_' '[\n*]'
 119    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 120    and a single normal character, `_'.  String2 has one construct.  */
 121 struct List_element
 122   {
 123     enum Range_element_type type;
 124     struct List_element *next;
 125     union
 126       {
 127         int normal_char;
 128         struct                  /* unnamed */
 129           {
 130             unsigned int first_char;
 131             unsigned int last_char;
 132           }
 133         range;
 134         enum Char_class char_class;
 135         int equiv_code;
 136         struct                  /* unnamed */
 137           {
 138             unsigned int the_repeated_char;
 139             size_t repeat_count;
 140           }
 141         repeated_char;
 142       }
 143     u;
 144   };
 145
 146 /* Each of tr's argument strings is parsed into a form that is easier
 147    to work with: a linked list of constructs (struct List_element).
 148    Each Spec_list structure also encapsulates various attributes of
 149    the corresponding argument string.  The attributes are used mainly
 150    to verify that the strings are valid in the context of any options
 151    specified (like -s, -d, or -c).  The main exception is the member
 152    `tail', which is first used to construct the list.  After construction,
 153    it is used by get_next to save its state when traversing the list.
 154    The member `state' serves a similar function.  */
 155 struct Spec_list
 156   {
 157     /* Points to the head of the list of range elements.
 158        The first struct is a dummy; its members are never used.  */
 159     struct List_element *head;
 160
 161     /* When appending, points to the last element.  When traversing via
 162        get_next(), points to the element to process next.  Setting
 163        Spec_list.state to the value BEGIN_STATE before calling get_next
 164        signals get_next to initialize tail to point to head->next.  */
 165     struct List_element *tail;
 166
 167     /* Used to save state between calls to get_next().  */
 168     unsigned int state;
 169
 170     /* Length, in the sense that length ('a-z[:digit:]123abc')
 171        is 42 ( = 26 + 10 + 6).  */
 172     size_t length;
 173
 174     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 175     int n_indefinite_repeats;
 176
 177     /* If n_indefinite_repeats is nonzero, this points to the List_element
 178        corresponding to the last [c*] or [c*0] construct encountered in
 179        this spec.  Otherwise it is undefined.  */
 180     struct List_element *indefinite_repeat_element;
 181
 182     /* Non-zero if this spec contains at least one equivalence
 183        class construct e.g. [=c=].  */
 184     int has_equiv_class;
 185
 186     /* Non-zero if this spec contains at least one character class
 187        construct.  E.g. [:digit:].  */
 188     int has_char_class;
 189
 190     /* Non-zero if this spec contains at least one of the character class
 191        constructs (all but upper and lower) that aren't allowed in s2.  */
 192     int has_restricted_char_class;
 193   };
 194
 195 /* A representation for escaped string1 or string2.  As a string is parsed,
 196    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 197    etc.) are marked as such in this structure by setting the corresponding
 198    entry in the ESCAPED vector.  */
 199 struct E_string
 200 {
 201   unsigned char *s;
 202   int *escaped;
 203   size_t len;
 204 };
 205
 206 /* Return nonzero if the Ith character of escaped string ES matches C
 207    and is not escaped itself.  */
 208 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 209
 210 /* The name by which this program was run.  */
 211 char *program_name;
 212
 213 /* When nonzero, each sequence in the input of a repeated character
 214    (call it c) is replaced (in the output) by a single occurrence of c
 215    for every c in the squeeze set.  */
 216 static int squeeze_repeats = 0;
 217
 218 /* When nonzero, removes characters in the delete set from input.  */
 219 static int delete = 0;
 220
 221 /* Use the complement of set1 in place of set1.  */
 222 static int complement = 0;
 223
 224 /* When nonzero, this flag causes GNU tr to provide strict
 225    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 226    says that when -d is used without -s, string2 (if present)
 227    must be ignored.  Silently ignoring arguments is a bad idea.
 228    The default GNU behavior is to give a usage message and exit.
 229    Additionally, when this flag is nonzero, tr prints warnings
 230    on stderr if it is being used in a manner that is not portable.
 231    Applicable warnings are given by default, but are suppressed
 232    if the environment variable `POSIXLY_CORRECT' is set, since
 233    being POSIX conformant means we can't issue such messages.
 234    Warnings on the following topics are suppressed when this
 235    variable is nonzero:
 236    1. Ambiguous octal escapes.  */
 237 static int posix_pedantic;
 238
 239 /* When tr is performing translation and string1 is longer than string2,
 240    POSIX says that the result is undefined.  That gives the implementor
 241    of a POSIX conforming version of tr two reasonable choices for the
 242    semantics of this case.
 243
 244    * The BSD tr pads string2 to the length of string1 by
 245    repeating the last character in string2.
 246
 247    * System V tr ignores characters in string1 that have no
 248    corresponding character in string2.  That is, string1 is effectively
 249    truncated to the length of string2.
 250
 251    When nonzero, this flag causes GNU tr to imitate the behavior
 252    of System V tr when translating with string1 longer than string2.
 253    The default is to emulate BSD tr.  This flag is ignored in modes where
 254    no translation is performed.  Emulating the System V tr
 255    in this exceptional case causes the relatively common BSD idiom:
 256
 257        tr -cs A-Za-z0-9 '\012'
 258
 259    to break (it would convert only zero bytes, rather than all
 260    non-alphanumerics, to newlines).
 261
 262    WARNING: This switch does not provide general BSD or System V
 263    compatibility.  For example, it doesn't disable the interpretation
 264    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 265    some unfortunate coincidence you use such constructs in scripts
 266    expecting to use some other version of tr, the scripts will break.  */
 267 static int truncate_set1 = 0;
 268
 269 /* An alias for (!delete && non_option_args == 2).
 270    It is set in main and used there and in validate().  */
 271 static int translating;
 272
 273 #ifndef BUFSIZ
 274 # define BUFSIZ 8192
 275 #endif
 276
 277 #define IO_BUF_SIZE BUFSIZ
 278 static unsigned char io_buf[IO_BUF_SIZE];
 279
 280 static char const *const char_class_name[] =
 281 {
 282   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 283   "lower", "print", "punct", "space", "upper", "xdigit"
 284 };
 285 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 286
 287 typedef char SET_TYPE;
 288
 289 /* Array of boolean values.  A character `c' is a member of the
 290    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 291    set is defined by the last (possibly, the only) string argument
 292    on the command line when the squeeze option is given.  */
 293 static SET_TYPE in_squeeze_set[N_CHARS];
 294
 295 /* Array of boolean values.  A character `c' is a member of the
 296    delete set if and only if in_delete_set[c] is true.  The delete
 297    set is defined by the first (or only) string argument on the
 298    command line when the delete option is given.  */
 299 static SET_TYPE in_delete_set[N_CHARS];
 300
 301 /* Array of character values defining the translation (if any) that
 302    tr is to perform.  Translation is performed only when there are
 303    two specification strings and the delete switch is not given.  */
 304 static char xlate[N_CHARS];
 305
 306 static struct option const long_options[] =
 307 {
 308   {"complement", no_argument, NULL, 'c'},
 309   {"delete", no_argument, NULL, 'd'},
 310   {"squeeze-repeats", no_argument, NULL, 's'},
 311   {"truncate-set1", no_argument, NULL, 't'},
 312   {GETOPT_HELP_OPTION_DECL},
 313   {GETOPT_VERSION_OPTION_DECL},
 314   {NULL, 0, NULL, 0}
 315 };
 316 \f
 317 void
 318 usage (int status)
 319 {
 320   if (status != 0)
 321     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 322              program_name);
 323   else
 324     {
 325       printf (_("\
 326 Usage: %s [OPTION]... SET1 [SET2]\n\
 327 "),
 328               program_name);
 329       printf (_("\
 330 Translate, squeeze, and/or delete characters from standard input,\n\
 331 writing to standard output.\n\
 332 \n\
 333   -c, --complement        first complement SET1\n\
 334   -d, --delete            delete characters in SET1, do not translate\n\
 335   -s, --squeeze-repeats   replace sequence of characters with one\n\
 336   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 337       --help              display this help and exit\n\
 338       --version           output version information and exit\n\
 339 "));
 340       printf (_("\
 341 \n\
 342 SETs are specified as strings of characters.  Most represent themselves.\n\
 343 Interpreted sequences are:\n\
 344 \n\
 345   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 346   \\\\              backslash\n\
 347   \\a              audible BEL\n\
 348   \\b              backspace\n\
 349   \\f              form feed\n\
 350   \\n              new line\n\
 351   \\r              return\n\
 352   \\t              horizontal tab\n\
 353   \\v              vertical tab\n\
 354   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 355   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 356   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 357   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 358   [:alnum:]       all letters and digits\n\
 359   [:alpha:]       all letters\n\
 360   [:blank:]       all horizontal whitespace\n\
 361   [:cntrl:]       all control characters\n\
 362   [:digit:]       all digits\n\
 363   [:graph:]       all printable characters, not including space\n\
 364   [:lower:]       all lower case letters\n\
 365   [:print:]       all printable characters, including space\n\
 366   [:punct:]       all punctuation characters\n\
 367   [:space:]       all horizontal or vertical whitespace\n\
 368   [:upper:]       all upper case letters\n\
 369   [:xdigit:]      all hexadecimal digits\n\
 370   [=CHAR=]        all characters which are equivalent to CHAR\n\
 371 "));
 372       printf (_("\
 373 \n\
 374 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 375 -t may be used only when translating.  SET2 is extended to length of\n\
 376 SET1 by repeating its last character as necessary.  Excess characters\n\
 377 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 378 expand in ascending order; used in SET2 while translating, they may\n\
 379 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 380 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 381 translation or deletion.\n\
 382 "));
 383       puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
 384     }
 385   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 386 }
 387
 388 /* Return nonzero if the character C is a member of the
 389    equivalence class containing the character EQUIV_CLASS.  */
 390
 391 static int
 392 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 393 {
 394   return (equiv_class == c);
 395 }
 396
 397 /* Return nonzero if the character C is a member of the
 398    character class CHAR_CLASS.  */
 399
 400 static int
 401 is_char_class_member (enum Char_class char_class, unsigned int c)
 402 {
 403   int result;
 404
 405   switch (char_class)
 406     {
 407     case CC_ALNUM:
 408       result = ISALNUM (c);
 409       break;
 410     case CC_ALPHA:
 411       result = ISALPHA (c);
 412       break;
 413     case CC_BLANK:
 414       result = ISBLANK (c);
 415       break;
 416     case CC_CNTRL:
 417       result = ISCNTRL (c);
 418       break;
 419     case CC_DIGIT:
 420       result = ISDIGIT_LOCALE (c);
 421       break;
 422     case CC_GRAPH:
 423       result = ISGRAPH (c);
 424       break;
 425     case CC_LOWER:
 426       result = ISLOWER (c);
 427       break;
 428     case CC_PRINT:
 429       result = ISPRINT (c);
 430       break;
 431     case CC_PUNCT:
 432       result = ISPUNCT (c);
 433       break;
 434     case CC_SPACE:
 435       result = ISSPACE (c);
 436       break;
 437     case CC_UPPER:
 438       result = ISUPPER (c);
 439       break;
 440     case CC_XDIGIT:
 441       result = ISXDIGIT (c);
 442       break;
 443     default:
 444       abort ();
 445       break;
 446     }
 447   return result;
 448 }
 449
 450 static void
 451 es_free (struct E_string *es)
 452 {
 453   free (es->s);
 454   free (es->escaped);
 455 }
 456
 457 /* Perform the first pass over each range-spec argument S, converting all
 458    \c and \ddd escapes to their one-byte representations.  The conversion
 459    is done in-place, so S must point to writable storage.  If an invalid
 460    quote sequence is found print an error message and return nonzero.
 461    Otherwise set *LEN to the length of the resulting string and return
 462    zero.  The resulting array of characters may contain zero-bytes;
 463    however, on input, S is assumed to be null-terminated, and hence
 464    cannot contain actual (non-escaped) zero bytes.  */
 465
 466 static int
 467 unquote (const unsigned char *s, struct E_string *es)
 468 {
 469   size_t i, j;
 470   size_t len;
 471
 472   len = strlen ((char *) s);
 473
 474   es->s = (unsigned char *) xmalloc (len);
 475   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 476   for (i = 0; i < len; i++)
 477     es->escaped[i] = 0;
 478
 479   j = 0;
 480   for (i = 0; s[i]; i++)
 481     {
 482       switch (s[i])
 483         {
 484           int c;
 485         case '\\':
 486           switch (s[i + 1])
 487             {
 488               int oct_digit;
 489             case '\\':
 490               c = '\\';
 491               break;
 492             case 'a':
 493               c = '\007';
 494               break;
 495             case 'b':
 496               c = '\b';
 497               break;
 498             case 'f':
 499               c = '\f';
 500               break;
 501             case 'n':
 502               c = '\n';
 503               break;
 504             case 'r':
 505               c = '\r';
 506               break;
 507             case 't':
 508               c = '\t';
 509               break;
 510             case 'v':
 511               c = '\v';
 512               break;
 513             case '0':
 514             case '1':
 515             case '2':
 516             case '3':
 517             case '4':
 518             case '5':
 519             case '6':
 520             case '7':
 521               c = s[i + 1] - '0';
 522               oct_digit = s[i + 2] - '0';
 523               if (0 <= oct_digit && oct_digit <= 7)
 524                 {
 525                   c = 8 * c + oct_digit;
 526                   ++i;
 527                   oct_digit = s[i + 2] - '0';
 528                   if (0 <= oct_digit && oct_digit <= 7)
 529                     {
 530                       if (8 * c + oct_digit < N_CHARS)
 531                         {
 532                           c = 8 * c + oct_digit;
 533                           ++i;
 534                         }
 535                       else if (!posix_pedantic)
 536                         {
 537                           /* Any octal number larger than 0377 won't
 538                              fit in 8 bits.  So we stop when adding the
 539                              next digit would put us over the limit and
 540                              give a warning about the ambiguity.  POSIX
 541                              isn't clear on this, but one person has said
 542                              that in his interpretation, POSIX says tr
 543                              can't even give a warning.  */
 544                           error (0, 0, _("warning: the ambiguous octal escape \
 545 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 546                                  s[i], s[i + 1], s[i + 2],
 547                                  s[i], s[i + 1], s[i + 2]);
 548                         }
 549                     }
 550                 }
 551               break;
 552             case '\0':
 553               error (0, 0, _("invalid backslash escape at end of string"));
 554               return 1;
 555
 556             default:
 557               if (posix_pedantic)
 558                 {
 559                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 560                   return 1;
 561                 }
 562               else
 563                 {
 564                   c = s[i + 1];
 565                   es->escaped[j] = 1;
 566                 }
 567             }
 568           ++i;
 569           es->s[j++] = c;
 570           break;
 571         default:
 572           es->s[j++] = s[i];
 573           break;
 574         }
 575     }
 576   es->len = j;
 577   return 0;
 578 }
 579
 580 /* If CLASS_STR is a valid character class string, return its index
 581    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 582
 583 static enum Char_class
 584 look_up_char_class (const unsigned char *class_str, size_t len)
 585 {
 586   unsigned int i;
 587
 588   for (i = 0; i < N_CHAR_CLASSES; i++)
 589     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 590         && strlen (char_class_name[i]) == len)
 591       return (enum Char_class) i;
 592   return CC_NO_CLASS;
 593 }
 594
 595 /* Return a newly allocated string with a printable version of C.
 596    This function is used solely for formatting error messages.  */
 597
 598 static char *
 599 make_printable_char (unsigned int c)
 600 {
 601   char *buf = xmalloc (5);
 602
 603   assert (c < N_CHARS);
 604   if (ISPRINT (c))
 605     {
 606       buf[0] = c;
 607       buf[1] = '\0';
 608     }
 609   else
 610     {
 611       sprintf (buf, "\\%03o", c);
 612     }
 613   return buf;
 614 }
 615
 616 /* Return a newly allocated copy of S which is suitable for printing.
 617    LEN is the number of characters in S.  Most non-printing
 618    (isprint) characters are represented by a backslash followed by
 619    3 octal digits.  However, the characters represented by \c escapes
 620    where c is one of [abfnrtv] are represented by their 2-character \c
 621    sequences.  This function is used solely for printing error messages.  */
 622
 623 static char *
 624 make_printable_str (const unsigned char *s, size_t len)
 625 {
 626   /* Worst case is that every character expands to a backslash
 627      followed by a 3-character octal escape sequence.  */
 628   char *printable_buf = xmalloc (4 * len + 1);
 629   char *p = printable_buf;
 630   size_t i;
 631
 632   for (i = 0; i < len; i++)
 633     {
 634       char buf[5];
 635       char *tmp = NULL;
 636
 637       switch (s[i])
 638         {
 639         case '\\':
 640           tmp = "\\";
 641           break;
 642         case '\007':
 643           tmp = "\\a";
 644           break;
 645         case '\b':
 646           tmp = "\\b";
 647           break;
 648         case '\f':
 649           tmp = "\\f";
 650           break;
 651         case '\n':
 652           tmp = "\\n";
 653           break;
 654         case '\r':
 655           tmp = "\\r";
 656           break;
 657         case '\t':
 658           tmp = "\\t";
 659           break;
 660         case '\v':
 661           tmp = "\\v";
 662           break;
 663         default:
 664           if (ISPRINT (s[i]))
 665             {
 666               buf[0] = s[i];
 667               buf[1] = '\0';
 668             }
 669           else
 670             sprintf (buf, "\\%03o", s[i]);
 671           tmp = buf;
 672           break;
 673         }
 674       p = stpcpy (p, tmp);
 675     }
 676   return printable_buf;
 677 }
 678
 679 /* Append a newly allocated structure representing a
 680    character C to the specification list LIST.  */
 681
 682 static void
 683 append_normal_char (struct Spec_list *list, unsigned int c)
 684 {
 685   struct List_element *new;
 686
 687   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 688   new->next = NULL;
 689   new->type = RE_NORMAL_CHAR;
 690   new->u.normal_char = c;
 691   assert (list->tail);
 692   list->tail->next = new;
 693   list->tail = new;
 694 }
 695
 696 /* Append a newly allocated structure representing the range
 697    of characters from FIRST to LAST to the specification list LIST.
 698    Return nonzero if LAST precedes FIRST in the collating sequence,
 699    zero otherwise.  This means that '[c-c]' is acceptable.  */
 700
 701 static int
 702 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 703 {
 704   struct List_element *new;
 705
 706   if (ORD (first) > ORD (last))
 707     {
 708       char *tmp1 = make_printable_char (first);
 709       char *tmp2 = make_printable_char (last);
 710
 711       error (0, 0,
 712        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 713              tmp1, tmp2);
 714       free (tmp1);
 715       free (tmp2);
 716       return 1;
 717     }
 718   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 719   new->next = NULL;
 720   new->type = RE_RANGE;
 721   new->u.range.first_char = first;
 722   new->u.range.last_char = last;
 723   assert (list->tail);
 724   list->tail->next = new;
 725   list->tail = new;
 726   return 0;
 727 }
 728
 729 /* If CHAR_CLASS_STR is a valid character class string, append a
 730    newly allocated structure representing that character class to the end
 731    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 732    a valid string return nonzero.  */
 733
 734 static int
 735 append_char_class (struct Spec_list *list,
 736                    const unsigned char *char_class_str, size_t len)
 737 {
 738   enum Char_class char_class;
 739   struct List_element *new;
 740
 741   char_class = look_up_char_class (char_class_str, len);
 742   if (char_class == CC_NO_CLASS)
 743     return 1;
 744   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 745   new->next = NULL;
 746   new->type = RE_CHAR_CLASS;
 747   new->u.char_class = char_class;
 748   assert (list->tail);
 749   list->tail->next = new;
 750   list->tail = new;
 751   return 0;
 752 }
 753
 754 /* Append a newly allocated structure representing a [c*n]
 755    repeated character construct to the specification list LIST.
 756    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 757    is a non-negative repeat count.  */
 758
 759 static void
 760 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 761                       size_t repeat_count)
 762 {
 763   struct List_element *new;
 764
 765   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 766   new->next = NULL;
 767   new->type = RE_REPEATED_CHAR;
 768   new->u.repeated_char.the_repeated_char = the_char;
 769   new->u.repeated_char.repeat_count = repeat_count;
 770   assert (list->tail);
 771   list->tail->next = new;
 772   list->tail = new;
 773 }
 774
 775 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 776    the length of that string, LEN, if LEN is exactly one, append
 777    a newly allocated structure representing the specified
 778    equivalence class to the specification list, LIST and return zero.
 779    If LEN is not 1, return nonzero.  */
 780
 781 static int
 782 append_equiv_class (struct Spec_list *list,
 783                     const unsigned char *equiv_class_str, size_t len)
 784 {
 785   struct List_element *new;
 786
 787   if (len != 1)
 788     return 1;
 789   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 790   new->next = NULL;
 791   new->type = RE_EQUIV_CLASS;
 792   new->u.equiv_code = *equiv_class_str;
 793   assert (list->tail);
 794   list->tail->next = new;
 795   list->tail = new;
 796   return 0;
 797 }
 798
 799 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 800    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 801    NUL bytes, and is *not* NUL-terminated.  */
 802
 803 static unsigned char *
 804 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
 805 {
 806   size_t len;
 807   unsigned char *tmp;
 808
 809   assert (first_idx <= last_idx);
 810   len = last_idx - first_idx + 1;
 811   tmp = (unsigned char *) xmalloc (len);
 812
 813   assert (first_idx <= last_idx);
 814   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 815   memcpy (tmp, p + first_idx, len);
 816   return tmp;
 817 }
 818
 819 /* Search forward starting at START_IDX for the 2-char sequence
 820    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 821    a sequence is found, set *RESULT_IDX to the index of the first
 822    character and return nonzero. Otherwise return zero.  P may contain
 823    zero bytes.  */
 824
 825 static int
 826 find_closing_delim (const struct E_string *es, size_t start_idx,
 827                     unsigned int pre_bracket_char, size_t *result_idx)
 828 {
 829   size_t i;
 830
 831   for (i = start_idx; i < es->len - 1; i++)
 832     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 833         && !es->escaped[i] && !es->escaped[i + 1])
 834       {
 835         *result_idx = i;
 836         return 1;
 837       }
 838   return 0;
 839 }
 840
 841 /* Convert a string S with explicit length LEN, possibly
 842    containing embedded zero bytes, to a long integer value.
 843    If the string represents a negative value, a value larger
 844    than LONG_MAX, or if all LEN characters do not represent a
 845    valid integer, return nonzero and do not modify *VAL.
 846    Otherwise, return zero and set *VAL to the converted value.  */
 847
 848 static int
 849 non_neg_strtol (const unsigned char *s, size_t len, size_t *val)
 850 {
 851   size_t i;
 852   unsigned long sum = 0;
 853   unsigned int base;
 854
 855   if (len <= 0)
 856     return 1;
 857   if (s[0] == '0')
 858     base = 8;
 859   else if (ISDIGIT (s[0]))
 860     base = 10;
 861   else
 862     return 1;
 863
 864   for (i = 0; i < len; i++)
 865     {
 866       unsigned int c;
 867
 868       if (s[i] < '0')
 869         return 1;
 870
 871       c = s[i] - '0';
 872       if (c >= base)
 873         return 1;
 874
 875       if (sum > (LONG_MAX - c) / base)
 876         return 1;
 877       sum = sum * base + c;
 878     }
 879   *val = sum;
 880   return 0;
 881 }
 882
 883 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 884    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 885    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 886    and return zero. If the second character following
 887    the opening bracket is not `*' or if no closing bracket can be
 888    found, return -1.  If a closing bracket is found and the
 889    second char is `*', but the string between the `*' and `]' isn't
 890    empty, an octal number, or a decimal number, print an error message
 891    and return -2.  */
 892
 893 static int
 894 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 895                        unsigned int *char_to_repeat, size_t *repeat_count,
 896                        size_t *closing_bracket_idx)
 897 {
 898   size_t i;
 899
 900   assert (start_idx + 1 < es->len);
 901   if (!ES_MATCH (es, start_idx + 1, '*'))
 902     return -1;
 903
 904   for (i = start_idx + 2; i < es->len; i++)
 905     {
 906       if (ES_MATCH (es, i, ']'))
 907         {
 908           const unsigned char *digit_str;
 909           size_t digit_str_len = i - start_idx - 2;
 910
 911           *char_to_repeat = es->s[start_idx];
 912           if (digit_str_len == 0)
 913             {
 914               /* We've matched [c*] -- no explicit repeat count.  */
 915               *repeat_count = 0;
 916               *closing_bracket_idx = i;
 917               return 0;
 918             }
 919
 920           /* Here, we have found [c*s] where s should be a string
 921              of octal or decimal digits.  */
 922           digit_str = &es->s[start_idx + 2];
 923           if (non_neg_strtol (digit_str, digit_str_len, repeat_count)
 924               || *repeat_count > BEGIN_STATE)
 925             {
 926               char *tmp = make_printable_str (digit_str, digit_str_len);
 927               error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 928                      tmp);
 929               free (tmp);
 930               return -2;
 931             }
 932           *closing_bracket_idx = i;
 933           return 0;
 934         }
 935     }
 936   return -1;                    /* No bracket found.  */
 937 }
 938
 939 /* Return nonzero if the string at ES->s[IDX] matches the regular
 940    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 941    the `]' must not be escaped.  */
 942
 943 static int
 944 star_digits_closebracket (const struct E_string *es, size_t idx)
 945 {
 946   size_t i;
 947
 948   if (!ES_MATCH (es, idx, '*'))
 949     return 0;
 950
 951   for (i = idx + 1; i < es->len; i++)
 952     {
 953       if (!ISDIGIT (es->s[i]))
 954         {
 955           if (ES_MATCH (es, i, ']'))
 956             return 1;
 957           return 0;
 958         }
 959     }
 960   return 0;
 961 }
 962
 963 /* Convert string UNESACPED_STRING (which has been preprocessed to
 964    convert backslash-escape sequences) of length LEN characters into
 965    a linked list of the following 5 types of constructs:
 966       - [:str:] Character class where `str' is one of the 12 valid strings.
 967       - [=c=] Equivalence class where `c' is any single character.
 968       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 969           However, if `n' is present, it must be a non-negative octal or
 970           decimal integer.
 971       - r-s Range of characters from `r' to `s'.  The second endpoint must
 972           not precede the first in the current collating sequence.
 973       - c Any other character is interpreted as itself.  */
 974
 975 static int
 976 build_spec_list (const struct E_string *es, struct Spec_list *result)
 977 {
 978   const unsigned char *p;
 979   size_t i;
 980
 981   p = es->s;
 982
 983   /* The main for-loop below recognizes the 4 multi-character constructs.
 984      A character that matches (in its context) none of the multi-character
 985      constructs is classified as `normal'.  Since all multi-character
 986      constructs have at least 3 characters, any strings of length 2 or
 987      less are composed solely of normal characters.  Hence, the index of
 988      the outer for-loop runs only as far as LEN-2.  */
 989
 990   for (i = 0; i + 2 < es->len; /* empty */)
 991     {
 992       if (ES_MATCH (es, i, '['))
 993         {
 994           int matched_multi_char_construct;
 995           size_t closing_bracket_idx;
 996           unsigned int char_to_repeat;
 997           size_t repeat_count;
 998           int err;
 999
1000           matched_multi_char_construct = 1;
1001           if (ES_MATCH (es, i + 1, ':')
1002               || ES_MATCH (es, i + 1, '='))
1003             {
1004               size_t closing_delim_idx;
1005               int found;
1006
1007               found = find_closing_delim (es, i + 2, p[i + 1],
1008                                           &closing_delim_idx);
1009               if (found)
1010                 {
1011                   int parse_failed;
1012                   unsigned char *opnd_str = substr (p, i + 2,
1013                                                     closing_delim_idx - 1);
1014                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
1015
1016                   if (p[i + 1] == ':')
1017                     {
1018                       parse_failed = append_char_class (result, opnd_str,
1019                                                         opnd_str_len);
1020
1021                       /* FIXME: big comment.  */
1022                       if (parse_failed)
1023                         {
1024                           if (star_digits_closebracket (es, i + 2))
1025                             {
1026                               free (opnd_str);
1027                               goto try_bracketed_repeat;
1028                             }
1029                           else
1030                             {
1031                               char *tmp = make_printable_str (opnd_str,
1032                                                               opnd_str_len);
1033                               error (0, 0, _("invalid character class `%s'"),
1034                                      tmp);
1035                               free (tmp);
1036                               return 1;
1037                             }
1038                         }
1039                     }
1040                   else
1041                     {
1042                       parse_failed = append_equiv_class (result, opnd_str,
1043                                                          opnd_str_len);
1044
1045                       /* FIXME: big comment.  */
1046                       if (parse_failed)
1047                         {
1048                           if (star_digits_closebracket (es, i + 2))
1049                             {
1050                               free (opnd_str);
1051                               goto try_bracketed_repeat;
1052                             }
1053                           else
1054                             {
1055                               char *tmp = make_printable_str (opnd_str,
1056                                                               opnd_str_len);
1057                               error (0, 0,
1058                _("%s: equivalence class operand must be a single character"),
1059                                      tmp);
1060                               free (tmp);
1061                               return 1;
1062                             }
1063                         }
1064                     }
1065                   free (opnd_str);
1066
1067                   /* Return nonzero if append_*_class reports a problem.  */
1068                   if (parse_failed)
1069                     return 1;
1070                   else
1071                     i = closing_delim_idx + 2;
1072                   continue;
1073                 }
1074               /* Else fall through.  This could be [:*] or [=*].  */
1075             }
1076
1077         try_bracketed_repeat:
1078
1079           /* Determine whether this is a bracketed repeat range
1080              matching the RE \[.\*(dec_or_oct_number)?\].  */
1081           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1082                                        &repeat_count,
1083                                        &closing_bracket_idx);
1084           if (err == 0)
1085             {
1086               append_repeated_char (result, char_to_repeat, repeat_count);
1087               i = closing_bracket_idx + 1;
1088             }
1089           else if (err == -1)
1090             {
1091               matched_multi_char_construct = 0;
1092             }
1093           else
1094             {
1095               /* Found a string that looked like [c*n] but the
1096                  numeric part was invalid.  */
1097               return 1;
1098             }
1099
1100           if (matched_multi_char_construct)
1101             continue;
1102
1103           /* We reach this point if P does not match [:str:], [=c=],
1104              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1105              (from `[' to `c').  */
1106         }
1107
1108       /* Look ahead one char for ranges like a-z.  */
1109       if (ES_MATCH (es, i + 1, '-'))
1110         {
1111           if (append_range (result, p[i], p[i + 2]))
1112             return 1;
1113           i += 3;
1114         }
1115       else
1116         {
1117           append_normal_char (result, p[i]);
1118           ++i;
1119         }
1120     }
1121
1122   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1123   for (; i < es->len; i++)
1124     append_normal_char (result, p[i]);
1125
1126   return 0;
1127 }
1128
1129 /* Given a Spec_list S (with its saved state implicit in the values
1130    of its members `tail' and `state'), return the next single character
1131    in the expansion of S's constructs.  If the last character of S was
1132    returned on the previous call or if S was empty, this function
1133    returns -1.  For example, successive calls to get_next where S
1134    represents the spec-string 'a-d[y*3]' will return the sequence
1135    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1136    which the returned character comes is [:upper:] or [:lower:], the
1137    parameter CLASS is given a value to indicate which it was.  Otherwise
1138    CLASS is set to UL_NONE.  This value is used only when constructing
1139    the translation table to verify that any occurrences of upper and
1140    lower class constructs in the spec-strings appear in the same relative
1141    positions.  */
1142
1143 static int
1144 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1145 {
1146   struct List_element *p;
1147   int return_val;
1148   int i;
1149
1150   if (class)
1151     *class = UL_NONE;
1152
1153   if (s->state == BEGIN_STATE)
1154     {
1155       s->tail = s->head->next;
1156       s->state = NEW_ELEMENT;
1157     }
1158
1159   p = s->tail;
1160   if (p == NULL)
1161     return -1;
1162
1163   switch (p->type)
1164     {
1165     case RE_NORMAL_CHAR:
1166       return_val = p->u.normal_char;
1167       s->state = NEW_ELEMENT;
1168       s->tail = p->next;
1169       break;
1170
1171     case RE_RANGE:
1172       if (s->state == NEW_ELEMENT)
1173         s->state = ORD (p->u.range.first_char);
1174       else
1175         ++(s->state);
1176       return_val = CHR (s->state);
1177       if (s->state == ORD (p->u.range.last_char))
1178         {
1179           s->tail = p->next;
1180           s->state = NEW_ELEMENT;
1181         }
1182       break;
1183
1184     case RE_CHAR_CLASS:
1185       if (class)
1186         {
1187           int upper_or_lower;
1188           switch (p->u.char_class)
1189             {
1190             case CC_LOWER:
1191               *class = UL_LOWER;
1192               upper_or_lower = 1;
1193               break;
1194             case CC_UPPER:
1195               *class = UL_UPPER;
1196               upper_or_lower = 1;
1197               break;
1198             default:
1199               upper_or_lower = 0;
1200               break;
1201             }
1202
1203           if (upper_or_lower)
1204             {
1205               s->tail = p->next;
1206               s->state = NEW_ELEMENT;
1207               return_val = 0;
1208               break;
1209             }
1210         }
1211
1212       if (s->state == NEW_ELEMENT)
1213         {
1214           for (i = 0; i < N_CHARS; i++)
1215             if (is_char_class_member (p->u.char_class, i))
1216               break;
1217           assert (i < N_CHARS);
1218           s->state = i;
1219         }
1220       assert (is_char_class_member (p->u.char_class, s->state));
1221       return_val = CHR (s->state);
1222       for (i = s->state + 1; i < N_CHARS; i++)
1223         if (is_char_class_member (p->u.char_class, i))
1224           break;
1225       if (i < N_CHARS)
1226         s->state = i;
1227       else
1228         {
1229           s->tail = p->next;
1230           s->state = NEW_ELEMENT;
1231         }
1232       break;
1233
1234     case RE_EQUIV_CLASS:
1235       /* FIXME: this assumes that each character is alone in its own
1236          equivalence class (which appears to be correct for my
1237          LC_COLLATE.  But I don't know of any function that allows
1238          one to determine a character's equivalence class.  */
1239
1240       return_val = p->u.equiv_code;
1241       s->state = NEW_ELEMENT;
1242       s->tail = p->next;
1243       break;
1244
1245     case RE_REPEATED_CHAR:
1246       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1247       if (p->u.repeated_char.repeat_count == 0)
1248         {
1249           s->tail = p->next;
1250           s->state = NEW_ELEMENT;
1251           return_val = get_next (s, class);
1252         }
1253       else
1254         {
1255           if (s->state == NEW_ELEMENT)
1256             {
1257               s->state = 0;
1258             }
1259           ++(s->state);
1260           return_val = p->u.repeated_char.the_repeated_char;
1261           if (p->u.repeated_char.repeat_count > 0
1262               && s->state == p->u.repeated_char.repeat_count)
1263             {
1264               s->tail = p->next;
1265               s->state = NEW_ELEMENT;
1266             }
1267         }
1268       break;
1269
1270     case RE_NO_TYPE:
1271       abort ();
1272       break;
1273
1274     default:
1275       abort ();
1276       break;
1277     }
1278
1279   return return_val;
1280 }
1281
1282 /* This is a minor kludge.  This function is called from
1283    get_spec_stats to determine the cardinality of a set derived
1284    from a complemented string.  It's a kludge in that some of the
1285    same operations are (duplicated) performed in set_initialize.  */
1286
1287 static int
1288 card_of_complement (struct Spec_list *s)
1289 {
1290   int c;
1291   int cardinality = N_CHARS;
1292   SET_TYPE in_set[N_CHARS];
1293
1294   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1295   s->state = BEGIN_STATE;
1296   while ((c = get_next (s, NULL)) != -1)
1297     if (!in_set[c]++)
1298       --cardinality;
1299   return cardinality;
1300 }
1301
1302 /* Gather statistics about the spec-list S in preparation for the tests
1303    in validate that determine the consistency of the specs.  This function
1304    is called at most twice; once for string1, and again for any string2.
1305    LEN_S1 < 0 indicates that this is the first call and that S represents
1306    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1307    constructs in string1, and we can use its value to resolve any
1308    indefinite repeat construct in S (which represents string2).  Hence,
1309    this function has the side-effect that it converts a valid [c*]
1310    construct in string2 to [c*n] where n is large enough (or 0) to give
1311    string2 the same length as string1.  For example, with the command
1312    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1313    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1314
1315 static void
1316 get_spec_stats (struct Spec_list *s)
1317 {
1318   struct List_element *p;
1319   int len = 0;
1320
1321   s->n_indefinite_repeats = 0;
1322   s->has_equiv_class = 0;
1323   s->has_restricted_char_class = 0;
1324   s->has_char_class = 0;
1325   for (p = s->head->next; p; p = p->next)
1326     {
1327       switch (p->type)
1328         {
1329           int i;
1330         case RE_NORMAL_CHAR:
1331           ++len;
1332           break;
1333
1334         case RE_RANGE:
1335           assert (p->u.range.last_char >= p->u.range.first_char);
1336           len += p->u.range.last_char - p->u.range.first_char + 1;
1337           break;
1338
1339         case RE_CHAR_CLASS:
1340           s->has_char_class = 1;
1341           for (i = 0; i < N_CHARS; i++)
1342             if (is_char_class_member (p->u.char_class, i))
1343               ++len;
1344           switch (p->u.char_class)
1345             {
1346             case CC_UPPER:
1347             case CC_LOWER:
1348               break;
1349             default:
1350               s->has_restricted_char_class = 1;
1351               break;
1352             }
1353           break;
1354
1355         case RE_EQUIV_CLASS:
1356           for (i = 0; i < N_CHARS; i++)
1357             if (is_equiv_class_member (p->u.equiv_code, i))
1358               ++len;
1359           s->has_equiv_class = 1;
1360           break;
1361
1362         case RE_REPEATED_CHAR:
1363           if (p->u.repeated_char.repeat_count > 0)
1364             len += p->u.repeated_char.repeat_count;
1365           else if (p->u.repeated_char.repeat_count == 0)
1366             {
1367               s->indefinite_repeat_element = p;
1368               ++(s->n_indefinite_repeats);
1369             }
1370           break;
1371
1372         case RE_NO_TYPE:
1373           assert (0);
1374           break;
1375         }
1376     }
1377
1378   s->length = len;
1379 }
1380
1381 static void
1382 get_s1_spec_stats (struct Spec_list *s1)
1383 {
1384   get_spec_stats (s1);
1385   if (complement)
1386     s1->length = card_of_complement (s1);
1387 }
1388
1389 static void
1390 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1391 {
1392   get_spec_stats (s2);
1393   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1394     {
1395       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1396         len_s1 - s2->length;
1397       s2->length = len_s1;
1398     }
1399 }
1400
1401 static void
1402 spec_init (struct Spec_list *spec_list)
1403 {
1404   spec_list->head = spec_list->tail =
1405     (struct List_element *) xmalloc (sizeof (struct List_element));
1406   spec_list->head->next = NULL;
1407 }
1408
1409 /* This function makes two passes over the argument string S.  The first
1410    one converts all \c and \ddd escapes to their one-byte representations.
1411    The second constructs a linked specification list, SPEC_LIST, of the
1412    characters and constructs that comprise the argument string.  If either
1413    of these passes detects an error, this function returns nonzero.  */
1414
1415 static int
1416 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1417 {
1418   struct E_string es;
1419   int fail;
1420
1421   fail = unquote (s, &es);
1422   if (!fail)
1423     fail = build_spec_list (&es, spec_list);
1424   es_free (&es);
1425   return fail;
1426 }
1427
1428 /* Given two specification lists, S1 and S2, and assuming that
1429    S1->length > S2->length, append a single [c*n] element to S2 where c
1430    is the last character in the expansion of S2 and n is the difference
1431    between the two lengths.
1432    Upon successful completion, S2->length is set to S1->length.  The only
1433    way this function can fail to make S2 as long as S1 is when S2 has
1434    zero-length, since in that case, there is no last character to repeat.
1435    So S2->length is required to be at least 1.
1436
1437    Providing this functionality allows the user to do some pretty
1438    non-BSD (and non-portable) things:  For example, the command
1439        tr -cs '[:upper:]0-9' '[:lower:]'
1440    is almost guaranteed to give results that depend on your collating
1441    sequence.  */
1442
1443 static void
1444 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1445 {
1446   struct List_element *p;
1447   int char_to_repeat;
1448   int i;
1449
1450   assert (translating);
1451   assert (s1->length > s2->length);
1452   assert (s2->length > 0);
1453
1454   p = s2->tail;
1455   switch (p->type)
1456     {
1457     case RE_NORMAL_CHAR:
1458       char_to_repeat = p->u.normal_char;
1459       break;
1460     case RE_RANGE:
1461       char_to_repeat = p->u.range.last_char;
1462       break;
1463     case RE_CHAR_CLASS:
1464       for (i = N_CHARS; i >= 0; i--)
1465         if (is_char_class_member (p->u.char_class, i))
1466           break;
1467       assert (i >= 0);
1468       char_to_repeat = CHR (i);
1469       break;
1470
1471     case RE_REPEATED_CHAR:
1472       char_to_repeat = p->u.repeated_char.the_repeated_char;
1473       break;
1474
1475     case RE_EQUIV_CLASS:
1476       /* This shouldn't happen, because validate exits with an error
1477          if it finds an equiv class in string2 when translating.  */
1478       abort ();
1479       break;
1480
1481     case RE_NO_TYPE:
1482       abort ();
1483       break;
1484
1485     default:
1486       abort ();
1487       break;
1488     }
1489
1490   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1491   s2->length = s1->length;
1492 }
1493
1494 /* Return non-zero if S is a non-empty list in which exactly one
1495    character (but potentially, many instances of it) appears.
1496    E.g.  [X*] or xxxxxxxx.  */
1497
1498 static int
1499 homogeneous_spec_list (struct Spec_list *s)
1500 {
1501   int b, c;
1502
1503   s->state = BEGIN_STATE;
1504
1505   if ((b = get_next (s, NULL)) == -1)
1506     return 0;
1507
1508   while ((c = get_next (s, NULL)) != -1)
1509     if (c != b)
1510       return 0;
1511
1512   return 1;
1513 }
1514
1515 /* Die with an error message if S1 and S2 describe strings that
1516    are not valid with the given command line switches.
1517    A side effect of this function is that if a valid [c*] or
1518    [c*0] construct appears in string2, it is converted to [c*n]
1519    with a value for n that makes s2->length == s1->length.  By
1520    the same token, if the --truncate-set1 option is not
1521    given, S2 may be extended.  */
1522
1523 static void
1524 validate (struct Spec_list *s1, struct Spec_list *s2)
1525 {
1526   get_s1_spec_stats (s1);
1527   if (s1->n_indefinite_repeats > 0)
1528     {
1529       error (EXIT_FAILURE, 0,
1530              _("the [c*] repeat construct may not appear in string1"));
1531     }
1532
1533   if (s2)
1534     {
1535       get_s2_spec_stats (s2, s1->length);
1536
1537       if (s2->n_indefinite_repeats > 1)
1538         {
1539           error (EXIT_FAILURE, 0,
1540                  _("only one [c*] repeat construct may appear in string2"));
1541         }
1542
1543       if (translating)
1544         {
1545           if (s2->has_equiv_class)
1546             {
1547               error (EXIT_FAILURE, 0,
1548                      _("[=c=] expressions may not appear in string2 \
1549 when translating"));
1550             }
1551
1552           if (s1->length > s2->length)
1553             {
1554               if (!truncate_set1)
1555                 {
1556                   /* string2 must be non-empty unless --truncate-set1 is
1557                      given or string1 is empty.  */
1558
1559                   if (s2->length == 0)
1560                     error (EXIT_FAILURE, 0,
1561                      _("when not truncating set1, string2 must be non-empty"));
1562                   string2_extend (s1, s2);
1563                 }
1564             }
1565
1566           if (complement && s1->has_char_class
1567               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1568             {
1569               error (EXIT_FAILURE, 0,
1570                      _("when translating with complemented character classes,\
1571 \nstring2 must map all characters in the domain to one"));
1572             }
1573
1574           if (s2->has_restricted_char_class)
1575             {
1576               error (EXIT_FAILURE, 0,
1577                      _("when translating, the only character classes that may \
1578 appear in\nstring2 are `upper' and `lower'"));
1579             }
1580         }
1581       else
1582         /* Not translating.  */
1583         {
1584           if (s2->n_indefinite_repeats > 0)
1585             error (EXIT_FAILURE, 0,
1586                    _("the [c*] construct may appear in string2 only \
1587 when translating"));
1588         }
1589     }
1590 }
1591
1592 /* Read buffers of SIZE bytes via the function READER (if READER is
1593    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1594    read_and_delete or read_and_xlate.  After each buffer is read, it is
1595    processed and written to stdout.  The buffers are processed so that
1596    multiple consecutive occurrences of the same character in the input
1597    stream are replaced by a single occurrence of that character if the
1598    character is in the squeeze set.  */
1599
1600 static void
1601 squeeze_filter (unsigned char *buf, long int size, PFI reader)
1602 {
1603   unsigned int char_to_squeeze = NOT_A_CHAR;
1604   int i = 0;
1605   int nr = 0;
1606
1607   for (;;)
1608     {
1609       int begin;
1610
1611       if (i >= nr)
1612         {
1613           if (reader == NULL)
1614             nr = safe_read (0, (char *) buf, size);
1615           else
1616             nr = (*reader) (buf, size, NULL);
1617
1618           if (nr < 0)
1619             error (EXIT_FAILURE, errno, _("read error"));
1620           if (nr == 0)
1621             break;
1622           i = 0;
1623         }
1624
1625       begin = i;
1626
1627       if (char_to_squeeze == NOT_A_CHAR)
1628         {
1629           int out_len;
1630           /* Here, by being a little tricky, we can get a significant
1631              performance increase in most cases when the input is
1632              reasonably large.  Since tr will modify the input only
1633              if two consecutive (and identical) input characters are
1634              in the squeeze set, we can step by two through the data
1635              when searching for a character in the squeeze set.  This
1636              means there may be a little more work in a few cases and
1637              perhaps twice as much work in the worst cases where most
1638              of the input is removed by squeezing repeats.  But most
1639              uses of this functionality seem to remove less than 20-30%
1640              of the input.  */
1641           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1642             ;                   /* empty */
1643
1644           /* There is a special case when i == nr and we've just
1645              skipped a character (the last one in buf) that is in
1646              the squeeze set.  */
1647           if (i == nr && in_squeeze_set[buf[i - 1]])
1648             --i;
1649
1650           if (i >= nr)
1651             out_len = nr - begin;
1652           else
1653             {
1654               char_to_squeeze = buf[i];
1655               /* We're about to output buf[begin..i].  */
1656               out_len = i - begin + 1;
1657
1658               /* But since we stepped by 2 in the loop above,
1659                  out_len may be one too large.  */
1660               if (i > 0 && buf[i - 1] == char_to_squeeze)
1661                 --out_len;
1662
1663               /* Advance i to the index of first character to be
1664                  considered when looking for a char different from
1665                  char_to_squeeze.  */
1666               ++i;
1667             }
1668           if (out_len > 0
1669               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1670             error (EXIT_FAILURE, errno, _("write error"));
1671         }
1672
1673       if (char_to_squeeze != NOT_A_CHAR)
1674         {
1675           /* Advance i to index of first char != char_to_squeeze
1676              (or to nr if all the rest of the characters in this
1677              buffer are the same as char_to_squeeze).  */
1678           for (; i < nr && buf[i] == char_to_squeeze; i++)
1679             ;                   /* empty */
1680           if (i < nr)
1681             char_to_squeeze = NOT_A_CHAR;
1682           /* If (i >= nr) we've squeezed the last character in this buffer.
1683              So now we have to read a new buffer and continue comparing
1684              characters against char_to_squeeze.  */
1685         }
1686     }
1687 }
1688
1689 /* Read buffers of SIZE bytes from stdin until one is found that
1690    contains at least one character not in the delete set.  Store
1691    in the array BUF, all characters from that buffer that are not
1692    in the delete set, and return the number of characters saved
1693    or 0 upon EOF.  */
1694
1695 static long
1696 read_and_delete (unsigned char *buf, long int size, PFI not_used)
1697 {
1698   long n_saved;
1699   static int hit_eof = 0;
1700
1701   assert (not_used == NULL);
1702   assert (size > 0);
1703
1704   if (hit_eof)
1705     return 0;
1706
1707   /* This enclosing do-while loop is to make sure that
1708      we don't return zero (indicating EOF) when we've
1709      just deleted all the characters in a buffer.  */
1710   do
1711     {
1712       int i;
1713       int nr = safe_read (0, (char *) buf, size);
1714
1715       if (nr < 0)
1716         error (EXIT_FAILURE, errno, _("read error"));
1717       if (nr == 0)
1718         {
1719           hit_eof = 1;
1720           return 0;
1721         }
1722
1723       /* This first loop may be a waste of code, but gives much
1724          better performance when no characters are deleted in
1725          the beginning of a buffer.  It just avoids the copying
1726          of buf[i] into buf[n_saved] when it would be a NOP.  */
1727
1728       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1729         /* empty */ ;
1730       n_saved = i;
1731
1732       for (++i; i < nr; i++)
1733         if (!in_delete_set[buf[i]])
1734           buf[n_saved++] = buf[i];
1735     }
1736   while (n_saved == 0);
1737
1738   return n_saved;
1739 }
1740
1741 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1742    perform the in-place and one-to-one mapping specified by the global
1743    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1744
1745 static long
1746 read_and_xlate (unsigned char *buf, long int size, PFI not_used)
1747 {
1748   long chars_read = 0;
1749   static int hit_eof = 0;
1750   int i;
1751
1752   assert (not_used == NULL);
1753   assert (size > 0);
1754
1755   if (hit_eof)
1756     return 0;
1757
1758   chars_read = safe_read (0, (char *) buf, size);
1759   if (chars_read < 0)
1760     error (EXIT_FAILURE, errno, _("read error"));
1761   if (chars_read == 0)
1762     {
1763       hit_eof = 1;
1764       return 0;
1765     }
1766
1767   for (i = 0; i < chars_read; i++)
1768     buf[i] = xlate[buf[i]];
1769
1770   return chars_read;
1771 }
1772
1773 /* Initialize a boolean membership set IN_SET with the character
1774    values obtained by traversing the linked list of constructs S
1775    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1776    nonzero the resulting set is complemented.  */
1777
1778 static void
1779 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1780 {
1781   int c;
1782   int i;
1783
1784   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1785   s->state = BEGIN_STATE;
1786   while ((c = get_next (s, NULL)) != -1)
1787     in_set[c] = 1;
1788   if (complement_this_set)
1789     for (i = 0; i < N_CHARS; i++)
1790       in_set[i] = (!in_set[i]);
1791 }
1792
1793 int
1794 main (int argc, char **argv)
1795 {
1796   int c;
1797   int non_option_args;
1798   struct Spec_list buf1, buf2;
1799   struct Spec_list *s1 = &buf1;
1800   struct Spec_list *s2 = &buf2;
1801
1802   program_name = argv[0];
1803   setlocale (LC_ALL, "");
1804   bindtextdomain (PACKAGE, LOCALEDIR);
1805   textdomain (PACKAGE);
1806
1807   while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1808     {
1809       switch (c)
1810         {
1811         case 0:
1812           break;
1813
1814         case 'c':
1815           complement = 1;
1816           break;
1817
1818         case 'd':
1819           delete = 1;
1820           break;
1821
1822         case 's':
1823           squeeze_repeats = 1;
1824           break;
1825
1826         case 't':
1827           truncate_set1 = 1;
1828           break;
1829
1830         case_GETOPT_HELP_CHAR;
1831
1832         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1833
1834         default:
1835           usage (2);
1836           break;
1837         }
1838     }
1839
1840   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1841
1842   non_option_args = argc - optind;
1843   translating = (non_option_args == 2 && !delete);
1844
1845   /* Change this test if it is valid to give tr no options and
1846      no args at all.  POSIX doesn't specifically say anything
1847      either way, but it looks like they implied it's invalid
1848      by omission.  If you want to make tr do a slow imitation
1849      of `cat' use `tr a a'.  */
1850   if (non_option_args > 2)
1851     {
1852       error (0, 0, _("too many arguments"));
1853       usage (2);
1854     }
1855
1856   if (!delete && !squeeze_repeats && non_option_args != 2)
1857     error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1858
1859   if (delete && squeeze_repeats && non_option_args != 2)
1860     error (EXIT_FAILURE, 0, _("two strings must be given when both \
1861 deleting and squeezing repeats"));
1862
1863   /* If --delete is given without --squeeze-repeats, then
1864      only one string argument may be specified.  But POSIX
1865      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1866      is set, pretend we never saw string2.  But I think
1867      this deserves a fatal error, so that's the default.  */
1868   if ((delete && !squeeze_repeats) && non_option_args != 1)
1869     {
1870       if (posix_pedantic && non_option_args == 2)
1871         --non_option_args;
1872       else
1873         error (EXIT_FAILURE, 0,
1874                _("only one string may be given when deleting \
1875 without squeezing repeats"));
1876     }
1877
1878   if (squeeze_repeats && non_option_args == 0)
1879     error (EXIT_FAILURE, 0,
1880            _("at least one string must be given when squeezing repeats"));
1881
1882   spec_init (s1);
1883   if (parse_str ((unsigned char *) argv[optind], s1))
1884     exit (EXIT_FAILURE);
1885
1886   if (non_option_args == 2)
1887     {
1888       spec_init (s2);
1889       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1890         exit (EXIT_FAILURE);
1891     }
1892   else
1893     s2 = NULL;
1894
1895   validate (s1, s2);
1896
1897   /* Use binary I/O, since `tr' is sometimes used to transliterate
1898      non-printable characters, or characters which are stripped away
1899      by text-mode reads (like CR and ^Z).  */
1900   SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1901
1902   if (squeeze_repeats && non_option_args == 1)
1903     {
1904       set_initialize (s1, complement, in_squeeze_set);
1905       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1906     }
1907   else if (delete && non_option_args == 1)
1908     {
1909       long nr;
1910
1911       set_initialize (s1, complement, in_delete_set);
1912       do
1913         {
1914           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1915           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1916             error (EXIT_FAILURE, errno, _("write error"));
1917         }
1918       while (nr > 0);
1919     }
1920   else if (squeeze_repeats && delete && non_option_args == 2)
1921     {
1922       set_initialize (s1, complement, in_delete_set);
1923       set_initialize (s2, 0, in_squeeze_set);
1924       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1925     }
1926   else if (translating)
1927     {
1928       if (complement)
1929         {
1930           int i;
1931           SET_TYPE *in_s1 = in_delete_set;
1932
1933           set_initialize (s1, 0, in_s1);
1934           s2->state = BEGIN_STATE;
1935           for (i = 0; i < N_CHARS; i++)
1936             xlate[i] = i;
1937           for (i = 0; i < N_CHARS; i++)
1938             {
1939               if (!in_s1[i])
1940                 {
1941                   int ch = get_next (s2, NULL);
1942                   assert (ch != -1 || truncate_set1);
1943                   if (ch == -1)
1944                     {
1945                       /* This will happen when tr is invoked like e.g.
1946                          tr -cs A-Za-z0-9 '\012'.  */
1947                       break;
1948                     }
1949                   xlate[i] = ch;
1950                 }
1951             }
1952           assert (get_next (s2, NULL) == -1 || truncate_set1);
1953         }
1954       else
1955         {
1956           int c1, c2;
1957           int i;
1958           enum Upper_Lower_class class_s1;
1959           enum Upper_Lower_class class_s2;
1960
1961           for (i = 0; i < N_CHARS; i++)
1962             xlate[i] = i;
1963           s1->state = BEGIN_STATE;
1964           s2->state = BEGIN_STATE;
1965           for (;;)
1966             {
1967               c1 = get_next (s1, &class_s1);
1968               c2 = get_next (s2, &class_s2);
1969               if (!class_ok[(int) class_s1][(int) class_s2])
1970                 error (EXIT_FAILURE, 0,
1971                        _("misaligned [:upper:] and/or [:lower:] construct"));
1972
1973               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1974                 {
1975                   for (i = 0; i < N_CHARS; i++)
1976                     if (ISLOWER (i))
1977                       xlate[i] = toupper (i);
1978                 }
1979               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1980                 {
1981                   for (i = 0; i < N_CHARS; i++)
1982                     if (ISUPPER (i))
1983                       xlate[i] = tolower (i);
1984                 }
1985               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1986                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1987                 {
1988                   /* By default, GNU tr permits the identity mappings: from
1989                      [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
1990                      when POSIXLY_CORRECT is set, those evoke diagnostics.  */
1991                   if (posix_pedantic)
1992                     {
1993                       error (EXIT_FAILURE, 0,
1994                              _("\
1995 invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
1996 construct in string1 must be aligned with a corresponding construct\n\
1997 ([:upper:] or [:lower:], respectively) in string2"));
1998                     }
1999                 }
2000               else
2001                 {
2002                   /* The following should have been checked by validate...  */
2003                   if (c1 == -1 || c2 == -1)
2004                     break;
2005                   xlate[c1] = c2;
2006                 }
2007             }
2008           assert (c1 == -1 || truncate_set1);
2009         }
2010       if (squeeze_repeats)
2011         {
2012           set_initialize (s2, 0, in_squeeze_set);
2013           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
2014         }
2015       else
2016         {
2017           long chars_read;
2018
2019           do
2020             {
2021               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2022               if (chars_read > 0
2023                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2024                 error (EXIT_FAILURE, errno, _("write error"));
2025             }
2026           while (chars_read > 0);
2027         }
2028     }
2029
2030   if (fclose (stdout) == EOF)
2031     error (EXIT_FAILURE, errno, _("write error"));
2032
2033   if (close (0) != 0)
2034     error (EXIT_FAILURE, errno, _("standard input"));
2035
2036   exit (EXIT_SUCCESS);
2037 }