src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 1991, 1995 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software
  16    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering, meyering@cs.utexas.edu.  */
  19
  20 #include <config.h>
  21
  22 /* Get isblank from GNU libc.  */
  23 #define _GNU_SOURCE
  24
  25 #include <stdio.h>
  26 #define NDEBUG 1
  27 #include <assert.h>
  28 #include <errno.h>
  29 #include <sys/types.h>
  30 #include <getopt.h>
  31
  32 #include "system.h"
  33 #include "version.h"
  34 #include "error.h"
  35
  36 #ifndef ULONG_MAX
  37 #define ULONG_MAX ((unsigned long) ~(unsigned long) 0)
  38 #endif
  39
  40 #ifndef LONG_MAX
  41 #define LONG_MAX ((long int) (ULONG_MAX >> 1))
  42 #endif
  43
  44 #ifndef UCHAR_MAX
  45 #define UCHAR_MAX 0xFF
  46 #endif
  47
  48 #define N_CHARS (UCHAR_MAX + 1)
  49
  50 /* A pointer to a function that returns an int.  */
  51 typedef int (*PFI) ();
  52
  53 /* Convert from character C to its index in the collating
  54    sequence array.  Just cast to an unsigned int to avoid
  55    problems with sign-extension.  */
  56 #define ORD(c) (unsigned int)(c)
  57
  58 /* The inverse of ORD.  */
  59 #define CHR(i) (unsigned char)(i)
  60
  61 /* The value for Spec_list->state that indicates to
  62    get_next that it should initialize the tail pointer.
  63    Its value doesn't matter as long as it can't be
  64    confused with a valid character code.  */
  65 #define BEGIN_STATE (2 * N_CHARS)
  66
  67 /* The value for Spec_list->state that indicates to
  68    get_next that the element pointed to by Spec_list->tail is
  69    being considered for the first time on this pass through the
  70    list -- it indicates that get_next should make any necessary
  71    initializations.  */
  72 #define NEW_ELEMENT (BEGIN_STATE + 1)
  73
  74 /* A value distinct from any character that may have been stored in a
  75    buffer as the result of a block-read in the function squeeze_filter.  */
  76 #define NOT_A_CHAR (unsigned int)(-1)
  77
  78 /* The following (but not CC_NO_CLASS) are indices into the array of
  79    valid character class strings.  */
  80 enum Char_class
  81   {
  82     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  83     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  84     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  85     CC_NO_CLASS = 9999
  86   };
  87
  88 /* Character class to which a character (returned by get_next) belonged;
  89    but it is set only if the construct from which the character was obtained
  90    was one of the character classes [:upper:] or [:lower:].  The value
  91    is used only when translating and then, only to make sure that upper
  92    and lower class constructs have the same relative positions in string1
  93    and string2.  */
  94 enum Upper_Lower_class
  95   {
  96     UL_LOWER = 0,
  97     UL_UPPER = 1,
  98     UL_NONE = 2
  99   };
 100
 101 /* A shortcut to ensure that when constructing the translation array,
 102    one of the values returned by paired calls to get_next (from s1 and s2)
 103    is from [:upper:] and the other is from [:lower:], or neither is from
 104    upper or lower.  In fact, no other character classes are allowed when
 105    translating, but that condition is tested elsewhere.  This array is
 106    indexed by values of type enum Upper_Lower_class.  */
 107 static int const class_ok[3][3] =
 108 {
 109   {0, 1, 0},
 110   {1, 0, 0},
 111   {0, 0, 1}
 112 };
 113
 114 /* The type of a List_element.  See build_spec_list for more details.  */
 115 enum Range_element_type
 116   {
 117     RE_NO_TYPE = 0,
 118     RE_NORMAL_CHAR,
 119     RE_RANGE,
 120     RE_CHAR_CLASS,
 121     RE_EQUIV_CLASS,
 122     RE_REPEATED_CHAR
 123   };
 124
 125 /* One construct in one of tr's argument strings.
 126    For example, consider the POSIX version of the classic tr command:
 127        tr -cs 'a-zA-Z_' '[\n*]'
 128    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 129    and a single normal character, `_'.  String2 has one construct.  */
 130 struct List_element
 131   {
 132     enum Range_element_type type;
 133     struct List_element *next;
 134     union
 135       {
 136         int normal_char;
 137         struct                  /* unnamed */
 138           {
 139             unsigned int first_char;
 140             unsigned int last_char;
 141           }
 142         range;
 143         enum Char_class char_class;
 144         int equiv_code;
 145         struct                  /* unnamed */
 146           {
 147             unsigned int the_repeated_char;
 148             size_t repeat_count;
 149           }
 150         repeated_char;
 151       }
 152     u;
 153   };
 154
 155 /* Each of tr's argument strings is parsed into a form that is easier
 156    to work with: a linked list of constructs (struct List_element).
 157    Each Spec_list structure also encapsulates various attributes of
 158    the corresponding argument string.  The attributes are used mainly
 159    to verify that the strings are valid in the context of any options
 160    specified (like -s, -d, or -c).  The main exception is the member
 161    `tail', which is first used to construct the list.  After construction,
 162    it is used by get_next to save its state when traversing the list.
 163    The member `state' serves a similar function.  */
 164 struct Spec_list
 165   {
 166     /* Points to the head of the list of range elements.
 167        The first struct is a dummy; its members are never used.  */
 168     struct List_element *head;
 169
 170     /* When appending, points to the last element.  When traversing via
 171        get_next(), points to the element to process next.  Setting
 172        Spec_list.state to the value BEGIN_STATE before calling get_next
 173        signals get_next to initialize tail to point to head->next.  */
 174     struct List_element *tail;
 175
 176     /* Used to save state between calls to get_next().  */
 177     unsigned int state;
 178
 179     /* Length, in the sense that length ('a-z[:digit:]123abc')
 180        is 42 ( = 26 + 10 + 6).  */
 181     size_t length;
 182
 183     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 184     int n_indefinite_repeats;
 185
 186     /* If n_indefinite_repeats is non-zero, this points to the List_element
 187        corresponding to the last [c*] or [c*0] construct encountered in
 188        this spec.  Otherwise it is undefined.  */
 189     struct List_element *indefinite_repeat_element;
 190
 191     /* Non-zero if this spec contains at least one equivalence
 192        class construct e.g. [=c=].  */
 193     int has_equiv_class;
 194
 195     /* Non-zero if this spec contains at least one of [:upper:] or
 196        [:lower:] class constructs.  */
 197     int has_upper_or_lower;
 198
 199     /* Non-zero if this spec contains at least one of the character class
 200        constructs (all but upper and lower) that aren't allowed in s2.  */
 201     int has_restricted_char_class;
 202   };
 203
 204 char *xmalloc ();
 205 char *stpcpy ();
 206 int safe_read ();
 207
 208 /* The name by which this program was run.  */
 209 char *program_name;
 210
 211 /* When non-zero, each sequence in the input of a repeated character
 212    (call it c) is replaced (in the output) by a single occurrence of c
 213    for every c in the squeeze set.  */
 214 static int squeeze_repeats = 0;
 215
 216 /* When non-zero, removes characters in the delete set from input.  */
 217 static int delete = 0;
 218
 219 /* Use the complement of set1 in place of set1.  */
 220 static int complement = 0;
 221
 222 /* When non-zero, this flag causes GNU tr to provide strict
 223    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 224    says that when -d is used without -s, string2 (if present)
 225    must be ignored.  Silently ignoring arguments is a bad idea.
 226    The default GNU behavior is to give a usage message and exit.
 227    Additionally, when this flag is non-zero, tr prints warnings
 228    on stderr if it is being used in a manner that is not portable.
 229    Applicable warnings are given by default, but are suppressed
 230    if the environment variable `POSIXLY_CORRECT' is set, since
 231    being POSIX conformant means we can't issue such messages.
 232    Warnings on the following topics are suppressed when this
 233    variable is non-zero:
 234    1. Ambiguous octal escapes.  */
 235 static int posix_pedantic;
 236
 237 /* When tr is performing translation and string1 is longer than string2,
 238    POSIX says that the result is undefined.  That gives the implementor
 239    of a POSIX conforming version of tr two reasonable choices for the
 240    semantics of this case.
 241
 242    * The BSD tr pads string2 to the length of string1 by
 243    repeating the last character in string2.
 244
 245    * System V tr ignores characters in string1 that have no
 246    corresponding character in string2.  That is, string1 is effectively
 247    truncated to the length of string2.
 248
 249    When non-zero, this flag causes GNU tr to imitate the behavior
 250    of System V tr when translating with string1 longer than string2.
 251    The default is to emulate BSD tr.  This flag is ignored in modes where
 252    no translation is performed.  Emulating the System V tr
 253    in this exceptional case causes the relatively common BSD idiom:
 254
 255        tr -cs A-Za-z0-9 '\012'
 256
 257    to break (it would convert only zero bytes, rather than all
 258    non-alphanumerics, to newlines).
 259
 260    WARNING: This switch does not provide general BSD or System V
 261    compatibility.  For example, it doesn't disable the interpretation
 262    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 263    some unfortunate coincidence you use such constructs in scripts
 264    expecting to use some other version of tr, the scripts will break.  */
 265 static int truncate_set1 = 0;
 266
 267 /* An alias for (!delete && non_option_args == 2).
 268    It is set in main and used there and in validate().  */
 269 static int translating;
 270
 271 #ifndef BUFSIZ
 272 #define BUFSIZ 8192
 273 #endif
 274
 275 #define IO_BUF_SIZE BUFSIZ
 276 static unsigned char io_buf[IO_BUF_SIZE];
 277
 278 static char const *const char_class_name[] =
 279 {
 280   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 281   "lower", "print", "punct", "space", "upper", "xdigit"
 282 };
 283 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 284
 285 typedef char SET_TYPE;
 286
 287 /* Array of boolean values.  A character `c' is a member of the
 288    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 289    set is defined by the last (possibly, the only) string argument
 290    on the command line when the squeeze option is given.  */
 291 static SET_TYPE in_squeeze_set[N_CHARS];
 292
 293 /* Array of boolean values.  A character `c' is a member of the
 294    delete set if and only if in_delete_set[c] is true.  The delete
 295    set is defined by the first (or only) string argument on the
 296    command line when the delete option is given.  */
 297 static SET_TYPE in_delete_set[N_CHARS];
 298
 299 /* Array of character values defining the translation (if any) that
 300    tr is to perform.  Translation is performed only when there are
 301    two specification strings and the delete switch is not given.  */
 302 static char xlate[N_CHARS];
 303
 304 /* If non-zero, display usage information and exit.  */
 305 static int show_help;
 306
 307 /* If non-zero, print the version on standard output then exit.  */
 308 static int show_version;
 309
 310 static struct option const long_options[] =
 311 {
 312   {"complement", no_argument, NULL, 'c'},
 313   {"delete", no_argument, NULL, 'd'},
 314   {"squeeze-repeats", no_argument, NULL, 's'},
 315   {"truncate-set1", no_argument, NULL, 't'},
 316   {"help", no_argument, &show_help, 1},
 317   {"version", no_argument, &show_version, 1},
 318   {NULL, 0, NULL, 0}
 319 };
 320 \f
 321 static void
 322 usage (status)
 323      int status;
 324 {
 325   if (status != 0)
 326     fprintf (stderr, "Try `%s --help' for more information.\n",
 327              program_name);
 328   else
 329     {
 330       printf ("\
 331 Usage: %s [OPTION]... SET1 [SET2]\n\
 332 ",
 333               program_name);
 334       printf ("\
 335 Translate, squeeze, and/or delete characters from standard input,\n\
 336 writing to standard output.\n\
 337 \n\
 338   -c, --complement        first complement SET1\n\
 339   -d, --delete            delete characters in SET1, do not translate\n\
 340   -s, --squeeze-repeats   replace sequence of characters with one\n\
 341   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 342       --help              display this help and exit\n\
 343       --version           output version information and exit\n\
 344 ");
 345       printf ("\
 346 \n\
 347 SETs are specified as strings of characters.  Most represent themselves.\n\
 348 Interpreted sequences are:\n\
 349 \n\
 350   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 351   \\\\              backslash\n\
 352   \\a              audible BEL\n\
 353   \\b              backspace\n\
 354   \\f              form feed\n\
 355   \\n              new line\n\
 356   \\r              return\n\
 357   \\t              horizontal tab\n\
 358   \\v              vertical tab\n\
 359   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 360   [CHAR1-CHAR2]   same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
 361   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 362   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 363   [:alnum:]       all letters and digits\n\
 364   [:alpha:]       all letters\n\
 365   [:blank:]       all horizontal whitespace\n\
 366   [:cntrl:]       all control characters\n\
 367   [:digit:]       all digits\n\
 368   [:graph:]       all printable characters, not including space\n\
 369   [:lower:]       all lower case letters\n\
 370   [:print:]       all printable characters, including space\n\
 371   [:punct:]       all punctuation characters\n\
 372   [:space:]       all horizontal or vertical whitespace\n\
 373   [:upper:]       all upper case letters\n\
 374   [:xdigit:]      all hexadecimal digits\n\
 375   [=CHAR=]        all characters which are equivalent to CHAR\n\
 376 ");
 377       printf ("\
 378 \n\
 379 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 380 -t may be used only when translating.  SET2 is extended to length of\n\
 381 SET1 by repeating its last character as necessary.  Excess characters\n\
 382 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 383 expand in ascending order; used in SET2 while translating, they may\n\
 384 only be used in pairs to specify case conversion.  -s uses SET1 if not\n\
 385 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 386 translation or deletion.\n\
 387 ");
 388     }
 389   exit (status);
 390 }
 391
 392 /* Return non-zero if the character C is a member of the
 393    equivalence class containing the character EQUIV_CLASS.  */
 394
 395 static int
 396 is_equiv_class_member (equiv_class, c)
 397      unsigned int equiv_class;
 398      unsigned int c;
 399 {
 400   return (equiv_class == c);
 401 }
 402
 403 /* Return non-zero if the character C is a member of the
 404    character class CHAR_CLASS.  */
 405
 406 static int
 407 is_char_class_member (char_class, c)
 408      enum Char_class char_class;
 409      unsigned int c;
 410 {
 411   int result;
 412
 413   switch (char_class)
 414     {
 415     case CC_ALNUM:
 416       result = ISALNUM (c);
 417       break;
 418     case CC_ALPHA:
 419       result = ISALPHA (c);
 420       break;
 421     case CC_BLANK:
 422       result = ISBLANK (c);
 423       break;
 424     case CC_CNTRL:
 425       result = ISCNTRL (c);
 426       break;
 427     case CC_DIGIT:
 428       result = ISDIGIT (c);
 429       break;
 430     case CC_GRAPH:
 431       result = ISGRAPH (c);
 432       break;
 433     case CC_LOWER:
 434       result = ISLOWER (c);
 435       break;
 436     case CC_PRINT:
 437       result = ISPRINT (c);
 438       break;
 439     case CC_PUNCT:
 440       result = ISPUNCT (c);
 441       break;
 442     case CC_SPACE:
 443       result = ISSPACE (c);
 444       break;
 445     case CC_UPPER:
 446       result = ISUPPER (c);
 447       break;
 448     case CC_XDIGIT:
 449       result = ISXDIGIT (c);
 450       break;
 451     default:
 452       abort ();
 453       break;
 454     }
 455   return result;
 456 }
 457
 458 /* Perform the first pass over each range-spec argument S, converting all
 459    \c and \ddd escapes to their one-byte representations.  The conversion
 460    is done in-place, so S must point to writable storage.  If an invalid
 461    quote sequence is found print an error message and return non-zero.
 462    Otherwise set *LEN to the length of the resulting string and return
 463    zero.  The resulting array of characters may contain zero-bytes;
 464    however, on input, S is assumed to be null-terminated, and hence
 465    cannot contain actual (non-escaped) zero bytes.  */
 466
 467 static int
 468 unquote (s, len)
 469      unsigned char *s;
 470      size_t *len;
 471 {
 472   size_t i, j;
 473
 474   j = 0;
 475   for (i = 0; s[i]; i++)
 476     {
 477       switch (s[i])
 478         {
 479           int c;
 480         case '\\':
 481           switch (s[i + 1])
 482             {
 483               int oct_digit;
 484             case '\\':
 485               c = '\\';
 486               break;
 487             case 'a':
 488               c = '\007';
 489               break;
 490             case 'b':
 491               c = '\b';
 492               break;
 493             case 'f':
 494               c = '\f';
 495               break;
 496             case 'n':
 497               c = '\n';
 498               break;
 499             case 'r':
 500               c = '\r';
 501               break;
 502             case 't':
 503               c = '\t';
 504               break;
 505             case 'v':
 506               c = '\v';
 507               break;
 508             case '0':
 509             case '1':
 510             case '2':
 511             case '3':
 512             case '4':
 513             case '5':
 514             case '6':
 515             case '7':
 516               c = s[i + 1] - '0';
 517               oct_digit = s[i + 2] - '0';
 518               if (0 <= oct_digit && oct_digit <= 7)
 519                 {
 520                   c = 8 * c + oct_digit;
 521                   ++i;
 522                   oct_digit = s[i + 2] - '0';
 523                   if (0 <= oct_digit && oct_digit <= 7)
 524                     {
 525                       if (8 * c + oct_digit < N_CHARS)
 526                         {
 527                           c = 8 * c + oct_digit;
 528                           ++i;
 529                         }
 530                       else if (!posix_pedantic)
 531                         {
 532                           /* Any octal number larger than 0377 won't
 533                              fit in 8 bits.  So we stop when adding the
 534                              next digit would put us over the limit and
 535                              give a warning about the ambiguity.  POSIX
 536                              isn't clear on this, but one person has said
 537                              that in his interpretation, POSIX says tr
 538                              can't even give a warning.  */
 539                           error (0, 0, "warning: the ambiguous octal escape \
 540 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'",
 541                                  s[i], s[i + 1], s[i + 2],
 542                                  s[i], s[i + 1], s[i + 2]);
 543                         }
 544                     }
 545                 }
 546               break;
 547             case '\0':
 548               error (0, 0, "invalid backslash escape at end of string");
 549               return 1;
 550               break;
 551             default:
 552               error (0, 0, "invalid backslash escape `\\%c'", s[i + 1]);
 553               return 1;
 554               break;
 555             }
 556           ++i;
 557           s[j++] = c;
 558           break;
 559         default:
 560           s[j++] = s[i];
 561           break;
 562         }
 563     }
 564   *len = j;
 565   return 0;
 566 }
 567
 568 /* If CLASS_STR is a valid character class string, return its index
 569    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 570
 571 static enum Char_class
 572 look_up_char_class (class_str, len)
 573      const unsigned char *class_str;
 574      size_t len;
 575 {
 576   unsigned int i;
 577
 578   for (i = 0; i < N_CHAR_CLASSES; i++)
 579     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 580         && strlen (char_class_name[i]) == len)
 581       return (enum Char_class) i;
 582   return CC_NO_CLASS;
 583 }
 584
 585 /* Return a newly allocated string with a printable version of C.
 586    This function is used solely for formatting error messages.  */
 587
 588 static char *
 589 make_printable_char (c)
 590      unsigned int c;
 591 {
 592   char *buf = xmalloc (5);
 593
 594   assert (c < N_CHARS);
 595   if (ISPRINT (c))
 596     {
 597       buf[0] = c;
 598       buf[1] = '\0';
 599     }
 600   else
 601     {
 602       sprintf (buf, "\\%03o", c);
 603     }
 604   return buf;
 605 }
 606
 607 /* Return a newly allocated copy of S which is suitable for printing.
 608    LEN is the number of characters in S.  Most non-printing
 609    (isprint) characters are represented by a backslash followed by
 610    3 octal digits.  However, the characters represented by \c escapes
 611    where c is one of [abfnrtv] are represented by their 2-character \c
 612    sequences.  This function is used solely for printing error messages.  */
 613
 614 static char *
 615 make_printable_str (s, len)
 616      const unsigned char *s;
 617      size_t len;
 618 {
 619   /* Worst case is that every character expands to a backslash
 620      followed by a 3-character octal escape sequence.  */
 621   char *printable_buf = xmalloc (4 * len + 1);
 622   char *p = printable_buf;
 623   size_t i;
 624
 625   for (i = 0; i < len; i++)
 626     {
 627       char buf[5];
 628       char *tmp = NULL;
 629
 630       switch (s[i])
 631         {
 632         case '\\':
 633           tmp = "\\";
 634           break;
 635         case '\007':
 636           tmp = "\\a";
 637           break;
 638         case '\b':
 639           tmp = "\\b";
 640           break;
 641         case '\f':
 642           tmp = "\\f";
 643           break;
 644         case '\n':
 645           tmp = "\\n";
 646           break;
 647         case '\r':
 648           tmp = "\\r";
 649           break;
 650         case '\t':
 651           tmp = "\\t";
 652           break;
 653         case '\v':
 654           tmp = "\\v";
 655           break;
 656         default:
 657           if (ISPRINT (s[i]))
 658             {
 659               buf[0] = s[i];
 660               buf[1] = '\0';
 661             }
 662           else
 663             sprintf (buf, "\\%03o", s[i]);
 664           tmp = buf;
 665           break;
 666         }
 667       p = stpcpy (p, tmp);
 668     }
 669   return printable_buf;
 670 }
 671
 672 /* Append a newly allocated structure representing a
 673    character C to the specification list LIST.  */
 674
 675 static void
 676 append_normal_char (list, c)
 677      struct Spec_list *list;
 678      unsigned int c;
 679 {
 680   struct List_element *new;
 681
 682   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 683   new->next = NULL;
 684   new->type = RE_NORMAL_CHAR;
 685   new->u.normal_char = c;
 686   assert (list->tail);
 687   list->tail->next = new;
 688   list->tail = new;
 689 }
 690
 691 /* Append a newly allocated structure representing the range
 692    of characters from FIRST to LAST to the specification list LIST.
 693    Return non-zero if LAST precedes FIRST in the collating sequence,
 694    zero otherwise.  This means that '[c-c]' is acceptable.  */
 695
 696 static int
 697 append_range (list, first, last)
 698      struct Spec_list *list;
 699      size_t first;
 700      size_t last;
 701 {
 702   struct List_element *new;
 703
 704   if (ORD (first) > ORD (last))
 705     {
 706       char *tmp1 = make_printable_char (first);
 707       char *tmp2 = make_printable_char (last);
 708
 709       error (0, 0,
 710        "range-endpoints of `%s-%s' are in reverse collating sequence order",
 711              tmp1, tmp2);
 712       free (tmp1);
 713       free (tmp2);
 714       return 1;
 715     }
 716   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 717   new->next = NULL;
 718   new->type = RE_RANGE;
 719   new->u.range.first_char = first;
 720   new->u.range.last_char = last;
 721   assert (list->tail);
 722   list->tail->next = new;
 723   list->tail = new;
 724   return 0;
 725 }
 726
 727 /* If CHAR_CLASS_STR is a valid character class string, append a
 728    newly allocated structure representing that character class to the end
 729    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 730    a valid string, print an error message and return non-zero.  */
 731
 732 static int
 733 append_char_class (list, char_class_str, len)
 734      struct Spec_list *list;
 735      const unsigned char *char_class_str;
 736      size_t len;
 737 {
 738   enum Char_class char_class;
 739   struct List_element *new;
 740
 741   char_class = look_up_char_class (char_class_str, len);
 742   if (char_class == CC_NO_CLASS)
 743     {
 744       char *tmp = make_printable_str (char_class_str, len);
 745
 746       error (0, 0, "invalid character class `%s'", tmp);
 747       free (tmp);
 748       return 1;
 749     }
 750   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 751   new->next = NULL;
 752   new->type = RE_CHAR_CLASS;
 753   new->u.char_class = char_class;
 754   assert (list->tail);
 755   list->tail->next = new;
 756   list->tail = new;
 757   return 0;
 758 }
 759
 760 /* Append a newly allocated structure representing a [c*n]
 761    repeated character construct to the specification list LIST.
 762    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 763    is a non-negative repeat count.  */
 764
 765 static void
 766 append_repeated_char (list, the_char, repeat_count)
 767      struct Spec_list *list;
 768      unsigned int the_char;
 769      size_t repeat_count;
 770 {
 771   struct List_element *new;
 772
 773   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 774   new->next = NULL;
 775   new->type = RE_REPEATED_CHAR;
 776   new->u.repeated_char.the_repeated_char = the_char;
 777   new->u.repeated_char.repeat_count = repeat_count;
 778   assert (list->tail);
 779   list->tail->next = new;
 780   list->tail = new;
 781 }
 782
 783 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 784    the length of that string, LEN, if LEN is exactly one, append
 785    a newly allocated structure representing the specified
 786    equivalence class to the specification list, LIST and return zero.
 787    If LEN is not 1, issue an error message and return non-zero.  */
 788
 789 static int
 790 append_equiv_class (list, equiv_class_str, len)
 791      struct Spec_list *list;
 792      const unsigned char *equiv_class_str;
 793      size_t len;
 794 {
 795   struct List_element *new;
 796
 797   if (len != 1)
 798     {
 799       char *tmp = make_printable_str (equiv_class_str, len);
 800
 801       error (0, 0, "%s: equivalence class operand must be a single character",
 802              tmp);
 803       free (tmp);
 804       return 1;
 805     }
 806   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 807   new->next = NULL;
 808   new->type = RE_EQUIV_CLASS;
 809   new->u.equiv_code = *equiv_class_str;
 810   assert (list->tail);
 811   list->tail->next = new;
 812   list->tail = new;
 813   return 0;
 814 }
 815
 816 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
 817    The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
 818    NUL bytes, and is *not* NUL-terminated.  */
 819
 820 static unsigned char *
 821 substr (p, first_idx, last_idx)
 822      const unsigned char *p;
 823      size_t first_idx;
 824      size_t last_idx;
 825 {
 826   size_t len;
 827   unsigned char *tmp;
 828
 829   assert (first_idx <= last_idx);
 830   len = last_idx - first_idx + 1;
 831   tmp = (unsigned char *) xmalloc (len);
 832
 833   assert (first_idx <= last_idx);
 834   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 835   memcpy (tmp, p + first_idx, len);
 836   return tmp;
 837 }
 838
 839 /* Search forward starting at START_IDX for the 2-char sequence
 840    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 841    a sequence is found, return the index of the first character,
 842    otherwise return -1.  P may contain zero bytes.  */
 843
 844 static int
 845 find_closing_delim (p, start_idx, p_len, pre_bracket_char)
 846      const unsigned char *p;
 847      size_t start_idx;
 848      size_t p_len;
 849      unsigned int pre_bracket_char;
 850 {
 851   size_t i;
 852
 853   for (i = start_idx; i < p_len - 1; i++)
 854     if (p[i] == pre_bracket_char && p[i + 1] == ']')
 855       return i;
 856   return -1;
 857 }
 858
 859 /* Convert a string S with explicit length LEN, possibly
 860    containing embedded zero bytes, to a long integer value.
 861    If the string represents a negative value, a value larger
 862    than LONG_MAX, or if all LEN characters do not represent a
 863    valid integer, return non-zero and do not modify *VAL.
 864    Otherwise, return zero and set *VAL to the converted value.  */
 865
 866 static int
 867 non_neg_strtol (s, len, val)
 868      const unsigned char *s;
 869      size_t len;
 870      size_t *val;
 871 {
 872   size_t i;
 873   unsigned long sum = 0;
 874   unsigned int base;
 875
 876   if (len <= 0)
 877     return 1;
 878   if (s[0] == '0')
 879     base = 8;
 880   else if (ISDIGIT (s[0]))
 881     base = 10;
 882   else
 883     return 1;
 884
 885   for (i = 0; i < len; i++)
 886     {
 887       unsigned int c;
 888
 889       if (s[i] < '0')
 890         return 1;
 891
 892       c = s[i] - '0';
 893       if (c >= base)
 894         return 1;
 895
 896       if (sum > (LONG_MAX - c) / base)
 897         return 1;
 898       sum = sum * base + c;
 899     }
 900   *val = sum;
 901   return 0;
 902 }
 903
 904 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 905    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 906    return the character and the repeat count through the arg pointers,
 907    CHAR_TO_REPEAT and N, and then return the index of the closing
 908    bracket as the function value.  If the second character following
 909    the opening bracket is not `*' or if no closing bracket can be
 910    found, return -1.  If a closing bracket is found and the
 911    second char is `*', but the string between the `*' and `]' isn't
 912    empty, an octal number, or a decimal number, print an error message
 913    and return -2.  */
 914
 915 static int
 916 find_bracketed_repeat (p, start_idx, p_len, char_to_repeat, n)
 917      const unsigned char *p;
 918      size_t start_idx;
 919      size_t p_len;
 920      unsigned int *char_to_repeat;
 921      size_t *n;
 922 {
 923   size_t i;
 924
 925   assert (start_idx + 1 < p_len);
 926   if (p[start_idx + 1] != '*')
 927     return -1;
 928
 929   for (i = start_idx + 2; i < p_len; i++)
 930     {
 931       if (p[i] == ']')
 932         {
 933           const unsigned char *digit_str;
 934           size_t digit_str_len = i - start_idx - 2;
 935
 936           *char_to_repeat = p[start_idx];
 937           if (digit_str_len == 0)
 938             {
 939               /* We've matched [c*] -- no explicit repeat count.  */
 940               *n = 0;
 941               return i;
 942             }
 943
 944           /* Here, we have found [c*s] where s should be a string
 945              of octal or decimal digits.  */
 946           digit_str = &p[start_idx + 2];
 947           if (non_neg_strtol (digit_str, digit_str_len, n))
 948             {
 949               char *tmp = make_printable_str (digit_str, digit_str_len);
 950               error (0, 0, "invalid repeat count `%s' in [c*n] construct", tmp);
 951               free (tmp);
 952               return -2;
 953             }
 954           return i;
 955         }
 956     }
 957   return -1;                    /* No bracket found.  */
 958 }
 959
 960 /* Convert string UNESACPED_STRING (which has been preprocessed to
 961    convert backslash-escape sequences) of length LEN characters into
 962    a linked list of the following 5 types of constructs:
 963       - [:str:] Character class where `str' is one of the 12 valid strings.
 964       - [=c=] Equivalence class where `c' is any single character.
 965       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 966           However, if `n' is present, it must be a non-negative octal or
 967           decimal integer.
 968       - r-s Range of characters from `r' to `s'.  The second endpoint must
 969           not precede the first in the current collating sequence.
 970       - c Any other character is interpreted as itself.  */
 971
 972 static int
 973 build_spec_list (unescaped_string, len, result)
 974      const unsigned char *unescaped_string;
 975      size_t len;
 976      struct Spec_list *result;
 977 {
 978   const unsigned char *p;
 979   size_t i;
 980
 981   p = unescaped_string;
 982
 983   /* The main for-loop below recognizes the 4 multi-character constructs.
 984      A character that matches (in its context) none of the multi-character
 985      constructs is classified as `normal'.  Since all multi-character
 986      constructs have at least 3 characters, any strings of length 2 or
 987      less are composed solely of normal characters.  Hence, the index of
 988      the outer for-loop runs only as far as LEN-2.  */
 989
 990   for (i = 0; i + 2 < len;)
 991     {
 992       switch (p[i])
 993         {
 994           int fall_through;
 995         case '[':
 996           fall_through = 0;
 997           switch (p[i + 1])
 998             {
 999               int closing_delim_idx;
1000               int closing_bracket_idx;
1001               unsigned int char_to_repeat;
1002               size_t repeat_count;
1003             case ':':
1004             case '=':
1005               closing_delim_idx = find_closing_delim (p, i + 2, len, p[i + 1]);
1006               if (closing_delim_idx >= 0)
1007                 {
1008                   int parse_failed;
1009                   unsigned char *opnd_str = substr (p, i + 2,
1010                                                     closing_delim_idx - 1);
1011                   if (p[i + 1] == ':')
1012                     parse_failed = append_char_class (result, opnd_str,
1013                                      (closing_delim_idx - 1) - (i + 2) + 1);
1014                   else
1015                     parse_failed = append_equiv_class (result, opnd_str,
1016                                      (closing_delim_idx - 1) - (i + 2) + 1);
1017                   free (opnd_str);
1018
1019                   /* Return non-zero if append_*_class reports a problem.  */
1020                   if (parse_failed)
1021                     return 1;
1022                   else
1023                     i = closing_delim_idx + 2;
1024                   break;
1025                 }
1026               /* Else fall through.  This could be [:*] or [=*].  */
1027             default:
1028               /* Determine whether this is a bracketed repeat range
1029                  matching the RE \[.\*(dec_or_oct_number)?\].  */
1030               closing_bracket_idx = find_bracketed_repeat (p, i + 1,
1031                                        len, &char_to_repeat, &repeat_count);
1032               if (closing_bracket_idx >= 0)
1033                 {
1034                   append_repeated_char (result, char_to_repeat, repeat_count);
1035                   i = closing_bracket_idx + 1;
1036                   break;
1037                 }
1038               else if (closing_bracket_idx == -1)
1039                 {
1040                   fall_through = 1;
1041                 }
1042               else
1043                 /* Found a string that looked like [c*n] but the
1044                    numeric part was invalid.  */
1045                 return 1;
1046               break;
1047             }
1048           if (!fall_through)
1049             break;
1050
1051           /* Here if we've tried to match [c*n], [:str:], and [=c=]
1052              and none of them fit.  So we still have to consider the
1053              range `[-c' (from `[' to `c').  */
1054         default:
1055           /* Look ahead one char for ranges like a-z.  */
1056           if (p[i + 1] == '-')
1057             {
1058               if (append_range (result, p[i], p[i + 2]))
1059                 return 1;
1060               i += 3;
1061             }
1062           else
1063             {
1064               append_normal_char (result, p[i]);
1065               ++i;
1066             }
1067           break;
1068         }
1069     }
1070
1071   /* Now handle the (2 or fewer) remaining characters p[i]..p[len - 1].  */
1072   for (; i < len; i++)
1073     append_normal_char (result, p[i]);
1074
1075   return 0;
1076 }
1077
1078 /* Given a Spec_list S (with its saved state implicit in the values
1079    of its members `tail' and `state'), return the next single character
1080    in the expansion of S's constructs.  If the last character of S was
1081    returned on the previous call or if S was empty, this function
1082    returns -1.  For example, successive calls to get_next where S
1083    represents the spec-string 'a-d[y*3]' will return the sequence
1084    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1085    which the returned character comes is [:upper:] or [:lower:], the
1086    parameter CLASS is given a value to indicate which it was.  Otherwise
1087    CLASS is set to UL_NONE.  This value is used only when constructing
1088    the translation table to verify that any occurrences of upper and
1089    lower class constructs in the spec-strings appear in the same relative
1090    positions.  */
1091
1092 static int
1093 get_next (s, class)
1094      struct Spec_list *s;
1095      enum Upper_Lower_class *class;
1096 {
1097   struct List_element *p;
1098   int return_val;
1099   int i;
1100
1101   if (class)
1102     *class = UL_NONE;
1103
1104   if (s->state == BEGIN_STATE)
1105     {
1106       s->tail = s->head->next;
1107       s->state = NEW_ELEMENT;
1108     }
1109
1110   p = s->tail;
1111   if (p == NULL)
1112     return -1;
1113
1114   switch (p->type)
1115     {
1116     case RE_NORMAL_CHAR:
1117       return_val = p->u.normal_char;
1118       s->state = NEW_ELEMENT;
1119       s->tail = p->next;
1120       break;
1121
1122     case RE_RANGE:
1123       if (s->state == NEW_ELEMENT)
1124         s->state = ORD (p->u.range.first_char);
1125       else
1126         ++(s->state);
1127       return_val = CHR (s->state);
1128       if (s->state == ORD (p->u.range.last_char))
1129         {
1130           s->tail = p->next;
1131           s->state = NEW_ELEMENT;
1132         }
1133       break;
1134
1135     case RE_CHAR_CLASS:
1136       if (s->state == NEW_ELEMENT)
1137         {
1138           for (i = 0; i < N_CHARS; i++)
1139             if (is_char_class_member (p->u.char_class, i))
1140               break;
1141           assert (i < N_CHARS);
1142           s->state = i;
1143         }
1144       assert (is_char_class_member (p->u.char_class, s->state));
1145       return_val = CHR (s->state);
1146       for (i = s->state + 1; i < N_CHARS; i++)
1147         if (is_char_class_member (p->u.char_class, i))
1148           break;
1149       if (i < N_CHARS)
1150         s->state = i;
1151       else
1152         {
1153           s->tail = p->next;
1154           s->state = NEW_ELEMENT;
1155         }
1156       if (class)
1157         {
1158           switch (p->u.char_class)
1159             {
1160             case CC_LOWER:
1161               *class = UL_LOWER;
1162               break;
1163             case CC_UPPER:
1164               *class = UL_UPPER;
1165               break;
1166             default:
1167               /* empty */
1168               break;
1169             }
1170         }
1171       break;
1172
1173     case RE_EQUIV_CLASS:
1174       /* FIXME: this assumes that each character is alone in its own
1175          equivalence class (which appears to be correct for my
1176          LC_COLLATE.  But I don't know of any function that allows
1177          one to determine a character's equivalence class.  */
1178
1179       return_val = p->u.equiv_code;
1180       s->state = NEW_ELEMENT;
1181       s->tail = p->next;
1182       break;
1183
1184     case RE_REPEATED_CHAR:
1185       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1186       if (p->u.repeated_char.repeat_count == 0)
1187         {
1188           s->tail = p->next;
1189           s->state = NEW_ELEMENT;
1190           return_val = get_next (s, class);
1191         }
1192       else
1193         {
1194           if (s->state == NEW_ELEMENT)
1195             {
1196               s->state = 0;
1197             }
1198           ++(s->state);
1199           return_val = p->u.repeated_char.the_repeated_char;
1200           if (p->u.repeated_char.repeat_count > 0
1201               && s->state == p->u.repeated_char.repeat_count)
1202             {
1203               s->tail = p->next;
1204               s->state = NEW_ELEMENT;
1205             }
1206         }
1207       break;
1208
1209     case RE_NO_TYPE:
1210       abort ();
1211       break;
1212
1213     default:
1214       abort ();
1215       break;
1216     }
1217
1218   return return_val;
1219 }
1220
1221 /* This is a minor kludge.  This function is called from
1222    get_spec_stats to determine the cardinality of a set derived
1223    from a complemented string.  It's a kludge in that some of the
1224    same operations are (duplicated) performed in set_initialize.  */
1225
1226 static int
1227 card_of_complement (s)
1228      struct Spec_list *s;
1229 {
1230   int c;
1231   int cardinality = N_CHARS;
1232   SET_TYPE in_set[N_CHARS];
1233
1234   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1235   s->state = BEGIN_STATE;
1236   while ((c = get_next (s, NULL)) != -1)
1237     if (!in_set[c]++)
1238       --cardinality;
1239   return cardinality;
1240 }
1241
1242 /* Gather statistics about the spec-list S in preparation for the tests
1243    in validate that determine the consistency of the specs.  This function
1244    is called at most twice; once for string1, and again for any string2.
1245    LEN_S1 < 0 indicates that this is the first call and that S represents
1246    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1247    constructs in string1, and we can use its value to resolve any
1248    indefinite repeat construct in S (which represents string2).  Hence,
1249    this function has the side-effect that it converts a valid [c*]
1250    construct in string2 to [c*n] where n is large enough (or 0) to give
1251    string2 the same length as string1.  For example, with the command
1252    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1253    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1254
1255 static void
1256 get_spec_stats (s)
1257      struct Spec_list *s;
1258 {
1259   struct List_element *p;
1260   int len = 0;
1261
1262   s->n_indefinite_repeats = 0;
1263   s->has_equiv_class = 0;
1264   s->has_restricted_char_class = 0;
1265   s->has_upper_or_lower = 0;
1266   for (p = s->head->next; p; p = p->next)
1267     {
1268       switch (p->type)
1269         {
1270           int i;
1271         case RE_NORMAL_CHAR:
1272           ++len;
1273           break;
1274
1275         case RE_RANGE:
1276           assert (p->u.range.last_char >= p->u.range.first_char);
1277           len += p->u.range.last_char - p->u.range.first_char + 1;
1278           break;
1279
1280         case RE_CHAR_CLASS:
1281           for (i = 0; i < N_CHARS; i++)
1282             if (is_char_class_member (p->u.char_class, i))
1283               ++len;
1284           switch (p->u.char_class)
1285             {
1286             case CC_UPPER:
1287             case CC_LOWER:
1288               s->has_upper_or_lower = 1;
1289               break;
1290             default:
1291               s->has_restricted_char_class = 1;
1292               break;
1293             }
1294           break;
1295
1296         case RE_EQUIV_CLASS:
1297           for (i = 0; i < N_CHARS; i++)
1298             if (is_equiv_class_member (p->u.equiv_code, i))
1299               ++len;
1300           s->has_equiv_class = 1;
1301           break;
1302
1303         case RE_REPEATED_CHAR:
1304           if (p->u.repeated_char.repeat_count > 0)
1305             len += p->u.repeated_char.repeat_count;
1306           else if (p->u.repeated_char.repeat_count == 0)
1307             {
1308               s->indefinite_repeat_element = p;
1309               ++(s->n_indefinite_repeats);
1310             }
1311           break;
1312
1313         case RE_NO_TYPE:
1314           assert (0);
1315           break;
1316         }
1317     }
1318
1319   s->length = len;
1320 }
1321
1322 static void
1323 get_s1_spec_stats (s1)
1324      struct Spec_list *s1;
1325 {
1326   get_spec_stats (s1);
1327   if (complement)
1328     s1->length = card_of_complement (s1);
1329 }
1330
1331 static void
1332 get_s2_spec_stats (s2, len_s1)
1333      struct Spec_list *s2;
1334      size_t len_s1;
1335 {
1336   get_spec_stats (s2);
1337   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1338     {
1339       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1340         len_s1 - s2->length;
1341       s2->length = len_s1;
1342     }
1343 }
1344
1345 static void
1346 spec_init (spec_list)
1347      struct Spec_list *spec_list;
1348 {
1349   spec_list->head = spec_list->tail =
1350     (struct List_element *) xmalloc (sizeof (struct List_element));
1351   spec_list->head->next = NULL;
1352 }
1353
1354 /* This function makes two passes over the argument string S.  The first
1355    one converts all \c and \ddd escapes to their one-byte representations.
1356    The second constructs a linked specification list, SPEC_LIST, of the
1357    characters and constructs that comprise the argument string.  If either
1358    of these passes detects an error, this function returns non-zero.  */
1359
1360 static int
1361 parse_str (s, spec_list)
1362      unsigned char *s;
1363      struct Spec_list *spec_list;
1364 {
1365   size_t len;
1366
1367   if (unquote (s, &len))
1368     return 1;
1369   if (build_spec_list (s, len, spec_list))
1370     return 1;
1371   return 0;
1372 }
1373
1374 /* Given two specification lists, S1 and S2, and assuming that
1375    S1->length > S2->length, append a single [c*n] element to S2 where c
1376    is the last character in the expansion of S2 and n is the difference
1377    between the two lengths.
1378    Upon successful completion, S2->length is set to S1->length.  The only
1379    way this function can fail to make S2 as long as S1 is when S2 has
1380    zero-length, since in that case, there is no last character to repeat.
1381    So S2->length is required to be at least 1.
1382
1383    Providing this functionality allows the user to do some pretty
1384    non-BSD (and non-portable) things:  For example, the command
1385        tr -cs '[:upper:]0-9' '[:lower:]'
1386    is almost guaranteed to give results that depend on your collating
1387    sequence.  */
1388
1389 static void
1390 string2_extend (s1, s2)
1391      const struct Spec_list *s1;
1392      struct Spec_list *s2;
1393 {
1394   struct List_element *p;
1395   int char_to_repeat;
1396   int i;
1397
1398   assert (translating);
1399   assert (s1->length > s2->length);
1400   assert (s2->length > 0);
1401
1402   p = s2->tail;
1403   switch (p->type)
1404     {
1405     case RE_NORMAL_CHAR:
1406       char_to_repeat = p->u.normal_char;
1407       break;
1408     case RE_RANGE:
1409       char_to_repeat = p->u.range.last_char;
1410       break;
1411     case RE_CHAR_CLASS:
1412       for (i = N_CHARS; i >= 0; i--)
1413         if (is_char_class_member (p->u.char_class, i))
1414           break;
1415       assert (i >= 0);
1416       char_to_repeat = CHR (i);
1417       break;
1418
1419     case RE_REPEATED_CHAR:
1420       char_to_repeat = p->u.repeated_char.the_repeated_char;
1421       break;
1422
1423     case RE_EQUIV_CLASS:
1424       /* This shouldn't happen, because validate exits with an error
1425          if it finds an equiv class in string2 when translating.  */
1426       abort ();
1427       break;
1428
1429     case RE_NO_TYPE:
1430       abort ();
1431       break;
1432
1433     default:
1434       abort ();
1435       break;
1436     }
1437
1438   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1439   s2->length = s1->length;
1440 }
1441
1442 /* Die with an error message if S1 and S2 describe strings that
1443    are not valid with the given command line switches.
1444    A side effect of this function is that if a valid [c*] or
1445    [c*0] construct appears in string2, it is converted to [c*n]
1446    with a value for n that makes s2->length == s1->length.  By
1447    the same token, if the --truncate-set1 option is not
1448    given, S2 may be extended.  */
1449
1450 static void
1451 validate (s1, s2)
1452      const struct Spec_list *s1;
1453      struct Spec_list *s2;
1454 {
1455   get_s1_spec_stats (s1);
1456   if (s1->n_indefinite_repeats > 0)
1457     {
1458       error (1, 0, "the [c*] repeat construct may not appear in string1");
1459     }
1460
1461   /* FIXME: it isn't clear from the POSIX spec that this is invalid,
1462      but in the spirit of the other restrictions put on translation
1463      with character classes, this seems a logical interpretation.  */
1464   if (complement && s1->has_upper_or_lower)
1465     {
1466       error (1, 0,
1467              "character classes may not be used when translating \
1468 and complementing");
1469     }
1470
1471   if (s2)
1472     {
1473       get_s2_spec_stats (s2, s1->length);
1474       if (s2->has_restricted_char_class)
1475         {
1476           error (1, 0,
1477                  "when translating, the only character classes that may \
1478 appear in\n\tstring2 are `upper' and `lower'");
1479         }
1480
1481       if (s2->n_indefinite_repeats > 1)
1482         {
1483           error (1, 0, "only one [c*] repeat construct may appear in string2");
1484         }
1485
1486       if (translating)
1487         {
1488           if (s2->has_equiv_class)
1489             {
1490               error (1, 0,
1491                      "[=c=] expressions may not appear in string2 \
1492 when translating");
1493             }
1494
1495           if (s1->length > s2->length)
1496             {
1497               if (!truncate_set1)
1498                 {
1499                   /* string2 must be non-empty unless --truncate-set1 is
1500                      given or string1 is empty.  */
1501
1502                   if (s2->length == 0)
1503                     error (1, 0,
1504                      "when not truncating set1, string2 must be non-empty");
1505                   string2_extend (s1, s2);
1506                 }
1507             }
1508
1509           if (complement && s2->has_upper_or_lower)
1510             error (1, 0,
1511                    "character classes may not be used when translating \
1512 and complementing");
1513         }
1514       else
1515         /* Not translating.  */
1516         {
1517           if (s2->n_indefinite_repeats > 0)
1518             error (1, 0,
1519                    "the [c*] construct may appear in string2 only \
1520 when translating");
1521         }
1522     }
1523 }
1524
1525 /* Read buffers of SIZE bytes via the function READER (if READER is
1526    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1527    read_and_delete or read_and_xlate.  After each buffer is read, it is
1528    processed and written to stdout.  The buffers are processed so that
1529    multiple consecutive occurrences of the same character in the input
1530    stream are replaced by a single occurrence of that character if the
1531    character is in the squeeze set.  */
1532
1533 static void
1534 squeeze_filter (buf, size, reader)
1535      unsigned char *buf;
1536      long int size;
1537      PFI reader;
1538 {
1539   unsigned int char_to_squeeze = NOT_A_CHAR;
1540   int i = 0;
1541   int nr = 0;
1542
1543   for (;;)
1544     {
1545       int begin;
1546
1547       if (i >= nr)
1548         {
1549           if (reader == NULL)
1550             nr = safe_read (0, (char *) buf, size);
1551           else
1552             nr = (*reader) (buf, size, NULL);
1553
1554           if (nr < 0)
1555             error (1, errno, "read error");
1556           if (nr == 0)
1557             break;
1558           i = 0;
1559         }
1560
1561       begin = i;
1562
1563       if (char_to_squeeze == NOT_A_CHAR)
1564         {
1565           int out_len;
1566           /* Here, by being a little tricky, we can get a significant
1567              performance increase in most cases when the input is
1568              reasonably large.  Since tr will modify the input only
1569              if two consecutive (and identical) input characters are
1570              in the squeeze set, we can step by two through the data
1571              when searching for a character in the squeeze set.  This
1572              means there may be a little more work in a few cases and
1573              perhaps twice as much work in the worst cases where most
1574              of the input is removed by squeezing repeats.  But most
1575              uses of this functionality seem to remove less than 20-30%
1576              of the input.  */
1577           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1578             ;                   /* empty */
1579
1580           /* There is a special case when i == nr and we've just
1581              skipped a character (the last one in buf) that is in
1582              the squeeze set.  */
1583           if (i == nr && in_squeeze_set[buf[i - 1]])
1584             --i;
1585
1586           if (i >= nr)
1587             out_len = nr - begin;
1588           else
1589             {
1590               char_to_squeeze = buf[i];
1591               /* We're about to output buf[begin..i].  */
1592               out_len = i - begin + 1;
1593
1594               /* But since we stepped by 2 in the loop above,
1595                  out_len may be one too large.  */
1596               if (i > 0 && buf[i - 1] == char_to_squeeze)
1597                 --out_len;
1598
1599               /* Advance i to the index of first character to be
1600                  considered when looking for a char different from
1601                  char_to_squeeze.  */
1602               ++i;
1603             }
1604           if (out_len > 0
1605               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1606             error (1, errno, "write error");
1607         }
1608
1609       if (char_to_squeeze != NOT_A_CHAR)
1610         {
1611           /* Advance i to index of first char != char_to_squeeze
1612              (or to nr if all the rest of the characters in this
1613              buffer are the same as char_to_squeeze).  */
1614           for (; i < nr && buf[i] == char_to_squeeze; i++)
1615             ;                   /* empty */
1616           if (i < nr)
1617             char_to_squeeze = NOT_A_CHAR;
1618           /* If (i >= nr) we've squeezed the last character in this buffer.
1619              So now we have to read a new buffer and continue comparing
1620              characters against char_to_squeeze.  */
1621         }
1622     }
1623 }
1624
1625 /* Read buffers of SIZE bytes from stdin until one is found that
1626    contains at least one character not in the delete set.  Store
1627    in the array BUF, all characters from that buffer that are not
1628    in the delete set, and return the number of characters saved
1629    or 0 upon EOF.  */
1630
1631 static long
1632 read_and_delete (buf, size, not_used)
1633      unsigned char *buf;
1634      long int size;
1635      PFI not_used;
1636 {
1637   long n_saved;
1638   static int hit_eof = 0;
1639
1640   assert (not_used == NULL);
1641   assert (size > 0);
1642
1643   if (hit_eof)
1644     return 0;
1645
1646   /* This enclosing do-while loop is to make sure that
1647      we don't return zero (indicating EOF) when we've
1648      just deleted all the characters in a buffer.  */
1649   do
1650     {
1651       int i;
1652       int nr = safe_read (0, (char *) buf, size);
1653
1654       if (nr < 0)
1655         error (1, errno, "read error");
1656       if (nr == 0)
1657         {
1658           hit_eof = 1;
1659           return 0;
1660         }
1661
1662       /* This first loop may be a waste of code, but gives much
1663          better performance when no characters are deleted in
1664          the beginning of a buffer.  It just avoids the copying
1665          of buf[i] into buf[n_saved] when it would be a NOP.  */
1666
1667       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1668         /* empty */ ;
1669       n_saved = i;
1670
1671       for (++i; i < nr; i++)
1672         if (!in_delete_set[buf[i]])
1673           buf[n_saved++] = buf[i];
1674     }
1675   while (n_saved == 0);
1676
1677   return n_saved;
1678 }
1679
1680 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1681    perform the in-place and one-to-one mapping specified by the global
1682    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1683
1684 static long
1685 read_and_xlate (buf, size, not_used)
1686      unsigned char *buf;
1687      long int size;
1688      PFI not_used;
1689 {
1690   long chars_read = 0;
1691   static int hit_eof = 0;
1692   int i;
1693
1694   assert (not_used == NULL);
1695   assert (size > 0);
1696
1697   if (hit_eof)
1698     return 0;
1699
1700   chars_read = safe_read (0, (char *) buf, size);
1701   if (chars_read < 0)
1702     error (1, errno, "read error");
1703   if (chars_read == 0)
1704     {
1705       hit_eof = 1;
1706       return 0;
1707     }
1708
1709   for (i = 0; i < chars_read; i++)
1710     buf[i] = xlate[buf[i]];
1711
1712   return chars_read;
1713 }
1714
1715 /* Initialize a boolean membership set IN_SET with the character
1716    values obtained by traversing the linked list of constructs S
1717    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1718    non-zero the resulting set is complemented.  */
1719
1720 static void
1721 set_initialize (s, complement_this_set, in_set)
1722      struct Spec_list *s;
1723      int complement_this_set;
1724      SET_TYPE *in_set;
1725 {
1726   int c;
1727   int i;
1728
1729   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1730   s->state = BEGIN_STATE;
1731   while ((c = get_next (s, NULL)) != -1)
1732     in_set[c] = 1;
1733   if (complement_this_set)
1734     for (i = 0; i < N_CHARS; i++)
1735       in_set[i] = (!in_set[i]);
1736 }
1737
1738 void
1739 main (argc, argv)
1740      int argc;
1741      char **argv;
1742 {
1743   int c;
1744   int non_option_args;
1745   struct Spec_list buf1, buf2;
1746   struct Spec_list *s1 = &buf1;
1747   struct Spec_list *s2 = &buf2;
1748
1749   program_name = argv[0];
1750
1751   while ((c = getopt_long (argc, argv, "cdst", long_options,
1752                            (int *) 0)) != EOF)
1753     {
1754       switch (c)
1755         {
1756         case 0:
1757           break;
1758
1759         case 'c':
1760           complement = 1;
1761           break;
1762
1763         case 'd':
1764           delete = 1;
1765           break;
1766
1767         case 's':
1768           squeeze_repeats = 1;
1769           break;
1770
1771         case 't':
1772           truncate_set1 = 1;
1773           break;
1774
1775         default:
1776           usage (2);
1777           break;
1778         }
1779     }
1780
1781   if (show_version)
1782     {
1783       printf ("tr - %s\n", version_string);
1784       exit (0);
1785     }
1786
1787   if (show_help)
1788     usage (0);
1789
1790   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1791
1792   non_option_args = argc - optind;
1793   translating = (non_option_args == 2 && !delete);
1794
1795   /* Change this test if it is valid to give tr no options and
1796      no args at all.  POSIX doesn't specifically say anything
1797      either way, but it looks like they implied it's invalid
1798      by omission.  If you want to make tr do a slow imitation
1799      of `cat' use `tr a a'.  */
1800   if (non_option_args > 2)
1801     {
1802       error (0, 0, "too many arguments");
1803       usage (2);
1804     }
1805
1806   if (!delete && !squeeze_repeats && non_option_args != 2)
1807     error (1, 0, "two strings must be given when translating");
1808
1809   if (delete && squeeze_repeats && non_option_args != 2)
1810     error (1, 0, "two strings must be given when both \
1811 deleting and squeezing repeats");
1812
1813   /* If --delete is given without --squeeze-repeats, then
1814      only one string argument may be specified.  But POSIX
1815      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1816      is set, pretend we never saw string2.  But I think
1817      this deserves a fatal error, so that's the default.  */
1818   if ((delete && !squeeze_repeats) && non_option_args != 1)
1819     {
1820       if (posix_pedantic && non_option_args == 2)
1821         --non_option_args;
1822       else
1823         error (1, 0,
1824                "only one string may be given when deleting \
1825 without squeezing repeats");
1826     }
1827
1828   if (squeeze_repeats && non_option_args == 0)
1829     error (1, 0, "at least one string must be given when squeezing repeats");
1830
1831   spec_init (s1);
1832   if (parse_str ((unsigned char *) argv[optind], s1))
1833     exit (1);
1834
1835   if (non_option_args == 2)
1836     {
1837       spec_init (s2);
1838       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1839         exit (1);
1840     }
1841   else
1842     s2 = NULL;
1843
1844   validate (s1, s2);
1845
1846   if (squeeze_repeats && non_option_args == 1)
1847     {
1848       set_initialize (s1, complement, in_squeeze_set);
1849       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1850     }
1851   else if (delete && non_option_args == 1)
1852     {
1853       int nr;
1854
1855       set_initialize (s1, complement, in_delete_set);
1856       do
1857         {
1858           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1859           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1860             error (1, errno, "write error");
1861         }
1862       while (nr > 0);
1863     }
1864   else if (squeeze_repeats && delete && non_option_args == 2)
1865     {
1866       set_initialize (s1, complement, in_delete_set);
1867       set_initialize (s2, 0, in_squeeze_set);
1868       squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1869     }
1870   else if (translating)
1871     {
1872       if (complement)
1873         {
1874           int i;
1875           SET_TYPE *in_s1 = in_delete_set;
1876
1877           set_initialize (s1, 0, in_s1);
1878           s2->state = BEGIN_STATE;
1879           for (i = 0; i < N_CHARS; i++)
1880             xlate[i] = i;
1881           for (i = 0; i < N_CHARS; i++)
1882             {
1883               if (!in_s1[i])
1884                 {
1885                   int ch = get_next (s2, NULL);
1886                   assert (ch != -1 || truncate_set1);
1887                   if (ch == -1)
1888                     {
1889                       /* This will happen when tr is invoked like e.g.
1890                          tr -cs A-Za-z0-9 '\012'.  */
1891                       break;
1892                     }
1893                   xlate[i] = ch;
1894                 }
1895             }
1896           assert (get_next (s2, NULL) == -1 || truncate_set1);
1897         }
1898       else
1899         {
1900           int c1, c2;
1901           int i;
1902           enum Upper_Lower_class class_s1;
1903           enum Upper_Lower_class class_s2;
1904
1905           for (i = 0; i < N_CHARS; i++)
1906             xlate[i] = i;
1907           s1->state = BEGIN_STATE;
1908           s2->state = BEGIN_STATE;
1909           for (;;)
1910             {
1911               c1 = get_next (s1, &class_s1);
1912               c2 = get_next (s2, &class_s2);
1913               if (!class_ok[(int) class_s1][(int) class_s2])
1914                 error (1, 0,
1915                      "misaligned or mismatched upper and/or lower classes");
1916               /* The following should have been checked by validate...  */
1917               if (c2 == -1)
1918                 break;
1919               xlate[c1] = c2;
1920             }
1921           assert (c1 == -1 || truncate_set1);
1922         }
1923       if (squeeze_repeats)
1924         {
1925           set_initialize (s2, 0, in_squeeze_set);
1926           squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
1927         }
1928       else
1929         {
1930           int chars_read;
1931
1932           do
1933             {
1934               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
1935               if (chars_read > 0
1936                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
1937                 error (1, errno, "write error");
1938             }
1939           while (chars_read > 0);
1940         }
1941     }
1942
1943   if (fclose (stdout) == EOF)
1944     error (2, errno, "write error");
1945
1946   if (close (0) != 0)
1947     error (2, errno, "standard input");
1948
1949   exit (0);
1950 }