src/tr.c

   1 /* tr -- a filter to translate characters
   2    Copyright (C) 91, 1995-2002 Free Software Foundation, Inc.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation; either version 2, or (at your option)
   7    any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, write to the Free Software Foundation,
  16    Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  17
  18 /* Written by Jim Meyering */
  19
  20 #include <config.h>
  21
  22 #include <stdio.h>
  23 #include <assert.h>
  24 #include <errno.h>
  25 #include <sys/types.h>
  26 #include <getopt.h>
  27
  28 #include "system.h"
  29 #include "closeout.h"
  30 #include "error.h"
  31 #include "safe-read.h"
  32 #include "xstrtol.h"
  33
  34 /* The official name of this program (e.g., no `g' prefix).  */
  35 #define PROGRAM_NAME "tr"
  36
  37 #define AUTHORS "Jim Meyering"
  38
  39 #define N_CHARS (UCHAR_MAX + 1)
  40
  41 /* A pointer to a filtering function.  */
  42 typedef size_t (*Filter) (/* unsigned char *, size_t, Filter */);
  43
  44 /* Convert from character C to its index in the collating
  45    sequence array.  Just cast to an unsigned int to avoid
  46    problems with sign-extension.  */
  47 #define ORD(c) (unsigned int)(c)
  48
  49 /* The inverse of ORD.  */
  50 #define CHR(i) (unsigned char)(i)
  51
  52 /* The value for Spec_list->state that indicates to
  53    get_next that it should initialize the tail pointer.
  54    Its value should be as large as possible to avoid conflict
  55    a valid value for the state field -- and that may be as
  56    large as any valid repeat_count.  */
  57 #define BEGIN_STATE (INT_MAX - 1)
  58
  59 /* The value for Spec_list->state that indicates to
  60    get_next that the element pointed to by Spec_list->tail is
  61    being considered for the first time on this pass through the
  62    list -- it indicates that get_next should make any necessary
  63    initializations.  */
  64 #define NEW_ELEMENT (BEGIN_STATE + 1)
  65
  66 /* A value distinct from any character that may have been stored in a
  67    buffer as the result of a block-read in the function squeeze_filter.  */
  68 #define NOT_A_CHAR (unsigned int)(-1)
  69
  70 /* The following (but not CC_NO_CLASS) are indices into the array of
  71    valid character class strings.  */
  72 enum Char_class
  73   {
  74     CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
  75     CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
  76     CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
  77     CC_NO_CLASS = 9999
  78   };
  79
  80 /* Character class to which a character (returned by get_next) belonged;
  81    but it is set only if the construct from which the character was obtained
  82    was one of the character classes [:upper:] or [:lower:].  The value
  83    is used only when translating and then, only to make sure that upper
  84    and lower class constructs have the same relative positions in string1
  85    and string2.  */
  86 enum Upper_Lower_class
  87   {
  88     UL_LOWER = 0,
  89     UL_UPPER = 1,
  90     UL_NONE = 2
  91   };
  92
  93 /* A shortcut to ensure that when constructing the translation array,
  94    one of the values returned by paired calls to get_next (from s1 and s2)
  95    is from [:upper:] and the other is from [:lower:], or neither is from
  96    upper or lower.  By default, GNU tr permits the identity mappings: from
  97    [:upper:] to [:upper:] and [:lower:] to [:lower:].  But when
  98    POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
  99    by values of type enum Upper_Lower_class.  */
 100 static int const class_ok[3][3] =
 101 {
 102   {1, 1, 0},
 103   {1, 1, 0},
 104   {0, 0, 1}
 105 };
 106
 107 /* The type of a List_element.  See build_spec_list for more details.  */
 108 enum Range_element_type
 109   {
 110     RE_NO_TYPE = 0,
 111     RE_NORMAL_CHAR,
 112     RE_RANGE,
 113     RE_CHAR_CLASS,
 114     RE_EQUIV_CLASS,
 115     RE_REPEATED_CHAR
 116   };
 117
 118 /* One construct in one of tr's argument strings.
 119    For example, consider the POSIX version of the classic tr command:
 120        tr -cs 'a-zA-Z_' '[\n*]'
 121    String1 has 3 constructs, two of which are ranges (a-z and A-Z),
 122    and a single normal character, `_'.  String2 has one construct.  */
 123 struct List_element
 124   {
 125     enum Range_element_type type;
 126     struct List_element *next;
 127     union
 128       {
 129         int normal_char;
 130         struct                  /* unnamed */
 131           {
 132             unsigned int first_char;
 133             unsigned int last_char;
 134           }
 135         range;
 136         enum Char_class char_class;
 137         int equiv_code;
 138         struct                  /* unnamed */
 139           {
 140             unsigned int the_repeated_char;
 141             size_t repeat_count;
 142           }
 143         repeated_char;
 144       }
 145     u;
 146   };
 147
 148 /* Each of tr's argument strings is parsed into a form that is easier
 149    to work with: a linked list of constructs (struct List_element).
 150    Each Spec_list structure also encapsulates various attributes of
 151    the corresponding argument string.  The attributes are used mainly
 152    to verify that the strings are valid in the context of any options
 153    specified (like -s, -d, or -c).  The main exception is the member
 154    `tail', which is first used to construct the list.  After construction,
 155    it is used by get_next to save its state when traversing the list.
 156    The member `state' serves a similar function.  */
 157 struct Spec_list
 158   {
 159     /* Points to the head of the list of range elements.
 160        The first struct is a dummy; its members are never used.  */
 161     struct List_element *head;
 162
 163     /* When appending, points to the last element.  When traversing via
 164        get_next(), points to the element to process next.  Setting
 165        Spec_list.state to the value BEGIN_STATE before calling get_next
 166        signals get_next to initialize tail to point to head->next.  */
 167     struct List_element *tail;
 168
 169     /* Used to save state between calls to get_next().  */
 170     unsigned int state;
 171
 172     /* Length, in the sense that length ('a-z[:digit:]123abc')
 173        is 42 ( = 26 + 10 + 6).  */
 174     size_t length;
 175
 176     /* The number of [c*] and [c*0] constructs that appear in this spec.  */
 177     int n_indefinite_repeats;
 178
 179     /* If n_indefinite_repeats is nonzero, this points to the List_element
 180        corresponding to the last [c*] or [c*0] construct encountered in
 181        this spec.  Otherwise it is undefined.  */
 182     struct List_element *indefinite_repeat_element;
 183
 184     /* Non-zero if this spec contains at least one equivalence
 185        class construct e.g. [=c=].  */
 186     int has_equiv_class;
 187
 188     /* Non-zero if this spec contains at least one character class
 189        construct.  E.g. [:digit:].  */
 190     int has_char_class;
 191
 192     /* Non-zero if this spec contains at least one of the character class
 193        constructs (all but upper and lower) that aren't allowed in s2.  */
 194     int has_restricted_char_class;
 195   };
 196
 197 /* A representation for escaped string1 or string2.  As a string is parsed,
 198    any backslash-escaped characters (other than octal or \a, \b, \f, \n,
 199    etc.) are marked as such in this structure by setting the corresponding
 200    entry in the ESCAPED vector.  */
 201 struct E_string
 202 {
 203   unsigned char *s;
 204   int *escaped;
 205   size_t len;
 206 };
 207
 208 /* Return nonzero if the Ith character of escaped string ES matches C
 209    and is not escaped itself.  */
 210 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
 211
 212 /* The name by which this program was run.  */
 213 char *program_name;
 214
 215 /* When nonzero, each sequence in the input of a repeated character
 216    (call it c) is replaced (in the output) by a single occurrence of c
 217    for every c in the squeeze set.  */
 218 static int squeeze_repeats = 0;
 219
 220 /* When nonzero, removes characters in the delete set from input.  */
 221 static int delete = 0;
 222
 223 /* Use the complement of set1 in place of set1.  */
 224 static int complement = 0;
 225
 226 /* When nonzero, this flag causes GNU tr to provide strict
 227    compliance with POSIX draft 1003.2.11.2.  The POSIX spec
 228    says that when -d is used without -s, string2 (if present)
 229    must be ignored.  Silently ignoring arguments is a bad idea.
 230    The default GNU behavior is to give a usage message and exit.
 231    Additionally, when this flag is nonzero, tr prints warnings
 232    on stderr if it is being used in a manner that is not portable.
 233    Applicable warnings are given by default, but are suppressed
 234    if the environment variable `POSIXLY_CORRECT' is set, since
 235    being POSIX conformant means we can't issue such messages.
 236    Warnings on the following topics are suppressed when this
 237    variable is nonzero:
 238    1. Ambiguous octal escapes.  */
 239 static int posix_pedantic;
 240
 241 /* When tr is performing translation and string1 is longer than string2,
 242    POSIX says that the result is undefined.  That gives the implementor
 243    of a POSIX conforming version of tr two reasonable choices for the
 244    semantics of this case.
 245
 246    * The BSD tr pads string2 to the length of string1 by
 247    repeating the last character in string2.
 248
 249    * System V tr ignores characters in string1 that have no
 250    corresponding character in string2.  That is, string1 is effectively
 251    truncated to the length of string2.
 252
 253    When nonzero, this flag causes GNU tr to imitate the behavior
 254    of System V tr when translating with string1 longer than string2.
 255    The default is to emulate BSD tr.  This flag is ignored in modes where
 256    no translation is performed.  Emulating the System V tr
 257    in this exceptional case causes the relatively common BSD idiom:
 258
 259        tr -cs A-Za-z0-9 '\012'
 260
 261    to break (it would convert only zero bytes, rather than all
 262    non-alphanumerics, to newlines).
 263
 264    WARNING: This switch does not provide general BSD or System V
 265    compatibility.  For example, it doesn't disable the interpretation
 266    of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
 267    some unfortunate coincidence you use such constructs in scripts
 268    expecting to use some other version of tr, the scripts will break.  */
 269 static int truncate_set1 = 0;
 270
 271 /* An alias for (!delete && non_option_args == 2).
 272    It is set in main and used there and in validate().  */
 273 static int translating;
 274
 275 #ifndef BUFSIZ
 276 # define BUFSIZ 8192
 277 #endif
 278
 279 #define IO_BUF_SIZE BUFSIZ
 280 static unsigned char io_buf[IO_BUF_SIZE];
 281
 282 static char const *const char_class_name[] =
 283 {
 284   "alnum", "alpha", "blank", "cntrl", "digit", "graph",
 285   "lower", "print", "punct", "space", "upper", "xdigit"
 286 };
 287 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
 288
 289 typedef char SET_TYPE;
 290
 291 /* Array of boolean values.  A character `c' is a member of the
 292    squeeze set if and only if in_squeeze_set[c] is true.  The squeeze
 293    set is defined by the last (possibly, the only) string argument
 294    on the command line when the squeeze option is given.  */
 295 static SET_TYPE in_squeeze_set[N_CHARS];
 296
 297 /* Array of boolean values.  A character `c' is a member of the
 298    delete set if and only if in_delete_set[c] is true.  The delete
 299    set is defined by the first (or only) string argument on the
 300    command line when the delete option is given.  */
 301 static SET_TYPE in_delete_set[N_CHARS];
 302
 303 /* Array of character values defining the translation (if any) that
 304    tr is to perform.  Translation is performed only when there are
 305    two specification strings and the delete switch is not given.  */
 306 static char xlate[N_CHARS];
 307
 308 static struct option const long_options[] =
 309 {
 310   {"complement", no_argument, NULL, 'c'},
 311   {"delete", no_argument, NULL, 'd'},
 312   {"squeeze-repeats", no_argument, NULL, 's'},
 313   {"truncate-set1", no_argument, NULL, 't'},
 314   {GETOPT_HELP_OPTION_DECL},
 315   {GETOPT_VERSION_OPTION_DECL},
 316   {NULL, 0, NULL, 0}
 317 };
 318 \f
 319 void
 320 usage (int status)
 321 {
 322   if (status != 0)
 323     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 324              program_name);
 325   else
 326     {
 327       printf (_("\
 328 Usage: %s [OPTION]... SET1 [SET2]\n\
 329 "),
 330               program_name);
 331       fputs (_("\
 332 Translate, squeeze, and/or delete characters from standard input,\n\
 333 writing to standard output.\n\
 334 \n\
 335   -c, --complement        first complement SET1\n\
 336   -d, --delete            delete characters in SET1, do not translate\n\
 337   -s, --squeeze-repeats   replace each input sequence of a repeated character\n\
 338                             that is listed in SET1 with a single occurrence\n\
 339                             of that character\n\
 340   -t, --truncate-set1     first truncate SET1 to length of SET2\n\
 341 "), stdout);
 342       fputs (HELP_OPTION_DESCRIPTION, stdout);
 343       fputs (VERSION_OPTION_DESCRIPTION, stdout);
 344       fputs (_("\
 345 \n\
 346 SETs are specified as strings of characters.  Most represent themselves.\n\
 347 Interpreted sequences are:\n\
 348 \n\
 349   \\NNN            character with octal value NNN (1 to 3 octal digits)\n\
 350   \\\\              backslash\n\
 351   \\a              audible BEL\n\
 352   \\b              backspace\n\
 353   \\f              form feed\n\
 354   \\n              new line\n\
 355   \\r              return\n\
 356   \\t              horizontal tab\n\
 357 "), stdout);
 358      fputs (_("\
 359   \\v              vertical tab\n\
 360   CHAR1-CHAR2     all characters from CHAR1 to CHAR2 in ascending order\n\
 361   [CHAR*]         in SET2, copies of CHAR until length of SET1\n\
 362   [CHAR*REPEAT]   REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
 363   [:alnum:]       all letters and digits\n\
 364   [:alpha:]       all letters\n\
 365   [:blank:]       all horizontal whitespace\n\
 366   [:cntrl:]       all control characters\n\
 367   [:digit:]       all digits\n\
 368 "), stdout);
 369      fputs (_("\
 370   [:graph:]       all printable characters, not including space\n\
 371   [:lower:]       all lower case letters\n\
 372   [:print:]       all printable characters, including space\n\
 373   [:punct:]       all punctuation characters\n\
 374   [:space:]       all horizontal or vertical whitespace\n\
 375   [:upper:]       all upper case letters\n\
 376   [:xdigit:]      all hexadecimal digits\n\
 377   [=CHAR=]        all characters which are equivalent to CHAR\n\
 378 "), stdout);
 379      fputs (_("\
 380 \n\
 381 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
 382 -t may be used only when translating.  SET2 is extended to length of\n\
 383 SET1 by repeating its last character as necessary.  \
 384 "), stdout);
 385      fputs (_("\
 386 Excess characters\n\
 387 of SET2 are ignored.  Only [:lower:] and [:upper:] are guaranteed to\n\
 388 expand in ascending order; used in SET2 while translating, they may\n\
 389 only be used in pairs to specify case conversion.  \
 390 "), stdout);
 391      fputs (_("\
 392 -s uses SET1 if not\n\
 393 translating nor deleting; else squeezing uses SET2 and occurs after\n\
 394 translation or deletion.\n\
 395 "), stdout);
 396       printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 397     }
 398   exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
 399 }
 400
 401 /* Return nonzero if the character C is a member of the
 402    equivalence class containing the character EQUIV_CLASS.  */
 403
 404 static int
 405 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
 406 {
 407   return (equiv_class == c);
 408 }
 409
 410 /* Return nonzero if the character C is a member of the
 411    character class CHAR_CLASS.  */
 412
 413 static int
 414 is_char_class_member (enum Char_class char_class, unsigned int c)
 415 {
 416   int result;
 417
 418   switch (char_class)
 419     {
 420     case CC_ALNUM:
 421       result = ISALNUM (c);
 422       break;
 423     case CC_ALPHA:
 424       result = ISALPHA (c);
 425       break;
 426     case CC_BLANK:
 427       result = ISBLANK (c);
 428       break;
 429     case CC_CNTRL:
 430       result = ISCNTRL (c);
 431       break;
 432     case CC_DIGIT:
 433       result = ISDIGIT_LOCALE (c);
 434       break;
 435     case CC_GRAPH:
 436       result = ISGRAPH (c);
 437       break;
 438     case CC_LOWER:
 439       result = ISLOWER (c);
 440       break;
 441     case CC_PRINT:
 442       result = ISPRINT (c);
 443       break;
 444     case CC_PUNCT:
 445       result = ISPUNCT (c);
 446       break;
 447     case CC_SPACE:
 448       result = ISSPACE (c);
 449       break;
 450     case CC_UPPER:
 451       result = ISUPPER (c);
 452       break;
 453     case CC_XDIGIT:
 454       result = ISXDIGIT (c);
 455       break;
 456     default:
 457       abort ();
 458       break;
 459     }
 460   return result;
 461 }
 462
 463 static void
 464 es_free (struct E_string *es)
 465 {
 466   free (es->s);
 467   free (es->escaped);
 468 }
 469
 470 /* Perform the first pass over each range-spec argument S, converting all
 471    \c and \ddd escapes to their one-byte representations.  The conversion
 472    is done in-place, so S must point to writable storage.  If an invalid
 473    quote sequence is found print an error message and return nonzero.
 474    Otherwise set *LEN to the length of the resulting string and return
 475    zero.  The resulting array of characters may contain zero-bytes;
 476    however, on input, S is assumed to be null-terminated, and hence
 477    cannot contain actual (non-escaped) zero bytes.  */
 478
 479 static int
 480 unquote (const unsigned char *s, struct E_string *es)
 481 {
 482   size_t i, j;
 483   size_t len;
 484
 485   len = strlen ((char *) s);
 486
 487   es->s = (unsigned char *) xmalloc (len);
 488   es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
 489   for (i = 0; i < len; i++)
 490     es->escaped[i] = 0;
 491
 492   j = 0;
 493   for (i = 0; s[i]; i++)
 494     {
 495       switch (s[i])
 496         {
 497           int c;
 498         case '\\':
 499           switch (s[i + 1])
 500             {
 501               int oct_digit;
 502             case '\\':
 503               c = '\\';
 504               break;
 505             case 'a':
 506               c = '\007';
 507               break;
 508             case 'b':
 509               c = '\b';
 510               break;
 511             case 'f':
 512               c = '\f';
 513               break;
 514             case 'n':
 515               c = '\n';
 516               break;
 517             case 'r':
 518               c = '\r';
 519               break;
 520             case 't':
 521               c = '\t';
 522               break;
 523             case 'v':
 524               c = '\v';
 525               break;
 526             case '0':
 527             case '1':
 528             case '2':
 529             case '3':
 530             case '4':
 531             case '5':
 532             case '6':
 533             case '7':
 534               c = s[i + 1] - '0';
 535               oct_digit = s[i + 2] - '0';
 536               if (0 <= oct_digit && oct_digit <= 7)
 537                 {
 538                   c = 8 * c + oct_digit;
 539                   ++i;
 540                   oct_digit = s[i + 2] - '0';
 541                   if (0 <= oct_digit && oct_digit <= 7)
 542                     {
 543                       if (8 * c + oct_digit < N_CHARS)
 544                         {
 545                           c = 8 * c + oct_digit;
 546                           ++i;
 547                         }
 548                       else if (!posix_pedantic)
 549                         {
 550                           /* A 3-digit octal number larger than \377 won't
 551                              fit in 8 bits.  So we stop when adding the
 552                              next digit would put us over the limit and
 553                              give a warning about the ambiguity.  POSIX
 554                              isn't clear on this, but one person has said
 555                              that in his interpretation, POSIX says tr
 556                              can't even give a warning.  */
 557                           error (0, 0, _("warning: the ambiguous octal escape \
 558 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
 559                                  s[i], s[i + 1], s[i + 2],
 560                                  s[i], s[i + 1], s[i + 2]);
 561                         }
 562                     }
 563                 }
 564               break;
 565             case '\0':
 566               error (0, 0, _("invalid backslash escape at end of string"));
 567               return 1;
 568
 569             default:
 570               if (posix_pedantic)
 571                 {
 572                   error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
 573                   return 1;
 574                 }
 575               else
 576                 {
 577                   c = s[i + 1];
 578                   es->escaped[j] = 1;
 579                 }
 580             }
 581           ++i;
 582           es->s[j++] = c;
 583           break;
 584         default:
 585           es->s[j++] = s[i];
 586           break;
 587         }
 588     }
 589   es->len = j;
 590   return 0;
 591 }
 592
 593 /* If CLASS_STR is a valid character class string, return its index
 594    in the global char_class_name array.  Otherwise, return CC_NO_CLASS.  */
 595
 596 static enum Char_class
 597 look_up_char_class (const unsigned char *class_str, size_t len)
 598 {
 599   unsigned int i;
 600
 601   for (i = 0; i < N_CHAR_CLASSES; i++)
 602     if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
 603         && strlen (char_class_name[i]) == len)
 604       return (enum Char_class) i;
 605   return CC_NO_CLASS;
 606 }
 607
 608 /* Return a newly allocated string with a printable version of C.
 609    This function is used solely for formatting error messages.  */
 610
 611 static char *
 612 make_printable_char (unsigned int c)
 613 {
 614   char *buf = xmalloc (5);
 615
 616   assert (c < N_CHARS);
 617   if (ISPRINT (c))
 618     {
 619       buf[0] = c;
 620       buf[1] = '\0';
 621     }
 622   else
 623     {
 624       sprintf (buf, "\\%03o", c);
 625     }
 626   return buf;
 627 }
 628
 629 /* Return a newly allocated copy of S which is suitable for printing.
 630    LEN is the number of characters in S.  Most non-printing
 631    (isprint) characters are represented by a backslash followed by
 632    3 octal digits.  However, the characters represented by \c escapes
 633    where c is one of [abfnrtv] are represented by their 2-character \c
 634    sequences.  This function is used solely for printing error messages.  */
 635
 636 static char *
 637 make_printable_str (const unsigned char *s, size_t len)
 638 {
 639   /* Worst case is that every character expands to a backslash
 640      followed by a 3-character octal escape sequence.  */
 641   char *printable_buf = xmalloc (4 * len + 1);
 642   char *p = printable_buf;
 643   size_t i;
 644
 645   for (i = 0; i < len; i++)
 646     {
 647       char buf[5];
 648       char *tmp = NULL;
 649
 650       switch (s[i])
 651         {
 652         case '\\':
 653           tmp = "\\";
 654           break;
 655         case '\007':
 656           tmp = "\\a";
 657           break;
 658         case '\b':
 659           tmp = "\\b";
 660           break;
 661         case '\f':
 662           tmp = "\\f";
 663           break;
 664         case '\n':
 665           tmp = "\\n";
 666           break;
 667         case '\r':
 668           tmp = "\\r";
 669           break;
 670         case '\t':
 671           tmp = "\\t";
 672           break;
 673         case '\v':
 674           tmp = "\\v";
 675           break;
 676         default:
 677           if (ISPRINT (s[i]))
 678             {
 679               buf[0] = s[i];
 680               buf[1] = '\0';
 681             }
 682           else
 683             sprintf (buf, "\\%03o", s[i]);
 684           tmp = buf;
 685           break;
 686         }
 687       p = stpcpy (p, tmp);
 688     }
 689   return printable_buf;
 690 }
 691
 692 /* Append a newly allocated structure representing a
 693    character C to the specification list LIST.  */
 694
 695 static void
 696 append_normal_char (struct Spec_list *list, unsigned int c)
 697 {
 698   struct List_element *new;
 699
 700   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 701   new->next = NULL;
 702   new->type = RE_NORMAL_CHAR;
 703   new->u.normal_char = c;
 704   assert (list->tail);
 705   list->tail->next = new;
 706   list->tail = new;
 707 }
 708
 709 /* Append a newly allocated structure representing the range
 710    of characters from FIRST to LAST to the specification list LIST.
 711    Return nonzero if LAST precedes FIRST in the collating sequence,
 712    zero otherwise.  This means that '[c-c]' is acceptable.  */
 713
 714 static int
 715 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
 716 {
 717   struct List_element *new;
 718
 719   if (ORD (first) > ORD (last))
 720     {
 721       char *tmp1 = make_printable_char (first);
 722       char *tmp2 = make_printable_char (last);
 723
 724       error (0, 0,
 725        _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
 726              tmp1, tmp2);
 727       free (tmp1);
 728       free (tmp2);
 729       return 1;
 730     }
 731   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 732   new->next = NULL;
 733   new->type = RE_RANGE;
 734   new->u.range.first_char = first;
 735   new->u.range.last_char = last;
 736   assert (list->tail);
 737   list->tail->next = new;
 738   list->tail = new;
 739   return 0;
 740 }
 741
 742 /* If CHAR_CLASS_STR is a valid character class string, append a
 743    newly allocated structure representing that character class to the end
 744    of the specification list LIST and return 0.  If CHAR_CLASS_STR is not
 745    a valid string return nonzero.  */
 746
 747 static int
 748 append_char_class (struct Spec_list *list,
 749                    const unsigned char *char_class_str, size_t len)
 750 {
 751   enum Char_class char_class;
 752   struct List_element *new;
 753
 754   char_class = look_up_char_class (char_class_str, len);
 755   if (char_class == CC_NO_CLASS)
 756     return 1;
 757   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 758   new->next = NULL;
 759   new->type = RE_CHAR_CLASS;
 760   new->u.char_class = char_class;
 761   assert (list->tail);
 762   list->tail->next = new;
 763   list->tail = new;
 764   return 0;
 765 }
 766
 767 /* Append a newly allocated structure representing a [c*n]
 768    repeated character construct to the specification list LIST.
 769    THE_CHAR is the single character to be repeated, and REPEAT_COUNT
 770    is a non-negative repeat count.  */
 771
 772 static void
 773 append_repeated_char (struct Spec_list *list, unsigned int the_char,
 774                       size_t repeat_count)
 775 {
 776   struct List_element *new;
 777
 778   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 779   new->next = NULL;
 780   new->type = RE_REPEATED_CHAR;
 781   new->u.repeated_char.the_repeated_char = the_char;
 782   new->u.repeated_char.repeat_count = repeat_count;
 783   assert (list->tail);
 784   list->tail->next = new;
 785   list->tail = new;
 786 }
 787
 788 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
 789    the length of that string, LEN, if LEN is exactly one, append
 790    a newly allocated structure representing the specified
 791    equivalence class to the specification list, LIST and return zero.
 792    If LEN is not 1, return nonzero.  */
 793
 794 static int
 795 append_equiv_class (struct Spec_list *list,
 796                     const unsigned char *equiv_class_str, size_t len)
 797 {
 798   struct List_element *new;
 799
 800   if (len != 1)
 801     return 1;
 802   new = (struct List_element *) xmalloc (sizeof (struct List_element));
 803   new->next = NULL;
 804   new->type = RE_EQUIV_CLASS;
 805   new->u.equiv_code = *equiv_class_str;
 806   assert (list->tail);
 807   list->tail->next = new;
 808   list->tail = new;
 809   return 0;
 810 }
 811
 812 /* Return a newly allocated copy of the LEN-byte prefix of P.
 813    The returned string may contain NUL bytes and is *not* NUL-terminated.  */
 814
 815 static unsigned char *
 816 xmemdup (const unsigned char *p, size_t len)
 817 {
 818   unsigned char *tmp = (unsigned char *) xmalloc (len);
 819
 820   /* Use memcpy rather than strncpy because `p' may contain zero-bytes.  */
 821   memcpy (tmp, p, len);
 822   return tmp;
 823 }
 824
 825 /* Search forward starting at START_IDX for the 2-char sequence
 826    (PRE_BRACKET_CHAR,']') in the string P of length P_LEN.  If such
 827    a sequence is found, set *RESULT_IDX to the index of the first
 828    character and return nonzero. Otherwise return zero.  P may contain
 829    zero bytes.  */
 830
 831 static int
 832 find_closing_delim (const struct E_string *es, size_t start_idx,
 833                     unsigned int pre_bracket_char, size_t *result_idx)
 834 {
 835   size_t i;
 836
 837   for (i = start_idx; i < es->len - 1; i++)
 838     if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
 839         && !es->escaped[i] && !es->escaped[i + 1])
 840       {
 841         *result_idx = i;
 842         return 1;
 843       }
 844   return 0;
 845 }
 846
 847 /* Parse the bracketed repeat-char syntax.  If the P_LEN characters
 848    beginning with P[ START_IDX ] comprise a valid [c*n] construct,
 849    then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
 850    and return zero. If the second character following
 851    the opening bracket is not `*' or if no closing bracket can be
 852    found, return -1.  If a closing bracket is found and the
 853    second char is `*', but the string between the `*' and `]' isn't
 854    empty, an octal number, or a decimal number, print an error message
 855    and return -2.  */
 856
 857 static int
 858 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
 859                        unsigned int *char_to_repeat, size_t *repeat_count,
 860                        size_t *closing_bracket_idx)
 861 {
 862   size_t i;
 863
 864   assert (start_idx + 1 < es->len);
 865   if (!ES_MATCH (es, start_idx + 1, '*'))
 866     return -1;
 867
 868   for (i = start_idx + 2; i < es->len; i++)
 869     {
 870       if (ES_MATCH (es, i, ']'))
 871         {
 872           size_t digit_str_len = i - start_idx - 2;
 873
 874           *char_to_repeat = es->s[start_idx];
 875           if (digit_str_len == 0)
 876             {
 877               /* We've matched [c*] -- no explicit repeat count.  */
 878               *repeat_count = 0;
 879               *closing_bracket_idx = i;
 880               return 0;
 881             }
 882
 883           /* Here, we have found [c*s] where s should be a string
 884              of octal (if it starts with `0') or decimal digits.  */
 885           {
 886             const char *digit_str = (const char *) &es->s[start_idx + 2];
 887             unsigned long int tmp_ulong;
 888             char *d_end;
 889             int base = 10;
 890             /* Select the base manually so we can be sure it's either 8 or 10.
 891                If the spec allowed it to be interpreted as hexadecimal, we
 892                could have used `0' and let xstrtoul decide.  */
 893             if (*digit_str == '0')
 894               {
 895                 base = 8;
 896                 ++digit_str;
 897                 --digit_str_len;
 898               }
 899             if (xstrtoul (digit_str, &d_end, base, &tmp_ulong, NULL)
 900                   != LONGINT_OK
 901                 || BEGIN_STATE < tmp_ulong
 902                 || d_end - digit_str != digit_str_len)
 903               {
 904                 char *tmp = make_printable_str (es->s + start_idx + 2,
 905                                                 i - start_idx - 2);
 906                 error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
 907                        tmp);
 908                 free (tmp);
 909                 return -2;
 910               }
 911             *repeat_count = tmp_ulong;
 912           }
 913           *closing_bracket_idx = i;
 914           return 0;
 915         }
 916     }
 917   return -1;                    /* No bracket found.  */
 918 }
 919
 920 /* Return nonzero if the string at ES->s[IDX] matches the regular
 921    expression `\*[0-9]*\]', zero otherwise.  To match, the `*' and
 922    the `]' must not be escaped.  */
 923
 924 static int
 925 star_digits_closebracket (const struct E_string *es, size_t idx)
 926 {
 927   size_t i;
 928
 929   if (!ES_MATCH (es, idx, '*'))
 930     return 0;
 931
 932   for (i = idx + 1; i < es->len; i++)
 933     {
 934       if (!ISDIGIT (es->s[i]))
 935         {
 936           if (ES_MATCH (es, i, ']'))
 937             return 1;
 938           return 0;
 939         }
 940     }
 941   return 0;
 942 }
 943
 944 /* Convert string UNESCAPED_STRING (which has been preprocessed to
 945    convert backslash-escape sequences) of length LEN characters into
 946    a linked list of the following 5 types of constructs:
 947       - [:str:] Character class where `str' is one of the 12 valid strings.
 948       - [=c=] Equivalence class where `c' is any single character.
 949       - [c*n] Repeat the single character `c' `n' times. n may be omitted.
 950           However, if `n' is present, it must be a non-negative octal or
 951           decimal integer.
 952       - r-s Range of characters from `r' to `s'.  The second endpoint must
 953           not precede the first in the current collating sequence.
 954       - c Any other character is interpreted as itself.  */
 955
 956 static int
 957 build_spec_list (const struct E_string *es, struct Spec_list *result)
 958 {
 959   const unsigned char *p;
 960   size_t i;
 961
 962   p = es->s;
 963
 964   /* The main for-loop below recognizes the 4 multi-character constructs.
 965      A character that matches (in its context) none of the multi-character
 966      constructs is classified as `normal'.  Since all multi-character
 967      constructs have at least 3 characters, any strings of length 2 or
 968      less are composed solely of normal characters.  Hence, the index of
 969      the outer for-loop runs only as far as LEN-2.  */
 970
 971   for (i = 0; i + 2 < es->len; /* empty */)
 972     {
 973       if (ES_MATCH (es, i, '['))
 974         {
 975           int matched_multi_char_construct;
 976           size_t closing_bracket_idx;
 977           unsigned int char_to_repeat;
 978           size_t repeat_count;
 979           int err;
 980
 981           matched_multi_char_construct = 1;
 982           if (ES_MATCH (es, i + 1, ':')
 983               || ES_MATCH (es, i + 1, '='))
 984             {
 985               size_t closing_delim_idx;
 986               int found;
 987
 988               found = find_closing_delim (es, i + 2, p[i + 1],
 989                                           &closing_delim_idx);
 990               if (found)
 991                 {
 992                   int parse_failed;
 993                   size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
 994                   unsigned char *opnd_str;
 995
 996                   if (opnd_str_len == 0)
 997                     {
 998                       if (p[i + 1] == ':')
 999                         error (0, 0, _("missing character class name `[::]'"));
1000                       else
1001                         error (0, 0,
1002                                _("missing equivalence class character `[==]'"));
1003                       return 1;
1004                     }
1005
1006                   opnd_str = xmemdup (p + i + 2, opnd_str_len);
1007
1008                   if (p[i + 1] == ':')
1009                     {
1010                       parse_failed = append_char_class (result, opnd_str,
1011                                                         opnd_str_len);
1012
1013                       /* FIXME: big comment.  */
1014                       if (parse_failed)
1015                         {
1016                           if (star_digits_closebracket (es, i + 2))
1017                             {
1018                               free (opnd_str);
1019                               goto try_bracketed_repeat;
1020                             }
1021                           else
1022                             {
1023                               char *tmp = make_printable_str (opnd_str,
1024                                                               opnd_str_len);
1025                               error (0, 0, _("invalid character class `%s'"),
1026                                      tmp);
1027                               free (tmp);
1028                               return 1;
1029                             }
1030                         }
1031                     }
1032                   else
1033                     {
1034                       parse_failed = append_equiv_class (result, opnd_str,
1035                                                          opnd_str_len);
1036
1037                       /* FIXME: big comment.  */
1038                       if (parse_failed)
1039                         {
1040                           if (star_digits_closebracket (es, i + 2))
1041                             {
1042                               free (opnd_str);
1043                               goto try_bracketed_repeat;
1044                             }
1045                           else
1046                             {
1047                               char *tmp = make_printable_str (opnd_str,
1048                                                               opnd_str_len);
1049                               error (0, 0,
1050                _("%s: equivalence class operand must be a single character"),
1051                                      tmp);
1052                               free (tmp);
1053                               return 1;
1054                             }
1055                         }
1056                     }
1057                   free (opnd_str);
1058
1059                   /* Return nonzero if append_*_class reports a problem.  */
1060                   if (parse_failed)
1061                     return 1;
1062                   else
1063                     i = closing_delim_idx + 2;
1064                   continue;
1065                 }
1066               /* Else fall through.  This could be [:*] or [=*].  */
1067             }
1068
1069         try_bracketed_repeat:
1070
1071           /* Determine whether this is a bracketed repeat range
1072              matching the RE \[.\*(dec_or_oct_number)?\].  */
1073           err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1074                                        &repeat_count,
1075                                        &closing_bracket_idx);
1076           if (err == 0)
1077             {
1078               append_repeated_char (result, char_to_repeat, repeat_count);
1079               i = closing_bracket_idx + 1;
1080             }
1081           else if (err == -1)
1082             {
1083               matched_multi_char_construct = 0;
1084             }
1085           else
1086             {
1087               /* Found a string that looked like [c*n] but the
1088                  numeric part was invalid.  */
1089               return 1;
1090             }
1091
1092           if (matched_multi_char_construct)
1093             continue;
1094
1095           /* We reach this point if P does not match [:str:], [=c=],
1096              [c*n], or [c*].  Now, see if P looks like a range `[-c'
1097              (from `[' to `c').  */
1098         }
1099
1100       /* Look ahead one char for ranges like a-z.  */
1101       if (ES_MATCH (es, i + 1, '-'))
1102         {
1103           if (append_range (result, p[i], p[i + 2]))
1104             return 1;
1105           i += 3;
1106         }
1107       else
1108         {
1109           append_normal_char (result, p[i]);
1110           ++i;
1111         }
1112     }
1113
1114   /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1].  */
1115   for (; i < es->len; i++)
1116     append_normal_char (result, p[i]);
1117
1118   return 0;
1119 }
1120
1121 /* Given a Spec_list S (with its saved state implicit in the values
1122    of its members `tail' and `state'), return the next single character
1123    in the expansion of S's constructs.  If the last character of S was
1124    returned on the previous call or if S was empty, this function
1125    returns -1.  For example, successive calls to get_next where S
1126    represents the spec-string 'a-d[y*3]' will return the sequence
1127    of values a, b, c, d, y, y, y, -1.  Finally, if the construct from
1128    which the returned character comes is [:upper:] or [:lower:], the
1129    parameter CLASS is given a value to indicate which it was.  Otherwise
1130    CLASS is set to UL_NONE.  This value is used only when constructing
1131    the translation table to verify that any occurrences of upper and
1132    lower class constructs in the spec-strings appear in the same relative
1133    positions.  */
1134
1135 static int
1136 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1137 {
1138   struct List_element *p;
1139   int return_val;
1140   int i;
1141
1142   if (class)
1143     *class = UL_NONE;
1144
1145   if (s->state == BEGIN_STATE)
1146     {
1147       s->tail = s->head->next;
1148       s->state = NEW_ELEMENT;
1149     }
1150
1151   p = s->tail;
1152   if (p == NULL)
1153     return -1;
1154
1155   switch (p->type)
1156     {
1157     case RE_NORMAL_CHAR:
1158       return_val = p->u.normal_char;
1159       s->state = NEW_ELEMENT;
1160       s->tail = p->next;
1161       break;
1162
1163     case RE_RANGE:
1164       if (s->state == NEW_ELEMENT)
1165         s->state = ORD (p->u.range.first_char);
1166       else
1167         ++(s->state);
1168       return_val = CHR (s->state);
1169       if (s->state == ORD (p->u.range.last_char))
1170         {
1171           s->tail = p->next;
1172           s->state = NEW_ELEMENT;
1173         }
1174       break;
1175
1176     case RE_CHAR_CLASS:
1177       if (class)
1178         {
1179           int upper_or_lower;
1180           switch (p->u.char_class)
1181             {
1182             case CC_LOWER:
1183               *class = UL_LOWER;
1184               upper_or_lower = 1;
1185               break;
1186             case CC_UPPER:
1187               *class = UL_UPPER;
1188               upper_or_lower = 1;
1189               break;
1190             default:
1191               upper_or_lower = 0;
1192               break;
1193             }
1194
1195           if (upper_or_lower)
1196             {
1197               s->tail = p->next;
1198               s->state = NEW_ELEMENT;
1199               return_val = 0;
1200               break;
1201             }
1202         }
1203
1204       if (s->state == NEW_ELEMENT)
1205         {
1206           for (i = 0; i < N_CHARS; i++)
1207             if (is_char_class_member (p->u.char_class, i))
1208               break;
1209           assert (i < N_CHARS);
1210           s->state = i;
1211         }
1212       assert (is_char_class_member (p->u.char_class, s->state));
1213       return_val = CHR (s->state);
1214       for (i = s->state + 1; i < N_CHARS; i++)
1215         if (is_char_class_member (p->u.char_class, i))
1216           break;
1217       if (i < N_CHARS)
1218         s->state = i;
1219       else
1220         {
1221           s->tail = p->next;
1222           s->state = NEW_ELEMENT;
1223         }
1224       break;
1225
1226     case RE_EQUIV_CLASS:
1227       /* FIXME: this assumes that each character is alone in its own
1228          equivalence class (which appears to be correct for my
1229          LC_COLLATE.  But I don't know of any function that allows
1230          one to determine a character's equivalence class.  */
1231
1232       return_val = p->u.equiv_code;
1233       s->state = NEW_ELEMENT;
1234       s->tail = p->next;
1235       break;
1236
1237     case RE_REPEATED_CHAR:
1238       /* Here, a repeat count of n == 0 means don't repeat at all.  */
1239       if (p->u.repeated_char.repeat_count == 0)
1240         {
1241           s->tail = p->next;
1242           s->state = NEW_ELEMENT;
1243           return_val = get_next (s, class);
1244         }
1245       else
1246         {
1247           if (s->state == NEW_ELEMENT)
1248             {
1249               s->state = 0;
1250             }
1251           ++(s->state);
1252           return_val = p->u.repeated_char.the_repeated_char;
1253           if (p->u.repeated_char.repeat_count > 0
1254               && s->state == p->u.repeated_char.repeat_count)
1255             {
1256               s->tail = p->next;
1257               s->state = NEW_ELEMENT;
1258             }
1259         }
1260       break;
1261
1262     case RE_NO_TYPE:
1263       abort ();
1264       break;
1265
1266     default:
1267       abort ();
1268       break;
1269     }
1270
1271   return return_val;
1272 }
1273
1274 /* This is a minor kludge.  This function is called from
1275    get_spec_stats to determine the cardinality of a set derived
1276    from a complemented string.  It's a kludge in that some of the
1277    same operations are (duplicated) performed in set_initialize.  */
1278
1279 static int
1280 card_of_complement (struct Spec_list *s)
1281 {
1282   int c;
1283   int cardinality = N_CHARS;
1284   SET_TYPE in_set[N_CHARS];
1285
1286   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1287   s->state = BEGIN_STATE;
1288   while ((c = get_next (s, NULL)) != -1)
1289     if (!in_set[c]++)
1290       --cardinality;
1291   return cardinality;
1292 }
1293
1294 /* Gather statistics about the spec-list S in preparation for the tests
1295    in validate that determine the consistency of the specs.  This function
1296    is called at most twice; once for string1, and again for any string2.
1297    LEN_S1 < 0 indicates that this is the first call and that S represents
1298    string1.  When LEN_S1 >= 0, it is the length of the expansion of the
1299    constructs in string1, and we can use its value to resolve any
1300    indefinite repeat construct in S (which represents string2).  Hence,
1301    this function has the side-effect that it converts a valid [c*]
1302    construct in string2 to [c*n] where n is large enough (or 0) to give
1303    string2 the same length as string1.  For example, with the command
1304    tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1305    be 26 and S (representing string2) would be converted to 'A[\n*24]Z'.  */
1306
1307 static void
1308 get_spec_stats (struct Spec_list *s)
1309 {
1310   struct List_element *p;
1311   int len = 0;
1312
1313   s->n_indefinite_repeats = 0;
1314   s->has_equiv_class = 0;
1315   s->has_restricted_char_class = 0;
1316   s->has_char_class = 0;
1317   for (p = s->head->next; p; p = p->next)
1318     {
1319       switch (p->type)
1320         {
1321           int i;
1322         case RE_NORMAL_CHAR:
1323           ++len;
1324           break;
1325
1326         case RE_RANGE:
1327           assert (p->u.range.last_char >= p->u.range.first_char);
1328           len += p->u.range.last_char - p->u.range.first_char + 1;
1329           break;
1330
1331         case RE_CHAR_CLASS:
1332           s->has_char_class = 1;
1333           for (i = 0; i < N_CHARS; i++)
1334             if (is_char_class_member (p->u.char_class, i))
1335               ++len;
1336           switch (p->u.char_class)
1337             {
1338             case CC_UPPER:
1339             case CC_LOWER:
1340               break;
1341             default:
1342               s->has_restricted_char_class = 1;
1343               break;
1344             }
1345           break;
1346
1347         case RE_EQUIV_CLASS:
1348           for (i = 0; i < N_CHARS; i++)
1349             if (is_equiv_class_member (p->u.equiv_code, i))
1350               ++len;
1351           s->has_equiv_class = 1;
1352           break;
1353
1354         case RE_REPEATED_CHAR:
1355           if (p->u.repeated_char.repeat_count > 0)
1356             len += p->u.repeated_char.repeat_count;
1357           else if (p->u.repeated_char.repeat_count == 0)
1358             {
1359               s->indefinite_repeat_element = p;
1360               ++(s->n_indefinite_repeats);
1361             }
1362           break;
1363
1364         case RE_NO_TYPE:
1365           assert (0);
1366           break;
1367         }
1368     }
1369
1370   s->length = len;
1371 }
1372
1373 static void
1374 get_s1_spec_stats (struct Spec_list *s1)
1375 {
1376   get_spec_stats (s1);
1377   if (complement)
1378     s1->length = card_of_complement (s1);
1379 }
1380
1381 static void
1382 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1383 {
1384   get_spec_stats (s2);
1385   if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1386     {
1387       s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1388         len_s1 - s2->length;
1389       s2->length = len_s1;
1390     }
1391 }
1392
1393 static void
1394 spec_init (struct Spec_list *spec_list)
1395 {
1396   spec_list->head = spec_list->tail =
1397     (struct List_element *) xmalloc (sizeof (struct List_element));
1398   spec_list->head->next = NULL;
1399 }
1400
1401 /* This function makes two passes over the argument string S.  The first
1402    one converts all \c and \ddd escapes to their one-byte representations.
1403    The second constructs a linked specification list, SPEC_LIST, of the
1404    characters and constructs that comprise the argument string.  If either
1405    of these passes detects an error, this function returns nonzero.  */
1406
1407 static int
1408 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1409 {
1410   struct E_string es;
1411   int fail;
1412
1413   fail = unquote (s, &es);
1414   if (!fail)
1415     fail = build_spec_list (&es, spec_list);
1416   es_free (&es);
1417   return fail;
1418 }
1419
1420 /* Given two specification lists, S1 and S2, and assuming that
1421    S1->length > S2->length, append a single [c*n] element to S2 where c
1422    is the last character in the expansion of S2 and n is the difference
1423    between the two lengths.
1424    Upon successful completion, S2->length is set to S1->length.  The only
1425    way this function can fail to make S2 as long as S1 is when S2 has
1426    zero-length, since in that case, there is no last character to repeat.
1427    So S2->length is required to be at least 1.
1428
1429    Providing this functionality allows the user to do some pretty
1430    non-BSD (and non-portable) things:  For example, the command
1431        tr -cs '[:upper:]0-9' '[:lower:]'
1432    is almost guaranteed to give results that depend on your collating
1433    sequence.  */
1434
1435 static void
1436 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1437 {
1438   struct List_element *p;
1439   int char_to_repeat;
1440   int i;
1441
1442   assert (translating);
1443   assert (s1->length > s2->length);
1444   assert (s2->length > 0);
1445
1446   p = s2->tail;
1447   switch (p->type)
1448     {
1449     case RE_NORMAL_CHAR:
1450       char_to_repeat = p->u.normal_char;
1451       break;
1452     case RE_RANGE:
1453       char_to_repeat = p->u.range.last_char;
1454       break;
1455     case RE_CHAR_CLASS:
1456       for (i = N_CHARS; i >= 0; i--)
1457         if (is_char_class_member (p->u.char_class, i))
1458           break;
1459       assert (i >= 0);
1460       char_to_repeat = CHR (i);
1461       break;
1462
1463     case RE_REPEATED_CHAR:
1464       char_to_repeat = p->u.repeated_char.the_repeated_char;
1465       break;
1466
1467     case RE_EQUIV_CLASS:
1468       /* This shouldn't happen, because validate exits with an error
1469          if it finds an equiv class in string2 when translating.  */
1470       abort ();
1471       break;
1472
1473     case RE_NO_TYPE:
1474       abort ();
1475       break;
1476
1477     default:
1478       abort ();
1479       break;
1480     }
1481
1482   append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1483   s2->length = s1->length;
1484 }
1485
1486 /* Return non-zero if S is a non-empty list in which exactly one
1487    character (but potentially, many instances of it) appears.
1488    E.g.  [X*] or xxxxxxxx.  */
1489
1490 static int
1491 homogeneous_spec_list (struct Spec_list *s)
1492 {
1493   int b, c;
1494
1495   s->state = BEGIN_STATE;
1496
1497   if ((b = get_next (s, NULL)) == -1)
1498     return 0;
1499
1500   while ((c = get_next (s, NULL)) != -1)
1501     if (c != b)
1502       return 0;
1503
1504   return 1;
1505 }
1506
1507 /* Die with an error message if S1 and S2 describe strings that
1508    are not valid with the given command line switches.
1509    A side effect of this function is that if a valid [c*] or
1510    [c*0] construct appears in string2, it is converted to [c*n]
1511    with a value for n that makes s2->length == s1->length.  By
1512    the same token, if the --truncate-set1 option is not
1513    given, S2 may be extended.  */
1514
1515 static void
1516 validate (struct Spec_list *s1, struct Spec_list *s2)
1517 {
1518   get_s1_spec_stats (s1);
1519   if (s1->n_indefinite_repeats > 0)
1520     {
1521       error (EXIT_FAILURE, 0,
1522              _("the [c*] repeat construct may not appear in string1"));
1523     }
1524
1525   if (s2)
1526     {
1527       get_s2_spec_stats (s2, s1->length);
1528
1529       if (s2->n_indefinite_repeats > 1)
1530         {
1531           error (EXIT_FAILURE, 0,
1532                  _("only one [c*] repeat construct may appear in string2"));
1533         }
1534
1535       if (translating)
1536         {
1537           if (s2->has_equiv_class)
1538             {
1539               error (EXIT_FAILURE, 0,
1540                      _("[=c=] expressions may not appear in string2 \
1541 when translating"));
1542             }
1543
1544           if (s1->length > s2->length)
1545             {
1546               if (!truncate_set1)
1547                 {
1548                   /* string2 must be non-empty unless --truncate-set1 is
1549                      given or string1 is empty.  */
1550
1551                   if (s2->length == 0)
1552                     error (EXIT_FAILURE, 0,
1553                      _("when not truncating set1, string2 must be non-empty"));
1554                   string2_extend (s1, s2);
1555                 }
1556             }
1557
1558           if (complement && s1->has_char_class
1559               && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1560             {
1561               error (EXIT_FAILURE, 0,
1562                      _("when translating with complemented character classes,\
1563 \nstring2 must map all characters in the domain to one"));
1564             }
1565
1566           if (s2->has_restricted_char_class)
1567             {
1568               error (EXIT_FAILURE, 0,
1569                      _("when translating, the only character classes that may \
1570 appear in\nstring2 are `upper' and `lower'"));
1571             }
1572         }
1573       else
1574         /* Not translating.  */
1575         {
1576           if (s2->n_indefinite_repeats > 0)
1577             error (EXIT_FAILURE, 0,
1578                    _("the [c*] construct may appear in string2 only \
1579 when translating"));
1580         }
1581     }
1582 }
1583
1584 /* Read buffers of SIZE bytes via the function READER (if READER is
1585    NULL, read from stdin) until EOF.  When non-NULL, READER is either
1586    read_and_delete or read_and_xlate.  After each buffer is read, it is
1587    processed and written to stdout.  The buffers are processed so that
1588    multiple consecutive occurrences of the same character in the input
1589    stream are replaced by a single occurrence of that character if the
1590    character is in the squeeze set.  */
1591
1592 static void
1593 squeeze_filter (unsigned char *buf, size_t size, Filter reader)
1594 {
1595   unsigned int char_to_squeeze = NOT_A_CHAR;
1596   size_t i = 0;
1597   ssize_t nr = 0;
1598
1599   for (;;)
1600     {
1601       size_t begin;
1602
1603       if (i >= nr)
1604         {
1605           if (reader == NULL)
1606             {
1607               ssize_t signed_nr = safe_read (0, (char *) buf, size);
1608               if (signed_nr < 0)
1609                 error (EXIT_FAILURE, errno, _("read error"));
1610               nr = signed_nr;
1611             }
1612           else
1613             {
1614               nr = (*reader) (buf, size, NULL);
1615             }
1616
1617           if (nr == 0)
1618             break;
1619           i = 0;
1620         }
1621
1622       begin = i;
1623
1624       if (char_to_squeeze == NOT_A_CHAR)
1625         {
1626           size_t out_len;
1627           /* Here, by being a little tricky, we can get a significant
1628              performance increase in most cases when the input is
1629              reasonably large.  Since tr will modify the input only
1630              if two consecutive (and identical) input characters are
1631              in the squeeze set, we can step by two through the data
1632              when searching for a character in the squeeze set.  This
1633              means there may be a little more work in a few cases and
1634              perhaps twice as much work in the worst cases where most
1635              of the input is removed by squeezing repeats.  But most
1636              uses of this functionality seem to remove less than 20-30%
1637              of the input.  */
1638           for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1639             ;                   /* empty */
1640
1641           /* There is a special case when i == nr and we've just
1642              skipped a character (the last one in buf) that is in
1643              the squeeze set.  */
1644           if (i == nr && in_squeeze_set[buf[i - 1]])
1645             --i;
1646
1647           if (i >= nr)
1648             out_len = nr - begin;
1649           else
1650             {
1651               char_to_squeeze = buf[i];
1652               /* We're about to output buf[begin..i].  */
1653               out_len = i - begin + 1;
1654
1655               /* But since we stepped by 2 in the loop above,
1656                  out_len may be one too large.  */
1657               if (i > 0 && buf[i - 1] == char_to_squeeze)
1658                 --out_len;
1659
1660               /* Advance i to the index of first character to be
1661                  considered when looking for a char different from
1662                  char_to_squeeze.  */
1663               ++i;
1664             }
1665           if (out_len > 0
1666               && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1667             error (EXIT_FAILURE, errno, _("write error"));
1668         }
1669
1670       if (char_to_squeeze != NOT_A_CHAR)
1671         {
1672           /* Advance i to index of first char != char_to_squeeze
1673              (or to nr if all the rest of the characters in this
1674              buffer are the same as char_to_squeeze).  */
1675           for (; i < nr && buf[i] == char_to_squeeze; i++)
1676             ;                   /* empty */
1677           if (i < nr)
1678             char_to_squeeze = NOT_A_CHAR;
1679           /* If (i >= nr) we've squeezed the last character in this buffer.
1680              So now we have to read a new buffer and continue comparing
1681              characters against char_to_squeeze.  */
1682         }
1683     }
1684 }
1685
1686 /* Read buffers of SIZE bytes from stdin until one is found that
1687    contains at least one character not in the delete set.  Store
1688    in the array BUF, all characters from that buffer that are not
1689    in the delete set, and return the number of characters saved
1690    or 0 upon EOF.  */
1691
1692 static size_t
1693 read_and_delete (unsigned char *buf, size_t size, Filter not_used)
1694 {
1695   size_t n_saved;
1696   static int hit_eof = 0;
1697
1698   assert (not_used == NULL);
1699
1700   if (hit_eof)
1701     return 0;
1702
1703   /* This enclosing do-while loop is to make sure that
1704      we don't return zero (indicating EOF) when we've
1705      just deleted all the characters in a buffer.  */
1706   do
1707     {
1708       size_t i;
1709       ssize_t nr = safe_read (0, (char *) buf, size);
1710
1711       if (nr < 0)
1712         error (EXIT_FAILURE, errno, _("read error"));
1713       if (nr == 0)
1714         {
1715           hit_eof = 1;
1716           return 0;
1717         }
1718
1719       /* This first loop may be a waste of code, but gives much
1720          better performance when no characters are deleted in
1721          the beginning of a buffer.  It just avoids the copying
1722          of buf[i] into buf[n_saved] when it would be a NOP.  */
1723
1724       for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1725         /* empty */ ;
1726       n_saved = i;
1727
1728       for (++i; i < nr; i++)
1729         if (!in_delete_set[buf[i]])
1730           buf[n_saved++] = buf[i];
1731     }
1732   while (n_saved == 0);
1733
1734   return n_saved;
1735 }
1736
1737 /* Read at most SIZE bytes from stdin into the array BUF.  Then
1738    perform the in-place and one-to-one mapping specified by the global
1739    array `xlate'.  Return the number of characters read, or 0 upon EOF.  */
1740
1741 static size_t
1742 read_and_xlate (unsigned char *buf, size_t size, Filter not_used)
1743 {
1744   ssize_t chars_read = 0;
1745   static int hit_eof = 0;
1746   size_t i;
1747
1748   assert (not_used == NULL);
1749
1750   if (hit_eof)
1751     return 0;
1752
1753   chars_read = safe_read (0, (char *) buf, size);
1754   if (chars_read < 0)
1755     error (EXIT_FAILURE, errno, _("read error"));
1756   if (chars_read == 0)
1757     {
1758       hit_eof = 1;
1759       return 0;
1760     }
1761
1762   for (i = 0; i < chars_read; i++)
1763     buf[i] = xlate[buf[i]];
1764
1765   return chars_read;
1766 }
1767
1768 /* Initialize a boolean membership set IN_SET with the character
1769    values obtained by traversing the linked list of constructs S
1770    using the function `get_next'.  If COMPLEMENT_THIS_SET is
1771    nonzero the resulting set is complemented.  */
1772
1773 static void
1774 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1775 {
1776   int c;
1777   size_t i;
1778
1779   memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1780   s->state = BEGIN_STATE;
1781   while ((c = get_next (s, NULL)) != -1)
1782     in_set[c] = 1;
1783   if (complement_this_set)
1784     for (i = 0; i < N_CHARS; i++)
1785       in_set[i] = (!in_set[i]);
1786 }
1787
1788 int
1789 main (int argc, char **argv)
1790 {
1791   int c;
1792   int non_option_args;
1793   struct Spec_list buf1, buf2;
1794   struct Spec_list *s1 = &buf1;
1795   struct Spec_list *s2 = &buf2;
1796
1797   program_name = argv[0];
1798   setlocale (LC_ALL, "");
1799   bindtextdomain (PACKAGE, LOCALEDIR);
1800   textdomain (PACKAGE);
1801
1802   atexit (close_stdout);
1803
1804   while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1805     {
1806       switch (c)
1807         {
1808         case 0:
1809           break;
1810
1811         case 'c':
1812           complement = 1;
1813           break;
1814
1815         case 'd':
1816           delete = 1;
1817           break;
1818
1819         case 's':
1820           squeeze_repeats = 1;
1821           break;
1822
1823         case 't':
1824           truncate_set1 = 1;
1825           break;
1826
1827         case_GETOPT_HELP_CHAR;
1828
1829         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1830
1831         default:
1832           usage (2);
1833           break;
1834         }
1835     }
1836
1837   posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1838
1839   non_option_args = argc - optind;
1840   translating = (non_option_args == 2 && !delete);
1841
1842   /* Change this test if it is valid to give tr no options and
1843      no args at all.  POSIX doesn't specifically say anything
1844      either way, but it looks like they implied it's invalid
1845      by omission.  If you want to make tr do a slow imitation
1846      of `cat' use `tr a a'.  */
1847   if (non_option_args > 2)
1848     {
1849       error (0, 0, _("too many arguments"));
1850       usage (2);
1851     }
1852
1853   if (!delete && !squeeze_repeats && non_option_args != 2)
1854     error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1855
1856   if (delete && squeeze_repeats && non_option_args != 2)
1857     error (EXIT_FAILURE, 0, _("two strings must be given when both \
1858 deleting and squeezing repeats"));
1859
1860   /* If --delete is given without --squeeze-repeats, then
1861      only one string argument may be specified.  But POSIX
1862      says to ignore any string2 in this case, so if POSIXLY_CORRECT
1863      is set, pretend we never saw string2.  But I think
1864      this deserves a fatal error, so that's the default.  */
1865   if ((delete && !squeeze_repeats) && non_option_args != 1)
1866     {
1867       if (posix_pedantic && non_option_args == 2)
1868         --non_option_args;
1869       else
1870         error (EXIT_FAILURE, 0,
1871                _("only one string may be given when deleting \
1872 without squeezing repeats"));
1873     }
1874
1875   if (squeeze_repeats && non_option_args == 0)
1876     error (EXIT_FAILURE, 0,
1877            _("at least one string must be given when squeezing repeats"));
1878
1879   spec_init (s1);
1880   if (parse_str ((unsigned char *) argv[optind], s1))
1881     exit (EXIT_FAILURE);
1882
1883   if (non_option_args == 2)
1884     {
1885       spec_init (s2);
1886       if (parse_str ((unsigned char *) argv[optind + 1], s2))
1887         exit (EXIT_FAILURE);
1888     }
1889   else
1890     s2 = NULL;
1891
1892   validate (s1, s2);
1893
1894   /* Use binary I/O, since `tr' is sometimes used to transliterate
1895      non-printable characters, or characters which are stripped away
1896      by text-mode reads (like CR and ^Z).  */
1897   SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1898
1899   if (squeeze_repeats && non_option_args == 1)
1900     {
1901       set_initialize (s1, complement, in_squeeze_set);
1902       squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1903     }
1904   else if (delete && non_option_args == 1)
1905     {
1906       size_t nr;
1907
1908       set_initialize (s1, complement, in_delete_set);
1909       do
1910         {
1911           nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1912           if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1913             error (EXIT_FAILURE, errno, _("write error"));
1914         }
1915       while (nr > 0);
1916     }
1917   else if (squeeze_repeats && delete && non_option_args == 2)
1918     {
1919       set_initialize (s1, complement, in_delete_set);
1920       set_initialize (s2, 0, in_squeeze_set);
1921       squeeze_filter (io_buf, IO_BUF_SIZE, read_and_delete);
1922     }
1923   else if (translating)
1924     {
1925       if (complement)
1926         {
1927           int i;
1928           SET_TYPE *in_s1 = in_delete_set;
1929
1930           set_initialize (s1, 0, in_s1);
1931           s2->state = BEGIN_STATE;
1932           for (i = 0; i < N_CHARS; i++)
1933             xlate[i] = i;
1934           for (i = 0; i < N_CHARS; i++)
1935             {
1936               if (!in_s1[i])
1937                 {
1938                   int ch = get_next (s2, NULL);
1939                   assert (ch != -1 || truncate_set1);
1940                   if (ch == -1)
1941                     {
1942                       /* This will happen when tr is invoked like e.g.
1943                          tr -cs A-Za-z0-9 '\012'.  */
1944                       break;
1945                     }
1946                   xlate[i] = ch;
1947                 }
1948             }
1949           assert (get_next (s2, NULL) == -1 || truncate_set1);
1950         }
1951       else
1952         {
1953           int c1, c2;
1954           int i;
1955           enum Upper_Lower_class class_s1;
1956           enum Upper_Lower_class class_s2;
1957
1958           for (i = 0; i < N_CHARS; i++)
1959             xlate[i] = i;
1960           s1->state = BEGIN_STATE;
1961           s2->state = BEGIN_STATE;
1962           for (;;)
1963             {
1964               c1 = get_next (s1, &class_s1);
1965               c2 = get_next (s2, &class_s2);
1966               if (!class_ok[(int) class_s1][(int) class_s2])
1967                 error (EXIT_FAILURE, 0,
1968                        _("misaligned [:upper:] and/or [:lower:] construct"));
1969
1970               if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1971                 {
1972                   for (i = 0; i < N_CHARS; i++)
1973                     if (ISLOWER (i))
1974                       xlate[i] = toupper (i);
1975                 }
1976               else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1977                 {
1978                   for (i = 0; i < N_CHARS; i++)
1979                     if (ISUPPER (i))
1980                       xlate[i] = tolower (i);
1981                 }
1982               else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1983                        || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1984                 {
1985                   /* By default, GNU tr permits the identity mappings: from
1986                      [:upper:] to [:upper:] and [:lower:] to [:lower:].  But
1987                      when POSIXLY_CORRECT is set, those evoke diagnostics.  */
1988                   if (posix_pedantic)
1989                     {
1990                       error (EXIT_FAILURE, 0,
1991                              _("\
1992 invalid identity mapping;  when translating, any [:lower:] or [:upper:]\n\
1993 construct in string1 must be aligned with a corresponding construct\n\
1994 ([:upper:] or [:lower:], respectively) in string2"));
1995                     }
1996                 }
1997               else
1998                 {
1999                   /* The following should have been checked by validate...  */
2000                   if (c1 == -1 || c2 == -1)
2001                     break;
2002                   xlate[c1] = c2;
2003                 }
2004             }
2005           assert (c1 == -1 || truncate_set1);
2006         }
2007       if (squeeze_repeats)
2008         {
2009           set_initialize (s2, 0, in_squeeze_set);
2010           squeeze_filter (io_buf, IO_BUF_SIZE, read_and_xlate);
2011         }
2012       else
2013         {
2014           size_t chars_read;
2015
2016           do
2017             {
2018               chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2019               if (chars_read > 0
2020                   && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2021                 error (EXIT_FAILURE, errno, _("write error"));
2022             }
2023           while (chars_read > 0);
2024         }
2025     }
2026
2027   if (close (0) != 0)
2028     error (EXIT_FAILURE, errno, _("standard input"));
2029
2030   exit (EXIT_SUCCESS);
2031 }