*** empty log message ***
[coreutils.git] / src / tr.c
blobe931990ce6c345a8ad4d66c82f4b8956db5bd802
1 /* tr -- a filter to translate characters
2 Copyright (C) 91, 1995-2002 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Jim Meyering */
20 #include <config.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/types.h>
26 #include <getopt.h>
28 #include "system.h"
29 #include "closeout.h"
30 #include "error.h"
31 #include "safe-read.h"
32 #include "xstrtol.h"
34 /* The official name of this program (e.g., no `g' prefix). */
35 #define PROGRAM_NAME "tr"
37 #define AUTHORS "Jim Meyering"
39 #define N_CHARS (UCHAR_MAX + 1)
41 /* A pointer to a filtering function. */
42 typedef size_t (*Filter) (/* unsigned char *, size_t, Filter */);
44 /* Convert from character C to its index in the collating
45 sequence array. Just cast to an unsigned int to avoid
46 problems with sign-extension. */
47 #define ORD(c) (unsigned int)(c)
49 /* The inverse of ORD. */
50 #define CHR(i) (unsigned char)(i)
52 /* The value for Spec_list->state that indicates to
53 get_next that it should initialize the tail pointer.
54 Its value should be as large as possible to avoid conflict
55 a valid value for the state field -- and that may be as
56 large as any valid repeat_count. */
57 #define BEGIN_STATE (INT_MAX - 1)
59 /* The value for Spec_list->state that indicates to
60 get_next that the element pointed to by Spec_list->tail is
61 being considered for the first time on this pass through the
62 list -- it indicates that get_next should make any necessary
63 initializations. */
64 #define NEW_ELEMENT (BEGIN_STATE + 1)
66 /* A value distinct from any character that may have been stored in a
67 buffer as the result of a block-read in the function squeeze_filter. */
68 #define NOT_A_CHAR (unsigned int)(-1)
70 /* The following (but not CC_NO_CLASS) are indices into the array of
71 valid character class strings. */
72 enum Char_class
74 CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
75 CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
76 CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
77 CC_NO_CLASS = 9999
80 /* Character class to which a character (returned by get_next) belonged;
81 but it is set only if the construct from which the character was obtained
82 was one of the character classes [:upper:] or [:lower:]. The value
83 is used only when translating and then, only to make sure that upper
84 and lower class constructs have the same relative positions in string1
85 and string2. */
86 enum Upper_Lower_class
88 UL_LOWER = 0,
89 UL_UPPER = 1,
90 UL_NONE = 2
93 /* A shortcut to ensure that when constructing the translation array,
94 one of the values returned by paired calls to get_next (from s1 and s2)
95 is from [:upper:] and the other is from [:lower:], or neither is from
96 upper or lower. By default, GNU tr permits the identity mappings: from
97 [:upper:] to [:upper:] and [:lower:] to [:lower:]. But when
98 POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
99 by values of type enum Upper_Lower_class. */
100 static int const class_ok[3][3] =
102 {1, 1, 0},
103 {1, 1, 0},
104 {0, 0, 1}
107 /* The type of a List_element. See build_spec_list for more details. */
108 enum Range_element_type
110 RE_NO_TYPE = 0,
111 RE_NORMAL_CHAR,
112 RE_RANGE,
113 RE_CHAR_CLASS,
114 RE_EQUIV_CLASS,
115 RE_REPEATED_CHAR
118 /* One construct in one of tr's argument strings.
119 For example, consider the POSIX version of the classic tr command:
120 tr -cs 'a-zA-Z_' '[\n*]'
121 String1 has 3 constructs, two of which are ranges (a-z and A-Z),
122 and a single normal character, `_'. String2 has one construct. */
123 struct List_element
125 enum Range_element_type type;
126 struct List_element *next;
127 union
129 int normal_char;
130 struct /* unnamed */
132 unsigned int first_char;
133 unsigned int last_char;
135 range;
136 enum Char_class char_class;
137 int equiv_code;
138 struct /* unnamed */
140 unsigned int the_repeated_char;
141 size_t repeat_count;
143 repeated_char;
148 /* Each of tr's argument strings is parsed into a form that is easier
149 to work with: a linked list of constructs (struct List_element).
150 Each Spec_list structure also encapsulates various attributes of
151 the corresponding argument string. The attributes are used mainly
152 to verify that the strings are valid in the context of any options
153 specified (like -s, -d, or -c). The main exception is the member
154 `tail', which is first used to construct the list. After construction,
155 it is used by get_next to save its state when traversing the list.
156 The member `state' serves a similar function. */
157 struct Spec_list
159 /* Points to the head of the list of range elements.
160 The first struct is a dummy; its members are never used. */
161 struct List_element *head;
163 /* When appending, points to the last element. When traversing via
164 get_next(), points to the element to process next. Setting
165 Spec_list.state to the value BEGIN_STATE before calling get_next
166 signals get_next to initialize tail to point to head->next. */
167 struct List_element *tail;
169 /* Used to save state between calls to get_next(). */
170 unsigned int state;
172 /* Length, in the sense that length ('a-z[:digit:]123abc')
173 is 42 ( = 26 + 10 + 6). */
174 size_t length;
176 /* The number of [c*] and [c*0] constructs that appear in this spec. */
177 int n_indefinite_repeats;
179 /* If n_indefinite_repeats is nonzero, this points to the List_element
180 corresponding to the last [c*] or [c*0] construct encountered in
181 this spec. Otherwise it is undefined. */
182 struct List_element *indefinite_repeat_element;
184 /* Non-zero if this spec contains at least one equivalence
185 class construct e.g. [=c=]. */
186 int has_equiv_class;
188 /* Non-zero if this spec contains at least one character class
189 construct. E.g. [:digit:]. */
190 int has_char_class;
192 /* Non-zero if this spec contains at least one of the character class
193 constructs (all but upper and lower) that aren't allowed in s2. */
194 int has_restricted_char_class;
197 /* A representation for escaped string1 or string2. As a string is parsed,
198 any backslash-escaped characters (other than octal or \a, \b, \f, \n,
199 etc.) are marked as such in this structure by setting the corresponding
200 entry in the ESCAPED vector. */
201 struct E_string
203 unsigned char *s;
204 int *escaped;
205 size_t len;
208 /* Return nonzero if the Ith character of escaped string ES matches C
209 and is not escaped itself. */
210 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
212 /* The name by which this program was run. */
213 char *program_name;
215 /* When nonzero, each sequence in the input of a repeated character
216 (call it c) is replaced (in the output) by a single occurrence of c
217 for every c in the squeeze set. */
218 static int squeeze_repeats = 0;
220 /* When nonzero, removes characters in the delete set from input. */
221 static int delete = 0;
223 /* Use the complement of set1 in place of set1. */
224 static int complement = 0;
226 /* When nonzero, this flag causes GNU tr to provide strict
227 compliance with POSIX draft 1003.2.11.2. The POSIX spec
228 says that when -d is used without -s, string2 (if present)
229 must be ignored. Silently ignoring arguments is a bad idea.
230 The default GNU behavior is to give a usage message and exit.
231 Additionally, when this flag is nonzero, tr prints warnings
232 on stderr if it is being used in a manner that is not portable.
233 Applicable warnings are given by default, but are suppressed
234 if the environment variable `POSIXLY_CORRECT' is set, since
235 being POSIX conformant means we can't issue such messages.
236 Warnings on the following topics are suppressed when this
237 variable is nonzero:
238 1. Ambiguous octal escapes. */
239 static int posix_pedantic;
241 /* When tr is performing translation and string1 is longer than string2,
242 POSIX says that the result is undefined. That gives the implementor
243 of a POSIX conforming version of tr two reasonable choices for the
244 semantics of this case.
246 * The BSD tr pads string2 to the length of string1 by
247 repeating the last character in string2.
249 * System V tr ignores characters in string1 that have no
250 corresponding character in string2. That is, string1 is effectively
251 truncated to the length of string2.
253 When nonzero, this flag causes GNU tr to imitate the behavior
254 of System V tr when translating with string1 longer than string2.
255 The default is to emulate BSD tr. This flag is ignored in modes where
256 no translation is performed. Emulating the System V tr
257 in this exceptional case causes the relatively common BSD idiom:
259 tr -cs A-Za-z0-9 '\012'
261 to break (it would convert only zero bytes, rather than all
262 non-alphanumerics, to newlines).
264 WARNING: This switch does not provide general BSD or System V
265 compatibility. For example, it doesn't disable the interpretation
266 of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
267 some unfortunate coincidence you use such constructs in scripts
268 expecting to use some other version of tr, the scripts will break. */
269 static int truncate_set1 = 0;
271 /* An alias for (!delete && non_option_args == 2).
272 It is set in main and used there and in validate(). */
273 static int translating;
275 #ifndef BUFSIZ
276 # define BUFSIZ 8192
277 #endif
279 #define IO_BUF_SIZE BUFSIZ
280 static unsigned char io_buf[IO_BUF_SIZE];
282 static char const *const char_class_name[] =
284 "alnum", "alpha", "blank", "cntrl", "digit", "graph",
285 "lower", "print", "punct", "space", "upper", "xdigit"
287 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
289 typedef char SET_TYPE;
291 /* Array of boolean values. A character `c' is a member of the
292 squeeze set if and only if in_squeeze_set[c] is true. The squeeze
293 set is defined by the last (possibly, the only) string argument
294 on the command line when the squeeze option is given. */
295 static SET_TYPE in_squeeze_set[N_CHARS];
297 /* Array of boolean values. A character `c' is a member of the
298 delete set if and only if in_delete_set[c] is true. The delete
299 set is defined by the first (or only) string argument on the
300 command line when the delete option is given. */
301 static SET_TYPE in_delete_set[N_CHARS];
303 /* Array of character values defining the translation (if any) that
304 tr is to perform. Translation is performed only when there are
305 two specification strings and the delete switch is not given. */
306 static char xlate[N_CHARS];
308 static struct option const long_options[] =
310 {"complement", no_argument, NULL, 'c'},
311 {"delete", no_argument, NULL, 'd'},
312 {"squeeze-repeats", no_argument, NULL, 's'},
313 {"truncate-set1", no_argument, NULL, 't'},
314 {GETOPT_HELP_OPTION_DECL},
315 {GETOPT_VERSION_OPTION_DECL},
316 {NULL, 0, NULL, 0}
319 void
320 usage (int status)
322 if (status != 0)
323 fprintf (stderr, _("Try `%s --help' for more information.\n"),
324 program_name);
325 else
327 printf (_("\
328 Usage: %s [OPTION]... SET1 [SET2]\n\
330 program_name);
331 fputs (_("\
332 Translate, squeeze, and/or delete characters from standard input,\n\
333 writing to standard output.\n\
335 -c, --complement first complement SET1\n\
336 -d, --delete delete characters in SET1, do not translate\n\
337 -s, --squeeze-repeats replace each input sequence of a repeated character\n\
338 that is listed in SET1 with a single occurrence\n\
339 of that character\n\
340 -t, --truncate-set1 first truncate SET1 to length of SET2\n\
341 "), stdout);
342 fputs (HELP_OPTION_DESCRIPTION, stdout);
343 fputs (VERSION_OPTION_DESCRIPTION, stdout);
344 fputs (_("\
346 SETs are specified as strings of characters. Most represent themselves.\n\
347 Interpreted sequences are:\n\
349 \\NNN character with octal value NNN (1 to 3 octal digits)\n\
350 \\\\ backslash\n\
351 \\a audible BEL\n\
352 \\b backspace\n\
353 \\f form feed\n\
354 \\n new line\n\
355 \\r return\n\
356 \\t horizontal tab\n\
357 "), stdout);
358 fputs (_("\
359 \\v vertical tab\n\
360 CHAR1-CHAR2 all characters from CHAR1 to CHAR2 in ascending order\n\
361 [CHAR*] in SET2, copies of CHAR until length of SET1\n\
362 [CHAR*REPEAT] REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
363 [:alnum:] all letters and digits\n\
364 [:alpha:] all letters\n\
365 [:blank:] all horizontal whitespace\n\
366 [:cntrl:] all control characters\n\
367 [:digit:] all digits\n\
368 "), stdout);
369 fputs (_("\
370 [:graph:] all printable characters, not including space\n\
371 [:lower:] all lower case letters\n\
372 [:print:] all printable characters, including space\n\
373 [:punct:] all punctuation characters\n\
374 [:space:] all horizontal or vertical whitespace\n\
375 [:upper:] all upper case letters\n\
376 [:xdigit:] all hexadecimal digits\n\
377 [=CHAR=] all characters which are equivalent to CHAR\n\
378 "), stdout);
379 fputs (_("\
381 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
382 -t may be used only when translating. SET2 is extended to length of\n\
383 SET1 by repeating its last character as necessary. \
384 "), stdout);
385 fputs (_("\
386 Excess characters\n\
387 of SET2 are ignored. Only [:lower:] and [:upper:] are guaranteed to\n\
388 expand in ascending order; used in SET2 while translating, they may\n\
389 only be used in pairs to specify case conversion. \
390 "), stdout);
391 fputs (_("\
392 -s uses SET1 if not\n\
393 translating nor deleting; else squeezing uses SET2 and occurs after\n\
394 translation or deletion.\n\
395 "), stdout);
396 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
398 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
401 /* Return nonzero if the character C is a member of the
402 equivalence class containing the character EQUIV_CLASS. */
404 static int
405 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
407 return (equiv_class == c);
410 /* Return nonzero if the character C is a member of the
411 character class CHAR_CLASS. */
413 static int
414 is_char_class_member (enum Char_class char_class, unsigned int c)
416 int result;
418 switch (char_class)
420 case CC_ALNUM:
421 result = ISALNUM (c);
422 break;
423 case CC_ALPHA:
424 result = ISALPHA (c);
425 break;
426 case CC_BLANK:
427 result = ISBLANK (c);
428 break;
429 case CC_CNTRL:
430 result = ISCNTRL (c);
431 break;
432 case CC_DIGIT:
433 result = ISDIGIT_LOCALE (c);
434 break;
435 case CC_GRAPH:
436 result = ISGRAPH (c);
437 break;
438 case CC_LOWER:
439 result = ISLOWER (c);
440 break;
441 case CC_PRINT:
442 result = ISPRINT (c);
443 break;
444 case CC_PUNCT:
445 result = ISPUNCT (c);
446 break;
447 case CC_SPACE:
448 result = ISSPACE (c);
449 break;
450 case CC_UPPER:
451 result = ISUPPER (c);
452 break;
453 case CC_XDIGIT:
454 result = ISXDIGIT (c);
455 break;
456 default:
457 abort ();
458 break;
460 return result;
463 static void
464 es_free (struct E_string *es)
466 free (es->s);
467 free (es->escaped);
470 /* Perform the first pass over each range-spec argument S, converting all
471 \c and \ddd escapes to their one-byte representations. The conversion
472 is done in-place, so S must point to writable storage. If an invalid
473 quote sequence is found print an error message and return nonzero.
474 Otherwise set *LEN to the length of the resulting string and return
475 zero. The resulting array of characters may contain zero-bytes;
476 however, on input, S is assumed to be null-terminated, and hence
477 cannot contain actual (non-escaped) zero bytes. */
479 static int
480 unquote (const unsigned char *s, struct E_string *es)
482 size_t i, j;
483 size_t len;
485 len = strlen ((char *) s);
487 es->s = (unsigned char *) xmalloc (len);
488 es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
489 for (i = 0; i < len; i++)
490 es->escaped[i] = 0;
492 j = 0;
493 for (i = 0; s[i]; i++)
495 switch (s[i])
497 int c;
498 case '\\':
499 switch (s[i + 1])
501 int oct_digit;
502 case '\\':
503 c = '\\';
504 break;
505 case 'a':
506 c = '\007';
507 break;
508 case 'b':
509 c = '\b';
510 break;
511 case 'f':
512 c = '\f';
513 break;
514 case 'n':
515 c = '\n';
516 break;
517 case 'r':
518 c = '\r';
519 break;
520 case 't':
521 c = '\t';
522 break;
523 case 'v':
524 c = '\v';
525 break;
526 case '0':
527 case '1':
528 case '2':
529 case '3':
530 case '4':
531 case '5':
532 case '6':
533 case '7':
534 c = s[i + 1] - '0';
535 oct_digit = s[i + 2] - '0';
536 if (0 <= oct_digit && oct_digit <= 7)
538 c = 8 * c + oct_digit;
539 ++i;
540 oct_digit = s[i + 2] - '0';
541 if (0 <= oct_digit && oct_digit <= 7)
543 if (8 * c + oct_digit < N_CHARS)
545 c = 8 * c + oct_digit;
546 ++i;
548 else if (!posix_pedantic)
550 /* A 3-digit octal number larger than \377 won't
551 fit in 8 bits. So we stop when adding the
552 next digit would put us over the limit and
553 give a warning about the ambiguity. POSIX
554 isn't clear on this, but one person has said
555 that in his interpretation, POSIX says tr
556 can't even give a warning. */
557 error (0, 0, _("warning: the ambiguous octal escape \
558 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
559 s[i], s[i + 1], s[i + 2],
560 s[i], s[i + 1], s[i + 2]);
564 break;
565 case '\0':
566 error (0, 0, _("invalid backslash escape at end of string"));
567 return 1;
569 default:
570 if (posix_pedantic)
572 error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
573 return 1;
575 else
577 c = s[i + 1];
578 es->escaped[j] = 1;
581 ++i;
582 es->s[j++] = c;
583 break;
584 default:
585 es->s[j++] = s[i];
586 break;
589 es->len = j;
590 return 0;
593 /* If CLASS_STR is a valid character class string, return its index
594 in the global char_class_name array. Otherwise, return CC_NO_CLASS. */
596 static enum Char_class
597 look_up_char_class (const unsigned char *class_str, size_t len)
599 unsigned int i;
601 for (i = 0; i < N_CHAR_CLASSES; i++)
602 if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
603 && strlen (char_class_name[i]) == len)
604 return (enum Char_class) i;
605 return CC_NO_CLASS;
608 /* Return a newly allocated string with a printable version of C.
609 This function is used solely for formatting error messages. */
611 static char *
612 make_printable_char (unsigned int c)
614 char *buf = xmalloc (5);
616 assert (c < N_CHARS);
617 if (ISPRINT (c))
619 buf[0] = c;
620 buf[1] = '\0';
622 else
624 sprintf (buf, "\\%03o", c);
626 return buf;
629 /* Return a newly allocated copy of S which is suitable for printing.
630 LEN is the number of characters in S. Most non-printing
631 (isprint) characters are represented by a backslash followed by
632 3 octal digits. However, the characters represented by \c escapes
633 where c is one of [abfnrtv] are represented by their 2-character \c
634 sequences. This function is used solely for printing error messages. */
636 static char *
637 make_printable_str (const unsigned char *s, size_t len)
639 /* Worst case is that every character expands to a backslash
640 followed by a 3-character octal escape sequence. */
641 char *printable_buf = xmalloc (4 * len + 1);
642 char *p = printable_buf;
643 size_t i;
645 for (i = 0; i < len; i++)
647 char buf[5];
648 char *tmp = NULL;
650 switch (s[i])
652 case '\\':
653 tmp = "\\";
654 break;
655 case '\007':
656 tmp = "\\a";
657 break;
658 case '\b':
659 tmp = "\\b";
660 break;
661 case '\f':
662 tmp = "\\f";
663 break;
664 case '\n':
665 tmp = "\\n";
666 break;
667 case '\r':
668 tmp = "\\r";
669 break;
670 case '\t':
671 tmp = "\\t";
672 break;
673 case '\v':
674 tmp = "\\v";
675 break;
676 default:
677 if (ISPRINT (s[i]))
679 buf[0] = s[i];
680 buf[1] = '\0';
682 else
683 sprintf (buf, "\\%03o", s[i]);
684 tmp = buf;
685 break;
687 p = stpcpy (p, tmp);
689 return printable_buf;
692 /* Append a newly allocated structure representing a
693 character C to the specification list LIST. */
695 static void
696 append_normal_char (struct Spec_list *list, unsigned int c)
698 struct List_element *new;
700 new = (struct List_element *) xmalloc (sizeof (struct List_element));
701 new->next = NULL;
702 new->type = RE_NORMAL_CHAR;
703 new->u.normal_char = c;
704 assert (list->tail);
705 list->tail->next = new;
706 list->tail = new;
709 /* Append a newly allocated structure representing the range
710 of characters from FIRST to LAST to the specification list LIST.
711 Return nonzero if LAST precedes FIRST in the collating sequence,
712 zero otherwise. This means that '[c-c]' is acceptable. */
714 static int
715 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
717 struct List_element *new;
719 if (ORD (first) > ORD (last))
721 char *tmp1 = make_printable_char (first);
722 char *tmp2 = make_printable_char (last);
724 error (0, 0,
725 _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
726 tmp1, tmp2);
727 free (tmp1);
728 free (tmp2);
729 return 1;
731 new = (struct List_element *) xmalloc (sizeof (struct List_element));
732 new->next = NULL;
733 new->type = RE_RANGE;
734 new->u.range.first_char = first;
735 new->u.range.last_char = last;
736 assert (list->tail);
737 list->tail->next = new;
738 list->tail = new;
739 return 0;
742 /* If CHAR_CLASS_STR is a valid character class string, append a
743 newly allocated structure representing that character class to the end
744 of the specification list LIST and return 0. If CHAR_CLASS_STR is not
745 a valid string return nonzero. */
747 static int
748 append_char_class (struct Spec_list *list,
749 const unsigned char *char_class_str, size_t len)
751 enum Char_class char_class;
752 struct List_element *new;
754 char_class = look_up_char_class (char_class_str, len);
755 if (char_class == CC_NO_CLASS)
756 return 1;
757 new = (struct List_element *) xmalloc (sizeof (struct List_element));
758 new->next = NULL;
759 new->type = RE_CHAR_CLASS;
760 new->u.char_class = char_class;
761 assert (list->tail);
762 list->tail->next = new;
763 list->tail = new;
764 return 0;
767 /* Append a newly allocated structure representing a [c*n]
768 repeated character construct to the specification list LIST.
769 THE_CHAR is the single character to be repeated, and REPEAT_COUNT
770 is a non-negative repeat count. */
772 static void
773 append_repeated_char (struct Spec_list *list, unsigned int the_char,
774 size_t repeat_count)
776 struct List_element *new;
778 new = (struct List_element *) xmalloc (sizeof (struct List_element));
779 new->next = NULL;
780 new->type = RE_REPEATED_CHAR;
781 new->u.repeated_char.the_repeated_char = the_char;
782 new->u.repeated_char.repeat_count = repeat_count;
783 assert (list->tail);
784 list->tail->next = new;
785 list->tail = new;
788 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
789 the length of that string, LEN, if LEN is exactly one, append
790 a newly allocated structure representing the specified
791 equivalence class to the specification list, LIST and return zero.
792 If LEN is not 1, return nonzero. */
794 static int
795 append_equiv_class (struct Spec_list *list,
796 const unsigned char *equiv_class_str, size_t len)
798 struct List_element *new;
800 if (len != 1)
801 return 1;
802 new = (struct List_element *) xmalloc (sizeof (struct List_element));
803 new->next = NULL;
804 new->type = RE_EQUIV_CLASS;
805 new->u.equiv_code = *equiv_class_str;
806 assert (list->tail);
807 list->tail->next = new;
808 list->tail = new;
809 return 0;
812 /* Return a newly allocated copy of the LEN-byte prefix of P.
813 The returned string may contain NUL bytes and is *not* NUL-terminated. */
815 static unsigned char *
816 xmemdup (const unsigned char *p, size_t len)
818 unsigned char *tmp = (unsigned char *) xmalloc (len);
820 /* Use memcpy rather than strncpy because `p' may contain zero-bytes. */
821 memcpy (tmp, p, len);
822 return tmp;
825 /* Search forward starting at START_IDX for the 2-char sequence
826 (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such
827 a sequence is found, set *RESULT_IDX to the index of the first
828 character and return nonzero. Otherwise return zero. P may contain
829 zero bytes. */
831 static int
832 find_closing_delim (const struct E_string *es, size_t start_idx,
833 unsigned int pre_bracket_char, size_t *result_idx)
835 size_t i;
837 for (i = start_idx; i < es->len - 1; i++)
838 if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
839 && !es->escaped[i] && !es->escaped[i + 1])
841 *result_idx = i;
842 return 1;
844 return 0;
847 /* Parse the bracketed repeat-char syntax. If the P_LEN characters
848 beginning with P[ START_IDX ] comprise a valid [c*n] construct,
849 then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
850 and return zero. If the second character following
851 the opening bracket is not `*' or if no closing bracket can be
852 found, return -1. If a closing bracket is found and the
853 second char is `*', but the string between the `*' and `]' isn't
854 empty, an octal number, or a decimal number, print an error message
855 and return -2. */
857 static int
858 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
859 unsigned int *char_to_repeat, size_t *repeat_count,
860 size_t *closing_bracket_idx)
862 size_t i;
864 assert (start_idx + 1 < es->len);
865 if (!ES_MATCH (es, start_idx + 1, '*'))
866 return -1;
868 for (i = start_idx + 2; i < es->len; i++)
870 if (ES_MATCH (es, i, ']'))
872 size_t digit_str_len = i - start_idx - 2;
874 *char_to_repeat = es->s[start_idx];
875 if (digit_str_len == 0)
877 /* We've matched [c*] -- no explicit repeat count. */
878 *repeat_count = 0;
879 *closing_bracket_idx = i;
880 return 0;
883 /* Here, we have found [c*s] where s should be a string
884 of octal (if it starts with `0') or decimal digits. */
886 const char *digit_str = (const char *) &es->s[start_idx + 2];
887 unsigned long int tmp_ulong;
888 char *d_end;
889 int base = 10;
890 /* Select the base manually so we can be sure it's either 8 or 10.
891 If the spec allowed it to be interpreted as hexadecimal, we
892 could have used `0' and let xstrtoul decide. */
893 if (*digit_str == '0')
895 base = 8;
896 ++digit_str;
897 --digit_str_len;
899 if (xstrtoul (digit_str, &d_end, base, &tmp_ulong, NULL)
900 != LONGINT_OK
901 || BEGIN_STATE < tmp_ulong
902 || d_end - digit_str != digit_str_len)
904 char *tmp = make_printable_str (es->s + start_idx + 2,
905 i - start_idx - 2);
906 error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
907 tmp);
908 free (tmp);
909 return -2;
911 *repeat_count = tmp_ulong;
913 *closing_bracket_idx = i;
914 return 0;
917 return -1; /* No bracket found. */
920 /* Return nonzero if the string at ES->s[IDX] matches the regular
921 expression `\*[0-9]*\]', zero otherwise. To match, the `*' and
922 the `]' must not be escaped. */
924 static int
925 star_digits_closebracket (const struct E_string *es, size_t idx)
927 size_t i;
929 if (!ES_MATCH (es, idx, '*'))
930 return 0;
932 for (i = idx + 1; i < es->len; i++)
934 if (!ISDIGIT (es->s[i]))
936 if (ES_MATCH (es, i, ']'))
937 return 1;
938 return 0;
941 return 0;
944 /* Convert string UNESCAPED_STRING (which has been preprocessed to
945 convert backslash-escape sequences) of length LEN characters into
946 a linked list of the following 5 types of constructs:
947 - [:str:] Character class where `str' is one of the 12 valid strings.
948 - [=c=] Equivalence class where `c' is any single character.
949 - [c*n] Repeat the single character `c' `n' times. n may be omitted.
950 However, if `n' is present, it must be a non-negative octal or
951 decimal integer.
952 - r-s Range of characters from `r' to `s'. The second endpoint must
953 not precede the first in the current collating sequence.
954 - c Any other character is interpreted as itself. */
956 static int
957 build_spec_list (const struct E_string *es, struct Spec_list *result)
959 const unsigned char *p;
960 size_t i;
962 p = es->s;
964 /* The main for-loop below recognizes the 4 multi-character constructs.
965 A character that matches (in its context) none of the multi-character
966 constructs is classified as `normal'. Since all multi-character
967 constructs have at least 3 characters, any strings of length 2 or
968 less are composed solely of normal characters. Hence, the index of
969 the outer for-loop runs only as far as LEN-2. */
971 for (i = 0; i + 2 < es->len; /* empty */)
973 if (ES_MATCH (es, i, '['))
975 int matched_multi_char_construct;
976 size_t closing_bracket_idx;
977 unsigned int char_to_repeat;
978 size_t repeat_count;
979 int err;
981 matched_multi_char_construct = 1;
982 if (ES_MATCH (es, i + 1, ':')
983 || ES_MATCH (es, i + 1, '='))
985 size_t closing_delim_idx;
986 int found;
988 found = find_closing_delim (es, i + 2, p[i + 1],
989 &closing_delim_idx);
990 if (found)
992 int parse_failed;
993 size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
994 unsigned char *opnd_str;
996 if (opnd_str_len == 0)
998 if (p[i + 1] == ':')
999 error (0, 0, _("missing character class name `[::]'"));
1000 else
1001 error (0, 0,
1002 _("missing equivalence class character `[==]'"));
1003 return 1;
1006 opnd_str = xmemdup (p + i + 2, opnd_str_len);
1008 if (p[i + 1] == ':')
1010 parse_failed = append_char_class (result, opnd_str,
1011 opnd_str_len);
1013 /* FIXME: big comment. */
1014 if (parse_failed)
1016 if (star_digits_closebracket (es, i + 2))
1018 free (opnd_str);
1019 goto try_bracketed_repeat;
1021 else
1023 char *tmp = make_printable_str (opnd_str,
1024 opnd_str_len);
1025 error (0, 0, _("invalid character class `%s'"),
1026 tmp);
1027 free (tmp);
1028 return 1;
1032 else
1034 parse_failed = append_equiv_class (result, opnd_str,
1035 opnd_str_len);
1037 /* FIXME: big comment. */
1038 if (parse_failed)
1040 if (star_digits_closebracket (es, i + 2))
1042 free (opnd_str);
1043 goto try_bracketed_repeat;
1045 else
1047 char *tmp = make_printable_str (opnd_str,
1048 opnd_str_len);
1049 error (0, 0,
1050 _("%s: equivalence class operand must be a single character"),
1051 tmp);
1052 free (tmp);
1053 return 1;
1057 free (opnd_str);
1059 /* Return nonzero if append_*_class reports a problem. */
1060 if (parse_failed)
1061 return 1;
1062 else
1063 i = closing_delim_idx + 2;
1064 continue;
1066 /* Else fall through. This could be [:*] or [=*]. */
1069 try_bracketed_repeat:
1071 /* Determine whether this is a bracketed repeat range
1072 matching the RE \[.\*(dec_or_oct_number)?\]. */
1073 err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1074 &repeat_count,
1075 &closing_bracket_idx);
1076 if (err == 0)
1078 append_repeated_char (result, char_to_repeat, repeat_count);
1079 i = closing_bracket_idx + 1;
1081 else if (err == -1)
1083 matched_multi_char_construct = 0;
1085 else
1087 /* Found a string that looked like [c*n] but the
1088 numeric part was invalid. */
1089 return 1;
1092 if (matched_multi_char_construct)
1093 continue;
1095 /* We reach this point if P does not match [:str:], [=c=],
1096 [c*n], or [c*]. Now, see if P looks like a range `[-c'
1097 (from `[' to `c'). */
1100 /* Look ahead one char for ranges like a-z. */
1101 if (ES_MATCH (es, i + 1, '-'))
1103 if (append_range (result, p[i], p[i + 2]))
1104 return 1;
1105 i += 3;
1107 else
1109 append_normal_char (result, p[i]);
1110 ++i;
1114 /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1]. */
1115 for (; i < es->len; i++)
1116 append_normal_char (result, p[i]);
1118 return 0;
1121 /* Given a Spec_list S (with its saved state implicit in the values
1122 of its members `tail' and `state'), return the next single character
1123 in the expansion of S's constructs. If the last character of S was
1124 returned on the previous call or if S was empty, this function
1125 returns -1. For example, successive calls to get_next where S
1126 represents the spec-string 'a-d[y*3]' will return the sequence
1127 of values a, b, c, d, y, y, y, -1. Finally, if the construct from
1128 which the returned character comes is [:upper:] or [:lower:], the
1129 parameter CLASS is given a value to indicate which it was. Otherwise
1130 CLASS is set to UL_NONE. This value is used only when constructing
1131 the translation table to verify that any occurrences of upper and
1132 lower class constructs in the spec-strings appear in the same relative
1133 positions. */
1135 static int
1136 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1138 struct List_element *p;
1139 int return_val;
1140 int i;
1142 if (class)
1143 *class = UL_NONE;
1145 if (s->state == BEGIN_STATE)
1147 s->tail = s->head->next;
1148 s->state = NEW_ELEMENT;
1151 p = s->tail;
1152 if (p == NULL)
1153 return -1;
1155 switch (p->type)
1157 case RE_NORMAL_CHAR:
1158 return_val = p->u.normal_char;
1159 s->state = NEW_ELEMENT;
1160 s->tail = p->next;
1161 break;
1163 case RE_RANGE:
1164 if (s->state == NEW_ELEMENT)
1165 s->state = ORD (p->u.range.first_char);
1166 else
1167 ++(s->state);
1168 return_val = CHR (s->state);
1169 if (s->state == ORD (p->u.range.last_char))
1171 s->tail = p->next;
1172 s->state = NEW_ELEMENT;
1174 break;
1176 case RE_CHAR_CLASS:
1177 if (class)
1179 int upper_or_lower;
1180 switch (p->u.char_class)
1182 case CC_LOWER:
1183 *class = UL_LOWER;
1184 upper_or_lower = 1;
1185 break;
1186 case CC_UPPER:
1187 *class = UL_UPPER;
1188 upper_or_lower = 1;
1189 break;
1190 default:
1191 upper_or_lower = 0;
1192 break;
1195 if (upper_or_lower)
1197 s->tail = p->next;
1198 s->state = NEW_ELEMENT;
1199 return_val = 0;
1200 break;
1204 if (s->state == NEW_ELEMENT)
1206 for (i = 0; i < N_CHARS; i++)
1207 if (is_char_class_member (p->u.char_class, i))
1208 break;
1209 assert (i < N_CHARS);
1210 s->state = i;
1212 assert (is_char_class_member (p->u.char_class, s->state));
1213 return_val = CHR (s->state);
1214 for (i = s->state + 1; i < N_CHARS; i++)
1215 if (is_char_class_member (p->u.char_class, i))
1216 break;
1217 if (i < N_CHARS)
1218 s->state = i;
1219 else
1221 s->tail = p->next;
1222 s->state = NEW_ELEMENT;
1224 break;
1226 case RE_EQUIV_CLASS:
1227 /* FIXME: this assumes that each character is alone in its own
1228 equivalence class (which appears to be correct for my
1229 LC_COLLATE. But I don't know of any function that allows
1230 one to determine a character's equivalence class. */
1232 return_val = p->u.equiv_code;
1233 s->state = NEW_ELEMENT;
1234 s->tail = p->next;
1235 break;
1237 case RE_REPEATED_CHAR:
1238 /* Here, a repeat count of n == 0 means don't repeat at all. */
1239 if (p->u.repeated_char.repeat_count == 0)
1241 s->tail = p->next;
1242 s->state = NEW_ELEMENT;
1243 return_val = get_next (s, class);
1245 else
1247 if (s->state == NEW_ELEMENT)
1249 s->state = 0;
1251 ++(s->state);
1252 return_val = p->u.repeated_char.the_repeated_char;
1253 if (p->u.repeated_char.repeat_count > 0
1254 && s->state == p->u.repeated_char.repeat_count)
1256 s->tail = p->next;
1257 s->state = NEW_ELEMENT;
1260 break;
1262 case RE_NO_TYPE:
1263 abort ();
1264 break;
1266 default:
1267 abort ();
1268 break;
1271 return return_val;
1274 /* This is a minor kludge. This function is called from
1275 get_spec_stats to determine the cardinality of a set derived
1276 from a complemented string. It's a kludge in that some of the
1277 same operations are (duplicated) performed in set_initialize. */
1279 static int
1280 card_of_complement (struct Spec_list *s)
1282 int c;
1283 int cardinality = N_CHARS;
1284 SET_TYPE in_set[N_CHARS];
1286 memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1287 s->state = BEGIN_STATE;
1288 while ((c = get_next (s, NULL)) != -1)
1289 if (!in_set[c]++)
1290 --cardinality;
1291 return cardinality;
1294 /* Gather statistics about the spec-list S in preparation for the tests
1295 in validate that determine the consistency of the specs. This function
1296 is called at most twice; once for string1, and again for any string2.
1297 LEN_S1 < 0 indicates that this is the first call and that S represents
1298 string1. When LEN_S1 >= 0, it is the length of the expansion of the
1299 constructs in string1, and we can use its value to resolve any
1300 indefinite repeat construct in S (which represents string2). Hence,
1301 this function has the side-effect that it converts a valid [c*]
1302 construct in string2 to [c*n] where n is large enough (or 0) to give
1303 string2 the same length as string1. For example, with the command
1304 tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1305 be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
1307 static void
1308 get_spec_stats (struct Spec_list *s)
1310 struct List_element *p;
1311 int len = 0;
1313 s->n_indefinite_repeats = 0;
1314 s->has_equiv_class = 0;
1315 s->has_restricted_char_class = 0;
1316 s->has_char_class = 0;
1317 for (p = s->head->next; p; p = p->next)
1319 switch (p->type)
1321 int i;
1322 case RE_NORMAL_CHAR:
1323 ++len;
1324 break;
1326 case RE_RANGE:
1327 assert (p->u.range.last_char >= p->u.range.first_char);
1328 len += p->u.range.last_char - p->u.range.first_char + 1;
1329 break;
1331 case RE_CHAR_CLASS:
1332 s->has_char_class = 1;
1333 for (i = 0; i < N_CHARS; i++)
1334 if (is_char_class_member (p->u.char_class, i))
1335 ++len;
1336 switch (p->u.char_class)
1338 case CC_UPPER:
1339 case CC_LOWER:
1340 break;
1341 default:
1342 s->has_restricted_char_class = 1;
1343 break;
1345 break;
1347 case RE_EQUIV_CLASS:
1348 for (i = 0; i < N_CHARS; i++)
1349 if (is_equiv_class_member (p->u.equiv_code, i))
1350 ++len;
1351 s->has_equiv_class = 1;
1352 break;
1354 case RE_REPEATED_CHAR:
1355 if (p->u.repeated_char.repeat_count > 0)
1356 len += p->u.repeated_char.repeat_count;
1357 else if (p->u.repeated_char.repeat_count == 0)
1359 s->indefinite_repeat_element = p;
1360 ++(s->n_indefinite_repeats);
1362 break;
1364 case RE_NO_TYPE:
1365 assert (0);
1366 break;
1370 s->length = len;
1373 static void
1374 get_s1_spec_stats (struct Spec_list *s1)
1376 get_spec_stats (s1);
1377 if (complement)
1378 s1->length = card_of_complement (s1);
1381 static void
1382 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1384 get_spec_stats (s2);
1385 if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1387 s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1388 len_s1 - s2->length;
1389 s2->length = len_s1;
1393 static void
1394 spec_init (struct Spec_list *spec_list)
1396 spec_list->head = spec_list->tail =
1397 (struct List_element *) xmalloc (sizeof (struct List_element));
1398 spec_list->head->next = NULL;
1401 /* This function makes two passes over the argument string S. The first
1402 one converts all \c and \ddd escapes to their one-byte representations.
1403 The second constructs a linked specification list, SPEC_LIST, of the
1404 characters and constructs that comprise the argument string. If either
1405 of these passes detects an error, this function returns nonzero. */
1407 static int
1408 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1410 struct E_string es;
1411 int fail;
1413 fail = unquote (s, &es);
1414 if (!fail)
1415 fail = build_spec_list (&es, spec_list);
1416 es_free (&es);
1417 return fail;
1420 /* Given two specification lists, S1 and S2, and assuming that
1421 S1->length > S2->length, append a single [c*n] element to S2 where c
1422 is the last character in the expansion of S2 and n is the difference
1423 between the two lengths.
1424 Upon successful completion, S2->length is set to S1->length. The only
1425 way this function can fail to make S2 as long as S1 is when S2 has
1426 zero-length, since in that case, there is no last character to repeat.
1427 So S2->length is required to be at least 1.
1429 Providing this functionality allows the user to do some pretty
1430 non-BSD (and non-portable) things: For example, the command
1431 tr -cs '[:upper:]0-9' '[:lower:]'
1432 is almost guaranteed to give results that depend on your collating
1433 sequence. */
1435 static void
1436 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1438 struct List_element *p;
1439 int char_to_repeat;
1440 int i;
1442 assert (translating);
1443 assert (s1->length > s2->length);
1444 assert (s2->length > 0);
1446 p = s2->tail;
1447 switch (p->type)
1449 case RE_NORMAL_CHAR:
1450 char_to_repeat = p->u.normal_char;
1451 break;
1452 case RE_RANGE:
1453 char_to_repeat = p->u.range.last_char;
1454 break;
1455 case RE_CHAR_CLASS:
1456 for (i = N_CHARS; i >= 0; i--)
1457 if (is_char_class_member (p->u.char_class, i))
1458 break;
1459 assert (i >= 0);
1460 char_to_repeat = CHR (i);
1461 break;
1463 case RE_REPEATED_CHAR:
1464 char_to_repeat = p->u.repeated_char.the_repeated_char;
1465 break;
1467 case RE_EQUIV_CLASS:
1468 /* This shouldn't happen, because validate exits with an error
1469 if it finds an equiv class in string2 when translating. */
1470 abort ();
1471 break;
1473 case RE_NO_TYPE:
1474 abort ();
1475 break;
1477 default:
1478 abort ();
1479 break;
1482 append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1483 s2->length = s1->length;
1486 /* Return non-zero if S is a non-empty list in which exactly one
1487 character (but potentially, many instances of it) appears.
1488 E.g. [X*] or xxxxxxxx. */
1490 static int
1491 homogeneous_spec_list (struct Spec_list *s)
1493 int b, c;
1495 s->state = BEGIN_STATE;
1497 if ((b = get_next (s, NULL)) == -1)
1498 return 0;
1500 while ((c = get_next (s, NULL)) != -1)
1501 if (c != b)
1502 return 0;
1504 return 1;
1507 /* Die with an error message if S1 and S2 describe strings that
1508 are not valid with the given command line switches.
1509 A side effect of this function is that if a valid [c*] or
1510 [c*0] construct appears in string2, it is converted to [c*n]
1511 with a value for n that makes s2->length == s1->length. By
1512 the same token, if the --truncate-set1 option is not
1513 given, S2 may be extended. */
1515 static void
1516 validate (struct Spec_list *s1, struct Spec_list *s2)
1518 get_s1_spec_stats (s1);
1519 if (s1->n_indefinite_repeats > 0)
1521 error (EXIT_FAILURE, 0,
1522 _("the [c*] repeat construct may not appear in string1"));
1525 if (s2)
1527 get_s2_spec_stats (s2, s1->length);
1529 if (s2->n_indefinite_repeats > 1)
1531 error (EXIT_FAILURE, 0,
1532 _("only one [c*] repeat construct may appear in string2"));
1535 if (translating)
1537 if (s2->has_equiv_class)
1539 error (EXIT_FAILURE, 0,
1540 _("[=c=] expressions may not appear in string2 \
1541 when translating"));
1544 if (s1->length > s2->length)
1546 if (!truncate_set1)
1548 /* string2 must be non-empty unless --truncate-set1 is
1549 given or string1 is empty. */
1551 if (s2->length == 0)
1552 error (EXIT_FAILURE, 0,
1553 _("when not truncating set1, string2 must be non-empty"));
1554 string2_extend (s1, s2);
1558 if (complement && s1->has_char_class
1559 && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1561 error (EXIT_FAILURE, 0,
1562 _("when translating with complemented character classes,\
1563 \nstring2 must map all characters in the domain to one"));
1566 if (s2->has_restricted_char_class)
1568 error (EXIT_FAILURE, 0,
1569 _("when translating, the only character classes that may \
1570 appear in\nstring2 are `upper' and `lower'"));
1573 else
1574 /* Not translating. */
1576 if (s2->n_indefinite_repeats > 0)
1577 error (EXIT_FAILURE, 0,
1578 _("the [c*] construct may appear in string2 only \
1579 when translating"));
1584 /* Read buffers of SIZE bytes via the function READER (if READER is
1585 NULL, read from stdin) until EOF. When non-NULL, READER is either
1586 read_and_delete or read_and_xlate. After each buffer is read, it is
1587 processed and written to stdout. The buffers are processed so that
1588 multiple consecutive occurrences of the same character in the input
1589 stream are replaced by a single occurrence of that character if the
1590 character is in the squeeze set. */
1592 static void
1593 squeeze_filter (unsigned char *buf, size_t size, Filter reader)
1595 unsigned int char_to_squeeze = NOT_A_CHAR;
1596 size_t i = 0;
1597 ssize_t nr = 0;
1599 for (;;)
1601 size_t begin;
1603 if (i >= nr)
1605 if (reader == NULL)
1607 ssize_t signed_nr = safe_read (0, (char *) buf, size);
1608 if (signed_nr < 0)
1609 error (EXIT_FAILURE, errno, _("read error"));
1610 nr = signed_nr;
1612 else
1614 nr = (*reader) (buf, size, NULL);
1617 if (nr == 0)
1618 break;
1619 i = 0;
1622 begin = i;
1624 if (char_to_squeeze == NOT_A_CHAR)
1626 size_t out_len;
1627 /* Here, by being a little tricky, we can get a significant
1628 performance increase in most cases when the input is
1629 reasonably large. Since tr will modify the input only
1630 if two consecutive (and identical) input characters are
1631 in the squeeze set, we can step by two through the data
1632 when searching for a character in the squeeze set. This
1633 means there may be a little more work in a few cases and
1634 perhaps twice as much work in the worst cases where most
1635 of the input is removed by squeezing repeats. But most
1636 uses of this functionality seem to remove less than 20-30%
1637 of the input. */
1638 for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1639 ; /* empty */
1641 /* There is a special case when i == nr and we've just
1642 skipped a character (the last one in buf) that is in
1643 the squeeze set. */
1644 if (i == nr && in_squeeze_set[buf[i - 1]])
1645 --i;
1647 if (i >= nr)
1648 out_len = nr - begin;
1649 else
1651 char_to_squeeze = buf[i];
1652 /* We're about to output buf[begin..i]. */
1653 out_len = i - begin + 1;
1655 /* But since we stepped by 2 in the loop above,
1656 out_len may be one too large. */
1657 if (i > 0 && buf[i - 1] == char_to_squeeze)
1658 --out_len;
1660 /* Advance i to the index of first character to be
1661 considered when looking for a char different from
1662 char_to_squeeze. */
1663 ++i;
1665 if (out_len > 0
1666 && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1667 error (EXIT_FAILURE, errno, _("write error"));
1670 if (char_to_squeeze != NOT_A_CHAR)
1672 /* Advance i to index of first char != char_to_squeeze
1673 (or to nr if all the rest of the characters in this
1674 buffer are the same as char_to_squeeze). */
1675 for (; i < nr && buf[i] == char_to_squeeze; i++)
1676 ; /* empty */
1677 if (i < nr)
1678 char_to_squeeze = NOT_A_CHAR;
1679 /* If (i >= nr) we've squeezed the last character in this buffer.
1680 So now we have to read a new buffer and continue comparing
1681 characters against char_to_squeeze. */
1686 /* Read buffers of SIZE bytes from stdin until one is found that
1687 contains at least one character not in the delete set. Store
1688 in the array BUF, all characters from that buffer that are not
1689 in the delete set, and return the number of characters saved
1690 or 0 upon EOF. */
1692 static size_t
1693 read_and_delete (unsigned char *buf, size_t size, Filter not_used)
1695 size_t n_saved;
1696 static int hit_eof = 0;
1698 assert (not_used == NULL);
1700 if (hit_eof)
1701 return 0;
1703 /* This enclosing do-while loop is to make sure that
1704 we don't return zero (indicating EOF) when we've
1705 just deleted all the characters in a buffer. */
1708 size_t i;
1709 ssize_t nr = safe_read (0, (char *) buf, size);
1711 if (nr < 0)
1712 error (EXIT_FAILURE, errno, _("read error"));
1713 if (nr == 0)
1715 hit_eof = 1;
1716 return 0;
1719 /* This first loop may be a waste of code, but gives much
1720 better performance when no characters are deleted in
1721 the beginning of a buffer. It just avoids the copying
1722 of buf[i] into buf[n_saved] when it would be a NOP. */
1724 for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1725 /* empty */ ;
1726 n_saved = i;
1728 for (++i; i < nr; i++)
1729 if (!in_delete_set[buf[i]])
1730 buf[n_saved++] = buf[i];
1732 while (n_saved == 0);
1734 return n_saved;
1737 /* Read at most SIZE bytes from stdin into the array BUF. Then
1738 perform the in-place and one-to-one mapping specified by the global
1739 array `xlate'. Return the number of characters read, or 0 upon EOF. */
1741 static size_t
1742 read_and_xlate (unsigned char *buf, size_t size, Filter not_used)
1744 ssize_t chars_read = 0;
1745 static int hit_eof = 0;
1746 size_t i;
1748 assert (not_used == NULL);
1750 if (hit_eof)
1751 return 0;
1753 chars_read = safe_read (0, (char *) buf, size);
1754 if (chars_read < 0)
1755 error (EXIT_FAILURE, errno, _("read error"));
1756 if (chars_read == 0)
1758 hit_eof = 1;
1759 return 0;
1762 for (i = 0; i < chars_read; i++)
1763 buf[i] = xlate[buf[i]];
1765 return chars_read;
1768 /* Initialize a boolean membership set IN_SET with the character
1769 values obtained by traversing the linked list of constructs S
1770 using the function `get_next'. If COMPLEMENT_THIS_SET is
1771 nonzero the resulting set is complemented. */
1773 static void
1774 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1776 int c;
1777 size_t i;
1779 memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1780 s->state = BEGIN_STATE;
1781 while ((c = get_next (s, NULL)) != -1)
1782 in_set[c] = 1;
1783 if (complement_this_set)
1784 for (i = 0; i < N_CHARS; i++)
1785 in_set[i] = (!in_set[i]);
1789 main (int argc, char **argv)
1791 int c;
1792 int non_option_args;
1793 struct Spec_list buf1, buf2;
1794 struct Spec_list *s1 = &buf1;
1795 struct Spec_list *s2 = &buf2;
1797 program_name = argv[0];
1798 setlocale (LC_ALL, "");
1799 bindtextdomain (PACKAGE, LOCALEDIR);
1800 textdomain (PACKAGE);
1802 atexit (close_stdout);
1804 while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1806 switch (c)
1808 case 0:
1809 break;
1811 case 'c':
1812 complement = 1;
1813 break;
1815 case 'd':
1816 delete = 1;
1817 break;
1819 case 's':
1820 squeeze_repeats = 1;
1821 break;
1823 case 't':
1824 truncate_set1 = 1;
1825 break;
1827 case_GETOPT_HELP_CHAR;
1829 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1831 default:
1832 usage (2);
1833 break;
1837 posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1839 non_option_args = argc - optind;
1840 translating = (non_option_args == 2 && !delete);
1842 /* Change this test if it is valid to give tr no options and
1843 no args at all. POSIX doesn't specifically say anything
1844 either way, but it looks like they implied it's invalid
1845 by omission. If you want to make tr do a slow imitation
1846 of `cat' use `tr a a'. */
1847 if (non_option_args > 2)
1849 error (0, 0, _("too many arguments"));
1850 usage (2);
1853 if (!delete && !squeeze_repeats && non_option_args != 2)
1854 error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1856 if (delete && squeeze_repeats && non_option_args != 2)
1857 error (EXIT_FAILURE, 0, _("two strings must be given when both \
1858 deleting and squeezing repeats"));
1860 /* If --delete is given without --squeeze-repeats, then
1861 only one string argument may be specified. But POSIX
1862 says to ignore any string2 in this case, so if POSIXLY_CORRECT
1863 is set, pretend we never saw string2. But I think
1864 this deserves a fatal error, so that's the default. */
1865 if ((delete && !squeeze_repeats) && non_option_args != 1)
1867 if (posix_pedantic && non_option_args == 2)
1868 --non_option_args;
1869 else
1870 error (EXIT_FAILURE, 0,
1871 _("only one string may be given when deleting \
1872 without squeezing repeats"));
1875 if (squeeze_repeats && non_option_args == 0)
1876 error (EXIT_FAILURE, 0,
1877 _("at least one string must be given when squeezing repeats"));
1879 spec_init (s1);
1880 if (parse_str ((unsigned char *) argv[optind], s1))
1881 exit (EXIT_FAILURE);
1883 if (non_option_args == 2)
1885 spec_init (s2);
1886 if (parse_str ((unsigned char *) argv[optind + 1], s2))
1887 exit (EXIT_FAILURE);
1889 else
1890 s2 = NULL;
1892 validate (s1, s2);
1894 /* Use binary I/O, since `tr' is sometimes used to transliterate
1895 non-printable characters, or characters which are stripped away
1896 by text-mode reads (like CR and ^Z). */
1897 SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1899 if (squeeze_repeats && non_option_args == 1)
1901 set_initialize (s1, complement, in_squeeze_set);
1902 squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1904 else if (delete && non_option_args == 1)
1906 size_t nr;
1908 set_initialize (s1, complement, in_delete_set);
1911 nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1912 if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1913 error (EXIT_FAILURE, errno, _("write error"));
1915 while (nr > 0);
1917 else if (squeeze_repeats && delete && non_option_args == 2)
1919 set_initialize (s1, complement, in_delete_set);
1920 set_initialize (s2, 0, in_squeeze_set);
1921 squeeze_filter (io_buf, IO_BUF_SIZE, read_and_delete);
1923 else if (translating)
1925 if (complement)
1927 int i;
1928 SET_TYPE *in_s1 = in_delete_set;
1930 set_initialize (s1, 0, in_s1);
1931 s2->state = BEGIN_STATE;
1932 for (i = 0; i < N_CHARS; i++)
1933 xlate[i] = i;
1934 for (i = 0; i < N_CHARS; i++)
1936 if (!in_s1[i])
1938 int ch = get_next (s2, NULL);
1939 assert (ch != -1 || truncate_set1);
1940 if (ch == -1)
1942 /* This will happen when tr is invoked like e.g.
1943 tr -cs A-Za-z0-9 '\012'. */
1944 break;
1946 xlate[i] = ch;
1949 assert (get_next (s2, NULL) == -1 || truncate_set1);
1951 else
1953 int c1, c2;
1954 int i;
1955 enum Upper_Lower_class class_s1;
1956 enum Upper_Lower_class class_s2;
1958 for (i = 0; i < N_CHARS; i++)
1959 xlate[i] = i;
1960 s1->state = BEGIN_STATE;
1961 s2->state = BEGIN_STATE;
1962 for (;;)
1964 c1 = get_next (s1, &class_s1);
1965 c2 = get_next (s2, &class_s2);
1966 if (!class_ok[(int) class_s1][(int) class_s2])
1967 error (EXIT_FAILURE, 0,
1968 _("misaligned [:upper:] and/or [:lower:] construct"));
1970 if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1972 for (i = 0; i < N_CHARS; i++)
1973 if (ISLOWER (i))
1974 xlate[i] = toupper (i);
1976 else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1978 for (i = 0; i < N_CHARS; i++)
1979 if (ISUPPER (i))
1980 xlate[i] = tolower (i);
1982 else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1983 || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1985 /* By default, GNU tr permits the identity mappings: from
1986 [:upper:] to [:upper:] and [:lower:] to [:lower:]. But
1987 when POSIXLY_CORRECT is set, those evoke diagnostics. */
1988 if (posix_pedantic)
1990 error (EXIT_FAILURE, 0,
1991 _("\
1992 invalid identity mapping; when translating, any [:lower:] or [:upper:]\n\
1993 construct in string1 must be aligned with a corresponding construct\n\
1994 ([:upper:] or [:lower:], respectively) in string2"));
1997 else
1999 /* The following should have been checked by validate... */
2000 if (c1 == -1 || c2 == -1)
2001 break;
2002 xlate[c1] = c2;
2005 assert (c1 == -1 || truncate_set1);
2007 if (squeeze_repeats)
2009 set_initialize (s2, 0, in_squeeze_set);
2010 squeeze_filter (io_buf, IO_BUF_SIZE, read_and_xlate);
2012 else
2014 size_t chars_read;
2018 chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2019 if (chars_read > 0
2020 && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2021 error (EXIT_FAILURE, errno, _("write error"));
2023 while (chars_read > 0);
2027 if (close (0) != 0)
2028 error (EXIT_FAILURE, errno, _("standard input"));
2030 exit (EXIT_SUCCESS);