.
[coreutils.git] / src / tr.c
blob3b4eb1179cc4a4474e37a5b3e36988640ec88448
1 /* tr -- a filter to translate characters
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Jim Meyering, meyering@cs.utexas.edu. */
20 #include <config.h>
22 /* Get isblank from GNU libc. */
23 #define _GNU_SOURCE
25 #include <stdio.h>
26 #define NDEBUG 1
27 #include <assert.h>
28 #include <errno.h>
29 #include <sys/types.h>
30 #include <getopt.h>
32 #include "system.h"
33 #include "version.h"
34 #include "error.h"
36 #ifndef ULONG_MAX
37 #define ULONG_MAX ((unsigned long) ~(unsigned long) 0)
38 #endif
40 #ifndef LONG_MAX
41 #define LONG_MAX ((long int) (ULONG_MAX >> 1))
42 #endif
44 #ifndef UCHAR_MAX
45 #define UCHAR_MAX 0xFF
46 #endif
48 #define N_CHARS (UCHAR_MAX + 1)
50 /* A pointer to a function that returns an int. */
51 typedef int (*PFI) ();
53 /* Convert from character C to its index in the collating
54 sequence array. Just cast to an unsigned int to avoid
55 problems with sign-extension. */
56 #define ORD(c) (unsigned int)(c)
58 /* The inverse of ORD. */
59 #define CHR(i) (unsigned char)(i)
61 /* The value for Spec_list->state that indicates to
62 get_next that it should initialize the tail pointer.
63 Its value doesn't matter as long as it can't be
64 confused with a valid character code. */
65 #define BEGIN_STATE (2 * N_CHARS)
67 /* The value for Spec_list->state that indicates to
68 get_next that the element pointed to by Spec_list->tail is
69 being considered for the first time on this pass through the
70 list -- it indicates that get_next should make any necessary
71 initializations. */
72 #define NEW_ELEMENT (BEGIN_STATE + 1)
74 /* A value distinct from any character that may have been stored in a
75 buffer as the result of a block-read in the function squeeze_filter. */
76 #define NOT_A_CHAR (unsigned int)(-1)
78 /* The following (but not CC_NO_CLASS) are indices into the array of
79 valid character class strings. */
80 enum Char_class
82 CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
83 CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
84 CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
85 CC_NO_CLASS = 9999
88 /* Character class to which a character (returned by get_next) belonged;
89 but it is set only if the construct from which the character was obtained
90 was one of the character classes [:upper:] or [:lower:]. The value
91 is used only when translating and then, only to make sure that upper
92 and lower class constructs have the same relative positions in string1
93 and string2. */
94 enum Upper_Lower_class
96 UL_LOWER = 0,
97 UL_UPPER = 1,
98 UL_NONE = 2
101 /* A shortcut to ensure that when constructing the translation array,
102 one of the values returned by paired calls to get_next (from s1 and s2)
103 is from [:upper:] and the other is from [:lower:], or neither is from
104 upper or lower. In fact, no other character classes are allowed when
105 translating, but that condition is tested elsewhere. This array is
106 indexed by values of type enum Upper_Lower_class. */
107 static int const class_ok[3][3] =
109 {0, 1, 0},
110 {1, 0, 0},
111 {0, 0, 1}
114 /* The type of a List_element. See build_spec_list for more details. */
115 enum Range_element_type
117 RE_NO_TYPE = 0,
118 RE_NORMAL_CHAR,
119 RE_RANGE,
120 RE_CHAR_CLASS,
121 RE_EQUIV_CLASS,
122 RE_REPEATED_CHAR
125 /* One construct in one of tr's argument strings.
126 For example, consider the POSIX version of the classic tr command:
127 tr -cs 'a-zA-Z_' '[\n*]'
128 String1 has 3 constructs, two of which are ranges (a-z and A-Z),
129 and a single normal character, `_'. String2 has one construct. */
130 struct List_element
132 enum Range_element_type type;
133 struct List_element *next;
134 union
136 int normal_char;
137 struct /* unnamed */
139 unsigned int first_char;
140 unsigned int last_char;
142 range;
143 enum Char_class char_class;
144 int equiv_code;
145 struct /* unnamed */
147 unsigned int the_repeated_char;
148 size_t repeat_count;
150 repeated_char;
155 /* Each of tr's argument strings is parsed into a form that is easier
156 to work with: a linked list of constructs (struct List_element).
157 Each Spec_list structure also encapsulates various attributes of
158 the corresponding argument string. The attributes are used mainly
159 to verify that the strings are valid in the context of any options
160 specified (like -s, -d, or -c). The main exception is the member
161 `tail', which is first used to construct the list. After construction,
162 it is used by get_next to save its state when traversing the list.
163 The member `state' serves a similar function. */
164 struct Spec_list
166 /* Points to the head of the list of range elements.
167 The first struct is a dummy; its members are never used. */
168 struct List_element *head;
170 /* When appending, points to the last element. When traversing via
171 get_next(), points to the element to process next. Setting
172 Spec_list.state to the value BEGIN_STATE before calling get_next
173 signals get_next to initialize tail to point to head->next. */
174 struct List_element *tail;
176 /* Used to save state between calls to get_next(). */
177 unsigned int state;
179 /* Length, in the sense that length ('a-z[:digit:]123abc')
180 is 42 ( = 26 + 10 + 6). */
181 size_t length;
183 /* The number of [c*] and [c*0] constructs that appear in this spec. */
184 int n_indefinite_repeats;
186 /* If n_indefinite_repeats is non-zero, this points to the List_element
187 corresponding to the last [c*] or [c*0] construct encountered in
188 this spec. Otherwise it is undefined. */
189 struct List_element *indefinite_repeat_element;
191 /* Non-zero if this spec contains at least one equivalence
192 class construct e.g. [=c=]. */
193 int has_equiv_class;
195 /* Non-zero if this spec contains at least one of [:upper:] or
196 [:lower:] class constructs. */
197 int has_upper_or_lower;
199 /* Non-zero if this spec contains at least one of the character class
200 constructs (all but upper and lower) that aren't allowed in s2. */
201 int has_restricted_char_class;
204 char *xmalloc ();
205 char *stpcpy ();
206 int safe_read ();
208 /* The name by which this program was run. */
209 char *program_name;
211 /* When non-zero, each sequence in the input of a repeated character
212 (call it c) is replaced (in the output) by a single occurrence of c
213 for every c in the squeeze set. */
214 static int squeeze_repeats = 0;
216 /* When non-zero, removes characters in the delete set from input. */
217 static int delete = 0;
219 /* Use the complement of set1 in place of set1. */
220 static int complement = 0;
222 /* When non-zero, this flag causes GNU tr to provide strict
223 compliance with POSIX draft 1003.2.11.2. The POSIX spec
224 says that when -d is used without -s, string2 (if present)
225 must be ignored. Silently ignoring arguments is a bad idea.
226 The default GNU behavior is to give a usage message and exit.
227 Additionally, when this flag is non-zero, tr prints warnings
228 on stderr if it is being used in a manner that is not portable.
229 Applicable warnings are given by default, but are suppressed
230 if the environment variable `POSIXLY_CORRECT' is set, since
231 being POSIX conformant means we can't issue such messages.
232 Warnings on the following topics are suppressed when this
233 variable is non-zero:
234 1. Ambiguous octal escapes. */
235 static int posix_pedantic;
237 /* When tr is performing translation and string1 is longer than string2,
238 POSIX says that the result is undefined. That gives the implementor
239 of a POSIX conforming version of tr two reasonable choices for the
240 semantics of this case.
242 * The BSD tr pads string2 to the length of string1 by
243 repeating the last character in string2.
245 * System V tr ignores characters in string1 that have no
246 corresponding character in string2. That is, string1 is effectively
247 truncated to the length of string2.
249 When non-zero, this flag causes GNU tr to imitate the behavior
250 of System V tr when translating with string1 longer than string2.
251 The default is to emulate BSD tr. This flag is ignored in modes where
252 no translation is performed. Emulating the System V tr
253 in this exceptional case causes the relatively common BSD idiom:
255 tr -cs A-Za-z0-9 '\012'
257 to break (it would convert only zero bytes, rather than all
258 non-alphanumerics, to newlines).
260 WARNING: This switch does not provide general BSD or System V
261 compatibility. For example, it doesn't disable the interpretation
262 of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
263 some unfortunate coincidence you use such constructs in scripts
264 expecting to use some other version of tr, the scripts will break. */
265 static int truncate_set1 = 0;
267 /* An alias for (!delete && non_option_args == 2).
268 It is set in main and used there and in validate(). */
269 static int translating;
271 #ifndef BUFSIZ
272 #define BUFSIZ 8192
273 #endif
275 #define IO_BUF_SIZE BUFSIZ
276 static unsigned char io_buf[IO_BUF_SIZE];
278 static char const *const char_class_name[] =
280 "alnum", "alpha", "blank", "cntrl", "digit", "graph",
281 "lower", "print", "punct", "space", "upper", "xdigit"
283 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
285 typedef char SET_TYPE;
287 /* Array of boolean values. A character `c' is a member of the
288 squeeze set if and only if in_squeeze_set[c] is true. The squeeze
289 set is defined by the last (possibly, the only) string argument
290 on the command line when the squeeze option is given. */
291 static SET_TYPE in_squeeze_set[N_CHARS];
293 /* Array of boolean values. A character `c' is a member of the
294 delete set if and only if in_delete_set[c] is true. The delete
295 set is defined by the first (or only) string argument on the
296 command line when the delete option is given. */
297 static SET_TYPE in_delete_set[N_CHARS];
299 /* Array of character values defining the translation (if any) that
300 tr is to perform. Translation is performed only when there are
301 two specification strings and the delete switch is not given. */
302 static char xlate[N_CHARS];
304 /* If non-zero, display usage information and exit. */
305 static int show_help;
307 /* If non-zero, print the version on standard output then exit. */
308 static int show_version;
310 static struct option const long_options[] =
312 {"complement", no_argument, NULL, 'c'},
313 {"delete", no_argument, NULL, 'd'},
314 {"squeeze-repeats", no_argument, NULL, 's'},
315 {"truncate-set1", no_argument, NULL, 't'},
316 {"help", no_argument, &show_help, 1},
317 {"version", no_argument, &show_version, 1},
318 {NULL, 0, NULL, 0}
321 static void
322 usage (status)
323 int status;
325 if (status != 0)
326 fprintf (stderr, "Try `%s --help' for more information.\n",
327 program_name);
328 else
330 printf ("\
331 Usage: %s [OPTION]... SET1 [SET2]\n\
333 program_name);
334 printf ("\
335 Translate, squeeze, and/or delete characters from standard input,\n\
336 writing to standard output.\n\
338 -c, --complement first complement SET1\n\
339 -d, --delete delete characters in SET1, do not translate\n\
340 -s, --squeeze-repeats replace sequence of characters with one\n\
341 -t, --truncate-set1 first truncate SET1 to length of SET2\n\
342 --help display this help and exit\n\
343 --version output version information and exit\n\
345 printf ("\
347 SETs are specified as strings of characters. Most represent themselves.\n\
348 Interpreted sequences are:\n\
350 \\NNN character with octal value NNN (1 to 3 octal digits)\n\
351 \\\\ backslash\n\
352 \\a audible BEL\n\
353 \\b backspace\n\
354 \\f form feed\n\
355 \\n new line\n\
356 \\r return\n\
357 \\t horizontal tab\n\
358 \\v vertical tab\n\
359 CHAR1-CHAR2 all characters from CHAR1 to CHAR2 in ascending order\n\
360 [CHAR1-CHAR2] same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
361 [CHAR*] in SET2, copies of CHAR until length of SET1\n\
362 [CHAR*REPEAT] REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
363 [:alnum:] all letters and digits\n\
364 [:alpha:] all letters\n\
365 [:blank:] all horizontal whitespace\n\
366 [:cntrl:] all control characters\n\
367 [:digit:] all digits\n\
368 [:graph:] all printable characters, not including space\n\
369 [:lower:] all lower case letters\n\
370 [:print:] all printable characters, including space\n\
371 [:punct:] all punctuation characters\n\
372 [:space:] all horizontal or vertical whitespace\n\
373 [:upper:] all upper case letters\n\
374 [:xdigit:] all hexadecimal digits\n\
375 [=CHAR=] all characters which are equivalent to CHAR\n\
377 printf ("\
379 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
380 -t may be used only when translating. SET2 is extended to length of\n\
381 SET1 by repeating its last character as necessary. Excess characters\n\
382 of SET2 are ignored. Only [:lower:] and [:upper:] are guaranteed to\n\
383 expand in ascending order; used in SET2 while translating, they may\n\
384 only be used in pairs to specify case conversion. -s uses SET1 if not\n\
385 translating nor deleting; else squeezing uses SET2 and occurs after\n\
386 translation or deletion.\n\
389 exit (status);
392 /* Return non-zero if the character C is a member of the
393 equivalence class containing the character EQUIV_CLASS. */
395 static int
396 is_equiv_class_member (equiv_class, c)
397 unsigned int equiv_class;
398 unsigned int c;
400 return (equiv_class == c);
403 /* Return non-zero if the character C is a member of the
404 character class CHAR_CLASS. */
406 static int
407 is_char_class_member (char_class, c)
408 enum Char_class char_class;
409 unsigned int c;
411 int result;
413 switch (char_class)
415 case CC_ALNUM:
416 result = ISALNUM (c);
417 break;
418 case CC_ALPHA:
419 result = ISALPHA (c);
420 break;
421 case CC_BLANK:
422 result = ISBLANK (c);
423 break;
424 case CC_CNTRL:
425 result = ISCNTRL (c);
426 break;
427 case CC_DIGIT:
428 result = ISDIGIT (c);
429 break;
430 case CC_GRAPH:
431 result = ISGRAPH (c);
432 break;
433 case CC_LOWER:
434 result = ISLOWER (c);
435 break;
436 case CC_PRINT:
437 result = ISPRINT (c);
438 break;
439 case CC_PUNCT:
440 result = ISPUNCT (c);
441 break;
442 case CC_SPACE:
443 result = ISSPACE (c);
444 break;
445 case CC_UPPER:
446 result = ISUPPER (c);
447 break;
448 case CC_XDIGIT:
449 result = ISXDIGIT (c);
450 break;
451 default:
452 abort ();
453 break;
455 return result;
458 /* Perform the first pass over each range-spec argument S, converting all
459 \c and \ddd escapes to their one-byte representations. The conversion
460 is done in-place, so S must point to writable storage. If an invalid
461 quote sequence is found print an error message and return non-zero.
462 Otherwise set *LEN to the length of the resulting string and return
463 zero. The resulting array of characters may contain zero-bytes;
464 however, on input, S is assumed to be null-terminated, and hence
465 cannot contain actual (non-escaped) zero bytes. */
467 static int
468 unquote (s, len)
469 unsigned char *s;
470 size_t *len;
472 size_t i, j;
474 j = 0;
475 for (i = 0; s[i]; i++)
477 switch (s[i])
479 int c;
480 case '\\':
481 switch (s[i + 1])
483 int oct_digit;
484 case '\\':
485 c = '\\';
486 break;
487 case 'a':
488 c = '\007';
489 break;
490 case 'b':
491 c = '\b';
492 break;
493 case 'f':
494 c = '\f';
495 break;
496 case 'n':
497 c = '\n';
498 break;
499 case 'r':
500 c = '\r';
501 break;
502 case 't':
503 c = '\t';
504 break;
505 case 'v':
506 c = '\v';
507 break;
508 case '0':
509 case '1':
510 case '2':
511 case '3':
512 case '4':
513 case '5':
514 case '6':
515 case '7':
516 c = s[i + 1] - '0';
517 oct_digit = s[i + 2] - '0';
518 if (0 <= oct_digit && oct_digit <= 7)
520 c = 8 * c + oct_digit;
521 ++i;
522 oct_digit = s[i + 2] - '0';
523 if (0 <= oct_digit && oct_digit <= 7)
525 if (8 * c + oct_digit < N_CHARS)
527 c = 8 * c + oct_digit;
528 ++i;
530 else if (!posix_pedantic)
532 /* Any octal number larger than 0377 won't
533 fit in 8 bits. So we stop when adding the
534 next digit would put us over the limit and
535 give a warning about the ambiguity. POSIX
536 isn't clear on this, but one person has said
537 that in his interpretation, POSIX says tr
538 can't even give a warning. */
539 error (0, 0, "warning: the ambiguous octal escape \
540 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'",
541 s[i], s[i + 1], s[i + 2],
542 s[i], s[i + 1], s[i + 2]);
546 break;
547 case '\0':
548 error (0, 0, "invalid backslash escape at end of string");
549 return 1;
550 break;
551 default:
552 error (0, 0, "invalid backslash escape `\\%c'", s[i + 1]);
553 return 1;
554 break;
556 ++i;
557 s[j++] = c;
558 break;
559 default:
560 s[j++] = s[i];
561 break;
564 *len = j;
565 return 0;
568 /* If CLASS_STR is a valid character class string, return its index
569 in the global char_class_name array. Otherwise, return CC_NO_CLASS. */
571 static enum Char_class
572 look_up_char_class (class_str, len)
573 const unsigned char *class_str;
574 size_t len;
576 unsigned int i;
578 for (i = 0; i < N_CHAR_CLASSES; i++)
579 if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
580 && strlen (char_class_name[i]) == len)
581 return (enum Char_class) i;
582 return CC_NO_CLASS;
585 /* Return a newly allocated string with a printable version of C.
586 This function is used solely for formatting error messages. */
588 static char *
589 make_printable_char (c)
590 unsigned int c;
592 char *buf = xmalloc (5);
594 assert (c < N_CHARS);
595 if (ISPRINT (c))
597 buf[0] = c;
598 buf[1] = '\0';
600 else
602 sprintf (buf, "\\%03o", c);
604 return buf;
607 /* Return a newly allocated copy of S which is suitable for printing.
608 LEN is the number of characters in S. Most non-printing
609 (isprint) characters are represented by a backslash followed by
610 3 octal digits. However, the characters represented by \c escapes
611 where c is one of [abfnrtv] are represented by their 2-character \c
612 sequences. This function is used solely for printing error messages. */
614 static char *
615 make_printable_str (s, len)
616 const unsigned char *s;
617 size_t len;
619 /* Worst case is that every character expands to a backslash
620 followed by a 3-character octal escape sequence. */
621 char *printable_buf = xmalloc (4 * len + 1);
622 char *p = printable_buf;
623 size_t i;
625 for (i = 0; i < len; i++)
627 char buf[5];
628 char *tmp = NULL;
630 switch (s[i])
632 case '\\':
633 tmp = "\\";
634 break;
635 case '\007':
636 tmp = "\\a";
637 break;
638 case '\b':
639 tmp = "\\b";
640 break;
641 case '\f':
642 tmp = "\\f";
643 break;
644 case '\n':
645 tmp = "\\n";
646 break;
647 case '\r':
648 tmp = "\\r";
649 break;
650 case '\t':
651 tmp = "\\t";
652 break;
653 case '\v':
654 tmp = "\\v";
655 break;
656 default:
657 if (ISPRINT (s[i]))
659 buf[0] = s[i];
660 buf[1] = '\0';
662 else
663 sprintf (buf, "\\%03o", s[i]);
664 tmp = buf;
665 break;
667 p = stpcpy (p, tmp);
669 return printable_buf;
672 /* Append a newly allocated structure representing a
673 character C to the specification list LIST. */
675 static void
676 append_normal_char (list, c)
677 struct Spec_list *list;
678 unsigned int c;
680 struct List_element *new;
682 new = (struct List_element *) xmalloc (sizeof (struct List_element));
683 new->next = NULL;
684 new->type = RE_NORMAL_CHAR;
685 new->u.normal_char = c;
686 assert (list->tail);
687 list->tail->next = new;
688 list->tail = new;
691 /* Append a newly allocated structure representing the range
692 of characters from FIRST to LAST to the specification list LIST.
693 Return non-zero if LAST precedes FIRST in the collating sequence,
694 zero otherwise. This means that '[c-c]' is acceptable. */
696 static int
697 append_range (list, first, last)
698 struct Spec_list *list;
699 size_t first;
700 size_t last;
702 struct List_element *new;
704 if (ORD (first) > ORD (last))
706 char *tmp1 = make_printable_char (first);
707 char *tmp2 = make_printable_char (last);
709 error (0, 0,
710 "range-endpoints of `%s-%s' are in reverse collating sequence order",
711 tmp1, tmp2);
712 free (tmp1);
713 free (tmp2);
714 return 1;
716 new = (struct List_element *) xmalloc (sizeof (struct List_element));
717 new->next = NULL;
718 new->type = RE_RANGE;
719 new->u.range.first_char = first;
720 new->u.range.last_char = last;
721 assert (list->tail);
722 list->tail->next = new;
723 list->tail = new;
724 return 0;
727 /* If CHAR_CLASS_STR is a valid character class string, append a
728 newly allocated structure representing that character class to the end
729 of the specification list LIST and return 0. If CHAR_CLASS_STR is not
730 a valid string, print an error message and return non-zero. */
732 static int
733 append_char_class (list, char_class_str, len)
734 struct Spec_list *list;
735 const unsigned char *char_class_str;
736 size_t len;
738 enum Char_class char_class;
739 struct List_element *new;
741 char_class = look_up_char_class (char_class_str, len);
742 if (char_class == CC_NO_CLASS)
744 char *tmp = make_printable_str (char_class_str, len);
746 error (0, 0, "invalid character class `%s'", tmp);
747 free (tmp);
748 return 1;
750 new = (struct List_element *) xmalloc (sizeof (struct List_element));
751 new->next = NULL;
752 new->type = RE_CHAR_CLASS;
753 new->u.char_class = char_class;
754 assert (list->tail);
755 list->tail->next = new;
756 list->tail = new;
757 return 0;
760 /* Append a newly allocated structure representing a [c*n]
761 repeated character construct to the specification list LIST.
762 THE_CHAR is the single character to be repeated, and REPEAT_COUNT
763 is a non-negative repeat count. */
765 static void
766 append_repeated_char (list, the_char, repeat_count)
767 struct Spec_list *list;
768 unsigned int the_char;
769 size_t repeat_count;
771 struct List_element *new;
773 new = (struct List_element *) xmalloc (sizeof (struct List_element));
774 new->next = NULL;
775 new->type = RE_REPEATED_CHAR;
776 new->u.repeated_char.the_repeated_char = the_char;
777 new->u.repeated_char.repeat_count = repeat_count;
778 assert (list->tail);
779 list->tail->next = new;
780 list->tail = new;
783 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
784 the length of that string, LEN, if LEN is exactly one, append
785 a newly allocated structure representing the specified
786 equivalence class to the specification list, LIST and return zero.
787 If LEN is not 1, issue an error message and return non-zero. */
789 static int
790 append_equiv_class (list, equiv_class_str, len)
791 struct Spec_list *list;
792 const unsigned char *equiv_class_str;
793 size_t len;
795 struct List_element *new;
797 if (len != 1)
799 char *tmp = make_printable_str (equiv_class_str, len);
801 error (0, 0, "%s: equivalence class operand must be a single character",
802 tmp);
803 free (tmp);
804 return 1;
806 new = (struct List_element *) xmalloc (sizeof (struct List_element));
807 new->next = NULL;
808 new->type = RE_EQUIV_CLASS;
809 new->u.equiv_code = *equiv_class_str;
810 assert (list->tail);
811 list->tail->next = new;
812 list->tail = new;
813 return 0;
816 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
817 The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
818 NUL bytes, and is *not* NUL-terminated. */
820 static unsigned char *
821 substr (p, first_idx, last_idx)
822 const unsigned char *p;
823 size_t first_idx;
824 size_t last_idx;
826 size_t len;
827 unsigned char *tmp;
829 assert (first_idx <= last_idx);
830 len = last_idx - first_idx + 1;
831 tmp = (unsigned char *) xmalloc (len);
833 assert (first_idx <= last_idx);
834 /* Use memcpy rather than strncpy because `p' may contain zero-bytes. */
835 memcpy (tmp, p + first_idx, len);
836 return tmp;
839 /* Search forward starting at START_IDX for the 2-char sequence
840 (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such
841 a sequence is found, return the index of the first character,
842 otherwise return -1. P may contain zero bytes. */
844 static int
845 find_closing_delim (p, start_idx, p_len, pre_bracket_char)
846 const unsigned char *p;
847 size_t start_idx;
848 size_t p_len;
849 unsigned int pre_bracket_char;
851 size_t i;
853 for (i = start_idx; i < p_len - 1; i++)
854 if (p[i] == pre_bracket_char && p[i + 1] == ']')
855 return i;
856 return -1;
859 /* Convert a string S with explicit length LEN, possibly
860 containing embedded zero bytes, to a long integer value.
861 If the string represents a negative value, a value larger
862 than LONG_MAX, or if all LEN characters do not represent a
863 valid integer, return non-zero and do not modify *VAL.
864 Otherwise, return zero and set *VAL to the converted value. */
866 static int
867 non_neg_strtol (s, len, val)
868 const unsigned char *s;
869 size_t len;
870 size_t *val;
872 size_t i;
873 unsigned long sum = 0;
874 unsigned int base;
876 if (len <= 0)
877 return 1;
878 if (s[0] == '0')
879 base = 8;
880 else if (ISDIGIT (s[0]))
881 base = 10;
882 else
883 return 1;
885 for (i = 0; i < len; i++)
887 unsigned int c;
889 if (s[i] < '0')
890 return 1;
892 c = s[i] - '0';
893 if (c >= base)
894 return 1;
896 if (sum > (LONG_MAX - c) / base)
897 return 1;
898 sum = sum * base + c;
900 *val = sum;
901 return 0;
904 /* Parse the bracketed repeat-char syntax. If the P_LEN characters
905 beginning with P[ START_IDX ] comprise a valid [c*n] construct,
906 return the character and the repeat count through the arg pointers,
907 CHAR_TO_REPEAT and N, and then return the index of the closing
908 bracket as the function value. If the second character following
909 the opening bracket is not `*' or if no closing bracket can be
910 found, return -1. If a closing bracket is found and the
911 second char is `*', but the string between the `*' and `]' isn't
912 empty, an octal number, or a decimal number, print an error message
913 and return -2. */
915 static int
916 find_bracketed_repeat (p, start_idx, p_len, char_to_repeat, n)
917 const unsigned char *p;
918 size_t start_idx;
919 size_t p_len;
920 unsigned int *char_to_repeat;
921 size_t *n;
923 size_t i;
925 assert (start_idx + 1 < p_len);
926 if (p[start_idx + 1] != '*')
927 return -1;
929 for (i = start_idx + 2; i < p_len; i++)
931 if (p[i] == ']')
933 const unsigned char *digit_str;
934 size_t digit_str_len = i - start_idx - 2;
936 *char_to_repeat = p[start_idx];
937 if (digit_str_len == 0)
939 /* We've matched [c*] -- no explicit repeat count. */
940 *n = 0;
941 return i;
944 /* Here, we have found [c*s] where s should be a string
945 of octal or decimal digits. */
946 digit_str = &p[start_idx + 2];
947 if (non_neg_strtol (digit_str, digit_str_len, n))
949 char *tmp = make_printable_str (digit_str, digit_str_len);
950 error (0, 0, "invalid repeat count `%s' in [c*n] construct", tmp);
951 free (tmp);
952 return -2;
954 return i;
957 return -1; /* No bracket found. */
960 /* Convert string UNESACPED_STRING (which has been preprocessed to
961 convert backslash-escape sequences) of length LEN characters into
962 a linked list of the following 5 types of constructs:
963 - [:str:] Character class where `str' is one of the 12 valid strings.
964 - [=c=] Equivalence class where `c' is any single character.
965 - [c*n] Repeat the single character `c' `n' times. n may be omitted.
966 However, if `n' is present, it must be a non-negative octal or
967 decimal integer.
968 - r-s Range of characters from `r' to `s'. The second endpoint must
969 not precede the first in the current collating sequence.
970 - c Any other character is interpreted as itself. */
972 static int
973 build_spec_list (unescaped_string, len, result)
974 const unsigned char *unescaped_string;
975 size_t len;
976 struct Spec_list *result;
978 const unsigned char *p;
979 size_t i;
981 p = unescaped_string;
983 /* The main for-loop below recognizes the 4 multi-character constructs.
984 A character that matches (in its context) none of the multi-character
985 constructs is classified as `normal'. Since all multi-character
986 constructs have at least 3 characters, any strings of length 2 or
987 less are composed solely of normal characters. Hence, the index of
988 the outer for-loop runs only as far as LEN-2. */
990 for (i = 0; i + 2 < len;)
992 switch (p[i])
994 int fall_through;
995 case '[':
996 fall_through = 0;
997 switch (p[i + 1])
999 int closing_delim_idx;
1000 int closing_bracket_idx;
1001 unsigned int char_to_repeat;
1002 size_t repeat_count;
1003 case ':':
1004 case '=':
1005 closing_delim_idx = find_closing_delim (p, i + 2, len, p[i + 1]);
1006 if (closing_delim_idx >= 0)
1008 int parse_failed;
1009 unsigned char *opnd_str = substr (p, i + 2,
1010 closing_delim_idx - 1);
1011 if (p[i + 1] == ':')
1012 parse_failed = append_char_class (result, opnd_str,
1013 (closing_delim_idx - 1) - (i + 2) + 1);
1014 else
1015 parse_failed = append_equiv_class (result, opnd_str,
1016 (closing_delim_idx - 1) - (i + 2) + 1);
1017 free (opnd_str);
1019 /* Return non-zero if append_*_class reports a problem. */
1020 if (parse_failed)
1021 return 1;
1022 else
1023 i = closing_delim_idx + 2;
1024 break;
1026 /* Else fall through. This could be [:*] or [=*]. */
1027 default:
1028 /* Determine whether this is a bracketed repeat range
1029 matching the RE \[.\*(dec_or_oct_number)?\]. */
1030 closing_bracket_idx = find_bracketed_repeat (p, i + 1,
1031 len, &char_to_repeat, &repeat_count);
1032 if (closing_bracket_idx >= 0)
1034 append_repeated_char (result, char_to_repeat, repeat_count);
1035 i = closing_bracket_idx + 1;
1036 break;
1038 else if (closing_bracket_idx == -1)
1040 fall_through = 1;
1042 else
1043 /* Found a string that looked like [c*n] but the
1044 numeric part was invalid. */
1045 return 1;
1046 break;
1048 if (!fall_through)
1049 break;
1051 /* Here if we've tried to match [c*n], [:str:], and [=c=]
1052 and none of them fit. So we still have to consider the
1053 range `[-c' (from `[' to `c'). */
1054 default:
1055 /* Look ahead one char for ranges like a-z. */
1056 if (p[i + 1] == '-')
1058 if (append_range (result, p[i], p[i + 2]))
1059 return 1;
1060 i += 3;
1062 else
1064 append_normal_char (result, p[i]);
1065 ++i;
1067 break;
1071 /* Now handle the (2 or fewer) remaining characters p[i]..p[len - 1]. */
1072 for (; i < len; i++)
1073 append_normal_char (result, p[i]);
1075 return 0;
1078 /* Given a Spec_list S (with its saved state implicit in the values
1079 of its members `tail' and `state'), return the next single character
1080 in the expansion of S's constructs. If the last character of S was
1081 returned on the previous call or if S was empty, this function
1082 returns -1. For example, successive calls to get_next where S
1083 represents the spec-string 'a-d[y*3]' will return the sequence
1084 of values a, b, c, d, y, y, y, -1. Finally, if the construct from
1085 which the returned character comes is [:upper:] or [:lower:], the
1086 parameter CLASS is given a value to indicate which it was. Otherwise
1087 CLASS is set to UL_NONE. This value is used only when constructing
1088 the translation table to verify that any occurrences of upper and
1089 lower class constructs in the spec-strings appear in the same relative
1090 positions. */
1092 static int
1093 get_next (s, class)
1094 struct Spec_list *s;
1095 enum Upper_Lower_class *class;
1097 struct List_element *p;
1098 int return_val;
1099 int i;
1101 if (class)
1102 *class = UL_NONE;
1104 if (s->state == BEGIN_STATE)
1106 s->tail = s->head->next;
1107 s->state = NEW_ELEMENT;
1110 p = s->tail;
1111 if (p == NULL)
1112 return -1;
1114 switch (p->type)
1116 case RE_NORMAL_CHAR:
1117 return_val = p->u.normal_char;
1118 s->state = NEW_ELEMENT;
1119 s->tail = p->next;
1120 break;
1122 case RE_RANGE:
1123 if (s->state == NEW_ELEMENT)
1124 s->state = ORD (p->u.range.first_char);
1125 else
1126 ++(s->state);
1127 return_val = CHR (s->state);
1128 if (s->state == ORD (p->u.range.last_char))
1130 s->tail = p->next;
1131 s->state = NEW_ELEMENT;
1133 break;
1135 case RE_CHAR_CLASS:
1136 if (s->state == NEW_ELEMENT)
1138 for (i = 0; i < N_CHARS; i++)
1139 if (is_char_class_member (p->u.char_class, i))
1140 break;
1141 assert (i < N_CHARS);
1142 s->state = i;
1144 assert (is_char_class_member (p->u.char_class, s->state));
1145 return_val = CHR (s->state);
1146 for (i = s->state + 1; i < N_CHARS; i++)
1147 if (is_char_class_member (p->u.char_class, i))
1148 break;
1149 if (i < N_CHARS)
1150 s->state = i;
1151 else
1153 s->tail = p->next;
1154 s->state = NEW_ELEMENT;
1156 if (class)
1158 switch (p->u.char_class)
1160 case CC_LOWER:
1161 *class = UL_LOWER;
1162 break;
1163 case CC_UPPER:
1164 *class = UL_UPPER;
1165 break;
1166 default:
1167 /* empty */
1168 break;
1171 break;
1173 case RE_EQUIV_CLASS:
1174 /* FIXME: this assumes that each character is alone in its own
1175 equivalence class (which appears to be correct for my
1176 LC_COLLATE. But I don't know of any function that allows
1177 one to determine a character's equivalence class. */
1179 return_val = p->u.equiv_code;
1180 s->state = NEW_ELEMENT;
1181 s->tail = p->next;
1182 break;
1184 case RE_REPEATED_CHAR:
1185 /* Here, a repeat count of n == 0 means don't repeat at all. */
1186 if (p->u.repeated_char.repeat_count == 0)
1188 s->tail = p->next;
1189 s->state = NEW_ELEMENT;
1190 return_val = get_next (s, class);
1192 else
1194 if (s->state == NEW_ELEMENT)
1196 s->state = 0;
1198 ++(s->state);
1199 return_val = p->u.repeated_char.the_repeated_char;
1200 if (p->u.repeated_char.repeat_count > 0
1201 && s->state == p->u.repeated_char.repeat_count)
1203 s->tail = p->next;
1204 s->state = NEW_ELEMENT;
1207 break;
1209 case RE_NO_TYPE:
1210 abort ();
1211 break;
1213 default:
1214 abort ();
1215 break;
1218 return return_val;
1221 /* This is a minor kludge. This function is called from
1222 get_spec_stats to determine the cardinality of a set derived
1223 from a complemented string. It's a kludge in that some of the
1224 same operations are (duplicated) performed in set_initialize. */
1226 static int
1227 card_of_complement (s)
1228 struct Spec_list *s;
1230 int c;
1231 int cardinality = N_CHARS;
1232 SET_TYPE in_set[N_CHARS];
1234 memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1235 s->state = BEGIN_STATE;
1236 while ((c = get_next (s, NULL)) != -1)
1237 if (!in_set[c]++)
1238 --cardinality;
1239 return cardinality;
1242 /* Gather statistics about the spec-list S in preparation for the tests
1243 in validate that determine the consistency of the specs. This function
1244 is called at most twice; once for string1, and again for any string2.
1245 LEN_S1 < 0 indicates that this is the first call and that S represents
1246 string1. When LEN_S1 >= 0, it is the length of the expansion of the
1247 constructs in string1, and we can use its value to resolve any
1248 indefinite repeat construct in S (which represents string2). Hence,
1249 this function has the side-effect that it converts a valid [c*]
1250 construct in string2 to [c*n] where n is large enough (or 0) to give
1251 string2 the same length as string1. For example, with the command
1252 tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1253 be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
1255 static void
1256 get_spec_stats (s)
1257 struct Spec_list *s;
1259 struct List_element *p;
1260 int len = 0;
1262 s->n_indefinite_repeats = 0;
1263 s->has_equiv_class = 0;
1264 s->has_restricted_char_class = 0;
1265 s->has_upper_or_lower = 0;
1266 for (p = s->head->next; p; p = p->next)
1268 switch (p->type)
1270 int i;
1271 case RE_NORMAL_CHAR:
1272 ++len;
1273 break;
1275 case RE_RANGE:
1276 assert (p->u.range.last_char >= p->u.range.first_char);
1277 len += p->u.range.last_char - p->u.range.first_char + 1;
1278 break;
1280 case RE_CHAR_CLASS:
1281 for (i = 0; i < N_CHARS; i++)
1282 if (is_char_class_member (p->u.char_class, i))
1283 ++len;
1284 switch (p->u.char_class)
1286 case CC_UPPER:
1287 case CC_LOWER:
1288 s->has_upper_or_lower = 1;
1289 break;
1290 default:
1291 s->has_restricted_char_class = 1;
1292 break;
1294 break;
1296 case RE_EQUIV_CLASS:
1297 for (i = 0; i < N_CHARS; i++)
1298 if (is_equiv_class_member (p->u.equiv_code, i))
1299 ++len;
1300 s->has_equiv_class = 1;
1301 break;
1303 case RE_REPEATED_CHAR:
1304 if (p->u.repeated_char.repeat_count > 0)
1305 len += p->u.repeated_char.repeat_count;
1306 else if (p->u.repeated_char.repeat_count == 0)
1308 s->indefinite_repeat_element = p;
1309 ++(s->n_indefinite_repeats);
1311 break;
1313 case RE_NO_TYPE:
1314 assert (0);
1315 break;
1319 s->length = len;
1322 static void
1323 get_s1_spec_stats (s1)
1324 struct Spec_list *s1;
1326 get_spec_stats (s1);
1327 if (complement)
1328 s1->length = card_of_complement (s1);
1331 static void
1332 get_s2_spec_stats (s2, len_s1)
1333 struct Spec_list *s2;
1334 size_t len_s1;
1336 get_spec_stats (s2);
1337 if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1339 s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1340 len_s1 - s2->length;
1341 s2->length = len_s1;
1345 static void
1346 spec_init (spec_list)
1347 struct Spec_list *spec_list;
1349 spec_list->head = spec_list->tail =
1350 (struct List_element *) xmalloc (sizeof (struct List_element));
1351 spec_list->head->next = NULL;
1354 /* This function makes two passes over the argument string S. The first
1355 one converts all \c and \ddd escapes to their one-byte representations.
1356 The second constructs a linked specification list, SPEC_LIST, of the
1357 characters and constructs that comprise the argument string. If either
1358 of these passes detects an error, this function returns non-zero. */
1360 static int
1361 parse_str (s, spec_list)
1362 unsigned char *s;
1363 struct Spec_list *spec_list;
1365 size_t len;
1367 if (unquote (s, &len))
1368 return 1;
1369 if (build_spec_list (s, len, spec_list))
1370 return 1;
1371 return 0;
1374 /* Given two specification lists, S1 and S2, and assuming that
1375 S1->length > S2->length, append a single [c*n] element to S2 where c
1376 is the last character in the expansion of S2 and n is the difference
1377 between the two lengths.
1378 Upon successful completion, S2->length is set to S1->length. The only
1379 way this function can fail to make S2 as long as S1 is when S2 has
1380 zero-length, since in that case, there is no last character to repeat.
1381 So S2->length is required to be at least 1.
1383 Providing this functionality allows the user to do some pretty
1384 non-BSD (and non-portable) things: For example, the command
1385 tr -cs '[:upper:]0-9' '[:lower:]'
1386 is almost guaranteed to give results that depend on your collating
1387 sequence. */
1389 static void
1390 string2_extend (s1, s2)
1391 const struct Spec_list *s1;
1392 struct Spec_list *s2;
1394 struct List_element *p;
1395 int char_to_repeat;
1396 int i;
1398 assert (translating);
1399 assert (s1->length > s2->length);
1400 assert (s2->length > 0);
1402 p = s2->tail;
1403 switch (p->type)
1405 case RE_NORMAL_CHAR:
1406 char_to_repeat = p->u.normal_char;
1407 break;
1408 case RE_RANGE:
1409 char_to_repeat = p->u.range.last_char;
1410 break;
1411 case RE_CHAR_CLASS:
1412 for (i = N_CHARS; i >= 0; i--)
1413 if (is_char_class_member (p->u.char_class, i))
1414 break;
1415 assert (i >= 0);
1416 char_to_repeat = CHR (i);
1417 break;
1419 case RE_REPEATED_CHAR:
1420 char_to_repeat = p->u.repeated_char.the_repeated_char;
1421 break;
1423 case RE_EQUIV_CLASS:
1424 /* This shouldn't happen, because validate exits with an error
1425 if it finds an equiv class in string2 when translating. */
1426 abort ();
1427 break;
1429 case RE_NO_TYPE:
1430 abort ();
1431 break;
1433 default:
1434 abort ();
1435 break;
1438 append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1439 s2->length = s1->length;
1442 /* Die with an error message if S1 and S2 describe strings that
1443 are not valid with the given command line switches.
1444 A side effect of this function is that if a valid [c*] or
1445 [c*0] construct appears in string2, it is converted to [c*n]
1446 with a value for n that makes s2->length == s1->length. By
1447 the same token, if the --truncate-set1 option is not
1448 given, S2 may be extended. */
1450 static void
1451 validate (s1, s2)
1452 const struct Spec_list *s1;
1453 struct Spec_list *s2;
1455 get_s1_spec_stats (s1);
1456 if (s1->n_indefinite_repeats > 0)
1458 error (1, 0, "the [c*] repeat construct may not appear in string1");
1461 /* FIXME: it isn't clear from the POSIX spec that this is invalid,
1462 but in the spirit of the other restrictions put on translation
1463 with character classes, this seems a logical interpretation. */
1464 if (complement && s1->has_upper_or_lower)
1466 error (1, 0,
1467 "character classes may not be used when translating \
1468 and complementing");
1471 if (s2)
1473 get_s2_spec_stats (s2, s1->length);
1474 if (s2->has_restricted_char_class)
1476 error (1, 0,
1477 "when translating, the only character classes that may \
1478 appear in\n\tstring2 are `upper' and `lower'");
1481 if (s2->n_indefinite_repeats > 1)
1483 error (1, 0, "only one [c*] repeat construct may appear in string2");
1486 if (translating)
1488 if (s2->has_equiv_class)
1490 error (1, 0,
1491 "[=c=] expressions may not appear in string2 \
1492 when translating");
1495 if (s1->length > s2->length)
1497 if (!truncate_set1)
1499 /* string2 must be non-empty unless --truncate-set1 is
1500 given or string1 is empty. */
1502 if (s2->length == 0)
1503 error (1, 0,
1504 "when not truncating set1, string2 must be non-empty");
1505 string2_extend (s1, s2);
1509 if (complement && s2->has_upper_or_lower)
1510 error (1, 0,
1511 "character classes may not be used when translating \
1512 and complementing");
1514 else
1515 /* Not translating. */
1517 if (s2->n_indefinite_repeats > 0)
1518 error (1, 0,
1519 "the [c*] construct may appear in string2 only \
1520 when translating");
1525 /* Read buffers of SIZE bytes via the function READER (if READER is
1526 NULL, read from stdin) until EOF. When non-NULL, READER is either
1527 read_and_delete or read_and_xlate. After each buffer is read, it is
1528 processed and written to stdout. The buffers are processed so that
1529 multiple consecutive occurrences of the same character in the input
1530 stream are replaced by a single occurrence of that character if the
1531 character is in the squeeze set. */
1533 static void
1534 squeeze_filter (buf, size, reader)
1535 unsigned char *buf;
1536 long int size;
1537 PFI reader;
1539 unsigned int char_to_squeeze = NOT_A_CHAR;
1540 int i = 0;
1541 int nr = 0;
1543 for (;;)
1545 int begin;
1547 if (i >= nr)
1549 if (reader == NULL)
1550 nr = safe_read (0, (char *) buf, size);
1551 else
1552 nr = (*reader) (buf, size, NULL);
1554 if (nr < 0)
1555 error (1, errno, "read error");
1556 if (nr == 0)
1557 break;
1558 i = 0;
1561 begin = i;
1563 if (char_to_squeeze == NOT_A_CHAR)
1565 int out_len;
1566 /* Here, by being a little tricky, we can get a significant
1567 performance increase in most cases when the input is
1568 reasonably large. Since tr will modify the input only
1569 if two consecutive (and identical) input characters are
1570 in the squeeze set, we can step by two through the data
1571 when searching for a character in the squeeze set. This
1572 means there may be a little more work in a few cases and
1573 perhaps twice as much work in the worst cases where most
1574 of the input is removed by squeezing repeats. But most
1575 uses of this functionality seem to remove less than 20-30%
1576 of the input. */
1577 for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1578 ; /* empty */
1580 /* There is a special case when i == nr and we've just
1581 skipped a character (the last one in buf) that is in
1582 the squeeze set. */
1583 if (i == nr && in_squeeze_set[buf[i - 1]])
1584 --i;
1586 if (i >= nr)
1587 out_len = nr - begin;
1588 else
1590 char_to_squeeze = buf[i];
1591 /* We're about to output buf[begin..i]. */
1592 out_len = i - begin + 1;
1594 /* But since we stepped by 2 in the loop above,
1595 out_len may be one too large. */
1596 if (i > 0 && buf[i - 1] == char_to_squeeze)
1597 --out_len;
1599 /* Advance i to the index of first character to be
1600 considered when looking for a char different from
1601 char_to_squeeze. */
1602 ++i;
1604 if (out_len > 0
1605 && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1606 error (1, errno, "write error");
1609 if (char_to_squeeze != NOT_A_CHAR)
1611 /* Advance i to index of first char != char_to_squeeze
1612 (or to nr if all the rest of the characters in this
1613 buffer are the same as char_to_squeeze). */
1614 for (; i < nr && buf[i] == char_to_squeeze; i++)
1615 ; /* empty */
1616 if (i < nr)
1617 char_to_squeeze = NOT_A_CHAR;
1618 /* If (i >= nr) we've squeezed the last character in this buffer.
1619 So now we have to read a new buffer and continue comparing
1620 characters against char_to_squeeze. */
1625 /* Read buffers of SIZE bytes from stdin until one is found that
1626 contains at least one character not in the delete set. Store
1627 in the array BUF, all characters from that buffer that are not
1628 in the delete set, and return the number of characters saved
1629 or 0 upon EOF. */
1631 static long
1632 read_and_delete (buf, size, not_used)
1633 unsigned char *buf;
1634 long int size;
1635 PFI not_used;
1637 long n_saved;
1638 static int hit_eof = 0;
1640 assert (not_used == NULL);
1641 assert (size > 0);
1643 if (hit_eof)
1644 return 0;
1646 /* This enclosing do-while loop is to make sure that
1647 we don't return zero (indicating EOF) when we've
1648 just deleted all the characters in a buffer. */
1651 int i;
1652 int nr = safe_read (0, (char *) buf, size);
1654 if (nr < 0)
1655 error (1, errno, "read error");
1656 if (nr == 0)
1658 hit_eof = 1;
1659 return 0;
1662 /* This first loop may be a waste of code, but gives much
1663 better performance when no characters are deleted in
1664 the beginning of a buffer. It just avoids the copying
1665 of buf[i] into buf[n_saved] when it would be a NOP. */
1667 for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1668 /* empty */ ;
1669 n_saved = i;
1671 for (++i; i < nr; i++)
1672 if (!in_delete_set[buf[i]])
1673 buf[n_saved++] = buf[i];
1675 while (n_saved == 0);
1677 return n_saved;
1680 /* Read at most SIZE bytes from stdin into the array BUF. Then
1681 perform the in-place and one-to-one mapping specified by the global
1682 array `xlate'. Return the number of characters read, or 0 upon EOF. */
1684 static long
1685 read_and_xlate (buf, size, not_used)
1686 unsigned char *buf;
1687 long int size;
1688 PFI not_used;
1690 long chars_read = 0;
1691 static int hit_eof = 0;
1692 int i;
1694 assert (not_used == NULL);
1695 assert (size > 0);
1697 if (hit_eof)
1698 return 0;
1700 chars_read = safe_read (0, (char *) buf, size);
1701 if (chars_read < 0)
1702 error (1, errno, "read error");
1703 if (chars_read == 0)
1705 hit_eof = 1;
1706 return 0;
1709 for (i = 0; i < chars_read; i++)
1710 buf[i] = xlate[buf[i]];
1712 return chars_read;
1715 /* Initialize a boolean membership set IN_SET with the character
1716 values obtained by traversing the linked list of constructs S
1717 using the function `get_next'. If COMPLEMENT_THIS_SET is
1718 non-zero the resulting set is complemented. */
1720 static void
1721 set_initialize (s, complement_this_set, in_set)
1722 struct Spec_list *s;
1723 int complement_this_set;
1724 SET_TYPE *in_set;
1726 int c;
1727 int i;
1729 memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1730 s->state = BEGIN_STATE;
1731 while ((c = get_next (s, NULL)) != -1)
1732 in_set[c] = 1;
1733 if (complement_this_set)
1734 for (i = 0; i < N_CHARS; i++)
1735 in_set[i] = (!in_set[i]);
1738 void
1739 main (argc, argv)
1740 int argc;
1741 char **argv;
1743 int c;
1744 int non_option_args;
1745 struct Spec_list buf1, buf2;
1746 struct Spec_list *s1 = &buf1;
1747 struct Spec_list *s2 = &buf2;
1749 program_name = argv[0];
1751 while ((c = getopt_long (argc, argv, "cdst", long_options,
1752 (int *) 0)) != EOF)
1754 switch (c)
1756 case 0:
1757 break;
1759 case 'c':
1760 complement = 1;
1761 break;
1763 case 'd':
1764 delete = 1;
1765 break;
1767 case 's':
1768 squeeze_repeats = 1;
1769 break;
1771 case 't':
1772 truncate_set1 = 1;
1773 break;
1775 default:
1776 usage (2);
1777 break;
1781 if (show_version)
1783 printf ("tr - %s\n", version_string);
1784 exit (0);
1787 if (show_help)
1788 usage (0);
1790 posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1792 non_option_args = argc - optind;
1793 translating = (non_option_args == 2 && !delete);
1795 /* Change this test if it is valid to give tr no options and
1796 no args at all. POSIX doesn't specifically say anything
1797 either way, but it looks like they implied it's invalid
1798 by omission. If you want to make tr do a slow imitation
1799 of `cat' use `tr a a'. */
1800 if (non_option_args > 2)
1802 error (0, 0, "too many arguments");
1803 usage (2);
1806 if (!delete && !squeeze_repeats && non_option_args != 2)
1807 error (1, 0, "two strings must be given when translating");
1809 if (delete && squeeze_repeats && non_option_args != 2)
1810 error (1, 0, "two strings must be given when both \
1811 deleting and squeezing repeats");
1813 /* If --delete is given without --squeeze-repeats, then
1814 only one string argument may be specified. But POSIX
1815 says to ignore any string2 in this case, so if POSIXLY_CORRECT
1816 is set, pretend we never saw string2. But I think
1817 this deserves a fatal error, so that's the default. */
1818 if ((delete && !squeeze_repeats) && non_option_args != 1)
1820 if (posix_pedantic && non_option_args == 2)
1821 --non_option_args;
1822 else
1823 error (1, 0,
1824 "only one string may be given when deleting \
1825 without squeezing repeats");
1828 if (squeeze_repeats && non_option_args == 0)
1829 error (1, 0, "at least one string must be given when squeezing repeats");
1831 spec_init (s1);
1832 if (parse_str ((unsigned char *) argv[optind], s1))
1833 exit (1);
1835 if (non_option_args == 2)
1837 spec_init (s2);
1838 if (parse_str ((unsigned char *) argv[optind + 1], s2))
1839 exit (1);
1841 else
1842 s2 = NULL;
1844 validate (s1, s2);
1846 if (squeeze_repeats && non_option_args == 1)
1848 set_initialize (s1, complement, in_squeeze_set);
1849 squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1851 else if (delete && non_option_args == 1)
1853 int nr;
1855 set_initialize (s1, complement, in_delete_set);
1858 nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1859 if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1860 error (1, errno, "write error");
1862 while (nr > 0);
1864 else if (squeeze_repeats && delete && non_option_args == 2)
1866 set_initialize (s1, complement, in_delete_set);
1867 set_initialize (s2, 0, in_squeeze_set);
1868 squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1870 else if (translating)
1872 if (complement)
1874 int i;
1875 SET_TYPE *in_s1 = in_delete_set;
1877 set_initialize (s1, 0, in_s1);
1878 s2->state = BEGIN_STATE;
1879 for (i = 0; i < N_CHARS; i++)
1880 xlate[i] = i;
1881 for (i = 0; i < N_CHARS; i++)
1883 if (!in_s1[i])
1885 int ch = get_next (s2, NULL);
1886 assert (ch != -1 || truncate_set1);
1887 if (ch == -1)
1889 /* This will happen when tr is invoked like e.g.
1890 tr -cs A-Za-z0-9 '\012'. */
1891 break;
1893 xlate[i] = ch;
1896 assert (get_next (s2, NULL) == -1 || truncate_set1);
1898 else
1900 int c1, c2;
1901 int i;
1902 enum Upper_Lower_class class_s1;
1903 enum Upper_Lower_class class_s2;
1905 for (i = 0; i < N_CHARS; i++)
1906 xlate[i] = i;
1907 s1->state = BEGIN_STATE;
1908 s2->state = BEGIN_STATE;
1909 for (;;)
1911 c1 = get_next (s1, &class_s1);
1912 c2 = get_next (s2, &class_s2);
1913 if (!class_ok[(int) class_s1][(int) class_s2])
1914 error (1, 0,
1915 "misaligned or mismatched upper and/or lower classes");
1916 /* The following should have been checked by validate... */
1917 if (c2 == -1)
1918 break;
1919 xlate[c1] = c2;
1921 assert (c1 == -1 || truncate_set1);
1923 if (squeeze_repeats)
1925 set_initialize (s2, 0, in_squeeze_set);
1926 squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
1928 else
1930 int chars_read;
1934 chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
1935 if (chars_read > 0
1936 && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
1937 error (1, errno, "write error");
1939 while (chars_read > 0);
1943 if (fclose (stdout) == EOF)
1944 error (2, errno, "write error");
1946 if (close (0) != 0)
1947 error (2, errno, "standard input");
1949 exit (0);