1 /* tr -- a filter to translate characters
2 Copyright (C) 91, 1995-1998, 1999 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Jim Meyering */
25 #include <sys/types.h>
30 #include "safe-read.h"
32 /* The official name of this program (e.g., no `g' prefix). */
33 #define PROGRAM_NAME "tr"
35 #define AUTHORS "Jim Meyering"
37 #define N_CHARS (UCHAR_MAX + 1)
39 /* A pointer to a function that returns an int. */
40 typedef int (*PFI
) ();
42 /* Convert from character C to its index in the collating
43 sequence array. Just cast to an unsigned int to avoid
44 problems with sign-extension. */
45 #define ORD(c) (unsigned int)(c)
47 /* The inverse of ORD. */
48 #define CHR(i) (unsigned char)(i)
50 /* The value for Spec_list->state that indicates to
51 get_next that it should initialize the tail pointer.
52 Its value should be as large as possible to avoid conflict
53 a valid value for the state field -- and that may be as
54 large as any valid repeat_count. */
55 #define BEGIN_STATE (INT_MAX - 1)
57 /* The value for Spec_list->state that indicates to
58 get_next that the element pointed to by Spec_list->tail is
59 being considered for the first time on this pass through the
60 list -- it indicates that get_next should make any necessary
62 #define NEW_ELEMENT (BEGIN_STATE + 1)
64 /* A value distinct from any character that may have been stored in a
65 buffer as the result of a block-read in the function squeeze_filter. */
66 #define NOT_A_CHAR (unsigned int)(-1)
68 /* The following (but not CC_NO_CLASS) are indices into the array of
69 valid character class strings. */
72 CC_ALNUM
= 0, CC_ALPHA
= 1, CC_BLANK
= 2, CC_CNTRL
= 3,
73 CC_DIGIT
= 4, CC_GRAPH
= 5, CC_LOWER
= 6, CC_PRINT
= 7,
74 CC_PUNCT
= 8, CC_SPACE
= 9, CC_UPPER
= 10, CC_XDIGIT
= 11,
78 /* Character class to which a character (returned by get_next) belonged;
79 but it is set only if the construct from which the character was obtained
80 was one of the character classes [:upper:] or [:lower:]. The value
81 is used only when translating and then, only to make sure that upper
82 and lower class constructs have the same relative positions in string1
84 enum Upper_Lower_class
91 /* A shortcut to ensure that when constructing the translation array,
92 one of the values returned by paired calls to get_next (from s1 and s2)
93 is from [:upper:] and the other is from [:lower:], or neither is from
94 upper or lower. By default, GNU tr permits the identity mappings: from
95 [:upper:] to [:upper:] and [:lower:] to [:lower:]. But when
96 POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
97 by values of type enum Upper_Lower_class. */
98 static int const class_ok
[3][3] =
105 /* The type of a List_element. See build_spec_list for more details. */
106 enum Range_element_type
116 /* One construct in one of tr's argument strings.
117 For example, consider the POSIX version of the classic tr command:
118 tr -cs 'a-zA-Z_' '[\n*]'
119 String1 has 3 constructs, two of which are ranges (a-z and A-Z),
120 and a single normal character, `_'. String2 has one construct. */
123 enum Range_element_type type
;
124 struct List_element
*next
;
130 unsigned int first_char
;
131 unsigned int last_char
;
134 enum Char_class char_class
;
138 unsigned int the_repeated_char
;
146 /* Each of tr's argument strings is parsed into a form that is easier
147 to work with: a linked list of constructs (struct List_element).
148 Each Spec_list structure also encapsulates various attributes of
149 the corresponding argument string. The attributes are used mainly
150 to verify that the strings are valid in the context of any options
151 specified (like -s, -d, or -c). The main exception is the member
152 `tail', which is first used to construct the list. After construction,
153 it is used by get_next to save its state when traversing the list.
154 The member `state' serves a similar function. */
157 /* Points to the head of the list of range elements.
158 The first struct is a dummy; its members are never used. */
159 struct List_element
*head
;
161 /* When appending, points to the last element. When traversing via
162 get_next(), points to the element to process next. Setting
163 Spec_list.state to the value BEGIN_STATE before calling get_next
164 signals get_next to initialize tail to point to head->next. */
165 struct List_element
*tail
;
167 /* Used to save state between calls to get_next(). */
170 /* Length, in the sense that length ('a-z[:digit:]123abc')
171 is 42 ( = 26 + 10 + 6). */
174 /* The number of [c*] and [c*0] constructs that appear in this spec. */
175 int n_indefinite_repeats
;
177 /* If n_indefinite_repeats is nonzero, this points to the List_element
178 corresponding to the last [c*] or [c*0] construct encountered in
179 this spec. Otherwise it is undefined. */
180 struct List_element
*indefinite_repeat_element
;
182 /* Non-zero if this spec contains at least one equivalence
183 class construct e.g. [=c=]. */
186 /* Non-zero if this spec contains at least one character class
187 construct. E.g. [:digit:]. */
190 /* Non-zero if this spec contains at least one of the character class
191 constructs (all but upper and lower) that aren't allowed in s2. */
192 int has_restricted_char_class
;
195 /* A representation for escaped string1 or string2. As a string is parsed,
196 any backslash-escaped characters (other than octal or \a, \b, \f, \n,
197 etc.) are marked as such in this structure by setting the corresponding
198 entry in the ESCAPED vector. */
206 /* Return nonzero if the Ith character of escaped string ES matches C
207 and is not escaped itself. */
208 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
210 /* The name by which this program was run. */
213 /* When nonzero, each sequence in the input of a repeated character
214 (call it c) is replaced (in the output) by a single occurrence of c
215 for every c in the squeeze set. */
216 static int squeeze_repeats
= 0;
218 /* When nonzero, removes characters in the delete set from input. */
219 static int delete = 0;
221 /* Use the complement of set1 in place of set1. */
222 static int complement
= 0;
224 /* When nonzero, this flag causes GNU tr to provide strict
225 compliance with POSIX draft 1003.2.11.2. The POSIX spec
226 says that when -d is used without -s, string2 (if present)
227 must be ignored. Silently ignoring arguments is a bad idea.
228 The default GNU behavior is to give a usage message and exit.
229 Additionally, when this flag is nonzero, tr prints warnings
230 on stderr if it is being used in a manner that is not portable.
231 Applicable warnings are given by default, but are suppressed
232 if the environment variable `POSIXLY_CORRECT' is set, since
233 being POSIX conformant means we can't issue such messages.
234 Warnings on the following topics are suppressed when this
236 1. Ambiguous octal escapes. */
237 static int posix_pedantic
;
239 /* When tr is performing translation and string1 is longer than string2,
240 POSIX says that the result is undefined. That gives the implementor
241 of a POSIX conforming version of tr two reasonable choices for the
242 semantics of this case.
244 * The BSD tr pads string2 to the length of string1 by
245 repeating the last character in string2.
247 * System V tr ignores characters in string1 that have no
248 corresponding character in string2. That is, string1 is effectively
249 truncated to the length of string2.
251 When nonzero, this flag causes GNU tr to imitate the behavior
252 of System V tr when translating with string1 longer than string2.
253 The default is to emulate BSD tr. This flag is ignored in modes where
254 no translation is performed. Emulating the System V tr
255 in this exceptional case causes the relatively common BSD idiom:
257 tr -cs A-Za-z0-9 '\012'
259 to break (it would convert only zero bytes, rather than all
260 non-alphanumerics, to newlines).
262 WARNING: This switch does not provide general BSD or System V
263 compatibility. For example, it doesn't disable the interpretation
264 of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
265 some unfortunate coincidence you use such constructs in scripts
266 expecting to use some other version of tr, the scripts will break. */
267 static int truncate_set1
= 0;
269 /* An alias for (!delete && non_option_args == 2).
270 It is set in main and used there and in validate(). */
271 static int translating
;
277 #define IO_BUF_SIZE BUFSIZ
278 static unsigned char io_buf
[IO_BUF_SIZE
];
280 static char const *const char_class_name
[] =
282 "alnum", "alpha", "blank", "cntrl", "digit", "graph",
283 "lower", "print", "punct", "space", "upper", "xdigit"
285 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
287 typedef char SET_TYPE
;
289 /* Array of boolean values. A character `c' is a member of the
290 squeeze set if and only if in_squeeze_set[c] is true. The squeeze
291 set is defined by the last (possibly, the only) string argument
292 on the command line when the squeeze option is given. */
293 static SET_TYPE in_squeeze_set
[N_CHARS
];
295 /* Array of boolean values. A character `c' is a member of the
296 delete set if and only if in_delete_set[c] is true. The delete
297 set is defined by the first (or only) string argument on the
298 command line when the delete option is given. */
299 static SET_TYPE in_delete_set
[N_CHARS
];
301 /* Array of character values defining the translation (if any) that
302 tr is to perform. Translation is performed only when there are
303 two specification strings and the delete switch is not given. */
304 static char xlate
[N_CHARS
];
306 static struct option
const long_options
[] =
308 {"complement", no_argument
, NULL
, 'c'},
309 {"delete", no_argument
, NULL
, 'd'},
310 {"squeeze-repeats", no_argument
, NULL
, 's'},
311 {"truncate-set1", no_argument
, NULL
, 't'},
312 {GETOPT_HELP_OPTION_DECL
},
313 {GETOPT_VERSION_OPTION_DECL
},
321 fprintf (stderr
, _("Try `%s --help' for more information.\n"),
326 Usage: %s [OPTION]... SET1 [SET2]\n\
330 Translate, squeeze, and/or delete characters from standard input,\n\
331 writing to standard output.\n\
333 -c, --complement first complement SET1\n\
334 -d, --delete delete characters in SET1, do not translate\n\
335 -s, --squeeze-repeats replace sequence of characters with one\n\
336 -t, --truncate-set1 first truncate SET1 to length of SET2\n\
337 --help display this help and exit\n\
338 --version output version information and exit\n\
342 SETs are specified as strings of characters. Most represent themselves.\n\
343 Interpreted sequences are:\n\
345 \\NNN character with octal value NNN (1 to 3 octal digits)\n\
352 \\t horizontal tab\n\
354 CHAR1-CHAR2 all characters from CHAR1 to CHAR2 in ascending order\n\
355 [CHAR1-CHAR2] same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
356 [CHAR*] in SET2, copies of CHAR until length of SET1\n\
357 [CHAR*REPEAT] REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
358 [:alnum:] all letters and digits\n\
359 [:alpha:] all letters\n\
360 [:blank:] all horizontal whitespace\n\
361 [:cntrl:] all control characters\n\
362 [:digit:] all digits\n\
363 [:graph:] all printable characters, not including space\n\
364 [:lower:] all lower case letters\n\
365 [:print:] all printable characters, including space\n\
366 [:punct:] all punctuation characters\n\
367 [:space:] all horizontal or vertical whitespace\n\
368 [:upper:] all upper case letters\n\
369 [:xdigit:] all hexadecimal digits\n\
370 [=CHAR=] all characters which are equivalent to CHAR\n\
374 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
375 -t may be used only when translating. SET2 is extended to length of\n\
376 SET1 by repeating its last character as necessary. Excess characters\n\
377 of SET2 are ignored. Only [:lower:] and [:upper:] are guaranteed to\n\
378 expand in ascending order; used in SET2 while translating, they may\n\
379 only be used in pairs to specify case conversion. -s uses SET1 if not\n\
380 translating nor deleting; else squeezing uses SET2 and occurs after\n\
381 translation or deletion.\n\
383 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
385 exit (status
== 0 ? EXIT_SUCCESS
: EXIT_FAILURE
);
388 /* Return nonzero if the character C is a member of the
389 equivalence class containing the character EQUIV_CLASS. */
392 is_equiv_class_member (unsigned int equiv_class
, unsigned int c
)
394 return (equiv_class
== c
);
397 /* Return nonzero if the character C is a member of the
398 character class CHAR_CLASS. */
401 is_char_class_member (enum Char_class char_class
, unsigned int c
)
408 result
= ISALNUM (c
);
411 result
= ISALPHA (c
);
414 result
= ISBLANK (c
);
417 result
= ISCNTRL (c
);
420 result
= ISDIGIT_LOCALE (c
);
423 result
= ISGRAPH (c
);
426 result
= ISLOWER (c
);
429 result
= ISPRINT (c
);
432 result
= ISPUNCT (c
);
435 result
= ISSPACE (c
);
438 result
= ISUPPER (c
);
441 result
= ISXDIGIT (c
);
451 es_free (struct E_string
*es
)
457 /* Perform the first pass over each range-spec argument S, converting all
458 \c and \ddd escapes to their one-byte representations. The conversion
459 is done in-place, so S must point to writable storage. If an invalid
460 quote sequence is found print an error message and return nonzero.
461 Otherwise set *LEN to the length of the resulting string and return
462 zero. The resulting array of characters may contain zero-bytes;
463 however, on input, S is assumed to be null-terminated, and hence
464 cannot contain actual (non-escaped) zero bytes. */
467 unquote (const unsigned char *s
, struct E_string
*es
)
472 len
= strlen ((char *) s
);
474 es
->s
= (unsigned char *) xmalloc (len
);
475 es
->escaped
= (int *) xmalloc (len
* sizeof (es
->escaped
[0]));
476 for (i
= 0; i
< len
; i
++)
480 for (i
= 0; s
[i
]; i
++)
522 oct_digit
= s
[i
+ 2] - '0';
523 if (0 <= oct_digit
&& oct_digit
<= 7)
525 c
= 8 * c
+ oct_digit
;
527 oct_digit
= s
[i
+ 2] - '0';
528 if (0 <= oct_digit
&& oct_digit
<= 7)
530 if (8 * c
+ oct_digit
< N_CHARS
)
532 c
= 8 * c
+ oct_digit
;
535 else if (!posix_pedantic
)
537 /* Any octal number larger than 0377 won't
538 fit in 8 bits. So we stop when adding the
539 next digit would put us over the limit and
540 give a warning about the ambiguity. POSIX
541 isn't clear on this, but one person has said
542 that in his interpretation, POSIX says tr
543 can't even give a warning. */
544 error (0, 0, _("warning: the ambiguous octal escape \
545 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
546 s
[i
], s
[i
+ 1], s
[i
+ 2],
547 s
[i
], s
[i
+ 1], s
[i
+ 2]);
553 error (0, 0, _("invalid backslash escape at end of string"));
559 error (0, 0, _("invalid backslash escape `\\%c'"), s
[i
+ 1]);
580 /* If CLASS_STR is a valid character class string, return its index
581 in the global char_class_name array. Otherwise, return CC_NO_CLASS. */
583 static enum Char_class
584 look_up_char_class (const unsigned char *class_str
, size_t len
)
588 for (i
= 0; i
< N_CHAR_CLASSES
; i
++)
589 if (strncmp ((const char *) class_str
, char_class_name
[i
], len
) == 0
590 && strlen (char_class_name
[i
]) == len
)
591 return (enum Char_class
) i
;
595 /* Return a newly allocated string with a printable version of C.
596 This function is used solely for formatting error messages. */
599 make_printable_char (unsigned int c
)
601 char *buf
= xmalloc (5);
603 assert (c
< N_CHARS
);
611 sprintf (buf
, "\\%03o", c
);
616 /* Return a newly allocated copy of S which is suitable for printing.
617 LEN is the number of characters in S. Most non-printing
618 (isprint) characters are represented by a backslash followed by
619 3 octal digits. However, the characters represented by \c escapes
620 where c is one of [abfnrtv] are represented by their 2-character \c
621 sequences. This function is used solely for printing error messages. */
624 make_printable_str (const unsigned char *s
, size_t len
)
626 /* Worst case is that every character expands to a backslash
627 followed by a 3-character octal escape sequence. */
628 char *printable_buf
= xmalloc (4 * len
+ 1);
629 char *p
= printable_buf
;
632 for (i
= 0; i
< len
; i
++)
670 sprintf (buf
, "\\%03o", s
[i
]);
676 return printable_buf
;
679 /* Append a newly allocated structure representing a
680 character C to the specification list LIST. */
683 append_normal_char (struct Spec_list
*list
, unsigned int c
)
685 struct List_element
*new;
687 new = (struct List_element
*) xmalloc (sizeof (struct List_element
));
689 new->type
= RE_NORMAL_CHAR
;
690 new->u
.normal_char
= c
;
692 list
->tail
->next
= new;
696 /* Append a newly allocated structure representing the range
697 of characters from FIRST to LAST to the specification list LIST.
698 Return nonzero if LAST precedes FIRST in the collating sequence,
699 zero otherwise. This means that '[c-c]' is acceptable. */
702 append_range (struct Spec_list
*list
, unsigned int first
, unsigned int last
)
704 struct List_element
*new;
706 if (ORD (first
) > ORD (last
))
708 char *tmp1
= make_printable_char (first
);
709 char *tmp2
= make_printable_char (last
);
712 _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
718 new = (struct List_element
*) xmalloc (sizeof (struct List_element
));
720 new->type
= RE_RANGE
;
721 new->u
.range
.first_char
= first
;
722 new->u
.range
.last_char
= last
;
724 list
->tail
->next
= new;
729 /* If CHAR_CLASS_STR is a valid character class string, append a
730 newly allocated structure representing that character class to the end
731 of the specification list LIST and return 0. If CHAR_CLASS_STR is not
732 a valid string return nonzero. */
735 append_char_class (struct Spec_list
*list
,
736 const unsigned char *char_class_str
, size_t len
)
738 enum Char_class char_class
;
739 struct List_element
*new;
741 char_class
= look_up_char_class (char_class_str
, len
);
742 if (char_class
== CC_NO_CLASS
)
744 new = (struct List_element
*) xmalloc (sizeof (struct List_element
));
746 new->type
= RE_CHAR_CLASS
;
747 new->u
.char_class
= char_class
;
749 list
->tail
->next
= new;
754 /* Append a newly allocated structure representing a [c*n]
755 repeated character construct to the specification list LIST.
756 THE_CHAR is the single character to be repeated, and REPEAT_COUNT
757 is a non-negative repeat count. */
760 append_repeated_char (struct Spec_list
*list
, unsigned int the_char
,
763 struct List_element
*new;
765 new = (struct List_element
*) xmalloc (sizeof (struct List_element
));
767 new->type
= RE_REPEATED_CHAR
;
768 new->u
.repeated_char
.the_repeated_char
= the_char
;
769 new->u
.repeated_char
.repeat_count
= repeat_count
;
771 list
->tail
->next
= new;
775 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
776 the length of that string, LEN, if LEN is exactly one, append
777 a newly allocated structure representing the specified
778 equivalence class to the specification list, LIST and return zero.
779 If LEN is not 1, return nonzero. */
782 append_equiv_class (struct Spec_list
*list
,
783 const unsigned char *equiv_class_str
, size_t len
)
785 struct List_element
*new;
789 new = (struct List_element
*) xmalloc (sizeof (struct List_element
));
791 new->type
= RE_EQUIV_CLASS
;
792 new->u
.equiv_code
= *equiv_class_str
;
794 list
->tail
->next
= new;
799 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
800 The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
801 NUL bytes, and is *not* NUL-terminated. */
803 static unsigned char *
804 substr (const unsigned char *p
, size_t first_idx
, size_t last_idx
)
809 assert (first_idx
<= last_idx
);
810 len
= last_idx
- first_idx
+ 1;
811 tmp
= (unsigned char *) xmalloc (len
);
813 assert (first_idx
<= last_idx
);
814 /* Use memcpy rather than strncpy because `p' may contain zero-bytes. */
815 memcpy (tmp
, p
+ first_idx
, len
);
819 /* Search forward starting at START_IDX for the 2-char sequence
820 (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such
821 a sequence is found, set *RESULT_IDX to the index of the first
822 character and return nonzero. Otherwise return zero. P may contain
826 find_closing_delim (const struct E_string
*es
, size_t start_idx
,
827 unsigned int pre_bracket_char
, size_t *result_idx
)
831 for (i
= start_idx
; i
< es
->len
- 1; i
++)
832 if (es
->s
[i
] == pre_bracket_char
&& es
->s
[i
+ 1] == ']'
833 && !es
->escaped
[i
] && !es
->escaped
[i
+ 1])
841 /* Convert a string S with explicit length LEN, possibly
842 containing embedded zero bytes, to a long integer value.
843 If the string represents a negative value, a value larger
844 than LONG_MAX, or if all LEN characters do not represent a
845 valid integer, return nonzero and do not modify *VAL.
846 Otherwise, return zero and set *VAL to the converted value. */
849 non_neg_strtol (const unsigned char *s
, size_t len
, size_t *val
)
852 unsigned long sum
= 0;
859 else if (ISDIGIT (s
[0]))
864 for (i
= 0; i
< len
; i
++)
875 if (sum
> (LONG_MAX
- c
) / base
)
877 sum
= sum
* base
+ c
;
883 /* Parse the bracketed repeat-char syntax. If the P_LEN characters
884 beginning with P[ START_IDX ] comprise a valid [c*n] construct,
885 then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
886 and return zero. If the second character following
887 the opening bracket is not `*' or if no closing bracket can be
888 found, return -1. If a closing bracket is found and the
889 second char is `*', but the string between the `*' and `]' isn't
890 empty, an octal number, or a decimal number, print an error message
894 find_bracketed_repeat (const struct E_string
*es
, size_t start_idx
,
895 unsigned int *char_to_repeat
, size_t *repeat_count
,
896 size_t *closing_bracket_idx
)
900 assert (start_idx
+ 1 < es
->len
);
901 if (!ES_MATCH (es
, start_idx
+ 1, '*'))
904 for (i
= start_idx
+ 2; i
< es
->len
; i
++)
906 if (ES_MATCH (es
, i
, ']'))
908 const unsigned char *digit_str
;
909 size_t digit_str_len
= i
- start_idx
- 2;
911 *char_to_repeat
= es
->s
[start_idx
];
912 if (digit_str_len
== 0)
914 /* We've matched [c*] -- no explicit repeat count. */
916 *closing_bracket_idx
= i
;
920 /* Here, we have found [c*s] where s should be a string
921 of octal or decimal digits. */
922 digit_str
= &es
->s
[start_idx
+ 2];
923 if (non_neg_strtol (digit_str
, digit_str_len
, repeat_count
)
924 || *repeat_count
> BEGIN_STATE
)
926 char *tmp
= make_printable_str (digit_str
, digit_str_len
);
927 error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
932 *closing_bracket_idx
= i
;
936 return -1; /* No bracket found. */
939 /* Return nonzero if the string at ES->s[IDX] matches the regular
940 expression `\*[0-9]*\]', zero otherwise. To match, the `*' and
941 the `]' must not be escaped. */
944 star_digits_closebracket (const struct E_string
*es
, size_t idx
)
948 if (!ES_MATCH (es
, idx
, '*'))
951 for (i
= idx
+ 1; i
< es
->len
; i
++)
953 if (!ISDIGIT (es
->s
[i
]))
955 if (ES_MATCH (es
, i
, ']'))
963 /* Convert string UNESACPED_STRING (which has been preprocessed to
964 convert backslash-escape sequences) of length LEN characters into
965 a linked list of the following 5 types of constructs:
966 - [:str:] Character class where `str' is one of the 12 valid strings.
967 - [=c=] Equivalence class where `c' is any single character.
968 - [c*n] Repeat the single character `c' `n' times. n may be omitted.
969 However, if `n' is present, it must be a non-negative octal or
971 - r-s Range of characters from `r' to `s'. The second endpoint must
972 not precede the first in the current collating sequence.
973 - c Any other character is interpreted as itself. */
976 build_spec_list (const struct E_string
*es
, struct Spec_list
*result
)
978 const unsigned char *p
;
983 /* The main for-loop below recognizes the 4 multi-character constructs.
984 A character that matches (in its context) none of the multi-character
985 constructs is classified as `normal'. Since all multi-character
986 constructs have at least 3 characters, any strings of length 2 or
987 less are composed solely of normal characters. Hence, the index of
988 the outer for-loop runs only as far as LEN-2. */
990 for (i
= 0; i
+ 2 < es
->len
; /* empty */)
992 if (ES_MATCH (es
, i
, '['))
994 int matched_multi_char_construct
;
995 size_t closing_bracket_idx
;
996 unsigned int char_to_repeat
;
1000 matched_multi_char_construct
= 1;
1001 if (ES_MATCH (es
, i
+ 1, ':')
1002 || ES_MATCH (es
, i
+ 1, '='))
1004 size_t closing_delim_idx
;
1007 found
= find_closing_delim (es
, i
+ 2, p
[i
+ 1],
1008 &closing_delim_idx
);
1012 unsigned char *opnd_str
= substr (p
, i
+ 2,
1013 closing_delim_idx
- 1);
1014 size_t opnd_str_len
= closing_delim_idx
- 1 - (i
+ 2) + 1;
1016 if (p
[i
+ 1] == ':')
1018 parse_failed
= append_char_class (result
, opnd_str
,
1021 /* FIXME: big comment. */
1024 if (star_digits_closebracket (es
, i
+ 2))
1027 goto try_bracketed_repeat
;
1031 char *tmp
= make_printable_str (opnd_str
,
1033 error (0, 0, _("invalid character class `%s'"),
1042 parse_failed
= append_equiv_class (result
, opnd_str
,
1045 /* FIXME: big comment. */
1048 if (star_digits_closebracket (es
, i
+ 2))
1051 goto try_bracketed_repeat
;
1055 char *tmp
= make_printable_str (opnd_str
,
1058 _("%s: equivalence class operand must be a single character"),
1067 /* Return nonzero if append_*_class reports a problem. */
1071 i
= closing_delim_idx
+ 2;
1074 /* Else fall through. This could be [:*] or [=*]. */
1077 try_bracketed_repeat
:
1079 /* Determine whether this is a bracketed repeat range
1080 matching the RE \[.\*(dec_or_oct_number)?\]. */
1081 err
= find_bracketed_repeat (es
, i
+ 1, &char_to_repeat
,
1083 &closing_bracket_idx
);
1086 append_repeated_char (result
, char_to_repeat
, repeat_count
);
1087 i
= closing_bracket_idx
+ 1;
1091 matched_multi_char_construct
= 0;
1095 /* Found a string that looked like [c*n] but the
1096 numeric part was invalid. */
1100 if (matched_multi_char_construct
)
1103 /* We reach this point if P does not match [:str:], [=c=],
1104 [c*n], or [c*]. Now, see if P looks like a range `[-c'
1105 (from `[' to `c'). */
1108 /* Look ahead one char for ranges like a-z. */
1109 if (ES_MATCH (es
, i
+ 1, '-'))
1111 if (append_range (result
, p
[i
], p
[i
+ 2]))
1117 append_normal_char (result
, p
[i
]);
1122 /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1]. */
1123 for (; i
< es
->len
; i
++)
1124 append_normal_char (result
, p
[i
]);
1129 /* Given a Spec_list S (with its saved state implicit in the values
1130 of its members `tail' and `state'), return the next single character
1131 in the expansion of S's constructs. If the last character of S was
1132 returned on the previous call or if S was empty, this function
1133 returns -1. For example, successive calls to get_next where S
1134 represents the spec-string 'a-d[y*3]' will return the sequence
1135 of values a, b, c, d, y, y, y, -1. Finally, if the construct from
1136 which the returned character comes is [:upper:] or [:lower:], the
1137 parameter CLASS is given a value to indicate which it was. Otherwise
1138 CLASS is set to UL_NONE. This value is used only when constructing
1139 the translation table to verify that any occurrences of upper and
1140 lower class constructs in the spec-strings appear in the same relative
1144 get_next (struct Spec_list
*s
, enum Upper_Lower_class
*class)
1146 struct List_element
*p
;
1153 if (s
->state
== BEGIN_STATE
)
1155 s
->tail
= s
->head
->next
;
1156 s
->state
= NEW_ELEMENT
;
1165 case RE_NORMAL_CHAR
:
1166 return_val
= p
->u
.normal_char
;
1167 s
->state
= NEW_ELEMENT
;
1172 if (s
->state
== NEW_ELEMENT
)
1173 s
->state
= ORD (p
->u
.range
.first_char
);
1176 return_val
= CHR (s
->state
);
1177 if (s
->state
== ORD (p
->u
.range
.last_char
))
1180 s
->state
= NEW_ELEMENT
;
1188 switch (p
->u
.char_class
)
1206 s
->state
= NEW_ELEMENT
;
1212 if (s
->state
== NEW_ELEMENT
)
1214 for (i
= 0; i
< N_CHARS
; i
++)
1215 if (is_char_class_member (p
->u
.char_class
, i
))
1217 assert (i
< N_CHARS
);
1220 assert (is_char_class_member (p
->u
.char_class
, s
->state
));
1221 return_val
= CHR (s
->state
);
1222 for (i
= s
->state
+ 1; i
< N_CHARS
; i
++)
1223 if (is_char_class_member (p
->u
.char_class
, i
))
1230 s
->state
= NEW_ELEMENT
;
1234 case RE_EQUIV_CLASS
:
1235 /* FIXME: this assumes that each character is alone in its own
1236 equivalence class (which appears to be correct for my
1237 LC_COLLATE. But I don't know of any function that allows
1238 one to determine a character's equivalence class. */
1240 return_val
= p
->u
.equiv_code
;
1241 s
->state
= NEW_ELEMENT
;
1245 case RE_REPEATED_CHAR
:
1246 /* Here, a repeat count of n == 0 means don't repeat at all. */
1247 if (p
->u
.repeated_char
.repeat_count
== 0)
1250 s
->state
= NEW_ELEMENT
;
1251 return_val
= get_next (s
, class);
1255 if (s
->state
== NEW_ELEMENT
)
1260 return_val
= p
->u
.repeated_char
.the_repeated_char
;
1261 if (p
->u
.repeated_char
.repeat_count
> 0
1262 && s
->state
== p
->u
.repeated_char
.repeat_count
)
1265 s
->state
= NEW_ELEMENT
;
1282 /* This is a minor kludge. This function is called from
1283 get_spec_stats to determine the cardinality of a set derived
1284 from a complemented string. It's a kludge in that some of the
1285 same operations are (duplicated) performed in set_initialize. */
1288 card_of_complement (struct Spec_list
*s
)
1291 int cardinality
= N_CHARS
;
1292 SET_TYPE in_set
[N_CHARS
];
1294 memset (in_set
, 0, N_CHARS
* sizeof (in_set
[0]));
1295 s
->state
= BEGIN_STATE
;
1296 while ((c
= get_next (s
, NULL
)) != -1)
1302 /* Gather statistics about the spec-list S in preparation for the tests
1303 in validate that determine the consistency of the specs. This function
1304 is called at most twice; once for string1, and again for any string2.
1305 LEN_S1 < 0 indicates that this is the first call and that S represents
1306 string1. When LEN_S1 >= 0, it is the length of the expansion of the
1307 constructs in string1, and we can use its value to resolve any
1308 indefinite repeat construct in S (which represents string2). Hence,
1309 this function has the side-effect that it converts a valid [c*]
1310 construct in string2 to [c*n] where n is large enough (or 0) to give
1311 string2 the same length as string1. For example, with the command
1312 tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1313 be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
1316 get_spec_stats (struct Spec_list
*s
)
1318 struct List_element
*p
;
1321 s
->n_indefinite_repeats
= 0;
1322 s
->has_equiv_class
= 0;
1323 s
->has_restricted_char_class
= 0;
1324 s
->has_char_class
= 0;
1325 for (p
= s
->head
->next
; p
; p
= p
->next
)
1330 case RE_NORMAL_CHAR
:
1335 assert (p
->u
.range
.last_char
>= p
->u
.range
.first_char
);
1336 len
+= p
->u
.range
.last_char
- p
->u
.range
.first_char
+ 1;
1340 s
->has_char_class
= 1;
1341 for (i
= 0; i
< N_CHARS
; i
++)
1342 if (is_char_class_member (p
->u
.char_class
, i
))
1344 switch (p
->u
.char_class
)
1350 s
->has_restricted_char_class
= 1;
1355 case RE_EQUIV_CLASS
:
1356 for (i
= 0; i
< N_CHARS
; i
++)
1357 if (is_equiv_class_member (p
->u
.equiv_code
, i
))
1359 s
->has_equiv_class
= 1;
1362 case RE_REPEATED_CHAR
:
1363 if (p
->u
.repeated_char
.repeat_count
> 0)
1364 len
+= p
->u
.repeated_char
.repeat_count
;
1365 else if (p
->u
.repeated_char
.repeat_count
== 0)
1367 s
->indefinite_repeat_element
= p
;
1368 ++(s
->n_indefinite_repeats
);
1382 get_s1_spec_stats (struct Spec_list
*s1
)
1384 get_spec_stats (s1
);
1386 s1
->length
= card_of_complement (s1
);
1390 get_s2_spec_stats (struct Spec_list
*s2
, size_t len_s1
)
1392 get_spec_stats (s2
);
1393 if (len_s1
>= s2
->length
&& s2
->n_indefinite_repeats
== 1)
1395 s2
->indefinite_repeat_element
->u
.repeated_char
.repeat_count
=
1396 len_s1
- s2
->length
;
1397 s2
->length
= len_s1
;
1402 spec_init (struct Spec_list
*spec_list
)
1404 spec_list
->head
= spec_list
->tail
=
1405 (struct List_element
*) xmalloc (sizeof (struct List_element
));
1406 spec_list
->head
->next
= NULL
;
1409 /* This function makes two passes over the argument string S. The first
1410 one converts all \c and \ddd escapes to their one-byte representations.
1411 The second constructs a linked specification list, SPEC_LIST, of the
1412 characters and constructs that comprise the argument string. If either
1413 of these passes detects an error, this function returns nonzero. */
1416 parse_str (const unsigned char *s
, struct Spec_list
*spec_list
)
1421 fail
= unquote (s
, &es
);
1423 fail
= build_spec_list (&es
, spec_list
);
1428 /* Given two specification lists, S1 and S2, and assuming that
1429 S1->length > S2->length, append a single [c*n] element to S2 where c
1430 is the last character in the expansion of S2 and n is the difference
1431 between the two lengths.
1432 Upon successful completion, S2->length is set to S1->length. The only
1433 way this function can fail to make S2 as long as S1 is when S2 has
1434 zero-length, since in that case, there is no last character to repeat.
1435 So S2->length is required to be at least 1.
1437 Providing this functionality allows the user to do some pretty
1438 non-BSD (and non-portable) things: For example, the command
1439 tr -cs '[:upper:]0-9' '[:lower:]'
1440 is almost guaranteed to give results that depend on your collating
1444 string2_extend (const struct Spec_list
*s1
, struct Spec_list
*s2
)
1446 struct List_element
*p
;
1450 assert (translating
);
1451 assert (s1
->length
> s2
->length
);
1452 assert (s2
->length
> 0);
1457 case RE_NORMAL_CHAR
:
1458 char_to_repeat
= p
->u
.normal_char
;
1461 char_to_repeat
= p
->u
.range
.last_char
;
1464 for (i
= N_CHARS
; i
>= 0; i
--)
1465 if (is_char_class_member (p
->u
.char_class
, i
))
1468 char_to_repeat
= CHR (i
);
1471 case RE_REPEATED_CHAR
:
1472 char_to_repeat
= p
->u
.repeated_char
.the_repeated_char
;
1475 case RE_EQUIV_CLASS
:
1476 /* This shouldn't happen, because validate exits with an error
1477 if it finds an equiv class in string2 when translating. */
1490 append_repeated_char (s2
, char_to_repeat
, s1
->length
- s2
->length
);
1491 s2
->length
= s1
->length
;
1494 /* Return non-zero if S is a non-empty list in which exactly one
1495 character (but potentially, many instances of it) appears.
1496 E.g. [X*] or xxxxxxxx. */
1499 homogeneous_spec_list (struct Spec_list
*s
)
1503 s
->state
= BEGIN_STATE
;
1505 if ((b
= get_next (s
, NULL
)) == -1)
1508 while ((c
= get_next (s
, NULL
)) != -1)
1515 /* Die with an error message if S1 and S2 describe strings that
1516 are not valid with the given command line switches.
1517 A side effect of this function is that if a valid [c*] or
1518 [c*0] construct appears in string2, it is converted to [c*n]
1519 with a value for n that makes s2->length == s1->length. By
1520 the same token, if the --truncate-set1 option is not
1521 given, S2 may be extended. */
1524 validate (struct Spec_list
*s1
, struct Spec_list
*s2
)
1526 get_s1_spec_stats (s1
);
1527 if (s1
->n_indefinite_repeats
> 0)
1529 error (EXIT_FAILURE
, 0,
1530 _("the [c*] repeat construct may not appear in string1"));
1535 get_s2_spec_stats (s2
, s1
->length
);
1537 if (s2
->n_indefinite_repeats
> 1)
1539 error (EXIT_FAILURE
, 0,
1540 _("only one [c*] repeat construct may appear in string2"));
1545 if (s2
->has_equiv_class
)
1547 error (EXIT_FAILURE
, 0,
1548 _("[=c=] expressions may not appear in string2 \
1549 when translating"));
1552 if (s1
->length
> s2
->length
)
1556 /* string2 must be non-empty unless --truncate-set1 is
1557 given or string1 is empty. */
1559 if (s2
->length
== 0)
1560 error (EXIT_FAILURE
, 0,
1561 _("when not truncating set1, string2 must be non-empty"));
1562 string2_extend (s1
, s2
);
1566 if (complement
&& s1
->has_char_class
1567 && ! (s2
->length
== s1
->length
&& homogeneous_spec_list (s2
)))
1569 error (EXIT_FAILURE
, 0,
1570 _("when translating with complemented character classes,\
1571 \nstring2 must map all characters in the domain to one"));
1574 if (s2
->has_restricted_char_class
)
1576 error (EXIT_FAILURE
, 0,
1577 _("when translating, the only character classes that may \
1578 appear in\nstring2 are `upper' and `lower'"));
1582 /* Not translating. */
1584 if (s2
->n_indefinite_repeats
> 0)
1585 error (EXIT_FAILURE
, 0,
1586 _("the [c*] construct may appear in string2 only \
1587 when translating"));
1592 /* Read buffers of SIZE bytes via the function READER (if READER is
1593 NULL, read from stdin) until EOF. When non-NULL, READER is either
1594 read_and_delete or read_and_xlate. After each buffer is read, it is
1595 processed and written to stdout. The buffers are processed so that
1596 multiple consecutive occurrences of the same character in the input
1597 stream are replaced by a single occurrence of that character if the
1598 character is in the squeeze set. */
1601 squeeze_filter (unsigned char *buf
, long int size
, PFI reader
)
1603 unsigned int char_to_squeeze
= NOT_A_CHAR
;
1614 nr
= safe_read (0, (char *) buf
, size
);
1616 nr
= (*reader
) (buf
, size
, NULL
);
1619 error (EXIT_FAILURE
, errno
, _("read error"));
1627 if (char_to_squeeze
== NOT_A_CHAR
)
1630 /* Here, by being a little tricky, we can get a significant
1631 performance increase in most cases when the input is
1632 reasonably large. Since tr will modify the input only
1633 if two consecutive (and identical) input characters are
1634 in the squeeze set, we can step by two through the data
1635 when searching for a character in the squeeze set. This
1636 means there may be a little more work in a few cases and
1637 perhaps twice as much work in the worst cases where most
1638 of the input is removed by squeezing repeats. But most
1639 uses of this functionality seem to remove less than 20-30%
1641 for (; i
< nr
&& !in_squeeze_set
[buf
[i
]]; i
+= 2)
1644 /* There is a special case when i == nr and we've just
1645 skipped a character (the last one in buf) that is in
1647 if (i
== nr
&& in_squeeze_set
[buf
[i
- 1]])
1651 out_len
= nr
- begin
;
1654 char_to_squeeze
= buf
[i
];
1655 /* We're about to output buf[begin..i]. */
1656 out_len
= i
- begin
+ 1;
1658 /* But since we stepped by 2 in the loop above,
1659 out_len may be one too large. */
1660 if (i
> 0 && buf
[i
- 1] == char_to_squeeze
)
1663 /* Advance i to the index of first character to be
1664 considered when looking for a char different from
1669 && fwrite ((char *) &buf
[begin
], 1, out_len
, stdout
) == 0)
1670 error (EXIT_FAILURE
, errno
, _("write error"));
1673 if (char_to_squeeze
!= NOT_A_CHAR
)
1675 /* Advance i to index of first char != char_to_squeeze
1676 (or to nr if all the rest of the characters in this
1677 buffer are the same as char_to_squeeze). */
1678 for (; i
< nr
&& buf
[i
] == char_to_squeeze
; i
++)
1681 char_to_squeeze
= NOT_A_CHAR
;
1682 /* If (i >= nr) we've squeezed the last character in this buffer.
1683 So now we have to read a new buffer and continue comparing
1684 characters against char_to_squeeze. */
1689 /* Read buffers of SIZE bytes from stdin until one is found that
1690 contains at least one character not in the delete set. Store
1691 in the array BUF, all characters from that buffer that are not
1692 in the delete set, and return the number of characters saved
1696 read_and_delete (unsigned char *buf
, long int size
, PFI not_used
)
1699 static int hit_eof
= 0;
1701 assert (not_used
== NULL
);
1707 /* This enclosing do-while loop is to make sure that
1708 we don't return zero (indicating EOF) when we've
1709 just deleted all the characters in a buffer. */
1713 int nr
= safe_read (0, (char *) buf
, size
);
1716 error (EXIT_FAILURE
, errno
, _("read error"));
1723 /* This first loop may be a waste of code, but gives much
1724 better performance when no characters are deleted in
1725 the beginning of a buffer. It just avoids the copying
1726 of buf[i] into buf[n_saved] when it would be a NOP. */
1728 for (i
= 0; i
< nr
&& !in_delete_set
[buf
[i
]]; i
++)
1732 for (++i
; i
< nr
; i
++)
1733 if (!in_delete_set
[buf
[i
]])
1734 buf
[n_saved
++] = buf
[i
];
1736 while (n_saved
== 0);
1741 /* Read at most SIZE bytes from stdin into the array BUF. Then
1742 perform the in-place and one-to-one mapping specified by the global
1743 array `xlate'. Return the number of characters read, or 0 upon EOF. */
1746 read_and_xlate (unsigned char *buf
, long int size
, PFI not_used
)
1748 long chars_read
= 0;
1749 static int hit_eof
= 0;
1752 assert (not_used
== NULL
);
1758 chars_read
= safe_read (0, (char *) buf
, size
);
1760 error (EXIT_FAILURE
, errno
, _("read error"));
1761 if (chars_read
== 0)
1767 for (i
= 0; i
< chars_read
; i
++)
1768 buf
[i
] = xlate
[buf
[i
]];
1773 /* Initialize a boolean membership set IN_SET with the character
1774 values obtained by traversing the linked list of constructs S
1775 using the function `get_next'. If COMPLEMENT_THIS_SET is
1776 nonzero the resulting set is complemented. */
1779 set_initialize (struct Spec_list
*s
, int complement_this_set
, SET_TYPE
*in_set
)
1784 memset (in_set
, 0, N_CHARS
* sizeof (in_set
[0]));
1785 s
->state
= BEGIN_STATE
;
1786 while ((c
= get_next (s
, NULL
)) != -1)
1788 if (complement_this_set
)
1789 for (i
= 0; i
< N_CHARS
; i
++)
1790 in_set
[i
] = (!in_set
[i
]);
1794 main (int argc
, char **argv
)
1797 int non_option_args
;
1798 struct Spec_list buf1
, buf2
;
1799 struct Spec_list
*s1
= &buf1
;
1800 struct Spec_list
*s2
= &buf2
;
1802 program_name
= argv
[0];
1803 setlocale (LC_ALL
, "");
1804 bindtextdomain (PACKAGE
, LOCALEDIR
);
1805 textdomain (PACKAGE
);
1807 while ((c
= getopt_long (argc
, argv
, "cdst", long_options
, NULL
)) != -1)
1823 squeeze_repeats
= 1;
1830 case_GETOPT_HELP_CHAR
;
1832 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
1840 posix_pedantic
= (getenv ("POSIXLY_CORRECT") != NULL
);
1842 non_option_args
= argc
- optind
;
1843 translating
= (non_option_args
== 2 && !delete);
1845 /* Change this test if it is valid to give tr no options and
1846 no args at all. POSIX doesn't specifically say anything
1847 either way, but it looks like they implied it's invalid
1848 by omission. If you want to make tr do a slow imitation
1849 of `cat' use `tr a a'. */
1850 if (non_option_args
> 2)
1852 error (0, 0, _("too many arguments"));
1856 if (!delete && !squeeze_repeats
&& non_option_args
!= 2)
1857 error (EXIT_FAILURE
, 0, _("two strings must be given when translating"));
1859 if (delete && squeeze_repeats
&& non_option_args
!= 2)
1860 error (EXIT_FAILURE
, 0, _("two strings must be given when both \
1861 deleting and squeezing repeats"));
1863 /* If --delete is given without --squeeze-repeats, then
1864 only one string argument may be specified. But POSIX
1865 says to ignore any string2 in this case, so if POSIXLY_CORRECT
1866 is set, pretend we never saw string2. But I think
1867 this deserves a fatal error, so that's the default. */
1868 if ((delete && !squeeze_repeats
) && non_option_args
!= 1)
1870 if (posix_pedantic
&& non_option_args
== 2)
1873 error (EXIT_FAILURE
, 0,
1874 _("only one string may be given when deleting \
1875 without squeezing repeats"));
1878 if (squeeze_repeats
&& non_option_args
== 0)
1879 error (EXIT_FAILURE
, 0,
1880 _("at least one string must be given when squeezing repeats"));
1883 if (parse_str ((unsigned char *) argv
[optind
], s1
))
1884 exit (EXIT_FAILURE
);
1886 if (non_option_args
== 2)
1889 if (parse_str ((unsigned char *) argv
[optind
+ 1], s2
))
1890 exit (EXIT_FAILURE
);
1897 /* Use binary I/O, since `tr' is sometimes used to transliterate
1898 non-printable characters, or characters which are stripped away
1899 by text-mode reads (like CR and ^Z). */
1900 SET_BINARY2 (STDIN_FILENO
, STDOUT_FILENO
);
1902 if (squeeze_repeats
&& non_option_args
== 1)
1904 set_initialize (s1
, complement
, in_squeeze_set
);
1905 squeeze_filter (io_buf
, IO_BUF_SIZE
, NULL
);
1907 else if (delete && non_option_args
== 1)
1911 set_initialize (s1
, complement
, in_delete_set
);
1914 nr
= read_and_delete (io_buf
, IO_BUF_SIZE
, NULL
);
1915 if (nr
> 0 && fwrite ((char *) io_buf
, 1, nr
, stdout
) == 0)
1916 error (EXIT_FAILURE
, errno
, _("write error"));
1920 else if (squeeze_repeats
&& delete && non_option_args
== 2)
1922 set_initialize (s1
, complement
, in_delete_set
);
1923 set_initialize (s2
, 0, in_squeeze_set
);
1924 squeeze_filter (io_buf
, IO_BUF_SIZE
, (PFI
) read_and_delete
);
1926 else if (translating
)
1931 SET_TYPE
*in_s1
= in_delete_set
;
1933 set_initialize (s1
, 0, in_s1
);
1934 s2
->state
= BEGIN_STATE
;
1935 for (i
= 0; i
< N_CHARS
; i
++)
1937 for (i
= 0; i
< N_CHARS
; i
++)
1941 int ch
= get_next (s2
, NULL
);
1942 assert (ch
!= -1 || truncate_set1
);
1945 /* This will happen when tr is invoked like e.g.
1946 tr -cs A-Za-z0-9 '\012'. */
1952 assert (get_next (s2
, NULL
) == -1 || truncate_set1
);
1958 enum Upper_Lower_class class_s1
;
1959 enum Upper_Lower_class class_s2
;
1961 for (i
= 0; i
< N_CHARS
; i
++)
1963 s1
->state
= BEGIN_STATE
;
1964 s2
->state
= BEGIN_STATE
;
1967 c1
= get_next (s1
, &class_s1
);
1968 c2
= get_next (s2
, &class_s2
);
1969 if (!class_ok
[(int) class_s1
][(int) class_s2
])
1970 error (EXIT_FAILURE
, 0,
1971 _("misaligned [:upper:] and/or [:lower:] construct"));
1973 if (class_s1
== UL_LOWER
&& class_s2
== UL_UPPER
)
1975 for (i
= 0; i
< N_CHARS
; i
++)
1977 xlate
[i
] = toupper (i
);
1979 else if (class_s1
== UL_UPPER
&& class_s2
== UL_LOWER
)
1981 for (i
= 0; i
< N_CHARS
; i
++)
1983 xlate
[i
] = tolower (i
);
1985 else if ((class_s1
== UL_LOWER
&& class_s2
== UL_LOWER
)
1986 || (class_s1
== UL_UPPER
&& class_s2
== UL_UPPER
))
1988 /* By default, GNU tr permits the identity mappings: from
1989 [:upper:] to [:upper:] and [:lower:] to [:lower:]. But
1990 when POSIXLY_CORRECT is set, those evoke diagnostics. */
1993 error (EXIT_FAILURE
, 0,
1995 invalid identity mapping; when translating, any [:lower:] or [:upper:]\n\
1996 construct in string1 must be aligned with a corresponding construct\n\
1997 ([:upper:] or [:lower:], respectively) in string2"));
2002 /* The following should have been checked by validate... */
2003 if (c1
== -1 || c2
== -1)
2008 assert (c1
== -1 || truncate_set1
);
2010 if (squeeze_repeats
)
2012 set_initialize (s2
, 0, in_squeeze_set
);
2013 squeeze_filter (io_buf
, IO_BUF_SIZE
, (PFI
) read_and_xlate
);
2021 chars_read
= read_and_xlate (io_buf
, IO_BUF_SIZE
, NULL
);
2023 && fwrite ((char *) io_buf
, 1, chars_read
, stdout
) == 0)
2024 error (EXIT_FAILURE
, errno
, _("write error"));
2026 while (chars_read
> 0);
2030 if (fclose (stdout
) == EOF
)
2031 error (EXIT_FAILURE
, errno
, _("write error"));
2034 error (EXIT_FAILURE
, errno
, _("standard input"));
2036 exit (EXIT_SUCCESS
);