1 /* tr -- a filter to translate characters
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Jim Meyering, meyering@cs.utexas.edu. */
22 /* Get isblank from GNU libc. */
29 #include <sys/types.h>
37 #define ULONG_MAX ((unsigned long) ~(unsigned long) 0)
41 #define LONG_MAX ((long int) (ULONG_MAX >> 1))
45 #define UCHAR_MAX 0xFF
48 #define N_CHARS (UCHAR_MAX + 1)
50 /* A pointer to a function that returns an int. */
51 typedef int (*PFI
) ();
53 /* Convert from character C to its index in the collating
54 sequence array. Just cast to an unsigned int to avoid
55 problems with sign-extension. */
56 #define ORD(c) (unsigned int)(c)
58 /* The inverse of ORD. */
59 #define CHR(i) (unsigned char)(i)
61 /* The value for Spec_list->state that indicates to
62 get_next that it should initialize the tail pointer.
63 Its value doesn't matter as long as it can't be
64 confused with a valid character code. */
65 #define BEGIN_STATE (2 * N_CHARS)
67 /* The value for Spec_list->state that indicates to
68 get_next that the element pointed to by Spec_list->tail is
69 being considered for the first time on this pass through the
70 list -- it indicates that get_next should make any necessary
72 #define NEW_ELEMENT (BEGIN_STATE + 1)
74 /* A value distinct from any character that may have been stored in a
75 buffer as the result of a block-read in the function squeeze_filter. */
76 #define NOT_A_CHAR (unsigned int)(-1)
78 /* The following (but not CC_NO_CLASS) are indices into the array of
79 valid character class strings. */
82 CC_ALNUM
= 0, CC_ALPHA
= 1, CC_BLANK
= 2, CC_CNTRL
= 3,
83 CC_DIGIT
= 4, CC_GRAPH
= 5, CC_LOWER
= 6, CC_PRINT
= 7,
84 CC_PUNCT
= 8, CC_SPACE
= 9, CC_UPPER
= 10, CC_XDIGIT
= 11,
88 /* Character class to which a character (returned by get_next) belonged;
89 but it is set only if the construct from which the character was obtained
90 was one of the character classes [:upper:] or [:lower:]. The value
91 is used only when translating and then, only to make sure that upper
92 and lower class constructs have the same relative positions in string1
94 enum Upper_Lower_class
101 /* A shortcut to ensure that when constructing the translation array,
102 one of the values returned by paired calls to get_next (from s1 and s2)
103 is from [:upper:] and the other is from [:lower:], or neither is from
104 upper or lower. In fact, no other character classes are allowed when
105 translating, but that condition is tested elsewhere. This array is
106 indexed by values of type enum Upper_Lower_class. */
107 static int const class_ok
[3][3] =
114 /* The type of a List_element. See build_spec_list for more details. */
115 enum Range_element_type
125 /* One construct in one of tr's argument strings.
126 For example, consider the POSIX version of the classic tr command:
127 tr -cs 'a-zA-Z_' '[\n*]'
128 String1 has 3 constructs, two of which are ranges (a-z and A-Z),
129 and a single normal character, `_'. String2 has one construct. */
132 enum Range_element_type type
;
133 struct List_element
*next
;
139 unsigned int first_char
;
140 unsigned int last_char
;
143 enum Char_class char_class
;
147 unsigned int the_repeated_char
;
155 /* Each of tr's argument strings is parsed into a form that is easier
156 to work with: a linked list of constructs (struct List_element).
157 Each Spec_list structure also encapsulates various attributes of
158 the corresponding argument string. The attributes are used mainly
159 to verify that the strings are valid in the context of any options
160 specified (like -s, -d, or -c). The main exception is the member
161 `tail', which is first used to construct the list. After construction,
162 it is used by get_next to save its state when traversing the list.
163 The member `state' serves a similar function. */
166 /* Points to the head of the list of range elements.
167 The first struct is a dummy; its members are never used. */
168 struct List_element
*head
;
170 /* When appending, points to the last element. When traversing via
171 get_next(), points to the element to process next. Setting
172 Spec_list.state to the value BEGIN_STATE before calling get_next
173 signals get_next to initialize tail to point to head->next. */
174 struct List_element
*tail
;
176 /* Used to save state between calls to get_next(). */
179 /* Length, in the sense that length ('a-z[:digit:]123abc')
180 is 42 ( = 26 + 10 + 6). */
183 /* The number of [c*] and [c*0] constructs that appear in this spec. */
184 int n_indefinite_repeats
;
186 /* If n_indefinite_repeats is non-zero, this points to the List_element
187 corresponding to the last [c*] or [c*0] construct encountered in
188 this spec. Otherwise it is undefined. */
189 struct List_element
*indefinite_repeat_element
;
191 /* Non-zero if this spec contains at least one equivalence
192 class construct e.g. [=c=]. */
195 /* Non-zero if this spec contains at least one of [:upper:] or
196 [:lower:] class constructs. */
197 int has_upper_or_lower
;
199 /* Non-zero if this spec contains at least one of the character class
200 constructs (all but upper and lower) that aren't allowed in s2. */
201 int has_restricted_char_class
;
208 /* The name by which this program was run. */
211 /* When non-zero, each sequence in the input of a repeated character
212 (call it c) is replaced (in the output) by a single occurrence of c
213 for every c in the squeeze set. */
214 static int squeeze_repeats
= 0;
216 /* When non-zero, removes characters in the delete set from input. */
217 static int delete = 0;
219 /* Use the complement of set1 in place of set1. */
220 static int complement
= 0;
222 /* When non-zero, this flag causes GNU tr to provide strict
223 compliance with POSIX draft 1003.2.11.2. The POSIX spec
224 says that when -d is used without -s, string2 (if present)
225 must be ignored. Silently ignoring arguments is a bad idea.
226 The default GNU behavior is to give a usage message and exit.
227 Additionally, when this flag is non-zero, tr prints warnings
228 on stderr if it is being used in a manner that is not portable.
229 Applicable warnings are given by default, but are suppressed
230 if the environment variable `POSIXLY_CORRECT' is set, since
231 being POSIX conformant means we can't issue such messages.
232 Warnings on the following topics are suppressed when this
233 variable is non-zero:
234 1. Ambiguous octal escapes. */
235 static int posix_pedantic
;
237 /* When tr is performing translation and string1 is longer than string2,
238 POSIX says that the result is undefined. That gives the implementor
239 of a POSIX conforming version of tr two reasonable choices for the
240 semantics of this case.
242 * The BSD tr pads string2 to the length of string1 by
243 repeating the last character in string2.
245 * System V tr ignores characters in string1 that have no
246 corresponding character in string2. That is, string1 is effectively
247 truncated to the length of string2.
249 When non-zero, this flag causes GNU tr to imitate the behavior
250 of System V tr when translating with string1 longer than string2.
251 The default is to emulate BSD tr. This flag is ignored in modes where
252 no translation is performed. Emulating the System V tr
253 in this exceptional case causes the relatively common BSD idiom:
255 tr -cs A-Za-z0-9 '\012'
257 to break (it would convert only zero bytes, rather than all
258 non-alphanumerics, to newlines).
260 WARNING: This switch does not provide general BSD or System V
261 compatibility. For example, it doesn't disable the interpretation
262 of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
263 some unfortunate coincidence you use such constructs in scripts
264 expecting to use some other version of tr, the scripts will break. */
265 static int truncate_set1
= 0;
267 /* An alias for (!delete && non_option_args == 2).
268 It is set in main and used there and in validate(). */
269 static int translating
;
275 #define IO_BUF_SIZE BUFSIZ
276 static unsigned char io_buf
[IO_BUF_SIZE
];
278 static char const *const char_class_name
[] =
280 "alnum", "alpha", "blank", "cntrl", "digit", "graph",
281 "lower", "print", "punct", "space", "upper", "xdigit"
283 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
285 typedef char SET_TYPE
;
287 /* Array of boolean values. A character `c' is a member of the
288 squeeze set if and only if in_squeeze_set[c] is true. The squeeze
289 set is defined by the last (possibly, the only) string argument
290 on the command line when the squeeze option is given. */
291 static SET_TYPE in_squeeze_set
[N_CHARS
];
293 /* Array of boolean values. A character `c' is a member of the
294 delete set if and only if in_delete_set[c] is true. The delete
295 set is defined by the first (or only) string argument on the
296 command line when the delete option is given. */
297 static SET_TYPE in_delete_set
[N_CHARS
];
299 /* Array of character values defining the translation (if any) that
300 tr is to perform. Translation is performed only when there are
301 two specification strings and the delete switch is not given. */
302 static char xlate
[N_CHARS
];
304 /* If non-zero, display usage information and exit. */
305 static int show_help
;
307 /* If non-zero, print the version on standard output then exit. */
308 static int show_version
;
310 static struct option
const long_options
[] =
312 {"complement", no_argument
, NULL
, 'c'},
313 {"delete", no_argument
, NULL
, 'd'},
314 {"squeeze-repeats", no_argument
, NULL
, 's'},
315 {"truncate-set1", no_argument
, NULL
, 't'},
316 {"help", no_argument
, &show_help
, 1},
317 {"version", no_argument
, &show_version
, 1},
326 fprintf (stderr
, "Try `%s --help' for more information.\n",
331 Usage: %s [OPTION]... SET1 [SET2]\n\
335 Translate, squeeze, and/or delete characters from standard input,\n\
336 writing to standard output.\n\
338 -c, --complement first complement SET1\n\
339 -d, --delete delete characters in SET1, do not translate\n\
340 -s, --squeeze-repeats replace sequence of characters with one\n\
341 -t, --truncate-set1 first truncate SET1 to length of SET2\n\
342 --help display this help and exit\n\
343 --version output version information and exit\n\
347 SETs are specified as strings of characters. Most represent themselves.\n\
348 Interpreted sequences are:\n\
350 \\NNN character with octal value NNN (1 to 3 octal digits)\n\
357 \\t horizontal tab\n\
359 CHAR1-CHAR2 all characters from CHAR1 to CHAR2 in ascending order\n\
360 [CHAR1-CHAR2] same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
361 [CHAR*] in SET2, copies of CHAR until length of SET1\n\
362 [CHAR*REPEAT] REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
363 [:alnum:] all letters and digits\n\
364 [:alpha:] all letters\n\
365 [:blank:] all horizontal whitespace\n\
366 [:cntrl:] all control characters\n\
367 [:digit:] all digits\n\
368 [:graph:] all printable characters, not including space\n\
369 [:lower:] all lower case letters\n\
370 [:print:] all printable characters, including space\n\
371 [:punct:] all punctuation characters\n\
372 [:space:] all horizontal or vertical whitespace\n\
373 [:upper:] all upper case letters\n\
374 [:xdigit:] all hexadecimal digits\n\
375 [=CHAR=] all characters which are equivalent to CHAR\n\
379 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
380 -t may be used only when translating. SET2 is extended to length of\n\
381 SET1 by repeating its last character as necessary. Excess characters\n\
382 of SET2 are ignored. Only [:lower:] and [:upper:] are guaranteed to\n\
383 expand in ascending order; used in SET2 while translating, they may\n\
384 only be used in pairs to specify case conversion. -s uses SET1 if not\n\
385 translating nor deleting; else squeezing uses SET2 and occurs after\n\
386 translation or deletion.\n\
392 /* Return non-zero if the character C is a member of the
393 equivalence class containing the character EQUIV_CLASS. */
396 is_equiv_class_member (equiv_class
, c
)
397 unsigned int equiv_class
;
400 return (equiv_class
== c
);
403 /* Return non-zero if the character C is a member of the
404 character class CHAR_CLASS. */
407 is_char_class_member (char_class
, c
)
408 enum Char_class char_class
;
416 result
= ISALNUM (c
);
419 result
= ISALPHA (c
);
422 result
= ISBLANK (c
);
425 result
= ISCNTRL (c
);
428 result
= ISDIGIT (c
);
431 result
= ISGRAPH (c
);
434 result
= ISLOWER (c
);
437 result
= ISPRINT (c
);
440 result
= ISPUNCT (c
);
443 result
= ISSPACE (c
);
446 result
= ISUPPER (c
);
449 result
= ISXDIGIT (c
);
458 /* Perform the first pass over each range-spec argument S, converting all
459 \c and \ddd escapes to their one-byte representations. The conversion
460 is done in-place, so S must point to writable storage. If an invalid
461 quote sequence is found print an error message and return non-zero.
462 Otherwise set *LEN to the length of the resulting string and return
463 zero. The resulting array of characters may contain zero-bytes;
464 however, on input, S is assumed to be null-terminated, and hence
465 cannot contain actual (non-escaped) zero bytes. */
475 for (i
= 0; s
[i
]; i
++)
517 oct_digit
= s
[i
+ 2] - '0';
518 if (0 <= oct_digit
&& oct_digit
<= 7)
520 c
= 8 * c
+ oct_digit
;
522 oct_digit
= s
[i
+ 2] - '0';
523 if (0 <= oct_digit
&& oct_digit
<= 7)
525 if (8 * c
+ oct_digit
< N_CHARS
)
527 c
= 8 * c
+ oct_digit
;
530 else if (!posix_pedantic
)
532 /* Any octal number larger than 0377 won't
533 fit in 8 bits. So we stop when adding the
534 next digit would put us over the limit and
535 give a warning about the ambiguity. POSIX
536 isn't clear on this, but one person has said
537 that in his interpretation, POSIX says tr
538 can't even give a warning. */
539 error (0, 0, "warning: the ambiguous octal escape \
540 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'",
541 s
[i
], s
[i
+ 1], s
[i
+ 2],
542 s
[i
], s
[i
+ 1], s
[i
+ 2]);
548 error (0, 0, "invalid backslash escape at end of string");
552 error (0, 0, "invalid backslash escape `\\%c'", s
[i
+ 1]);
568 /* If CLASS_STR is a valid character class string, return its index
569 in the global char_class_name array. Otherwise, return CC_NO_CLASS. */
571 static enum Char_class
572 look_up_char_class (class_str
, len
)
573 const unsigned char *class_str
;
578 for (i
= 0; i
< N_CHAR_CLASSES
; i
++)
579 if (strncmp ((const char *) class_str
, char_class_name
[i
], len
) == 0
580 && strlen (char_class_name
[i
]) == len
)
581 return (enum Char_class
) i
;
585 /* Return a newly allocated string with a printable version of C.
586 This function is used solely for formatting error messages. */
589 make_printable_char (c
)
592 char *buf
= xmalloc (5);
594 assert (c
< N_CHARS
);
602 sprintf (buf
, "\\%03o", c
);
607 /* Return a newly allocated copy of S which is suitable for printing.
608 LEN is the number of characters in S. Most non-printing
609 (isprint) characters are represented by a backslash followed by
610 3 octal digits. However, the characters represented by \c escapes
611 where c is one of [abfnrtv] are represented by their 2-character \c
612 sequences. This function is used solely for printing error messages. */
615 make_printable_str (s
, len
)
616 const unsigned char *s
;
619 /* Worst case is that every character expands to a backslash
620 followed by a 3-character octal escape sequence. */
621 char *printable_buf
= xmalloc (4 * len
+ 1);
622 char *p
= printable_buf
;
625 for (i
= 0; i
< len
; i
++)
663 sprintf (buf
, "\\%03o", s
[i
]);
669 return printable_buf
;
672 /* Append a newly allocated structure representing a
673 character C to the specification list LIST. */
676 append_normal_char (list
, c
)
677 struct Spec_list
*list
;
680 struct List_element
*new;
682 new = (struct List_element
*) xmalloc (sizeof (struct List_element
));
684 new->type
= RE_NORMAL_CHAR
;
685 new->u
.normal_char
= c
;
687 list
->tail
->next
= new;
691 /* Append a newly allocated structure representing the range
692 of characters from FIRST to LAST to the specification list LIST.
693 Return non-zero if LAST precedes FIRST in the collating sequence,
694 zero otherwise. This means that '[c-c]' is acceptable. */
697 append_range (list
, first
, last
)
698 struct Spec_list
*list
;
702 struct List_element
*new;
704 if (ORD (first
) > ORD (last
))
706 char *tmp1
= make_printable_char (first
);
707 char *tmp2
= make_printable_char (last
);
710 "range-endpoints of `%s-%s' are in reverse collating sequence order",
716 new = (struct List_element
*) xmalloc (sizeof (struct List_element
));
718 new->type
= RE_RANGE
;
719 new->u
.range
.first_char
= first
;
720 new->u
.range
.last_char
= last
;
722 list
->tail
->next
= new;
727 /* If CHAR_CLASS_STR is a valid character class string, append a
728 newly allocated structure representing that character class to the end
729 of the specification list LIST and return 0. If CHAR_CLASS_STR is not
730 a valid string, print an error message and return non-zero. */
733 append_char_class (list
, char_class_str
, len
)
734 struct Spec_list
*list
;
735 const unsigned char *char_class_str
;
738 enum Char_class char_class
;
739 struct List_element
*new;
741 char_class
= look_up_char_class (char_class_str
, len
);
742 if (char_class
== CC_NO_CLASS
)
744 char *tmp
= make_printable_str (char_class_str
, len
);
746 error (0, 0, "invalid character class `%s'", tmp
);
750 new = (struct List_element
*) xmalloc (sizeof (struct List_element
));
752 new->type
= RE_CHAR_CLASS
;
753 new->u
.char_class
= char_class
;
755 list
->tail
->next
= new;
760 /* Append a newly allocated structure representing a [c*n]
761 repeated character construct to the specification list LIST.
762 THE_CHAR is the single character to be repeated, and REPEAT_COUNT
763 is a non-negative repeat count. */
766 append_repeated_char (list
, the_char
, repeat_count
)
767 struct Spec_list
*list
;
768 unsigned int the_char
;
771 struct List_element
*new;
773 new = (struct List_element
*) xmalloc (sizeof (struct List_element
));
775 new->type
= RE_REPEATED_CHAR
;
776 new->u
.repeated_char
.the_repeated_char
= the_char
;
777 new->u
.repeated_char
.repeat_count
= repeat_count
;
779 list
->tail
->next
= new;
783 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
784 the length of that string, LEN, if LEN is exactly one, append
785 a newly allocated structure representing the specified
786 equivalence class to the specification list, LIST and return zero.
787 If LEN is not 1, issue an error message and return non-zero. */
790 append_equiv_class (list
, equiv_class_str
, len
)
791 struct Spec_list
*list
;
792 const unsigned char *equiv_class_str
;
795 struct List_element
*new;
799 char *tmp
= make_printable_str (equiv_class_str
, len
);
801 error (0, 0, "%s: equivalence class operand must be a single character",
806 new = (struct List_element
*) xmalloc (sizeof (struct List_element
));
808 new->type
= RE_EQUIV_CLASS
;
809 new->u
.equiv_code
= *equiv_class_str
;
811 list
->tail
->next
= new;
816 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
817 The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
818 NUL bytes, and is *not* NUL-terminated. */
820 static unsigned char *
821 substr (p
, first_idx
, last_idx
)
822 const unsigned char *p
;
829 assert (first_idx
<= last_idx
);
830 len
= last_idx
- first_idx
+ 1;
831 tmp
= (unsigned char *) xmalloc (len
);
833 assert (first_idx
<= last_idx
);
834 /* Use memcpy rather than strncpy because `p' may contain zero-bytes. */
835 memcpy (tmp
, p
+ first_idx
, len
);
839 /* Search forward starting at START_IDX for the 2-char sequence
840 (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such
841 a sequence is found, return the index of the first character,
842 otherwise return -1. P may contain zero bytes. */
845 find_closing_delim (p
, start_idx
, p_len
, pre_bracket_char
)
846 const unsigned char *p
;
849 unsigned int pre_bracket_char
;
853 for (i
= start_idx
; i
< p_len
- 1; i
++)
854 if (p
[i
] == pre_bracket_char
&& p
[i
+ 1] == ']')
859 /* Convert a string S with explicit length LEN, possibly
860 containing embedded zero bytes, to a long integer value.
861 If the string represents a negative value, a value larger
862 than LONG_MAX, or if all LEN characters do not represent a
863 valid integer, return non-zero and do not modify *VAL.
864 Otherwise, return zero and set *VAL to the converted value. */
867 non_neg_strtol (s
, len
, val
)
868 const unsigned char *s
;
873 unsigned long sum
= 0;
880 else if (ISDIGIT (s
[0]))
885 for (i
= 0; i
< len
; i
++)
896 if (sum
> (LONG_MAX
- c
) / base
)
898 sum
= sum
* base
+ c
;
904 /* Parse the bracketed repeat-char syntax. If the P_LEN characters
905 beginning with P[ START_IDX ] comprise a valid [c*n] construct,
906 return the character and the repeat count through the arg pointers,
907 CHAR_TO_REPEAT and N, and then return the index of the closing
908 bracket as the function value. If the second character following
909 the opening bracket is not `*' or if no closing bracket can be
910 found, return -1. If a closing bracket is found and the
911 second char is `*', but the string between the `*' and `]' isn't
912 empty, an octal number, or a decimal number, print an error message
916 find_bracketed_repeat (p
, start_idx
, p_len
, char_to_repeat
, n
)
917 const unsigned char *p
;
920 unsigned int *char_to_repeat
;
925 assert (start_idx
+ 1 < p_len
);
926 if (p
[start_idx
+ 1] != '*')
929 for (i
= start_idx
+ 2; i
< p_len
; i
++)
933 const unsigned char *digit_str
;
934 size_t digit_str_len
= i
- start_idx
- 2;
936 *char_to_repeat
= p
[start_idx
];
937 if (digit_str_len
== 0)
939 /* We've matched [c*] -- no explicit repeat count. */
944 /* Here, we have found [c*s] where s should be a string
945 of octal or decimal digits. */
946 digit_str
= &p
[start_idx
+ 2];
947 if (non_neg_strtol (digit_str
, digit_str_len
, n
))
949 char *tmp
= make_printable_str (digit_str
, digit_str_len
);
950 error (0, 0, "invalid repeat count `%s' in [c*n] construct", tmp
);
957 return -1; /* No bracket found. */
960 /* Convert string UNESACPED_STRING (which has been preprocessed to
961 convert backslash-escape sequences) of length LEN characters into
962 a linked list of the following 5 types of constructs:
963 - [:str:] Character class where `str' is one of the 12 valid strings.
964 - [=c=] Equivalence class where `c' is any single character.
965 - [c*n] Repeat the single character `c' `n' times. n may be omitted.
966 However, if `n' is present, it must be a non-negative octal or
968 - r-s Range of characters from `r' to `s'. The second endpoint must
969 not precede the first in the current collating sequence.
970 - c Any other character is interpreted as itself. */
973 build_spec_list (unescaped_string
, len
, result
)
974 const unsigned char *unescaped_string
;
976 struct Spec_list
*result
;
978 const unsigned char *p
;
981 p
= unescaped_string
;
983 /* The main for-loop below recognizes the 4 multi-character constructs.
984 A character that matches (in its context) none of the multi-character
985 constructs is classified as `normal'. Since all multi-character
986 constructs have at least 3 characters, any strings of length 2 or
987 less are composed solely of normal characters. Hence, the index of
988 the outer for-loop runs only as far as LEN-2. */
990 for (i
= 0; i
+ 2 < len
;)
999 int closing_delim_idx
;
1000 int closing_bracket_idx
;
1001 unsigned int char_to_repeat
;
1002 size_t repeat_count
;
1005 closing_delim_idx
= find_closing_delim (p
, i
+ 2, len
, p
[i
+ 1]);
1006 if (closing_delim_idx
>= 0)
1009 unsigned char *opnd_str
= substr (p
, i
+ 2,
1010 closing_delim_idx
- 1);
1011 if (p
[i
+ 1] == ':')
1012 parse_failed
= append_char_class (result
, opnd_str
,
1013 (closing_delim_idx
- 1) - (i
+ 2) + 1);
1015 parse_failed
= append_equiv_class (result
, opnd_str
,
1016 (closing_delim_idx
- 1) - (i
+ 2) + 1);
1019 /* Return non-zero if append_*_class reports a problem. */
1023 i
= closing_delim_idx
+ 2;
1026 /* Else fall through. This could be [:*] or [=*]. */
1028 /* Determine whether this is a bracketed repeat range
1029 matching the RE \[.\*(dec_or_oct_number)?\]. */
1030 closing_bracket_idx
= find_bracketed_repeat (p
, i
+ 1,
1031 len
, &char_to_repeat
, &repeat_count
);
1032 if (closing_bracket_idx
>= 0)
1034 append_repeated_char (result
, char_to_repeat
, repeat_count
);
1035 i
= closing_bracket_idx
+ 1;
1038 else if (closing_bracket_idx
== -1)
1043 /* Found a string that looked like [c*n] but the
1044 numeric part was invalid. */
1051 /* Here if we've tried to match [c*n], [:str:], and [=c=]
1052 and none of them fit. So we still have to consider the
1053 range `[-c' (from `[' to `c'). */
1055 /* Look ahead one char for ranges like a-z. */
1056 if (p
[i
+ 1] == '-')
1058 if (append_range (result
, p
[i
], p
[i
+ 2]))
1064 append_normal_char (result
, p
[i
]);
1071 /* Now handle the (2 or fewer) remaining characters p[i]..p[len - 1]. */
1072 for (; i
< len
; i
++)
1073 append_normal_char (result
, p
[i
]);
1078 /* Given a Spec_list S (with its saved state implicit in the values
1079 of its members `tail' and `state'), return the next single character
1080 in the expansion of S's constructs. If the last character of S was
1081 returned on the previous call or if S was empty, this function
1082 returns -1. For example, successive calls to get_next where S
1083 represents the spec-string 'a-d[y*3]' will return the sequence
1084 of values a, b, c, d, y, y, y, -1. Finally, if the construct from
1085 which the returned character comes is [:upper:] or [:lower:], the
1086 parameter CLASS is given a value to indicate which it was. Otherwise
1087 CLASS is set to UL_NONE. This value is used only when constructing
1088 the translation table to verify that any occurrences of upper and
1089 lower class constructs in the spec-strings appear in the same relative
1094 struct Spec_list
*s
;
1095 enum Upper_Lower_class
*class;
1097 struct List_element
*p
;
1104 if (s
->state
== BEGIN_STATE
)
1106 s
->tail
= s
->head
->next
;
1107 s
->state
= NEW_ELEMENT
;
1116 case RE_NORMAL_CHAR
:
1117 return_val
= p
->u
.normal_char
;
1118 s
->state
= NEW_ELEMENT
;
1123 if (s
->state
== NEW_ELEMENT
)
1124 s
->state
= ORD (p
->u
.range
.first_char
);
1127 return_val
= CHR (s
->state
);
1128 if (s
->state
== ORD (p
->u
.range
.last_char
))
1131 s
->state
= NEW_ELEMENT
;
1136 if (s
->state
== NEW_ELEMENT
)
1138 for (i
= 0; i
< N_CHARS
; i
++)
1139 if (is_char_class_member (p
->u
.char_class
, i
))
1141 assert (i
< N_CHARS
);
1144 assert (is_char_class_member (p
->u
.char_class
, s
->state
));
1145 return_val
= CHR (s
->state
);
1146 for (i
= s
->state
+ 1; i
< N_CHARS
; i
++)
1147 if (is_char_class_member (p
->u
.char_class
, i
))
1154 s
->state
= NEW_ELEMENT
;
1158 switch (p
->u
.char_class
)
1173 case RE_EQUIV_CLASS
:
1174 /* FIXME: this assumes that each character is alone in its own
1175 equivalence class (which appears to be correct for my
1176 LC_COLLATE. But I don't know of any function that allows
1177 one to determine a character's equivalence class. */
1179 return_val
= p
->u
.equiv_code
;
1180 s
->state
= NEW_ELEMENT
;
1184 case RE_REPEATED_CHAR
:
1185 /* Here, a repeat count of n == 0 means don't repeat at all. */
1186 if (p
->u
.repeated_char
.repeat_count
== 0)
1189 s
->state
= NEW_ELEMENT
;
1190 return_val
= get_next (s
, class);
1194 if (s
->state
== NEW_ELEMENT
)
1199 return_val
= p
->u
.repeated_char
.the_repeated_char
;
1200 if (p
->u
.repeated_char
.repeat_count
> 0
1201 && s
->state
== p
->u
.repeated_char
.repeat_count
)
1204 s
->state
= NEW_ELEMENT
;
1221 /* This is a minor kludge. This function is called from
1222 get_spec_stats to determine the cardinality of a set derived
1223 from a complemented string. It's a kludge in that some of the
1224 same operations are (duplicated) performed in set_initialize. */
1227 card_of_complement (s
)
1228 struct Spec_list
*s
;
1231 int cardinality
= N_CHARS
;
1232 SET_TYPE in_set
[N_CHARS
];
1234 memset (in_set
, 0, N_CHARS
* sizeof (in_set
[0]));
1235 s
->state
= BEGIN_STATE
;
1236 while ((c
= get_next (s
, NULL
)) != -1)
1242 /* Gather statistics about the spec-list S in preparation for the tests
1243 in validate that determine the consistency of the specs. This function
1244 is called at most twice; once for string1, and again for any string2.
1245 LEN_S1 < 0 indicates that this is the first call and that S represents
1246 string1. When LEN_S1 >= 0, it is the length of the expansion of the
1247 constructs in string1, and we can use its value to resolve any
1248 indefinite repeat construct in S (which represents string2). Hence,
1249 this function has the side-effect that it converts a valid [c*]
1250 construct in string2 to [c*n] where n is large enough (or 0) to give
1251 string2 the same length as string1. For example, with the command
1252 tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1253 be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
1257 struct Spec_list
*s
;
1259 struct List_element
*p
;
1262 s
->n_indefinite_repeats
= 0;
1263 s
->has_equiv_class
= 0;
1264 s
->has_restricted_char_class
= 0;
1265 s
->has_upper_or_lower
= 0;
1266 for (p
= s
->head
->next
; p
; p
= p
->next
)
1271 case RE_NORMAL_CHAR
:
1276 assert (p
->u
.range
.last_char
>= p
->u
.range
.first_char
);
1277 len
+= p
->u
.range
.last_char
- p
->u
.range
.first_char
+ 1;
1281 for (i
= 0; i
< N_CHARS
; i
++)
1282 if (is_char_class_member (p
->u
.char_class
, i
))
1284 switch (p
->u
.char_class
)
1288 s
->has_upper_or_lower
= 1;
1291 s
->has_restricted_char_class
= 1;
1296 case RE_EQUIV_CLASS
:
1297 for (i
= 0; i
< N_CHARS
; i
++)
1298 if (is_equiv_class_member (p
->u
.equiv_code
, i
))
1300 s
->has_equiv_class
= 1;
1303 case RE_REPEATED_CHAR
:
1304 if (p
->u
.repeated_char
.repeat_count
> 0)
1305 len
+= p
->u
.repeated_char
.repeat_count
;
1306 else if (p
->u
.repeated_char
.repeat_count
== 0)
1308 s
->indefinite_repeat_element
= p
;
1309 ++(s
->n_indefinite_repeats
);
1323 get_s1_spec_stats (s1
)
1324 struct Spec_list
*s1
;
1326 get_spec_stats (s1
);
1328 s1
->length
= card_of_complement (s1
);
1332 get_s2_spec_stats (s2
, len_s1
)
1333 struct Spec_list
*s2
;
1336 get_spec_stats (s2
);
1337 if (len_s1
>= s2
->length
&& s2
->n_indefinite_repeats
== 1)
1339 s2
->indefinite_repeat_element
->u
.repeated_char
.repeat_count
=
1340 len_s1
- s2
->length
;
1341 s2
->length
= len_s1
;
1346 spec_init (spec_list
)
1347 struct Spec_list
*spec_list
;
1349 spec_list
->head
= spec_list
->tail
=
1350 (struct List_element
*) xmalloc (sizeof (struct List_element
));
1351 spec_list
->head
->next
= NULL
;
1354 /* This function makes two passes over the argument string S. The first
1355 one converts all \c and \ddd escapes to their one-byte representations.
1356 The second constructs a linked specification list, SPEC_LIST, of the
1357 characters and constructs that comprise the argument string. If either
1358 of these passes detects an error, this function returns non-zero. */
1361 parse_str (s
, spec_list
)
1363 struct Spec_list
*spec_list
;
1367 if (unquote (s
, &len
))
1369 if (build_spec_list (s
, len
, spec_list
))
1374 /* Given two specification lists, S1 and S2, and assuming that
1375 S1->length > S2->length, append a single [c*n] element to S2 where c
1376 is the last character in the expansion of S2 and n is the difference
1377 between the two lengths.
1378 Upon successful completion, S2->length is set to S1->length. The only
1379 way this function can fail to make S2 as long as S1 is when S2 has
1380 zero-length, since in that case, there is no last character to repeat.
1381 So S2->length is required to be at least 1.
1383 Providing this functionality allows the user to do some pretty
1384 non-BSD (and non-portable) things: For example, the command
1385 tr -cs '[:upper:]0-9' '[:lower:]'
1386 is almost guaranteed to give results that depend on your collating
1390 string2_extend (s1
, s2
)
1391 const struct Spec_list
*s1
;
1392 struct Spec_list
*s2
;
1394 struct List_element
*p
;
1398 assert (translating
);
1399 assert (s1
->length
> s2
->length
);
1400 assert (s2
->length
> 0);
1405 case RE_NORMAL_CHAR
:
1406 char_to_repeat
= p
->u
.normal_char
;
1409 char_to_repeat
= p
->u
.range
.last_char
;
1412 for (i
= N_CHARS
; i
>= 0; i
--)
1413 if (is_char_class_member (p
->u
.char_class
, i
))
1416 char_to_repeat
= CHR (i
);
1419 case RE_REPEATED_CHAR
:
1420 char_to_repeat
= p
->u
.repeated_char
.the_repeated_char
;
1423 case RE_EQUIV_CLASS
:
1424 /* This shouldn't happen, because validate exits with an error
1425 if it finds an equiv class in string2 when translating. */
1438 append_repeated_char (s2
, char_to_repeat
, s1
->length
- s2
->length
);
1439 s2
->length
= s1
->length
;
1442 /* Die with an error message if S1 and S2 describe strings that
1443 are not valid with the given command line switches.
1444 A side effect of this function is that if a valid [c*] or
1445 [c*0] construct appears in string2, it is converted to [c*n]
1446 with a value for n that makes s2->length == s1->length. By
1447 the same token, if the --truncate-set1 option is not
1448 given, S2 may be extended. */
1452 const struct Spec_list
*s1
;
1453 struct Spec_list
*s2
;
1455 get_s1_spec_stats (s1
);
1456 if (s1
->n_indefinite_repeats
> 0)
1458 error (1, 0, "the [c*] repeat construct may not appear in string1");
1461 /* FIXME: it isn't clear from the POSIX spec that this is invalid,
1462 but in the spirit of the other restrictions put on translation
1463 with character classes, this seems a logical interpretation. */
1464 if (complement
&& s1
->has_upper_or_lower
)
1467 "character classes may not be used when translating \
1468 and complementing");
1473 get_s2_spec_stats (s2
, s1
->length
);
1474 if (s2
->has_restricted_char_class
)
1477 "when translating, the only character classes that may \
1478 appear in\n\tstring2 are `upper' and `lower'");
1481 if (s2
->n_indefinite_repeats
> 1)
1483 error (1, 0, "only one [c*] repeat construct may appear in string2");
1488 if (s2
->has_equiv_class
)
1491 "[=c=] expressions may not appear in string2 \
1495 if (s1
->length
> s2
->length
)
1499 /* string2 must be non-empty unless --truncate-set1 is
1500 given or string1 is empty. */
1502 if (s2
->length
== 0)
1504 "when not truncating set1, string2 must be non-empty");
1505 string2_extend (s1
, s2
);
1509 if (complement
&& s2
->has_upper_or_lower
)
1511 "character classes may not be used when translating \
1512 and complementing");
1515 /* Not translating. */
1517 if (s2
->n_indefinite_repeats
> 0)
1519 "the [c*] construct may appear in string2 only \
1525 /* Read buffers of SIZE bytes via the function READER (if READER is
1526 NULL, read from stdin) until EOF. When non-NULL, READER is either
1527 read_and_delete or read_and_xlate. After each buffer is read, it is
1528 processed and written to stdout. The buffers are processed so that
1529 multiple consecutive occurrences of the same character in the input
1530 stream are replaced by a single occurrence of that character if the
1531 character is in the squeeze set. */
1534 squeeze_filter (buf
, size
, reader
)
1539 unsigned int char_to_squeeze
= NOT_A_CHAR
;
1550 nr
= safe_read (0, (char *) buf
, size
);
1552 nr
= (*reader
) (buf
, size
, NULL
);
1555 error (1, errno
, "read error");
1563 if (char_to_squeeze
== NOT_A_CHAR
)
1566 /* Here, by being a little tricky, we can get a significant
1567 performance increase in most cases when the input is
1568 reasonably large. Since tr will modify the input only
1569 if two consecutive (and identical) input characters are
1570 in the squeeze set, we can step by two through the data
1571 when searching for a character in the squeeze set. This
1572 means there may be a little more work in a few cases and
1573 perhaps twice as much work in the worst cases where most
1574 of the input is removed by squeezing repeats. But most
1575 uses of this functionality seem to remove less than 20-30%
1577 for (; i
< nr
&& !in_squeeze_set
[buf
[i
]]; i
+= 2)
1580 /* There is a special case when i == nr and we've just
1581 skipped a character (the last one in buf) that is in
1583 if (i
== nr
&& in_squeeze_set
[buf
[i
- 1]])
1587 out_len
= nr
- begin
;
1590 char_to_squeeze
= buf
[i
];
1591 /* We're about to output buf[begin..i]. */
1592 out_len
= i
- begin
+ 1;
1594 /* But since we stepped by 2 in the loop above,
1595 out_len may be one too large. */
1596 if (i
> 0 && buf
[i
- 1] == char_to_squeeze
)
1599 /* Advance i to the index of first character to be
1600 considered when looking for a char different from
1605 && fwrite ((char *) &buf
[begin
], 1, out_len
, stdout
) == 0)
1606 error (1, errno
, "write error");
1609 if (char_to_squeeze
!= NOT_A_CHAR
)
1611 /* Advance i to index of first char != char_to_squeeze
1612 (or to nr if all the rest of the characters in this
1613 buffer are the same as char_to_squeeze). */
1614 for (; i
< nr
&& buf
[i
] == char_to_squeeze
; i
++)
1617 char_to_squeeze
= NOT_A_CHAR
;
1618 /* If (i >= nr) we've squeezed the last character in this buffer.
1619 So now we have to read a new buffer and continue comparing
1620 characters against char_to_squeeze. */
1625 /* Read buffers of SIZE bytes from stdin until one is found that
1626 contains at least one character not in the delete set. Store
1627 in the array BUF, all characters from that buffer that are not
1628 in the delete set, and return the number of characters saved
1632 read_and_delete (buf
, size
, not_used
)
1638 static int hit_eof
= 0;
1640 assert (not_used
== NULL
);
1646 /* This enclosing do-while loop is to make sure that
1647 we don't return zero (indicating EOF) when we've
1648 just deleted all the characters in a buffer. */
1652 int nr
= safe_read (0, (char *) buf
, size
);
1655 error (1, errno
, "read error");
1662 /* This first loop may be a waste of code, but gives much
1663 better performance when no characters are deleted in
1664 the beginning of a buffer. It just avoids the copying
1665 of buf[i] into buf[n_saved] when it would be a NOP. */
1667 for (i
= 0; i
< nr
&& !in_delete_set
[buf
[i
]]; i
++)
1671 for (++i
; i
< nr
; i
++)
1672 if (!in_delete_set
[buf
[i
]])
1673 buf
[n_saved
++] = buf
[i
];
1675 while (n_saved
== 0);
1680 /* Read at most SIZE bytes from stdin into the array BUF. Then
1681 perform the in-place and one-to-one mapping specified by the global
1682 array `xlate'. Return the number of characters read, or 0 upon EOF. */
1685 read_and_xlate (buf
, size
, not_used
)
1690 long chars_read
= 0;
1691 static int hit_eof
= 0;
1694 assert (not_used
== NULL
);
1700 chars_read
= safe_read (0, (char *) buf
, size
);
1702 error (1, errno
, "read error");
1703 if (chars_read
== 0)
1709 for (i
= 0; i
< chars_read
; i
++)
1710 buf
[i
] = xlate
[buf
[i
]];
1715 /* Initialize a boolean membership set IN_SET with the character
1716 values obtained by traversing the linked list of constructs S
1717 using the function `get_next'. If COMPLEMENT_THIS_SET is
1718 non-zero the resulting set is complemented. */
1721 set_initialize (s
, complement_this_set
, in_set
)
1722 struct Spec_list
*s
;
1723 int complement_this_set
;
1729 memset (in_set
, 0, N_CHARS
* sizeof (in_set
[0]));
1730 s
->state
= BEGIN_STATE
;
1731 while ((c
= get_next (s
, NULL
)) != -1)
1733 if (complement_this_set
)
1734 for (i
= 0; i
< N_CHARS
; i
++)
1735 in_set
[i
] = (!in_set
[i
]);
1744 int non_option_args
;
1745 struct Spec_list buf1
, buf2
;
1746 struct Spec_list
*s1
= &buf1
;
1747 struct Spec_list
*s2
= &buf2
;
1749 program_name
= argv
[0];
1751 while ((c
= getopt_long (argc
, argv
, "cdst", long_options
,
1768 squeeze_repeats
= 1;
1783 printf ("tr - %s\n", version_string
);
1790 posix_pedantic
= (getenv ("POSIXLY_CORRECT") != NULL
);
1792 non_option_args
= argc
- optind
;
1793 translating
= (non_option_args
== 2 && !delete);
1795 /* Change this test if it is valid to give tr no options and
1796 no args at all. POSIX doesn't specifically say anything
1797 either way, but it looks like they implied it's invalid
1798 by omission. If you want to make tr do a slow imitation
1799 of `cat' use `tr a a'. */
1800 if (non_option_args
> 2)
1802 error (0, 0, "too many arguments");
1806 if (!delete && !squeeze_repeats
&& non_option_args
!= 2)
1807 error (1, 0, "two strings must be given when translating");
1809 if (delete && squeeze_repeats
&& non_option_args
!= 2)
1810 error (1, 0, "two strings must be given when both \
1811 deleting and squeezing repeats");
1813 /* If --delete is given without --squeeze-repeats, then
1814 only one string argument may be specified. But POSIX
1815 says to ignore any string2 in this case, so if POSIXLY_CORRECT
1816 is set, pretend we never saw string2. But I think
1817 this deserves a fatal error, so that's the default. */
1818 if ((delete && !squeeze_repeats
) && non_option_args
!= 1)
1820 if (posix_pedantic
&& non_option_args
== 2)
1824 "only one string may be given when deleting \
1825 without squeezing repeats");
1828 if (squeeze_repeats
&& non_option_args
== 0)
1829 error (1, 0, "at least one string must be given when squeezing repeats");
1832 if (parse_str ((unsigned char *) argv
[optind
], s1
))
1835 if (non_option_args
== 2)
1838 if (parse_str ((unsigned char *) argv
[optind
+ 1], s2
))
1846 if (squeeze_repeats
&& non_option_args
== 1)
1848 set_initialize (s1
, complement
, in_squeeze_set
);
1849 squeeze_filter (io_buf
, IO_BUF_SIZE
, NULL
);
1851 else if (delete && non_option_args
== 1)
1855 set_initialize (s1
, complement
, in_delete_set
);
1858 nr
= read_and_delete (io_buf
, IO_BUF_SIZE
, NULL
);
1859 if (nr
> 0 && fwrite ((char *) io_buf
, 1, nr
, stdout
) == 0)
1860 error (1, errno
, "write error");
1864 else if (squeeze_repeats
&& delete && non_option_args
== 2)
1866 set_initialize (s1
, complement
, in_delete_set
);
1867 set_initialize (s2
, 0, in_squeeze_set
);
1868 squeeze_filter (io_buf
, IO_BUF_SIZE
, (PFI
) read_and_delete
);
1870 else if (translating
)
1875 SET_TYPE
*in_s1
= in_delete_set
;
1877 set_initialize (s1
, 0, in_s1
);
1878 s2
->state
= BEGIN_STATE
;
1879 for (i
= 0; i
< N_CHARS
; i
++)
1881 for (i
= 0; i
< N_CHARS
; i
++)
1885 int ch
= get_next (s2
, NULL
);
1886 assert (ch
!= -1 || truncate_set1
);
1889 /* This will happen when tr is invoked like e.g.
1890 tr -cs A-Za-z0-9 '\012'. */
1896 assert (get_next (s2
, NULL
) == -1 || truncate_set1
);
1902 enum Upper_Lower_class class_s1
;
1903 enum Upper_Lower_class class_s2
;
1905 for (i
= 0; i
< N_CHARS
; i
++)
1907 s1
->state
= BEGIN_STATE
;
1908 s2
->state
= BEGIN_STATE
;
1911 c1
= get_next (s1
, &class_s1
);
1912 c2
= get_next (s2
, &class_s2
);
1913 if (!class_ok
[(int) class_s1
][(int) class_s2
])
1915 "misaligned or mismatched upper and/or lower classes");
1916 /* The following should have been checked by validate... */
1921 assert (c1
== -1 || truncate_set1
);
1923 if (squeeze_repeats
)
1925 set_initialize (s2
, 0, in_squeeze_set
);
1926 squeeze_filter (io_buf
, IO_BUF_SIZE
, (PFI
) read_and_xlate
);
1934 chars_read
= read_and_xlate (io_buf
, IO_BUF_SIZE
, NULL
);
1936 && fwrite ((char *) io_buf
, 1, chars_read
, stdout
) == 0)
1937 error (1, errno
, "write error");
1939 while (chars_read
> 0);
1943 if (fclose (stdout
) == EOF
)
1944 error (2, errno
, "write error");
1947 error (2, errno
, "standard input");