*** empty log message ***
[coreutils.git] / src / tr.c
blobdc3332b7fb74534f898f46fd4d7abda4f512d011
1 /* tr -- a filter to translate characters
2 Copyright (C) 91, 1995-1998, 1999 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 /* Written by Jim Meyering */
20 #include <config.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <errno.h>
25 #include <sys/types.h>
26 #include <getopt.h>
28 #include "system.h"
29 #include "error.h"
30 #include "safe-read.h"
32 /* The official name of this program (e.g., no `g' prefix). */
33 #define PROGRAM_NAME "tr"
35 #define AUTHORS "Jim Meyering"
37 #define N_CHARS (UCHAR_MAX + 1)
39 /* A pointer to a function that returns an int. */
40 typedef int (*PFI) ();
42 /* Convert from character C to its index in the collating
43 sequence array. Just cast to an unsigned int to avoid
44 problems with sign-extension. */
45 #define ORD(c) (unsigned int)(c)
47 /* The inverse of ORD. */
48 #define CHR(i) (unsigned char)(i)
50 /* The value for Spec_list->state that indicates to
51 get_next that it should initialize the tail pointer.
52 Its value should be as large as possible to avoid conflict
53 a valid value for the state field -- and that may be as
54 large as any valid repeat_count. */
55 #define BEGIN_STATE (INT_MAX - 1)
57 /* The value for Spec_list->state that indicates to
58 get_next that the element pointed to by Spec_list->tail is
59 being considered for the first time on this pass through the
60 list -- it indicates that get_next should make any necessary
61 initializations. */
62 #define NEW_ELEMENT (BEGIN_STATE + 1)
64 /* A value distinct from any character that may have been stored in a
65 buffer as the result of a block-read in the function squeeze_filter. */
66 #define NOT_A_CHAR (unsigned int)(-1)
68 /* The following (but not CC_NO_CLASS) are indices into the array of
69 valid character class strings. */
70 enum Char_class
72 CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
73 CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
74 CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
75 CC_NO_CLASS = 9999
78 /* Character class to which a character (returned by get_next) belonged;
79 but it is set only if the construct from which the character was obtained
80 was one of the character classes [:upper:] or [:lower:]. The value
81 is used only when translating and then, only to make sure that upper
82 and lower class constructs have the same relative positions in string1
83 and string2. */
84 enum Upper_Lower_class
86 UL_LOWER = 0,
87 UL_UPPER = 1,
88 UL_NONE = 2
91 /* A shortcut to ensure that when constructing the translation array,
92 one of the values returned by paired calls to get_next (from s1 and s2)
93 is from [:upper:] and the other is from [:lower:], or neither is from
94 upper or lower. By default, GNU tr permits the identity mappings: from
95 [:upper:] to [:upper:] and [:lower:] to [:lower:]. But when
96 POSIXLY_CORRECT is set, those evoke diagnostics. This array is indexed
97 by values of type enum Upper_Lower_class. */
98 static int const class_ok[3][3] =
100 {1, 1, 0},
101 {1, 1, 0},
102 {0, 0, 1}
105 /* The type of a List_element. See build_spec_list for more details. */
106 enum Range_element_type
108 RE_NO_TYPE = 0,
109 RE_NORMAL_CHAR,
110 RE_RANGE,
111 RE_CHAR_CLASS,
112 RE_EQUIV_CLASS,
113 RE_REPEATED_CHAR
116 /* One construct in one of tr's argument strings.
117 For example, consider the POSIX version of the classic tr command:
118 tr -cs 'a-zA-Z_' '[\n*]'
119 String1 has 3 constructs, two of which are ranges (a-z and A-Z),
120 and a single normal character, `_'. String2 has one construct. */
121 struct List_element
123 enum Range_element_type type;
124 struct List_element *next;
125 union
127 int normal_char;
128 struct /* unnamed */
130 unsigned int first_char;
131 unsigned int last_char;
133 range;
134 enum Char_class char_class;
135 int equiv_code;
136 struct /* unnamed */
138 unsigned int the_repeated_char;
139 size_t repeat_count;
141 repeated_char;
146 /* Each of tr's argument strings is parsed into a form that is easier
147 to work with: a linked list of constructs (struct List_element).
148 Each Spec_list structure also encapsulates various attributes of
149 the corresponding argument string. The attributes are used mainly
150 to verify that the strings are valid in the context of any options
151 specified (like -s, -d, or -c). The main exception is the member
152 `tail', which is first used to construct the list. After construction,
153 it is used by get_next to save its state when traversing the list.
154 The member `state' serves a similar function. */
155 struct Spec_list
157 /* Points to the head of the list of range elements.
158 The first struct is a dummy; its members are never used. */
159 struct List_element *head;
161 /* When appending, points to the last element. When traversing via
162 get_next(), points to the element to process next. Setting
163 Spec_list.state to the value BEGIN_STATE before calling get_next
164 signals get_next to initialize tail to point to head->next. */
165 struct List_element *tail;
167 /* Used to save state between calls to get_next(). */
168 unsigned int state;
170 /* Length, in the sense that length ('a-z[:digit:]123abc')
171 is 42 ( = 26 + 10 + 6). */
172 size_t length;
174 /* The number of [c*] and [c*0] constructs that appear in this spec. */
175 int n_indefinite_repeats;
177 /* If n_indefinite_repeats is nonzero, this points to the List_element
178 corresponding to the last [c*] or [c*0] construct encountered in
179 this spec. Otherwise it is undefined. */
180 struct List_element *indefinite_repeat_element;
182 /* Non-zero if this spec contains at least one equivalence
183 class construct e.g. [=c=]. */
184 int has_equiv_class;
186 /* Non-zero if this spec contains at least one character class
187 construct. E.g. [:digit:]. */
188 int has_char_class;
190 /* Non-zero if this spec contains at least one of the character class
191 constructs (all but upper and lower) that aren't allowed in s2. */
192 int has_restricted_char_class;
195 /* A representation for escaped string1 or string2. As a string is parsed,
196 any backslash-escaped characters (other than octal or \a, \b, \f, \n,
197 etc.) are marked as such in this structure by setting the corresponding
198 entry in the ESCAPED vector. */
199 struct E_string
201 unsigned char *s;
202 int *escaped;
203 size_t len;
206 /* Return nonzero if the Ith character of escaped string ES matches C
207 and is not escaped itself. */
208 #define ES_MATCH(ES, I, C) ((ES)->s[(I)] == (C) && !(ES)->escaped[(I)])
210 /* The name by which this program was run. */
211 char *program_name;
213 /* When nonzero, each sequence in the input of a repeated character
214 (call it c) is replaced (in the output) by a single occurrence of c
215 for every c in the squeeze set. */
216 static int squeeze_repeats = 0;
218 /* When nonzero, removes characters in the delete set from input. */
219 static int delete = 0;
221 /* Use the complement of set1 in place of set1. */
222 static int complement = 0;
224 /* When nonzero, this flag causes GNU tr to provide strict
225 compliance with POSIX draft 1003.2.11.2. The POSIX spec
226 says that when -d is used without -s, string2 (if present)
227 must be ignored. Silently ignoring arguments is a bad idea.
228 The default GNU behavior is to give a usage message and exit.
229 Additionally, when this flag is nonzero, tr prints warnings
230 on stderr if it is being used in a manner that is not portable.
231 Applicable warnings are given by default, but are suppressed
232 if the environment variable `POSIXLY_CORRECT' is set, since
233 being POSIX conformant means we can't issue such messages.
234 Warnings on the following topics are suppressed when this
235 variable is nonzero:
236 1. Ambiguous octal escapes. */
237 static int posix_pedantic;
239 /* When tr is performing translation and string1 is longer than string2,
240 POSIX says that the result is undefined. That gives the implementor
241 of a POSIX conforming version of tr two reasonable choices for the
242 semantics of this case.
244 * The BSD tr pads string2 to the length of string1 by
245 repeating the last character in string2.
247 * System V tr ignores characters in string1 that have no
248 corresponding character in string2. That is, string1 is effectively
249 truncated to the length of string2.
251 When nonzero, this flag causes GNU tr to imitate the behavior
252 of System V tr when translating with string1 longer than string2.
253 The default is to emulate BSD tr. This flag is ignored in modes where
254 no translation is performed. Emulating the System V tr
255 in this exceptional case causes the relatively common BSD idiom:
257 tr -cs A-Za-z0-9 '\012'
259 to break (it would convert only zero bytes, rather than all
260 non-alphanumerics, to newlines).
262 WARNING: This switch does not provide general BSD or System V
263 compatibility. For example, it doesn't disable the interpretation
264 of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
265 some unfortunate coincidence you use such constructs in scripts
266 expecting to use some other version of tr, the scripts will break. */
267 static int truncate_set1 = 0;
269 /* An alias for (!delete && non_option_args == 2).
270 It is set in main and used there and in validate(). */
271 static int translating;
273 #ifndef BUFSIZ
274 # define BUFSIZ 8192
275 #endif
277 #define IO_BUF_SIZE BUFSIZ
278 static unsigned char io_buf[IO_BUF_SIZE];
280 static char const *const char_class_name[] =
282 "alnum", "alpha", "blank", "cntrl", "digit", "graph",
283 "lower", "print", "punct", "space", "upper", "xdigit"
285 #define N_CHAR_CLASSES (sizeof(char_class_name) / sizeof(char_class_name[0]))
287 typedef char SET_TYPE;
289 /* Array of boolean values. A character `c' is a member of the
290 squeeze set if and only if in_squeeze_set[c] is true. The squeeze
291 set is defined by the last (possibly, the only) string argument
292 on the command line when the squeeze option is given. */
293 static SET_TYPE in_squeeze_set[N_CHARS];
295 /* Array of boolean values. A character `c' is a member of the
296 delete set if and only if in_delete_set[c] is true. The delete
297 set is defined by the first (or only) string argument on the
298 command line when the delete option is given. */
299 static SET_TYPE in_delete_set[N_CHARS];
301 /* Array of character values defining the translation (if any) that
302 tr is to perform. Translation is performed only when there are
303 two specification strings and the delete switch is not given. */
304 static char xlate[N_CHARS];
306 static struct option const long_options[] =
308 {"complement", no_argument, NULL, 'c'},
309 {"delete", no_argument, NULL, 'd'},
310 {"squeeze-repeats", no_argument, NULL, 's'},
311 {"truncate-set1", no_argument, NULL, 't'},
312 {GETOPT_HELP_OPTION_DECL},
313 {GETOPT_VERSION_OPTION_DECL},
314 {NULL, 0, NULL, 0}
317 void
318 usage (int status)
320 if (status != 0)
321 fprintf (stderr, _("Try `%s --help' for more information.\n"),
322 program_name);
323 else
325 printf (_("\
326 Usage: %s [OPTION]... SET1 [SET2]\n\
328 program_name);
329 printf (_("\
330 Translate, squeeze, and/or delete characters from standard input,\n\
331 writing to standard output.\n\
333 -c, --complement first complement SET1\n\
334 -d, --delete delete characters in SET1, do not translate\n\
335 -s, --squeeze-repeats replace sequence of characters with one\n\
336 -t, --truncate-set1 first truncate SET1 to length of SET2\n\
337 --help display this help and exit\n\
338 --version output version information and exit\n\
339 "));
340 printf (_("\
342 SETs are specified as strings of characters. Most represent themselves.\n\
343 Interpreted sequences are:\n\
345 \\NNN character with octal value NNN (1 to 3 octal digits)\n\
346 \\\\ backslash\n\
347 \\a audible BEL\n\
348 \\b backspace\n\
349 \\f form feed\n\
350 \\n new line\n\
351 \\r return\n\
352 \\t horizontal tab\n\
353 \\v vertical tab\n\
354 CHAR1-CHAR2 all characters from CHAR1 to CHAR2 in ascending order\n\
355 [CHAR1-CHAR2] same as CHAR1-CHAR2, if both SET1 and SET2 use this\n\
356 [CHAR*] in SET2, copies of CHAR until length of SET1\n\
357 [CHAR*REPEAT] REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
358 [:alnum:] all letters and digits\n\
359 [:alpha:] all letters\n\
360 [:blank:] all horizontal whitespace\n\
361 [:cntrl:] all control characters\n\
362 [:digit:] all digits\n\
363 [:graph:] all printable characters, not including space\n\
364 [:lower:] all lower case letters\n\
365 [:print:] all printable characters, including space\n\
366 [:punct:] all punctuation characters\n\
367 [:space:] all horizontal or vertical whitespace\n\
368 [:upper:] all upper case letters\n\
369 [:xdigit:] all hexadecimal digits\n\
370 [=CHAR=] all characters which are equivalent to CHAR\n\
371 "));
372 printf (_("\
374 Translation occurs if -d is not given and both SET1 and SET2 appear.\n\
375 -t may be used only when translating. SET2 is extended to length of\n\
376 SET1 by repeating its last character as necessary. Excess characters\n\
377 of SET2 are ignored. Only [:lower:] and [:upper:] are guaranteed to\n\
378 expand in ascending order; used in SET2 while translating, they may\n\
379 only be used in pairs to specify case conversion. -s uses SET1 if not\n\
380 translating nor deleting; else squeezing uses SET2 and occurs after\n\
381 translation or deletion.\n\
382 "));
383 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
385 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
388 /* Return nonzero if the character C is a member of the
389 equivalence class containing the character EQUIV_CLASS. */
391 static int
392 is_equiv_class_member (unsigned int equiv_class, unsigned int c)
394 return (equiv_class == c);
397 /* Return nonzero if the character C is a member of the
398 character class CHAR_CLASS. */
400 static int
401 is_char_class_member (enum Char_class char_class, unsigned int c)
403 int result;
405 switch (char_class)
407 case CC_ALNUM:
408 result = ISALNUM (c);
409 break;
410 case CC_ALPHA:
411 result = ISALPHA (c);
412 break;
413 case CC_BLANK:
414 result = ISBLANK (c);
415 break;
416 case CC_CNTRL:
417 result = ISCNTRL (c);
418 break;
419 case CC_DIGIT:
420 result = ISDIGIT_LOCALE (c);
421 break;
422 case CC_GRAPH:
423 result = ISGRAPH (c);
424 break;
425 case CC_LOWER:
426 result = ISLOWER (c);
427 break;
428 case CC_PRINT:
429 result = ISPRINT (c);
430 break;
431 case CC_PUNCT:
432 result = ISPUNCT (c);
433 break;
434 case CC_SPACE:
435 result = ISSPACE (c);
436 break;
437 case CC_UPPER:
438 result = ISUPPER (c);
439 break;
440 case CC_XDIGIT:
441 result = ISXDIGIT (c);
442 break;
443 default:
444 abort ();
445 break;
447 return result;
450 static void
451 es_free (struct E_string *es)
453 free (es->s);
454 free (es->escaped);
457 /* Perform the first pass over each range-spec argument S, converting all
458 \c and \ddd escapes to their one-byte representations. The conversion
459 is done in-place, so S must point to writable storage. If an invalid
460 quote sequence is found print an error message and return nonzero.
461 Otherwise set *LEN to the length of the resulting string and return
462 zero. The resulting array of characters may contain zero-bytes;
463 however, on input, S is assumed to be null-terminated, and hence
464 cannot contain actual (non-escaped) zero bytes. */
466 static int
467 unquote (const unsigned char *s, struct E_string *es)
469 size_t i, j;
470 size_t len;
472 len = strlen ((char *) s);
474 es->s = (unsigned char *) xmalloc (len);
475 es->escaped = (int *) xmalloc (len * sizeof (es->escaped[0]));
476 for (i = 0; i < len; i++)
477 es->escaped[i] = 0;
479 j = 0;
480 for (i = 0; s[i]; i++)
482 switch (s[i])
484 int c;
485 case '\\':
486 switch (s[i + 1])
488 int oct_digit;
489 case '\\':
490 c = '\\';
491 break;
492 case 'a':
493 c = '\007';
494 break;
495 case 'b':
496 c = '\b';
497 break;
498 case 'f':
499 c = '\f';
500 break;
501 case 'n':
502 c = '\n';
503 break;
504 case 'r':
505 c = '\r';
506 break;
507 case 't':
508 c = '\t';
509 break;
510 case 'v':
511 c = '\v';
512 break;
513 case '0':
514 case '1':
515 case '2':
516 case '3':
517 case '4':
518 case '5':
519 case '6':
520 case '7':
521 c = s[i + 1] - '0';
522 oct_digit = s[i + 2] - '0';
523 if (0 <= oct_digit && oct_digit <= 7)
525 c = 8 * c + oct_digit;
526 ++i;
527 oct_digit = s[i + 2] - '0';
528 if (0 <= oct_digit && oct_digit <= 7)
530 if (8 * c + oct_digit < N_CHARS)
532 c = 8 * c + oct_digit;
533 ++i;
535 else if (!posix_pedantic)
537 /* Any octal number larger than 0377 won't
538 fit in 8 bits. So we stop when adding the
539 next digit would put us over the limit and
540 give a warning about the ambiguity. POSIX
541 isn't clear on this, but one person has said
542 that in his interpretation, POSIX says tr
543 can't even give a warning. */
544 error (0, 0, _("warning: the ambiguous octal escape \
545 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, `%c'"),
546 s[i], s[i + 1], s[i + 2],
547 s[i], s[i + 1], s[i + 2]);
551 break;
552 case '\0':
553 error (0, 0, _("invalid backslash escape at end of string"));
554 return 1;
556 default:
557 if (posix_pedantic)
559 error (0, 0, _("invalid backslash escape `\\%c'"), s[i + 1]);
560 return 1;
562 else
564 c = s[i + 1];
565 es->escaped[j] = 1;
568 ++i;
569 es->s[j++] = c;
570 break;
571 default:
572 es->s[j++] = s[i];
573 break;
576 es->len = j;
577 return 0;
580 /* If CLASS_STR is a valid character class string, return its index
581 in the global char_class_name array. Otherwise, return CC_NO_CLASS. */
583 static enum Char_class
584 look_up_char_class (const unsigned char *class_str, size_t len)
586 unsigned int i;
588 for (i = 0; i < N_CHAR_CLASSES; i++)
589 if (strncmp ((const char *) class_str, char_class_name[i], len) == 0
590 && strlen (char_class_name[i]) == len)
591 return (enum Char_class) i;
592 return CC_NO_CLASS;
595 /* Return a newly allocated string with a printable version of C.
596 This function is used solely for formatting error messages. */
598 static char *
599 make_printable_char (unsigned int c)
601 char *buf = xmalloc (5);
603 assert (c < N_CHARS);
604 if (ISPRINT (c))
606 buf[0] = c;
607 buf[1] = '\0';
609 else
611 sprintf (buf, "\\%03o", c);
613 return buf;
616 /* Return a newly allocated copy of S which is suitable for printing.
617 LEN is the number of characters in S. Most non-printing
618 (isprint) characters are represented by a backslash followed by
619 3 octal digits. However, the characters represented by \c escapes
620 where c is one of [abfnrtv] are represented by their 2-character \c
621 sequences. This function is used solely for printing error messages. */
623 static char *
624 make_printable_str (const unsigned char *s, size_t len)
626 /* Worst case is that every character expands to a backslash
627 followed by a 3-character octal escape sequence. */
628 char *printable_buf = xmalloc (4 * len + 1);
629 char *p = printable_buf;
630 size_t i;
632 for (i = 0; i < len; i++)
634 char buf[5];
635 char *tmp = NULL;
637 switch (s[i])
639 case '\\':
640 tmp = "\\";
641 break;
642 case '\007':
643 tmp = "\\a";
644 break;
645 case '\b':
646 tmp = "\\b";
647 break;
648 case '\f':
649 tmp = "\\f";
650 break;
651 case '\n':
652 tmp = "\\n";
653 break;
654 case '\r':
655 tmp = "\\r";
656 break;
657 case '\t':
658 tmp = "\\t";
659 break;
660 case '\v':
661 tmp = "\\v";
662 break;
663 default:
664 if (ISPRINT (s[i]))
666 buf[0] = s[i];
667 buf[1] = '\0';
669 else
670 sprintf (buf, "\\%03o", s[i]);
671 tmp = buf;
672 break;
674 p = stpcpy (p, tmp);
676 return printable_buf;
679 /* Append a newly allocated structure representing a
680 character C to the specification list LIST. */
682 static void
683 append_normal_char (struct Spec_list *list, unsigned int c)
685 struct List_element *new;
687 new = (struct List_element *) xmalloc (sizeof (struct List_element));
688 new->next = NULL;
689 new->type = RE_NORMAL_CHAR;
690 new->u.normal_char = c;
691 assert (list->tail);
692 list->tail->next = new;
693 list->tail = new;
696 /* Append a newly allocated structure representing the range
697 of characters from FIRST to LAST to the specification list LIST.
698 Return nonzero if LAST precedes FIRST in the collating sequence,
699 zero otherwise. This means that '[c-c]' is acceptable. */
701 static int
702 append_range (struct Spec_list *list, unsigned int first, unsigned int last)
704 struct List_element *new;
706 if (ORD (first) > ORD (last))
708 char *tmp1 = make_printable_char (first);
709 char *tmp2 = make_printable_char (last);
711 error (0, 0,
712 _("range-endpoints of `%s-%s' are in reverse collating sequence order"),
713 tmp1, tmp2);
714 free (tmp1);
715 free (tmp2);
716 return 1;
718 new = (struct List_element *) xmalloc (sizeof (struct List_element));
719 new->next = NULL;
720 new->type = RE_RANGE;
721 new->u.range.first_char = first;
722 new->u.range.last_char = last;
723 assert (list->tail);
724 list->tail->next = new;
725 list->tail = new;
726 return 0;
729 /* If CHAR_CLASS_STR is a valid character class string, append a
730 newly allocated structure representing that character class to the end
731 of the specification list LIST and return 0. If CHAR_CLASS_STR is not
732 a valid string return nonzero. */
734 static int
735 append_char_class (struct Spec_list *list,
736 const unsigned char *char_class_str, size_t len)
738 enum Char_class char_class;
739 struct List_element *new;
741 char_class = look_up_char_class (char_class_str, len);
742 if (char_class == CC_NO_CLASS)
743 return 1;
744 new = (struct List_element *) xmalloc (sizeof (struct List_element));
745 new->next = NULL;
746 new->type = RE_CHAR_CLASS;
747 new->u.char_class = char_class;
748 assert (list->tail);
749 list->tail->next = new;
750 list->tail = new;
751 return 0;
754 /* Append a newly allocated structure representing a [c*n]
755 repeated character construct to the specification list LIST.
756 THE_CHAR is the single character to be repeated, and REPEAT_COUNT
757 is a non-negative repeat count. */
759 static void
760 append_repeated_char (struct Spec_list *list, unsigned int the_char,
761 size_t repeat_count)
763 struct List_element *new;
765 new = (struct List_element *) xmalloc (sizeof (struct List_element));
766 new->next = NULL;
767 new->type = RE_REPEATED_CHAR;
768 new->u.repeated_char.the_repeated_char = the_char;
769 new->u.repeated_char.repeat_count = repeat_count;
770 assert (list->tail);
771 list->tail->next = new;
772 list->tail = new;
775 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
776 the length of that string, LEN, if LEN is exactly one, append
777 a newly allocated structure representing the specified
778 equivalence class to the specification list, LIST and return zero.
779 If LEN is not 1, return nonzero. */
781 static int
782 append_equiv_class (struct Spec_list *list,
783 const unsigned char *equiv_class_str, size_t len)
785 struct List_element *new;
787 if (len != 1)
788 return 1;
789 new = (struct List_element *) xmalloc (sizeof (struct List_element));
790 new->next = NULL;
791 new->type = RE_EQUIV_CLASS;
792 new->u.equiv_code = *equiv_class_str;
793 assert (list->tail);
794 list->tail->next = new;
795 list->tail = new;
796 return 0;
799 /* Return a newly allocated copy of the substring P[FIRST_IDX..LAST_IDX].
800 The returned string has length LAST_IDX - FIRST_IDX + 1, may contain
801 NUL bytes, and is *not* NUL-terminated. */
803 static unsigned char *
804 substr (const unsigned char *p, size_t first_idx, size_t last_idx)
806 size_t len;
807 unsigned char *tmp;
809 assert (first_idx <= last_idx);
810 len = last_idx - first_idx + 1;
811 tmp = (unsigned char *) xmalloc (len);
813 assert (first_idx <= last_idx);
814 /* Use memcpy rather than strncpy because `p' may contain zero-bytes. */
815 memcpy (tmp, p + first_idx, len);
816 return tmp;
819 /* Search forward starting at START_IDX for the 2-char sequence
820 (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such
821 a sequence is found, set *RESULT_IDX to the index of the first
822 character and return nonzero. Otherwise return zero. P may contain
823 zero bytes. */
825 static int
826 find_closing_delim (const struct E_string *es, size_t start_idx,
827 unsigned int pre_bracket_char, size_t *result_idx)
829 size_t i;
831 for (i = start_idx; i < es->len - 1; i++)
832 if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
833 && !es->escaped[i] && !es->escaped[i + 1])
835 *result_idx = i;
836 return 1;
838 return 0;
841 /* Convert a string S with explicit length LEN, possibly
842 containing embedded zero bytes, to a long integer value.
843 If the string represents a negative value, a value larger
844 than LONG_MAX, or if all LEN characters do not represent a
845 valid integer, return nonzero and do not modify *VAL.
846 Otherwise, return zero and set *VAL to the converted value. */
848 static int
849 non_neg_strtol (const unsigned char *s, size_t len, size_t *val)
851 size_t i;
852 unsigned long sum = 0;
853 unsigned int base;
855 if (len <= 0)
856 return 1;
857 if (s[0] == '0')
858 base = 8;
859 else if (ISDIGIT (s[0]))
860 base = 10;
861 else
862 return 1;
864 for (i = 0; i < len; i++)
866 unsigned int c;
868 if (s[i] < '0')
869 return 1;
871 c = s[i] - '0';
872 if (c >= base)
873 return 1;
875 if (sum > (LONG_MAX - c) / base)
876 return 1;
877 sum = sum * base + c;
879 *val = sum;
880 return 0;
883 /* Parse the bracketed repeat-char syntax. If the P_LEN characters
884 beginning with P[ START_IDX ] comprise a valid [c*n] construct,
885 then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
886 and return zero. If the second character following
887 the opening bracket is not `*' or if no closing bracket can be
888 found, return -1. If a closing bracket is found and the
889 second char is `*', but the string between the `*' and `]' isn't
890 empty, an octal number, or a decimal number, print an error message
891 and return -2. */
893 static int
894 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
895 unsigned int *char_to_repeat, size_t *repeat_count,
896 size_t *closing_bracket_idx)
898 size_t i;
900 assert (start_idx + 1 < es->len);
901 if (!ES_MATCH (es, start_idx + 1, '*'))
902 return -1;
904 for (i = start_idx + 2; i < es->len; i++)
906 if (ES_MATCH (es, i, ']'))
908 const unsigned char *digit_str;
909 size_t digit_str_len = i - start_idx - 2;
911 *char_to_repeat = es->s[start_idx];
912 if (digit_str_len == 0)
914 /* We've matched [c*] -- no explicit repeat count. */
915 *repeat_count = 0;
916 *closing_bracket_idx = i;
917 return 0;
920 /* Here, we have found [c*s] where s should be a string
921 of octal or decimal digits. */
922 digit_str = &es->s[start_idx + 2];
923 if (non_neg_strtol (digit_str, digit_str_len, repeat_count)
924 || *repeat_count > BEGIN_STATE)
926 char *tmp = make_printable_str (digit_str, digit_str_len);
927 error (0, 0, _("invalid repeat count `%s' in [c*n] construct"),
928 tmp);
929 free (tmp);
930 return -2;
932 *closing_bracket_idx = i;
933 return 0;
936 return -1; /* No bracket found. */
939 /* Return nonzero if the string at ES->s[IDX] matches the regular
940 expression `\*[0-9]*\]', zero otherwise. To match, the `*' and
941 the `]' must not be escaped. */
943 static int
944 star_digits_closebracket (const struct E_string *es, size_t idx)
946 size_t i;
948 if (!ES_MATCH (es, idx, '*'))
949 return 0;
951 for (i = idx + 1; i < es->len; i++)
953 if (!ISDIGIT (es->s[i]))
955 if (ES_MATCH (es, i, ']'))
956 return 1;
957 return 0;
960 return 0;
963 /* Convert string UNESACPED_STRING (which has been preprocessed to
964 convert backslash-escape sequences) of length LEN characters into
965 a linked list of the following 5 types of constructs:
966 - [:str:] Character class where `str' is one of the 12 valid strings.
967 - [=c=] Equivalence class where `c' is any single character.
968 - [c*n] Repeat the single character `c' `n' times. n may be omitted.
969 However, if `n' is present, it must be a non-negative octal or
970 decimal integer.
971 - r-s Range of characters from `r' to `s'. The second endpoint must
972 not precede the first in the current collating sequence.
973 - c Any other character is interpreted as itself. */
975 static int
976 build_spec_list (const struct E_string *es, struct Spec_list *result)
978 const unsigned char *p;
979 size_t i;
981 p = es->s;
983 /* The main for-loop below recognizes the 4 multi-character constructs.
984 A character that matches (in its context) none of the multi-character
985 constructs is classified as `normal'. Since all multi-character
986 constructs have at least 3 characters, any strings of length 2 or
987 less are composed solely of normal characters. Hence, the index of
988 the outer for-loop runs only as far as LEN-2. */
990 for (i = 0; i + 2 < es->len; /* empty */)
992 if (ES_MATCH (es, i, '['))
994 int matched_multi_char_construct;
995 size_t closing_bracket_idx;
996 unsigned int char_to_repeat;
997 size_t repeat_count;
998 int err;
1000 matched_multi_char_construct = 1;
1001 if (ES_MATCH (es, i + 1, ':')
1002 || ES_MATCH (es, i + 1, '='))
1004 size_t closing_delim_idx;
1005 int found;
1007 found = find_closing_delim (es, i + 2, p[i + 1],
1008 &closing_delim_idx);
1009 if (found)
1011 int parse_failed;
1012 unsigned char *opnd_str = substr (p, i + 2,
1013 closing_delim_idx - 1);
1014 size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
1016 if (p[i + 1] == ':')
1018 parse_failed = append_char_class (result, opnd_str,
1019 opnd_str_len);
1021 /* FIXME: big comment. */
1022 if (parse_failed)
1024 if (star_digits_closebracket (es, i + 2))
1026 free (opnd_str);
1027 goto try_bracketed_repeat;
1029 else
1031 char *tmp = make_printable_str (opnd_str,
1032 opnd_str_len);
1033 error (0, 0, _("invalid character class `%s'"),
1034 tmp);
1035 free (tmp);
1036 return 1;
1040 else
1042 parse_failed = append_equiv_class (result, opnd_str,
1043 opnd_str_len);
1045 /* FIXME: big comment. */
1046 if (parse_failed)
1048 if (star_digits_closebracket (es, i + 2))
1050 free (opnd_str);
1051 goto try_bracketed_repeat;
1053 else
1055 char *tmp = make_printable_str (opnd_str,
1056 opnd_str_len);
1057 error (0, 0,
1058 _("%s: equivalence class operand must be a single character"),
1059 tmp);
1060 free (tmp);
1061 return 1;
1065 free (opnd_str);
1067 /* Return nonzero if append_*_class reports a problem. */
1068 if (parse_failed)
1069 return 1;
1070 else
1071 i = closing_delim_idx + 2;
1072 continue;
1074 /* Else fall through. This could be [:*] or [=*]. */
1077 try_bracketed_repeat:
1079 /* Determine whether this is a bracketed repeat range
1080 matching the RE \[.\*(dec_or_oct_number)?\]. */
1081 err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
1082 &repeat_count,
1083 &closing_bracket_idx);
1084 if (err == 0)
1086 append_repeated_char (result, char_to_repeat, repeat_count);
1087 i = closing_bracket_idx + 1;
1089 else if (err == -1)
1091 matched_multi_char_construct = 0;
1093 else
1095 /* Found a string that looked like [c*n] but the
1096 numeric part was invalid. */
1097 return 1;
1100 if (matched_multi_char_construct)
1101 continue;
1103 /* We reach this point if P does not match [:str:], [=c=],
1104 [c*n], or [c*]. Now, see if P looks like a range `[-c'
1105 (from `[' to `c'). */
1108 /* Look ahead one char for ranges like a-z. */
1109 if (ES_MATCH (es, i + 1, '-'))
1111 if (append_range (result, p[i], p[i + 2]))
1112 return 1;
1113 i += 3;
1115 else
1117 append_normal_char (result, p[i]);
1118 ++i;
1122 /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1]. */
1123 for (; i < es->len; i++)
1124 append_normal_char (result, p[i]);
1126 return 0;
1129 /* Given a Spec_list S (with its saved state implicit in the values
1130 of its members `tail' and `state'), return the next single character
1131 in the expansion of S's constructs. If the last character of S was
1132 returned on the previous call or if S was empty, this function
1133 returns -1. For example, successive calls to get_next where S
1134 represents the spec-string 'a-d[y*3]' will return the sequence
1135 of values a, b, c, d, y, y, y, -1. Finally, if the construct from
1136 which the returned character comes is [:upper:] or [:lower:], the
1137 parameter CLASS is given a value to indicate which it was. Otherwise
1138 CLASS is set to UL_NONE. This value is used only when constructing
1139 the translation table to verify that any occurrences of upper and
1140 lower class constructs in the spec-strings appear in the same relative
1141 positions. */
1143 static int
1144 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1146 struct List_element *p;
1147 int return_val;
1148 int i;
1150 if (class)
1151 *class = UL_NONE;
1153 if (s->state == BEGIN_STATE)
1155 s->tail = s->head->next;
1156 s->state = NEW_ELEMENT;
1159 p = s->tail;
1160 if (p == NULL)
1161 return -1;
1163 switch (p->type)
1165 case RE_NORMAL_CHAR:
1166 return_val = p->u.normal_char;
1167 s->state = NEW_ELEMENT;
1168 s->tail = p->next;
1169 break;
1171 case RE_RANGE:
1172 if (s->state == NEW_ELEMENT)
1173 s->state = ORD (p->u.range.first_char);
1174 else
1175 ++(s->state);
1176 return_val = CHR (s->state);
1177 if (s->state == ORD (p->u.range.last_char))
1179 s->tail = p->next;
1180 s->state = NEW_ELEMENT;
1182 break;
1184 case RE_CHAR_CLASS:
1185 if (class)
1187 int upper_or_lower;
1188 switch (p->u.char_class)
1190 case CC_LOWER:
1191 *class = UL_LOWER;
1192 upper_or_lower = 1;
1193 break;
1194 case CC_UPPER:
1195 *class = UL_UPPER;
1196 upper_or_lower = 1;
1197 break;
1198 default:
1199 upper_or_lower = 0;
1200 break;
1203 if (upper_or_lower)
1205 s->tail = p->next;
1206 s->state = NEW_ELEMENT;
1207 return_val = 0;
1208 break;
1212 if (s->state == NEW_ELEMENT)
1214 for (i = 0; i < N_CHARS; i++)
1215 if (is_char_class_member (p->u.char_class, i))
1216 break;
1217 assert (i < N_CHARS);
1218 s->state = i;
1220 assert (is_char_class_member (p->u.char_class, s->state));
1221 return_val = CHR (s->state);
1222 for (i = s->state + 1; i < N_CHARS; i++)
1223 if (is_char_class_member (p->u.char_class, i))
1224 break;
1225 if (i < N_CHARS)
1226 s->state = i;
1227 else
1229 s->tail = p->next;
1230 s->state = NEW_ELEMENT;
1232 break;
1234 case RE_EQUIV_CLASS:
1235 /* FIXME: this assumes that each character is alone in its own
1236 equivalence class (which appears to be correct for my
1237 LC_COLLATE. But I don't know of any function that allows
1238 one to determine a character's equivalence class. */
1240 return_val = p->u.equiv_code;
1241 s->state = NEW_ELEMENT;
1242 s->tail = p->next;
1243 break;
1245 case RE_REPEATED_CHAR:
1246 /* Here, a repeat count of n == 0 means don't repeat at all. */
1247 if (p->u.repeated_char.repeat_count == 0)
1249 s->tail = p->next;
1250 s->state = NEW_ELEMENT;
1251 return_val = get_next (s, class);
1253 else
1255 if (s->state == NEW_ELEMENT)
1257 s->state = 0;
1259 ++(s->state);
1260 return_val = p->u.repeated_char.the_repeated_char;
1261 if (p->u.repeated_char.repeat_count > 0
1262 && s->state == p->u.repeated_char.repeat_count)
1264 s->tail = p->next;
1265 s->state = NEW_ELEMENT;
1268 break;
1270 case RE_NO_TYPE:
1271 abort ();
1272 break;
1274 default:
1275 abort ();
1276 break;
1279 return return_val;
1282 /* This is a minor kludge. This function is called from
1283 get_spec_stats to determine the cardinality of a set derived
1284 from a complemented string. It's a kludge in that some of the
1285 same operations are (duplicated) performed in set_initialize. */
1287 static int
1288 card_of_complement (struct Spec_list *s)
1290 int c;
1291 int cardinality = N_CHARS;
1292 SET_TYPE in_set[N_CHARS];
1294 memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1295 s->state = BEGIN_STATE;
1296 while ((c = get_next (s, NULL)) != -1)
1297 if (!in_set[c]++)
1298 --cardinality;
1299 return cardinality;
1302 /* Gather statistics about the spec-list S in preparation for the tests
1303 in validate that determine the consistency of the specs. This function
1304 is called at most twice; once for string1, and again for any string2.
1305 LEN_S1 < 0 indicates that this is the first call and that S represents
1306 string1. When LEN_S1 >= 0, it is the length of the expansion of the
1307 constructs in string1, and we can use its value to resolve any
1308 indefinite repeat construct in S (which represents string2). Hence,
1309 this function has the side-effect that it converts a valid [c*]
1310 construct in string2 to [c*n] where n is large enough (or 0) to give
1311 string2 the same length as string1. For example, with the command
1312 tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1313 be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
1315 static void
1316 get_spec_stats (struct Spec_list *s)
1318 struct List_element *p;
1319 int len = 0;
1321 s->n_indefinite_repeats = 0;
1322 s->has_equiv_class = 0;
1323 s->has_restricted_char_class = 0;
1324 s->has_char_class = 0;
1325 for (p = s->head->next; p; p = p->next)
1327 switch (p->type)
1329 int i;
1330 case RE_NORMAL_CHAR:
1331 ++len;
1332 break;
1334 case RE_RANGE:
1335 assert (p->u.range.last_char >= p->u.range.first_char);
1336 len += p->u.range.last_char - p->u.range.first_char + 1;
1337 break;
1339 case RE_CHAR_CLASS:
1340 s->has_char_class = 1;
1341 for (i = 0; i < N_CHARS; i++)
1342 if (is_char_class_member (p->u.char_class, i))
1343 ++len;
1344 switch (p->u.char_class)
1346 case CC_UPPER:
1347 case CC_LOWER:
1348 break;
1349 default:
1350 s->has_restricted_char_class = 1;
1351 break;
1353 break;
1355 case RE_EQUIV_CLASS:
1356 for (i = 0; i < N_CHARS; i++)
1357 if (is_equiv_class_member (p->u.equiv_code, i))
1358 ++len;
1359 s->has_equiv_class = 1;
1360 break;
1362 case RE_REPEATED_CHAR:
1363 if (p->u.repeated_char.repeat_count > 0)
1364 len += p->u.repeated_char.repeat_count;
1365 else if (p->u.repeated_char.repeat_count == 0)
1367 s->indefinite_repeat_element = p;
1368 ++(s->n_indefinite_repeats);
1370 break;
1372 case RE_NO_TYPE:
1373 assert (0);
1374 break;
1378 s->length = len;
1381 static void
1382 get_s1_spec_stats (struct Spec_list *s1)
1384 get_spec_stats (s1);
1385 if (complement)
1386 s1->length = card_of_complement (s1);
1389 static void
1390 get_s2_spec_stats (struct Spec_list *s2, size_t len_s1)
1392 get_spec_stats (s2);
1393 if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1395 s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1396 len_s1 - s2->length;
1397 s2->length = len_s1;
1401 static void
1402 spec_init (struct Spec_list *spec_list)
1404 spec_list->head = spec_list->tail =
1405 (struct List_element *) xmalloc (sizeof (struct List_element));
1406 spec_list->head->next = NULL;
1409 /* This function makes two passes over the argument string S. The first
1410 one converts all \c and \ddd escapes to their one-byte representations.
1411 The second constructs a linked specification list, SPEC_LIST, of the
1412 characters and constructs that comprise the argument string. If either
1413 of these passes detects an error, this function returns nonzero. */
1415 static int
1416 parse_str (const unsigned char *s, struct Spec_list *spec_list)
1418 struct E_string es;
1419 int fail;
1421 fail = unquote (s, &es);
1422 if (!fail)
1423 fail = build_spec_list (&es, spec_list);
1424 es_free (&es);
1425 return fail;
1428 /* Given two specification lists, S1 and S2, and assuming that
1429 S1->length > S2->length, append a single [c*n] element to S2 where c
1430 is the last character in the expansion of S2 and n is the difference
1431 between the two lengths.
1432 Upon successful completion, S2->length is set to S1->length. The only
1433 way this function can fail to make S2 as long as S1 is when S2 has
1434 zero-length, since in that case, there is no last character to repeat.
1435 So S2->length is required to be at least 1.
1437 Providing this functionality allows the user to do some pretty
1438 non-BSD (and non-portable) things: For example, the command
1439 tr -cs '[:upper:]0-9' '[:lower:]'
1440 is almost guaranteed to give results that depend on your collating
1441 sequence. */
1443 static void
1444 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1446 struct List_element *p;
1447 int char_to_repeat;
1448 int i;
1450 assert (translating);
1451 assert (s1->length > s2->length);
1452 assert (s2->length > 0);
1454 p = s2->tail;
1455 switch (p->type)
1457 case RE_NORMAL_CHAR:
1458 char_to_repeat = p->u.normal_char;
1459 break;
1460 case RE_RANGE:
1461 char_to_repeat = p->u.range.last_char;
1462 break;
1463 case RE_CHAR_CLASS:
1464 for (i = N_CHARS; i >= 0; i--)
1465 if (is_char_class_member (p->u.char_class, i))
1466 break;
1467 assert (i >= 0);
1468 char_to_repeat = CHR (i);
1469 break;
1471 case RE_REPEATED_CHAR:
1472 char_to_repeat = p->u.repeated_char.the_repeated_char;
1473 break;
1475 case RE_EQUIV_CLASS:
1476 /* This shouldn't happen, because validate exits with an error
1477 if it finds an equiv class in string2 when translating. */
1478 abort ();
1479 break;
1481 case RE_NO_TYPE:
1482 abort ();
1483 break;
1485 default:
1486 abort ();
1487 break;
1490 append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1491 s2->length = s1->length;
1494 /* Return non-zero if S is a non-empty list in which exactly one
1495 character (but potentially, many instances of it) appears.
1496 E.g. [X*] or xxxxxxxx. */
1498 static int
1499 homogeneous_spec_list (struct Spec_list *s)
1501 int b, c;
1503 s->state = BEGIN_STATE;
1505 if ((b = get_next (s, NULL)) == -1)
1506 return 0;
1508 while ((c = get_next (s, NULL)) != -1)
1509 if (c != b)
1510 return 0;
1512 return 1;
1515 /* Die with an error message if S1 and S2 describe strings that
1516 are not valid with the given command line switches.
1517 A side effect of this function is that if a valid [c*] or
1518 [c*0] construct appears in string2, it is converted to [c*n]
1519 with a value for n that makes s2->length == s1->length. By
1520 the same token, if the --truncate-set1 option is not
1521 given, S2 may be extended. */
1523 static void
1524 validate (struct Spec_list *s1, struct Spec_list *s2)
1526 get_s1_spec_stats (s1);
1527 if (s1->n_indefinite_repeats > 0)
1529 error (EXIT_FAILURE, 0,
1530 _("the [c*] repeat construct may not appear in string1"));
1533 if (s2)
1535 get_s2_spec_stats (s2, s1->length);
1537 if (s2->n_indefinite_repeats > 1)
1539 error (EXIT_FAILURE, 0,
1540 _("only one [c*] repeat construct may appear in string2"));
1543 if (translating)
1545 if (s2->has_equiv_class)
1547 error (EXIT_FAILURE, 0,
1548 _("[=c=] expressions may not appear in string2 \
1549 when translating"));
1552 if (s1->length > s2->length)
1554 if (!truncate_set1)
1556 /* string2 must be non-empty unless --truncate-set1 is
1557 given or string1 is empty. */
1559 if (s2->length == 0)
1560 error (EXIT_FAILURE, 0,
1561 _("when not truncating set1, string2 must be non-empty"));
1562 string2_extend (s1, s2);
1566 if (complement && s1->has_char_class
1567 && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1569 error (EXIT_FAILURE, 0,
1570 _("when translating with complemented character classes,\
1571 \nstring2 must map all characters in the domain to one"));
1574 if (s2->has_restricted_char_class)
1576 error (EXIT_FAILURE, 0,
1577 _("when translating, the only character classes that may \
1578 appear in\nstring2 are `upper' and `lower'"));
1581 else
1582 /* Not translating. */
1584 if (s2->n_indefinite_repeats > 0)
1585 error (EXIT_FAILURE, 0,
1586 _("the [c*] construct may appear in string2 only \
1587 when translating"));
1592 /* Read buffers of SIZE bytes via the function READER (if READER is
1593 NULL, read from stdin) until EOF. When non-NULL, READER is either
1594 read_and_delete or read_and_xlate. After each buffer is read, it is
1595 processed and written to stdout. The buffers are processed so that
1596 multiple consecutive occurrences of the same character in the input
1597 stream are replaced by a single occurrence of that character if the
1598 character is in the squeeze set. */
1600 static void
1601 squeeze_filter (unsigned char *buf, long int size, PFI reader)
1603 unsigned int char_to_squeeze = NOT_A_CHAR;
1604 int i = 0;
1605 int nr = 0;
1607 for (;;)
1609 int begin;
1611 if (i >= nr)
1613 if (reader == NULL)
1614 nr = safe_read (0, (char *) buf, size);
1615 else
1616 nr = (*reader) (buf, size, NULL);
1618 if (nr < 0)
1619 error (EXIT_FAILURE, errno, _("read error"));
1620 if (nr == 0)
1621 break;
1622 i = 0;
1625 begin = i;
1627 if (char_to_squeeze == NOT_A_CHAR)
1629 int out_len;
1630 /* Here, by being a little tricky, we can get a significant
1631 performance increase in most cases when the input is
1632 reasonably large. Since tr will modify the input only
1633 if two consecutive (and identical) input characters are
1634 in the squeeze set, we can step by two through the data
1635 when searching for a character in the squeeze set. This
1636 means there may be a little more work in a few cases and
1637 perhaps twice as much work in the worst cases where most
1638 of the input is removed by squeezing repeats. But most
1639 uses of this functionality seem to remove less than 20-30%
1640 of the input. */
1641 for (; i < nr && !in_squeeze_set[buf[i]]; i += 2)
1642 ; /* empty */
1644 /* There is a special case when i == nr and we've just
1645 skipped a character (the last one in buf) that is in
1646 the squeeze set. */
1647 if (i == nr && in_squeeze_set[buf[i - 1]])
1648 --i;
1650 if (i >= nr)
1651 out_len = nr - begin;
1652 else
1654 char_to_squeeze = buf[i];
1655 /* We're about to output buf[begin..i]. */
1656 out_len = i - begin + 1;
1658 /* But since we stepped by 2 in the loop above,
1659 out_len may be one too large. */
1660 if (i > 0 && buf[i - 1] == char_to_squeeze)
1661 --out_len;
1663 /* Advance i to the index of first character to be
1664 considered when looking for a char different from
1665 char_to_squeeze. */
1666 ++i;
1668 if (out_len > 0
1669 && fwrite ((char *) &buf[begin], 1, out_len, stdout) == 0)
1670 error (EXIT_FAILURE, errno, _("write error"));
1673 if (char_to_squeeze != NOT_A_CHAR)
1675 /* Advance i to index of first char != char_to_squeeze
1676 (or to nr if all the rest of the characters in this
1677 buffer are the same as char_to_squeeze). */
1678 for (; i < nr && buf[i] == char_to_squeeze; i++)
1679 ; /* empty */
1680 if (i < nr)
1681 char_to_squeeze = NOT_A_CHAR;
1682 /* If (i >= nr) we've squeezed the last character in this buffer.
1683 So now we have to read a new buffer and continue comparing
1684 characters against char_to_squeeze. */
1689 /* Read buffers of SIZE bytes from stdin until one is found that
1690 contains at least one character not in the delete set. Store
1691 in the array BUF, all characters from that buffer that are not
1692 in the delete set, and return the number of characters saved
1693 or 0 upon EOF. */
1695 static long
1696 read_and_delete (unsigned char *buf, long int size, PFI not_used)
1698 long n_saved;
1699 static int hit_eof = 0;
1701 assert (not_used == NULL);
1702 assert (size > 0);
1704 if (hit_eof)
1705 return 0;
1707 /* This enclosing do-while loop is to make sure that
1708 we don't return zero (indicating EOF) when we've
1709 just deleted all the characters in a buffer. */
1712 int i;
1713 int nr = safe_read (0, (char *) buf, size);
1715 if (nr < 0)
1716 error (EXIT_FAILURE, errno, _("read error"));
1717 if (nr == 0)
1719 hit_eof = 1;
1720 return 0;
1723 /* This first loop may be a waste of code, but gives much
1724 better performance when no characters are deleted in
1725 the beginning of a buffer. It just avoids the copying
1726 of buf[i] into buf[n_saved] when it would be a NOP. */
1728 for (i = 0; i < nr && !in_delete_set[buf[i]]; i++)
1729 /* empty */ ;
1730 n_saved = i;
1732 for (++i; i < nr; i++)
1733 if (!in_delete_set[buf[i]])
1734 buf[n_saved++] = buf[i];
1736 while (n_saved == 0);
1738 return n_saved;
1741 /* Read at most SIZE bytes from stdin into the array BUF. Then
1742 perform the in-place and one-to-one mapping specified by the global
1743 array `xlate'. Return the number of characters read, or 0 upon EOF. */
1745 static long
1746 read_and_xlate (unsigned char *buf, long int size, PFI not_used)
1748 long chars_read = 0;
1749 static int hit_eof = 0;
1750 int i;
1752 assert (not_used == NULL);
1753 assert (size > 0);
1755 if (hit_eof)
1756 return 0;
1758 chars_read = safe_read (0, (char *) buf, size);
1759 if (chars_read < 0)
1760 error (EXIT_FAILURE, errno, _("read error"));
1761 if (chars_read == 0)
1763 hit_eof = 1;
1764 return 0;
1767 for (i = 0; i < chars_read; i++)
1768 buf[i] = xlate[buf[i]];
1770 return chars_read;
1773 /* Initialize a boolean membership set IN_SET with the character
1774 values obtained by traversing the linked list of constructs S
1775 using the function `get_next'. If COMPLEMENT_THIS_SET is
1776 nonzero the resulting set is complemented. */
1778 static void
1779 set_initialize (struct Spec_list *s, int complement_this_set, SET_TYPE *in_set)
1781 int c;
1782 int i;
1784 memset (in_set, 0, N_CHARS * sizeof (in_set[0]));
1785 s->state = BEGIN_STATE;
1786 while ((c = get_next (s, NULL)) != -1)
1787 in_set[c] = 1;
1788 if (complement_this_set)
1789 for (i = 0; i < N_CHARS; i++)
1790 in_set[i] = (!in_set[i]);
1794 main (int argc, char **argv)
1796 int c;
1797 int non_option_args;
1798 struct Spec_list buf1, buf2;
1799 struct Spec_list *s1 = &buf1;
1800 struct Spec_list *s2 = &buf2;
1802 program_name = argv[0];
1803 setlocale (LC_ALL, "");
1804 bindtextdomain (PACKAGE, LOCALEDIR);
1805 textdomain (PACKAGE);
1807 while ((c = getopt_long (argc, argv, "cdst", long_options, NULL)) != -1)
1809 switch (c)
1811 case 0:
1812 break;
1814 case 'c':
1815 complement = 1;
1816 break;
1818 case 'd':
1819 delete = 1;
1820 break;
1822 case 's':
1823 squeeze_repeats = 1;
1824 break;
1826 case 't':
1827 truncate_set1 = 1;
1828 break;
1830 case_GETOPT_HELP_CHAR;
1832 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1834 default:
1835 usage (2);
1836 break;
1840 posix_pedantic = (getenv ("POSIXLY_CORRECT") != NULL);
1842 non_option_args = argc - optind;
1843 translating = (non_option_args == 2 && !delete);
1845 /* Change this test if it is valid to give tr no options and
1846 no args at all. POSIX doesn't specifically say anything
1847 either way, but it looks like they implied it's invalid
1848 by omission. If you want to make tr do a slow imitation
1849 of `cat' use `tr a a'. */
1850 if (non_option_args > 2)
1852 error (0, 0, _("too many arguments"));
1853 usage (2);
1856 if (!delete && !squeeze_repeats && non_option_args != 2)
1857 error (EXIT_FAILURE, 0, _("two strings must be given when translating"));
1859 if (delete && squeeze_repeats && non_option_args != 2)
1860 error (EXIT_FAILURE, 0, _("two strings must be given when both \
1861 deleting and squeezing repeats"));
1863 /* If --delete is given without --squeeze-repeats, then
1864 only one string argument may be specified. But POSIX
1865 says to ignore any string2 in this case, so if POSIXLY_CORRECT
1866 is set, pretend we never saw string2. But I think
1867 this deserves a fatal error, so that's the default. */
1868 if ((delete && !squeeze_repeats) && non_option_args != 1)
1870 if (posix_pedantic && non_option_args == 2)
1871 --non_option_args;
1872 else
1873 error (EXIT_FAILURE, 0,
1874 _("only one string may be given when deleting \
1875 without squeezing repeats"));
1878 if (squeeze_repeats && non_option_args == 0)
1879 error (EXIT_FAILURE, 0,
1880 _("at least one string must be given when squeezing repeats"));
1882 spec_init (s1);
1883 if (parse_str ((unsigned char *) argv[optind], s1))
1884 exit (EXIT_FAILURE);
1886 if (non_option_args == 2)
1888 spec_init (s2);
1889 if (parse_str ((unsigned char *) argv[optind + 1], s2))
1890 exit (EXIT_FAILURE);
1892 else
1893 s2 = NULL;
1895 validate (s1, s2);
1897 /* Use binary I/O, since `tr' is sometimes used to transliterate
1898 non-printable characters, or characters which are stripped away
1899 by text-mode reads (like CR and ^Z). */
1900 SET_BINARY2 (STDIN_FILENO, STDOUT_FILENO);
1902 if (squeeze_repeats && non_option_args == 1)
1904 set_initialize (s1, complement, in_squeeze_set);
1905 squeeze_filter (io_buf, IO_BUF_SIZE, NULL);
1907 else if (delete && non_option_args == 1)
1909 long nr;
1911 set_initialize (s1, complement, in_delete_set);
1914 nr = read_and_delete (io_buf, IO_BUF_SIZE, NULL);
1915 if (nr > 0 && fwrite ((char *) io_buf, 1, nr, stdout) == 0)
1916 error (EXIT_FAILURE, errno, _("write error"));
1918 while (nr > 0);
1920 else if (squeeze_repeats && delete && non_option_args == 2)
1922 set_initialize (s1, complement, in_delete_set);
1923 set_initialize (s2, 0, in_squeeze_set);
1924 squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_delete);
1926 else if (translating)
1928 if (complement)
1930 int i;
1931 SET_TYPE *in_s1 = in_delete_set;
1933 set_initialize (s1, 0, in_s1);
1934 s2->state = BEGIN_STATE;
1935 for (i = 0; i < N_CHARS; i++)
1936 xlate[i] = i;
1937 for (i = 0; i < N_CHARS; i++)
1939 if (!in_s1[i])
1941 int ch = get_next (s2, NULL);
1942 assert (ch != -1 || truncate_set1);
1943 if (ch == -1)
1945 /* This will happen when tr is invoked like e.g.
1946 tr -cs A-Za-z0-9 '\012'. */
1947 break;
1949 xlate[i] = ch;
1952 assert (get_next (s2, NULL) == -1 || truncate_set1);
1954 else
1956 int c1, c2;
1957 int i;
1958 enum Upper_Lower_class class_s1;
1959 enum Upper_Lower_class class_s2;
1961 for (i = 0; i < N_CHARS; i++)
1962 xlate[i] = i;
1963 s1->state = BEGIN_STATE;
1964 s2->state = BEGIN_STATE;
1965 for (;;)
1967 c1 = get_next (s1, &class_s1);
1968 c2 = get_next (s2, &class_s2);
1969 if (!class_ok[(int) class_s1][(int) class_s2])
1970 error (EXIT_FAILURE, 0,
1971 _("misaligned [:upper:] and/or [:lower:] construct"));
1973 if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1975 for (i = 0; i < N_CHARS; i++)
1976 if (ISLOWER (i))
1977 xlate[i] = toupper (i);
1979 else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1981 for (i = 0; i < N_CHARS; i++)
1982 if (ISUPPER (i))
1983 xlate[i] = tolower (i);
1985 else if ((class_s1 == UL_LOWER && class_s2 == UL_LOWER)
1986 || (class_s1 == UL_UPPER && class_s2 == UL_UPPER))
1988 /* By default, GNU tr permits the identity mappings: from
1989 [:upper:] to [:upper:] and [:lower:] to [:lower:]. But
1990 when POSIXLY_CORRECT is set, those evoke diagnostics. */
1991 if (posix_pedantic)
1993 error (EXIT_FAILURE, 0,
1994 _("\
1995 invalid identity mapping; when translating, any [:lower:] or [:upper:]\n\
1996 construct in string1 must be aligned with a corresponding construct\n\
1997 ([:upper:] or [:lower:], respectively) in string2"));
2000 else
2002 /* The following should have been checked by validate... */
2003 if (c1 == -1 || c2 == -1)
2004 break;
2005 xlate[c1] = c2;
2008 assert (c1 == -1 || truncate_set1);
2010 if (squeeze_repeats)
2012 set_initialize (s2, 0, in_squeeze_set);
2013 squeeze_filter (io_buf, IO_BUF_SIZE, (PFI) read_and_xlate);
2015 else
2017 long chars_read;
2021 chars_read = read_and_xlate (io_buf, IO_BUF_SIZE, NULL);
2022 if (chars_read > 0
2023 && fwrite ((char *) io_buf, 1, chars_read, stdout) == 0)
2024 error (EXIT_FAILURE, errno, _("write error"));
2026 while (chars_read > 0);
2030 if (fclose (stdout) == EOF)
2031 error (EXIT_FAILURE, errno, _("write error"));
2033 if (close (0) != 0)
2034 error (EXIT_FAILURE, errno, _("standard input"));
2036 exit (EXIT_SUCCESS);