remove spurious SPACEs before TABs
[coreutils.git] / src / join.c
blobd5c6fbe1611ee328c96389b20898a16f6b81ff5c
1 /* join - join lines of two files on a common field
2 Copyright (C) 91, 1995-2003 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
20 #include <config.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <sys/types.h>
25 #include <getopt.h>
27 #include "system.h"
28 #include "error.h"
29 #include "hard-locale.h"
30 #include "linebuffer.h"
31 #include "memcasecmp.h"
32 #include "posixver.h"
33 #include "quote.h"
34 #include "xmemcoll.h"
35 #include "xstrtol.h"
37 /* The official name of this program (e.g., no `g' prefix). */
38 #define PROGRAM_NAME "join"
40 #define AUTHORS "Mike Haertel"
42 #define join system_join
/* One node of the singly linked list that records, in order, which
   field goes into each column of an output line.  */
struct outlist
  {
    /* Which input supplies the field: 1 is the first file argument,
       2 is the second, and 0 selects the join field itself.  */
    int file;

    /* Zero-based index of the field; meaningful only when FILE is
       1 or 2.  */
    size_t field;

    /* Following node, or NULL at the end of the list.  */
    struct outlist *next;
  };
/* A single field within a line, described as a (start, length) pair.  */
struct field
  {
    char *beg;			/* Address of the field's first byte.  */
    size_t len;			/* Number of bytes in the field.  */
  };
65 /* A line read from an input file. */
66 struct line
68 struct linebuffer buf; /* The line itself. */
69 size_t nfields; /* Number of elements in `fields'. */
70 size_t nfields_allocated; /* Number of elements allocated for `fields'. */
71 struct field *fields;
/* A run of consecutive lines from one file that all share the same
   join field value.  */
struct seq
  {
    size_t count;		/* Entries of `lines' currently in use.  */
    size_t alloc;		/* Entries of `lines' that have been allocated.  */
    struct line *lines;
  };
83 /* The name this program was run with. */
84 char *program_name;
86 /* True if the LC_COLLATE locale is hard. */
87 static bool hard_LC_COLLATE;
89 /* True if obsolete option usage should be supported. */
90 static bool obsolete_usage;
92 /* If true, print unpairable lines in file 1 or 2. */
93 static bool print_unpairables_1, print_unpairables_2;
95 /* If true, print pairable lines. */
96 static bool print_pairables;
98 /* Empty output field filler. */
99 static char const *empty_filler;
101 /* Field to join on. */
102 static size_t join_field_1, join_field_2;
104 /* List of fields to print. */
105 static struct outlist outlist_head;
107 /* Last element in `outlist', where a new element can be added. */
108 static struct outlist *outlist_end = &outlist_head;
110 /* Tab character separating fields; if this is NUL fields are separated
111 by any nonempty string of white space, otherwise by exactly one
112 tab character. */
113 static char tab;
115 /* When using getopt_long_only, no long option can start with
116 a character that is a short option. */
117 static struct option const longopts[] =
119 /* These three options are obsolete; see OBSOLETE_LONG_OPTIONS below. */
120 {"j", required_argument, NULL, 'j'},
121 {"j1", required_argument, NULL, '1'},
122 {"j2", required_argument, NULL, '2'},
124 {"ignore-case", no_argument, NULL, 'i'},
125 {GETOPT_HELP_OPTION_DECL},
126 {GETOPT_VERSION_OPTION_DECL},
127 {NULL, 0, NULL, 0}
130 /* Number of options at the start of longopts that are obsolete. */
131 enum { OBSOLETE_LONG_OPTIONS = 3 };
133 /* Used to print non-joining lines */
134 static struct line uni_blank;
136 /* If true, ignore case when comparing join fields. */
137 static bool ignore_case;
139 /* Get the next option from the argument vector. */
141 static int
142 get_option (int argc, char **argv)
144 return (obsolete_usage
145 ? getopt_long_only (argc, argv, "-a:e:i1:2:o:t:v:", longopts, NULL)
146 : getopt_long (argc, argv, "a:e:ij:1:2:o:t:v:",
147 longopts + OBSOLETE_LONG_OPTIONS, NULL));
/* Print usage information and exit.  When STATUS is nonzero, write a
   one-line hint to stderr that points at --help; otherwise write the
   full help text to stdout.  The exit status is EXIT_SUCCESS when
   STATUS is 0 and EXIT_FAILURE otherwise.  */
150 void
151 usage (int status)
153 if (status != 0)
154 fprintf (stderr, _("Try `%s --help' for more information.\n"),
155 program_name);
156 else
158 printf (_("\
159 Usage: %s [OPTION]... FILE1 FILE2\n\
161 program_name);
162 fputs (_("\
163 For each pair of input lines with identical join fields, write a line to\n\
164 standard output. The default join field is the first, delimited\n\
165 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
167 -a FILENUM print unpairable lines coming from file FILENUM, where\n\
168 FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
169 -e EMPTY replace missing input fields with EMPTY\n\
170 "), stdout);
171 fputs (_("\
172 -i, --ignore-case ignore differences in case when comparing fields\n\
173 -j FIELD equivalent to `-1 FIELD -2 FIELD'\n\
174 -o FORMAT obey FORMAT while constructing output line\n\
175 -t CHAR use CHAR as input and output field separator\n\
176 "), stdout);
177 fputs (_("\
178 -v FILENUM like -a FILENUM, but suppress joined output lines\n\
179 -1 FIELD join on this FIELD of file 1\n\
180 -2 FIELD join on this FIELD of file 2\n\
181 "), stdout);
182 fputs (HELP_OPTION_DESCRIPTION, stdout);
183 fputs (VERSION_OPTION_DESCRIPTION, stdout);
184 fputs (_("\
186 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
187 else fields are separated by CHAR. Any FIELD is a field number counted\n\
188 from 1. FORMAT is one or more comma or blank separated specifications,\n\
189 each being `FILENUM.FIELD' or `0'. Default FORMAT outputs the join field,\n\
190 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
191 separated by CHAR.\n\
193 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
194 "), stdout);
195 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
197 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
/* Tell whether C is a blank, i.e. one of the default input field
   separators.  The ISBLANK result is normalized to a bool.  */

static inline bool
is_blank (unsigned char c)
{
  return ISBLANK (c) ? true : false;
}
208 /* Record a field in LINE, with location FIELD and size LEN. */
210 static void
211 extract_field (struct line *line, char *field, size_t len)
213 if (line->nfields >= line->nfields_allocated)
215 line->fields = x2nrealloc (line->fields, &line->nfields_allocated,
216 sizeof (struct field));
218 line->fields[line->nfields].beg = field;
219 line->fields[line->nfields].len = len;
220 ++(line->nfields);
223 /* Fill in the `fields' structure in LINE. */
225 static void
226 xfields (struct line *line)
228 char *ptr = line->buf.buffer;
229 char const *lim = ptr + line->buf.length - 1;
231 if (ptr == lim)
232 return;
234 if (tab)
236 unsigned char t = tab;
237 char *sep;
238 for (; (sep = memchr (ptr, t, lim - ptr)) != NULL; ptr = sep + 1)
239 extract_field (line, ptr, sep - ptr);
241 else
243 /* Skip leading blanks before the first field. */
244 while (is_blank (*ptr))
245 if (++ptr == lim)
246 return;
250 char *sep;
251 for (sep = ptr + 1; sep != lim && ! is_blank (*sep); sep++)
252 continue;
253 extract_field (line, ptr, sep - ptr);
254 if (sep == lim)
255 return;
256 for (ptr = sep + 1; ptr != lim && is_blank (*ptr); ptr++)
257 continue;
259 while (ptr != lim);
262 extract_field (line, ptr, lim - ptr);
265 /* Read a line from FP into LINE and split it into fields.
266 Return true if successful. */
268 static bool
269 get_line (FILE *fp, struct line *line)
271 initbuffer (&line->buf);
273 if (! readlinebuffer (&line->buf, fp))
275 if (ferror (fp))
276 error (EXIT_FAILURE, errno, _("read error"));
277 free (line->buf.buffer);
278 line->buf.buffer = NULL;
279 return false;
282 line->nfields_allocated = 0;
283 line->nfields = 0;
284 line->fields = NULL;
285 xfields (line);
286 return true;
289 static void
290 freeline (struct line *line)
292 free (line->fields);
293 free (line->buf.buffer);
294 line->buf.buffer = NULL;
297 static void
298 initseq (struct seq *seq)
300 seq->count = 0;
301 seq->alloc = 0;
302 seq->lines = NULL;
305 /* Read a line from FP and add it to SEQ. Return true if successful. */
307 static bool
308 getseq (FILE *fp, struct seq *seq)
310 if (seq->count == seq->alloc)
311 seq->lines = x2nrealloc (seq->lines, &seq->alloc, sizeof *seq->lines);
313 if (get_line (fp, &seq->lines[seq->count]))
315 ++seq->count;
316 return true;
318 return false;
321 static void
322 delseq (struct seq *seq)
324 size_t i;
325 for (i = 0; i < seq->count; i++)
326 if (seq->lines[i].buf.buffer)
327 freeline (&seq->lines[i]);
328 free (seq->lines);
331 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
332 >0 if it compares greater; 0 if it compares equal.
333 Report an error and exit if the comparison fails. */
335 static int
336 keycmp (struct line const *line1, struct line const *line2)
338 /* Start of field to compare in each file. */
339 char *beg1;
340 char *beg2;
342 size_t len1;
343 size_t len2; /* Length of fields to compare. */
344 int diff;
346 if (join_field_1 < line1->nfields)
348 beg1 = line1->fields[join_field_1].beg;
349 len1 = line1->fields[join_field_1].len;
351 else
353 beg1 = NULL;
354 len1 = 0;
357 if (join_field_2 < line2->nfields)
359 beg2 = line2->fields[join_field_2].beg;
360 len2 = line2->fields[join_field_2].len;
362 else
364 beg2 = NULL;
365 len2 = 0;
368 if (len1 == 0)
369 return len2 == 0 ? 0 : -1;
370 if (len2 == 0)
371 return 1;
373 if (ignore_case)
375 /* FIXME: ignore_case does not work with NLS (in particular,
376 with multibyte chars). */
377 diff = memcasecmp (beg1, beg2, MIN (len1, len2));
379 else
381 if (HAVE_SETLOCALE && hard_LC_COLLATE)
382 return xmemcoll (beg1, len1, beg2, len2);
383 diff = memcmp (beg1, beg2, MIN (len1, len2));
386 if (diff)
387 return diff;
388 return len1 < len2 ? -1 : len1 != len2;
391 /* Print field N of LINE if it exists and is nonempty, otherwise
392 `empty_filler' if it is nonempty. */
394 static void
395 prfield (size_t n, struct line const *line)
397 size_t len;
399 if (n < line->nfields)
401 len = line->fields[n].len;
402 if (len)
403 fwrite (line->fields[n].beg, 1, len, stdout);
404 else if (empty_filler)
405 fputs (empty_filler, stdout);
407 else if (empty_filler)
408 fputs (empty_filler, stdout);
411 /* Print the join of LINE1 and LINE2. */
413 static void
414 prjoin (struct line const *line1, struct line const *line2)
416 const struct outlist *outlist;
417 char output_separator = tab ? tab : ' ';
419 outlist = outlist_head.next;
420 if (outlist)
422 const struct outlist *o;
424 o = outlist;
425 while (1)
427 size_t field;
428 struct line const *line;
430 if (o->file == 0)
432 if (line1 == &uni_blank)
434 line = line2;
435 field = join_field_2;
437 else
439 line = line1;
440 field = join_field_1;
443 else
445 line = (o->file == 1 ? line1 : line2);
446 field = o->field;
448 prfield (field, line);
449 o = o->next;
450 if (o == NULL)
451 break;
452 putchar (output_separator);
454 putchar ('\n');
456 else
458 size_t i;
460 if (line1 == &uni_blank)
462 struct line const *t;
463 t = line1;
464 line1 = line2;
465 line2 = t;
467 prfield (join_field_1, line1);
468 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
470 putchar (output_separator);
471 prfield (i, line1);
473 for (i = join_field_1 + 1; i < line1->nfields; ++i)
475 putchar (output_separator);
476 prfield (i, line1);
479 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
481 putchar (output_separator);
482 prfield (i, line2);
484 for (i = join_field_2 + 1; i < line2->nfields; ++i)
486 putchar (output_separator);
487 prfield (i, line2);
489 putchar ('\n');
493 /* Print the join of the files in FP1 and FP2. */
495 static void
496 join (FILE *fp1, FILE *fp2)
498 struct seq seq1, seq2;
499 struct line line;
500 int diff;
501 bool eof1, eof2;
503 /* Read the first line of each file. */
504 initseq (&seq1);
505 getseq (fp1, &seq1);
506 initseq (&seq2);
507 getseq (fp2, &seq2);
509 while (seq1.count && seq2.count)
511 size_t i;
512 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
513 if (diff < 0)
515 if (print_unpairables_1)
516 prjoin (&seq1.lines[0], &uni_blank);
517 freeline (&seq1.lines[0]);
518 seq1.count = 0;
519 getseq (fp1, &seq1);
520 continue;
522 if (diff > 0)
524 if (print_unpairables_2)
525 prjoin (&uni_blank, &seq2.lines[0]);
526 freeline (&seq2.lines[0]);
527 seq2.count = 0;
528 getseq (fp2, &seq2);
529 continue;
532 /* Keep reading lines from file1 as long as they continue to
533 match the current line from file2. */
534 eof1 = false;
536 if (!getseq (fp1, &seq1))
538 eof1 = true;
539 ++seq1.count;
540 break;
542 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
544 /* Keep reading lines from file2 as long as they continue to
545 match the current line from file1. */
546 eof2 = false;
548 if (!getseq (fp2, &seq2))
550 eof2 = true;
551 ++seq2.count;
552 break;
554 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
556 if (print_pairables)
558 for (i = 0; i < seq1.count - 1; ++i)
560 size_t j;
561 for (j = 0; j < seq2.count - 1; ++j)
562 prjoin (&seq1.lines[i], &seq2.lines[j]);
566 for (i = 0; i < seq1.count - 1; ++i)
567 freeline (&seq1.lines[i]);
568 if (!eof1)
570 seq1.lines[0] = seq1.lines[seq1.count - 1];
571 seq1.count = 1;
573 else
574 seq1.count = 0;
576 for (i = 0; i < seq2.count - 1; ++i)
577 freeline (&seq2.lines[i]);
578 if (!eof2)
580 seq2.lines[0] = seq2.lines[seq2.count - 1];
581 seq2.count = 1;
583 else
584 seq2.count = 0;
587 if (print_unpairables_1 && seq1.count)
589 prjoin (&seq1.lines[0], &uni_blank);
590 freeline (&seq1.lines[0]);
591 while (get_line (fp1, &line))
593 prjoin (&line, &uni_blank);
594 freeline (&line);
598 if (print_unpairables_2 && seq2.count)
600 prjoin (&uni_blank, &seq2.lines[0]);
601 freeline (&seq2.lines[0]);
602 while (get_line (fp2, &line))
604 prjoin (&uni_blank, &line);
605 freeline (&line);
609 delseq (&seq1);
610 delseq (&seq2);
613 /* Add a field spec for field FIELD of file FILE to `outlist'. */
615 static void
616 add_field (int file, size_t field)
618 struct outlist *o;
620 assert (file == 0 || file == 1 || file == 2);
621 assert (file != 0 || field == 0);
623 o = xmalloc (sizeof *o);
624 o->file = file;
625 o->field = field;
626 o->next = NULL;
628 /* Add to the end of the list so the fields are in the right order. */
629 outlist_end->next = o;
630 outlist_end = o;
633 /* Convert a string of decimal digits, STR (the 1-based join field number),
634 to an integral value. Upon successful conversion, return one less
635 (the zero-based field number). If it cannot be converted, give a
636 diagnostic and exit. */
638 size_t
639 string_to_join_field (char const *str, char const *err_msg_fmt)
641 size_t result;
642 uintmax_t val;
644 strtol_error s_err = xstrtoumax (str, NULL, 10, &val, "");
645 if (s_err == LONGINT_OVERFLOW || SIZE_MAX < val)
647 error (EXIT_FAILURE, 0,
648 _("value %s is so large that it is not representable"),
649 quote (str));
652 if (s_err != LONGINT_OK || val == 0)
653 error (EXIT_FAILURE, 0, err_msg_fmt, quote (str));
655 result = val - 1;
657 return result;
/* Convert a single field specifier string, S, to a *FILE_INDEX,
   *FIELD_INDEX pair.  S is either `0' (the join field) or
   `FILENUM.FIELD' with FILENUM 1 or 2.  In S the field number is
   1-based; *FIELD_INDEX is zero-based.

   If S is valid, store the pair and return true.  Otherwise give a
   diagnostic, leave *FILE_INDEX and *FIELD_INDEX untouched, and return
   false.  An invalid FIELD number is diagnosed by string_to_join_field,
   which exits.  */

static bool
decode_field_spec (const char *s, int *file_index, size_t *field_index)
{
  bool valid = false;

  /* The first character must be 0, 1, or 2.  */
  switch (s[0])
    {
    case '0':
      if (s[1] == '\0')
        {
          *file_index = 0;
          *field_index = 0;
          valid = true;
        }
      else
        {
          /* `0' must be all alone -- no `.FIELD'.  */
          error (0, 0, _("invalid field specifier: `%s'"), s);
        }
      break;

    case '1':
    case '2':
      if (s[1] == '.' && s[2] != '\0')
        {
          *field_index
            = string_to_join_field (s + 2, _("invalid field number: %s"));
          *file_index = s[0] - '0';
          valid = true;
        }
      else
        {
          /* A file number must be followed by `.FIELD'.  Diagnose this
             here so that the caller's failure is never silent.  */
          error (0, 0, _("invalid field specifier: `%s'"), s);
        }
      break;

    default:
      error (0, 0, _("invalid file number in field spec: `%s'"), s);
      break;
    }
  return valid;
}
/* Parse STR, a list of field specs separated by commas, spaces or
   tabs, and append each spec to `outlist'.  STR is modified: each
   separator that is found is overwritten with a NUL byte.  Return
   true if every spec was valid; stop and return false at the first
   invalid one.  */

static bool
add_field_list (char *str)
{
  char *cursor = str;

  for (;;)
    {
      int file_index;
      size_t field_index;
      char const *item = cursor;
      char *delim = strpbrk (cursor, ", \t");

      /* Terminate the current item and step past the separator.  */
      if (delim)
        *delim++ = 0;

      if (! decode_field_spec (item, &file_index, &field_index))
        return false;
      add_field (file_index, field_index);

      if (delim == NULL)
        return true;
      cursor = delim;
    }
}
/* Store NAME in the next free slot of NAMES and increment *NFILES,
   the number of names stored so far.  If both slots are already
   taken, give a diagnostic and call usage with EXIT_FAILURE.  */

void
add_file_name (char const *name, char const *names[2], int *nfiles)
{
  int slot = *nfiles;

  if (slot == 2)
    {
      error (0, 0, _("too many non-option arguments"));
      usage (EXIT_FAILURE);
    }

  names[slot] = name;
  *nfiles = slot + 1;
}
746 main (int argc, char **argv)
748 char const *names[2];
749 FILE *fp1, *fp2;
750 int optc, prev_optc = 0, nfiles;
752 initialize_main (&argc, &argv);
753 program_name = argv[0];
754 setlocale (LC_ALL, "");
755 bindtextdomain (PACKAGE, LOCALEDIR);
756 textdomain (PACKAGE);
757 hard_LC_COLLATE = hard_locale (LC_COLLATE);
758 obsolete_usage = (posix2_version () < 200112);
760 atexit (close_stdout);
762 nfiles = 0;
763 print_pairables = true;
765 while ((optc = get_option (argc, argv)) != -1)
767 long int val;
769 switch (optc)
771 case 0:
772 break;
774 case 'v':
775 print_pairables = false;
776 /* Fall through. */
778 case 'a':
779 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
780 || (val != 1 && val != 2))
781 error (EXIT_FAILURE, 0, _("invalid field number: `%s'"), optarg);
782 if (val == 1)
783 print_unpairables_1 = true;
784 else
785 print_unpairables_2 = true;
786 break;
788 case 'e':
789 empty_filler = optarg;
790 break;
792 case 'i':
793 ignore_case = true;
794 break;
796 case '1':
797 join_field_1 =
798 string_to_join_field (optarg,
799 _("invalid field number for file 1: `%s'"));
800 break;
802 case '2':
803 join_field_2 =
804 string_to_join_field (optarg,
805 _("invalid field number for file 2: `%s'"));
806 break;
808 case 'j':
809 join_field_1 = join_field_2 =
810 string_to_join_field (optarg,
811 _("invalid field number: `%s'"));
812 break;
814 case 'o':
815 if (! add_field_list (optarg))
816 exit (EXIT_FAILURE);
817 break;
819 case 't':
820 tab = *optarg;
821 break;
823 case 1: /* Non-option argument. */
824 if (prev_optc == 'o' && optind <= argc - 2)
826 if (! add_field_list (optarg))
827 exit (EXIT_FAILURE);
829 /* Might be continuation of args to -o. */
830 continue; /* Don't change `prev_optc'. */
832 add_file_name (optarg, names, &nfiles);
833 break;
835 case_GETOPT_HELP_CHAR;
837 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
839 default:
840 usage (EXIT_FAILURE);
842 prev_optc = optc;
845 if (! obsolete_usage)
846 while (optind < argc)
847 add_file_name (argv[optind++], names, &nfiles);
849 if (nfiles != 2)
851 error (0, 0, _("too few non-option arguments"));
852 usage (EXIT_FAILURE);
855 fp1 = STREQ (names[0], "-") ? stdin : fopen (names[0], "r");
856 if (!fp1)
857 error (EXIT_FAILURE, errno, "%s", names[0]);
858 fp2 = STREQ (names[1], "-") ? stdin : fopen (names[1], "r");
859 if (!fp2)
860 error (EXIT_FAILURE, errno, "%s", names[1]);
861 if (fp1 == fp2)
862 error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
863 join (fp1, fp2);
865 if (fp1 != stdin && fclose (fp1) == EOF)
866 error (EXIT_FAILURE, errno, "%s", names[0]);
867 if (fp2 != stdin && fclose (fp2) == EOF)
868 error (EXIT_FAILURE, errno, "%s", names[1]);
869 if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
870 error (EXIT_FAILURE, errno, "-");
872 exit (EXIT_SUCCESS);