.
[coreutils.git] / src / join.c
blobe9c5bb14519d6b56c50ca534fe7336e7ab9ce947
1 /* join - join lines of two files on a common field
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
20 #include <config.h>
22 #ifdef __GNUC__
23 #define alloca __builtin_alloca
24 #else /* not __GNUC__ */
25 #if HAVE_ALLOCA_H
26 #include <alloca.h>
27 #else /* not HAVE_ALLOCA_H */
28 #ifdef _AIX
29 #pragma alloca
30 #else /* not _AIX */
31 char *alloca ();
32 #endif /* not _AIX */
33 #endif /* not HAVE_ALLOCA_H */
34 #endif /* not __GNUC__ */
36 /* Get isblank from GNU libc. */
37 #define _GNU_SOURCE
39 #include <stdio.h>
40 #define NDEBUG
41 #include <assert.h>
42 #include <sys/types.h>
43 #include <getopt.h>
45 #if HAVE_LIMITS_H
46 # include <limits.h>
47 #endif
49 #ifndef UINT_MAX
50 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
51 #endif
53 #ifndef INT_MAX
54 # define INT_MAX ((int) (UINT_MAX >> 1))
55 #endif
57 #if _LIBC || STDC_HEADERS
58 # define TOLOWER(c) tolower (c)
59 #else
60 # define TOLOWER(c) (ISUPPER (c) ? tolower (c) : (c))
61 #endif
63 #include "system.h"
64 #include "version.h"
65 #include "long-options.h"
66 #include "xstrtol.h"
67 #include "error.h"
69 #define join system_join
71 char *xmalloc ();
72 char *xrealloc ();
74 /* Undefine, to avoid warning about redefinition on some systems. */
75 #undef min
76 #undef max
77 #define min(A, B) ((A) < (B) ? (A) : (B))
78 #define max(A, B) ((A) > (B) ? (A) : (B))
80 /* An element of the list identifying which fields to print for each
81 output line. */
82 struct outlist
84 /* File number: 0, 1, or 2. 0 means use the join field.
85 1 means use the first file argument, 2 the second. */
86 int file;
88 /* Field index (zero-based), specified only when FILE is 1 or 2. */
89 int field;
91 struct outlist *next;
94 /* A field of a line. */
95 struct field
97 const char *beg; /* First character in field. */
98 size_t len; /* The length of the field. */
101 /* A line read from an input file. Newlines are not stored. */
102 struct line
104 char *beg; /* First character in line. */
105 char *lim; /* Character after last character in line. */
106 int nfields; /* Number of elements in `fields'. */
107 int nfields_allocated; /* Number of elements in `fields'. */
108 struct field *fields;
111 /* One or more consecutive lines read from a file that all have the
112 same join field value. */
113 struct seq
115 int count; /* Elements used in `lines'. */
116 int alloc; /* Elements allocated in `lines'. */
117 struct line *lines;
120 /* The name this program was run with. */
121 char *program_name;
123 /* If nonzero, print unpairable lines in file 1 or 2. */
124 static int print_unpairables_1, print_unpairables_2;
126 /* If nonzero, print pairable lines. */
127 static int print_pairables;
129 /* Empty output field filler. */
130 static char *empty_filler;
132 /* Field to join on. */
133 static int join_field_1, join_field_2;
135 /* List of fields to print. */
136 static struct outlist outlist_head;
138 /* Last element in `outlist', where a new element can be added. */
139 static struct outlist *outlist_end = &outlist_head;
141 /* Tab character separating fields; if this is NUL fields are separated
142 by any nonempty string of white space, otherwise by exactly one
143 tab character. */
144 static char tab;
146 /* When using getopt_long_only, no long option can start with
147 a character that is a short option. */
148 static struct option const longopts[] =
150 {"ignore-case", no_argument, NULL, 'i'},
151 {"j", required_argument, NULL, 'j'},
152 {"j1", required_argument, NULL, '1'},
153 {"j2", required_argument, NULL, '2'},
154 {NULL, 0, NULL, 0}
157 /* Used to print non-joining lines */
158 static struct line uni_blank;
160 /* If nonzero, ignore case when comparing join fields. */
161 static int ignore_case;
163 static void
164 usage (int status)
166 if (status != 0)
167 fprintf (stderr, _("Try `%s --help' for more information.\n"),
168 program_name);
169 else
171 printf (_("\
172 Usage: %s [OPTION]... FILE1 FILE2\n\
174 program_name);
175 printf (_("\
176 For each pair of input lines with identical join fields, write a line to\n\
177 standard output. The default join field is the first, delimited\n\
178 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
180 -a SIDE print unpairable lines coming from file SIDE\n\
181 -e EMPTY replace missing input fields with EMPTY\n\
182 -i, --ignore-case ignore differences in case when comparing fields\n\
183 -j FIELD (Obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
184 -j1 FIELD (Obsolescent) equivalent to `-1 FIELD'\n\
185 -j2 FIELD (Obsolescent) equivalent to `-2 FIELD'\n\
186 -1 FIELD join on this FIELD of file 1\n\
187 -2 FIELD join on this FIELD of file 2\n\
188 -o FORMAT obey FORMAT while constructing output line\n\
189 -t CHAR use CHAR as input and output field separator\n\
190 -v SIDE like -a SIDE, but suppress joined output lines\n\
191 --help display this help and exit\n\
192 --version output version information and exit\n\
194 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
195 else fields are separated by CHAR. Any FIELD is a field number counted\n\
196 from 1. FORMAT is one or more comma or blank separated specifications,\n\
197 each being `SIDE.FIELD' or `0'. Default FORMAT outputs the join field,\n\
198 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
199 separated by CHAR.\n\
200 "));
202 exit (status);
205 /* Like memcmp, but ignore differences in case. */
207 static int
208 memcasecmp (const void *vs1, const void *vs2, size_t n)
210 unsigned int i;
211 unsigned char *s1 = (unsigned char *) vs1;
212 unsigned char *s2 = (unsigned char *) vs2;
213 for (i = 0; i < n; i++)
215 unsigned char u1 = *s1++;
216 unsigned char u2 = *s2++;
217 if (TOLOWER (u1) != TOLOWER (u2))
218 return TOLOWER (u1) - TOLOWER (u2);
220 return 0;
223 static void
224 ADD_FIELD (struct line *line, const char *field, size_t len)
226 if (line->nfields >= line->nfields_allocated)
228 line->nfields_allocated = (3 * line->nfields_allocated) / 2 + 1;
229 line->fields = (struct field *) xrealloc ((char *) line->fields,
230 (line->nfields_allocated
231 * sizeof (struct field)));
233 line->fields[line->nfields].beg = field;
234 line->fields[line->nfields].len = len;
235 ++(line->nfields);
238 /* Fill in the `fields' structure in LINE. */
240 static void
241 xfields (struct line *line)
243 int i;
244 register char *ptr, *lim;
246 ptr = line->beg;
247 lim = line->lim;
249 if (!tab)
251 /* Skip leading blanks before the first field. */
252 while (ptr < lim && ISSPACE (*ptr))
253 ++ptr;
256 for (i = 0; ptr < lim; ++i)
258 if (tab)
260 char *beg;
262 beg = ptr;
263 while (ptr < lim && *ptr != tab)
264 ++ptr;
265 ADD_FIELD (line, beg, ptr - beg);
266 if (ptr < lim)
267 ++ptr;
269 else
271 char *beg;
273 beg = ptr;
274 while (ptr < lim && !ISSPACE (*ptr))
275 ++ptr;
276 ADD_FIELD (line, beg, ptr - beg);
277 while (ptr < lim && ISSPACE (*ptr))
278 ++ptr;
282 if (ptr > line->beg && ((tab && ISSPACE (ptr[-1])) || ptr[-1] == tab))
284 /* Add one more (empty) field because the last character of the
285 line was a delimiter. */
286 ADD_FIELD (line, NULL, 0);
290 /* Read a line from FP into LINE and split it into fields.
291 Return 0 if EOF, 1 otherwise. */
293 static int
294 get_line (FILE *fp, struct line *line)
296 static int linesize = 80;
297 int c, i;
298 char *ptr;
300 if (feof (fp))
301 return 0;
303 ptr = xmalloc (linesize);
305 for (i = 0; (c = getc (fp)) != EOF && c != '\n'; ++i)
307 if (i == linesize)
309 linesize *= 2;
310 ptr = xrealloc (ptr, linesize);
312 ptr[i] = c;
315 if (c == EOF && i == 0)
317 free (ptr);
318 return 0;
321 line->beg = ptr;
322 line->lim = line->beg + i;
323 line->nfields_allocated = 0;
324 line->nfields = 0;
325 line->fields = NULL;
326 xfields (line);
327 return 1;
330 static void
331 freeline (struct line *line)
333 free ((char *) line->fields);
334 free (line->beg);
335 line->beg = NULL;
338 static void
339 initseq (struct seq *seq)
341 seq->count = 0;
342 seq->alloc = 1;
343 seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
346 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
348 static int
349 getseq (FILE *fp, struct seq *seq)
351 if (seq->count == seq->alloc)
353 seq->alloc *= 2;
354 seq->lines = (struct line *)
355 xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
358 if (get_line (fp, &seq->lines[seq->count]))
360 ++seq->count;
361 return 1;
363 return 0;
366 static void
367 delseq (struct seq *seq)
369 int i;
370 for (i = 0; i < seq->count; i++)
371 if (seq->lines[i].beg)
372 freeline (&seq->lines[i]);
373 free ((char *) seq->lines);
376 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
377 >0 if it compares greater; 0 if it compares equal. */
379 static int
380 keycmp (struct line *line1, struct line *line2)
382 const char *beg1, *beg2; /* Start of field to compare in each file. */
383 int len1, len2; /* Length of fields to compare. */
384 int diff;
386 if (join_field_1 < line1->nfields)
388 beg1 = line1->fields[join_field_1].beg;
389 len1 = line1->fields[join_field_1].len;
391 else
393 beg1 = NULL;
394 len1 = 0;
397 if (join_field_2 < line2->nfields)
399 beg2 = line2->fields[join_field_2].beg;
400 len2 = line2->fields[join_field_2].len;
402 else
404 beg2 = NULL;
405 len2 = 0;
408 if (len1 == 0)
409 return len2 == 0 ? 0 : -1;
410 if (len2 == 0)
411 return 1;
413 /* Use an if-statement here rather than a function variable to
414 avoid portability hassles of getting a non-conflicting declaration
415 of memcmp. */
416 if (ignore_case)
417 diff = memcasecmp (beg1, beg2, min (len1, len2));
418 else
419 diff = memcmp (beg1, beg2, min (len1, len2));
421 if (diff)
422 return diff;
423 return len1 - len2;
426 /* Print field N of LINE if it exists and is nonempty, otherwise
427 `empty_filler' if it is nonempty. */
429 static void
430 prfield (int n, struct line *line)
432 int len;
434 if (n < line->nfields)
436 len = line->fields[n].len;
437 if (len)
438 fwrite (line->fields[n].beg, 1, len, stdout);
439 else if (empty_filler)
440 fputs (empty_filler, stdout);
442 else if (empty_filler)
443 fputs (empty_filler, stdout);
446 /* Print the join of LINE1 and LINE2. */
448 static void
449 prjoin (struct line *line1, struct line *line2)
451 const struct outlist *outlist;
453 outlist = outlist_head.next;
454 if (outlist)
456 const struct outlist *o;
458 o = outlist;
459 while (1)
461 int field;
462 struct line *line;
464 if (o->file == 0)
466 if (line1 == &uni_blank)
468 line = line2;
469 field = join_field_2;
471 else
473 line = line1;
474 field = join_field_1;
477 else
479 line = (o->file == 1 ? line1 : line2);
480 field = o->field;
482 prfield (field, line);
483 o = o->next;
484 if (o == NULL)
485 break;
486 putchar (tab ? tab : ' ');
488 putchar ('\n');
490 else
492 int i;
494 if (line1 == &uni_blank)
496 struct line *t;
497 t = line1;
498 line1 = line2;
499 line2 = t;
501 prfield (join_field_1, line1);
502 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
504 putchar (tab ? tab : ' ');
505 prfield (i, line1);
507 for (i = join_field_1 + 1; i < line1->nfields; ++i)
509 putchar (tab ? tab : ' ');
510 prfield (i, line1);
513 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
515 putchar (tab ? tab : ' ');
516 prfield (i, line2);
518 for (i = join_field_2 + 1; i < line2->nfields; ++i)
520 putchar (tab ? tab : ' ');
521 prfield (i, line2);
523 putchar ('\n');
527 /* Print the join of the files in FP1 and FP2. */
529 static void
530 join (FILE *fp1, FILE *fp2)
532 struct seq seq1, seq2;
533 struct line line;
534 int diff, i, j, eof1, eof2;
536 /* Read the first line of each file. */
537 initseq (&seq1);
538 getseq (fp1, &seq1);
539 initseq (&seq2);
540 getseq (fp2, &seq2);
542 while (seq1.count && seq2.count)
544 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
545 if (diff < 0)
547 if (print_unpairables_1)
548 prjoin (&seq1.lines[0], &uni_blank);
549 freeline (&seq1.lines[0]);
550 seq1.count = 0;
551 getseq (fp1, &seq1);
552 continue;
554 if (diff > 0)
556 if (print_unpairables_2)
557 prjoin (&uni_blank, &seq2.lines[0]);
558 freeline (&seq2.lines[0]);
559 seq2.count = 0;
560 getseq (fp2, &seq2);
561 continue;
564 /* Keep reading lines from file1 as long as they continue to
565 match the current line from file2. */
566 eof1 = 0;
568 if (!getseq (fp1, &seq1))
570 eof1 = 1;
571 ++seq1.count;
572 break;
574 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
576 /* Keep reading lines from file2 as long as they continue to
577 match the current line from file1. */
578 eof2 = 0;
580 if (!getseq (fp2, &seq2))
582 eof2 = 1;
583 ++seq2.count;
584 break;
586 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
588 if (print_pairables)
590 for (i = 0; i < seq1.count - 1; ++i)
591 for (j = 0; j < seq2.count - 1; ++j)
592 prjoin (&seq1.lines[i], &seq2.lines[j]);
595 for (i = 0; i < seq1.count - 1; ++i)
596 freeline (&seq1.lines[i]);
597 if (!eof1)
599 seq1.lines[0] = seq1.lines[seq1.count - 1];
600 seq1.count = 1;
602 else
603 seq1.count = 0;
605 for (i = 0; i < seq2.count - 1; ++i)
606 freeline (&seq2.lines[i]);
607 if (!eof2)
609 seq2.lines[0] = seq2.lines[seq2.count - 1];
610 seq2.count = 1;
612 else
613 seq2.count = 0;
616 if (print_unpairables_1 && seq1.count)
618 prjoin (&seq1.lines[0], &uni_blank);
619 freeline (&seq1.lines[0]);
620 while (get_line (fp1, &line))
622 prjoin (&line, &uni_blank);
623 freeline (&line);
627 if (print_unpairables_2 && seq2.count)
629 prjoin (&uni_blank, &seq2.lines[0]);
630 freeline (&seq2.lines[0]);
631 while (get_line (fp2, &line))
633 prjoin (&uni_blank, &line);
634 freeline (&line);
638 delseq (&seq1);
639 delseq (&seq2);
642 /* Add a field spec for field FIELD of file FILE to `outlist'. */
644 static void
645 add_field (int file, int field)
647 struct outlist *o;
649 assert (file == 0 || file == 1 || file == 2);
650 assert (field >= 0);
652 o = (struct outlist *) xmalloc (sizeof (struct outlist));
653 o->file = file;
654 o->field = field;
655 o->next = NULL;
657 /* Add to the end of the list so the fields are in the right order. */
658 outlist_end->next = o;
659 outlist_end = o;
662 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
663 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
664 If S is valid, return zero. Otherwise, give a diagnostic, don't update
665 *FILE_INDEX or *FIELD_INDEX, and return nonzero. */
667 static int
668 decode_field_spec (const char *s, int *file_index, int *field_index)
670 int valid = 0;
672 /* The first character must be 0, 1, or 2. */
673 switch (s[0])
675 case '0':
676 if (s[1] == '\0')
678 *file_index = 0;
679 /* Leave *field_index undefined. */
680 valid = 1;
682 else
684 /* `0' must be all alone -- no `.FIELD'. */
685 error (0, 0, _("invalid field specifier: `%s'"), s);
687 break;
689 case '1':
690 case '2':
691 if (s[1] == '.' && s[2] != '\0')
693 strtol_error s_err;
694 long int tmp_long;
696 s_err = xstrtol (s + 2, NULL, 10, &tmp_long, NULL);
697 if (s_err != LONGINT_OK || tmp_long <= 0 || tmp_long > INT_MAX)
699 error (0, 0, _("invalid field number: `%s'"), s + 2);
701 else
703 *file_index = s[0] - '0';
704 /* Convert to a zero-based index. */
705 *field_index = (int) tmp_long - 1;
706 valid = 1;
709 break;
711 default:
712 error (0, 0, _("invalid file number in field spec: `%s'"), s);
713 break;
715 return !valid;
718 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
719 Return nonzero to indicate failure. */
721 static int
722 add_field_list (const char *c_str)
724 char *p, *str;
726 /* Make a writable copy of c_str. */
727 str = (char *) alloca (strlen (c_str) + 1);
728 strcpy (str, c_str);
730 p = str;
733 int invalid;
734 int file_index, field_index;
735 char *spec_item = p;
737 p = strpbrk (p, ", \t");
738 if (p)
739 *p++ = 0;
740 invalid = decode_field_spec (spec_item, &file_index, &field_index);
741 if (invalid)
742 return 1;
743 add_field (file_index, field_index);
744 uni_blank.nfields = max (uni_blank.nfields, field_index);
746 while (p);
747 return 0;
750 /* Create a blank line with COUNT fields separated by tabs. */
752 void
753 make_blank (struct line *blank, int count)
755 int i;
756 blank->nfields = count;
757 blank->beg = xmalloc (blank->nfields + 1);
758 blank->fields = (struct field *) xmalloc (sizeof (struct field) * count);
759 for (i = 0; i < blank->nfields; i++)
761 blank->beg[i] = '\t';
762 blank->fields[i].beg = &blank->beg[i];
763 blank->fields[i].len = 0;
765 blank->beg[i] = '\0';
766 blank->lim = &blank->beg[i];
769 void
770 main (int argc, char **argv)
772 char *names[2];
773 FILE *fp1, *fp2;
774 int optc, prev_optc = 0, nfiles;
776 program_name = argv[0];
777 setlocale (LC_ALL, "");
778 bindtextdomain (PACKAGE, LOCALEDIR);
779 textdomain (PACKAGE);
781 /* Initialize this before parsing options. In parsing options,
782 it may be increased. */
783 uni_blank.nfields = 1;
785 parse_long_options (argc, argv, "join", version_string, usage);
787 nfiles = 0;
788 print_pairables = 1;
790 while ((optc = getopt_long_only (argc, argv, "-a:e:i1:2:o:t:v:", longopts,
791 (int *) 0)) != EOF)
793 long int val;
795 switch (optc)
797 case 0:
798 break;
800 case 'v':
801 print_pairables = 0;
802 /* Fall through. */
804 case 'a':
805 if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK
806 || (val != 1 && val != 2))
807 error (2, 0, _("invalid field number: `%s'"), optarg);
808 if (val == 1)
809 print_unpairables_1 = 1;
810 else
811 print_unpairables_2 = 1;
812 break;
814 case 'e':
815 empty_filler = optarg;
816 break;
818 case 'i':
819 ignore_case = 1;
820 break;
822 case '1':
823 if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK
824 || val <= 0 || val > INT_MAX)
826 error (2, 0, _("invalid field number for file 1: `%s'"), optarg);
828 join_field_1 = (int) val - 1;
829 break;
831 case '2':
832 if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK
833 || val <= 0 || val > INT_MAX)
834 error (2, 0, _("invalid field number for file 2: `%s'"), optarg);
835 join_field_2 = (int) val - 1;
836 break;
838 case 'j':
839 if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK
840 || val <= 0 || val > INT_MAX)
841 error (2, 0, _("invalid field number: `%s'"), optarg);
842 join_field_1 = join_field_2 = (int) val - 1;
843 break;
845 case 'o':
846 if (add_field_list (optarg))
847 exit (1);
848 break;
850 case 't':
851 tab = *optarg;
852 break;
854 case 1: /* Non-option argument. */
855 if (prev_optc == 'o' && optind <= argc - 2)
857 if (add_field_list (optarg))
858 exit (1);
860 /* Might be continuation of args to -o. */
861 continue; /* Don't change `prev_optc'. */
864 if (nfiles > 1)
866 error (0, 0, _("too many non-option arguments"));
867 usage (1);
869 names[nfiles++] = optarg;
870 break;
872 case '?':
873 usage (1);
875 prev_optc = optc;
878 /* Now that we've seen the options, we can construct the blank line
879 structure. */
880 make_blank (&uni_blank, uni_blank.nfields);
882 if (nfiles != 2)
884 error (0, 0, _("too few non-option arguments"));
885 usage (1);
888 fp1 = strcmp (names[0], "-") ? fopen (names[0], "r") : stdin;
889 if (!fp1)
890 error (1, errno, "%s", names[0]);
891 fp2 = strcmp (names[1], "-") ? fopen (names[1], "r") : stdin;
892 if (!fp2)
893 error (1, errno, "%s", names[1]);
894 if (fp1 == fp2)
895 error (1, errno, _("both files cannot be standard input"));
896 join (fp1, fp2);
898 if (fp1 != stdin && fclose (fp1) == EOF)
899 error (1, errno, "%s", names[0]);
900 if (fp2 != stdin && fclose (fp2) == EOF)
901 error (1, errno, "%s", names[1]);
902 if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
903 error (1, errno, "-");
904 if (ferror (stdout) || fclose (stdout) == EOF)
905 error (1, errno, _("write error"));
907 exit (0);