*** empty log message ***
[coreutils.git] / src / join.c
blob7b24dc20e1e962e575f2203fd38b27f6853c572c
1 /* join - join lines of two files on a common field
2 Copyright (C) 91, 1995-1999 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
20 #include <config.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <sys/types.h>
25 #include <getopt.h>
27 #include "system.h"
28 #include "error.h"
29 #include "hard-locale.h"
30 #include "linebuffer.h"
31 #include "memcasecmp.h"
32 #include "memcoll.h"
33 #include "xstrtol.h"
35 /* The official name of this program (e.g., no `g' prefix). */
36 #define PROGRAM_NAME "join"
38 #define AUTHORS "Mike Haertel"
40 #define join system_join
42 /* Undefine, to avoid warning about redefinition on some systems. */
43 #undef min
44 #undef max
45 #define min(A, B) ((A) < (B) ? (A) : (B))
46 #define max(A, B) ((A) > (B) ? (A) : (B))
48 /* An element of the list identifying which fields to print for each
49 output line. */
50 struct outlist
52 /* File number: 0, 1, or 2. 0 means use the join field.
53 1 means use the first file argument, 2 the second. */
54 int file;
56 /* Field index (zero-based), specified only when FILE is 1 or 2. */
57 int field;
59 struct outlist *next;
62 /* A field of a line. */
63 struct field
65 const unsigned char *beg; /* First character in field. */
66 size_t len; /* The length of the field. */
69 /* A line read from an input file. */
70 struct line
72 struct linebuffer buf; /* The line itself. */
73 int nfields; /* Number of elements in `fields'. */
74 int nfields_allocated; /* Number of elements in `fields'. */
75 struct field *fields;
78 /* One or more consecutive lines read from a file that all have the
79 same join field value. */
80 struct seq
82 int count; /* Elements used in `lines'. */
83 int alloc; /* Elements allocated in `lines'. */
84 struct line *lines;
87 /* The name this program was run with. */
88 char *program_name;
90 #ifdef ENABLE_NLS
91 /* Nonzero if the LC_COLLATE locale is hard. */
92 static int hard_LC_COLLATE;
93 #endif
95 /* If nonzero, print unpairable lines in file 1 or 2. */
96 static int print_unpairables_1, print_unpairables_2;
98 /* If nonzero, print pairable lines. */
99 static int print_pairables;
101 /* Empty output field filler. */
102 static char *empty_filler;
104 /* Field to join on. */
105 static int join_field_1, join_field_2;
107 /* List of fields to print. */
108 static struct outlist outlist_head;
110 /* Last element in `outlist', where a new element can be added. */
111 static struct outlist *outlist_end = &outlist_head;
113 /* Tab character separating fields; if this is NUL fields are separated
114 by any nonempty string of white space, otherwise by exactly one
115 tab character. */
116 static char tab;
118 /* When using getopt_long_only, no long option can start with
119 a character that is a short option. */
120 static struct option const longopts[] =
122 {"ignore-case", no_argument, NULL, 'i'},
123 {"j", required_argument, NULL, 'j'},
124 {"j1", required_argument, NULL, '1'},
125 {"j2", required_argument, NULL, '2'},
126 {GETOPT_HELP_OPTION_DECL},
127 {GETOPT_VERSION_OPTION_DECL},
128 {NULL, 0, NULL, 0}
131 /* Used to print non-joining lines */
132 static struct line uni_blank;
134 /* If nonzero, ignore case when comparing join fields. */
135 static int ignore_case;
137 void
138 usage (int status)
140 if (status != 0)
141 fprintf (stderr, _("Try `%s --help' for more information.\n"),
142 program_name);
143 else
145 printf (_("\
146 Usage: %s [OPTION]... FILE1 FILE2\n\
148 program_name);
149 printf (_("\
150 For each pair of input lines with identical join fields, write a line to\n\
151 standard output. The default join field is the first, delimited\n\
152 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
154 -a SIDE print unpairable lines coming from file SIDE\n\
155 -e EMPTY replace missing input fields with EMPTY\n\
156 -i, --ignore-case ignore differences in case when comparing fields\n\
157 -j FIELD (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
158 -j1 FIELD (obsolescent) equivalent to `-1 FIELD'\n\
159 -j2 FIELD (obsolescent) equivalent to `-2 FIELD'\n\
160 -o FORMAT obey FORMAT while constructing output line\n\
161 -t CHAR use CHAR as input and output field separator\n\
162 -v SIDE like -a SIDE, but suppress joined output lines\n\
163 -1 FIELD join on this FIELD of file 1\n\
164 -2 FIELD join on this FIELD of file 2\n\
165 --help display this help and exit\n\
166 --version output version information and exit\n\
168 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
169 else fields are separated by CHAR. Any FIELD is a field number counted\n\
170 from 1. FORMAT is one or more comma or blank separated specifications,\n\
171 each being `SIDE.FIELD' or `0'. Default FORMAT outputs the join field,\n\
172 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
173 separated by CHAR.\n\
174 "));
175 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
177 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
180 static void
181 ADD_FIELD (struct line *line, const unsigned char *field, size_t len)
183 if (line->nfields >= line->nfields_allocated)
185 line->nfields_allocated = (3 * line->nfields_allocated) / 2 + 1;
186 line->fields = (struct field *) xrealloc ((char *) line->fields,
187 (line->nfields_allocated
188 * sizeof (struct field)));
190 line->fields[line->nfields].beg = field;
191 line->fields[line->nfields].len = len;
192 ++(line->nfields);
195 /* Fill in the `fields' structure in LINE. */
197 static void
198 xfields (struct line *line)
200 int i;
201 unsigned char *ptr0 = (unsigned char *) line->buf.buffer;
202 unsigned char *ptr;
203 unsigned char *lim;
205 ptr = ptr0;
206 lim = ptr0 + line->buf.length - 1;
208 if (!tab)
210 /* Skip leading blanks before the first field. */
211 while (ptr < lim && ISBLANK (*ptr))
212 ++ptr;
215 for (i = 0; ptr < lim; ++i)
217 if (tab)
219 unsigned char *beg;
221 beg = ptr;
222 while (ptr < lim && *ptr != tab)
223 ++ptr;
224 ADD_FIELD (line, beg, ptr - beg);
225 if (ptr < lim)
226 ++ptr;
228 else
230 unsigned char *beg;
232 beg = ptr;
233 while (ptr < lim && !ISBLANK (*ptr))
234 ++ptr;
235 ADD_FIELD (line, beg, ptr - beg);
236 while (ptr < lim && ISBLANK (*ptr))
237 ++ptr;
241 if (ptr != ptr0 && ((!tab && ISBLANK (ptr[-1])) || ptr[-1] == tab))
243 /* Add one more (empty) field because the last character of the
244 line was a delimiter. */
245 ADD_FIELD (line, NULL, 0);
249 /* Read a line from FP into LINE and split it into fields.
250 Return 0 if EOF, 1 otherwise. */
252 static int
253 get_line (FILE *fp, struct line *line)
255 initbuffer (&line->buf);
257 if (! readline (&line->buf, fp))
259 free (line->buf.buffer);
260 line->buf.buffer = NULL;
261 return 0;
264 line->nfields_allocated = 0;
265 line->nfields = 0;
266 line->fields = NULL;
267 xfields (line);
268 return 1;
271 static void
272 freeline (struct line *line)
274 free ((char *) line->fields);
275 free (line->buf.buffer);
276 line->buf.buffer = NULL;
279 static void
280 initseq (struct seq *seq)
282 seq->count = 0;
283 seq->alloc = 1;
284 seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
287 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
289 static int
290 getseq (FILE *fp, struct seq *seq)
292 if (seq->count == seq->alloc)
294 seq->alloc *= 2;
295 seq->lines = (struct line *)
296 xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
299 if (get_line (fp, &seq->lines[seq->count]))
301 ++seq->count;
302 return 1;
304 return 0;
307 static void
308 delseq (struct seq *seq)
310 int i;
311 for (i = 0; i < seq->count; i++)
312 if (seq->lines[i].buf.buffer)
313 freeline (&seq->lines[i]);
314 free ((char *) seq->lines);
317 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
318 >0 if it compares greater; 0 if it compares equal. */
320 static int
321 keycmp (struct line *line1, struct line *line2)
323 /* Start of field to compare in each file. */
324 const unsigned char *beg1, *beg2;
326 int len1, len2; /* Length of fields to compare. */
327 int diff;
329 if (join_field_1 < line1->nfields)
331 beg1 = line1->fields[join_field_1].beg;
332 len1 = line1->fields[join_field_1].len;
334 else
336 beg1 = NULL;
337 len1 = 0;
340 if (join_field_2 < line2->nfields)
342 beg2 = line2->fields[join_field_2].beg;
343 len2 = line2->fields[join_field_2].len;
345 else
347 beg2 = NULL;
348 len2 = 0;
351 if (len1 == 0)
352 return len2 == 0 ? 0 : -1;
353 if (len2 == 0)
354 return 1;
356 /* Use an if-statement here rather than a function variable to
357 avoid portability hassles of getting a non-conflicting declaration
358 of memcmp. */
359 if (ignore_case)
361 /* FIXME: ignore_case does not work with NLS (in particular,
362 with multibyte chars). */
363 diff = memcasecmp (beg1, beg2, min (len1, len2));
365 else
367 #ifdef ENABLE_NLS
368 if (hard_LC_COLLATE)
369 return memcoll ((char *) beg1, len1, (char *) beg2, len2);
370 #endif
371 diff = memcmp (beg1, beg2, min (len1, len2));
374 if (diff)
375 return diff;
376 return len1 - len2;
379 /* Print field N of LINE if it exists and is nonempty, otherwise
380 `empty_filler' if it is nonempty. */
382 static void
383 prfield (int n, struct line *line)
385 int len;
387 if (n < line->nfields)
389 len = line->fields[n].len;
390 if (len)
391 fwrite (line->fields[n].beg, 1, len, stdout);
392 else if (empty_filler)
393 fputs (empty_filler, stdout);
395 else if (empty_filler)
396 fputs (empty_filler, stdout);
399 /* Print the join of LINE1 and LINE2. */
401 static void
402 prjoin (struct line *line1, struct line *line2)
404 const struct outlist *outlist;
406 outlist = outlist_head.next;
407 if (outlist)
409 const struct outlist *o;
411 o = outlist;
412 while (1)
414 int field;
415 struct line *line;
417 if (o->file == 0)
419 if (line1 == &uni_blank)
421 line = line2;
422 field = join_field_2;
424 else
426 line = line1;
427 field = join_field_1;
430 else
432 line = (o->file == 1 ? line1 : line2);
433 assert (o->field >= 0);
434 field = o->field;
436 prfield (field, line);
437 o = o->next;
438 if (o == NULL)
439 break;
440 putchar (tab ? tab : ' ');
442 putchar ('\n');
444 else
446 int i;
448 if (line1 == &uni_blank)
450 struct line *t;
451 t = line1;
452 line1 = line2;
453 line2 = t;
455 prfield (join_field_1, line1);
456 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
458 putchar (tab ? tab : ' ');
459 prfield (i, line1);
461 for (i = join_field_1 + 1; i < line1->nfields; ++i)
463 putchar (tab ? tab : ' ');
464 prfield (i, line1);
467 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
469 putchar (tab ? tab : ' ');
470 prfield (i, line2);
472 for (i = join_field_2 + 1; i < line2->nfields; ++i)
474 putchar (tab ? tab : ' ');
475 prfield (i, line2);
477 putchar ('\n');
481 /* Print the join of the files in FP1 and FP2. */
483 static void
484 join (FILE *fp1, FILE *fp2)
486 struct seq seq1, seq2;
487 struct line line;
488 int diff, i, j, eof1, eof2;
490 /* Read the first line of each file. */
491 initseq (&seq1);
492 getseq (fp1, &seq1);
493 initseq (&seq2);
494 getseq (fp2, &seq2);
496 while (seq1.count && seq2.count)
498 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
499 if (diff < 0)
501 if (print_unpairables_1)
502 prjoin (&seq1.lines[0], &uni_blank);
503 freeline (&seq1.lines[0]);
504 seq1.count = 0;
505 getseq (fp1, &seq1);
506 continue;
508 if (diff > 0)
510 if (print_unpairables_2)
511 prjoin (&uni_blank, &seq2.lines[0]);
512 freeline (&seq2.lines[0]);
513 seq2.count = 0;
514 getseq (fp2, &seq2);
515 continue;
518 /* Keep reading lines from file1 as long as they continue to
519 match the current line from file2. */
520 eof1 = 0;
522 if (!getseq (fp1, &seq1))
524 eof1 = 1;
525 ++seq1.count;
526 break;
528 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
530 /* Keep reading lines from file2 as long as they continue to
531 match the current line from file1. */
532 eof2 = 0;
534 if (!getseq (fp2, &seq2))
536 eof2 = 1;
537 ++seq2.count;
538 break;
540 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
542 if (print_pairables)
544 for (i = 0; i < seq1.count - 1; ++i)
545 for (j = 0; j < seq2.count - 1; ++j)
546 prjoin (&seq1.lines[i], &seq2.lines[j]);
549 for (i = 0; i < seq1.count - 1; ++i)
550 freeline (&seq1.lines[i]);
551 if (!eof1)
553 seq1.lines[0] = seq1.lines[seq1.count - 1];
554 seq1.count = 1;
556 else
557 seq1.count = 0;
559 for (i = 0; i < seq2.count - 1; ++i)
560 freeline (&seq2.lines[i]);
561 if (!eof2)
563 seq2.lines[0] = seq2.lines[seq2.count - 1];
564 seq2.count = 1;
566 else
567 seq2.count = 0;
570 if (print_unpairables_1 && seq1.count)
572 prjoin (&seq1.lines[0], &uni_blank);
573 freeline (&seq1.lines[0]);
574 while (get_line (fp1, &line))
576 prjoin (&line, &uni_blank);
577 freeline (&line);
581 if (print_unpairables_2 && seq2.count)
583 prjoin (&uni_blank, &seq2.lines[0]);
584 freeline (&seq2.lines[0]);
585 while (get_line (fp2, &line))
587 prjoin (&uni_blank, &line);
588 freeline (&line);
592 delseq (&seq1);
593 delseq (&seq2);
596 /* Add a field spec for field FIELD of file FILE to `outlist'. */
598 static void
599 add_field (int file, int field)
601 struct outlist *o;
603 assert (file == 0 || file == 1 || file == 2);
604 assert (file == 0 ? field < 0 : field >= 0);
606 o = (struct outlist *) xmalloc (sizeof (struct outlist));
607 o->file = file;
608 o->field = field;
609 o->next = NULL;
611 /* Add to the end of the list so the fields are in the right order. */
612 outlist_end->next = o;
613 outlist_end = o;
616 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
617 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
618 If S is valid, return zero. Otherwise, give a diagnostic, don't update
619 *FILE_INDEX or *FIELD_INDEX, and return nonzero. */
621 static int
622 decode_field_spec (const char *s, int *file_index, int *field_index)
624 int invalid = 1;
626 /* The first character must be 0, 1, or 2. */
627 switch (s[0])
629 case '0':
630 if (s[1] == '\0')
632 *file_index = 0;
633 /* Give *field_index an invalid value. */
634 *field_index = -1;
635 invalid = 0;
637 else
639 /* `0' must be all alone -- no `.FIELD'. */
640 error (0, 0, _("invalid field specifier: `%s'"), s);
642 break;
644 case '1':
645 case '2':
646 if (s[1] == '.' && s[2] != '\0')
648 strtol_error s_err;
649 long int tmp_long;
651 s_err = xstrtol (s + 2, NULL, 10, &tmp_long, "");
652 if (s_err != LONGINT_OK || tmp_long <= 0 || tmp_long > INT_MAX)
654 error (0, 0, _("invalid field number: `%s'"), s + 2);
656 else
658 *file_index = s[0] - '0';
659 /* Convert to a zero-based index. */
660 *field_index = (int) tmp_long - 1;
661 invalid = 0;
664 break;
666 default:
667 error (0, 0, _("invalid file number in field spec: `%s'"), s);
668 break;
670 return invalid;
673 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
674 Return nonzero to indicate failure. */
676 static int
677 add_field_list (const char *c_str)
679 char *p, *str;
681 /* Make a writable copy of c_str. */
682 str = (char *) alloca (strlen (c_str) + 1);
683 strcpy (str, c_str);
685 p = str;
688 int invalid;
689 int file_index, field_index;
690 char *spec_item = p;
692 p = strpbrk (p, ", \t");
693 if (p)
694 *p++ = 0;
695 invalid = decode_field_spec (spec_item, &file_index, &field_index);
696 if (invalid)
697 return 1;
698 add_field (file_index, field_index);
699 uni_blank.nfields = max (uni_blank.nfields, field_index);
701 while (p);
702 return 0;
705 /* Create a blank line with COUNT fields separated by tabs. */
707 void
708 make_blank (struct line *blank, int count)
710 int i;
711 char *buffer;
712 struct field *fields;
713 blank->nfields = count;
714 blank->buf.size = blank->buf.length = count + 1;
715 blank->buf.buffer = buffer = xmalloc (blank->buf.size);
716 blank->fields = fields =
717 (struct field *) xmalloc (sizeof (struct field) * count);
718 for (i = 0; i < count; i++)
720 buffer[i] = '\t';
721 fields[i].beg = &buffer[i];
722 fields[i].len = 0;
724 buffer[i] = '\n';
728 main (int argc, char **argv)
730 char *names[2];
731 FILE *fp1, *fp2;
732 int optc, prev_optc = 0, nfiles;
734 program_name = argv[0];
735 setlocale (LC_ALL, "");
736 bindtextdomain (PACKAGE, LOCALEDIR);
737 textdomain (PACKAGE);
739 #ifdef ENABLE_NLS
740 hard_LC_COLLATE = hard_locale (LC_COLLATE);
741 #endif
743 /* Initialize this before parsing options. In parsing options,
744 it may be increased. */
745 uni_blank.nfields = 1;
747 nfiles = 0;
748 print_pairables = 1;
750 while ((optc = getopt_long_only (argc, argv, "-a:e:i1:2:o:t:v:", longopts,
751 NULL)) != -1)
753 long int val;
755 switch (optc)
757 case 0:
758 break;
760 case 'v':
761 print_pairables = 0;
762 /* Fall through. */
764 case 'a':
765 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
766 || (val != 1 && val != 2))
767 error (EXIT_FAILURE, 0, _("invalid field number: `%s'"), optarg);
768 if (val == 1)
769 print_unpairables_1 = 1;
770 else
771 print_unpairables_2 = 1;
772 break;
774 case 'e':
775 empty_filler = optarg;
776 break;
778 case 'i':
779 ignore_case = 1;
780 break;
782 case '1':
783 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
784 || val <= 0 || val > INT_MAX)
786 error (EXIT_FAILURE, 0,
787 _("invalid field number for file 1: `%s'"), optarg);
789 join_field_1 = (int) val - 1;
790 break;
792 case '2':
793 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
794 || val <= 0 || val > INT_MAX)
795 error (EXIT_FAILURE, 0,
796 _("invalid field number for file 2: `%s'"), optarg);
797 join_field_2 = (int) val - 1;
798 break;
800 case 'j':
801 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
802 || val <= 0 || val > INT_MAX)
803 error (EXIT_FAILURE, 0, _("invalid field number: `%s'"), optarg);
804 join_field_1 = join_field_2 = (int) val - 1;
805 break;
807 case 'o':
808 if (add_field_list (optarg))
809 exit (EXIT_FAILURE);
810 break;
812 case 't':
813 tab = *optarg;
814 break;
816 case 1: /* Non-option argument. */
817 if (prev_optc == 'o' && optind <= argc - 2)
819 if (add_field_list (optarg))
820 exit (EXIT_FAILURE);
822 /* Might be continuation of args to -o. */
823 continue; /* Don't change `prev_optc'. */
826 if (nfiles > 1)
828 error (0, 0, _("too many non-option arguments"));
829 usage (1);
831 names[nfiles++] = optarg;
832 break;
834 case_GETOPT_HELP_CHAR;
836 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
838 default:
839 usage (1);
841 prev_optc = optc;
844 /* Now that we've seen the options, we can construct the blank line
845 structure. */
846 make_blank (&uni_blank, uni_blank.nfields);
848 if (nfiles != 2)
850 error (0, 0, _("too few non-option arguments"));
851 usage (1);
854 fp1 = STREQ (names[0], "-") ? stdin : fopen (names[0], "r");
855 if (!fp1)
856 error (EXIT_FAILURE, errno, "%s", names[0]);
857 fp2 = STREQ (names[1], "-") ? stdin : fopen (names[1], "r");
858 if (!fp2)
859 error (EXIT_FAILURE, errno, "%s", names[1]);
860 if (fp1 == fp2)
861 error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
862 join (fp1, fp2);
864 if (fp1 != stdin && fclose (fp1) == EOF)
865 error (EXIT_FAILURE, errno, "%s", names[0]);
866 if (fp2 != stdin && fclose (fp2) == EOF)
867 error (EXIT_FAILURE, errno, "%s", names[1]);
868 if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
869 error (EXIT_FAILURE, errno, "-");
870 if (ferror (stdout) || fclose (stdout) == EOF)
871 error (EXIT_FAILURE, errno, _("write error"));
873 exit (EXIT_SUCCESS);