*** empty log message ***
[coreutils.git] / src / join.c
blobb6f2c692267869fc590f84638fd606a4765b740e
1 /* join - join lines of two files on a common field
2 Copyright (C) 91, 1995-2002 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
20 #include <config.h>
22 #include <stdio.h>
23 #include <assert.h>
24 #include <sys/types.h>
25 #include <getopt.h>
27 #include "system.h"
28 #include "closeout.h"
29 #include "error.h"
30 #include "hard-locale.h"
31 #include "linebuffer.h"
32 #include "memcasecmp.h"
33 #include "xmemcoll.h"
34 #include "xstrtol.h"
36 /* The official name of this program (e.g., no `g' prefix). */
37 #define PROGRAM_NAME "join"
39 #define AUTHORS "Mike Haertel"
41 #define join system_join
43 /* Undefine, to avoid warning about redefinition on some systems. */
44 #undef min
45 #undef max
46 #define min(A, B) ((A) < (B) ? (A) : (B))
47 #define max(A, B) ((A) > (B) ? (A) : (B))
49 /* An element of the list identifying which fields to print for each
50 output line. */
51 struct outlist
53 /* File number: 0, 1, or 2. 0 means use the join field.
54 1 means use the first file argument, 2 the second. */
55 int file;
57 /* Field index (zero-based), specified only when FILE is 1 or 2. */
58 int field;
60 struct outlist *next;
63 /* A field of a line. */
64 struct field
66 const unsigned char *beg; /* First character in field. */
67 size_t len; /* The length of the field. */
70 /* A line read from an input file. */
71 struct line
73 struct linebuffer buf; /* The line itself. */
74 int nfields; /* Number of elements in `fields'. */
75 int nfields_allocated; /* Number of elements in `fields'. */
76 struct field *fields;
79 /* One or more consecutive lines read from a file that all have the
80 same join field value. */
81 struct seq
83 int count; /* Elements used in `lines'. */
84 int alloc; /* Elements allocated in `lines'. */
85 struct line *lines;
88 /* The name this program was run with. */
89 char *program_name;
91 /* Nonzero if the LC_COLLATE locale is hard. */
92 static int hard_LC_COLLATE;
94 /* If nonzero, print unpairable lines in file 1 or 2. */
95 static int print_unpairables_1, print_unpairables_2;
97 /* If nonzero, print pairable lines. */
98 static int print_pairables;
100 /* Empty output field filler. */
101 static char *empty_filler;
103 /* Field to join on. */
104 static int join_field_1, join_field_2;
106 /* List of fields to print. */
107 static struct outlist outlist_head;
109 /* Last element in `outlist', where a new element can be added. */
110 static struct outlist *outlist_end = &outlist_head;
112 /* Tab character separating fields; if this is NUL fields are separated
113 by any nonempty string of white space, otherwise by exactly one
114 tab character. */
115 static unsigned char tab;
117 /* When using getopt_long_only, no long option can start with
118 a character that is a short option. */
119 static struct option const longopts[] =
121 {"ignore-case", no_argument, NULL, 'i'},
122 {"j", required_argument, NULL, 'j'},
123 {"j1", required_argument, NULL, '1'},
124 {"j2", required_argument, NULL, '2'},
125 {GETOPT_HELP_OPTION_DECL},
126 {GETOPT_VERSION_OPTION_DECL},
127 {NULL, 0, NULL, 0}
130 /* Used to print non-joining lines */
131 static struct line uni_blank;
133 /* If nonzero, ignore case when comparing join fields. */
134 static int ignore_case;
136 void
137 usage (int status)
139 if (status != 0)
140 fprintf (stderr, _("Try `%s --help' for more information.\n"),
141 program_name);
142 else
144 printf (_("\
145 Usage: %s [OPTION]... FILE1 FILE2\n\
147 program_name);
148 fputs (_("\
149 For each pair of input lines with identical join fields, write a line to\n\
150 standard output. The default join field is the first, delimited\n\
151 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
153 -a SIDE print unpairable lines coming from file SIDE\n\
154 -e EMPTY replace missing input fields with EMPTY\n\
155 "), stdout);
156 fputs (_("\
157 -i, --ignore-case ignore differences in case when comparing fields\n\
158 -j FIELD (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
159 -j1 FIELD (obsolescent) equivalent to `-1 FIELD'\n\
160 -j2 FIELD (obsolescent) equivalent to `-2 FIELD'\n\
161 -o FORMAT obey FORMAT while constructing output line\n\
162 -t CHAR use CHAR as input and output field separator\n\
163 "), stdout);
164 fputs (_("\
165 -v SIDE like -a SIDE, but suppress joined output lines\n\
166 -1 FIELD join on this FIELD of file 1\n\
167 -2 FIELD join on this FIELD of file 2\n\
168 "), stdout);
169 fputs (HELP_OPTION_DESCRIPTION, stdout);
170 fputs (VERSION_OPTION_DESCRIPTION, stdout);
171 fputs (_("\
173 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
174 else fields are separated by CHAR. Any FIELD is a field number counted\n\
175 from 1. FORMAT is one or more comma or blank separated specifications,\n\
176 each being `SIDE.FIELD' or `0'. Default FORMAT outputs the join field,\n\
177 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
178 separated by CHAR.\n\
179 "), stdout);
180 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
182 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
185 static void
186 ADD_FIELD (struct line *line, const unsigned char *field, size_t len)
188 if (line->nfields >= line->nfields_allocated)
190 line->nfields_allocated = (3 * line->nfields_allocated) / 2 + 1;
191 line->fields = (struct field *) xrealloc ((char *) line->fields,
192 (line->nfields_allocated
193 * sizeof (struct field)));
195 line->fields[line->nfields].beg = field;
196 line->fields[line->nfields].len = len;
197 ++(line->nfields);
200 /* Fill in the `fields' structure in LINE. */
202 static void
203 xfields (struct line *line)
205 int i;
206 unsigned char *ptr0 = (unsigned char *) line->buf.buffer;
207 unsigned char *ptr;
208 unsigned char *lim;
210 ptr = ptr0;
211 lim = ptr0 + line->buf.length - 1;
213 if (!tab)
215 /* Skip leading blanks before the first field. */
216 while (ptr < lim && ISBLANK (*ptr))
217 ++ptr;
220 for (i = 0; ptr < lim; ++i)
222 if (tab)
224 unsigned char *beg;
226 beg = ptr;
227 while (ptr < lim && *ptr != tab)
228 ++ptr;
229 ADD_FIELD (line, beg, ptr - beg);
230 if (ptr < lim)
231 ++ptr;
233 else
235 unsigned char *beg;
237 beg = ptr;
238 while (ptr < lim && !ISBLANK (*ptr))
239 ++ptr;
240 ADD_FIELD (line, beg, ptr - beg);
241 while (ptr < lim && ISBLANK (*ptr))
242 ++ptr;
246 if (ptr != ptr0 && ((!tab && ISBLANK (ptr[-1])) || ptr[-1] == tab))
248 /* Add one more (empty) field because the last character of the
249 line was a delimiter. */
250 ADD_FIELD (line, NULL, 0);
254 /* Read a line from FP into LINE and split it into fields.
255 Return 0 if EOF, 1 otherwise. */
257 static int
258 get_line (FILE *fp, struct line *line)
260 initbuffer (&line->buf);
262 if (! readline (&line->buf, fp))
264 free (line->buf.buffer);
265 line->buf.buffer = NULL;
266 return 0;
269 line->nfields_allocated = 0;
270 line->nfields = 0;
271 line->fields = NULL;
272 xfields (line);
273 return 1;
276 static void
277 freeline (struct line *line)
279 free ((char *) line->fields);
280 free (line->buf.buffer);
281 line->buf.buffer = NULL;
284 static void
285 initseq (struct seq *seq)
287 seq->count = 0;
288 seq->alloc = 1;
289 seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
292 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
294 static int
295 getseq (FILE *fp, struct seq *seq)
297 if (seq->count == seq->alloc)
299 seq->alloc *= 2;
300 seq->lines = (struct line *)
301 xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
304 if (get_line (fp, &seq->lines[seq->count]))
306 ++seq->count;
307 return 1;
309 return 0;
312 static void
313 delseq (struct seq *seq)
315 int i;
316 for (i = 0; i < seq->count; i++)
317 if (seq->lines[i].buf.buffer)
318 freeline (&seq->lines[i]);
319 free ((char *) seq->lines);
322 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
323 >0 if it compares greater; 0 if it compares equal.
324 Report an error and exit if the comparison fails. */
326 static int
327 keycmp (struct line *line1, struct line *line2)
329 /* Start of field to compare in each file. */
330 const unsigned char *beg1, *beg2;
332 size_t len1, len2; /* Length of fields to compare. */
333 int diff;
335 if (join_field_1 < line1->nfields)
337 beg1 = line1->fields[join_field_1].beg;
338 len1 = line1->fields[join_field_1].len;
340 else
342 beg1 = NULL;
343 len1 = 0;
346 if (join_field_2 < line2->nfields)
348 beg2 = line2->fields[join_field_2].beg;
349 len2 = line2->fields[join_field_2].len;
351 else
353 beg2 = NULL;
354 len2 = 0;
357 if (len1 == 0)
358 return len2 == 0 ? 0 : -1;
359 if (len2 == 0)
360 return 1;
362 /* Use an if-statement here rather than a function variable to
363 avoid portability hassles of getting a non-conflicting declaration
364 of memcmp. */
365 if (ignore_case)
367 /* FIXME: ignore_case does not work with NLS (in particular,
368 with multibyte chars). */
369 diff = memcasecmp (beg1, beg2, min (len1, len2));
371 else
373 if (HAVE_SETLOCALE && hard_LC_COLLATE)
374 return xmemcoll ((char *) beg1, len1, (char *) beg2, len2);
375 diff = memcmp (beg1, beg2, min (len1, len2));
378 if (diff)
379 return diff;
380 return len1 < len2 ? -1 : len1 != len2;
383 /* Print field N of LINE if it exists and is nonempty, otherwise
384 `empty_filler' if it is nonempty. */
386 static void
387 prfield (int n, struct line *line)
389 size_t len;
391 if (n < line->nfields)
393 len = line->fields[n].len;
394 if (len)
395 fwrite (line->fields[n].beg, 1, len, stdout);
396 else if (empty_filler)
397 fputs (empty_filler, stdout);
399 else if (empty_filler)
400 fputs (empty_filler, stdout);
403 /* Print the join of LINE1 and LINE2. */
405 static void
406 prjoin (struct line *line1, struct line *line2)
408 const struct outlist *outlist;
410 outlist = outlist_head.next;
411 if (outlist)
413 const struct outlist *o;
415 o = outlist;
416 while (1)
418 int field;
419 struct line *line;
421 if (o->file == 0)
423 if (line1 == &uni_blank)
425 line = line2;
426 field = join_field_2;
428 else
430 line = line1;
431 field = join_field_1;
434 else
436 line = (o->file == 1 ? line1 : line2);
437 assert (o->field >= 0);
438 field = o->field;
440 prfield (field, line);
441 o = o->next;
442 if (o == NULL)
443 break;
444 putchar (tab ? tab : ' ');
446 putchar ('\n');
448 else
450 int i;
452 if (line1 == &uni_blank)
454 struct line *t;
455 t = line1;
456 line1 = line2;
457 line2 = t;
459 prfield (join_field_1, line1);
460 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
462 putchar (tab ? tab : ' ');
463 prfield (i, line1);
465 for (i = join_field_1 + 1; i < line1->nfields; ++i)
467 putchar (tab ? tab : ' ');
468 prfield (i, line1);
471 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
473 putchar (tab ? tab : ' ');
474 prfield (i, line2);
476 for (i = join_field_2 + 1; i < line2->nfields; ++i)
478 putchar (tab ? tab : ' ');
479 prfield (i, line2);
481 putchar ('\n');
485 /* Print the join of the files in FP1 and FP2. */
487 static void
488 join (FILE *fp1, FILE *fp2)
490 struct seq seq1, seq2;
491 struct line line;
492 int diff, i, j, eof1, eof2;
494 /* Read the first line of each file. */
495 initseq (&seq1);
496 getseq (fp1, &seq1);
497 initseq (&seq2);
498 getseq (fp2, &seq2);
500 while (seq1.count && seq2.count)
502 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
503 if (diff < 0)
505 if (print_unpairables_1)
506 prjoin (&seq1.lines[0], &uni_blank);
507 freeline (&seq1.lines[0]);
508 seq1.count = 0;
509 getseq (fp1, &seq1);
510 continue;
512 if (diff > 0)
514 if (print_unpairables_2)
515 prjoin (&uni_blank, &seq2.lines[0]);
516 freeline (&seq2.lines[0]);
517 seq2.count = 0;
518 getseq (fp2, &seq2);
519 continue;
522 /* Keep reading lines from file1 as long as they continue to
523 match the current line from file2. */
524 eof1 = 0;
526 if (!getseq (fp1, &seq1))
528 eof1 = 1;
529 ++seq1.count;
530 break;
532 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
534 /* Keep reading lines from file2 as long as they continue to
535 match the current line from file1. */
536 eof2 = 0;
538 if (!getseq (fp2, &seq2))
540 eof2 = 1;
541 ++seq2.count;
542 break;
544 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
546 if (print_pairables)
548 for (i = 0; i < seq1.count - 1; ++i)
549 for (j = 0; j < seq2.count - 1; ++j)
550 prjoin (&seq1.lines[i], &seq2.lines[j]);
553 for (i = 0; i < seq1.count - 1; ++i)
554 freeline (&seq1.lines[i]);
555 if (!eof1)
557 seq1.lines[0] = seq1.lines[seq1.count - 1];
558 seq1.count = 1;
560 else
561 seq1.count = 0;
563 for (i = 0; i < seq2.count - 1; ++i)
564 freeline (&seq2.lines[i]);
565 if (!eof2)
567 seq2.lines[0] = seq2.lines[seq2.count - 1];
568 seq2.count = 1;
570 else
571 seq2.count = 0;
574 if (print_unpairables_1 && seq1.count)
576 prjoin (&seq1.lines[0], &uni_blank);
577 freeline (&seq1.lines[0]);
578 while (get_line (fp1, &line))
580 prjoin (&line, &uni_blank);
581 freeline (&line);
585 if (print_unpairables_2 && seq2.count)
587 prjoin (&uni_blank, &seq2.lines[0]);
588 freeline (&seq2.lines[0]);
589 while (get_line (fp2, &line))
591 prjoin (&uni_blank, &line);
592 freeline (&line);
596 delseq (&seq1);
597 delseq (&seq2);
600 /* Add a field spec for field FIELD of file FILE to `outlist'. */
602 static void
603 add_field (int file, int field)
605 struct outlist *o;
607 assert (file == 0 || file == 1 || file == 2);
608 assert (file == 0 ? field < 0 : field >= 0);
610 o = (struct outlist *) xmalloc (sizeof (struct outlist));
611 o->file = file;
612 o->field = field;
613 o->next = NULL;
615 /* Add to the end of the list so the fields are in the right order. */
616 outlist_end->next = o;
617 outlist_end = o;
620 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
621 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
622 If S is valid, return zero. Otherwise, give a diagnostic, don't update
623 *FILE_INDEX or *FIELD_INDEX, and return nonzero. */
625 static int
626 decode_field_spec (const char *s, int *file_index, int *field_index)
628 int invalid = 1;
630 /* The first character must be 0, 1, or 2. */
631 switch (s[0])
633 case '0':
634 if (s[1] == '\0')
636 *file_index = 0;
637 /* Give *field_index an invalid value. */
638 *field_index = -1;
639 invalid = 0;
641 else
643 /* `0' must be all alone -- no `.FIELD'. */
644 error (0, 0, _("invalid field specifier: `%s'"), s);
646 break;
648 case '1':
649 case '2':
650 if (s[1] == '.' && s[2] != '\0')
652 strtol_error s_err;
653 long int tmp_long;
655 s_err = xstrtol (s + 2, NULL, 10, &tmp_long, "");
656 if (s_err != LONGINT_OK || tmp_long <= 0 || tmp_long > INT_MAX)
658 error (0, 0, _("invalid field number: `%s'"), s + 2);
660 else
662 *file_index = s[0] - '0';
663 /* Convert to a zero-based index. */
664 *field_index = (int) tmp_long - 1;
665 invalid = 0;
668 break;
670 default:
671 error (0, 0, _("invalid file number in field spec: `%s'"), s);
672 break;
674 return invalid;
677 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
678 Return nonzero to indicate failure. */
680 static int
681 add_field_list (const char *c_str)
683 char *p, *str;
685 /* Make a writable copy of c_str. */
686 str = (char *) alloca (strlen (c_str) + 1);
687 strcpy (str, c_str);
689 p = str;
692 int invalid;
693 int file_index, field_index;
694 char *spec_item = p;
696 p = strpbrk (p, ", \t");
697 if (p)
698 *p++ = 0;
699 invalid = decode_field_spec (spec_item, &file_index, &field_index);
700 if (invalid)
701 return 1;
702 add_field (file_index, field_index);
703 uni_blank.nfields = max (uni_blank.nfields, field_index);
705 while (p);
706 return 0;
709 /* Create a blank line with COUNT fields separated by tabs. */
711 static void
712 make_blank (struct line *blank, int count)
714 int i;
715 unsigned char *buffer;
716 struct field *fields;
717 blank->nfields = count;
718 blank->buf.size = blank->buf.length = count + 1;
719 blank->buf.buffer = xmalloc (blank->buf.size);
720 buffer = (unsigned char *) blank->buf.buffer;
721 blank->fields = fields =
722 (struct field *) xmalloc (sizeof (struct field) * count);
723 for (i = 0; i < count; i++)
725 buffer[i] = '\t';
726 fields[i].beg = &buffer[i];
727 fields[i].len = 0;
729 buffer[i] = '\n';
733 main (int argc, char **argv)
735 char *names[2];
736 FILE *fp1, *fp2;
737 int optc, prev_optc = 0, nfiles;
739 program_name = argv[0];
740 setlocale (LC_ALL, "");
741 bindtextdomain (PACKAGE, LOCALEDIR);
742 textdomain (PACKAGE);
743 hard_LC_COLLATE = hard_locale (LC_COLLATE);
745 atexit (close_stdout);
747 /* Initialize this before parsing options. In parsing options,
748 it may be increased. */
749 uni_blank.nfields = 1;
751 nfiles = 0;
752 print_pairables = 1;
754 while ((optc = getopt_long_only (argc, argv, "-a:e:i1:2:o:t:v:", longopts,
755 NULL)) != -1)
757 long int val;
759 switch (optc)
761 case 0:
762 break;
764 case 'v':
765 print_pairables = 0;
766 /* Fall through. */
768 case 'a':
769 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
770 || (val != 1 && val != 2))
771 error (EXIT_FAILURE, 0, _("invalid field number: `%s'"), optarg);
772 if (val == 1)
773 print_unpairables_1 = 1;
774 else
775 print_unpairables_2 = 1;
776 break;
778 case 'e':
779 empty_filler = optarg;
780 break;
782 case 'i':
783 ignore_case = 1;
784 break;
786 case '1':
787 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
788 || val <= 0 || val > INT_MAX)
790 error (EXIT_FAILURE, 0,
791 _("invalid field number for file 1: `%s'"), optarg);
793 join_field_1 = (int) val - 1;
794 break;
796 case '2':
797 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
798 || val <= 0 || val > INT_MAX)
799 error (EXIT_FAILURE, 0,
800 _("invalid field number for file 2: `%s'"), optarg);
801 join_field_2 = (int) val - 1;
802 break;
804 case 'j':
805 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
806 || val <= 0 || val > INT_MAX)
807 error (EXIT_FAILURE, 0, _("invalid field number: `%s'"), optarg);
808 join_field_1 = join_field_2 = (int) val - 1;
809 break;
811 case 'o':
812 if (add_field_list (optarg))
813 exit (EXIT_FAILURE);
814 break;
816 case 't':
817 tab = *optarg;
818 break;
820 case 1: /* Non-option argument. */
821 if (prev_optc == 'o' && optind <= argc - 2)
823 if (add_field_list (optarg))
824 exit (EXIT_FAILURE);
826 /* Might be continuation of args to -o. */
827 continue; /* Don't change `prev_optc'. */
830 if (nfiles > 1)
832 error (0, 0, _("too many non-option arguments"));
833 usage (1);
835 names[nfiles++] = optarg;
836 break;
838 case_GETOPT_HELP_CHAR;
840 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
842 default:
843 usage (1);
845 prev_optc = optc;
848 /* Now that we've seen the options, we can construct the blank line
849 structure. */
850 make_blank (&uni_blank, uni_blank.nfields);
852 if (nfiles != 2)
854 error (0, 0, _("too few non-option arguments"));
855 usage (1);
858 fp1 = STREQ (names[0], "-") ? stdin : fopen (names[0], "r");
859 if (!fp1)
860 error (EXIT_FAILURE, errno, "%s", names[0]);
861 fp2 = STREQ (names[1], "-") ? stdin : fopen (names[1], "r");
862 if (!fp2)
863 error (EXIT_FAILURE, errno, "%s", names[1]);
864 if (fp1 == fp2)
865 error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
866 join (fp1, fp2);
868 if (fp1 != stdin && fclose (fp1) == EOF)
869 error (EXIT_FAILURE, errno, "%s", names[0]);
870 if (fp2 != stdin && fclose (fp2) == EOF)
871 error (EXIT_FAILURE, errno, "%s", names[1]);
872 if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
873 error (EXIT_FAILURE, errno, "-");
875 exit (EXIT_SUCCESS);