.
[coreutils.git] / src / join.c
blobc0d098b5f9f1f8ae1860bcbf61e5351c95f3cae0
1 /* join - join lines of two files on a common field
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
7 any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
20 #include <config.h>
22 /* Get isblank from GNU libc. */
23 #define _GNU_SOURCE
25 #include <stdio.h>
26 #include <sys/types.h>
27 #include <getopt.h>
28 #include "system.h"
29 #include "version.h"
30 #include "long-options.h"
31 #include "error.h"
33 #define join system_join
35 char *xmalloc ();
36 char *xrealloc ();
37 static void usage ();
39 /* Undefine, to avoid warning about redefinition on some systems. */
40 #undef min
41 #undef max
42 #define min(A, B) ((A) < (B) ? (A) : (B))
43 #define max(A, B) ((A) > (B) ? (A) : (B))
45 /* An element of the list describing the format of each
46 output line. */
47 struct outlist
49 int file; /* File to take field from (1 or 2). */
50 int field; /* Field number to print. */
51 struct outlist *next;
54 /* A field of a line. */
55 struct field
57 const char *beg; /* First character in field. */
58 size_t len; /* The length of the field. */
61 /* A line read from an input file. Newlines are not stored. */
62 struct line
64 char *beg; /* First character in line. */
65 char *lim; /* Character after last character in line. */
66 int nfields; /* Number of elements in `fields'. */
67 int nfields_allocated; /* Number of elements in `fields'. */
68 struct field *fields;
71 /* One or more consecutive lines read from a file that all have the
72 same join field value. */
73 struct seq
75 int count; /* Elements used in `lines'. */
76 int alloc; /* Elements allocated in `lines'. */
77 struct line *lines;
80 /* The name this program was run with. */
81 char *program_name;
83 /* If nonzero, print unpairable lines in file 1 or 2. */
84 static int print_unpairables_1, print_unpairables_2;
86 /* If nonzero, print pairable lines. */
87 static int print_pairables;
89 /* Empty output field filler. */
90 static char *empty_filler;
92 /* Field to join on. */
93 static int join_field_1, join_field_2;
95 /* List of fields to print. */
96 static struct outlist *outlist;
98 /* Last element in `outlist', where a new element can be added. */
99 static struct outlist *outlist_end;
101 /* Tab character separating fields; if this is NUL fields are separated
102 by any nonempty string of white space, otherwise by exactly one
103 tab character. */
104 static char tab;
106 /* When using getopt_long_only, no long option can start with
107 a character that is a short option. */
108 static struct option const longopts[] =
110 {"j", required_argument, NULL, 'j'},
111 {"j1", required_argument, NULL, '1'},
112 {"j2", required_argument, NULL, '2'},
113 {NULL, 0, NULL, 0}
116 /* Used to print non-joining lines */
117 static struct line uni_blank;
119 static void
120 ADD_FIELD (line, field, len)
121 struct line *line;
122 const char *field;
123 size_t len;
125 if (line->nfields >= line->nfields_allocated)
127 line->nfields_allocated = (3 * line->nfields_allocated) / 2 + 1;
128 line->fields = (struct field *) xrealloc ((char *) line->fields,
129 (line->nfields_allocated
130 * sizeof (struct field)));
132 line->fields[line->nfields].beg = field;
133 line->fields[line->nfields].len = len;
134 ++(line->nfields);
137 /* Fill in the `fields' structure in LINE. */
139 static void
140 xfields (line)
141 struct line *line;
143 int i;
144 register char *ptr, *lim;
146 ptr = line->beg;
147 lim = line->lim;
149 for (i = 0; ptr < lim; ++i)
151 if (tab)
153 char *beg;
155 beg = ptr;
156 while (ptr < lim && *ptr != tab)
157 ++ptr;
158 ADD_FIELD (line, beg, ptr - beg);
159 if (ptr < lim)
160 ++ptr;
162 else
164 char *beg;
166 beg = ptr;
167 while (ptr < lim && !ISSPACE (*ptr))
168 ++ptr;
169 ADD_FIELD (line, beg, ptr - beg);
170 while (ptr < lim && ISSPACE (*ptr))
171 ++ptr;
175 if (ptr > line->beg && ((tab && ISSPACE (ptr[-1])) || ptr[-1] == tab))
177 /* Add one more (empty) field because the last character of the
178 line was a delimiter. */
179 ADD_FIELD (line, NULL, 0);
183 /* Read a line from FP into LINE and split it into fields.
184 Return 0 if EOF, 1 otherwise. */
186 static int
187 get_line (fp, line)
188 FILE *fp;
189 struct line *line;
191 static int linesize = 80;
192 int c, i;
193 char *ptr;
195 if (feof (fp))
196 return 0;
198 ptr = xmalloc (linesize);
200 for (i = 0; (c = getc (fp)) != EOF && c != '\n'; ++i)
202 if (i == linesize)
204 linesize *= 2;
205 ptr = xrealloc (ptr, linesize);
207 ptr[i] = c;
210 if (c == EOF && i == 0)
212 free (ptr);
213 return 0;
216 line->beg = ptr;
217 line->lim = line->beg + i;
218 line->nfields_allocated = 0;
219 line->nfields = 0;
220 line->fields = NULL;
221 xfields (line);
222 return 1;
225 static void
226 freeline (line)
227 struct line *line;
229 free ((char *) line->fields);
230 free (line->beg);
231 line->beg = NULL;
234 static void
235 initseq (seq)
236 struct seq *seq;
238 seq->count = 0;
239 seq->alloc = 1;
240 seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
243 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
245 static int
246 getseq (fp, seq)
247 FILE *fp;
248 struct seq *seq;
250 if (seq->count == seq->alloc)
252 seq->alloc *= 2;
253 seq->lines = (struct line *)
254 xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
257 if (get_line (fp, &seq->lines[seq->count]))
259 ++seq->count;
260 return 1;
262 return 0;
265 static void
266 delseq (seq)
267 struct seq *seq;
269 int i;
270 for (i = 0; i < seq->count; i++)
271 if (seq->lines[i].beg)
272 freeline (&seq->lines[i]);
273 free ((char *) seq->lines);
276 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
277 >0 if it compares greater; 0 if it compares equal. */
279 static int
280 keycmp (line1, line2)
281 struct line *line1;
282 struct line *line2;
284 const char *beg1, *beg2; /* Start of field to compare in each file. */
285 int len1, len2; /* Length of fields to compare. */
286 int diff;
288 if (join_field_1 < line1->nfields)
290 beg1 = line1->fields[join_field_1].beg;
291 len1 = line1->fields[join_field_1].len;
293 else
295 beg1 = NULL;
296 len1 = 0;
299 if (join_field_2 < line2->nfields)
301 beg2 = line2->fields[join_field_2].beg;
302 len2 = line2->fields[join_field_2].len;
304 else
306 beg2 = NULL;
307 len2 = 0;
310 if (len1 == 0)
311 return len2 == 0 ? 0 : -1;
312 if (len2 == 0)
313 return 1;
314 diff = memcmp (beg1, beg2, min (len1, len2));
315 if (diff)
316 return diff;
317 return len1 - len2;
320 /* Print field N of LINE if it exists and is nonempty, otherwise
321 `empty_filler' if it is nonempty. */
323 static void
324 prfield (n, line)
325 int n;
326 struct line *line;
328 int len;
330 if (n < line->nfields)
332 len = line->fields[n].len;
333 if (len)
334 fwrite (line->fields[n].beg, 1, len, stdout);
335 else if (empty_filler)
336 fputs (empty_filler, stdout);
338 else if (empty_filler)
339 fputs (empty_filler, stdout);
342 /* Print the join of LINE1 and LINE2. */
344 static void
345 prjoin (line1, line2)
346 struct line *line1;
347 struct line *line2;
349 if (outlist)
351 struct outlist *o;
353 prfield (outlist->field - 1, outlist->file == 1 ? line1 : line2);
354 for (o = outlist->next; o; o = o->next)
356 putchar (tab ? tab : ' ');
357 prfield (o->field - 1, o->file == 1 ? line1 : line2);
359 putchar ('\n');
361 else
363 int i;
365 if (line1 == &uni_blank)
367 struct line *t;
368 t = line1;
369 line1 = line2;
370 line2 = t;
372 prfield (join_field_1, line1);
373 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
375 putchar (tab ? tab : ' ');
376 prfield (i, line1);
378 for (i = join_field_1 + 1; i < line1->nfields; ++i)
380 putchar (tab ? tab : ' ');
381 prfield (i, line1);
384 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
386 putchar (tab ? tab : ' ');
387 prfield (i, line2);
389 for (i = join_field_2 + 1; i < line2->nfields; ++i)
391 putchar (tab ? tab : ' ');
392 prfield (i, line2);
394 putchar ('\n');
398 /* Print the join of the files in FP1 and FP2. */
400 static void
401 join (fp1, fp2)
402 FILE *fp1;
403 FILE *fp2;
405 struct seq seq1, seq2;
406 struct line line;
407 int diff, i, j, eof1, eof2;
409 /* Read the first line of each file. */
410 initseq (&seq1);
411 getseq (fp1, &seq1);
412 initseq (&seq2);
413 getseq (fp2, &seq2);
415 while (seq1.count && seq2.count)
417 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
418 if (diff < 0)
420 if (print_unpairables_1)
421 prjoin (&seq1.lines[0], &uni_blank);
422 freeline (&seq1.lines[0]);
423 seq1.count = 0;
424 getseq (fp1, &seq1);
425 continue;
427 if (diff > 0)
429 if (print_unpairables_2)
430 prjoin (&uni_blank, &seq2.lines[0]);
431 freeline (&seq2.lines[0]);
432 seq2.count = 0;
433 getseq (fp2, &seq2);
434 continue;
437 /* Keep reading lines from file1 as long as they continue to
438 match the current line from file2. */
439 eof1 = 0;
441 if (!getseq (fp1, &seq1))
443 eof1 = 1;
444 ++seq1.count;
445 break;
447 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
449 /* Keep reading lines from file2 as long as they continue to
450 match the current line from file1. */
451 eof2 = 0;
453 if (!getseq (fp2, &seq2))
455 eof2 = 1;
456 ++seq2.count;
457 break;
459 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
461 if (print_pairables)
463 for (i = 0; i < seq1.count - 1; ++i)
464 for (j = 0; j < seq2.count - 1; ++j)
465 prjoin (&seq1.lines[i], &seq2.lines[j]);
468 for (i = 0; i < seq1.count - 1; ++i)
469 freeline (&seq1.lines[i]);
470 if (!eof1)
472 seq1.lines[0] = seq1.lines[seq1.count - 1];
473 seq1.count = 1;
475 else
476 seq1.count = 0;
478 for (i = 0; i < seq2.count - 1; ++i)
479 freeline (&seq2.lines[i]);
480 if (!eof2)
482 seq2.lines[0] = seq2.lines[seq2.count - 1];
483 seq2.count = 1;
485 else
486 seq2.count = 0;
489 if (print_unpairables_1 && seq1.count)
491 prjoin (&seq1.lines[0], &uni_blank);
492 freeline (&seq1.lines[0]);
493 while (get_line (fp1, &line))
495 prjoin (&line, &uni_blank);
496 freeline (&line);
500 if (print_unpairables_2 && seq2.count)
502 prjoin (&uni_blank, &seq2.lines[0]);
503 freeline (&seq2.lines[0]);
504 while (get_line (fp2, &line))
506 prjoin (&uni_blank, &line);
507 freeline (&line);
511 delseq (&seq1);
512 delseq (&seq2);
515 /* Add a field spec for field FIELD of file FILE to `outlist' and return 1,
516 unless either argument is invalid; then just return 0. */
518 static int
519 add_field (file, field)
520 int file;
521 int field;
523 struct outlist *o;
525 if (file < 1 || file > 2 || field < 1)
526 return 0;
527 o = (struct outlist *) xmalloc (sizeof (struct outlist));
528 o->file = file;
529 o->field = field;
530 o->next = NULL;
532 /* Add to the end of the list so the fields are in the right order. */
533 if (outlist == NULL)
534 outlist = o;
535 else
536 outlist_end->next = o;
537 outlist_end = o;
539 return 1;
542 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
543 Return the number of fields added. */
545 static int
546 add_field_list (str)
547 char *str;
549 int added = 0;
550 int file = -1, field = -1;
551 int dot_found = 0;
553 for (; *str; str++)
555 if (*str == ',' || ISBLANK (*str))
557 added += add_field (file, field);
558 uni_blank.nfields = max (uni_blank.nfields, field);
559 file = field = -1;
560 dot_found = 0;
562 else if (*str == '.')
563 dot_found = 1;
564 else if (ISDIGIT (*str))
566 if (!dot_found)
568 if (file == -1)
569 file = 0;
570 file = file * 10 + *str - '0';
572 else
574 if (field == -1)
575 field = 0;
576 field = field * 10 + *str - '0';
579 else
580 return 0;
583 uni_blank.nfields = max (uni_blank.nfields, field);
584 added += add_field (file, field);
585 return added;
588 /* Create a blank line with COUNT fields separated by tabs. */
590 void
591 make_blank (blank, count)
592 struct line *blank;
593 int count;
595 int i;
596 blank->nfields = count;
597 blank->beg = xmalloc (blank->nfields + 1);
598 blank->fields = (struct field *) xmalloc (sizeof (struct field) * count);
599 for (i = 0; i < blank->nfields; i++)
601 blank->beg[i] = '\t';
602 blank->fields[i].beg = &blank->beg[i];
603 blank->fields[i].len = 0;
605 blank->beg[i] = '\0';
606 blank->lim = &blank->beg[i];
609 void
610 main (argc, argv)
611 int argc;
612 char *argv[];
614 char *names[2];
615 FILE *fp1, *fp2;
616 int optc, prev_optc = 0, nfiles, val;
618 program_name = argv[0];
620 /* Initialize this before parsing options. In parsing options,
621 it may be increased. */
622 uni_blank.nfields = 1;
624 parse_long_options (argc, argv, "join", version_string, usage);
626 nfiles = 0;
627 print_pairables = 1;
629 while ((optc = getopt_long_only (argc, argv, "-a:e:1:2:o:t:v:", longopts,
630 (int *) 0)) != EOF)
632 switch (optc)
634 case 0:
635 break;
637 case 'a':
638 val = atoi (optarg);
639 if (val == 1)
640 print_unpairables_1 = 1;
641 else if (val == 2)
642 print_unpairables_2 = 1;
643 else
644 error (2, 0, "invalid file number for `-a'");
645 break;
647 case 'e':
648 empty_filler = optarg;
649 break;
651 case '1':
652 val = atoi (optarg);
653 if (val <= 0)
654 error (2, 0, "invalid field number for `-1'");
655 join_field_1 = val - 1;
656 break;
658 case '2':
659 val = atoi (optarg);
660 if (val <= 0)
661 error (2, 0, "invalid field number for `-2'");
662 join_field_2 = val - 1;
663 break;
665 case 'j':
666 val = atoi (optarg);
667 if (val <= 0)
668 error (2, 0, "invalid field number for `-j'");
669 join_field_1 = join_field_2 = val - 1;
670 break;
672 case 'o':
673 if (add_field_list (optarg) == 0)
674 error (2, 0, "invalid field list for `-o'");
675 break;
677 case 't':
678 tab = *optarg;
679 break;
681 case 'v':
682 val = atoi (optarg);
683 if (val == 1)
684 print_unpairables_1 = 1;
685 else if (val == 2)
686 print_unpairables_2 = 1;
687 else
688 error (2, 0, "invalid file number for `-v'");
689 print_pairables = 0;
690 break;
692 case 1: /* Non-option argument. */
693 if (prev_optc == 'o')
695 /* Might be continuation of args to -o. */
696 if (add_field_list (optarg) > 0)
697 continue; /* Don't change `prev_optc'. */
700 if (nfiles > 1)
701 usage (1);
702 names[nfiles++] = optarg;
703 break;
705 case '?':
706 usage (1);
708 prev_optc = optc;
711 /* Now that we've seen the options, we can construct the blank line
712 structure. */
713 make_blank (&uni_blank, uni_blank.nfields);
715 if (nfiles != 2)
716 usage (1);
718 fp1 = strcmp (names[0], "-") ? fopen (names[0], "r") : stdin;
719 if (!fp1)
720 error (1, errno, "%s", names[0]);
721 fp2 = strcmp (names[1], "-") ? fopen (names[1], "r") : stdin;
722 if (!fp2)
723 error (1, errno, "%s", names[1]);
724 if (fp1 == fp2)
725 error (1, errno, "both files cannot be standard input");
726 join (fp1, fp2);
728 if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
729 error (1, errno, "-");
730 if (ferror (stdout) || fclose (stdout) == EOF)
731 error (1, errno, "write error");
733 exit (0);
736 static void
737 usage (status)
738 int status;
740 if (status != 0)
741 fprintf (stderr, "Try `%s --help' for more information.\n",
742 program_name);
743 else
745 printf ("\
746 Usage: %s [OPTION]... FILE1 FILE2\n\
748 program_name);
749 printf ("\
750 For each pair of input lines with identical join fields, write a line to\n\
751 standard output. The default join field is the first, delimited\n\
752 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
754 -a SIDE print unpairable lines coming from file SIDE\n\
755 -e EMPTY replace missing input fields with EMPTY\n\
756 -j FIELD join on this FIELD for both files\n\
757 -[j]SIDE FIELD join on this FIELD for file SIDE\n\
758 -o FORMAT obey FORMAT while constructing output line\n\
759 -t CHAR use CHAR as input and output field separator\n\
760 -v SIDE like -a SIDE, but suppress joined output lines\n\
761 --help display this help and exit\n\
762 --version output version information and exit\n\
764 SIDE is 1 for FILE1 or 2 for FILE2. Unless -t CHAR is given, leading blanks\n\
765 separate fields and are ignored, else fields are separated by CHAR.\n\
766 Any FIELD is a field number counted from 1. FORMAT is one or more\n\
767 comma or blank separated specifications, each being `SIDE.FIELD'.\n\
768 Default FORMAT outputs the join field, the remaining fields from\n\
769 FILE1, the remaining fields from FILE2, all separated by CHAR.\n\
772 exit (status);