1 /* join - join lines of two files on a common field
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
22 /* Get isblank from GNU libc. */
26 #include <sys/types.h>
30 #include "long-options.h"
33 #define join system_join
39 /* Undefine, to avoid warning about redefinition on some systems. */
42 #define min(A, B) ((A) < (B) ? (A) : (B))
43 #define max(A, B) ((A) > (B) ? (A) : (B))
45 /* An element of the list describing the format of each
49 int file
; /* File to take field from (1 or 2). */
50 int field
; /* Field number to print. */
54 /* A field of a line. */
57 const char *beg
; /* First character in field. */
58 size_t len
; /* The length of the field. */
61 /* A line read from an input file. Newlines are not stored. */
64 char *beg
; /* First character in line. */
65 char *lim
; /* Character after last character in line. */
66 int nfields
; /* Number of elements in `fields'. */
67 int nfields_allocated
; /* Number of elements in `fields'. */
71 /* One or more consecutive lines read from a file that all have the
72 same join field value. */
75 int count
; /* Elements used in `lines'. */
76 int alloc
; /* Elements allocated in `lines'. */
80 /* The name this program was run with. */
83 /* If nonzero, print unpairable lines in file 1 or 2. */
84 static int print_unpairables_1
, print_unpairables_2
;
86 /* If nonzero, print pairable lines. */
87 static int print_pairables
;
89 /* Empty output field filler. */
90 static char *empty_filler
;
92 /* Field to join on. */
93 static int join_field_1
, join_field_2
;
95 /* List of fields to print. */
96 static struct outlist
*outlist
;
98 /* Last element in `outlist', where a new element can be added. */
99 static struct outlist
*outlist_end
;
101 /* Tab character separating fields; if this is NUL fields are separated
102 by any nonempty string of white space, otherwise by exactly one
106 /* When using getopt_long_only, no long option can start with
107 a character that is a short option. */
108 static struct option
const longopts
[] =
110 {"j", required_argument
, NULL
, 'j'},
111 {"j1", required_argument
, NULL
, '1'},
112 {"j2", required_argument
, NULL
, '2'},
116 /* Used to print non-joining lines */
117 static struct line uni_blank
;
120 ADD_FIELD (line
, field
, len
)
125 if (line
->nfields
>= line
->nfields_allocated
)
127 line
->nfields_allocated
= (3 * line
->nfields_allocated
) / 2 + 1;
128 line
->fields
= (struct field
*) xrealloc ((char *) line
->fields
,
129 (line
->nfields_allocated
130 * sizeof (struct field
)));
132 line
->fields
[line
->nfields
].beg
= field
;
133 line
->fields
[line
->nfields
].len
= len
;
137 /* Fill in the `fields' structure in LINE. */
144 register char *ptr
, *lim
;
149 for (i
= 0; ptr
< lim
; ++i
)
156 while (ptr
< lim
&& *ptr
!= tab
)
158 ADD_FIELD (line
, beg
, ptr
- beg
);
167 while (ptr
< lim
&& !ISSPACE (*ptr
))
169 ADD_FIELD (line
, beg
, ptr
- beg
);
170 while (ptr
< lim
&& ISSPACE (*ptr
))
175 if (ptr
> line
->beg
&& ((tab
&& ISSPACE (ptr
[-1])) || ptr
[-1] == tab
))
177 /* Add one more (empty) field because the last character of the
178 line was a delimiter. */
179 ADD_FIELD (line
, NULL
, 0);
183 /* Read a line from FP into LINE and split it into fields.
184 Return 0 if EOF, 1 otherwise. */
191 static int linesize
= 80;
198 ptr
= xmalloc (linesize
);
200 for (i
= 0; (c
= getc (fp
)) != EOF
&& c
!= '\n'; ++i
)
205 ptr
= xrealloc (ptr
, linesize
);
210 if (c
== EOF
&& i
== 0)
217 line
->lim
= line
->beg
+ i
;
218 line
->nfields_allocated
= 0;
229 free ((char *) line
->fields
);
240 seq
->lines
= (struct line
*) xmalloc (seq
->alloc
* sizeof (struct line
));
243 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
250 if (seq
->count
== seq
->alloc
)
253 seq
->lines
= (struct line
*)
254 xrealloc ((char *) seq
->lines
, seq
->alloc
* sizeof (struct line
));
257 if (get_line (fp
, &seq
->lines
[seq
->count
]))
270 for (i
= 0; i
< seq
->count
; i
++)
271 if (seq
->lines
[i
].beg
)
272 freeline (&seq
->lines
[i
]);
273 free ((char *) seq
->lines
);
276 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
277 >0 if it compares greater; 0 if it compares equal. */
280 keycmp (line1
, line2
)
284 const char *beg1
, *beg2
; /* Start of field to compare in each file. */
285 int len1
, len2
; /* Length of fields to compare. */
288 if (join_field_1
< line1
->nfields
)
290 beg1
= line1
->fields
[join_field_1
].beg
;
291 len1
= line1
->fields
[join_field_1
].len
;
299 if (join_field_2
< line2
->nfields
)
301 beg2
= line2
->fields
[join_field_2
].beg
;
302 len2
= line2
->fields
[join_field_2
].len
;
311 return len2
== 0 ? 0 : -1;
314 diff
= memcmp (beg1
, beg2
, min (len1
, len2
));
320 /* Print field N of LINE if it exists and is nonempty, otherwise
321 `empty_filler' if it is nonempty. */
330 if (n
< line
->nfields
)
332 len
= line
->fields
[n
].len
;
334 fwrite (line
->fields
[n
].beg
, 1, len
, stdout
);
335 else if (empty_filler
)
336 fputs (empty_filler
, stdout
);
338 else if (empty_filler
)
339 fputs (empty_filler
, stdout
);
342 /* Print the join of LINE1 and LINE2. */
345 prjoin (line1
, line2
)
353 prfield (outlist
->field
- 1, outlist
->file
== 1 ? line1
: line2
);
354 for (o
= outlist
->next
; o
; o
= o
->next
)
356 putchar (tab
? tab
: ' ');
357 prfield (o
->field
- 1, o
->file
== 1 ? line1
: line2
);
365 if (line1
== &uni_blank
)
372 prfield (join_field_1
, line1
);
373 for (i
= 0; i
< join_field_1
&& i
< line1
->nfields
; ++i
)
375 putchar (tab
? tab
: ' ');
378 for (i
= join_field_1
+ 1; i
< line1
->nfields
; ++i
)
380 putchar (tab
? tab
: ' ');
384 for (i
= 0; i
< join_field_2
&& i
< line2
->nfields
; ++i
)
386 putchar (tab
? tab
: ' ');
389 for (i
= join_field_2
+ 1; i
< line2
->nfields
; ++i
)
391 putchar (tab
? tab
: ' ');
398 /* Print the join of the files in FP1 and FP2. */
405 struct seq seq1
, seq2
;
407 int diff
, i
, j
, eof1
, eof2
;
409 /* Read the first line of each file. */
415 while (seq1
.count
&& seq2
.count
)
417 diff
= keycmp (&seq1
.lines
[0], &seq2
.lines
[0]);
420 if (print_unpairables_1
)
421 prjoin (&seq1
.lines
[0], &uni_blank
);
422 freeline (&seq1
.lines
[0]);
429 if (print_unpairables_2
)
430 prjoin (&uni_blank
, &seq2
.lines
[0]);
431 freeline (&seq2
.lines
[0]);
437 /* Keep reading lines from file1 as long as they continue to
438 match the current line from file2. */
441 if (!getseq (fp1
, &seq1
))
447 while (!keycmp (&seq1
.lines
[seq1
.count
- 1], &seq2
.lines
[0]));
449 /* Keep reading lines from file2 as long as they continue to
450 match the current line from file1. */
453 if (!getseq (fp2
, &seq2
))
459 while (!keycmp (&seq1
.lines
[0], &seq2
.lines
[seq2
.count
- 1]));
463 for (i
= 0; i
< seq1
.count
- 1; ++i
)
464 for (j
= 0; j
< seq2
.count
- 1; ++j
)
465 prjoin (&seq1
.lines
[i
], &seq2
.lines
[j
]);
468 for (i
= 0; i
< seq1
.count
- 1; ++i
)
469 freeline (&seq1
.lines
[i
]);
472 seq1
.lines
[0] = seq1
.lines
[seq1
.count
- 1];
478 for (i
= 0; i
< seq2
.count
- 1; ++i
)
479 freeline (&seq2
.lines
[i
]);
482 seq2
.lines
[0] = seq2
.lines
[seq2
.count
- 1];
489 if (print_unpairables_1
&& seq1
.count
)
491 prjoin (&seq1
.lines
[0], &uni_blank
);
492 freeline (&seq1
.lines
[0]);
493 while (get_line (fp1
, &line
))
495 prjoin (&line
, &uni_blank
);
500 if (print_unpairables_2
&& seq2
.count
)
502 prjoin (&uni_blank
, &seq2
.lines
[0]);
503 freeline (&seq2
.lines
[0]);
504 while (get_line (fp2
, &line
))
506 prjoin (&uni_blank
, &line
);
515 /* Add a field spec for field FIELD of file FILE to `outlist' and return 1,
516 unless either argument is invalid; then just return 0. */
519 add_field (file
, field
)
525 if (file
< 1 || file
> 2 || field
< 1)
527 o
= (struct outlist
*) xmalloc (sizeof (struct outlist
));
532 /* Add to the end of the list so the fields are in the right order. */
536 outlist_end
->next
= o
;
542 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
543 Return the number of fields added. */
550 int file
= -1, field
= -1;
555 if (*str
== ',' || ISBLANK (*str
))
557 added
+= add_field (file
, field
);
558 uni_blank
.nfields
= max (uni_blank
.nfields
, field
);
562 else if (*str
== '.')
564 else if (ISDIGIT (*str
))
570 file
= file
* 10 + *str
- '0';
576 field
= field
* 10 + *str
- '0';
583 uni_blank
.nfields
= max (uni_blank
.nfields
, field
);
584 added
+= add_field (file
, field
);
588 /* Create a blank line with COUNT fields separated by tabs. */
591 make_blank (blank
, count
)
596 blank
->nfields
= count
;
597 blank
->beg
= xmalloc (blank
->nfields
+ 1);
598 blank
->fields
= (struct field
*) xmalloc (sizeof (struct field
) * count
);
599 for (i
= 0; i
< blank
->nfields
; i
++)
601 blank
->beg
[i
] = '\t';
602 blank
->fields
[i
].beg
= &blank
->beg
[i
];
603 blank
->fields
[i
].len
= 0;
605 blank
->beg
[i
] = '\0';
606 blank
->lim
= &blank
->beg
[i
];
616 int optc
, prev_optc
= 0, nfiles
, val
;
618 program_name
= argv
[0];
620 /* Initialize this before parsing options. In parsing options,
621 it may be increased. */
622 uni_blank
.nfields
= 1;
624 parse_long_options (argc
, argv
, "join", version_string
, usage
);
629 while ((optc
= getopt_long_only (argc
, argv
, "-a:e:1:2:o:t:v:", longopts
,
640 print_unpairables_1
= 1;
642 print_unpairables_2
= 1;
644 error (2, 0, "invalid file number for `-a'");
648 empty_filler
= optarg
;
654 error (2, 0, "invalid field number for `-1'");
655 join_field_1
= val
- 1;
661 error (2, 0, "invalid field number for `-2'");
662 join_field_2
= val
- 1;
668 error (2, 0, "invalid field number for `-j'");
669 join_field_1
= join_field_2
= val
- 1;
673 if (add_field_list (optarg
) == 0)
674 error (2, 0, "invalid field list for `-o'");
684 print_unpairables_1
= 1;
686 print_unpairables_2
= 1;
688 error (2, 0, "invalid file number for `-v'");
692 case 1: /* Non-option argument. */
693 if (prev_optc
== 'o')
695 /* Might be continuation of args to -o. */
696 if (add_field_list (optarg
) > 0)
697 continue; /* Don't change `prev_optc'. */
702 names
[nfiles
++] = optarg
;
711 /* Now that we've seen the options, we can construct the blank line
713 make_blank (&uni_blank
, uni_blank
.nfields
);
718 fp1
= strcmp (names
[0], "-") ? fopen (names
[0], "r") : stdin
;
720 error (1, errno
, "%s", names
[0]);
721 fp2
= strcmp (names
[1], "-") ? fopen (names
[1], "r") : stdin
;
723 error (1, errno
, "%s", names
[1]);
725 error (1, errno
, "both files cannot be standard input");
728 if ((fp1
== stdin
|| fp2
== stdin
) && fclose (stdin
) == EOF
)
729 error (1, errno
, "-");
730 if (ferror (stdout
) || fclose (stdout
) == EOF
)
731 error (1, errno
, "write error");
741 fprintf (stderr
, "Try `%s --help' for more information.\n",
746 Usage: %s [OPTION]... FILE1 FILE2\n\
750 For each pair of input lines with identical join fields, write a line to\n\
751 standard output. The default join field is the first, delimited\n\
752 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
754 -a SIDE print unpairable lines coming from file SIDE\n\
755 -e EMPTY replace missing input fields with EMPTY\n\
756 -j FIELD join on this FIELD for both files\n\
757 -[j]SIDE FIELD join on this FIELD for file SIDE\n\
758 -o FORMAT obey FORMAT while constructing output line\n\
759 -t CHAR use CHAR as input and output field separator\n\
760 -v SIDE like -a SIDE, but suppress joined output lines\n\
761 --help display this help and exit\n\
762 --version output version information and exit\n\
764 SIDE is 1 for FILE1 or 2 for FILE2. Unless -t CHAR is given, leading blanks\n\
765 separate fields and are ignored, else fields are separated by CHAR.\n\
766 Any FIELD is a field number counted from 1. FORMAT is one or more\n\
767 comma or blank separated specifications, each being `SIDE.FIELD'.\n\
768 Default FORMAT outputs the join field, the remaining fields from\n\
769 FILE1, the remaining fields from FILE2, all separated by CHAR.\n\