1 /* join - join lines of two files on a common field
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
23 #define alloca __builtin_alloca
24 #else /* not __GNUC__ */
27 #else /* not HAVE_ALLOCA_H */
33 #endif /* not HAVE_ALLOCA_H */
34 #endif /* not __GNUC__ */
36 /* Get isblank from GNU libc. */
42 #include <sys/types.h>
50 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
54 # define INT_MAX ((int) (UINT_MAX >> 1))
57 #if _LIBC || STDC_HEADERS
58 # define TOLOWER(c) tolower (c)
60 # define TOLOWER(c) (ISUPPER (c) ? tolower (c) : (c))
65 #include "long-options.h"
69 #define join system_join
74 /* Undefine, to avoid warning about redefinition on some systems. */
77 #define min(A, B) ((A) < (B) ? (A) : (B))
78 #define max(A, B) ((A) > (B) ? (A) : (B))
80 /* An element of the list identifying which fields to print for each
84 /* File number: 0, 1, or 2. 0 means use the join field.
85 1 means use the first file argument, 2 the second. */
88 /* Field index (zero-based), specified only when FILE is 1 or 2. */
94 /* A field of a line. */
97 const char *beg
; /* First character in field. */
98 size_t len
; /* The length of the field. */
101 /* A line read from an input file. Newlines are not stored. */
104 char *beg
; /* First character in line. */
105 char *lim
; /* Character after last character in line. */
106 int nfields
; /* Number of elements in `fields'. */
107 int nfields_allocated
; /* Number of elements allocated for `fields'. */
108 struct field
*fields
;
111 /* One or more consecutive lines read from a file that all have the
112 same join field value. */
115 int count
; /* Elements used in `lines'. */
116 int alloc
; /* Elements allocated in `lines'. */
120 /* The name this program was run with. */
123 /* If nonzero, print unpairable lines in file 1 or 2. */
124 static int print_unpairables_1
, print_unpairables_2
;
126 /* If nonzero, print pairable lines. */
127 static int print_pairables
;
129 /* Empty output field filler. */
130 static char *empty_filler
;
132 /* Field to join on. */
133 static int join_field_1
, join_field_2
;
135 /* List of fields to print. */
136 static struct outlist outlist_head
;
138 /* Last element in `outlist', where a new element can be added. */
139 static struct outlist
*outlist_end
= &outlist_head
;
141 /* Tab character separating fields; if this is NUL fields are separated
142 by any nonempty string of white space, otherwise by exactly one
146 /* When using getopt_long_only, no long option can start with
147 a character that is a short option. */
148 static struct option
const longopts
[] =
150 {"ignore-case", no_argument
, NULL
, 'i'},
151 {"j", required_argument
, NULL
, 'j'},
152 {"j1", required_argument
, NULL
, '1'},
153 {"j2", required_argument
, NULL
, '2'},
157 /* Used to print non-joining lines */
158 static struct line uni_blank
;
160 /* If nonzero, ignore case when comparing join fields. */
161 static int ignore_case
;
167 fprintf (stderr
, _("Try `%s --help' for more information.\n"),
172 Usage: %s [OPTION]... FILE1 FILE2\n\
176 For each pair of input lines with identical join fields, write a line to\n\
177 standard output. The default join field is the first, delimited\n\
178 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
180 -a SIDE print unpairable lines coming from file SIDE\n\
181 -e EMPTY replace missing input fields with EMPTY\n\
182 -i, --ignore-case ignore differences in case when comparing fields\n\
183 -j FIELD (Obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
184 -j1 FIELD (Obsolescent) equivalent to `-1 FIELD'\n\
185 -j2 FIELD (Obsolescent) equivalent to `-2 FIELD'\n\
186 -1 FIELD join on this FIELD of file 1\n\
187 -2 FIELD join on this FIELD of file 2\n\
188 -o FORMAT obey FORMAT while constructing output line\n\
189 -t CHAR use CHAR as input and output field separator\n\
190 -v SIDE like -a SIDE, but suppress joined output lines\n\
191 --help display this help and exit\n\
192 --version output version information and exit\n\
194 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
195 else fields are separated by CHAR. Any FIELD is a field number counted\n\
196 from 1. FORMAT is one or more comma or blank separated specifications,\n\
197 each being `SIDE.FIELD' or `0'. Default FORMAT outputs the join field,\n\
198 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
199 separated by CHAR.\n\
/* Like memcmp, but ignore differences in case.
   Compare the first N bytes of VS1 and VS2 after folding each byte to
   lower case.  Return 0 if all N folded pairs are equal (in particular
   when N is 0); otherwise return the difference between the first pair
   of folded bytes that differ: negative if the byte from VS1 is smaller,
   positive if it is larger.  */

static int
memcasecmp (const void *vs1, const void *vs2, size_t n)
{
  /* The buffers are only read, so keep the const qualifier instead of
     casting it away.  */
  const unsigned char *s1 = (const unsigned char *) vs1;
  const unsigned char *s2 = (const unsigned char *) vs2;
  size_t i;

  for (i = 0; i < n; i++)
    {
      /* Each byte is an unsigned char value, which is a valid argument
         for tolower; ISO C tolower returns its argument unchanged when
         there is no lower-case mapping.  Fold each byte exactly once.  */
      int c1 = tolower (s1[i]);
      int c2 = tolower (s2[i]);

      if (c1 != c2)
        return c1 - c2;
    }

  return 0;
}
224 ADD_FIELD (struct line
*line
, const char *field
, size_t len
)
226 if (line
->nfields
>= line
->nfields_allocated
)
228 line
->nfields_allocated
= (3 * line
->nfields_allocated
) / 2 + 1;
229 line
->fields
= (struct field
*) xrealloc ((char *) line
->fields
,
230 (line
->nfields_allocated
231 * sizeof (struct field
)));
233 line
->fields
[line
->nfields
].beg
= field
;
234 line
->fields
[line
->nfields
].len
= len
;
238 /* Fill in the `fields' structure in LINE. */
241 xfields (struct line
*line
)
244 register char *ptr
, *lim
;
251 /* Skip leading blanks before the first field. */
252 while (ptr
< lim
&& ISSPACE (*ptr
))
256 for (i
= 0; ptr
< lim
; ++i
)
263 while (ptr
< lim
&& *ptr
!= tab
)
265 ADD_FIELD (line
, beg
, ptr
- beg
);
274 while (ptr
< lim
&& !ISSPACE (*ptr
))
276 ADD_FIELD (line
, beg
, ptr
- beg
);
277 while (ptr
< lim
&& ISSPACE (*ptr
))
282 if (ptr
> line
->beg
&& ((tab
&& ISSPACE (ptr
[-1])) || ptr
[-1] == tab
))
284 /* Add one more (empty) field because the last character of the
285 line was a delimiter. */
286 ADD_FIELD (line
, NULL
, 0);
290 /* Read a line from FP into LINE and split it into fields.
291 Return 0 if EOF, 1 otherwise. */
294 get_line (FILE *fp
, struct line
*line
)
296 static int linesize
= 80;
303 ptr
= xmalloc (linesize
);
305 for (i
= 0; (c
= getc (fp
)) != EOF
&& c
!= '\n'; ++i
)
310 ptr
= xrealloc (ptr
, linesize
);
315 if (c
== EOF
&& i
== 0)
322 line
->lim
= line
->beg
+ i
;
323 line
->nfields_allocated
= 0;
331 freeline (struct line
*line
)
333 free ((char *) line
->fields
);
339 initseq (struct seq
*seq
)
343 seq
->lines
= (struct line
*) xmalloc (seq
->alloc
* sizeof (struct line
));
346 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
349 getseq (FILE *fp
, struct seq
*seq
)
351 if (seq
->count
== seq
->alloc
)
354 seq
->lines
= (struct line
*)
355 xrealloc ((char *) seq
->lines
, seq
->alloc
* sizeof (struct line
));
358 if (get_line (fp
, &seq
->lines
[seq
->count
]))
367 delseq (struct seq
*seq
)
370 for (i
= 0; i
< seq
->count
; i
++)
371 if (seq
->lines
[i
].beg
)
372 freeline (&seq
->lines
[i
]);
373 free ((char *) seq
->lines
);
376 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
377 >0 if it compares greater; 0 if it compares equal. */
380 keycmp (struct line
*line1
, struct line
*line2
)
382 const char *beg1
, *beg2
; /* Start of field to compare in each file. */
383 int len1
, len2
; /* Length of fields to compare. */
386 if (join_field_1
< line1
->nfields
)
388 beg1
= line1
->fields
[join_field_1
].beg
;
389 len1
= line1
->fields
[join_field_1
].len
;
397 if (join_field_2
< line2
->nfields
)
399 beg2
= line2
->fields
[join_field_2
].beg
;
400 len2
= line2
->fields
[join_field_2
].len
;
409 return len2
== 0 ? 0 : -1;
413 /* Use an if-statement here rather than a function variable to
414 avoid portability hassles of getting a non-conflicting declaration
417 diff
= memcasecmp (beg1
, beg2
, min (len1
, len2
));
419 diff
= memcmp (beg1
, beg2
, min (len1
, len2
));
426 /* Print field N of LINE if it exists and is nonempty, otherwise
427 `empty_filler' if it is nonempty. */
430 prfield (int n
, struct line
*line
)
434 if (n
< line
->nfields
)
436 len
= line
->fields
[n
].len
;
438 fwrite (line
->fields
[n
].beg
, 1, len
, stdout
);
439 else if (empty_filler
)
440 fputs (empty_filler
, stdout
);
442 else if (empty_filler
)
443 fputs (empty_filler
, stdout
);
446 /* Print the join of LINE1 and LINE2. */
449 prjoin (struct line
*line1
, struct line
*line2
)
451 const struct outlist
*outlist
;
453 outlist
= outlist_head
.next
;
456 const struct outlist
*o
;
466 if (line1
== &uni_blank
)
469 field
= join_field_2
;
474 field
= join_field_1
;
479 line
= (o
->file
== 1 ? line1
: line2
);
482 prfield (field
, line
);
486 putchar (tab
? tab
: ' ');
494 if (line1
== &uni_blank
)
501 prfield (join_field_1
, line1
);
502 for (i
= 0; i
< join_field_1
&& i
< line1
->nfields
; ++i
)
504 putchar (tab
? tab
: ' ');
507 for (i
= join_field_1
+ 1; i
< line1
->nfields
; ++i
)
509 putchar (tab
? tab
: ' ');
513 for (i
= 0; i
< join_field_2
&& i
< line2
->nfields
; ++i
)
515 putchar (tab
? tab
: ' ');
518 for (i
= join_field_2
+ 1; i
< line2
->nfields
; ++i
)
520 putchar (tab
? tab
: ' ');
527 /* Print the join of the files in FP1 and FP2. */
530 join (FILE *fp1
, FILE *fp2
)
532 struct seq seq1
, seq2
;
534 int diff
, i
, j
, eof1
, eof2
;
536 /* Read the first line of each file. */
542 while (seq1
.count
&& seq2
.count
)
544 diff
= keycmp (&seq1
.lines
[0], &seq2
.lines
[0]);
547 if (print_unpairables_1
)
548 prjoin (&seq1
.lines
[0], &uni_blank
);
549 freeline (&seq1
.lines
[0]);
556 if (print_unpairables_2
)
557 prjoin (&uni_blank
, &seq2
.lines
[0]);
558 freeline (&seq2
.lines
[0]);
564 /* Keep reading lines from file1 as long as they continue to
565 match the current line from file2. */
568 if (!getseq (fp1
, &seq1
))
574 while (!keycmp (&seq1
.lines
[seq1
.count
- 1], &seq2
.lines
[0]));
576 /* Keep reading lines from file2 as long as they continue to
577 match the current line from file1. */
580 if (!getseq (fp2
, &seq2
))
586 while (!keycmp (&seq1
.lines
[0], &seq2
.lines
[seq2
.count
- 1]));
590 for (i
= 0; i
< seq1
.count
- 1; ++i
)
591 for (j
= 0; j
< seq2
.count
- 1; ++j
)
592 prjoin (&seq1
.lines
[i
], &seq2
.lines
[j
]);
595 for (i
= 0; i
< seq1
.count
- 1; ++i
)
596 freeline (&seq1
.lines
[i
]);
599 seq1
.lines
[0] = seq1
.lines
[seq1
.count
- 1];
605 for (i
= 0; i
< seq2
.count
- 1; ++i
)
606 freeline (&seq2
.lines
[i
]);
609 seq2
.lines
[0] = seq2
.lines
[seq2
.count
- 1];
616 if (print_unpairables_1
&& seq1
.count
)
618 prjoin (&seq1
.lines
[0], &uni_blank
);
619 freeline (&seq1
.lines
[0]);
620 while (get_line (fp1
, &line
))
622 prjoin (&line
, &uni_blank
);
627 if (print_unpairables_2
&& seq2
.count
)
629 prjoin (&uni_blank
, &seq2
.lines
[0]);
630 freeline (&seq2
.lines
[0]);
631 while (get_line (fp2
, &line
))
633 prjoin (&uni_blank
, &line
);
642 /* Add a field spec for field FIELD of file FILE to `outlist'. */
645 add_field (int file
, int field
)
649 assert (file
== 0 || file
== 1 || file
== 2);
652 o
= (struct outlist
*) xmalloc (sizeof (struct outlist
));
657 /* Add to the end of the list so the fields are in the right order. */
658 outlist_end
->next
= o
;
662 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
663 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
664 If S is valid, return zero. Otherwise, give a diagnostic, don't update
665 *FILE_INDEX or *FIELD_INDEX, and return nonzero. */
668 decode_field_spec (const char *s
, int *file_index
, int *field_index
)
672 /* The first character must be 0, 1, or 2. */
679 /* Leave *field_index undefined. */
684 /* `0' must be all alone -- no `.FIELD'. */
685 error (0, 0, _("invalid field specifier: `%s'"), s
);
691 if (s
[1] == '.' && s
[2] != '\0')
696 s_err
= xstrtol (s
+ 2, NULL
, 10, &tmp_long
, NULL
);
697 if (s_err
!= LONGINT_OK
|| tmp_long
<= 0 || tmp_long
> INT_MAX
)
699 error (0, 0, _("invalid field number: `%s'"), s
+ 2);
703 *file_index
= s
[0] - '0';
704 /* Convert to a zero-based index. */
705 *field_index
= (int) tmp_long
- 1;
712 error (0, 0, _("invalid file number in field spec: `%s'"), s
);
718 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
719 Return nonzero to indicate failure. */
722 add_field_list (const char *c_str
)
726 /* Make a writable copy of c_str. */
727 str
= (char *) alloca (strlen (c_str
) + 1);
734 int file_index
, field_index
;
737 p
= strpbrk (p
, ", \t");
740 invalid
= decode_field_spec (spec_item
, &file_index
, &field_index
);
743 add_field (file_index
, field_index
);
744 uni_blank
.nfields
= max (uni_blank
.nfields
, field_index
);
750 /* Create a blank line with COUNT fields separated by tabs. */
753 make_blank (struct line
*blank
, int count
)
756 blank
->nfields
= count
;
757 blank
->beg
= xmalloc (blank
->nfields
+ 1);
758 blank
->fields
= (struct field
*) xmalloc (sizeof (struct field
) * count
);
759 for (i
= 0; i
< blank
->nfields
; i
++)
761 blank
->beg
[i
] = '\t';
762 blank
->fields
[i
].beg
= &blank
->beg
[i
];
763 blank
->fields
[i
].len
= 0;
765 blank
->beg
[i
] = '\0';
766 blank
->lim
= &blank
->beg
[i
];
770 main (int argc
, char **argv
)
774 int optc
, prev_optc
= 0, nfiles
;
776 program_name
= argv
[0];
777 setlocale (LC_ALL
, "");
778 bindtextdomain (PACKAGE
, LOCALEDIR
);
779 textdomain (PACKAGE
);
781 /* Initialize this before parsing options. In parsing options,
782 it may be increased. */
783 uni_blank
.nfields
= 1;
785 parse_long_options (argc
, argv
, "join", version_string
, usage
);
790 while ((optc
= getopt_long_only (argc
, argv
, "-a:e:i1:2:o:t:v:", longopts
,
805 if (xstrtol (optarg
, NULL
, 10, &val
, NULL
) != LONGINT_OK
806 || (val
!= 1 && val
!= 2))
807 error (2, 0, _("invalid field number: `%s'"), optarg
);
809 print_unpairables_1
= 1;
811 print_unpairables_2
= 1;
815 empty_filler
= optarg
;
823 if (xstrtol (optarg
, NULL
, 10, &val
, NULL
) != LONGINT_OK
824 || val
<= 0 || val
> INT_MAX
)
826 error (2, 0, _("invalid field number for file 1: `%s'"), optarg
);
828 join_field_1
= (int) val
- 1;
832 if (xstrtol (optarg
, NULL
, 10, &val
, NULL
) != LONGINT_OK
833 || val
<= 0 || val
> INT_MAX
)
834 error (2, 0, _("invalid field number for file 2: `%s'"), optarg
);
835 join_field_2
= (int) val
- 1;
839 if (xstrtol (optarg
, NULL
, 10, &val
, NULL
) != LONGINT_OK
840 || val
<= 0 || val
> INT_MAX
)
841 error (2, 0, _("invalid field number: `%s'"), optarg
);
842 join_field_1
= join_field_2
= (int) val
- 1;
846 if (add_field_list (optarg
))
854 case 1: /* Non-option argument. */
855 if (prev_optc
== 'o' && optind
<= argc
- 2)
857 if (add_field_list (optarg
))
860 /* Might be continuation of args to -o. */
861 continue; /* Don't change `prev_optc'. */
866 error (0, 0, _("too many non-option arguments"));
869 names
[nfiles
++] = optarg
;
878 /* Now that we've seen the options, we can construct the blank line
880 make_blank (&uni_blank
, uni_blank
.nfields
);
884 error (0, 0, _("too few non-option arguments"));
888 fp1
= strcmp (names
[0], "-") ? fopen (names
[0], "r") : stdin
;
890 error (1, errno
, "%s", names
[0]);
891 fp2
= strcmp (names
[1], "-") ? fopen (names
[1], "r") : stdin
;
893 error (1, errno
, "%s", names
[1]);
895 error (1, errno
, _("both files cannot be standard input"));
898 if (fp1
!= stdin
&& fclose (fp1
) == EOF
)
899 error (1, errno
, "%s", names
[0]);
900 if (fp2
!= stdin
&& fclose (fp2
) == EOF
)
901 error (1, errno
, "%s", names
[1]);
902 if ((fp1
== stdin
|| fp2
== stdin
) && fclose (stdin
) == EOF
)
903 error (1, errno
, "-");
904 if (ferror (stdout
) || fclose (stdout
) == EOF
)
905 error (1, errno
, _("write error"));