1 /* join - join lines of two files on a common field
2 Copyright (C) 91, 1995-2002 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
24 #include <sys/types.h>
30 #include "hard-locale.h"
31 #include "linebuffer.h"
32 #include "memcasecmp.h"
36 /* The official name of this program (e.g., no `g' prefix). */
37 #define PROGRAM_NAME "join"
39 #define AUTHORS "Mike Haertel"
41 #define join system_join
43 /* Undefine, to avoid warning about redefinition on some systems. */
46 #define min(A, B) ((A) < (B) ? (A) : (B))
47 #define max(A, B) ((A) > (B) ? (A) : (B))
49 /* An element of the list identifying which fields to print for each
53 /* File number: 0, 1, or 2. 0 means use the join field.
54 1 means use the first file argument, 2 the second. */
57 /* Field index (zero-based), specified only when FILE is 1 or 2. */
63 /* A field of a line. */
66 const unsigned char *beg
; /* First character in field. */
67 size_t len
; /* The length of the field. */
70 /* A line read from an input file. */
73 struct linebuffer buf
; /* The line itself. */
74 int nfields
; /* Number of elements in `fields'. */
75 int nfields_allocated
; /* Number of elements in `fields'. */
79 /* One or more consecutive lines read from a file that all have the
80 same join field value. */
83 int count
; /* Elements used in `lines'. */
84 int alloc
; /* Elements allocated in `lines'. */
88 /* The name this program was run with. */
91 /* Nonzero if the LC_COLLATE locale is hard. */
92 static int hard_LC_COLLATE
;
94 /* If nonzero, print unpairable lines in file 1 or 2. */
95 static int print_unpairables_1
, print_unpairables_2
;
97 /* If nonzero, print pairable lines. */
98 static int print_pairables
;
100 /* Empty output field filler. */
101 static char *empty_filler
;
103 /* Field to join on. */
104 static int join_field_1
, join_field_2
;
106 /* List of fields to print. */
107 static struct outlist outlist_head
;
109 /* Last element in `outlist', where a new element can be added. */
110 static struct outlist
*outlist_end
= &outlist_head
;
112 /* Tab character separating fields; if this is NUL fields are separated
113 by any nonempty string of white space, otherwise by exactly one
115 static unsigned char tab
;
117 /* When using getopt_long_only, no long option can start with
118 a character that is a short option. */
119 static struct option
const longopts
[] =
121 {"ignore-case", no_argument
, NULL
, 'i'},
122 {"j", required_argument
, NULL
, 'j'},
123 {"j1", required_argument
, NULL
, '1'},
124 {"j2", required_argument
, NULL
, '2'},
125 {GETOPT_HELP_OPTION_DECL
},
126 {GETOPT_VERSION_OPTION_DECL
},
130 /* Used to print non-joining lines */
131 static struct line uni_blank
;
133 /* If nonzero, ignore case when comparing join fields. */
134 static int ignore_case
;
140 fprintf (stderr
, _("Try `%s --help' for more information.\n"),
145 Usage: %s [OPTION]... FILE1 FILE2\n\
149 For each pair of input lines with identical join fields, write a line to\n\
150 standard output. The default join field is the first, delimited\n\
151 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
153 -a SIDE print unpairable lines coming from file SIDE\n\
154 -e EMPTY replace missing input fields with EMPTY\n\
157 -i, --ignore-case ignore differences in case when comparing fields\n\
158 -j FIELD (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
159 -j1 FIELD (obsolescent) equivalent to `-1 FIELD'\n\
160 -j2 FIELD (obsolescent) equivalent to `-2 FIELD'\n\
161 -o FORMAT obey FORMAT while constructing output line\n\
162 -t CHAR use CHAR as input and output field separator\n\
165 -v SIDE like -a SIDE, but suppress joined output lines\n\
166 -1 FIELD join on this FIELD of file 1\n\
167 -2 FIELD join on this FIELD of file 2\n\
169 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
170 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
173 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
174 else fields are separated by CHAR. Any FIELD is a field number counted\n\
175 from 1. FORMAT is one or more comma or blank separated specifications,\n\
176 each being `SIDE.FIELD' or `0'. Default FORMAT outputs the join field,\n\
177 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
178 separated by CHAR.\n\
180 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT
);
182 exit (status
== 0 ? EXIT_SUCCESS
: EXIT_FAILURE
);
186 ADD_FIELD (struct line
*line
, const unsigned char *field
, size_t len
)
188 if (line
->nfields
>= line
->nfields_allocated
)
190 line
->nfields_allocated
= (3 * line
->nfields_allocated
) / 2 + 1;
191 line
->fields
= (struct field
*) xrealloc ((char *) line
->fields
,
192 (line
->nfields_allocated
193 * sizeof (struct field
)));
195 line
->fields
[line
->nfields
].beg
= field
;
196 line
->fields
[line
->nfields
].len
= len
;
200 /* Fill in the `fields' structure in LINE. */
203 xfields (struct line
*line
)
206 unsigned char *ptr0
= (unsigned char *) line
->buf
.buffer
;
211 lim
= ptr0
+ line
->buf
.length
- 1;
215 /* Skip leading blanks before the first field. */
216 while (ptr
< lim
&& ISBLANK (*ptr
))
220 for (i
= 0; ptr
< lim
; ++i
)
227 while (ptr
< lim
&& *ptr
!= tab
)
229 ADD_FIELD (line
, beg
, ptr
- beg
);
238 while (ptr
< lim
&& !ISBLANK (*ptr
))
240 ADD_FIELD (line
, beg
, ptr
- beg
);
241 while (ptr
< lim
&& ISBLANK (*ptr
))
246 if (ptr
!= ptr0
&& ((!tab
&& ISBLANK (ptr
[-1])) || ptr
[-1] == tab
))
248 /* Add one more (empty) field because the last character of the
249 line was a delimiter. */
250 ADD_FIELD (line
, NULL
, 0);
254 /* Read a line from FP into LINE and split it into fields.
255 Return 0 if EOF, 1 otherwise. */
258 get_line (FILE *fp
, struct line
*line
)
260 initbuffer (&line
->buf
);
262 if (! readline (&line
->buf
, fp
))
264 free (line
->buf
.buffer
);
265 line
->buf
.buffer
= NULL
;
269 line
->nfields_allocated
= 0;
277 freeline (struct line
*line
)
279 free ((char *) line
->fields
);
280 free (line
->buf
.buffer
);
281 line
->buf
.buffer
= NULL
;
285 initseq (struct seq
*seq
)
289 seq
->lines
= (struct line
*) xmalloc (seq
->alloc
* sizeof (struct line
));
292 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
295 getseq (FILE *fp
, struct seq
*seq
)
297 if (seq
->count
== seq
->alloc
)
300 seq
->lines
= (struct line
*)
301 xrealloc ((char *) seq
->lines
, seq
->alloc
* sizeof (struct line
));
304 if (get_line (fp
, &seq
->lines
[seq
->count
]))
313 delseq (struct seq
*seq
)
316 for (i
= 0; i
< seq
->count
; i
++)
317 if (seq
->lines
[i
].buf
.buffer
)
318 freeline (&seq
->lines
[i
]);
319 free ((char *) seq
->lines
);
322 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
323 >0 if it compares greater; 0 if it compares equal.
324 Report an error and exit if the comparison fails. */
327 keycmp (struct line
*line1
, struct line
*line2
)
329 /* Start of field to compare in each file. */
330 const unsigned char *beg1
, *beg2
;
332 size_t len1
, len2
; /* Length of fields to compare. */
335 if (join_field_1
< line1
->nfields
)
337 beg1
= line1
->fields
[join_field_1
].beg
;
338 len1
= line1
->fields
[join_field_1
].len
;
346 if (join_field_2
< line2
->nfields
)
348 beg2
= line2
->fields
[join_field_2
].beg
;
349 len2
= line2
->fields
[join_field_2
].len
;
358 return len2
== 0 ? 0 : -1;
362 /* Use an if-statement here rather than a function variable to
363 avoid portability hassles of getting a non-conflicting declaration
367 /* FIXME: ignore_case does not work with NLS (in particular,
368 with multibyte chars). */
369 diff
= memcasecmp (beg1
, beg2
, min (len1
, len2
));
373 if (HAVE_SETLOCALE
&& hard_LC_COLLATE
)
374 return xmemcoll ((char *) beg1
, len1
, (char *) beg2
, len2
);
375 diff
= memcmp (beg1
, beg2
, min (len1
, len2
));
380 return len1
< len2
? -1 : len1
!= len2
;
383 /* Print field N of LINE if it exists and is nonempty, otherwise
384 `empty_filler' if it is nonempty. */
387 prfield (int n
, struct line
*line
)
391 if (n
< line
->nfields
)
393 len
= line
->fields
[n
].len
;
395 fwrite (line
->fields
[n
].beg
, 1, len
, stdout
);
396 else if (empty_filler
)
397 fputs (empty_filler
, stdout
);
399 else if (empty_filler
)
400 fputs (empty_filler
, stdout
);
403 /* Print the join of LINE1 and LINE2. */
406 prjoin (struct line
*line1
, struct line
*line2
)
408 const struct outlist
*outlist
;
410 outlist
= outlist_head
.next
;
413 const struct outlist
*o
;
423 if (line1
== &uni_blank
)
426 field
= join_field_2
;
431 field
= join_field_1
;
436 line
= (o
->file
== 1 ? line1
: line2
);
437 assert (o
->field
>= 0);
440 prfield (field
, line
);
444 putchar (tab
? tab
: ' ');
452 if (line1
== &uni_blank
)
459 prfield (join_field_1
, line1
);
460 for (i
= 0; i
< join_field_1
&& i
< line1
->nfields
; ++i
)
462 putchar (tab
? tab
: ' ');
465 for (i
= join_field_1
+ 1; i
< line1
->nfields
; ++i
)
467 putchar (tab
? tab
: ' ');
471 for (i
= 0; i
< join_field_2
&& i
< line2
->nfields
; ++i
)
473 putchar (tab
? tab
: ' ');
476 for (i
= join_field_2
+ 1; i
< line2
->nfields
; ++i
)
478 putchar (tab
? tab
: ' ');
485 /* Print the join of the files in FP1 and FP2. */
488 join (FILE *fp1
, FILE *fp2
)
490 struct seq seq1
, seq2
;
492 int diff
, i
, j
, eof1
, eof2
;
494 /* Read the first line of each file. */
500 while (seq1
.count
&& seq2
.count
)
502 diff
= keycmp (&seq1
.lines
[0], &seq2
.lines
[0]);
505 if (print_unpairables_1
)
506 prjoin (&seq1
.lines
[0], &uni_blank
);
507 freeline (&seq1
.lines
[0]);
514 if (print_unpairables_2
)
515 prjoin (&uni_blank
, &seq2
.lines
[0]);
516 freeline (&seq2
.lines
[0]);
522 /* Keep reading lines from file1 as long as they continue to
523 match the current line from file2. */
526 if (!getseq (fp1
, &seq1
))
532 while (!keycmp (&seq1
.lines
[seq1
.count
- 1], &seq2
.lines
[0]));
534 /* Keep reading lines from file2 as long as they continue to
535 match the current line from file1. */
538 if (!getseq (fp2
, &seq2
))
544 while (!keycmp (&seq1
.lines
[0], &seq2
.lines
[seq2
.count
- 1]));
548 for (i
= 0; i
< seq1
.count
- 1; ++i
)
549 for (j
= 0; j
< seq2
.count
- 1; ++j
)
550 prjoin (&seq1
.lines
[i
], &seq2
.lines
[j
]);
553 for (i
= 0; i
< seq1
.count
- 1; ++i
)
554 freeline (&seq1
.lines
[i
]);
557 seq1
.lines
[0] = seq1
.lines
[seq1
.count
- 1];
563 for (i
= 0; i
< seq2
.count
- 1; ++i
)
564 freeline (&seq2
.lines
[i
]);
567 seq2
.lines
[0] = seq2
.lines
[seq2
.count
- 1];
574 if (print_unpairables_1
&& seq1
.count
)
576 prjoin (&seq1
.lines
[0], &uni_blank
);
577 freeline (&seq1
.lines
[0]);
578 while (get_line (fp1
, &line
))
580 prjoin (&line
, &uni_blank
);
585 if (print_unpairables_2
&& seq2
.count
)
587 prjoin (&uni_blank
, &seq2
.lines
[0]);
588 freeline (&seq2
.lines
[0]);
589 while (get_line (fp2
, &line
))
591 prjoin (&uni_blank
, &line
);
600 /* Add a field spec for field FIELD of file FILE to `outlist'. */
603 add_field (int file
, int field
)
607 assert (file
== 0 || file
== 1 || file
== 2);
608 assert (file
== 0 ? field
< 0 : field
>= 0);
610 o
= (struct outlist
*) xmalloc (sizeof (struct outlist
));
615 /* Add to the end of the list so the fields are in the right order. */
616 outlist_end
->next
= o
;
620 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
621 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
622 If S is valid, return zero. Otherwise, give a diagnostic, don't update
623 *FILE_INDEX or *FIELD_INDEX, and return nonzero. */
626 decode_field_spec (const char *s
, int *file_index
, int *field_index
)
630 /* The first character must be 0, 1, or 2. */
637 /* Give *field_index an invalid value. */
643 /* `0' must be all alone -- no `.FIELD'. */
644 error (0, 0, _("invalid field specifier: `%s'"), s
);
650 if (s
[1] == '.' && s
[2] != '\0')
655 s_err
= xstrtol (s
+ 2, NULL
, 10, &tmp_long
, "");
656 if (s_err
!= LONGINT_OK
|| tmp_long
<= 0 || tmp_long
> INT_MAX
)
658 error (0, 0, _("invalid field number: `%s'"), s
+ 2);
662 *file_index
= s
[0] - '0';
663 /* Convert to a zero-based index. */
664 *field_index
= (int) tmp_long
- 1;
671 error (0, 0, _("invalid file number in field spec: `%s'"), s
);
677 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
678 Return nonzero to indicate failure. */
681 add_field_list (const char *c_str
)
685 /* Make a writable copy of c_str. */
686 str
= (char *) alloca (strlen (c_str
) + 1);
693 int file_index
, field_index
;
696 p
= strpbrk (p
, ", \t");
699 invalid
= decode_field_spec (spec_item
, &file_index
, &field_index
);
702 add_field (file_index
, field_index
);
703 uni_blank
.nfields
= max (uni_blank
.nfields
, field_index
);
709 /* Create a blank line with COUNT fields separated by tabs. */
712 make_blank (struct line
*blank
, int count
)
715 unsigned char *buffer
;
716 struct field
*fields
;
717 blank
->nfields
= count
;
718 blank
->buf
.size
= blank
->buf
.length
= count
+ 1;
719 blank
->buf
.buffer
= xmalloc (blank
->buf
.size
);
720 buffer
= (unsigned char *) blank
->buf
.buffer
;
721 blank
->fields
= fields
=
722 (struct field
*) xmalloc (sizeof (struct field
) * count
);
723 for (i
= 0; i
< count
; i
++)
726 fields
[i
].beg
= &buffer
[i
];
733 main (int argc
, char **argv
)
737 int optc
, prev_optc
= 0, nfiles
;
739 program_name
= argv
[0];
740 setlocale (LC_ALL
, "");
741 bindtextdomain (PACKAGE
, LOCALEDIR
);
742 textdomain (PACKAGE
);
743 hard_LC_COLLATE
= hard_locale (LC_COLLATE
);
745 atexit (close_stdout
);
747 /* Initialize this before parsing options. In parsing options,
748 it may be increased. */
749 uni_blank
.nfields
= 1;
754 while ((optc
= getopt_long_only (argc
, argv
, "-a:e:i1:2:o:t:v:", longopts
,
769 if (xstrtol (optarg
, NULL
, 10, &val
, "") != LONGINT_OK
770 || (val
!= 1 && val
!= 2))
771 error (EXIT_FAILURE
, 0, _("invalid field number: `%s'"), optarg
);
773 print_unpairables_1
= 1;
775 print_unpairables_2
= 1;
779 empty_filler
= optarg
;
787 if (xstrtol (optarg
, NULL
, 10, &val
, "") != LONGINT_OK
788 || val
<= 0 || val
> INT_MAX
)
790 error (EXIT_FAILURE
, 0,
791 _("invalid field number for file 1: `%s'"), optarg
);
793 join_field_1
= (int) val
- 1;
797 if (xstrtol (optarg
, NULL
, 10, &val
, "") != LONGINT_OK
798 || val
<= 0 || val
> INT_MAX
)
799 error (EXIT_FAILURE
, 0,
800 _("invalid field number for file 2: `%s'"), optarg
);
801 join_field_2
= (int) val
- 1;
805 if (xstrtol (optarg
, NULL
, 10, &val
, "") != LONGINT_OK
806 || val
<= 0 || val
> INT_MAX
)
807 error (EXIT_FAILURE
, 0, _("invalid field number: `%s'"), optarg
);
808 join_field_1
= join_field_2
= (int) val
- 1;
812 if (add_field_list (optarg
))
820 case 1: /* Non-option argument. */
821 if (prev_optc
== 'o' && optind
<= argc
- 2)
823 if (add_field_list (optarg
))
826 /* Might be continuation of args to -o. */
827 continue; /* Don't change `prev_optc'. */
832 error (0, 0, _("too many non-option arguments"));
835 names
[nfiles
++] = optarg
;
838 case_GETOPT_HELP_CHAR
;
840 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
848 /* Now that we've seen the options, we can construct the blank line
850 make_blank (&uni_blank
, uni_blank
.nfields
);
854 error (0, 0, _("too few non-option arguments"));
858 fp1
= STREQ (names
[0], "-") ? stdin
: fopen (names
[0], "r");
860 error (EXIT_FAILURE
, errno
, "%s", names
[0]);
861 fp2
= STREQ (names
[1], "-") ? stdin
: fopen (names
[1], "r");
863 error (EXIT_FAILURE
, errno
, "%s", names
[1]);
865 error (EXIT_FAILURE
, errno
, _("both files cannot be standard input"));
868 if (fp1
!= stdin
&& fclose (fp1
) == EOF
)
869 error (EXIT_FAILURE
, errno
, "%s", names
[0]);
870 if (fp2
!= stdin
&& fclose (fp2
) == EOF
)
871 error (EXIT_FAILURE
, errno
, "%s", names
[1]);
872 if ((fp1
== stdin
|| fp2
== stdin
) && fclose (stdin
) == EOF
)
873 error (EXIT_FAILURE
, errno
, "-");