/* join - join lines of two files on a common field
   Copyright (C) 91, 1995-2003 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

   Written by Mike Haertel, mike@gnu.ai.mit.edu.  */
24 #include <sys/types.h>
29 #include "hard-locale.h"
30 #include "linebuffer.h"
31 #include "memcasecmp.h"
37 /* The official name of this program (e.g., no `g' prefix). */
38 #define PROGRAM_NAME "join"
40 #define AUTHORS "Mike Haertel"
42 #define join system_join
/* An element of the list identifying which fields to print for each
   output line.  The list is singly linked through NEXT.  */
struct outlist
  {
    /* File number: 0, 1, or 2.  0 means use the join field.
       1 means use the first file argument, 2 the second.  */
    int file;

    /* Field index (zero-based), specified only when FILE is 1 or 2.  */
    size_t field;

    /* Next element of the list, or NULL at the end.  */
    struct outlist *next;
  };
/* A field of a line.  The field is a slice of the line's buffer; it is
   described by a start pointer and a length, not by a terminator.  */
struct field
  {
    char *beg;			/* First character in field.  */
    size_t len;			/* The length of the field.  */
  };
65 /* A line read from an input file. */
68 struct linebuffer buf
; /* The line itself. */
69 size_t nfields
; /* Number of elements in `fields'. */
70 size_t nfields_allocated
; /* Number of elements allocated for `fields'. */
/* One or more consecutive lines read from a file that all have the
   same join field value.  */
struct seq
  {
    size_t count;		/* Elements used in `lines'.  */
    size_t alloc;		/* Elements allocated in `lines'.  */
    struct line *lines;		/* The lines themselves.  */
  };
83 /* The name this program was run with. */
86 /* True if the LC_COLLATE locale is hard. */
87 static bool hard_LC_COLLATE
;
89 /* True if obsolete option usage should be supported. */
90 static bool obsolete_usage
;
92 /* If nonzero, print unpairable lines in file 1 or 2. */
93 static bool print_unpairables_1
, print_unpairables_2
;
95 /* If nonzero, print pairable lines. */
96 static bool print_pairables
;
98 /* Empty output field filler. */
99 static char const *empty_filler
;
101 /* Field to join on. */
102 static size_t join_field_1
, join_field_2
;
104 /* List of fields to print. */
105 static struct outlist outlist_head
;
107 /* Last element in `outlist', where a new element can be added. */
108 static struct outlist
*outlist_end
= &outlist_head
;
110 /* Tab character separating fields; if this is NUL fields are separated
111 by any nonempty string of white space, otherwise by exactly one
115 /* When using getopt_long_only, no long option can start with
116 a character that is a short option. */
117 static struct option
const longopts
[] =
119 /* These three options are obsolete; see OBSOLETE_LONG_OPTIONS below. */
120 {"j", required_argument
, NULL
, 'j'},
121 {"j1", required_argument
, NULL
, '1'},
122 {"j2", required_argument
, NULL
, '2'},
124 {"ignore-case", no_argument
, NULL
, 'i'},
125 {GETOPT_HELP_OPTION_DECL
},
126 {GETOPT_VERSION_OPTION_DECL
},
130 /* Number of options at the start of longopts that are obsolete. */
131 enum { OBSOLETE_LONG_OPTIONS
= 3 };
133 /* Used to print non-joining lines */
134 static struct line uni_blank
;
136 /* If nonzero, ignore case when comparing join fields. */
137 static bool ignore_case
;
139 /* Get the next option from the argument vector. */
142 get_option (int argc
, char **argv
)
144 return (obsolete_usage
145 ? getopt_long_only (argc
, argv
, "-a:e:i1:2:o:t:v:", longopts
, NULL
)
146 : getopt_long (argc
, argv
, "a:e:ij:1:2:o:t:v:",
147 longopts
+ OBSOLETE_LONG_OPTIONS
, NULL
));
154 fprintf (stderr
, _("Try `%s --help' for more information.\n"),
159 Usage: %s [OPTION]... FILE1 FILE2\n\
163 For each pair of input lines with identical join fields, write a line to\n\
164 standard output. The default join field is the first, delimited\n\
165 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
167 -a FILENUM print unpairable lines coming from file FILENUM, where\n\
168 FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
169 -e EMPTY replace missing input fields with EMPTY\n\
172 -i, --ignore-case ignore differences in case when comparing fields\n\
173 -j FIELD equivalent to `-1 FIELD -2 FIELD'\n\
174 -o FORMAT obey FORMAT while constructing output line\n\
175 -t CHAR use CHAR as input and output field separator\n\
178 -v FILENUM like -a FILENUM, but suppress joined output lines\n\
179 -1 FIELD join on this FIELD of file 1\n\
180 -2 FIELD join on this FIELD of file 2\n\
182 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
183 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
186 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
187 else fields are separated by CHAR. Any FIELD is a field number counted\n\
188 from 1. FORMAT is one or more comma or blank separated specifications,\n\
189 each being `FILENUM.FIELD' or `0'. Default FORMAT outputs the join field,\n\
190 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
191 separated by CHAR.\n\
193 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
195 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT
);
197 exit (status
== 0 ? EXIT_SUCCESS
: EXIT_FAILURE
);
/* Return true if C is a blank (a default input field separator).
   C is taken as unsigned char so the ISBLANK classification never
   sees a negative value.  */
static inline bool
is_blank (unsigned char c)
{
  return ISBLANK (c) != 0;
}
208 /* Record a field in LINE, with location FIELD and size LEN. */
211 extract_field (struct line
*line
, char *field
, size_t len
)
213 if (line
->nfields
>= line
->nfields_allocated
)
215 line
->fields
= x2nrealloc (line
->fields
, &line
->nfields_allocated
,
216 sizeof (struct field
));
218 line
->fields
[line
->nfields
].beg
= field
;
219 line
->fields
[line
->nfields
].len
= len
;
223 /* Fill in the `fields' structure in LINE. */
226 xfields (struct line
*line
)
228 char *ptr
= line
->buf
.buffer
;
229 char const *lim
= ptr
+ line
->buf
.length
- 1;
236 unsigned char t
= tab
;
238 for (; (sep
= memchr (ptr
, t
, lim
- ptr
)) != NULL
; ptr
= sep
+ 1)
239 extract_field (line
, ptr
, sep
- ptr
);
243 /* Skip leading blanks before the first field. */
244 while (is_blank (*ptr
))
251 for (sep
= ptr
+ 1; sep
!= lim
&& ! is_blank (*sep
); sep
++)
253 extract_field (line
, ptr
, sep
- ptr
);
256 for (ptr
= sep
+ 1; ptr
!= lim
&& is_blank (*ptr
); ptr
++)
262 extract_field (line
, ptr
, lim
- ptr
);
265 /* Read a line from FP into LINE and split it into fields.
266 Return true if successful. */
269 get_line (FILE *fp
, struct line
*line
)
271 initbuffer (&line
->buf
);
273 if (! readlinebuffer (&line
->buf
, fp
))
276 error (EXIT_FAILURE
, errno
, _("read error"));
277 free (line
->buf
.buffer
);
278 line
->buf
.buffer
= NULL
;
282 line
->nfields_allocated
= 0;
290 freeline (struct line
*line
)
293 free (line
->buf
.buffer
);
294 line
->buf
.buffer
= NULL
;
298 initseq (struct seq
*seq
)
305 /* Read a line from FP and add it to SEQ. Return true if successful. */
308 getseq (FILE *fp
, struct seq
*seq
)
310 if (seq
->count
== seq
->alloc
)
311 seq
->lines
= x2nrealloc (seq
->lines
, &seq
->alloc
, sizeof *seq
->lines
);
313 if (get_line (fp
, &seq
->lines
[seq
->count
]))
322 delseq (struct seq
*seq
)
325 for (i
= 0; i
< seq
->count
; i
++)
326 if (seq
->lines
[i
].buf
.buffer
)
327 freeline (&seq
->lines
[i
]);
331 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
332 >0 if it compares greater; 0 if it compares equal.
333 Report an error and exit if the comparison fails. */
336 keycmp (struct line
const *line1
, struct line
const *line2
)
338 /* Start of field to compare in each file. */
343 size_t len2
; /* Length of fields to compare. */
346 if (join_field_1
< line1
->nfields
)
348 beg1
= line1
->fields
[join_field_1
].beg
;
349 len1
= line1
->fields
[join_field_1
].len
;
357 if (join_field_2
< line2
->nfields
)
359 beg2
= line2
->fields
[join_field_2
].beg
;
360 len2
= line2
->fields
[join_field_2
].len
;
369 return len2
== 0 ? 0 : -1;
375 /* FIXME: ignore_case does not work with NLS (in particular,
376 with multibyte chars). */
377 diff
= memcasecmp (beg1
, beg2
, MIN (len1
, len2
));
381 if (HAVE_SETLOCALE
&& hard_LC_COLLATE
)
382 return xmemcoll (beg1
, len1
, beg2
, len2
);
383 diff
= memcmp (beg1
, beg2
, MIN (len1
, len2
));
388 return len1
< len2
? -1 : len1
!= len2
;
391 /* Print field N of LINE if it exists and is nonempty, otherwise
392 `empty_filler' if it is nonempty. */
395 prfield (size_t n
, struct line
const *line
)
399 if (n
< line
->nfields
)
401 len
= line
->fields
[n
].len
;
403 fwrite (line
->fields
[n
].beg
, 1, len
, stdout
);
404 else if (empty_filler
)
405 fputs (empty_filler
, stdout
);
407 else if (empty_filler
)
408 fputs (empty_filler
, stdout
);
411 /* Print the join of LINE1 and LINE2. */
414 prjoin (struct line
const *line1
, struct line
const *line2
)
416 const struct outlist
*outlist
;
417 char output_separator
= tab
? tab
: ' ';
419 outlist
= outlist_head
.next
;
422 const struct outlist
*o
;
428 struct line
const *line
;
432 if (line1
== &uni_blank
)
435 field
= join_field_2
;
440 field
= join_field_1
;
445 line
= (o
->file
== 1 ? line1
: line2
);
448 prfield (field
, line
);
452 putchar (output_separator
);
460 if (line1
== &uni_blank
)
462 struct line
const *t
;
467 prfield (join_field_1
, line1
);
468 for (i
= 0; i
< join_field_1
&& i
< line1
->nfields
; ++i
)
470 putchar (output_separator
);
473 for (i
= join_field_1
+ 1; i
< line1
->nfields
; ++i
)
475 putchar (output_separator
);
479 for (i
= 0; i
< join_field_2
&& i
< line2
->nfields
; ++i
)
481 putchar (output_separator
);
484 for (i
= join_field_2
+ 1; i
< line2
->nfields
; ++i
)
486 putchar (output_separator
);
493 /* Print the join of the files in FP1 and FP2. */
496 join (FILE *fp1
, FILE *fp2
)
498 struct seq seq1
, seq2
;
503 /* Read the first line of each file. */
509 while (seq1
.count
&& seq2
.count
)
512 diff
= keycmp (&seq1
.lines
[0], &seq2
.lines
[0]);
515 if (print_unpairables_1
)
516 prjoin (&seq1
.lines
[0], &uni_blank
);
517 freeline (&seq1
.lines
[0]);
524 if (print_unpairables_2
)
525 prjoin (&uni_blank
, &seq2
.lines
[0]);
526 freeline (&seq2
.lines
[0]);
532 /* Keep reading lines from file1 as long as they continue to
533 match the current line from file2. */
536 if (!getseq (fp1
, &seq1
))
542 while (!keycmp (&seq1
.lines
[seq1
.count
- 1], &seq2
.lines
[0]));
544 /* Keep reading lines from file2 as long as they continue to
545 match the current line from file1. */
548 if (!getseq (fp2
, &seq2
))
554 while (!keycmp (&seq1
.lines
[0], &seq2
.lines
[seq2
.count
- 1]));
558 for (i
= 0; i
< seq1
.count
- 1; ++i
)
561 for (j
= 0; j
< seq2
.count
- 1; ++j
)
562 prjoin (&seq1
.lines
[i
], &seq2
.lines
[j
]);
566 for (i
= 0; i
< seq1
.count
- 1; ++i
)
567 freeline (&seq1
.lines
[i
]);
570 seq1
.lines
[0] = seq1
.lines
[seq1
.count
- 1];
576 for (i
= 0; i
< seq2
.count
- 1; ++i
)
577 freeline (&seq2
.lines
[i
]);
580 seq2
.lines
[0] = seq2
.lines
[seq2
.count
- 1];
587 if (print_unpairables_1
&& seq1
.count
)
589 prjoin (&seq1
.lines
[0], &uni_blank
);
590 freeline (&seq1
.lines
[0]);
591 while (get_line (fp1
, &line
))
593 prjoin (&line
, &uni_blank
);
598 if (print_unpairables_2
&& seq2
.count
)
600 prjoin (&uni_blank
, &seq2
.lines
[0]);
601 freeline (&seq2
.lines
[0]);
602 while (get_line (fp2
, &line
))
604 prjoin (&uni_blank
, &line
);
613 /* Add a field spec for field FIELD of file FILE to `outlist'. */
616 add_field (int file
, size_t field
)
620 assert (file
== 0 || file
== 1 || file
== 2);
621 assert (file
!= 0 || field
== 0);
623 o
= xmalloc (sizeof *o
);
628 /* Add to the end of the list so the fields are in the right order. */
629 outlist_end
->next
= o
;
633 /* Convert a string of decimal digits, STR (the 1-based join field number),
634 to an integral value. Upon successful conversion, return one less
635 (the zero-based field number). If it cannot be converted, give a
636 diagnostic and exit. */
639 string_to_join_field (char const *str
, char const *err_msg_fmt
)
644 strtol_error s_err
= xstrtoumax (str
, NULL
, 10, &val
, "");
645 if (s_err
== LONGINT_OVERFLOW
|| SIZE_MAX
< val
)
647 error (EXIT_FAILURE
, 0,
648 _("value %s is so large that it is not representable"),
652 if (s_err
!= LONGINT_OK
|| val
== 0)
653 error (EXIT_FAILURE
, 0, err_msg_fmt
, quote (str
));
/* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
   pair.  In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
   If S is valid, return true.  Otherwise, give a diagnostic, don't update
   *FILE_INDEX or *FIELD_INDEX, and return false.

   Accepted forms are `0' alone, `1.FIELD' and `2.FIELD'.  An invalid
   FIELD number is diagnosed by string_to_join_field, which exits.  */
static bool
decode_field_spec (const char *s, int *file_index, size_t *field_index)
{
  bool valid = false;

  /* The first character must be 0, 1, or 2.  */
  switch (s[0])
    {
    case '0':
      if (s[1] == '\0')
        {
          *file_index = 0;
          /* Give *field_index a value to avoid compiler warning.  */
          *field_index = 0;
          valid = true;
        }
      else
        {
          /* `0' must be all alone -- no `.FIELD'.  */
          error (0, 0, _("invalid field specifier: `%s'"), s);
        }
      break;

    case '1':
    case '2':
      if (s[1] == '.' && s[2] != '\0')
        {
          *field_index
            = string_to_join_field (s + 2, _("invalid field number: %s"));
          *file_index = s[0] - '0';
          valid = true;
        }
      else
        {
          /* A file number must be followed by `.FIELD'.  Diagnose it
             here so the caller does not fail silently.  */
          error (0, 0, _("invalid field specifier: `%s'"), s);
        }
      break;

    default:
      error (0, 0, _("invalid file number in field spec: `%s'"), s);
      break;
    }
  return valid;
}
/* Add the comma or blank separated field spec(s) in STR to `outlist'.
   Return true if successful.  STR is modified: each separator is
   overwritten with a NUL so that every spec becomes its own string.
   Specs preceding an invalid one have already been added when false
   is returned.  */
static bool
add_field_list (char *str)
{
  char *p = str;

  do
    {
      int file_index;
      size_t field_index;
      char const *spec_item = p;

      /* Terminate this spec and step to the start of the next one.  */
      p = strpbrk (p, ", \t");
      if (p)
        *p++ = 0;
      if (! decode_field_spec (spec_item, &file_index, &field_index))
        return false;
      add_field (file_index, field_index);
    }
  while (p);

  return true;
}
/* Add NAME to the array of input file NAMES; currently there are
   *NFILES names in the list.  If the array already holds two names,
   give a diagnostic and exit via usage.  */
static void
add_file_name (char const *name, char const *names[2], int *nfiles)
{
  if (*nfiles == 2)
    {
      error (0, 0, _("too many non-option arguments"));
      usage (EXIT_FAILURE);
    }
  names[(*nfiles)++] = name;
}
746 main (int argc
, char **argv
)
748 char const *names
[2];
750 int optc
, prev_optc
= 0, nfiles
;
752 initialize_main (&argc
, &argv
);
753 program_name
= argv
[0];
754 setlocale (LC_ALL
, "");
755 bindtextdomain (PACKAGE
, LOCALEDIR
);
756 textdomain (PACKAGE
);
757 hard_LC_COLLATE
= hard_locale (LC_COLLATE
);
758 obsolete_usage
= (posix2_version () < 200112);
760 atexit (close_stdout
);
763 print_pairables
= true;
765 while ((optc
= get_option (argc
, argv
)) != -1)
775 print_pairables
= false;
779 if (xstrtol (optarg
, NULL
, 10, &val
, "") != LONGINT_OK
780 || (val
!= 1 && val
!= 2))
781 error (EXIT_FAILURE
, 0, _("invalid field number: `%s'"), optarg
);
783 print_unpairables_1
= true;
785 print_unpairables_2
= true;
789 empty_filler
= optarg
;
798 string_to_join_field (optarg
,
799 _("invalid field number for file 1: `%s'"));
804 string_to_join_field (optarg
,
805 _("invalid field number for file 2: `%s'"));
809 join_field_1
= join_field_2
=
810 string_to_join_field (optarg
,
811 _("invalid field number: `%s'"));
815 if (! add_field_list (optarg
))
823 case 1: /* Non-option argument. */
824 if (prev_optc
== 'o' && optind
<= argc
- 2)
826 if (! add_field_list (optarg
))
829 /* Might be continuation of args to -o. */
830 continue; /* Don't change `prev_optc'. */
832 add_file_name (optarg
, names
, &nfiles
);
835 case_GETOPT_HELP_CHAR
;
837 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
840 usage (EXIT_FAILURE
);
845 if (! obsolete_usage
)
846 while (optind
< argc
)
847 add_file_name (argv
[optind
++], names
, &nfiles
);
851 error (0, 0, _("too few non-option arguments"));
852 usage (EXIT_FAILURE
);
855 fp1
= STREQ (names
[0], "-") ? stdin
: fopen (names
[0], "r");
857 error (EXIT_FAILURE
, errno
, "%s", names
[0]);
858 fp2
= STREQ (names
[1], "-") ? stdin
: fopen (names
[1], "r");
860 error (EXIT_FAILURE
, errno
, "%s", names
[1]);
862 error (EXIT_FAILURE
, errno
, _("both files cannot be standard input"));
865 if (fp1
!= stdin
&& fclose (fp1
) == EOF
)
866 error (EXIT_FAILURE
, errno
, "%s", names
[0]);
867 if (fp2
!= stdin
&& fclose (fp2
) == EOF
)
868 error (EXIT_FAILURE
, errno
, "%s", names
[1]);
869 if ((fp1
== stdin
|| fp2
== stdin
) && fclose (stdin
) == EOF
)
870 error (EXIT_FAILURE
, errno
, "-");