1 /* join - join lines of two files on a common field
2 Copyright (C) 91, 1995-2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
23 #include <sys/types.h>
28 #include "hard-locale.h"
29 #include "linebuffer.h"
30 #include "memcasecmp.h"
36 /* The official name of this program (e.g., no `g' prefix). */
37 #define PROGRAM_NAME "join"
39 #define AUTHORS "Mike Haertel"
41 #define join system_join
43 /* An element of the list identifying which fields to print for each
47 /* File number: 0, 1, or 2. 0 means use the join field.
48 1 means use the first file argument, 2 the second. */
51 /* Field index (zero-based), specified only when FILE is 1 or 2. */
57 /* A field of a line. */
60 char *beg
; /* First character in field. */
61 size_t len
; /* The length of the field. */
64 /* A line read from an input file. */
67 struct linebuffer buf
; /* The line itself. */
68 size_t nfields
; /* Number of elements in `fields'. */
69 size_t nfields_allocated
; /* Number of elements allocated for `fields'. */
73 /* One or more consecutive lines read from a file that all have the
74 same join field value. */
77 size_t count
; /* Elements used in `lines'. */
78 size_t alloc
; /* Elements allocated in `lines'. */
82 /* The name this program was run with. */
85 /* True if the LC_COLLATE locale is hard. */
86 static bool hard_LC_COLLATE
;
88 /* If true, print unpairable lines in file 1 or 2. */
89 static bool print_unpairables_1
, print_unpairables_2
;
91 /* If true, print pairable lines. */
92 static bool print_pairables
;
94 /* Empty output field filler. */
95 static char const *empty_filler
;
97 /* Field to join on; SIZE_MAX means they haven't been determined yet. */
98 static size_t join_field_1
= SIZE_MAX
;
99 static size_t join_field_2
= SIZE_MAX
;
101 /* List of fields to print. */
102 static struct outlist outlist_head
;
104 /* Last element in `outlist', where a new element can be added. */
105 static struct outlist
*outlist_end
= &outlist_head
;
107 /* Tab character separating fields. If negative, fields are separated
108 by any nonempty string of blanks, otherwise by exactly one
109 tab character whose value (when cast to unsigned char) equals TAB. */
112 static struct option
const longopts
[] =
114 {"ignore-case", no_argument
, NULL
, 'i'},
115 {GETOPT_HELP_OPTION_DECL
},
116 {GETOPT_VERSION_OPTION_DECL
},
120 /* Used to print non-joining lines */
121 static struct line uni_blank
;
123 /* If true, ignore case when comparing join fields. */
124 static bool ignore_case
;
129 if (status
!= EXIT_SUCCESS
)
130 fprintf (stderr
, _("Try `%s --help' for more information.\n"),
135 Usage: %s [OPTION]... FILE1 FILE2\n\
139 For each pair of input lines with identical join fields, write a line to\n\
140 standard output. The default join field is the first, delimited\n\
141 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
143 -a FILENUM print unpairable lines coming from file FILENUM, where\n\
144 FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
145 -e EMPTY replace missing input fields with EMPTY\n\
148 -i, --ignore-case ignore differences in case when comparing fields\n\
149 -j FIELD equivalent to `-1 FIELD -2 FIELD'\n\
150 -o FORMAT obey FORMAT while constructing output line\n\
151 -t CHAR use CHAR as input and output field separator\n\
154 -v FILENUM like -a FILENUM, but suppress joined output lines\n\
155 -1 FIELD join on this FIELD of file 1\n\
156 -2 FIELD join on this FIELD of file 2\n\
158 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
159 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
162 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
163 else fields are separated by CHAR. Any FIELD is a field number counted\n\
164 from 1. FORMAT is one or more comma or blank separated specifications,\n\
165 each being `FILENUM.FIELD' or `0'. Default FORMAT outputs the join field,\n\
166 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
167 separated by CHAR.\n\
169 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
170 E.g., use `sort -k 1b,1' if `join' has no options.\n\
172 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT
);
177 /* Record a field in LINE, with location FIELD and size LEN. */
180 extract_field (struct line
*line
, char *field
, size_t len
)
182 if (line
->nfields
>= line
->nfields_allocated
)
184 line
->fields
= X2NREALLOC (line
->fields
, &line
->nfields_allocated
);
186 line
->fields
[line
->nfields
].beg
= field
;
187 line
->fields
[line
->nfields
].len
= len
;
191 /* Fill in the `fields' structure in LINE. */
194 xfields (struct line
*line
)
196 char *ptr
= line
->buf
.buffer
;
197 char const *lim
= ptr
+ line
->buf
.length
- 1;
205 for (; (sep
= memchr (ptr
, tab
, lim
- ptr
)) != NULL
; ptr
= sep
+ 1)
206 extract_field (line
, ptr
, sep
- ptr
);
210 /* Skip leading blanks before the first field. */
211 while (isblank (to_uchar (*ptr
)))
218 for (sep
= ptr
+ 1; sep
!= lim
&& ! isblank (to_uchar (*sep
)); sep
++)
220 extract_field (line
, ptr
, sep
- ptr
);
223 for (ptr
= sep
+ 1; ptr
!= lim
&& isblank (to_uchar (*ptr
)); ptr
++)
229 extract_field (line
, ptr
, lim
- ptr
);
232 /* Read a line from FP into LINE and split it into fields.
233 Return true if successful. */
236 get_line (FILE *fp
, struct line
*line
)
238 initbuffer (&line
->buf
);
240 if (! readlinebuffer (&line
->buf
, fp
))
243 error (EXIT_FAILURE
, errno
, _("read error"));
244 free (line
->buf
.buffer
);
245 line
->buf
.buffer
= NULL
;
249 line
->nfields_allocated
= 0;
257 freeline (struct line
*line
)
260 free (line
->buf
.buffer
);
261 line
->buf
.buffer
= NULL
;
265 initseq (struct seq
*seq
)
272 /* Read a line from FP and add it to SEQ. Return true if successful. */
275 getseq (FILE *fp
, struct seq
*seq
)
277 if (seq
->count
== seq
->alloc
)
278 seq
->lines
= X2NREALLOC (seq
->lines
, &seq
->alloc
);
280 if (get_line (fp
, &seq
->lines
[seq
->count
]))
289 delseq (struct seq
*seq
)
292 for (i
= 0; i
< seq
->count
; i
++)
293 if (seq
->lines
[i
].buf
.buffer
)
294 freeline (&seq
->lines
[i
]);
298 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
299 >0 if it compares greater; 0 if it compares equal.
300 Report an error and exit if the comparison fails. */
303 keycmp (struct line
const *line1
, struct line
const *line2
)
305 /* Start of field to compare in each file. */
310 size_t len2
; /* Length of fields to compare. */
313 if (join_field_1
< line1
->nfields
)
315 beg1
= line1
->fields
[join_field_1
].beg
;
316 len1
= line1
->fields
[join_field_1
].len
;
324 if (join_field_2
< line2
->nfields
)
326 beg2
= line2
->fields
[join_field_2
].beg
;
327 len2
= line2
->fields
[join_field_2
].len
;
336 return len2
== 0 ? 0 : -1;
342 /* FIXME: ignore_case does not work with NLS (in particular,
343 with multibyte chars). */
344 diff
= memcasecmp (beg1
, beg2
, MIN (len1
, len2
));
349 return xmemcoll (beg1
, len1
, beg2
, len2
);
350 diff
= memcmp (beg1
, beg2
, MIN (len1
, len2
));
355 return len1
< len2
? -1 : len1
!= len2
;
358 /* Print field N of LINE if it exists and is nonempty, otherwise
359 `empty_filler' if it is nonempty. */
362 prfield (size_t n
, struct line
const *line
)
366 if (n
< line
->nfields
)
368 len
= line
->fields
[n
].len
;
370 fwrite (line
->fields
[n
].beg
, 1, len
, stdout
);
371 else if (empty_filler
)
372 fputs (empty_filler
, stdout
);
374 else if (empty_filler
)
375 fputs (empty_filler
, stdout
);
378 /* Print the join of LINE1 and LINE2. */
381 prjoin (struct line
const *line1
, struct line
const *line2
)
383 const struct outlist
*outlist
;
384 char output_separator
= tab
< 0 ? ' ' : tab
;
386 outlist
= outlist_head
.next
;
389 const struct outlist
*o
;
395 struct line
const *line
;
399 if (line1
== &uni_blank
)
402 field
= join_field_2
;
407 field
= join_field_1
;
412 line
= (o
->file
== 1 ? line1
: line2
);
415 prfield (field
, line
);
419 putchar (output_separator
);
427 if (line1
== &uni_blank
)
429 struct line
const *t
;
434 prfield (join_field_1
, line1
);
435 for (i
= 0; i
< join_field_1
&& i
< line1
->nfields
; ++i
)
437 putchar (output_separator
);
440 for (i
= join_field_1
+ 1; i
< line1
->nfields
; ++i
)
442 putchar (output_separator
);
446 for (i
= 0; i
< join_field_2
&& i
< line2
->nfields
; ++i
)
448 putchar (output_separator
);
451 for (i
= join_field_2
+ 1; i
< line2
->nfields
; ++i
)
453 putchar (output_separator
);
460 /* Print the join of the files in FP1 and FP2. */
463 join (FILE *fp1
, FILE *fp2
)
465 struct seq seq1
, seq2
;
470 /* Read the first line of each file. */
476 while (seq1
.count
&& seq2
.count
)
479 diff
= keycmp (&seq1
.lines
[0], &seq2
.lines
[0]);
482 if (print_unpairables_1
)
483 prjoin (&seq1
.lines
[0], &uni_blank
);
484 freeline (&seq1
.lines
[0]);
491 if (print_unpairables_2
)
492 prjoin (&uni_blank
, &seq2
.lines
[0]);
493 freeline (&seq2
.lines
[0]);
499 /* Keep reading lines from file1 as long as they continue to
500 match the current line from file2. */
503 if (!getseq (fp1
, &seq1
))
509 while (!keycmp (&seq1
.lines
[seq1
.count
- 1], &seq2
.lines
[0]));
511 /* Keep reading lines from file2 as long as they continue to
512 match the current line from file1. */
515 if (!getseq (fp2
, &seq2
))
521 while (!keycmp (&seq1
.lines
[0], &seq2
.lines
[seq2
.count
- 1]));
525 for (i
= 0; i
< seq1
.count
- 1; ++i
)
528 for (j
= 0; j
< seq2
.count
- 1; ++j
)
529 prjoin (&seq1
.lines
[i
], &seq2
.lines
[j
]);
533 for (i
= 0; i
< seq1
.count
- 1; ++i
)
534 freeline (&seq1
.lines
[i
]);
537 seq1
.lines
[0] = seq1
.lines
[seq1
.count
- 1];
543 for (i
= 0; i
< seq2
.count
- 1; ++i
)
544 freeline (&seq2
.lines
[i
]);
547 seq2
.lines
[0] = seq2
.lines
[seq2
.count
- 1];
554 if (print_unpairables_1
&& seq1
.count
)
556 prjoin (&seq1
.lines
[0], &uni_blank
);
557 freeline (&seq1
.lines
[0]);
558 while (get_line (fp1
, &line
))
560 prjoin (&line
, &uni_blank
);
565 if (print_unpairables_2
&& seq2
.count
)
567 prjoin (&uni_blank
, &seq2
.lines
[0]);
568 freeline (&seq2
.lines
[0]);
569 while (get_line (fp2
, &line
))
571 prjoin (&uni_blank
, &line
);
580 /* Add a field spec for field FIELD of file FILE to `outlist'. */
583 add_field (int file
, size_t field
)
587 assert (file
== 0 || file
== 1 || file
== 2);
588 assert (file
!= 0 || field
== 0);
590 o
= xmalloc (sizeof *o
);
595 /* Add to the end of the list so the fields are in the right order. */
596 outlist_end
->next
= o
;
600 /* Convert a string of decimal digits, STR (the 1-based join field number),
601 to an integral value. Upon successful conversion, return one less
602 (the zero-based field number). Silently convert too-large values
603 to SIZE_MAX - 1. Otherwise, if a value cannot be converted, give a
604 diagnostic and exit. */
607 string_to_join_field (char const *str
)
610 unsigned long int val
;
611 verify (SIZE_MAX
<= ULONG_MAX
);
613 strtol_error s_err
= xstrtoul (str
, NULL
, 10, &val
, "");
614 if (s_err
== LONGINT_OVERFLOW
|| (s_err
== LONGINT_OK
&& SIZE_MAX
< val
))
616 else if (s_err
!= LONGINT_OK
|| val
== 0)
617 error (EXIT_FAILURE
, 0, _("invalid field number: %s"), quote (str
));
/* Decode one field specifier, S, into *FILE_INDEX and *FIELD_INDEX.
   S is either `0' alone, meaning the join field, or `1.N' / `2.N',
   where N is a 1-based field number; *FIELD_INDEX is zero-based.
   If S is malformed, give a diagnostic and exit.  */

static void
decode_field_spec (const char *s, int *file_index, size_t *field_index)
{
  /* The first character must be 0, 1, or 2.  */
  char const which = s[0];

  if (which == '0')
    {
      /* `0' must be all alone -- no `.FIELD'.  */
      if (s[1])
        error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
      *file_index = 0;
      *field_index = 0;
    }
  else if (which == '1' || which == '2')
    {
      if (s[1] != '.')
        error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
      *file_index = which - '0';
      *field_index = string_to_join_field (s + 2);
    }
  else
    {
      error (EXIT_FAILURE, 0,
             _("invalid file number in field spec: %s"), quote (s));

      /* Tell gcc -W -Wall that we can't get beyond this point.
         This avoids a warning (otherwise legit) that the caller's copies
         of *file_index and *field_index might be used uninitialized.  */
      abort ();
    }
}
/* Decode each comma- or blank-separated field specification in STR and
   append it to `outlist'.  STR is modified in place: every separator
   found is overwritten with a NUL so that each item can be decoded as a
   string of its own.  */

static void
add_field_list (char *str)
{
  char *rest = str;

  while (rest)
    {
      int file_index;
      size_t field_index;
      char const *spec_item = rest;

      /* Cut the list at the next separator, if there is one.  */
      rest = strpbrk (rest, ", \t");
      if (rest)
        *rest++ = '\0';

      decode_field_spec (spec_item, &file_index, &field_index);
      add_field (file_index, field_index);
    }
}
/* Store VAL in the join field *VAR.  If *VAR already holds a different
   value (SIZE_MAX meaning "not set yet"), report the two 1-based field
   numbers as incompatible and exit.  */

static void
set_join_field (size_t *var, size_t val)
{
  bool already_set = (*var != SIZE_MAX);

  if (already_set && *var != val)
    {
      /* Convert back to the 1-based numbers the user typed.  */
      unsigned long int var1 = *var + 1;
      unsigned long int val1 = val + 1;
      error (EXIT_FAILURE, 0, _("incompatible join fields %lu, %lu"),
             var1, val1);
    }

  *var = val;
}
703 /* Status of command-line arguments. */
707 /* This argument must be an operand, i.e., one of the files to be
711 /* This might be the argument of the preceding -j1 or -j2 option,
712 or it might be an operand. */
716 /* This might be the argument of the preceding -o option, or it might be
721 /* Add NAME to the array of input file NAMES with operand statuses
722 OPERAND_STATUS; currently there are NFILES names in the list. */
725 add_file_name (char *name
, char *names
[2],
726 int operand_status
[2], int joption_count
[2], int *nfiles
,
727 int *prev_optc_status
, int *optc_status
)
733 bool op0
= (operand_status
[0] == MUST_BE_OPERAND
);
734 char *arg
= names
[op0
];
735 switch (operand_status
[op0
])
737 case MUST_BE_OPERAND
:
738 error (0, 0, _("extra operand %s"), quote (name
));
739 usage (EXIT_FAILURE
);
741 case MIGHT_BE_J1_ARG
:
743 set_join_field (&join_field_1
, string_to_join_field (arg
));
746 case MIGHT_BE_J2_ARG
:
748 set_join_field (&join_field_2
, string_to_join_field (arg
));
752 add_field_list (arg
);
757 operand_status
[0] = operand_status
[1];
763 operand_status
[n
] = *prev_optc_status
;
766 if (*prev_optc_status
== MIGHT_BE_O_ARG
)
767 *optc_status
= MIGHT_BE_O_ARG
;
771 main (int argc
, char **argv
)
774 int prev_optc_status
= MUST_BE_OPERAND
;
775 int operand_status
[2];
776 int joption_count
[2] = { 0, 0 };
783 initialize_main (&argc
, &argv
);
784 program_name
= argv
[0];
785 setlocale (LC_ALL
, "");
786 bindtextdomain (PACKAGE
, LOCALEDIR
);
787 textdomain (PACKAGE
);
788 hard_LC_COLLATE
= hard_locale (LC_COLLATE
);
790 atexit (close_stdout
);
792 print_pairables
= true;
794 while ((optc
= getopt_long (argc
, argv
, "-a:e:i1:2:j:o:t:v:",
798 optc_status
= MUST_BE_OPERAND
;
803 print_pairables
= false;
808 unsigned long int val
;
809 if (xstrtoul (optarg
, NULL
, 10, &val
, "") != LONGINT_OK
810 || (val
!= 1 && val
!= 2))
811 error (EXIT_FAILURE
, 0,
812 _("invalid field number: %s"), quote (optarg
));
814 print_unpairables_1
= true;
816 print_unpairables_2
= true;
821 if (empty_filler
&& ! STREQ (empty_filler
, optarg
))
822 error (EXIT_FAILURE
, 0,
823 _("conflicting empty-field replacement strings"));
824 empty_filler
= optarg
;
832 set_join_field (&join_field_1
, string_to_join_field (optarg
));
836 set_join_field (&join_field_2
, string_to_join_field (optarg
));
840 if ((optarg
[0] == '1' || optarg
[0] == '2') && !optarg
[1]
841 && optarg
== argv
[optind
- 1] + 2)
843 /* The argument was either "-j1" or "-j2". */
844 bool is_j2
= (optarg
[0] == '2');
845 joption_count
[is_j2
]++;
846 optc_status
= MIGHT_BE_J1_ARG
+ is_j2
;
850 set_join_field (&join_field_1
, string_to_join_field (optarg
));
851 set_join_field (&join_field_2
, join_field_1
);
856 add_field_list (optarg
);
857 optc_status
= MIGHT_BE_O_ARG
;
862 unsigned char newtab
= optarg
[0];
864 error (EXIT_FAILURE
, 0, _("empty tab"));
867 if (STREQ (optarg
, "\\0"))
870 error (EXIT_FAILURE
, 0, _("multi-character tab %s"),
873 if (0 <= tab
&& tab
!= newtab
)
874 error (EXIT_FAILURE
, 0, _("incompatible tabs"));
879 case 1: /* Non-option argument. */
880 add_file_name (optarg
, names
, operand_status
, joption_count
,
881 &nfiles
, &prev_optc_status
, &optc_status
);
884 case_GETOPT_HELP_CHAR
;
886 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
889 usage (EXIT_FAILURE
);
892 prev_optc_status
= optc_status
;
895 /* Process any operands after "--". */
896 prev_optc_status
= MUST_BE_OPERAND
;
897 while (optind
< argc
)
898 add_file_name (argv
[optind
++], names
, operand_status
, joption_count
,
899 &nfiles
, &prev_optc_status
, &optc_status
);
904 error (0, 0, _("missing operand"));
906 error (0, 0, _("missing operand after %s"), quote (argv
[argc
- 1]));
907 usage (EXIT_FAILURE
);
910 /* If "-j1" was specified and it turns out not to have had an argument,
911 treat it as "-j 1". Likewise for -j2. */
912 for (i
= 0; i
< 2; i
++)
913 if (joption_count
[i
] != 0)
915 set_join_field (&join_field_1
, i
);
916 set_join_field (&join_field_2
, i
);
919 if (join_field_1
== SIZE_MAX
)
921 if (join_field_2
== SIZE_MAX
)
924 fp1
= STREQ (names
[0], "-") ? stdin
: fopen (names
[0], "r");
926 error (EXIT_FAILURE
, errno
, "%s", names
[0]);
927 fp2
= STREQ (names
[1], "-") ? stdin
: fopen (names
[1], "r");
929 error (EXIT_FAILURE
, errno
, "%s", names
[1]);
931 error (EXIT_FAILURE
, errno
, _("both files cannot be standard input"));
934 if (fclose (fp1
) != 0)
935 error (EXIT_FAILURE
, errno
, "%s", names
[0]);
936 if (fclose (fp2
) != 0)
937 error (EXIT_FAILURE
, errno
, "%s", names
[1]);