tests: add fold(1) test for --bytes option
[coreutils.git] / src / join.c
blobfd28febfb9058763eadd1d6ce1866dc16a1488cc
1 /* join - join lines of two files on a common field
2 Copyright (C) 1991-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>.
17 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
19 #include <config.h>
21 #include <sys/types.h>
22 #include <getopt.h>
24 #include "system.h"
25 #include "assure.h"
26 #include "fadvise.h"
27 #include "hard-locale.h"
28 #include "linebuffer.h"
29 #include "mcel.h"
30 #include "memcasecmp.h"
31 #include "quote.h"
32 #include "skipchars.h"
33 #include "stdio--.h"
34 #include "xmemcoll.h"
35 #include "xstrtol.h"
36 #include "argmatch.h"
/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "join"

/* Author credit, consumed by the --version handling in main.  */
#define AUTHORS proper_name ("Mike Haertel")

/* Every later use of the identifier 'join' in this file, including the
   definition of and call to join (), is renamed to 'system_join'.
   NOTE(review): presumably this avoids a clash with a 'join' symbol
   declared by some system header -- confirm.  */
#define join system_join
/* Exchange the two 'struct line *' lvalues A and B.
   The expansion intentionally has no trailing semicolon: the caller
   supplies it, so 'SWAPLINES (x, y);' is exactly one statement and is
   safe as the body of an unbraced if/else.  */
#define SWAPLINES(a, b) do { \
  struct line *tmp = a; \
  a = b; \
  b = tmp; \
} while (0)
51 /* An element of the list identifying which fields to print for each
52 output line. */
53 struct outlist
55 /* File number: 0, 1, or 2. 0 means use the join field.
56 1 means use the first file argument, 2 the second. */
57 int file;
59 /* Field index (zero-based), specified only when FILE is 1 or 2. */
60 idx_t field;
62 struct outlist *next;
65 /* A field of a line. */
66 struct field
68 char *beg; /* First character in field. */
69 idx_t len; /* The length of the field. */
72 /* A line read from an input file. */
73 struct line
75 struct linebuffer buf; /* The line itself. */
76 idx_t nfields; /* Number of elements in 'fields'. */
77 idx_t nfields_allocated; /* Number of elements allocated for 'fields'. */
78 struct field *fields;
81 /* One or more consecutive lines read from a file that all have the
82 same join field value. */
83 struct seq
85 idx_t count; /* Elements used in 'lines'. */
86 idx_t alloc; /* Elements allocated in 'lines'. */
87 struct line **lines;
90 /* The previous line read from each file. */
91 static struct line *prevline[2] = {nullptr, nullptr};
93 /* The number of lines read from each file. */
94 static uintmax_t line_no[2] = {0, 0};
96 /* The input file names. */
97 static char *g_names[2];
99 /* This provides an extra line buffer for each file. We need these if we
100 try to read two consecutive lines into the same buffer, since we don't
101 want to overwrite the previous buffer before we check order. */
102 static struct line *spareline[2] = {nullptr, nullptr};
104 /* True if the LC_COLLATE locale is hard. */
105 static bool hard_LC_COLLATE;
107 /* If nonzero, print unpairable lines in file 1 or 2. */
108 static bool print_unpairables_1, print_unpairables_2;
110 /* If nonzero, print pairable lines. */
111 static bool print_pairables;
113 /* If nonzero, we have seen at least one unpairable line. */
114 static bool seen_unpairable;
116 /* If nonzero, we have warned about disorder in that file. */
117 static bool issued_disorder_warning[2];
119 /* Empty output field filler. */
120 static char const *empty_filler;
122 /* Whether to ensure the same number of fields are output from each line. */
123 static bool autoformat;
124 /* The number of fields to output for each line.
125 Only significant when autoformat is true. */
126 static idx_t autocount_1;
127 static idx_t autocount_2;
129 /* Field to join on; -1 means they haven't been determined yet. */
130 static ptrdiff_t join_field_1 = -1;
131 static ptrdiff_t join_field_2 = -1;
133 /* List of fields to print. */
134 static struct outlist outlist_head;
136 /* Last element in 'outlist', where a new element can be added. */
137 static struct outlist *outlist_end = &outlist_head;
139 /* Tab character (or encoding error) separating fields. If TAB.len == 0,
140 fields are separated by any nonempty string of blanks, otherwise by
141 exactly one tab character (or encoding error) equal to TAB. */
142 static mcel_t tab;
144 /* The output separator to use, and its length in bytes. */
145 static char const *output_separator = " ";
146 static idx_t output_seplen = 1;
/* How to check that the input is correctly ordered (see check_order):
   CHECK_ORDER_DEFAULT  - diagnose disorder as a warning, but only once an
                          unpairable line has been seen;
   CHECK_ORDER_ENABLED  - always check, and treat disorder as fatal;
   CHECK_ORDER_DISABLED - never check.  */
static enum
  {
    CHECK_ORDER_DEFAULT,
    CHECK_ORDER_ENABLED,
    CHECK_ORDER_DISABLED
  } check_input_order;

/* Values returned by getopt_long for long options that have no
   single-character equivalent.  They start above CHAR_MAX so they
   cannot collide with any option character.  */
enum
{
  CHECK_ORDER_OPTION = CHAR_MAX + 1,
  NOCHECK_ORDER_OPTION,
  HEADER_LINE_OPTION
};
164 static struct option const longopts[] =
166 {"ignore-case", no_argument, nullptr, 'i'},
167 {"check-order", no_argument, nullptr, CHECK_ORDER_OPTION},
168 {"nocheck-order", no_argument, nullptr, NOCHECK_ORDER_OPTION},
169 {"zero-terminated", no_argument, nullptr, 'z'},
170 {"header", no_argument, nullptr, HEADER_LINE_OPTION},
171 {GETOPT_HELP_OPTION_DECL},
172 {GETOPT_VERSION_OPTION_DECL},
173 {nullptr, 0, nullptr, 0}
176 /* Used to print non-joining lines */
177 static struct line uni_blank;
179 /* If nonzero, ignore case when comparing join fields. */
180 static bool ignore_case;
182 /* If nonzero, treat the first line of each file as column headers --
183 join them without checking for ordering */
184 static bool join_header_lines;
186 /* The character marking end of line. Default to \n. */
187 static char eolchar = '\n';
/* Print usage information and exit with STATUS.
   If STATUS is not EXIT_SUCCESS, only emit_try_help () is called;
   otherwise the full help text is written to standard output.
   This function does not return: it ends by calling exit (STATUS).

   NOTE(review): the help strings below are emitted at run time and are
   gettext message ids, so their bytes must not be altered casually.
   The column alignment and blank-line continuations look as if they
   were lost when this text was extracted -- confirm against the
   pristine source before editing any of them.  */
189 void
190 usage (int status)
192 if (status != EXIT_SUCCESS)
193 emit_try_help ();
194 else
196 printf (_("\
197 Usage: %s [OPTION]... FILE1 FILE2\n\
199 program_name);
200 fputs (_("\
201 For each pair of input lines with identical join fields, write a line to\n\
202 standard output. The default join field is the first, delimited by blanks.\
204 "), stdout);
205 fputs (_("\
207 When FILE1 or FILE2 (not both) is -, read standard input.\n\
208 "), stdout);
209 fputs (_("\
211 -a FILENUM also print unpairable lines from file FILENUM, where\n\
212 FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
213 "), stdout);
214 fputs (_("\
215 -e STRING replace missing (empty) input fields with STRING;\n\
216 I.e., missing fields specified with '-12jo' options\
218 "), stdout);
219 fputs (_("\
220 -i, --ignore-case ignore differences in case when comparing fields\n\
221 -j FIELD equivalent to '-1 FIELD -2 FIELD'\n\
222 -o FORMAT obey FORMAT while constructing output line\n\
223 -t CHAR use CHAR as input and output field separator\n\
224 "), stdout);
225 fputs (_("\
226 -v FILENUM like -a FILENUM, but suppress joined output lines\n\
227 -1 FIELD join on this FIELD of file 1\n\
228 -2 FIELD join on this FIELD of file 2\n\
229 --check-order check that the input is correctly sorted, even\n\
230 if all input lines are pairable\n\
231 --nocheck-order do not check that the input is correctly sorted\n\
232 --header treat the first line in each file as field headers,\n\
233 print them without trying to pair them\n\
234 "), stdout);
235 fputs (_("\
236 -z, --zero-terminated line delimiter is NUL, not newline\n\
237 "), stdout);
238 fputs (HELP_OPTION_DESCRIPTION, stdout);
239 fputs (VERSION_OPTION_DESCRIPTION, stdout);
240 fputs (_("\
242 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
243 else fields are separated by CHAR. Any FIELD is a field number counted\n\
244 from 1. FORMAT is one or more comma or blank separated specifications,\n\
245 each being 'FILENUM.FIELD' or '0'. Default FORMAT outputs the join field,\n\
246 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
247 separated by CHAR. If FORMAT is the keyword 'auto', then the first\n\
248 line of each file determines the number of fields output for each line.\n\
250 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
251 E.g., use \"sort -k 1b,1\" if 'join' has no options,\n\
252 or use \"join -t ''\" if 'sort' has no options.\n\
253 Comparisons honor the rules specified by 'LC_COLLATE'.\n\
254 If the input is not sorted and some lines cannot be joined, a\n\
255 warning message will be given.\n\
256 "), stdout);
257 emit_ancillary_info (PROGRAM_NAME);
/* Reached on both paths: terminate the process with the caller's status.  */
259 exit (status);
262 /* Record a field in LINE, with location FIELD and size LEN. */
264 static void
265 extract_field (struct line *line, char *field, idx_t len)
267 if (line->nfields >= line->nfields_allocated)
268 line->fields = xpalloc (line->fields, &line->nfields_allocated, 1,
269 -1, sizeof *line->fields);
270 line->fields[line->nfields].beg = field;
271 line->fields[line->nfields].len = len;
272 ++(line->nfields);
275 static bool
276 eq_tab (mcel_t g)
278 return mcel_cmp (g, tab) == 0;
281 static bool
282 newline_or_blank (mcel_t g)
284 return g.ch == '\n' || c32isblank (g.ch);
287 /* Fill in the 'fields' structure in LINE. */
289 static void
290 xfields (struct line *line)
292 char *ptr = line->buf.buffer;
293 char const *lim = ptr + line->buf.length - 1;
295 if (ptr == lim)
296 return;
298 if (!tab.len)
299 while ((ptr = skip_buf_matching (ptr, lim, newline_or_blank, true)) < lim)
301 char *sep = skip_buf_matching (ptr, lim, newline_or_blank, false);
302 extract_field (line, ptr, sep - ptr);
303 ptr = sep;
305 else
307 if (tab.ch != '\n')
308 for (char *sep;
309 (sep = skip_buf_matching (ptr, lim, eq_tab, false)) < lim;
310 ptr = sep + mcel_scan (sep, lim).len)
311 extract_field (line, ptr, sep - ptr);
313 extract_field (line, ptr, lim - ptr);
317 static void
318 freeline (struct line *line)
320 if (line == nullptr)
321 return;
322 free (line->fields);
323 line->fields = nullptr;
324 free (line->buf.buffer);
325 line->buf.buffer = nullptr;
328 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
329 >0 if it compares greater; 0 if it compares equal.
330 Report an error and exit if the comparison fails.
331 Use join fields JF_1 and JF_2 respectively. */
333 static int
334 keycmp (struct line const *line1, struct line const *line2,
335 idx_t jf_1, idx_t jf_2)
337 /* Start of field to compare in each file. */
338 char *beg1;
339 char *beg2;
341 idx_t len1;
342 idx_t len2; /* Length of fields to compare. */
343 int diff;
345 if (jf_1 < line1->nfields)
347 beg1 = line1->fields[jf_1].beg;
348 len1 = line1->fields[jf_1].len;
350 else
352 beg1 = nullptr;
353 len1 = 0;
356 if (jf_2 < line2->nfields)
358 beg2 = line2->fields[jf_2].beg;
359 len2 = line2->fields[jf_2].len;
361 else
363 beg2 = nullptr;
364 len2 = 0;
367 if (len1 == 0)
368 return len2 == 0 ? 0 : -1;
369 if (len2 == 0)
370 return 1;
372 if (ignore_case)
374 /* FIXME: ignore_case does not work with NLS (in particular,
375 with multibyte chars). */
376 diff = memcasecmp (beg1, beg2, MIN (len1, len2));
378 else
380 if (hard_LC_COLLATE)
381 return xmemcoll (beg1, len1, beg2, len2);
382 diff = memcmp (beg1, beg2, MIN (len1, len2));
385 if (diff)
386 return diff;
387 return (len1 > len2) - (len1 < len2);
390 /* Check that successive input lines PREV and CURRENT from input file
391 WHATFILE are presented in order, unless the user may be relying on
392 the GNU extension that input lines may be out of order if no input
393 lines are unpairable.
395 If the user specified --nocheck-order, the check is not made.
396 If the user specified --check-order, the problem is fatal.
397 Otherwise (the default), the message is simply a warning.
399 A message is printed at most once per input file. */
401 static void
402 check_order (const struct line *prev,
403 const struct line *current,
404 int whatfile)
406 if (check_input_order != CHECK_ORDER_DISABLED
407 && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable))
409 if (!issued_disorder_warning[whatfile - 1])
411 idx_t join_field = whatfile == 1 ? join_field_1 : join_field_2;
412 if (keycmp (prev, current, join_field, join_field) > 0)
414 /* Exclude any trailing newline. */
415 idx_t len = current->buf.length;
416 if (0 < len && current->buf.buffer[len - 1] == '\n')
417 --len;
419 /* If the offending line is longer than INT_MAX, output
420 only the first INT_MAX bytes in this diagnostic. */
421 len = MIN (INT_MAX, len);
423 error ((check_input_order == CHECK_ORDER_ENABLED
424 ? EXIT_FAILURE : 0),
425 0, _("%s:%ju: is not sorted: %.*s"),
426 g_names[whatfile - 1], line_no[whatfile - 1],
427 (int) len, current->buf.buffer);
429 /* If we get to here, the message was merely a warning.
430 Arrange to issue it only once per file. */
431 issued_disorder_warning[whatfile - 1] = true;
437 static inline void
438 reset_line (struct line *line)
440 line->nfields = 0;
443 static struct line *
444 init_linep (struct line **linep)
446 struct line *line = xzalloc (sizeof *line);
447 *linep = line;
448 return line;
451 /* Read a line from FP into LINE and split it into fields.
452 Return true if successful. */
454 static bool
455 get_line (FILE *fp, struct line **linep, int which)
457 struct line *line = *linep;
459 if (line == prevline[which - 1])
461 SWAPLINES (line, spareline[which - 1]);
462 *linep = line;
465 if (line)
466 reset_line (line);
467 else
468 line = init_linep (linep);
470 if (! readlinebuffer_delim (&line->buf, fp, eolchar))
472 if (ferror (fp))
473 error (EXIT_FAILURE, errno, _("read error"));
474 freeline (line);
475 return false;
477 ++line_no[which - 1];
479 xfields (line);
481 if (prevline[which - 1])
482 check_order (prevline[which - 1], line, which);
484 prevline[which - 1] = line;
485 return true;
488 static void
489 free_spareline (void)
491 for (idx_t i = 0; i < ARRAY_CARDINALITY (spareline); i++)
493 if (spareline[i])
495 freeline (spareline[i]);
496 free (spareline[i]);
501 static void
502 initseq (struct seq *seq)
504 seq->count = 0;
505 seq->alloc = 0;
506 seq->lines = nullptr;
509 /* Read a line from FP and add it to SEQ. Return true if successful. */
511 static bool
512 getseq (FILE *fp, struct seq *seq, int whichfile)
514 if (seq->count == seq->alloc)
516 seq->lines = xpalloc (seq->lines, &seq->alloc, 1, -1, sizeof *seq->lines);
517 for (idx_t i = seq->count; i < seq->alloc; i++)
518 seq->lines[i] = nullptr;
521 if (get_line (fp, &seq->lines[seq->count], whichfile))
523 ++seq->count;
524 return true;
526 return false;
529 /* Read a line from FP and add it to SEQ, as the first item if FIRST is
530 true, else as the next. */
531 static bool
532 advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile)
534 if (first)
535 seq->count = 0;
537 return getseq (fp, seq, whichfile);
540 static void
541 delseq (struct seq *seq)
543 for (idx_t i = 0; i < seq->alloc; i++)
545 freeline (seq->lines[i]);
546 free (seq->lines[i]);
548 free (seq->lines);
552 /* Print field N of LINE if it exists and is nonempty, otherwise
553 'empty_filler' if it is nonempty. */
555 static void
556 prfield (idx_t n, struct line const *line)
558 if (n < line->nfields)
560 idx_t len = line->fields[n].len;
561 if (len)
562 fwrite (line->fields[n].beg, 1, len, stdout);
563 else if (empty_filler)
564 fputs (empty_filler, stdout);
566 else if (empty_filler)
567 fputs (empty_filler, stdout);
570 /* Output all the fields in line, other than the join field. */
572 static void
573 prfields (struct line const *line, idx_t join_field, idx_t autocount)
575 idx_t i;
576 idx_t nfields = autoformat ? autocount : line->nfields;
578 for (i = 0; i < join_field && i < nfields; ++i)
580 fwrite (output_separator, 1, output_seplen, stdout);
581 prfield (i, line);
583 for (i = join_field + 1; i < nfields; ++i)
585 fwrite (output_separator, 1, output_seplen, stdout);
586 prfield (i, line);
590 /* Print the join of LINE1 and LINE2. */
592 static void
593 prjoin (struct line const *line1, struct line const *line2)
595 const struct outlist *outlist;
596 idx_t field;
597 struct line const *line;
599 outlist = outlist_head.next;
600 if (outlist)
602 const struct outlist *o;
604 o = outlist;
605 while (true)
607 if (o->file == 0)
609 if (line1 == &uni_blank)
611 line = line2;
612 field = join_field_2;
614 else
616 line = line1;
617 field = join_field_1;
620 else
622 line = (o->file == 1 ? line1 : line2);
623 field = o->field;
625 prfield (field, line);
626 o = o->next;
627 if (o == nullptr)
628 break;
629 fwrite (output_separator, 1, output_seplen, stdout);
631 putchar (eolchar);
633 else
635 if (line1 == &uni_blank)
637 line = line2;
638 field = join_field_2;
640 else
642 line = line1;
643 field = join_field_1;
646 /* Output the join field. */
647 prfield (field, line);
649 /* Output other fields. */
650 prfields (line1, join_field_1, autocount_1);
651 prfields (line2, join_field_2, autocount_2);
653 putchar (eolchar);
656 if (ferror (stdout))
657 write_error ();
660 /* Print the join of the files in FP1 and FP2. */
662 static void
663 join (FILE *fp1, FILE *fp2)
665 struct seq seq1, seq2;
666 int diff;
667 bool eof1, eof2;
669 fadvise (fp1, FADVISE_SEQUENTIAL);
670 fadvise (fp2, FADVISE_SEQUENTIAL);
672 /* Read the first line of each file. */
673 initseq (&seq1);
674 getseq (fp1, &seq1, 1);
675 initseq (&seq2);
676 getseq (fp2, &seq2, 2);
678 if (autoformat)
680 autocount_1 = seq1.count ? seq1.lines[0]->nfields : 0;
681 autocount_2 = seq2.count ? seq2.lines[0]->nfields : 0;
684 if (join_header_lines && (seq1.count || seq2.count))
686 struct line const *hline1 = seq1.count ? seq1.lines[0] : &uni_blank;
687 struct line const *hline2 = seq2.count ? seq2.lines[0] : &uni_blank;
688 prjoin (hline1, hline2);
689 prevline[0] = nullptr;
690 prevline[1] = nullptr;
691 if (seq1.count)
692 advance_seq (fp1, &seq1, true, 1);
693 if (seq2.count)
694 advance_seq (fp2, &seq2, true, 2);
697 while (seq1.count && seq2.count)
699 diff = keycmp (seq1.lines[0], seq2.lines[0],
700 join_field_1, join_field_2);
701 if (diff < 0)
703 if (print_unpairables_1)
704 prjoin (seq1.lines[0], &uni_blank);
705 advance_seq (fp1, &seq1, true, 1);
706 seen_unpairable = true;
707 continue;
709 if (diff > 0)
711 if (print_unpairables_2)
712 prjoin (&uni_blank, seq2.lines[0]);
713 advance_seq (fp2, &seq2, true, 2);
714 seen_unpairable = true;
715 continue;
718 /* Keep reading lines from file1 as long as they continue to
719 match the current line from file2. */
720 eof1 = false;
722 if (!advance_seq (fp1, &seq1, false, 1))
724 eof1 = true;
725 ++seq1.count;
726 break;
728 while (!keycmp (seq1.lines[seq1.count - 1], seq2.lines[0],
729 join_field_1, join_field_2));
731 /* Keep reading lines from file2 as long as they continue to
732 match the current line from file1. */
733 eof2 = false;
735 if (!advance_seq (fp2, &seq2, false, 2))
737 eof2 = true;
738 ++seq2.count;
739 break;
741 while (!keycmp (seq1.lines[0], seq2.lines[seq2.count - 1],
742 join_field_1, join_field_2));
744 if (print_pairables)
746 for (idx_t i = 0; i < seq1.count - 1; ++i)
748 idx_t j;
749 for (j = 0; j < seq2.count - 1; ++j)
750 prjoin (seq1.lines[i], seq2.lines[j]);
754 if (!eof1)
756 SWAPLINES (seq1.lines[0], seq1.lines[seq1.count - 1]);
757 seq1.count = 1;
759 else
760 seq1.count = 0;
762 if (!eof2)
764 SWAPLINES (seq2.lines[0], seq2.lines[seq2.count - 1]);
765 seq2.count = 1;
767 else
768 seq2.count = 0;
771 /* If the user did not specify --nocheck-order, then we read the
772 tail ends of both inputs to verify that they are in order. We
773 skip the rest of the tail once we have issued a warning for that
774 file, unless we actually need to print the unpairable lines. */
775 struct line *line = nullptr;
776 bool checktail = false;
778 if (check_input_order != CHECK_ORDER_DISABLED
779 && !(issued_disorder_warning[0] && issued_disorder_warning[1]))
780 checktail = true;
782 if ((print_unpairables_1 || checktail) && seq1.count)
784 if (print_unpairables_1)
785 prjoin (seq1.lines[0], &uni_blank);
786 if (seq2.count)
787 seen_unpairable = true;
788 while (get_line (fp1, &line, 1))
790 if (print_unpairables_1)
791 prjoin (line, &uni_blank);
792 if (issued_disorder_warning[0] && !print_unpairables_1)
793 break;
797 if ((print_unpairables_2 || checktail) && seq2.count)
799 if (print_unpairables_2)
800 prjoin (&uni_blank, seq2.lines[0]);
801 if (seq1.count)
802 seen_unpairable = true;
803 while (get_line (fp2, &line, 2))
805 if (print_unpairables_2)
806 prjoin (&uni_blank, line);
807 if (issued_disorder_warning[1] && !print_unpairables_2)
808 break;
812 freeline (line);
813 free (line);
815 delseq (&seq1);
816 delseq (&seq2);
819 /* Add a field spec for field FIELD of file FILE to 'outlist'. */
821 static void
822 add_field (int file, idx_t field)
824 struct outlist *o;
826 affirm (file == 0 || file == 1 || file == 2);
827 affirm (file != 0 || field == 0);
829 o = xmalloc (sizeof *o);
830 o->file = file;
831 o->field = field;
832 o->next = nullptr;
834 /* Add to the end of the list so the fields are in the right order. */
835 outlist_end->next = o;
836 outlist_end = o;
839 /* Convert a string of decimal digits, STR (the 1-based join field number),
840 to an integral value. Upon successful conversion, return one less
841 (the zero-based field number). Silently convert too-large values
842 to PTRDIFF_MAX. Otherwise, if a value cannot be converted, give a
843 diagnostic and exit. */
845 static idx_t
846 string_to_join_field (char const *str)
848 intmax_t val;
850 strtol_error s_err = xstrtoimax (str, nullptr, 10, &val, "");
851 if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && PTRDIFF_MAX < val))
852 val = PTRDIFF_MAX;
853 else if (s_err != LONGINT_OK || val <= 0)
854 error (EXIT_FAILURE, 0, _("invalid field number: %s"), quote (str));
856 return val - 1;
859 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
860 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
861 If S is valid, return true. Otherwise, give a diagnostic and exit. */
863 static void
864 decode_field_spec (char const *s, int *file_index, idx_t *field_index)
866 /* The first character must be 0, 1, or 2. */
867 switch (s[0])
869 case '0':
870 if (s[1])
872 /* '0' must be all alone -- no '.FIELD'. */
873 error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
875 *file_index = 0;
876 *field_index = 0;
877 break;
879 case '1':
880 case '2':
881 if (s[1] != '.')
882 error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
883 *file_index = s[0] - '0';
884 *field_index = string_to_join_field (s + 2);
885 break;
887 default:
888 error (EXIT_FAILURE, 0,
889 _("invalid file number in field spec: %s"), quote (s));
893 static bool
894 comma_or_blank (mcel_t g)
896 return g.ch == ',' || c32isblank (g.ch);
899 /* Add the comma or blank separated field spec(s) in STR to 'outlist'. */
901 static void
902 add_field_list (char *str)
904 char *p = str;
908 int file_index;
909 idx_t field_index;
910 char const *spec_item = p;
911 p = skip_str_matching (spec_item, comma_or_blank, false);
912 if (*p)
914 mcel_t g = mcel_scanz (p);
915 *p = '\0';
916 p += g.len;
918 decode_field_spec (spec_item, &file_index, &field_index);
919 add_field (file_index, field_index);
921 while (*p);
924 /* Set the join field *VAR to VAL, but report an error if *VAR is set
925 more than once to incompatible values. */
927 static void
928 set_join_field (ptrdiff_t *var, idx_t val)
930 if (0 <= *var && *var != val)
931 error (EXIT_FAILURE, 0,
932 _("incompatible join fields %td, %td"), *var, val);
933 *var = val;
/* Status of command-line arguments.  */

enum operand_status
  {
    /* This argument must be an operand, i.e., one of the files to be
       joined.  */
    MUST_BE_OPERAND,

    /* This might be the argument of the preceding -j1 or -j2 option,
       or it might be an operand.  main computes the status as
       MIGHT_BE_J1_ARG + is_j2, so these two must stay adjacent and in
       this order.  */
    MIGHT_BE_J1_ARG,
    MIGHT_BE_J2_ARG,

    /* This might be the argument of the preceding -o option, or it might be
       an operand.  */
    MIGHT_BE_O_ARG
  };
954 /* Add NAME to the array of input file NAMES with operand statuses
955 OPERAND_STATUS; currently there are NFILES names in the list. */
957 static void
958 add_file_name (char *name, char *names[2],
959 int operand_status[2], int joption_count[2], int *nfiles,
960 int *prev_optc_status, int *optc_status)
962 int n = *nfiles;
964 if (n == 2)
966 bool op0 = (operand_status[0] == MUST_BE_OPERAND);
967 char *arg = names[op0];
968 switch (operand_status[op0])
970 case MUST_BE_OPERAND:
971 error (0, 0, _("extra operand %s"), quoteaf (name));
972 usage (EXIT_FAILURE);
974 case MIGHT_BE_J1_ARG:
975 joption_count[0]--;
976 set_join_field (&join_field_1, string_to_join_field (arg));
977 break;
979 case MIGHT_BE_J2_ARG:
980 joption_count[1]--;
981 set_join_field (&join_field_2, string_to_join_field (arg));
982 break;
984 case MIGHT_BE_O_ARG:
985 add_field_list (arg);
986 break;
988 if (!op0)
990 operand_status[0] = operand_status[1];
991 names[0] = names[1];
993 n = 1;
996 operand_status[n] = *prev_optc_status;
997 names[n] = name;
998 *nfiles = n + 1;
999 if (*prev_optc_status == MIGHT_BE_O_ARG)
1000 *optc_status = MIGHT_BE_O_ARG;
1004 main (int argc, char **argv)
1006 int optc_status;
1007 int prev_optc_status = MUST_BE_OPERAND;
1008 int operand_status[2];
1009 int joption_count[2] = { 0, 0 };
1010 FILE *fp1, *fp2;
1011 int optc;
1012 int nfiles = 0;
1013 int i;
1015 initialize_main (&argc, &argv);
1016 set_program_name (argv[0]);
1017 setlocale (LC_ALL, "");
1018 bindtextdomain (PACKAGE, LOCALEDIR);
1019 textdomain (PACKAGE);
1020 hard_LC_COLLATE = hard_locale (LC_COLLATE);
1022 atexit (close_stdout);
1023 atexit (free_spareline);
1025 print_pairables = true;
1026 seen_unpairable = false;
1027 issued_disorder_warning[0] = issued_disorder_warning[1] = false;
1028 check_input_order = CHECK_ORDER_DEFAULT;
1030 while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:z",
1031 longopts, nullptr))
1032 != -1)
1034 optc_status = MUST_BE_OPERAND;
1036 switch (optc)
1038 case 'v':
1039 print_pairables = false;
1040 FALLTHROUGH;
1042 case 'a':
1044 long int val;
1045 if (xstrtol (optarg, nullptr, 10, &val, "") != LONGINT_OK
1046 || (val != 1 && val != 2))
1047 error (EXIT_FAILURE, 0,
1048 _("invalid file number: %s"), quote (optarg));
1049 if (val == 1)
1050 print_unpairables_1 = true;
1051 else
1052 print_unpairables_2 = true;
1054 break;
1056 case 'e':
1057 if (empty_filler && ! STREQ (empty_filler, optarg))
1058 error (EXIT_FAILURE, 0,
1059 _("conflicting empty-field replacement strings"));
1060 empty_filler = optarg;
1061 break;
1063 case 'i':
1064 ignore_case = true;
1065 break;
1067 case '1':
1068 set_join_field (&join_field_1, string_to_join_field (optarg));
1069 break;
1071 case '2':
1072 set_join_field (&join_field_2, string_to_join_field (optarg));
1073 break;
1075 case 'j':
1076 if ((optarg[0] == '1' || optarg[0] == '2') && !optarg[1]
1077 && optarg == argv[optind - 1] + 2)
1079 /* The argument was either "-j1" or "-j2". */
1080 bool is_j2 = (optarg[0] == '2');
1081 joption_count[is_j2]++;
1082 optc_status = MIGHT_BE_J1_ARG + is_j2;
1084 else
1086 set_join_field (&join_field_1, string_to_join_field (optarg));
1087 set_join_field (&join_field_2, join_field_1);
1089 break;
1091 case 'o':
1092 if (STREQ (optarg, "auto"))
1093 autoformat = true;
1094 else
1096 add_field_list (optarg);
1097 optc_status = MIGHT_BE_O_ARG;
1099 break;
1101 case 't':
1103 mcel_t newtab;
1104 if (!*optarg)
1106 /* '' => process the whole line. */
1107 newtab = mcel_ch ('\n', 1);
1108 /* output_separator does not matter. */
1110 else if (STREQ (optarg, "\\0"))
1112 newtab = mcel_ch ('\0', 1);
1113 output_separator = "";
1115 else
1117 newtab = mcel_scanz (optarg);
1118 if (optarg[newtab.len])
1119 error (EXIT_FAILURE, 0, _("multi-character tab %s"),
1120 quote (optarg));
1121 output_separator = optarg;
1123 if (tab.len && mcel_cmp (tab, newtab) != 0)
1124 error (EXIT_FAILURE, 0, _("incompatible tabs"));
1125 tab = newtab;
1126 output_seplen = newtab.len;
1128 break;
1130 case 'z':
1131 eolchar = 0;
1132 break;
1134 case NOCHECK_ORDER_OPTION:
1135 check_input_order = CHECK_ORDER_DISABLED;
1136 break;
1138 case CHECK_ORDER_OPTION:
1139 check_input_order = CHECK_ORDER_ENABLED;
1140 break;
1142 case 1: /* Non-option argument. */
1143 add_file_name (optarg, g_names, operand_status, joption_count,
1144 &nfiles, &prev_optc_status, &optc_status);
1145 break;
1147 case HEADER_LINE_OPTION:
1148 join_header_lines = true;
1149 break;
1151 case_GETOPT_HELP_CHAR;
1153 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1155 default:
1156 usage (EXIT_FAILURE);
1159 prev_optc_status = optc_status;
1162 /* Process any operands after "--". */
1163 prev_optc_status = MUST_BE_OPERAND;
1164 while (optind < argc)
1165 add_file_name (argv[optind++], g_names, operand_status, joption_count,
1166 &nfiles, &prev_optc_status, &optc_status);
1168 if (nfiles != 2)
1170 if (nfiles == 0)
1171 error (0, 0, _("missing operand"));
1172 else
1173 error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1174 usage (EXIT_FAILURE);
1177 /* If "-j1" was specified and it turns out not to have had an argument,
1178 treat it as "-j 1". Likewise for -j2. */
1179 for (i = 0; i < 2; i++)
1180 if (joption_count[i] != 0)
1182 set_join_field (&join_field_1, i);
1183 set_join_field (&join_field_2, i);
1186 if (join_field_1 < 0)
1187 join_field_1 = 0;
1188 if (join_field_2 < 0)
1189 join_field_2 = 0;
1191 fp1 = STREQ (g_names[0], "-") ? stdin : fopen (g_names[0], "r");
1192 if (!fp1)
1193 error (EXIT_FAILURE, errno, "%s", quotef (g_names[0]));
1194 fp2 = STREQ (g_names[1], "-") ? stdin : fopen (g_names[1], "r");
1195 if (!fp2)
1196 error (EXIT_FAILURE, errno, "%s", quotef (g_names[1]));
1197 if (fp1 == fp2)
1198 error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
1199 join (fp1, fp2);
1201 if (fclose (fp1) != 0)
1202 error (EXIT_FAILURE, errno, "%s", quotef (g_names[0]));
1203 if (fclose (fp2) != 0)
1204 error (EXIT_FAILURE, errno, "%s", quotef (g_names[1]));
1206 if (issued_disorder_warning[0] || issued_disorder_warning[1])
1207 error (EXIT_FAILURE, 0, _("input is not in sorted order"));
1208 else
1209 return EXIT_SUCCESS;