doc: rewrite the "Unusual File Names" section
[diffutils.git] / src / cmp.c
blob26067d975d7f6615df8bb47679e708feb1dfecad
1 /* GNU cmp - compare two files byte by byte
3 Copyright (C) 1990-1996, 1998, 2001-2002, 2004, 2006-2007, 2009-2013,
4 2015-2025 Free Software Foundation, Inc.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "system.h"
20 #include "paths.h"
22 #include <binary-io.h>
23 #include <c-ctype.h>
24 #include <c-stack.h>
25 #include <cmpbuf.h>
26 #include <diagnose.h>
27 #include <error.h>
28 #include <exitfail.h>
29 #include <file-type.h>
30 #include <getopt.h>
31 #include <hard-locale.h>
32 #include <progname.h>
33 #include <quote.h>
34 #include <unlocked-io.h>
35 #include <version-etc.h>
36 #include <xalloc.h>
37 #include <xstdopen.h>
38 #include <xstrtol.h>
40 #include <stdio.h>
/* Canonical name of this program, as passed to version_etc.  It never
   carries an installation prefix such as 'g'.  */
static char const PROGRAM_NAME[] = "cmp";

/* Author list passed to version_etc.  */
#define AUTHORS \
  proper_name_lite ("Torbjorn Granlund", "Torbj\303\266rn Granlund"), \
  _("David MacKenzie")
/* Return the result of hard_locale (LC_MESSAGES) when LC_MESSAGES is
   defined and NLS is enabled; otherwise return false.  */
static bool
hard_locale_LC_MESSAGES (void)
{
  bool hard = false;
#if defined LC_MESSAGES && ENABLE_NLS
  hard = hard_locale (LC_MESSAGES);
#endif
  return hard;
}
59 static int cmp (void);
60 static off_t file_position (int);
61 static idx_t block_compare (word const *, word const *) ATTRIBUTE_PURE;
62 static idx_t count_newlines (char *, idx_t);
63 static void sprintc (char *, unsigned char);
65 /* Filenames of the compared files. */
66 static char const *file[2];
68 /* File descriptors of the files. */
69 static int file_desc[2];
71 /* Status of the files. If st_size is -1, stat_buf[i] is valid except
72 that the file size is unspecified. If st_size is -2, the rest of
73 stat_buf[i] is unspecified except that st_blksize (if it exists) is
74 a reasonable guess. */
75 static struct stat stat_buf[2];
77 /* Read buffers for the files. */
78 static word *buffer[2];
80 /* Optimal block size for the files. */
81 static idx_t buf_size;
83 /* Initial prefix to ignore for each file, or negative if the user
84 requested to ignore more than TYPE_MAXIMUM (intmax_t) bytes. */
85 static intmax_t ignore_initial[2];
87 /* Number of bytes to compare. INTMAX_MAX is effectively infinity,
88 since there's no practical way on current computers to compare so
89 many bytes. Even if cmp added SEEK_HOLE and SEEK_DATA optimization,
90 regular files can't have more than TYPE_MAXIMUM (off_t) bytes
91 and special files are unlikely to support this optimization. */
92 static intmax_t bytes = INTMAX_MAX;
/* Output format.  The enumerators are ordered so that every mode at or
   after type_no_stdout writes nothing to standard output, and the
   zero-valued type_first_diff doubles as "no format chosen yet".  */
static enum comparison_type
  {
    type_first_diff = 0,	/* Print the first difference.  */
    type_all_diffs,		/* Print all differences.  */
    type_no_stdout,		/* Do not output to stdout; only stderr.  */
    type_status			/* Exit status only.  */
  } comparison_type;
/* True if the values of differing bytes should also be printed,
   quoted the way cat -t quotes them.  */
static bool opt_print_bytes;

/* Codes for long options that have no single-letter equivalent.  They
   start just above CHAR_MAX.  */
enum
{
  HELP_OPTION = CHAR_MAX + 1
};

/* Short options; a trailing ':' marks an option that takes an argument.  */
static char const shortopts[] = "bci:ln:sv";

/* Long options and the codes getopt_long returns for them.  */
static struct option const longopts[] =
{
  {"print-bytes", no_argument, NULL, 'b'},
  {"print-chars", no_argument, NULL, 'c'}, /* obsolescent as of diffutils 2.7.3 */
  {"ignore-initial", required_argument, NULL, 'i'},
  {"verbose", no_argument, NULL, 'l'},
  {"bytes", required_argument, NULL, 'n'},
  {"silent", no_argument, NULL, 's'},
  {"quiet", no_argument, NULL, 's'},
  {"version", no_argument, NULL, 'v'},
  {"help", no_argument, NULL, HELP_OPTION},
  {NULL, 0, NULL, 0}
};

/* Suffix characters handed to xstrtoimax when parsing SKIP and LIMIT
   values.  */
static char const valid_suffixes[] = "kKMGTPEZY0";
129 /* Update ignore_initial[F] according to the result of parsing an
130 *operand ARGPTR of --ignore-initial, updating *ARGPTR to point
131 *after the operand. If DELIMITER is nonzero, the operand may be
132 *followed by DELIMITER; otherwise it must be null-terminated. */
133 static void
134 specify_ignore_initial (int f, char **argptr, char delimiter)
136 intmax_t val;
137 char const *arg = *argptr;
138 strtol_error d = xstrtoimax (arg, argptr, 0, &val, valid_suffixes);
139 strtol_error e = d & ~LONGINT_OVERFLOW;
140 if (! ((e == LONGINT_OK
141 || (e == LONGINT_INVALID_SUFFIX_CHAR && **argptr == delimiter))
142 && 0 <= val))
143 try_help ("invalid --ignore-initial value %s", quote (arg));
144 if (0 <= ignore_initial[f] && ignore_initial[f] < val)
145 ignore_initial[f] = d == e ? val : -1;
148 /* Specify the output format. */
149 static void
150 specify_comparison_type (enum comparison_type t)
152 if (comparison_type && comparison_type != t)
153 try_help ("options -l and -s are incompatible", nullptr);
154 comparison_type = t;
157 static void
158 check_stdout (void)
160 if (ferror (stdout))
161 error (EXIT_TROUBLE, 0, "%s", _("write failed"));
162 else if (fclose (stdout) != 0)
163 error (EXIT_TROUBLE, errno, "%s", _("standard output"));
166 static char const *const option_help_msgid[] = {
167 N_("-b, --print-bytes print differing bytes"),
168 N_("-i, --ignore-initial=SKIP skip first SKIP bytes of both inputs"),
169 N_("-i, --ignore-initial=SKIP1:SKIP2 skip first SKIP1 bytes of FILE1 and\n"
170 " first SKIP2 bytes of FILE2"),
171 N_("-l, --verbose output byte numbers and differing byte values"),
172 N_("-n, --bytes=LIMIT compare at most LIMIT bytes"),
173 N_("-s, --quiet, --silent suppress all normal output"),
174 N_(" --help display this help and exit"),
175 N_("-v, --version output version information and exit"),
176 nullptr
179 static void
180 usage (void)
182 printf (_("Usage: %s [OPTION]... FILE1 [FILE2 [SKIP1 [SKIP2]]]\n"),
183 squote (0, program_name));
184 puts (_("Compare two files byte by byte."));
185 printf ("\n%s\n\n",
186 _("The optional SKIP1 and SKIP2 specify the number of bytes to skip\n"
187 "at the beginning of each file (zero by default)."));
189 fputs (_("\
190 Mandatory arguments to long options are mandatory for short options too.\n\
191 "), stdout);
192 for (char const *const *p = option_help_msgid; *p; p++)
193 printf (" %s\n", _(*p));
194 printf ("\n%s\n\n%s\n%s\n",
195 _("SKIP values may be followed by the following multiplicative suffixes:\n\
196 kB 1000, K 1024, MB 1,000,000, M 1,048,576,\n\
197 GB 1,000,000,000, G 1,073,741,824, and so on for T, P, E, Z, Y."),
198 _("If a FILE is '-' or missing, read standard input."),
199 _("Exit status is 0 if inputs are the same, 1 if different, 2 if trouble."));
200 emit_bug_reporting_address ();
204 main (int argc, char **argv)
206 exit_failure = EXIT_TROUBLE;
207 initialize_main (&argc, &argv);
208 set_program_name (argv[0]);
209 setlocale (LC_ALL, "");
210 bindtextdomain (PACKAGE, LOCALEDIR);
211 textdomain (PACKAGE);
212 c_stack_action (nullptr);
213 xstdopen ();
215 /* Parse command line options. */
217 for (int c;
218 0 <= (c = getopt_long (argc, argv, shortopts, longopts, nullptr)); )
219 switch (c)
221 case 'b':
222 case 'c': /* 'c' is obsolescent as of diffutils 2.7.3 */
223 opt_print_bytes = true;
224 break;
226 case 'i':
227 specify_ignore_initial (0, &optarg, ':');
228 if (*optarg++ == ':')
229 specify_ignore_initial (1, &optarg, 0);
230 else if (ignore_initial[1] < ignore_initial[0] || ignore_initial[0] < 0)
231 ignore_initial[1] = ignore_initial[0];
232 break;
234 case 'l':
235 specify_comparison_type (type_all_diffs);
236 break;
238 case 'n':
240 intmax_t n;
241 strtol_error e = xstrtoimax (optarg, nullptr, 0, &n, valid_suffixes);
242 if ((e & ~LONGINT_OVERFLOW) != LONGINT_OK || n < 0)
243 try_help ("invalid --bytes value %s", quote (optarg));
244 bytes = MIN (bytes, n);
246 break;
248 case 's':
249 specify_comparison_type (type_status);
250 break;
252 case 'v':
253 version_etc (stdout, PROGRAM_NAME, PACKAGE_NAME, Version,
254 AUTHORS, nullptr);
255 check_stdout ();
256 return EXIT_SUCCESS;
258 case HELP_OPTION:
259 usage ();
260 check_stdout ();
261 return EXIT_SUCCESS;
263 default:
264 try_help (nullptr, nullptr);
267 if (optind == argc)
268 try_help ("missing operand after %s", quote (argv[argc - 1]));
270 file[0] = argv[optind++];
271 file[1] = optind < argc ? argv[optind++] : "-";
273 for (int f = 0; f < 2 && optind < argc; f++)
275 char *arg = argv[optind++];
276 specify_ignore_initial (f, &arg, 0);
279 if (optind < argc)
280 try_help ("extra operand %s", quote (argv[optind]));
282 for (int f = 0; f < 2; f++)
284 /* Two files with the same name and offset are identical.
285 But wait until we open the file once, for proper diagnostics. */
286 if (f && 0 <= ignore_initial[0] && ignore_initial[0] == ignore_initial[1]
287 && file_name_cmp (file[0], file[1]) == 0)
288 return EXIT_SUCCESS;
290 if (STREQ (file[f], "-"))
292 file_desc[f] = STDIN_FILENO;
293 if (O_BINARY && ! isatty (STDIN_FILENO))
294 set_binary_mode (STDIN_FILENO, O_BINARY);
296 else
298 file_desc[f] = open (file[f], O_RDONLY | O_BINARY | O_CLOEXEC);
300 if (file_desc[f] < 0)
302 if (comparison_type != type_status)
303 error (0, errno, "%s", squote (0, file[f]));
304 exit (EXIT_TROUBLE);
308 if (fstat (file_desc[f], stat_buf + f) < 0)
310 stat_buf[f].st_size = -2;
311 #if HAVE_STRUCT_STAT_ST_BLKSIZE
312 stat_buf[f].st_blksize = 8 * 1024;
313 #endif
315 else
316 stat_buf[f].st_size = stat_size (&stat_buf[f]);
319 /* If the files are the same and have the same file position,
320 the contents are identical. */
322 if (-1 <= stat_buf[0].st_size && -1 <= stat_buf[1].st_size
323 && same_file (&stat_buf[0], &stat_buf[1])
324 && file_position (0) == file_position (1))
325 return EXIT_SUCCESS;
327 /* If output is redirected to the null device, we can avoid some of
328 the work. */
330 if (comparison_type != type_status)
332 struct stat outstat, nullstat;
334 if (fstat (STDOUT_FILENO, &outstat) == 0
335 && S_ISCHR (outstat.st_mode)
336 && stat (NULL_DEVICE, &nullstat) == 0
337 && same_file (&outstat, &nullstat))
338 comparison_type = type_no_stdout;
341 /* If no output is needed,
342 and both input descriptors are associated with plain files,
343 and the file sizes are nonzero so they are not Linux /proc files,
344 conclude that the files differ if they have different sizes
345 and if more bytes will be compared than are in the smaller file. */
347 if (type_no_stdout <= comparison_type
348 && 0 <= stat_buf[0].st_size && S_ISREG (stat_buf[0].st_mode)
349 && 0 <= stat_buf[1].st_size && S_ISREG (stat_buf[1].st_mode))
351 off_t pos0 = file_position (0);
352 if (0 <= pos0)
354 off_t pos1 = file_position (1);
355 if (0 <= pos1)
357 off_t s0 = stat_buf[0].st_size - pos0;
358 off_t s1 = stat_buf[1].st_size - pos1;
359 if (s0 < 0)
360 s0 = 0;
361 if (s1 < 0)
362 s1 = 0;
363 if (s0 != s1 && MIN (s0, s1) < bytes)
364 exit (EXIT_FAILURE);
369 /* Guess a good block size for the files. */
371 idx_t blksize[2];
372 for (int f = 0; f < 2; f++)
373 if (ST_BLKSIZE (stat_buf[0]) < 0
374 || ckd_add (&blksize[f], ST_BLKSIZE (stat_buf[0]), 0))
375 blksize[f] = 0;
376 buf_size = buffer_lcm (blksize[0], blksize[1], IDX_MAX - sizeof (word));
378 /* Allocate word-aligned buffers, with space for sentinels at the end. */
380 idx_t words_per_buffer = (buf_size + 2 * sizeof (word) - 1) / sizeof (word);
381 buffer[0] = xinmalloc (words_per_buffer, 2 * sizeof (word));
382 buffer[1] = buffer[0] + words_per_buffer;
384 int exit_status = cmp ();
386 for (int f = 0; f < 2; f++)
387 if (close (file_desc[f]) != 0)
388 error (EXIT_TROUBLE, errno, "%s", squote (0, file[f]));
389 if (exit_status != EXIT_SUCCESS && comparison_type < type_no_stdout)
390 check_stdout ();
391 exit (exit_status);
394 /* Compare the two files already open on 'file_desc[0]' and 'file_desc[1]',
395 using 'buffer[0]' and 'buffer[1]'.
396 Return EXIT_SUCCESS if identical, EXIT_FAILURE if different,
397 >1 if error. */
399 static int
400 cmp (void)
402 word *buffer0 = buffer[0];
403 word *buffer1 = buffer[1];
404 char *buf0 = (char *) buffer0;
405 char *buf1 = (char *) buffer1;
407 /* For -l, the print width of the offset, a positive number.
408 Otherwise, the negative of the comparison type.
409 This portmanteauization pacifies gcc -Wmaybe-uninitialized. */
410 int offset_width;
412 if (comparison_type == type_all_diffs)
414 intmax_t byte_number_max = bytes;
416 for (int f = 0; f < 2; f++)
417 if (0 <= stat_buf[f].st_size && S_ISREG (stat_buf[f].st_mode))
419 off_t pos = file_position (f);
420 if (0 <= pos)
421 byte_number_max = MIN (byte_number_max,
422 MAX (0, stat_buf[f].st_size - pos));
425 for (offset_width = 1; (byte_number_max /= 10) != 0; offset_width++)
426 continue;
428 else
429 offset_width = -comparison_type;
431 bool eof[2] = { false, false };
433 for (int f = 0; f < 2; f++)
435 intmax_t ig = ignore_initial[f];
436 if (ig == 0)
437 continue;
439 if (0 <= file_position (f))
440 continue; /* lseek sufficed. */
442 if (! (0 <= ig && ig < TYPE_MAXIMUM (off_t))
443 && -1 <= stat_buf[f].st_size && S_ISREG (stat_buf[f].st_mode))
445 /* When ignoring at least TYPE_MAXIMUM (off_t) bytes
446 of a regular file, pretend to be at end of file,
447 as lseeking to TYPE_MAXIMUM (off_t) might tickle a kernel bug,
448 and lseeking to file end would race with a growing file. */
449 eof[f] = true;
451 else if (ig < 0)
453 /* Report an error if asked to ignore more than
454 INTMAX_MAX bytes of a non-regular file,
455 as the actual number of bytes to ignore is not known. */
456 error (EXIT_TROUBLE, EOVERFLOW, "%s", squote (0, file[f]));
458 else
460 /* Read and discard the ignored initial prefix. */
463 idx_t bytes_to_read = MIN (ig, buf_size);
464 ptrdiff_t r = block_read (file_desc[f], buf0, bytes_to_read);
465 if (r != bytes_to_read)
467 if (r < 0)
468 error (EXIT_TROUBLE, errno, "%s", squote (0, file[f]));
469 break;
471 ig -= r;
473 while (0 < ig);
477 bool at_line_start = true;
478 intmax_t line_number = 1; /* Line number (1...) of difference. */
479 intmax_t byte_number = 1; /* Byte number (1...) of difference. */
480 intmax_t remaining = bytes; /* Remaining bytes to compare, or -1. */
482 while (true)
484 idx_t bytes_to_read = MIN (buf_size, remaining);
485 remaining -= bytes_to_read;
487 ptrdiff_t read0 = (eof[0] ? 0
488 : block_read (file_desc[0], buf0, bytes_to_read));
489 if (read0 < 0)
490 error (EXIT_TROUBLE, errno, "%s", squote (0, file[0]));
491 ptrdiff_t read1 = (eof[1] ? 0
492 : block_read (file_desc[1], buf1, bytes_to_read));
493 if (read1 < 0)
494 error (EXIT_TROUBLE, errno, "%s", squote (0, file[1]));
496 idx_t smaller = MIN (read0, read1);
498 idx_t first_diff; /* Offset (0...) in buffers of 1st diff. */
500 /* Optimize the common case where the buffers are the same. */
501 if (memcmp (buf0, buf1, smaller) == 0)
502 first_diff = smaller;
503 else
505 /* Insert sentinels for the block compare. */
506 if (read0 >= read1)
507 buf1[read0] = 0x55; /* arbitrary */
508 if (read1 >= read0)
509 buf0[read1] = 0x79; /* arbitrary and distinct from the above */
510 buf0[read0] = ~buf1[read0];
511 buf1[read1] = ~buf0[read1];
512 /* Ensure all bytes of a final word-read are initialized. */
513 memset (buf0 + read0 + 1, 0,
514 sizeof (word) - read0 % sizeof (word) - 1);
515 memset (buf1 + read1 + 1, 0,
516 sizeof (word) - read1 % sizeof (word) - 1);
518 first_diff = block_compare (buffer0, buffer1);
521 byte_number += first_diff;
522 if (offset_width == -type_first_diff && first_diff != 0)
524 line_number += count_newlines (buf0, first_diff);
525 at_line_start = buf0[first_diff - 1] == '\n';
528 int differing = 0;
530 if (first_diff < smaller)
532 switch (offset_width)
534 case -type_first_diff:
536 if (!opt_print_bytes)
538 /* See POSIX for this format. This message is
539 used only in the POSIX locale, so it need not
540 be translated. */
541 static char const char_message[] =
542 "%s %s differ: char %"PRIdMAX", line %"PRIdMAX"\n";
544 /* The POSIX rationale recommends using the word
545 "byte" outside the POSIX locale. Some gettext
546 implementations translate even in the POSIX
547 locale if certain other environment variables
548 are set, so use "byte" if a translation is
549 available, or if outside the POSIX locale. */
550 static char const byte_msgid[] =
551 N_("%s %s differ: byte %"PRIdMAX", line %"PRIdMAX"\n");
552 char const *byte_message = _(byte_msgid);
553 bool use_byte_message = (byte_message != byte_msgid
554 || hard_locale_LC_MESSAGES ());
556 printf (use_byte_message ? byte_message : char_message,
557 file[0], file[1], byte_number, line_number);
559 else
561 unsigned char c0 = buf0[first_diff];
562 unsigned char c1 = buf1[first_diff];
563 char s0[5];
564 char s1[5];
565 sprintc (s0, c0);
566 sprintc (s1, c1);
567 printf (_("%s %s differ: byte %"PRIdMAX", line %"PRIdMAX
568 " is %3o %s %3o %s\n"),
569 file[0], file[1], byte_number, line_number,
570 c0, s0, c1, s1);
573 FALLTHROUGH;
574 case -type_status:
575 return EXIT_FAILURE;
577 default:
578 dassert (comparison_type == type_all_diffs);
582 unsigned char c0 = buf0[first_diff];
583 unsigned char c1 = buf1[first_diff];
584 if (c0 != c1)
586 if (!opt_print_bytes)
588 /* See POSIX for this format. */
589 printf ("%*"PRIdMAX" %3o %3o\n",
590 offset_width, byte_number, c0, c1);
592 else
594 char s0[5];
595 char s1[5];
596 sprintc (s0, c0);
597 sprintc (s1, c1);
598 printf ("%*"PRIdMAX" %3o %-4s %3o %s\n",
599 offset_width, byte_number, c0, s0, c1, s1);
602 byte_number++;
603 first_diff++;
605 while (first_diff < smaller);
607 differing = -1;
608 break;
610 case -type_no_stdout:
611 differing = 1;
612 break;
616 if (read0 != read1)
618 /* POSIX says that each of these format strings must be
619 "cmp: EOF on %s", optionally followed by a blank and
620 extra text sans newline, then terminated by "\n". */
621 if (differing <= 0 && offset_width != -type_status)
622 fprintf (stderr,
623 _(byte_number == 1
624 ? N_("cmp: EOF on %s which is empty\n")
625 : offset_width != -type_first_diff
626 ? N_("cmp: EOF on %s after byte %"PRIdMAX"\n")
627 : at_line_start
628 ? N_("cmp: EOF on %s after byte %"PRIdMAX","
629 " line %"PRIdMAX"\n")
630 : N_("cmp: EOF on %s after byte %"PRIdMAX","
631 " in line %"PRIdMAX"\n")),
632 quote (file[read1 < read0]),
633 byte_number - 1, line_number - at_line_start);
634 return EXIT_FAILURE;
637 if (0 < differing || read0 != buf_size)
638 return differing == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
642 /* Compare two blocks of memory P0 and P1 until they differ.
643 If the blocks are not guaranteed to be different, put sentinels at the ends
644 of the blocks before calling this function.
646 Return the offset of the first byte that differs. */
648 static idx_t
649 block_compare (word const *p0, word const *p1)
651 word const *l0, *l1;
652 char const *c0, *c1;
654 /* Find the rough position of the first difference by reading words,
655 not bytes. */
657 for (l0 = p0, l1 = p1; *l0 == *l1; l0++, l1++)
658 continue;
660 /* Find the exact differing position (endianness independent). */
662 for (c0 = (char const *) l0, c1 = (char const *) l1;
663 *c0 == *c1;
664 c0++, c1++)
665 continue;
667 return c0 - (char const *) p0;
670 /* Return the number of newlines in BUF, of size BUFSIZE,
671 where BUF[NBYTES] is available for use as a sentinel. */
673 static idx_t
674 count_newlines (char *buf, idx_t bufsize)
676 idx_t count = 0;
677 char *lim = buf + bufsize;
678 char ch = *lim;
679 *lim = '\n';
680 for (char *p = buf; (p = rawmemchr (p, '\n')) != lim; p++)
681 count++;
682 *lim = ch;
683 return count;
/* Store in BUF a null-terminated rendering of the byte C, making
   unprintable bytes visible by quoting like cat -t does: a byte of 128
   or more gets an "M-" prefix and is reduced by 128, and a control
   character is then shown as '^' followed by a printable character.
   The longest result is four characters plus the terminator.  */

static void
sprintc (char *buf, unsigned char c)
{
  char *out = buf;

  if (! c_isprint (c))
    {
      if (128 <= c)
        {
          out[0] = 'M';
          out[1] = '-';
          out += 2;
          c -= 128;
        }

      if (c == 127)
        {
          *out++ = '^';
          c = '?';
        }
      else if (c < 32)
        {
          *out++ = '^';
          c += 64;
        }
    }

  out[0] = c;
  out[1] = 0;
}
716 /* Position file F to ignore_initial[F] bytes from its initial position,
717 and yield its new position. Return a negative number on failure.
718 Do not report an error on failure, as lseek is generally a no-op
719 on devices that cannot seek. Don't try more than once. */
721 static off_t
722 file_position (int f)
724 /* The initial position of input file F, and whether that position has
725 been determined. The position is -1 if it could not be determined. */
726 static bool positioned[2];
727 static off_t position[2];
729 if (! positioned[f])
731 positioned[f] = true;
732 off_t pos = ignore_initial[f];
733 position[f] = (0 <= pos && pos <= TYPE_MAXIMUM (off_t)
734 ? lseek (file_desc[f], pos, SEEK_CUR)
735 : -1);
737 return position[f];