doc: rewrite the "Unusual File Names" section
[diffutils.git] / src / io.c
bloba62c52970dca3c8899e9b99e45d50963d0aed6c1
1 /* File I/O for GNU DIFF.
3 Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2013,
4 2015-2025 Free Software Foundation, Inc.
6 This file is part of GNU DIFF.
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include "diff.h"
22 #include <binary-io.h>
23 #include <cmpbuf.h>
24 #include <file-type.h>
25 #include <ialloc.h>
26 #include <mcel.h>
27 #include <xalloc.h>
29 #include <ctype.h>
30 #include <uchar.h>
32 /* The type of a hash value. */
33 typedef size_t hash_value;
34 enum { HASH_VALUE_WIDTH = SIZE_WIDTH };
35 static_assert (! TYPE_SIGNED (hash_value));
37 /* Rotate a hash value to the left. */
38 static hash_value
39 rol (hash_value v, int n)
41 return v << n | v >> (HASH_VALUE_WIDTH - n);
44 /* Given a hash value and a new character, return a new hash value. */
45 static hash_value
46 hash (hash_value h, hash_value c)
48 return rol (h, 7) + c;
51 /* Lines are put into equivalence classes of lines that match in lines_differ.
52 Each equivalence class is represented by one of these structures,
53 but only while the classes are being computed.
54 Afterward, each class is represented by a number. */
55 struct equivclass
57 lin next; /* Next item in this bucket. */
58 hash_value hash; /* Hash of lines in this class. */
59 char const *line; /* A line that fits this class. */
60 idx_t length; /* That line's length, counting its newline. */
63 /* Hash-table: array of buckets, each being a chain of equivalence classes.
64 buckets[-1] is reserved for incomplete lines. */
65 static lin *buckets;
67 /* Number of buckets in the hash table array, not counting buckets[-1]. */
68 static idx_t nbuckets;
70 /* Array in which the equivalence classes are allocated.
71 The bucket-chains go through the elements in this array.
72 The number of an equivalence class is its index in this array. */
73 static struct equivclass *equivs;
75 /* Index of first free element in the array 'equivs'. */
76 static lin equivs_index;
78 /* Number of elements allocated in the array 'equivs'. */
79 static idx_t equivs_alloc;
81 /* The file buffer, considered as an array of bytes rather than
82 as an array of words. */
84 static char *
85 file_buffer (struct file_data const *f)
87 return (char *) f->buffer;
90 /* Read a block of data into a file buffer, checking for EOF and error. */
92 void
93 file_block_read (struct file_data *current, idx_t size)
95 if (size && ! current->eof)
97 ptrdiff_t s = block_read (current->desc,
98 file_buffer (current) + current->buffered,
99 size);
100 if (s < 0)
101 pfatal_with_name (current->name);
102 current->buffered += s;
103 current->eof = s < size;
107 /* Check for binary files and compare them for exact identity. */
109 /* Return true if BUF contains a non text character.
110 SIZE is the number of characters in BUF. */
112 #define binary_file_p(buf, size) (!! memchr (buf, 0, size))
114 /* Get ready to read the current file.
115 Return nonzero if SKIP_TEST is zero,
116 and if it appears to be a binary file. */
118 static bool
119 sip (struct file_data *current, bool skip_test)
121 /* If we have a nonexistent file at this stage, treat it as empty. */
122 if (current->desc < 0)
124 /* Leave room for a sentinel. */
125 current->bufsize = sizeof (word);
126 current->buffer = ximalloc (current->bufsize);
128 else
130 idx_t blksize;
131 if (ST_BLKSIZE (current->stat) < 0
132 || ckd_add (&blksize, ST_BLKSIZE (current->stat), 0))
133 blksize = 0;
134 current->bufsize = buffer_lcm (sizeof (word), blksize, IDX_MAX);
135 current->buffer = ximalloc (current->bufsize);
137 #ifdef __KLIBC__
138 /* Skip test if seek is not possible */
139 skip_test = skip_test
140 || (lseek (current->desc, 0, SEEK_CUR) < 0
141 && errno == ESPIPE);
142 #endif
144 if (! skip_test)
146 /* Check first part of file to see if it's a binary file. */
148 int prev_mode = set_binary_mode (current->desc, O_BINARY);
149 file_block_read (current, current->bufsize);
150 off_t buffered = current->buffered;
152 if (prev_mode != O_BINARY)
154 /* Revert to text mode and seek back to the start to reread
155 the file. Use relative seek, since file descriptors
156 like stdin might not start at offset zero. */
157 if (lseek (current->desc, - buffered, SEEK_CUR) < 0)
158 pfatal_with_name (current->name);
159 set_binary_mode (current->desc, prev_mode);
160 current->buffered = 0;
161 current->eof = false;
164 return binary_file_p (current->buffer, buffered);
168 current->buffered = 0;
169 current->eof = false;
170 return false;
173 /* Slurp the rest of the current file completely into memory. */
175 static void
176 slurp (struct file_data *current)
178 if (current->desc < 0)
180 /* The file is nonexistent. */
181 return;
184 /* Upper bound on the room needed for an appended newline, word
185 sentinel, and worst-case word alignment. */
186 enum { extra_room = 2 * sizeof (word) };
188 if (S_ISREG (current->stat.st_mode))
190 /* It's a regular file; try to allocate a big enough buffer so
191 that it can be slurped in all at once, along with appended
192 newline plus word sentinel plus word-alignment. */
193 off_t file_size = current->stat.st_size;
194 idx_t cc;
195 if (0 <= file_size
196 && !ckd_add (&cc, file_size - file_size % sizeof (word), extra_room)
197 && current->bufsize < cc)
199 #if __GNUC__ == 13 && __GNUC_MINOR__ < 3
200 /* Work around GCC bug 110014. */
201 #pragma GCC diagnostic push
202 #pragma GCC diagnostic ignored "-Wanalyzer-allocation-size"
203 #endif
205 word *buffer = irealloc (current->buffer, cc);
207 if (buffer)
209 current->buffer = buffer;
210 current->bufsize = cc;
213 #if __GNUC__ == 13
214 #pragma GCC diagnostic pop
215 #endif
219 /* Read the file, growing the buffer as needed. */
221 while (file_block_read (current, current->bufsize - current->buffered),
222 !current->eof)
223 current->buffer = xpalloc (current->buffer, &current->bufsize,
224 extra_room, -1, 1);
226 if (current->bufsize - current->buffered < extra_room)
228 if (ckd_add (&current->bufsize, current->buffered, extra_room))
229 xalloc_die ();
230 current->buffer = xirealloc (current->buffer, current->bufsize);
234 /* Return true if CH1 and ERR1 stand for the same character or
235 encoding error as CH2 and ERR2. */
236 static bool
237 same_ch_err (char32_t ch1, unsigned char err1, char32_t ch2, unsigned char err2)
239 return ! ((ch1 ^ ch2) | (err1 ^ err2));
242 /* Compare lines S1 of length S1LEN and S2 of length S2LEN (typically
243 one line from each input file) according to the command line options.
244 Line lengths include the trailing newline.
245 For efficiency, this is invoked only when the lines do not match exactly
246 but an option like -i might cause us to ignore the difference.
247 Return nonzero if the lines differ.
248 Line lengths do not include the trailing newline. */
250 static bool
251 lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
253 char const *t1 = s1;
254 char const *t2 = s2;
255 intmax_t tab = 0, column = 0;
257 if (MB_CUR_MAX == 1)
258 while (true)
260 unsigned char c1 = *t1++;
261 unsigned char c2 = *t2++;
263 /* Test for exact char equality first, since it's a common case. */
264 if (c1 != c2)
266 switch (ignore_white_space)
268 case IGNORE_ALL_SPACE:
269 /* For -w, just skip past any white space. */
270 while (isspace (c1) && c1 != '\n') c1 = *t1++;
271 while (isspace (c2) && c2 != '\n') c2 = *t2++;
272 break;
274 case IGNORE_SPACE_CHANGE:
275 /* For -b, advance past any sequence of white space in
276 line 1 and consider it just one space, or nothing at
277 all if it is at the end of the line. */
278 if (isspace (c1))
279 while (c1 != '\n')
281 c1 = *t1++;
282 if (! isspace (c1))
284 --t1;
285 c1 = ' ';
286 break;
290 /* Likewise for line 2. */
291 if (isspace (c2))
292 while (c2 != '\n')
294 c2 = *t2++;
295 if (! isspace (c2))
297 --t2;
298 c2 = ' ';
299 break;
303 if (c1 != c2)
305 /* If we went too far when doing the simple test
306 for equality, go back to the first non-white-space
307 character in both sides and try again. */
308 if (c2 == ' ' && c1 != '\n'
309 && s1 + 1 < t1
310 && isspace ((unsigned char) t1[-2]))
312 --t1;
313 continue;
315 if (c1 == ' ' && c2 != '\n'
316 && s2 + 1 < t2
317 && isspace ((unsigned char) t2[-2]))
319 --t2;
320 continue;
324 break;
326 case IGNORE_TRAILING_SPACE:
327 case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
328 if (isspace (c1) && isspace (c2))
330 if (c1 != '\n')
332 char const *p = t1;
333 unsigned char c;
334 while ((c = *p) != '\n' && isspace (c))
335 ++p;
336 if (c != '\n')
337 break;
339 if (c2 != '\n')
341 char const *p = t2;
342 unsigned char c;
343 while ((c = *p) != '\n' && isspace (c))
344 ++p;
345 if (c != '\n')
346 break;
348 /* Both lines have nothing but whitespace left. */
349 return false;
351 if (ignore_white_space == IGNORE_TRAILING_SPACE)
352 break;
353 FALLTHROUGH;
354 case IGNORE_TAB_EXPANSION:
355 if ((c1 == ' ' && c2 == '\t')
356 || (c1 == '\t' && c2 == ' '))
358 intmax_t tab2 = tab, column2 = column;
359 for (;; c1 = *t1++)
361 if (c1 == '\t' || (c1 == ' ' && column == tabsize - 1))
363 tab++;
364 column = 0;
366 else if (c1 == ' ')
367 column++;
368 else
369 break;
371 for (;; c2 = *t2++)
373 if (c2 == '\t' || (c2 == ' ' && column2 == tabsize - 1))
375 tab2++;
376 column2 = 0;
378 else if (c2 == ' ')
379 column2++;
380 else
381 break;
383 if (tab != tab2 || column != column2)
384 return true;
386 break;
388 case IGNORE_NO_WHITE_SPACE:
389 break;
392 if (ignore_case)
394 c1 = tolower (c1);
395 c2 = tolower (c2);
398 if (c1 != c2)
399 break;
402 switch (c1)
404 case '\n':
405 return false;
407 case '\r':
408 tab = column = 0;
409 break;
411 case '\b':
412 if (0 < column)
413 column--;
414 else if (0 < tab)
416 tab--;
417 column = tabsize - 1;
419 break;
421 case '\0': case '\a': case '\f': case '\v':
422 break;
424 default:
425 column += !! isprint (c1);
426 if (column < tabsize)
427 break;
428 FALLTHROUGH;
429 case '\t':
430 tab++;
431 column = 0;
432 break;
435 else
437 char const *lim1 = s1 + s1len;
438 char const *lim2 = s2 + s2len;
439 char32_t ch1prev = 0;
441 while (true)
443 mcel_t g1 = mcel_scan (t1, lim1);
444 mcel_t g2 = mcel_scan (t2, lim2);
445 t1 += g1.len;
446 t2 += g2.len;
447 char32_t ch1 = g1.ch;
448 char32_t ch2 = g2.ch;
450 /* Test for exact equality first, since it's a common case. */
451 if (! same_ch_err (ch1, g1.err, ch2, g2.err))
453 switch (ignore_white_space)
455 case IGNORE_ALL_SPACE:
456 /* For -w, just skip past any white space. */
457 while (ch1 != '\n' && c32isspace (ch1))
459 g1 = mcel_scan (t1, lim1);
460 t1 += g1.len;
461 ch1 = g1.ch;
463 while (ch2 != '\n' && c32isspace (ch2))
465 g2 = mcel_scan (t2, lim2);
466 t2 += g2.len;
467 ch2 = g2.ch;
469 break;
471 case IGNORE_SPACE_CHANGE:
472 /* For -b, advance past any sequence of white space in
473 line 1 and consider it just one space, or nothing at
474 all if it is at the end of the line. */
475 if (c32isspace (ch1))
476 while (ch1 != '\n')
478 g1 = mcel_scan (t1, lim1);
479 t1 += g1.len;
480 ch1 = g1.ch;
481 if (! c32isspace (ch1))
483 t1 -= g1.len;
484 ch1 = ' ';
485 break;
489 /* Likewise for line 2. */
490 if (c32isspace (ch2))
491 while (ch2 != '\n')
493 g2 = mcel_scan (t2, lim2);
494 t2 += g2.len;
495 ch2 = g2.ch;
496 if (! c32isspace (ch2))
498 t2 -= g2.len;
499 ch2 = ' ';
500 break;
504 if (ch1 != ch2)
506 /* If we went too far when doing the simple test
507 for equality, go back to the first non-white-space
508 character in both sides and try again. */
509 if (ch2 == ' ' && ch1 != '\n' && c32isspace (ch1prev))
511 t1 -= g1.len;
512 continue;
514 if (ch1 == ' ' && ch2 != '\n' && c32isspace (ch1prev))
516 t2 -= g2.len;
517 continue;
521 break;
523 case IGNORE_TRAILING_SPACE:
524 case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
525 if (c32isspace (ch1) && c32isspace (ch2))
527 if (ch1 != '\n')
529 char const *p = t1;
530 while (*p != '\n')
532 mcel_t g = mcel_scan (p, lim1);
533 if (c32isspace (g.ch))
534 break;
535 p += g.len;
537 if (*p != '\n')
538 break;
540 if (ch2 != '\n')
542 char const *p = t2;
543 while (*p != '\n')
545 mcel_t g = mcel_scan (p, lim2);
546 if (! c32isspace (g.ch))
547 break;
548 p += g.len;
550 if (*p != '\n')
551 break;
553 /* Both lines have nothing but whitespace left. */
554 return false;
556 if (ignore_white_space == IGNORE_TRAILING_SPACE)
557 break;
558 FALLTHROUGH;
559 case IGNORE_TAB_EXPANSION:
560 if ((ch1 == ' ' && ch2 == '\t')
561 || (ch1 == '\t' && ch2 == ' '))
563 intmax_t tab2 = tab, column2 = column;
565 while (true)
567 if (ch1 == '\t'
568 || (ch1 == ' ' && column == tabsize - 1))
570 tab++;
571 column = 0;
573 else if (ch1 == ' ')
574 column++;
575 else
576 break;
578 g1 = mcel_scan (t1, lim1);
579 t1 += g1.len;
580 ch1 = g1.ch;
583 while (true)
585 if (ch2 == '\t'
586 || (ch2 == ' ' && column2 == tabsize - 1))
588 tab2++;
589 column2 = 0;
591 else if (ch2 == ' ')
592 column2++;
593 else
594 break;
596 g2 = mcel_scan (t2, lim2);
597 t2 += g2.len;
598 ch2 = g2.ch;
601 if (tab != tab2 || column != column2)
602 return true;
604 break;
606 case IGNORE_NO_WHITE_SPACE:
607 break;
610 if (ignore_case)
612 ch1 = c32tolower (ch1);
613 ch2 = c32tolower (ch2);
616 if (! same_ch_err (ch1, g1.err, ch2, g2.err))
617 break;
620 switch (ch1)
622 case '\n':
623 return false;
625 case '\r':
626 tab = column = 0;
627 break;
629 case '\b':
630 if (0 < column)
631 column--;
632 else if (0 < tab)
634 tab--;
635 column = tabsize - 1;
637 break;
639 case '\a': case '\f': case '\v':
640 break;
642 default:
643 /* Assume that downcasing does not change print width. */
644 column += g1.err ? 1 : c32width (ch1);
645 if (column < tabsize)
646 break;
647 FALLTHROUGH;
648 case '\t':
649 tab++;
650 column = 0;
651 break;
654 ch1prev = ch1;
658 return true;
661 /* Split the file into lines, simultaneously computing the equivalence
662 class for each line. If two lines hash differently, lines_differ
663 must return false. */
665 static void
666 find_and_hash_each_line (struct file_data *current)
668 char const *p = current->prefix_end;
670 /* Cache often-used quantities in local variables to help the compiler. */
671 char const **linbuf = current->linbuf;
672 lin alloc_lines = current->alloc_lines;
673 lin line = 0;
674 lin linbuf_base = current->linbuf_base;
675 lin *cureqs = xinmalloc (alloc_lines, sizeof *cureqs);
676 struct equivclass *eqs = equivs;
677 lin eqs_index = equivs_index;
678 idx_t eqs_alloc = equivs_alloc;
679 char const *suffix_begin = current->suffix_begin;
680 char const *bufend = file_buffer (current) + current->buffered;
681 bool ig_case = ignore_case;
682 enum DIFF_white_space ig_white_space = ignore_white_space;
683 bool unibyte = MB_CUR_MAX == 1;
684 bool diff_length_compare_anyway =
685 (ig_white_space != IGNORE_NO_WHITE_SPACE) | (!unibyte & ig_case);
686 bool same_length_diff_contents_compare_anyway =
687 diff_length_compare_anyway | ig_case;
689 while (p < suffix_begin)
691 char const *ip = p;
692 hash_value h = 0;
694 /* Hash this line until we find a newline. */
695 switch (ig_white_space)
697 case IGNORE_ALL_SPACE:
698 if (unibyte)
699 for (unsigned char c; (c = *p) != '\n'; p++)
701 if (! isspace (c))
702 h = hash (h, ig_case ? tolower (c) : c);
704 else
705 for (mcel_t g; *p != '\n'; p += g.len)
707 g = mcel_scan (p, suffix_begin);
708 if (! c32isspace (g.ch))
709 h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
711 break;
713 case IGNORE_SPACE_CHANGE:
714 if (unibyte)
715 for (unsigned char c; (c = *p) != '\n'; p++)
717 if (isspace (c))
721 c = *++p;
722 if (c == '\n')
723 goto hashing_done;
725 while (isspace (c));
727 h = hash (h, ' ');
730 /* C is now the first non-space. */
731 h = hash (h, ig_case ? tolower (c) : c);
733 else
734 for (mcel_t g; *p != '\n'; p += g.len)
736 g = mcel_scan (p, suffix_begin);
737 if (c32isspace (g.ch))
741 p += g.len;
742 if (*p == '\n')
743 goto hashing_done;
744 g = mcel_scan (p, suffix_begin);
746 while (c32isspace (g.ch));
748 h = hash (h, ' ');
751 /* G is now the first non-space. */
752 h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
754 break;
756 case IGNORE_TAB_EXPANSION:
757 case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
758 case IGNORE_TRAILING_SPACE:
760 intmax_t tab = 0, column = 0;
761 if (unibyte)
762 for (unsigned char c; (c = *p) != '\n'; p++)
764 intmax_t repetitions = 1;
766 if (ig_white_space & IGNORE_TRAILING_SPACE
767 && isspace (c))
769 char const *p1 = p;
770 unsigned char c1;
773 c1 = *++p1;
774 if (c1 == '\n')
776 p = p1;
777 goto hashing_done;
780 while (isspace (c1));
783 if (ig_white_space & IGNORE_TAB_EXPANSION)
784 switch (c)
786 case '\b':
787 if (0 < column)
788 column--;
789 else if (0 < tab)
791 tab--;
792 column = tabsize - 1;
794 break;
796 case '\t':
797 c = ' ';
798 repetitions = tabsize - column % tabsize;
799 tab += column / tabsize + 1;
800 column = 0;
801 break;
803 case '\r':
804 tab = column = 0;
805 break;
807 case '\0': case '\a': case '\f': case '\v':
808 break;
810 default:
811 column++;
812 break;
815 if (ig_case)
816 c = tolower (c);
819 h = hash (h, c);
820 while (--repetitions != 0);
822 else
823 for (mcel_t g; *p != '\n'; p += g.len)
825 intmax_t repetitions = 1;
827 g = mcel_scan (p, suffix_begin);
828 char32_t ch;
829 if (g.err)
831 ch = -g.err;
832 column++;
834 else
836 ch = g.ch;
837 if (ig_white_space & IGNORE_TRAILING_SPACE
838 && c32isspace (ch))
840 char const *p1 = p + g.len;
841 for (mcel_t g1; ; p1 += g1.len)
843 if (*p1 == '\n')
845 p = p1;
846 goto hashing_done;
848 g1 = mcel_scan (p1, suffix_begin);
849 if (! c32isspace (g1.ch))
850 break;
854 if (ig_white_space & IGNORE_TAB_EXPANSION)
855 switch (ch)
857 case '\b':
858 if (0 < column)
859 column--;
860 else if (0 < tab)
862 tab--;
863 column = tabsize - 1;
865 break;
867 case '\t':
868 ch = ' ';
869 repetitions = tabsize - column % tabsize;
870 tab += column / tabsize + 1;
871 column = 0;
872 break;
874 case '\r':
875 tab = column = 0;
876 break;
878 case '\0': case '\a': case '\f': case '\v':
879 break;
881 default:
882 column += c32width (ch);
883 break;
886 if (ig_case)
887 ch = c32tolower (ch);
891 h = hash (h, ch);
892 while (--repetitions != 0);
895 break;
897 default:
898 if (unibyte)
900 if (ig_case)
901 for (unsigned char c; (c = *p) != '\n'; p++)
902 h = hash (h, tolower (c));
903 else
904 for (unsigned char c; (c = *p) != '\n'; p++)
905 h = hash (h, c);
907 else
909 if (ig_case)
910 for (mcel_t g; *p != '\n'; p += g.len)
912 g = mcel_scan (p, suffix_begin);
913 h = hash (h, c32tolower (g.ch) - g.err);
915 else
916 for (mcel_t g; *p != '\n'; p += g.len)
918 g = mcel_scan (p, suffix_begin);
919 h = hash (h, g.ch - g.err);
922 break;
925 hashing_done:;
927 lin *bucket = &buckets[h % nbuckets];
929 /* Advance past the line's trailing newline. */
930 p++;
931 idx_t length = p - ip;
933 if (p == bufend
934 && current->missing_newline
935 && robust_output_style (output_style))
937 /* The last line is incomplete and we do not silently
938 complete lines. If the line cannot compare equal to any
939 complete line, put it into buckets[-1] so that it can
940 compare equal only to the other file's incomplete line
941 (if one exists). */
942 if (ig_white_space < IGNORE_TRAILING_SPACE)
943 bucket = &buckets[-1];
946 lin i;
947 for (i = *bucket; ; i = eqs[i].next)
948 if (!i)
950 /* Create a new equivalence class in this bucket. */
951 i = eqs_index++;
952 if (i == eqs_alloc)
953 eqs = xpalloc (eqs, &eqs_alloc, 1, -1, sizeof *eqs);
954 eqs[i].next = *bucket;
955 eqs[i].hash = h;
956 eqs[i].line = ip;
957 eqs[i].length = length;
958 *bucket = i;
959 break;
961 else if (eqs[i].hash == h)
963 char const *eqline = eqs[i].line;
964 idx_t eqlinelen = eqs[i].length;
966 /* Reuse existing class if lines_differ reports the lines
967 equal. */
968 if (eqlinelen == length)
970 /* Reuse existing equivalence class if the lines are identical.
971 This detects the common case of exact identity
972 faster than lines_differ would. */
973 if (memcmp (eqline, ip, length - 1) == 0)
974 break;
975 if (!same_length_diff_contents_compare_anyway)
976 continue;
978 else if (!diff_length_compare_anyway)
979 continue;
981 if (! lines_differ (eqline, eqlinelen, ip, length))
982 break;
985 /* Maybe increase the size of the line table. */
986 if (line == alloc_lines)
988 /* Grow (alloc_lines - linbuf_base) by adding to alloc_lines. */
989 idx_t n = alloc_lines - linbuf_base;
990 linbuf += linbuf_base;
991 linbuf = xpalloc (linbuf, &n, 1, -1, sizeof *linbuf);
992 linbuf -= linbuf_base;
993 alloc_lines = linbuf_base + n;
994 cureqs = xirealloc (cureqs, alloc_lines * sizeof *cureqs);
996 linbuf[line] = ip;
997 cureqs[line] = i;
998 ++line;
1001 current->buffered_lines = line;
1003 for (lin i = 0; ; i++)
1005 /* Record the line start for lines in the suffix that we care about.
1006 Record one more line start than lines,
1007 so that we can compute the length of any buffered line. */
1008 if (line == alloc_lines)
1010 /* Grow (alloc_lines - linbuf_base) by adding to alloc_lines. */
1011 idx_t n = alloc_lines - linbuf_base;
1012 linbuf += linbuf_base;
1013 linbuf = xpalloc (linbuf, &n, 1, -1, sizeof *linbuf);
1014 linbuf -= linbuf_base;
1015 alloc_lines = n - linbuf_base;
1017 linbuf[line] = p;
1019 if (p == bufend)
1021 /* If the last line is incomplete and we do not silently
1022 complete lines, don't count its appended newline. */
1023 if (current->missing_newline && robust_output_style (output_style))
1024 linbuf[line]--;
1025 break;
1028 if (context <= i && no_diff_means_no_output)
1029 break;
1031 line++;
1033 while (*p++ != '\n')
1034 continue;
1037 /* Done with cache in local variables. */
1038 current->linbuf = linbuf;
1039 current->valid_lines = line;
1040 current->alloc_lines = alloc_lines;
1041 current->equivs = cureqs;
1042 equivs = eqs;
1043 equivs_alloc = eqs_alloc;
1044 equivs_index = eqs_index;
1047 /* Prepare the text. Make sure the text end is initialized.
1048 Make sure text ends in a newline,
1049 but remember that we had to add one.
1050 Strip trailing CRs, if that was requested. */
1052 static void
1053 prepare_text (struct file_data *current)
1055 idx_t buffered = current->buffered;
1056 char *p = file_buffer (current);
1057 if (!p)
1058 return;
1060 if (strip_trailing_cr)
1062 char *srclim = p + buffered;
1063 *srclim = '\r';
1064 char *dst = rawmemchr (p, '\r');
1066 for (char const *src = dst; src != srclim; src++)
1068 src += *src == '\r' && src[1] == '\n';
1069 *dst++ = *src;
1072 buffered -= srclim - dst;
1075 if (buffered != 0 && p[buffered - 1] != '\n')
1077 p[buffered++] = '\n';
1078 current->missing_newline = true;
1081 /* Don't use uninitialized storage when planting or using sentinels. */
1082 memset (p + buffered, 0, sizeof (word));
1084 current->buffered = buffered;
1087 /* We have found N lines in a buffer of size S; guess the
1088 proportionate number of lines that will be found in a buffer of
1089 size T. However, do not guess a number of lines so large that the
1090 resulting line table might cause overflow in size calculations. */
1091 static lin
1092 guess_lines (lin n, idx_t s, idx_t t)
1094 idx_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
1095 lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
1096 return MIN (guessed_lines, LIN_MAX / (2 * sizeof (char *) + 1) - 5) + 5;
1099 /* Given a vector of two file_data objects, find the identical
1100 prefixes and suffixes of each object. */
1102 static void
1103 find_identical_ends (struct file_data filevec[])
1105 slurp (&filevec[0]);
1106 prepare_text (&filevec[0]);
1107 if (filevec[0].desc != filevec[1].desc)
1109 slurp (&filevec[1]);
1110 prepare_text (&filevec[1]);
1112 else
1114 filevec[1].buffer = filevec[0].buffer;
1115 filevec[1].bufsize = filevec[0].bufsize;
1116 filevec[1].buffered = filevec[0].buffered;
1117 filevec[1].missing_newline = filevec[0].missing_newline;
1120 /* Find identical prefix. */
1122 word *w0 = filevec[0].buffer;
1123 word *w1 = filevec[1].buffer;
1124 char *buffer0 = (char *) w0;
1125 char *buffer1 = (char *) w1;
1126 char *p0 = buffer0;
1127 char *p1 = buffer1;
1128 idx_t n0 = filevec[0].buffered;
1129 idx_t n1 = filevec[1].buffered;
1131 if (p0 == p1)
1132 /* The buffers are the same; sentinels won't work. */
1133 p0 = p1 += n1;
1134 else
1136 /* Insert end sentinels, in this case characters that are guaranteed
1137 to make the equality test false, and thus terminate the loop. */
1139 if (n0 < n1)
1140 p0[n0] = ~p1[n0];
1141 else
1142 p1[n1] = ~p0[n1];
1144 /* Loop until first mismatch, or to the sentinel characters. */
1146 /* Compare a word at a time for speed. */
1147 while (*w0 == *w1)
1148 w0++, w1++;
1150 /* Do the last few bytes of comparison a byte at a time. */
1151 p0 = (char *) w0;
1152 p1 = (char *) w1;
1153 while (*p0 == *p1)
1154 p0++, p1++;
1156 /* Don't mistakenly count missing newline as part of prefix. */
1157 if (robust_output_style (output_style)
1158 && ((buffer0 + n0 - filevec[0].missing_newline < p0)
1160 (buffer1 + n1 - filevec[1].missing_newline < p1)))
1161 p0--, p1--;
1164 /* Now P0 and P1 point at the first nonmatching characters. */
1166 /* Skip back to last line-beginning in the prefix,
1167 and then discard up to HORIZON_LINES lines from the prefix. */
1168 lin hor = horizon_lines;
1169 while (p0 != buffer0 && (p0[-1] != '\n' || hor--))
1170 p0--, p1--;
1172 /* Record the prefix. */
1173 filevec[0].prefix_end = p0;
1174 filevec[1].prefix_end = p1;
1176 /* Find identical suffix. */
1178 /* P0 and P1 point beyond the last chars not yet compared. */
1179 p0 = buffer0 + n0;
1180 p1 = buffer1 + n1;
1182 if (! robust_output_style (output_style)
1183 || filevec[0].missing_newline == filevec[1].missing_newline)
1185 char const *end0 = p0; /* Addr of last char in file 0. */
1187 /* Get value of P0 at which we should stop scanning backward:
1188 this is when either P0 or P1 points just past the last char
1189 of the identical prefix. */
1190 char const *beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
1192 /* Scan back until chars don't match or we reach that point. */
1193 while (p0 != beg0)
1194 if (*--p0 != *--p1)
1196 /* Point at the first char of the matching suffix. */
1197 ++p0, ++p1;
1198 beg0 = p0;
1199 break;
1202 /* Are we at a line-beginning in both files? If not, add the rest of
1203 this line to the main body. Discard up to HORIZON_LINES lines from
1204 the identical suffix. Also, discard one extra line,
1205 because shift_boundaries may need it. */
1206 lin i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
1208 (buffer1 == p1 || p1[-1] == '\n'));
1209 while (i-- && p0 != end0)
1210 while (*p0++ != '\n')
1211 continue;
1213 p1 += p0 - beg0;
1216 /* Record the suffix. */
1217 filevec[0].suffix_begin = p0;
1218 filevec[1].suffix_begin = p1;
1220 /* Calculate number of lines of prefix to save.
1222 prefix_count == 0 means save the whole prefix;
1223 we need this for options like -D that output the whole file,
1224 or for enormous contexts (to avoid worrying about arithmetic overflow).
1225 We also need it for options like -F that output some preceding line;
1226 at least we will need to find the last few lines,
1227 but since we don't know how many, it's easiest to find them all.
1229 Otherwise, prefix_count != 0. Save just prefix_count lines at start
1230 of the line buffer; they'll be moved to the proper location later.
1231 Handle 1 more line than the context says (because we count 1 too many),
1232 rounded up to the next power of 2 to speed index computation. */
1234 lin alloc_lines0, prefix_count, middle_guess;
1235 if (no_diff_means_no_output && ! function_regexp.fastmap
1236 && context < LIN_MAX / 4 && context < n0)
1238 middle_guess = guess_lines (0, 0, p0 - filevec[0].prefix_end);
1239 lin suffix_guess = guess_lines (0, 0, buffer0 + n0 - p0);
1240 prefix_count = (lin) 1 << (floor_log2 (context) + 1);
1241 alloc_lines0 = (prefix_count + middle_guess
1242 + MIN (context, suffix_guess));
1244 else
1246 prefix_count = 0;
1247 alloc_lines0 = guess_lines (0, 0, n0);
1250 lin prefix_mask = prefix_count - 1;
1251 lin lines = 0;
1252 char const **linbuf0 = xinmalloc (alloc_lines0, sizeof *linbuf0);
1253 bool prefix_needed = ! (no_diff_means_no_output
1254 && filevec[0].prefix_end == p0
1255 && filevec[1].prefix_end == p1);
1256 p0 = buffer0;
1258 /* If the prefix is needed, find the prefix lines. */
1259 if (prefix_needed)
1261 char const *end0 = filevec[0].prefix_end;
1262 while (p0 != end0)
1264 lin l = lines++ & prefix_mask;
1265 if (l == alloc_lines0)
1266 linbuf0 = xpalloc (linbuf0, &alloc_lines0, 1, -1, sizeof *linbuf0);
1267 linbuf0[l] = p0;
1268 while (*p0++ != '\n')
1269 continue;
1272 lin buffered_prefix = prefix_count && context < lines ? context : lines;
1274 /* Allocate line buffer 1. */
1276 middle_guess = guess_lines (lines, p0 - buffer0, p1 - filevec[1].prefix_end);
1277 lin suffix_guess = guess_lines (lines, p0 - buffer0, buffer1 + n1 - p1);
1278 lin alloc_lines1;
1279 if (ckd_add (&alloc_lines1, buffered_prefix,
1280 middle_guess + MIN (context, suffix_guess)))
1281 xalloc_die ();
1282 char const **linbuf1 = xnmalloc (alloc_lines1, sizeof *linbuf1);
1284 if (buffered_prefix != lines)
1286 /* Rotate prefix lines to proper location. */
1287 for (lin i = 0; i < buffered_prefix; i++)
1288 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
1289 for (lin i = 0; i < buffered_prefix; i++)
1290 linbuf0[i] = linbuf1[i];
1293 /* Initialize line buffer 1 from line buffer 0. */
1294 for (lin i = 0; i < buffered_prefix; i++)
1295 linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
1297 /* Record the line buffer, adjusted so that
1298 linbuf[0] points at the first differing line. */
1299 filevec[0].linbuf = linbuf0 + buffered_prefix;
1300 filevec[1].linbuf = linbuf1 + buffered_prefix;
1301 filevec[0].linbuf_base = filevec[1].linbuf_base = - buffered_prefix;
1302 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
1303 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
1304 filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
1307 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
1308 than 2**k. This table is derived from Chris K. Caldwell's list
1309 <http://www.utm.edu/research/primes/lists/2small/>. */
1311 static unsigned char const prime_offset[] =
1313 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
1314 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
1315 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
1316 55, 93, 1, 57, 25
1319 /* Verify that this host's idx_t is not too wide for the above table. */
1321 static_assert (PTRDIFF_WIDTH - 1 <= sizeof prime_offset);
1323 /* Given a vector of two file_data objects, read the file associated
1324 with each one, and build the table of equivalence classes.
1325 Return nonzero if either file appears to be a binary file.
1326 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
1328 bool
1329 read_files (struct file_data filevec[], bool pretend_binary)
1331 bool skip_test = text | pretend_binary;
1332 bool appears_binary = pretend_binary | sip (&filevec[0], skip_test);
1334 if (filevec[0].desc != filevec[1].desc)
1335 appears_binary |= sip (&filevec[1], skip_test | appears_binary);
1336 else
1338 filevec[1].buffer = filevec[0].buffer;
1339 filevec[1].bufsize = filevec[0].bufsize;
1340 filevec[1].buffered = filevec[0].buffered;
1342 if (appears_binary)
1344 set_binary_mode (filevec[0].desc, O_BINARY);
1345 set_binary_mode (filevec[1].desc, O_BINARY);
1346 return true;
1349 find_identical_ends (filevec);
1351 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
1352 equivs = xnmalloc (equivs_alloc, sizeof *equivs);
1353 /* Equivalence class 0 is permanently safe for lines that were not
1354 hashed. Real equivalence classes start at 1. */
1355 equivs_index = 1;
1357 /* Allocate (one plus) a prime number of hash buckets. Use a prime
1358 number between 1/3 and 2/3 of the value of equiv_allocs,
1359 approximately. */
1360 int p = equivs_alloc <= 256 * 3 ? 9 : floor_log2 (equivs_alloc / 3) + 1;
1361 nbuckets = ((idx_t) 1 << p) - prime_offset[p];
1362 buckets = xicalloc (nbuckets + 1, sizeof *buckets);
1363 buckets++;
1365 for (int i = 0; i < 2; i++)
1366 find_and_hash_each_line (&filevec[i]);
1368 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
1370 free (equivs);
1371 free (buckets - 1);
1373 return false;