1 /* File I/O for GNU DIFF.
3 Copyright (C) 1988-1989, 1992-1995, 1998, 2001-2002, 2004, 2006, 2009-2013,
4 2015-2025 Free Software Foundation, Inc.
6 This file is part of GNU DIFF.
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>. */
22 #include <binary-io.h>
24 #include <file-type.h>
32 /* The type of a hash value. */
33 typedef size_t hash_value
;
34 enum { HASH_VALUE_WIDTH
= SIZE_WIDTH
};
35 static_assert (! TYPE_SIGNED (hash_value
));
37 /* Rotate a hash value to the left. */
39 rol (hash_value v
, int n
)
41 return v
<< n
| v
>> (HASH_VALUE_WIDTH
- n
);
44 /* Given a hash value and a new character, return a new hash value. */
46 hash (hash_value h
, hash_value c
)
48 return rol (h
, 7) + c
;
51 /* Lines are put into equivalence classes of lines that match in lines_differ.
52 Each equivalence class is represented by one of these structures,
53 but only while the classes are being computed.
54 Afterward, each class is represented by a number. */
57 lin next
; /* Next item in this bucket. */
58 hash_value hash
; /* Hash of lines in this class. */
59 char const *line
; /* A line that fits this class. */
60 idx_t length
; /* That line's length, counting its newline. */
63 /* Hash-table: array of buckets, each being a chain of equivalence classes.
64 buckets[-1] is reserved for incomplete lines. */
67 /* Number of buckets in the hash table array, not counting buckets[-1]. */
68 static idx_t nbuckets
;
70 /* Array in which the equivalence classes are allocated.
71 The bucket-chains go through the elements in this array.
72 The number of an equivalence class is its index in this array. */
73 static struct equivclass
*equivs
;
75 /* Index of first free element in the array 'equivs'. */
76 static lin equivs_index
;
78 /* Number of elements allocated in the array 'equivs'. */
79 static idx_t equivs_alloc
;
81 /* The file buffer, considered as an array of bytes rather than
82 as an array of words. */
85 file_buffer (struct file_data
const *f
)
87 return (char *) f
->buffer
;
90 /* Read a block of data into a file buffer, checking for EOF and error. */
93 file_block_read (struct file_data
*current
, idx_t size
)
95 if (size
&& ! current
->eof
)
97 ptrdiff_t s
= block_read (current
->desc
,
98 file_buffer (current
) + current
->buffered
,
101 pfatal_with_name (current
->name
);
102 current
->buffered
+= s
;
103 current
->eof
= s
< size
;
107 /* Check for binary files and compare them for exact identity. */
109 /* Return true if BUF contains a non text character.
110 SIZE is the number of characters in BUF. */
112 #define binary_file_p(buf, size) (!! memchr (buf, 0, size))
114 /* Get ready to read the current file.
115 Return nonzero if SKIP_TEST is zero,
116 and if it appears to be a binary file. */
119 sip (struct file_data
*current
, bool skip_test
)
121 /* If we have a nonexistent file at this stage, treat it as empty. */
122 if (current
->desc
< 0)
124 /* Leave room for a sentinel. */
125 current
->bufsize
= sizeof (word
);
126 current
->buffer
= ximalloc (current
->bufsize
);
131 if (ST_BLKSIZE (current
->stat
) < 0
132 || ckd_add (&blksize
, ST_BLKSIZE (current
->stat
), 0))
134 current
->bufsize
= buffer_lcm (sizeof (word
), blksize
, IDX_MAX
);
135 current
->buffer
= ximalloc (current
->bufsize
);
138 /* Skip test if seek is not possible */
139 skip_test
= skip_test
140 || (lseek (current
->desc
, 0, SEEK_CUR
) < 0
146 /* Check first part of file to see if it's a binary file. */
148 int prev_mode
= set_binary_mode (current
->desc
, O_BINARY
);
149 file_block_read (current
, current
->bufsize
);
150 off_t buffered
= current
->buffered
;
152 if (prev_mode
!= O_BINARY
)
154 /* Revert to text mode and seek back to the start to reread
155 the file. Use relative seek, since file descriptors
156 like stdin might not start at offset zero. */
157 if (lseek (current
->desc
, - buffered
, SEEK_CUR
) < 0)
158 pfatal_with_name (current
->name
);
159 set_binary_mode (current
->desc
, prev_mode
);
160 current
->buffered
= 0;
161 current
->eof
= false;
164 return binary_file_p (current
->buffer
, buffered
);
168 current
->buffered
= 0;
169 current
->eof
= false;
173 /* Slurp the rest of the current file completely into memory. */
176 slurp (struct file_data
*current
)
178 if (current
->desc
< 0)
180 /* The file is nonexistent. */
184 /* Upper bound on the room needed for an appended newline, word
185 sentinel, and worst-case word alignment. */
186 enum { extra_room
= 2 * sizeof (word
) };
188 if (S_ISREG (current
->stat
.st_mode
))
190 /* It's a regular file; try to allocate a big enough buffer so
191 that it can be slurped in all at once, along with appended
192 newline plus word sentinel plus word-alignment. */
193 off_t file_size
= current
->stat
.st_size
;
196 && !ckd_add (&cc
, file_size
- file_size
% sizeof (word
), extra_room
)
197 && current
->bufsize
< cc
)
199 #if __GNUC__ == 13 && __GNUC_MINOR__ < 3
200 /* Work around GCC bug 110014. */
201 #pragma GCC diagnostic push
202 #pragma GCC diagnostic ignored "-Wanalyzer-allocation-size"
205 word
*buffer
= irealloc (current
->buffer
, cc
);
209 current
->buffer
= buffer
;
210 current
->bufsize
= cc
;
214 #pragma GCC diagnostic pop
219 /* Read the file, growing the buffer as needed. */
221 while (file_block_read (current
, current
->bufsize
- current
->buffered
),
223 current
->buffer
= xpalloc (current
->buffer
, ¤t
->bufsize
,
226 if (current
->bufsize
- current
->buffered
< extra_room
)
228 if (ckd_add (¤t
->bufsize
, current
->buffered
, extra_room
))
230 current
->buffer
= xirealloc (current
->buffer
, current
->bufsize
);
234 /* Return true if CH1 and ERR1 stand for the same character or
235 encoding error as CH2 and ERR2. */
237 same_ch_err (char32_t ch1
, unsigned char err1
, char32_t ch2
, unsigned char err2
)
239 return ! ((ch1
^ ch2
) | (err1
^ err2
));
242 /* Compare lines S1 of length S1LEN and S2 of length S2LEN (typically
243 one line from each input file) according to the command line options.
244 Line lengths include the trailing newline.
245 For efficiency, this is invoked only when the lines do not match exactly
246 but an option like -i might cause us to ignore the difference.
247 Return nonzero if the lines differ.
248 Line lengths do not include the trailing newline. */
251 lines_differ (char const *s1
, idx_t s1len
, char const *s2
, idx_t s2len
)
255 intmax_t tab
= 0, column
= 0;
260 unsigned char c1
= *t1
++;
261 unsigned char c2
= *t2
++;
263 /* Test for exact char equality first, since it's a common case. */
266 switch (ignore_white_space
)
268 case IGNORE_ALL_SPACE
:
269 /* For -w, just skip past any white space. */
270 while (isspace (c1
) && c1
!= '\n') c1
= *t1
++;
271 while (isspace (c2
) && c2
!= '\n') c2
= *t2
++;
274 case IGNORE_SPACE_CHANGE
:
275 /* For -b, advance past any sequence of white space in
276 line 1 and consider it just one space, or nothing at
277 all if it is at the end of the line. */
290 /* Likewise for line 2. */
305 /* If we went too far when doing the simple test
306 for equality, go back to the first non-white-space
307 character in both sides and try again. */
308 if (c2
== ' ' && c1
!= '\n'
310 && isspace ((unsigned char) t1
[-2]))
315 if (c1
== ' ' && c2
!= '\n'
317 && isspace ((unsigned char) t2
[-2]))
326 case IGNORE_TRAILING_SPACE
:
327 case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE
:
328 if (isspace (c1
) && isspace (c2
))
334 while ((c
= *p
) != '\n' && isspace (c
))
343 while ((c
= *p
) != '\n' && isspace (c
))
348 /* Both lines have nothing but whitespace left. */
351 if (ignore_white_space
== IGNORE_TRAILING_SPACE
)
354 case IGNORE_TAB_EXPANSION
:
355 if ((c1
== ' ' && c2
== '\t')
356 || (c1
== '\t' && c2
== ' '))
358 intmax_t tab2
= tab
, column2
= column
;
361 if (c1
== '\t' || (c1
== ' ' && column
== tabsize
- 1))
373 if (c2
== '\t' || (c2
== ' ' && column2
== tabsize
- 1))
383 if (tab
!= tab2
|| column
!= column2
)
388 case IGNORE_NO_WHITE_SPACE
:
417 column
= tabsize
- 1;
421 case '\0': case '\a': case '\f': case '\v':
425 column
+= !! isprint (c1
);
426 if (column
< tabsize
)
437 char const *lim1
= s1
+ s1len
;
438 char const *lim2
= s2
+ s2len
;
439 char32_t ch1prev
= 0;
443 mcel_t g1
= mcel_scan (t1
, lim1
);
444 mcel_t g2
= mcel_scan (t2
, lim2
);
447 char32_t ch1
= g1
.ch
;
448 char32_t ch2
= g2
.ch
;
450 /* Test for exact equality first, since it's a common case. */
451 if (! same_ch_err (ch1
, g1
.err
, ch2
, g2
.err
))
453 switch (ignore_white_space
)
455 case IGNORE_ALL_SPACE
:
456 /* For -w, just skip past any white space. */
457 while (ch1
!= '\n' && c32isspace (ch1
))
459 g1
= mcel_scan (t1
, lim1
);
463 while (ch2
!= '\n' && c32isspace (ch2
))
465 g2
= mcel_scan (t2
, lim2
);
471 case IGNORE_SPACE_CHANGE
:
472 /* For -b, advance past any sequence of white space in
473 line 1 and consider it just one space, or nothing at
474 all if it is at the end of the line. */
475 if (c32isspace (ch1
))
478 g1
= mcel_scan (t1
, lim1
);
481 if (! c32isspace (ch1
))
489 /* Likewise for line 2. */
490 if (c32isspace (ch2
))
493 g2
= mcel_scan (t2
, lim2
);
496 if (! c32isspace (ch2
))
506 /* If we went too far when doing the simple test
507 for equality, go back to the first non-white-space
508 character in both sides and try again. */
509 if (ch2
== ' ' && ch1
!= '\n' && c32isspace (ch1prev
))
514 if (ch1
== ' ' && ch2
!= '\n' && c32isspace (ch1prev
))
523 case IGNORE_TRAILING_SPACE
:
524 case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE
:
525 if (c32isspace (ch1
) && c32isspace (ch2
))
532 mcel_t g
= mcel_scan (p
, lim1
);
533 if (c32isspace (g
.ch
))
545 mcel_t g
= mcel_scan (p
, lim2
);
546 if (! c32isspace (g
.ch
))
553 /* Both lines have nothing but whitespace left. */
556 if (ignore_white_space
== IGNORE_TRAILING_SPACE
)
559 case IGNORE_TAB_EXPANSION
:
560 if ((ch1
== ' ' && ch2
== '\t')
561 || (ch1
== '\t' && ch2
== ' '))
563 intmax_t tab2
= tab
, column2
= column
;
568 || (ch1
== ' ' && column
== tabsize
- 1))
578 g1
= mcel_scan (t1
, lim1
);
586 || (ch2
== ' ' && column2
== tabsize
- 1))
596 g2
= mcel_scan (t2
, lim2
);
601 if (tab
!= tab2
|| column
!= column2
)
606 case IGNORE_NO_WHITE_SPACE
:
612 ch1
= c32tolower (ch1
);
613 ch2
= c32tolower (ch2
);
616 if (! same_ch_err (ch1
, g1
.err
, ch2
, g2
.err
))
635 column
= tabsize
- 1;
639 case '\a': case '\f': case '\v':
643 /* Assume that downcasing does not change print width. */
644 column
+= g1
.err
? 1 : c32width (ch1
);
645 if (column
< tabsize
)
661 /* Split the file into lines, simultaneously computing the equivalence
662 class for each line. If two lines hash differently, lines_differ
663 must return false. */
666 find_and_hash_each_line (struct file_data
*current
)
668 char const *p
= current
->prefix_end
;
670 /* Cache often-used quantities in local variables to help the compiler. */
671 char const **linbuf
= current
->linbuf
;
672 lin alloc_lines
= current
->alloc_lines
;
674 lin linbuf_base
= current
->linbuf_base
;
675 lin
*cureqs
= xinmalloc (alloc_lines
, sizeof *cureqs
);
676 struct equivclass
*eqs
= equivs
;
677 lin eqs_index
= equivs_index
;
678 idx_t eqs_alloc
= equivs_alloc
;
679 char const *suffix_begin
= current
->suffix_begin
;
680 char const *bufend
= file_buffer (current
) + current
->buffered
;
681 bool ig_case
= ignore_case
;
682 enum DIFF_white_space ig_white_space
= ignore_white_space
;
683 bool unibyte
= MB_CUR_MAX
== 1;
684 bool diff_length_compare_anyway
=
685 (ig_white_space
!= IGNORE_NO_WHITE_SPACE
) | (!unibyte
& ig_case
);
686 bool same_length_diff_contents_compare_anyway
=
687 diff_length_compare_anyway
| ig_case
;
689 while (p
< suffix_begin
)
694 /* Hash this line until we find a newline. */
695 switch (ig_white_space
)
697 case IGNORE_ALL_SPACE
:
699 for (unsigned char c
; (c
= *p
) != '\n'; p
++)
702 h
= hash (h
, ig_case
? tolower (c
) : c
);
705 for (mcel_t g
; *p
!= '\n'; p
+= g
.len
)
707 g
= mcel_scan (p
, suffix_begin
);
708 if (! c32isspace (g
.ch
))
709 h
= hash (h
, (ig_case
? c32tolower (g
.ch
) : g
.ch
) - g
.err
);
713 case IGNORE_SPACE_CHANGE
:
715 for (unsigned char c
; (c
= *p
) != '\n'; p
++)
730 /* C is now the first non-space. */
731 h
= hash (h
, ig_case
? tolower (c
) : c
);
734 for (mcel_t g
; *p
!= '\n'; p
+= g
.len
)
736 g
= mcel_scan (p
, suffix_begin
);
737 if (c32isspace (g
.ch
))
744 g
= mcel_scan (p
, suffix_begin
);
746 while (c32isspace (g
.ch
));
751 /* G is now the first non-space. */
752 h
= hash (h
, (ig_case
? c32tolower (g
.ch
) : g
.ch
) - g
.err
);
756 case IGNORE_TAB_EXPANSION
:
757 case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE
:
758 case IGNORE_TRAILING_SPACE
:
760 intmax_t tab
= 0, column
= 0;
762 for (unsigned char c
; (c
= *p
) != '\n'; p
++)
764 intmax_t repetitions
= 1;
766 if (ig_white_space
& IGNORE_TRAILING_SPACE
780 while (isspace (c1
));
783 if (ig_white_space
& IGNORE_TAB_EXPANSION
)
792 column
= tabsize
- 1;
798 repetitions
= tabsize
- column
% tabsize
;
799 tab
+= column
/ tabsize
+ 1;
807 case '\0': case '\a': case '\f': case '\v':
820 while (--repetitions
!= 0);
823 for (mcel_t g
; *p
!= '\n'; p
+= g
.len
)
825 intmax_t repetitions
= 1;
827 g
= mcel_scan (p
, suffix_begin
);
837 if (ig_white_space
& IGNORE_TRAILING_SPACE
840 char const *p1
= p
+ g
.len
;
841 for (mcel_t g1
; ; p1
+= g1
.len
)
848 g1
= mcel_scan (p1
, suffix_begin
);
849 if (! c32isspace (g1
.ch
))
854 if (ig_white_space
& IGNORE_TAB_EXPANSION
)
863 column
= tabsize
- 1;
869 repetitions
= tabsize
- column
% tabsize
;
870 tab
+= column
/ tabsize
+ 1;
878 case '\0': case '\a': case '\f': case '\v':
882 column
+= c32width (ch
);
887 ch
= c32tolower (ch
);
892 while (--repetitions
!= 0);
901 for (unsigned char c
; (c
= *p
) != '\n'; p
++)
902 h
= hash (h
, tolower (c
));
904 for (unsigned char c
; (c
= *p
) != '\n'; p
++)
910 for (mcel_t g
; *p
!= '\n'; p
+= g
.len
)
912 g
= mcel_scan (p
, suffix_begin
);
913 h
= hash (h
, c32tolower (g
.ch
) - g
.err
);
916 for (mcel_t g
; *p
!= '\n'; p
+= g
.len
)
918 g
= mcel_scan (p
, suffix_begin
);
919 h
= hash (h
, g
.ch
- g
.err
);
927 lin
*bucket
= &buckets
[h
% nbuckets
];
929 /* Advance past the line's trailing newline. */
931 idx_t length
= p
- ip
;
934 && current
->missing_newline
935 && robust_output_style (output_style
))
937 /* The last line is incomplete and we do not silently
938 complete lines. If the line cannot compare equal to any
939 complete line, put it into buckets[-1] so that it can
940 compare equal only to the other file's incomplete line
942 if (ig_white_space
< IGNORE_TRAILING_SPACE
)
943 bucket
= &buckets
[-1];
947 for (i
= *bucket
; ; i
= eqs
[i
].next
)
950 /* Create a new equivalence class in this bucket. */
953 eqs
= xpalloc (eqs
, &eqs_alloc
, 1, -1, sizeof *eqs
);
954 eqs
[i
].next
= *bucket
;
957 eqs
[i
].length
= length
;
961 else if (eqs
[i
].hash
== h
)
963 char const *eqline
= eqs
[i
].line
;
964 idx_t eqlinelen
= eqs
[i
].length
;
966 /* Reuse existing class if lines_differ reports the lines
968 if (eqlinelen
== length
)
970 /* Reuse existing equivalence class if the lines are identical.
971 This detects the common case of exact identity
972 faster than lines_differ would. */
973 if (memcmp (eqline
, ip
, length
- 1) == 0)
975 if (!same_length_diff_contents_compare_anyway
)
978 else if (!diff_length_compare_anyway
)
981 if (! lines_differ (eqline
, eqlinelen
, ip
, length
))
985 /* Maybe increase the size of the line table. */
986 if (line
== alloc_lines
)
988 /* Grow (alloc_lines - linbuf_base) by adding to alloc_lines. */
989 idx_t n
= alloc_lines
- linbuf_base
;
990 linbuf
+= linbuf_base
;
991 linbuf
= xpalloc (linbuf
, &n
, 1, -1, sizeof *linbuf
);
992 linbuf
-= linbuf_base
;
993 alloc_lines
= linbuf_base
+ n
;
994 cureqs
= xirealloc (cureqs
, alloc_lines
* sizeof *cureqs
);
1001 current
->buffered_lines
= line
;
1003 for (lin i
= 0; ; i
++)
1005 /* Record the line start for lines in the suffix that we care about.
1006 Record one more line start than lines,
1007 so that we can compute the length of any buffered line. */
1008 if (line
== alloc_lines
)
1010 /* Grow (alloc_lines - linbuf_base) by adding to alloc_lines. */
1011 idx_t n
= alloc_lines
- linbuf_base
;
1012 linbuf
+= linbuf_base
;
1013 linbuf
= xpalloc (linbuf
, &n
, 1, -1, sizeof *linbuf
);
1014 linbuf
-= linbuf_base
;
1015 alloc_lines
= n
- linbuf_base
;
1021 /* If the last line is incomplete and we do not silently
1022 complete lines, don't count its appended newline. */
1023 if (current
->missing_newline
&& robust_output_style (output_style
))
1028 if (context
<= i
&& no_diff_means_no_output
)
1033 while (*p
++ != '\n')
1037 /* Done with cache in local variables. */
1038 current
->linbuf
= linbuf
;
1039 current
->valid_lines
= line
;
1040 current
->alloc_lines
= alloc_lines
;
1041 current
->equivs
= cureqs
;
1043 equivs_alloc
= eqs_alloc
;
1044 equivs_index
= eqs_index
;
1047 /* Prepare the text. Make sure the text end is initialized.
1048 Make sure text ends in a newline,
1049 but remember that we had to add one.
1050 Strip trailing CRs, if that was requested. */
1053 prepare_text (struct file_data
*current
)
1055 idx_t buffered
= current
->buffered
;
1056 char *p
= file_buffer (current
);
1060 if (strip_trailing_cr
)
1062 char *srclim
= p
+ buffered
;
1064 char *dst
= rawmemchr (p
, '\r');
1066 for (char const *src
= dst
; src
!= srclim
; src
++)
1068 src
+= *src
== '\r' && src
[1] == '\n';
1072 buffered
-= srclim
- dst
;
1075 if (buffered
!= 0 && p
[buffered
- 1] != '\n')
1077 p
[buffered
++] = '\n';
1078 current
->missing_newline
= true;
1081 /* Don't use uninitialized storage when planting or using sentinels. */
1082 memset (p
+ buffered
, 0, sizeof (word
));
1084 current
->buffered
= buffered
;
1087 /* We have found N lines in a buffer of size S; guess the
1088 proportionate number of lines that will be found in a buffer of
1089 size T. However, do not guess a number of lines so large that the
1090 resulting line table might cause overflow in size calculations. */
1092 guess_lines (lin n
, idx_t s
, idx_t t
)
1094 idx_t guessed_bytes_per_line
= n
< 10 ? 32 : s
/ (n
- 1);
1095 lin guessed_lines
= MAX (1, t
/ guessed_bytes_per_line
);
1096 return MIN (guessed_lines
, LIN_MAX
/ (2 * sizeof (char *) + 1) - 5) + 5;
1099 /* Given a vector of two file_data objects, find the identical
1100 prefixes and suffixes of each object. */
1103 find_identical_ends (struct file_data filevec
[])
1105 slurp (&filevec
[0]);
1106 prepare_text (&filevec
[0]);
1107 if (filevec
[0].desc
!= filevec
[1].desc
)
1109 slurp (&filevec
[1]);
1110 prepare_text (&filevec
[1]);
1114 filevec
[1].buffer
= filevec
[0].buffer
;
1115 filevec
[1].bufsize
= filevec
[0].bufsize
;
1116 filevec
[1].buffered
= filevec
[0].buffered
;
1117 filevec
[1].missing_newline
= filevec
[0].missing_newline
;
1120 /* Find identical prefix. */
1122 word
*w0
= filevec
[0].buffer
;
1123 word
*w1
= filevec
[1].buffer
;
1124 char *buffer0
= (char *) w0
;
1125 char *buffer1
= (char *) w1
;
1128 idx_t n0
= filevec
[0].buffered
;
1129 idx_t n1
= filevec
[1].buffered
;
1132 /* The buffers are the same; sentinels won't work. */
1136 /* Insert end sentinels, in this case characters that are guaranteed
1137 to make the equality test false, and thus terminate the loop. */
1144 /* Loop until first mismatch, or to the sentinel characters. */
1146 /* Compare a word at a time for speed. */
1150 /* Do the last few bytes of comparison a byte at a time. */
1156 /* Don't mistakenly count missing newline as part of prefix. */
1157 if (robust_output_style (output_style
)
1158 && ((buffer0
+ n0
- filevec
[0].missing_newline
< p0
)
1160 (buffer1
+ n1
- filevec
[1].missing_newline
< p1
)))
1164 /* Now P0 and P1 point at the first nonmatching characters. */
1166 /* Skip back to last line-beginning in the prefix,
1167 and then discard up to HORIZON_LINES lines from the prefix. */
1168 lin hor
= horizon_lines
;
1169 while (p0
!= buffer0
&& (p0
[-1] != '\n' || hor
--))
1172 /* Record the prefix. */
1173 filevec
[0].prefix_end
= p0
;
1174 filevec
[1].prefix_end
= p1
;
1176 /* Find identical suffix. */
1178 /* P0 and P1 point beyond the last chars not yet compared. */
1182 if (! robust_output_style (output_style
)
1183 || filevec
[0].missing_newline
== filevec
[1].missing_newline
)
1185 char const *end0
= p0
; /* Addr of last char in file 0. */
1187 /* Get value of P0 at which we should stop scanning backward:
1188 this is when either P0 or P1 points just past the last char
1189 of the identical prefix. */
1190 char const *beg0
= filevec
[0].prefix_end
+ (n0
< n1
? 0 : n0
- n1
);
1192 /* Scan back until chars don't match or we reach that point. */
1196 /* Point at the first char of the matching suffix. */
1202 /* Are we at a line-beginning in both files? If not, add the rest of
1203 this line to the main body. Discard up to HORIZON_LINES lines from
1204 the identical suffix. Also, discard one extra line,
1205 because shift_boundaries may need it. */
1206 lin i
= horizon_lines
+ !((buffer0
== p0
|| p0
[-1] == '\n')
1208 (buffer1
== p1
|| p1
[-1] == '\n'));
1209 while (i
-- && p0
!= end0
)
1210 while (*p0
++ != '\n')
1216 /* Record the suffix. */
1217 filevec
[0].suffix_begin
= p0
;
1218 filevec
[1].suffix_begin
= p1
;
1220 /* Calculate number of lines of prefix to save.
1222 prefix_count == 0 means save the whole prefix;
1223 we need this for options like -D that output the whole file,
1224 or for enormous contexts (to avoid worrying about arithmetic overflow).
1225 We also need it for options like -F that output some preceding line;
1226 at least we will need to find the last few lines,
1227 but since we don't know how many, it's easiest to find them all.
1229 Otherwise, prefix_count != 0. Save just prefix_count lines at start
1230 of the line buffer; they'll be moved to the proper location later.
1231 Handle 1 more line than the context says (because we count 1 too many),
1232 rounded up to the next power of 2 to speed index computation. */
1234 lin alloc_lines0
, prefix_count
, middle_guess
;
1235 if (no_diff_means_no_output
&& ! function_regexp
.fastmap
1236 && context
< LIN_MAX
/ 4 && context
< n0
)
1238 middle_guess
= guess_lines (0, 0, p0
- filevec
[0].prefix_end
);
1239 lin suffix_guess
= guess_lines (0, 0, buffer0
+ n0
- p0
);
1240 prefix_count
= (lin
) 1 << (floor_log2 (context
) + 1);
1241 alloc_lines0
= (prefix_count
+ middle_guess
1242 + MIN (context
, suffix_guess
));
1247 alloc_lines0
= guess_lines (0, 0, n0
);
1250 lin prefix_mask
= prefix_count
- 1;
1252 char const **linbuf0
= xinmalloc (alloc_lines0
, sizeof *linbuf0
);
1253 bool prefix_needed
= ! (no_diff_means_no_output
1254 && filevec
[0].prefix_end
== p0
1255 && filevec
[1].prefix_end
== p1
);
1258 /* If the prefix is needed, find the prefix lines. */
1261 char const *end0
= filevec
[0].prefix_end
;
1264 lin l
= lines
++ & prefix_mask
;
1265 if (l
== alloc_lines0
)
1266 linbuf0
= xpalloc (linbuf0
, &alloc_lines0
, 1, -1, sizeof *linbuf0
);
1268 while (*p0
++ != '\n')
1272 lin buffered_prefix
= prefix_count
&& context
< lines
? context
: lines
;
1274 /* Allocate line buffer 1. */
1276 middle_guess
= guess_lines (lines
, p0
- buffer0
, p1
- filevec
[1].prefix_end
);
1277 lin suffix_guess
= guess_lines (lines
, p0
- buffer0
, buffer1
+ n1
- p1
);
1279 if (ckd_add (&alloc_lines1
, buffered_prefix
,
1280 middle_guess
+ MIN (context
, suffix_guess
)))
1282 char const **linbuf1
= xnmalloc (alloc_lines1
, sizeof *linbuf1
);
1284 if (buffered_prefix
!= lines
)
1286 /* Rotate prefix lines to proper location. */
1287 for (lin i
= 0; i
< buffered_prefix
; i
++)
1288 linbuf1
[i
] = linbuf0
[(lines
- context
+ i
) & prefix_mask
];
1289 for (lin i
= 0; i
< buffered_prefix
; i
++)
1290 linbuf0
[i
] = linbuf1
[i
];
1293 /* Initialize line buffer 1 from line buffer 0. */
1294 for (lin i
= 0; i
< buffered_prefix
; i
++)
1295 linbuf1
[i
] = linbuf0
[i
] - buffer0
+ buffer1
;
1297 /* Record the line buffer, adjusted so that
1298 linbuf[0] points at the first differing line. */
1299 filevec
[0].linbuf
= linbuf0
+ buffered_prefix
;
1300 filevec
[1].linbuf
= linbuf1
+ buffered_prefix
;
1301 filevec
[0].linbuf_base
= filevec
[1].linbuf_base
= - buffered_prefix
;
1302 filevec
[0].alloc_lines
= alloc_lines0
- buffered_prefix
;
1303 filevec
[1].alloc_lines
= alloc_lines1
- buffered_prefix
;
1304 filevec
[0].prefix_lines
= filevec
[1].prefix_lines
= lines
;
1307 /* If 1 < k, then (2**k - prime_offset[k]) is the largest prime less
1308 than 2**k. This table is derived from Chris K. Caldwell's list
1309 <http://www.utm.edu/research/primes/lists/2small/>. */
1311 static unsigned char const prime_offset
[] =
1313 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
1314 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
1315 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
1319 /* Verify that this host's idx_t is not too wide for the above table. */
1321 static_assert (PTRDIFF_WIDTH
- 1 <= sizeof prime_offset
);
1323 /* Given a vector of two file_data objects, read the file associated
1324 with each one, and build the table of equivalence classes.
1325 Return nonzero if either file appears to be a binary file.
1326 If PRETEND_BINARY is nonzero, pretend they are binary regardless. */
1329 read_files (struct file_data filevec
[], bool pretend_binary
)
1331 bool skip_test
= text
| pretend_binary
;
1332 bool appears_binary
= pretend_binary
| sip (&filevec
[0], skip_test
);
1334 if (filevec
[0].desc
!= filevec
[1].desc
)
1335 appears_binary
|= sip (&filevec
[1], skip_test
| appears_binary
);
1338 filevec
[1].buffer
= filevec
[0].buffer
;
1339 filevec
[1].bufsize
= filevec
[0].bufsize
;
1340 filevec
[1].buffered
= filevec
[0].buffered
;
1344 set_binary_mode (filevec
[0].desc
, O_BINARY
);
1345 set_binary_mode (filevec
[1].desc
, O_BINARY
);
1349 find_identical_ends (filevec
);
1351 equivs_alloc
= filevec
[0].alloc_lines
+ filevec
[1].alloc_lines
+ 1;
1352 equivs
= xnmalloc (equivs_alloc
, sizeof *equivs
);
1353 /* Equivalence class 0 is permanently safe for lines that were not
1354 hashed. Real equivalence classes start at 1. */
1357 /* Allocate (one plus) a prime number of hash buckets. Use a prime
1358 number between 1/3 and 2/3 of the value of equiv_allocs,
1360 int p
= equivs_alloc
<= 256 * 3 ? 9 : floor_log2 (equivs_alloc
/ 3) + 1;
1361 nbuckets
= ((idx_t
) 1 << p
) - prime_offset
[p
];
1362 buckets
= xicalloc (nbuckets
+ 1, sizeof *buckets
);
1365 for (int i
= 0; i
< 2; i
++)
1366 find_and_hash_each_line (&filevec
[i
]);
1368 filevec
[0].equiv_max
= filevec
[1].equiv_max
= equivs_index
;