1 /* Split source by line breaks, and calculate a simplistic checksum. */
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
26 #include <arraylist.h>
27 #include <diff_main.h>
29 #include "diff_internal.h"
30 #include "diff_debug.h"
/*
 * Fold one more byte into a running line hash.
 *
 * hash:      the hash accumulated so far (callers in this file start at 0).
 * atom_byte: the next byte of the line.
 *
 * Returns hash * 23 + atom_byte. hash is an unsigned int, so the arithmetic
 * wraps around on overflow instead of being undefined.
 */
33 diff_atom_hash_update(unsigned int hash
, unsigned char atom_byte
)
35 return hash
* 23 + atom_byte
;
/*
 * Split the contents of the stream d->root->f into lines and append one diff
 * atom per line to d->atoms.
 *
 * The data is read through stdio in chunks of sizeof(buf) (512) bytes instead
 * of being accessed in memory, so every atom is recorded with .at = NULL and
 * a length of line_end - pos.
 *
 * Bytes that are neither a carriage return nor a newline are folded into the
 * per-line hash with diff_atom_hash_update(); when
 * DIFF_FLAG_IGNORE_WHITESPACE is set in d->diff_flags, bytes for which
 * isspace() is true are left out of the hash.
 *
 * Side effects: moves the file position of d->root->f, initializes and fills
 * d->atoms, and may set DIFF_ATOMIZER_FOUND_BINARY_DATA in
 * d->atomizer_flags.
 *
 * NOTE(review): the values returned when fseek, fseeko or fread fail should
 * be confirmed against the callers.
 */
39 diff_data_atomize_text_lines_fd(struct diff_data
*d
)
42 const off_t end
= pos
+ d
->len
;
/*
 * Rough guess at the number of lines: one per 50 bytes of input
 * (presumably an assumed average line length -- TODO confirm).
 */
43 unsigned int array_size_estimate
= d
->len
/ 50;
44 unsigned int pow2
= 1;
45 bool ignore_whitespace
= (d
->diff_flags
& DIFF_FLAG_IGNORE_WHITESPACE
);
46 bool embedded_nul
= false;
/*
 * Halve the estimate until it reaches zero; presumably pow2 is bumped once
 * per halving so that 1 << pow2 is a power of two above the estimate --
 * TODO confirm.
 */
48 while (array_size_estimate
>>= 1)
/*
 * NOTE(review): 1 << pow2 is a signed int shift; verify that pow2 cannot
 * reach the width of int for very large inputs.
 */
51 ARRAYLIST_INIT(d
->atoms
, 1 << pow2
);
/* Rewind the stream to offset 0 before scanning for lines. */
53 if (fseek(d
->root
->f
, 0L, SEEK_SET
) == -1)
58 unsigned int hash
= 0;
59 unsigned char buf
[512];
61 struct diff_atom
*atom
;
/*
 * Keep reading chunks until a line ending has been seen (eol != 0) or
 * line_end has reached the end of the data.
 */
64 while (eol
== 0 && line_end
< end
) {
65 r
= fread(buf
, sizeof(char), sizeof(buf
), d
->root
->f
);
/* A zero-length read only counts as a failure if the stream error flag is set. */
66 if (r
== 0 && ferror(d
->root
->f
))
/* Walk the chunk byte by byte, stopping at the first line ending. */
69 while (eol
== 0 && i
< r
) {
70 if (buf
[i
] != '\r' && buf
[i
] != '\n') {
/* With ignore_whitespace set, whitespace bytes do not contribute to the hash. */
71 if (!ignore_whitespace
72 || !isspace((unsigned char)buf
[i
]))
73 hash
= diff_atom_hash_update(
84 /* When not at the end of data, the line ending char ('\r' or
85 * '\n') must follow */
88 /* If that was an '\r', also pull in any following '\n' */
89 if (line_end
< end
&& eol
== '\r') {
/*
 * Reposition the stream to line_end and read again, so that buf[0] is the
 * byte located at offset line_end.
 */
90 if (fseeko(d
->root
->f
, line_end
, SEEK_SET
) == -1)
92 r
= fread(buf
, sizeof(char), sizeof(buf
), d
->root
->f
);
93 if (r
== 0 && ferror(d
->root
->f
))
95 if (r
> 0 && buf
[0] == '\n')
99 /* Record the found line as diff atom */
100 ARRAYLIST_ADD(atom
, d
->atoms
);
104 *atom
= (struct diff_atom
){
107 .at
= NULL
, /* atom data is not memory-mapped */
/* Length of the line in bytes, measured from its start offset pos. */
108 .len
= line_end
- pos
,
112 /* Starting point for next line: */
/* Put the stream back at pos so the next chunk read starts at that offset. */
114 if (fseeko(d
->root
->f
, pos
, SEEK_SET
) == -1)
118 /* Files are considered binary if they contain embedded '\0' bytes. */
/* Presumably only done when embedded_nul was set during the scan -- TODO confirm. */
120 d
->atomizer_flags
|= DIFF_ATOMIZER_FOUND_BINARY_DATA
;
/*
 * Split the in-memory buffer d->data (d->len bytes) into lines and append one
 * diff atom per line to d->atoms.
 *
 * The buffer is walked directly through pointers from d->data up to
 * d->data + d->len. Each atom records .pos as the byte offset of the line
 * start from d->data and .len as line_end - pos.
 *
 * Bytes that are neither a carriage return nor a newline are folded into the
 * per-line hash with diff_atom_hash_update(); when
 * DIFF_FLAG_IGNORE_WHITESPACE is set in d->diff_flags, bytes for which
 * isspace() is true are left out of the hash.
 *
 * Side effects: initializes and fills d->atoms, and may set
 * DIFF_ATOMIZER_FOUND_BINARY_DATA in d->atomizer_flags.
 */
126 diff_data_atomize_text_lines_mmap(struct diff_data
*d
)
128 const uint8_t *pos
= d
->data
;
129 const uint8_t *end
= pos
+ d
->len
;
130 bool ignore_whitespace
= (d
->diff_flags
& DIFF_FLAG_IGNORE_WHITESPACE
);
131 bool embedded_nul
= false;
/*
 * Rough guess at the number of lines: one per 50 bytes of input
 * (presumably an assumed average line length -- TODO confirm).
 */
132 unsigned int array_size_estimate
= d
->len
/ 50;
133 unsigned int pow2
= 1;
/*
 * Halve the estimate until it reaches zero; presumably pow2 is bumped once
 * per halving so that 1 << pow2 is a power of two above the estimate --
 * TODO confirm.
 */
134 while (array_size_estimate
>>= 1)
/*
 * NOTE(review): 1 << pow2 is a signed int shift; verify that pow2 cannot
 * reach the width of int for very large inputs.
 */
137 ARRAYLIST_INIT(d
->atoms
, 1 << pow2
);
140 const uint8_t *line_end
= pos
;
141 unsigned int hash
= 0;
/* Scan forward to the first line ending byte or to the end of the buffer. */
143 while (line_end
< end
&& *line_end
!= '\r' && *line_end
!= '\n') {
/* With ignore_whitespace set, whitespace bytes do not contribute to the hash. */
144 if (!ignore_whitespace
145 || !isspace((unsigned char)*line_end
))
146 hash
= diff_atom_hash_update(hash
, *line_end
);
/* Detect NUL bytes inside the line; presumably this sets embedded_nul -- TODO confirm. */
147 if (*line_end
== '\0')
152 /* When not at the end of data, the line ending char ('\r' or
153 * '\n') must follow */
/*
 * The carriage return is tested before the newline; presumably each test
 * advances line_end past the matched byte, so a carriage return followed by
 * a newline stays within one line -- TODO confirm.
 */
154 if (line_end
< end
&& *line_end
== '\r')
156 if (line_end
< end
&& *line_end
== '\n')
159 /* Record the found line as diff atom */
160 struct diff_atom
*atom
;
161 ARRAYLIST_ADD(atom
, d
->atoms
);
165 *atom
= (struct diff_atom
){
/* Offset of the line start, in bytes from the beginning of d->data. */
167 .pos
= (off_t
)(pos
- d
->data
),
/* Length of the line in bytes, measured from its start pointer pos. */
169 .len
= line_end
- pos
,
173 /* Starting point for next line: */
177 /* Files are considered binary if they contain embedded '\0' bytes. */
/* Presumably only done when embedded_nul was set during the scan -- TODO confirm. */
179 d
->atomizer_flags
|= DIFF_ATOMIZER_FOUND_BINARY_DATA
;
/*
 * Atomize d into lines by delegating to one of two implementations and
 * returning its result unchanged: diff_data_atomize_text_lines_fd(), which
 * reads from the stream d->root->f, or diff_data_atomize_text_lines_mmap(),
 * which walks the in-memory buffer d->data.
 *
 * NOTE(review): presumably the stream variant is chosen when no in-memory
 * data is available -- confirm the selecting condition.
 */
185 diff_data_atomize_text_lines(struct diff_data
*d
)
188 return diff_data_atomize_text_lines_fd(d
);
190 return diff_data_atomize_text_lines_mmap(d
);
/*
 * Atomize d into text lines.
 *
 * func_data: not used by this function.
 * d:         the diff data to split; passed on as is.
 *
 * Returns whatever diff_data_atomize_text_lines() returns for d.
 *
 * NOTE(review): the void *func_data parameter suggests this is meant to be
 * installed as a generic atomizer callback -- confirm against diff_main.h.
 */
194 diff_atomize_text_by_line(void *func_data
, struct diff_data
*d
)
196 return diff_data_atomize_text_lines(d
);