update TODO list
[got-portable.git] / lib / diff_atomize_text.c
blob32023105af9438217a65a1bf6b821e4d225516d1
/* Split source by line breaks, and calculate a simplistic checksum. */
/*
 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
18 #include <errno.h>
19 #include <stdbool.h>
20 #include <stdint.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <unistd.h>
24 #include <ctype.h>
26 #include <arraylist.h>
27 #include <diff_main.h>
29 #include "diff_internal.h"
30 #include "diff_debug.h"
/*
 * Fold one more byte into a running line checksum.
 *
 * hash:      checksum accumulated so far (0 for a fresh line).
 * atom_byte: next byte of the line.
 *
 * Returns the updated checksum. The arithmetic is unsigned, so it wraps
 * around on overflow instead of invoking undefined behaviour.
 */
unsigned int
diff_atom_hash_update(unsigned int hash, unsigned char atom_byte)
{
	unsigned int scaled = hash * 23;

	return scaled + atom_byte;
}
38 static int
39 diff_data_atomize_text_lines_fd(struct diff_data *d)
41 off_t pos = 0;
42 const off_t end = pos + d->len;
43 unsigned int array_size_estimate = d->len / 50;
44 unsigned int pow2 = 1;
45 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
46 bool embedded_nul = false;
48 while (array_size_estimate >>= 1)
49 pow2++;
51 ARRAYLIST_INIT(d->atoms, 1 << pow2);
53 if (fseek(d->root->f, 0L, SEEK_SET) == -1)
54 return errno;
56 while (pos < end) {
57 off_t line_end = pos;
58 unsigned int hash = 0;
59 unsigned char buf[512];
60 size_t r, i;
61 struct diff_atom *atom;
62 int eol = 0;
64 while (eol == 0 && line_end < end) {
65 r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
66 if (r == 0 && ferror(d->root->f))
67 return EIO;
68 i = 0;
69 while (eol == 0 && i < r) {
70 if (buf[i] != '\r' && buf[i] != '\n') {
71 if (!ignore_whitespace
72 || !isspace((unsigned char)buf[i]))
73 hash = diff_atom_hash_update(
74 hash, buf[i]);
75 if (buf[i] == '\0')
76 embedded_nul = true;
77 line_end++;
78 } else
79 eol = buf[i];
80 i++;
84 /* When not at the end of data, the line ending char ('\r' or
85 * '\n') must follow */
86 if (line_end < end)
87 line_end++;
88 /* If that was an '\r', also pull in any following '\n' */
89 if (line_end < end && eol == '\r') {
90 if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
91 return errno;
92 r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
93 if (r == 0 && ferror(d->root->f))
94 return EIO;
95 if (r > 0 && buf[0] == '\n')
96 line_end++;
99 /* Record the found line as diff atom */
100 ARRAYLIST_ADD(atom, d->atoms);
101 if (!atom)
102 return ENOMEM;
104 *atom = (struct diff_atom){
105 .root = d,
106 .pos = pos,
107 .at = NULL, /* atom data is not memory-mapped */
108 .len = line_end - pos,
109 .hash = hash,
112 /* Starting point for next line: */
113 pos = line_end;
114 if (fseeko(d->root->f, pos, SEEK_SET) == -1)
115 return errno;
118 /* File are considered binary if they contain embedded '\0' bytes. */
119 if (embedded_nul)
120 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
122 return DIFF_RC_OK;
125 static int
126 diff_data_atomize_text_lines_mmap(struct diff_data *d)
128 const uint8_t *pos = d->data;
129 const uint8_t *end = pos + d->len;
130 bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
131 bool embedded_nul = false;
132 unsigned int array_size_estimate = d->len / 50;
133 unsigned int pow2 = 1;
134 while (array_size_estimate >>= 1)
135 pow2++;
137 ARRAYLIST_INIT(d->atoms, 1 << pow2);
139 while (pos < end) {
140 const uint8_t *line_end = pos;
141 unsigned int hash = 0;
143 while (line_end < end && *line_end != '\r' && *line_end != '\n') {
144 if (!ignore_whitespace
145 || !isspace((unsigned char)*line_end))
146 hash = diff_atom_hash_update(hash, *line_end);
147 if (*line_end == '\0')
148 embedded_nul = true;
149 line_end++;
152 /* When not at the end of data, the line ending char ('\r' or
153 * '\n') must follow */
154 if (line_end < end && *line_end == '\r')
155 line_end++;
156 if (line_end < end && *line_end == '\n')
157 line_end++;
159 /* Record the found line as diff atom */
160 struct diff_atom *atom;
161 ARRAYLIST_ADD(atom, d->atoms);
162 if (!atom)
163 return ENOMEM;
165 *atom = (struct diff_atom){
166 .root = d,
167 .pos = (off_t)(pos - d->data),
168 .at = pos,
169 .len = line_end - pos,
170 .hash = hash,
173 /* Starting point for next line: */
174 pos = line_end;
177 /* File are considered binary if they contain embedded '\0' bytes. */
178 if (embedded_nul)
179 d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
181 return DIFF_RC_OK;
184 static int
185 diff_data_atomize_text_lines(struct diff_data *d)
187 if (d->data == NULL)
188 return diff_data_atomize_text_lines_fd(d);
189 else
190 return diff_data_atomize_text_lines_mmap(d);
/*
 * Atomizer entry point that splits the data of d into one atom per line.
 *
 * func_data: not used by this implementation.
 * d:         the data to atomize.
 *
 * Returns the result of diff_data_atomize_text_lines(d).
 */
int
diff_atomize_text_by_line(void *func_data, struct diff_data *d)
{
	(void)func_data;	/* not needed for line-based splitting */

	return diff_data_atomize_text_lines(d);
}