1 /* Generic infrastructure to implement various diff algorithms (implementation). */
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
31 #include <arraylist.h>
32 #include <diff_main.h>
34 #include "got_compat.h"
36 #include "diff_internal.h"
37 #include "diff_debug.h"
40 read_at(FILE *f
, off_t at_pos
, unsigned char *buf
, size_t len
)
43 if (fseeko(f
, at_pos
, SEEK_SET
) == -1)
45 r
= fread(buf
, sizeof(char), len
, f
);
46 if ((r
== 0 || r
< len
) && ferror(f
))
54 buf_cmp(const unsigned char *left
, size_t left_len
,
55 const unsigned char *right
, size_t right_len
,
56 bool ignore_whitespace
)
60 if (ignore_whitespace
) {
62 while (il
< left_len
&& ir
< right_len
) {
63 unsigned char cl
= left
[il
];
64 unsigned char cr
= right
[ir
];
66 if (isspace(cl
) && il
< left_len
) {
70 if (isspace(cr
) && ir
< right_len
) {
82 while (il
< left_len
) {
83 unsigned char cl
= left
[il
++];
87 while (ir
< right_len
) {
88 unsigned char cr
= right
[ir
++];
96 cmp
= memcmp(left
, right
, MIN(left_len
, right_len
));
99 if (left_len
== right_len
)
101 return (left_len
> right_len
) ? 1 : -1;
105 diff_atom_cmp(int *cmp
,
106 const struct diff_atom
*left
,
107 const struct diff_atom
*right
)
109 off_t remain_left
, remain_right
;
110 int flags
= (left
->root
->diff_flags
| right
->root
->diff_flags
);
111 bool ignore_whitespace
= (flags
& DIFF_FLAG_IGNORE_WHITESPACE
);
113 if (!left
->len
&& !right
->len
) {
117 if (!ignore_whitespace
) {
128 if (left
->at
!= NULL
&& right
->at
!= NULL
) {
129 *cmp
= buf_cmp(left
->at
, left
->len
, right
->at
, right
->len
,
134 remain_left
= left
->len
;
135 remain_right
= right
->len
;
136 while (remain_left
> 0 || remain_right
> 0) {
137 const size_t chunksz
= 8192;
138 unsigned char buf_left
[chunksz
], buf_right
[chunksz
];
139 const uint8_t *p_left
, *p_right
;
140 off_t n_left
, n_right
;
152 n_left
= MIN(chunksz
, remain_left
);
153 n_right
= MIN(chunksz
, remain_right
);
155 if (left
->at
== NULL
) {
156 r
= read_at(left
->root
->f
,
157 left
->pos
+ (left
->len
- remain_left
),
165 p_left
= left
->at
+ (left
->len
- remain_left
);
168 if (right
->at
== NULL
) {
169 r
= read_at(right
->root
->f
,
170 right
->pos
+ (right
->len
- remain_right
),
178 p_right
= right
->at
+ (right
->len
- remain_right
);
181 r
= buf_cmp(p_left
, n_left
, p_right
, n_right
,
188 remain_left
-= n_left
;
189 remain_right
-= n_right
;
197 diff_atom_same(bool *same
,
198 const struct diff_atom
*left
,
199 const struct diff_atom
*right
)
203 if (left
->hash
!= right
->hash
) {
207 r
= diff_atom_cmp(&cmp
, left
, right
);
216 static struct diff_chunk
*
217 diff_state_add_solved_chunk(struct diff_state
*state
,
218 const struct diff_chunk
*chunk
)
220 diff_chunk_arraylist_t
*result
;
221 struct diff_chunk
*new_chunk
;
222 enum diff_chunk_type last_t
;
223 enum diff_chunk_type new_t
;
224 struct diff_chunk
*last
;
226 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
227 * never directly follows a plus chunk. */
228 result
= &state
->result
->chunks
;
230 last_t
= result
->len
? diff_chunk_type(&result
->head
[result
->len
- 1])
232 new_t
= diff_chunk_type(chunk
);
234 debug("ADD %s chunk #%u:\n", chunk
->solved
? "solved" : "UNSOLVED",
237 debug_dump_atoms(&state
->left
, chunk
->left_start
, chunk
->left_count
);
239 debug_dump_atoms(&state
->right
, chunk
->right_start
, chunk
->right_count
);
242 last
= &result
->head
[result
->len
- 1];
243 assert(chunk
->left_start
244 == last
->left_start
+ last
->left_count
);
245 assert(chunk
->right_start
246 == last
->right_start
+ last
->right_count
);
249 if (new_t
== last_t
) {
250 new_chunk
= &result
->head
[result
->len
- 1];
251 new_chunk
->left_count
+= chunk
->left_count
;
252 new_chunk
->right_count
+= chunk
->right_count
;
253 debug(" - added chunk touches previous one of same type, joined:\n");
255 debug_dump_atoms(&state
->left
, new_chunk
->left_start
, new_chunk
->left_count
);
257 debug_dump_atoms(&state
->right
, new_chunk
->right_start
, new_chunk
->right_count
);
258 } else if (last_t
== CHUNK_PLUS
&& new_t
== CHUNK_MINUS
) {
259 enum diff_chunk_type prev_last_t
=
261 diff_chunk_type(&result
->head
[result
->len
- 2])
263 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
264 * Is the one before that also a minus? combine. */
265 if (prev_last_t
== CHUNK_MINUS
) {
266 new_chunk
= &result
->head
[result
->len
- 2];
267 new_chunk
->left_count
+= chunk
->left_count
;
268 new_chunk
->right_count
+= chunk
->right_count
;
270 debug(" - added minus-chunk follows plus-chunk,"
271 " put before that plus-chunk and joined"
272 " with preceding minus-chunk:\n");
274 debug_dump_atoms(&state
->left
, new_chunk
->left_start
, new_chunk
->left_count
);
276 debug_dump_atoms(&state
->right
, new_chunk
->right_start
, new_chunk
->right_count
);
278 ARRAYLIST_INSERT(new_chunk
, *result
, result
->len
- 1);
283 /* The new minus chunk indicates to which position on
284 * the right it corresponds, even though it doesn't add
285 * any lines on the right. By moving above a plus chunk,
286 * that position on the right has shifted. */
287 last
= &result
->head
[result
->len
- 1];
288 new_chunk
->right_start
= last
->right_start
;
290 debug(" - added minus-chunk follows plus-chunk,"
291 " put before that plus-chunk\n");
294 /* That last_t == CHUNK_PLUS indicates to which position on the
295 * left it corresponds, even though it doesn't add any lines on
296 * the left. By inserting/extending the prev_last_t ==
297 * CHUNK_MINUS, that position on the left has shifted. */
298 last
= &result
->head
[result
->len
- 1];
299 last
->left_start
= new_chunk
->left_start
300 + new_chunk
->left_count
;
303 ARRAYLIST_ADD(new_chunk
, *result
);
311 /* Even if a left or right side is empty, diff output may need to know the
312 * position in that file.
313 * So left_start or right_start must never be NULL -- pass left_count or
314 * right_count as zero to indicate staying at that position without consuming
317 diff_state_add_chunk(struct diff_state
*state
, bool solved
,
318 struct diff_atom
*left_start
, unsigned int left_count
,
319 struct diff_atom
*right_start
, unsigned int right_count
)
321 struct diff_chunk
*new_chunk
;
322 struct diff_chunk chunk
= {
324 .left_start
= left_start
,
325 .left_count
= left_count
,
326 .right_start
= right_start
,
327 .right_count
= right_count
,
330 /* An unsolved chunk means store as intermediate result for later
332 * If there already are intermediate results, that means even a
333 * following solved chunk needs to go to intermediate results, so that
334 * it is later put in the final correct position in solved chunks.
336 if (!solved
|| state
->temp_result
.len
) {
337 /* Append to temp_result */
338 debug("ADD %s chunk to temp result:\n",
339 chunk
.solved
? "solved" : "UNSOLVED");
341 debug_dump_atoms(&state
->left
, left_start
, left_count
);
343 debug_dump_atoms(&state
->right
, right_start
, right_count
);
344 ARRAYLIST_ADD(new_chunk
, state
->temp_result
);
351 return diff_state_add_solved_chunk(state
, &chunk
);
355 diff_data_init_root(struct diff_data
*d
, FILE *f
, const uint8_t *data
,
356 unsigned long long len
, int diff_flags
)
358 *d
= (struct diff_data
){
364 .diff_flags
= diff_flags
,
369 diff_data_init_subsection(struct diff_data
*d
, struct diff_data
*parent
,
370 struct diff_atom
*from_atom
, unsigned int atoms_count
)
372 struct diff_atom
*last_atom
;
374 debug("diff_data %p parent %p from_atom %p atoms_count %u\n",
375 d
, parent
, from_atom
, atoms_count
);
376 debug(" from_atom ");
377 debug_dump_atom(parent
, NULL
, from_atom
);
379 if (atoms_count
== 0) {
380 *d
= (struct diff_data
){
385 .root
= parent
->root
,
387 .atoms
.len
= atoms_count
,
393 last_atom
= from_atom
+ atoms_count
- 1;
394 *d
= (struct diff_data
){
396 .pos
= from_atom
->pos
,
397 .data
= from_atom
->at
,
398 .len
= (last_atom
->pos
+ last_atom
->len
) - from_atom
->pos
,
399 .root
= parent
->root
,
400 .atoms
.head
= from_atom
,
401 .atoms
.len
= atoms_count
,
404 debug("subsection:\n");
409 diff_data_free(struct diff_data
*diff_data
)
413 if (diff_data
->atoms
.allocated
)
414 ARRAYLIST_FREE(diff_data
->atoms
);
418 diff_algo_none(const struct diff_algo_config
*algo_config
,
419 struct diff_state
*state
)
421 debug("\n** %s\n", __func__
);
423 debug_dump(&state
->left
);
425 debug_dump(&state
->right
);
426 debug_dump_myers_graph(&state
->left
, &state
->right
, NULL
, NULL
, 0, NULL
,
429 /* Add a chunk of equal lines, if any */
430 struct diff_atom
*l
= state
->left
.atoms
.head
;
431 unsigned int l_len
= state
->left
.atoms
.len
;
432 struct diff_atom
*r
= state
->right
.atoms
.head
;
433 unsigned int r_len
= state
->right
.atoms
.len
;
434 unsigned int equal_atoms_start
= 0;
435 unsigned int equal_atoms_end
= 0;
436 unsigned int l_idx
= 0;
437 unsigned int r_idx
= 0;
439 while (equal_atoms_start
< l_len
440 && equal_atoms_start
< r_len
) {
443 err
= diff_atom_same(&same
, &l
[equal_atoms_start
],
444 &r
[equal_atoms_start
]);
451 while (equal_atoms_end
< (l_len
- equal_atoms_start
)
452 && equal_atoms_end
< (r_len
- equal_atoms_start
)) {
455 err
= diff_atom_same(&same
, &l
[l_len
- 1 - equal_atoms_end
],
456 &r
[r_len
- 1 - equal_atoms_end
]);
464 /* Add a chunk of equal lines at the start */
465 if (equal_atoms_start
) {
466 if (!diff_state_add_chunk(state
, true,
467 l
, equal_atoms_start
,
468 r
, equal_atoms_start
))
470 l_idx
+= equal_atoms_start
;
471 r_idx
+= equal_atoms_start
;
474 /* Add a "minus" chunk with all lines from the left. */
475 if (equal_atoms_start
+ equal_atoms_end
< l_len
) {
476 unsigned int add_len
= l_len
- equal_atoms_start
- equal_atoms_end
;
477 if (!diff_state_add_chunk(state
, true,
484 /* Add a "plus" chunk with all lines from the right. */
485 if (equal_atoms_start
+ equal_atoms_end
< r_len
) {
486 unsigned int add_len
= r_len
- equal_atoms_start
- equal_atoms_end
;
487 if (!diff_state_add_chunk(state
, true,
494 /* Add a chunk of equal lines at the end */
495 if (equal_atoms_end
) {
496 if (!diff_state_add_chunk(state
, true,
497 &l
[l_idx
], equal_atoms_end
,
498 &r
[r_idx
], equal_atoms_end
))
506 diff_run_algo(const struct diff_algo_config
*algo_config
,
507 struct diff_state
*state
)
511 if (!algo_config
|| !algo_config
->impl
512 || !state
->recursion_depth_left
513 || !state
->left
.atoms
.len
|| !state
->right
.atoms
.len
) {
514 debug("Fall back to diff_algo_none():%s%s%s\n",
515 (!algo_config
|| !algo_config
->impl
) ? " no-cfg" : "",
516 (!state
->recursion_depth_left
) ? " max-depth" : "",
517 (!state
->left
.atoms
.len
|| !state
->right
.atoms
.len
)?
519 return diff_algo_none(algo_config
, state
);
522 ARRAYLIST_FREE(state
->temp_result
);
523 ARRAYLIST_INIT(state
->temp_result
, DIFF_RESULT_ALLOC_BLOCKSIZE
);
524 rc
= algo_config
->impl(algo_config
, state
);
526 case DIFF_RC_USE_DIFF_ALGO_FALLBACK
:
527 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
528 algo_config
->fallback_algo
);
529 rc
= diff_run_algo(algo_config
->fallback_algo
, state
);
537 /* some error happened */
541 /* Pick up any diff chunks that are still unsolved and feed to
542 * inner_algo. inner_algo will solve unsolved chunks and append to
543 * result, and subsequent solved chunks on this level are then appended
544 * to result afterwards. */
546 for (i
= 0; i
< state
->temp_result
.len
; i
++) {
547 struct diff_chunk
*c
= &state
->temp_result
.head
[i
];
549 diff_state_add_solved_chunk(state
, c
);
553 /* c is an unsolved chunk, feed to inner_algo */
554 struct diff_state inner_state
= {
555 .result
= state
->result
,
556 .recursion_depth_left
= state
->recursion_depth_left
- 1,
557 .kd_buf
= state
->kd_buf
,
558 .kd_buf_size
= state
->kd_buf_size
,
560 diff_data_init_subsection(&inner_state
.left
, &state
->left
,
561 c
->left_start
, c
->left_count
);
562 diff_data_init_subsection(&inner_state
.right
, &state
->right
,
563 c
->right_start
, c
->right_count
);
565 rc
= diff_run_algo(algo_config
->inner_algo
, &inner_state
);
566 state
->kd_buf
= inner_state
.kd_buf
;
567 state
->kd_buf_size
= inner_state
.kd_buf_size
;
568 if (rc
!= DIFF_RC_OK
)
574 ARRAYLIST_FREE(state
->temp_result
);
579 diff_atomize_file(struct diff_data
*d
,
580 const struct diff_config
*config
,
581 FILE *f
, const uint8_t *data
, off_t len
, int diff_flags
)
583 if (!config
->atomize_func
)
586 diff_data_init_root(d
, f
, data
, len
, diff_flags
);
588 return config
->atomize_func(config
->atomize_func_data
, d
);
593 diff_main(const struct diff_config
*config
, struct diff_data
*left
,
594 struct diff_data
*right
)
596 struct diff_result
*result
= malloc(sizeof(struct diff_result
));
600 *result
= (struct diff_result
){};
602 result
->right
= right
;
604 struct diff_state state
= {
606 .recursion_depth_left
= config
->max_recursion_depth
? : UINT_MAX
,
610 diff_data_init_subsection(&state
.left
, left
,
613 diff_data_init_subsection(&state
.right
, right
,
617 result
->rc
= diff_run_algo(config
->algo
, &state
);
624 diff_result_free(struct diff_result
*result
)
628 ARRAYLIST_FREE(result
->chunks
);