1 /* Generic infrastructure to implement various diff algorithms (implementation). */
3 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 #include <sys/queue.h>
32 #include <arraylist.h>
33 #include <diff_main.h>
35 #include "got_compat.h"
37 #include "diff_internal.h"
38 #include "diff_debug.h"
40 inline enum diff_chunk_type
41 diff_chunk_type(const struct diff_chunk
*chunk
)
43 if (!chunk
->left_count
&& !chunk
->right_count
)
47 if (!chunk
->right_count
)
49 if (!chunk
->left_count
)
51 if (chunk
->left_count
!= chunk
->right_count
)
57 read_at(FILE *f
, off_t at_pos
, unsigned char *buf
, size_t len
)
60 if (fseeko(f
, at_pos
, SEEK_SET
) == -1)
62 r
= fread(buf
, sizeof(char), len
, f
);
63 if ((r
== 0 || r
< len
) && ferror(f
))
71 buf_cmp(const unsigned char *left
, size_t left_len
,
72 const unsigned char *right
, size_t right_len
,
73 bool ignore_whitespace
)
77 if (ignore_whitespace
) {
79 while (il
< left_len
&& ir
< right_len
) {
80 unsigned char cl
= left
[il
];
81 unsigned char cr
= right
[ir
];
83 if (isspace((unsigned char)cl
) && il
< left_len
) {
87 if (isspace((unsigned char)cr
) && ir
< right_len
) {
99 while (il
< left_len
) {
100 unsigned char cl
= left
[il
++];
101 if (!isspace((unsigned char)cl
))
104 while (ir
< right_len
) {
105 unsigned char cr
= right
[ir
++];
106 if (!isspace((unsigned char)cr
))
113 cmp
= memcmp(left
, right
, MIN(left_len
, right_len
));
116 if (left_len
== right_len
)
118 return (left_len
> right_len
) ? 1 : -1;
122 diff_atom_cmp(int *cmp
,
123 const struct diff_atom
*left
,
124 const struct diff_atom
*right
)
126 off_t remain_left
, remain_right
;
127 int flags
= (left
->root
->diff_flags
| right
->root
->diff_flags
);
128 bool ignore_whitespace
= (flags
& DIFF_FLAG_IGNORE_WHITESPACE
);
130 if (!left
->len
&& !right
->len
) {
134 if (!ignore_whitespace
) {
145 if (left
->at
!= NULL
&& right
->at
!= NULL
) {
146 *cmp
= buf_cmp(left
->at
, left
->len
, right
->at
, right
->len
,
151 remain_left
= left
->len
;
152 remain_right
= right
->len
;
153 while (remain_left
> 0 || remain_right
> 0) {
154 const size_t chunksz
= 8192;
155 unsigned char buf_left
[chunksz
], buf_right
[chunksz
];
156 const uint8_t *p_left
, *p_right
;
157 off_t n_left
, n_right
;
169 n_left
= MIN(chunksz
, remain_left
);
170 n_right
= MIN(chunksz
, remain_right
);
172 if (left
->at
== NULL
) {
173 r
= read_at(left
->root
->f
,
174 left
->pos
+ (left
->len
- remain_left
),
182 p_left
= left
->at
+ (left
->len
- remain_left
);
185 if (right
->at
== NULL
) {
186 r
= read_at(right
->root
->f
,
187 right
->pos
+ (right
->len
- remain_right
),
195 p_right
= right
->at
+ (right
->len
- remain_right
);
198 r
= buf_cmp(p_left
, n_left
, p_right
, n_right
,
205 remain_left
-= n_left
;
206 remain_right
-= n_right
;
214 diff_atom_same(bool *same
,
215 const struct diff_atom
*left
,
216 const struct diff_atom
*right
)
220 if (left
->hash
!= right
->hash
) {
224 r
= diff_atom_cmp(&cmp
, left
, right
);
233 static struct diff_chunk
*
234 diff_state_add_solved_chunk(struct diff_state
*state
,
235 const struct diff_chunk
*chunk
)
237 diff_chunk_arraylist_t
*result
;
238 struct diff_chunk
*new_chunk
;
239 enum diff_chunk_type last_t
;
240 enum diff_chunk_type new_t
;
241 struct diff_chunk
*last
;
243 /* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
244 * never directly follows a plus chunk. */
245 result
= &state
->result
->chunks
;
247 last_t
= result
->len
? diff_chunk_type(&result
->head
[result
->len
- 1])
249 new_t
= diff_chunk_type(chunk
);
251 debug("ADD %s chunk #%u:\n", chunk
->solved
? "solved" : "UNSOLVED",
254 debug_dump_atoms(&state
->left
, chunk
->left_start
, chunk
->left_count
);
256 debug_dump_atoms(&state
->right
, chunk
->right_start
, chunk
->right_count
);
259 last
= &result
->head
[result
->len
- 1];
260 assert(chunk
->left_start
261 == last
->left_start
+ last
->left_count
);
262 assert(chunk
->right_start
263 == last
->right_start
+ last
->right_count
);
266 if (new_t
== last_t
) {
267 new_chunk
= &result
->head
[result
->len
- 1];
268 new_chunk
->left_count
+= chunk
->left_count
;
269 new_chunk
->right_count
+= chunk
->right_count
;
270 debug(" - added chunk touches previous one of same type, joined:\n");
272 debug_dump_atoms(&state
->left
, new_chunk
->left_start
, new_chunk
->left_count
);
274 debug_dump_atoms(&state
->right
, new_chunk
->right_start
, new_chunk
->right_count
);
275 } else if (last_t
== CHUNK_PLUS
&& new_t
== CHUNK_MINUS
) {
276 enum diff_chunk_type prev_last_t
=
278 diff_chunk_type(&result
->head
[result
->len
- 2])
280 /* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
281 * Is the one before that also a minus? combine. */
282 if (prev_last_t
== CHUNK_MINUS
) {
283 new_chunk
= &result
->head
[result
->len
- 2];
284 new_chunk
->left_count
+= chunk
->left_count
;
285 new_chunk
->right_count
+= chunk
->right_count
;
287 debug(" - added minus-chunk follows plus-chunk,"
288 " put before that plus-chunk and joined"
289 " with preceding minus-chunk:\n");
291 debug_dump_atoms(&state
->left
, new_chunk
->left_start
, new_chunk
->left_count
);
293 debug_dump_atoms(&state
->right
, new_chunk
->right_start
, new_chunk
->right_count
);
295 ARRAYLIST_INSERT(new_chunk
, *result
, result
->len
- 1);
300 /* The new minus chunk indicates to which position on
301 * the right it corresponds, even though it doesn't add
302 * any lines on the right. By moving above a plus chunk,
303 * that position on the right has shifted. */
304 last
= &result
->head
[result
->len
- 1];
305 new_chunk
->right_start
= last
->right_start
;
307 debug(" - added minus-chunk follows plus-chunk,"
308 " put before that plus-chunk\n");
311 /* That last_t == CHUNK_PLUS indicates to which position on the
312 * left it corresponds, even though it doesn't add any lines on
313 * the left. By inserting/extending the prev_last_t ==
314 * CHUNK_MINUS, that position on the left has shifted. */
315 last
= &result
->head
[result
->len
- 1];
316 last
->left_start
= new_chunk
->left_start
317 + new_chunk
->left_count
;
320 ARRAYLIST_ADD(new_chunk
, *result
);
328 /* Even if a left or right side is empty, diff output may need to know the
329 * position in that file.
330 * So left_start or right_start must never be NULL -- pass left_count or
331 * right_count as zero to indicate staying at that position without consuming
334 diff_state_add_chunk(struct diff_state
*state
, bool solved
,
335 struct diff_atom
*left_start
, unsigned int left_count
,
336 struct diff_atom
*right_start
, unsigned int right_count
)
338 struct diff_chunk
*new_chunk
;
339 struct diff_chunk chunk
= {
341 .left_start
= left_start
,
342 .left_count
= left_count
,
343 .right_start
= right_start
,
344 .right_count
= right_count
,
347 /* An unsolved chunk means store as intermediate result for later
349 * If there already are intermediate results, that means even a
350 * following solved chunk needs to go to intermediate results, so that
351 * it is later put in the final correct position in solved chunks.
353 if (!solved
|| state
->temp_result
.len
) {
354 /* Append to temp_result */
355 debug("ADD %s chunk to temp result:\n",
356 chunk
.solved
? "solved" : "UNSOLVED");
358 debug_dump_atoms(&state
->left
, left_start
, left_count
);
360 debug_dump_atoms(&state
->right
, right_start
, right_count
);
361 ARRAYLIST_ADD(new_chunk
, state
->temp_result
);
368 return diff_state_add_solved_chunk(state
, &chunk
);
372 diff_data_init_root(struct diff_data
*d
, FILE *f
, const uint8_t *data
,
373 unsigned long long len
, int diff_flags
)
375 *d
= (struct diff_data
){
381 .diff_flags
= diff_flags
,
386 diff_data_init_subsection(struct diff_data
*d
, struct diff_data
*parent
,
387 struct diff_atom
*from_atom
, unsigned int atoms_count
)
389 struct diff_atom
*last_atom
;
391 debug("diff_data %p parent %p from_atom %p atoms_count %u\n",
392 d
, parent
, from_atom
, atoms_count
);
393 debug(" from_atom ");
394 debug_dump_atom(parent
, NULL
, from_atom
);
396 if (atoms_count
== 0) {
397 *d
= (struct diff_data
){
402 .root
= parent
->root
,
404 .atoms
.len
= atoms_count
,
410 last_atom
= from_atom
+ atoms_count
- 1;
411 *d
= (struct diff_data
){
413 .pos
= from_atom
->pos
,
414 .data
= from_atom
->at
,
415 .len
= (last_atom
->pos
+ last_atom
->len
) - from_atom
->pos
,
416 .root
= parent
->root
,
417 .atoms
.head
= from_atom
,
418 .atoms
.len
= atoms_count
,
421 debug("subsection:\n");
426 diff_data_free(struct diff_data
*diff_data
)
430 if (diff_data
->atoms
.allocated
)
431 ARRAYLIST_FREE(diff_data
->atoms
);
435 diff_algo_none(const struct diff_algo_config
*algo_config
,
436 struct diff_state
*state
)
438 debug("\n** %s\n", __func__
);
440 debug_dump(&state
->left
);
442 debug_dump(&state
->right
);
443 debug_dump_myers_graph(&state
->left
, &state
->right
, NULL
, NULL
, 0, NULL
,
446 /* Add a chunk of equal lines, if any */
447 struct diff_atom
*l
= state
->left
.atoms
.head
;
448 unsigned int l_len
= state
->left
.atoms
.len
;
449 struct diff_atom
*r
= state
->right
.atoms
.head
;
450 unsigned int r_len
= state
->right
.atoms
.len
;
451 unsigned int equal_atoms_start
= 0;
452 unsigned int equal_atoms_end
= 0;
453 unsigned int l_idx
= 0;
454 unsigned int r_idx
= 0;
456 while (equal_atoms_start
< l_len
457 && equal_atoms_start
< r_len
) {
460 err
= diff_atom_same(&same
, &l
[equal_atoms_start
],
461 &r
[equal_atoms_start
]);
468 while (equal_atoms_end
< (l_len
- equal_atoms_start
)
469 && equal_atoms_end
< (r_len
- equal_atoms_start
)) {
472 err
= diff_atom_same(&same
, &l
[l_len
- 1 - equal_atoms_end
],
473 &r
[r_len
- 1 - equal_atoms_end
]);
481 /* Add a chunk of equal lines at the start */
482 if (equal_atoms_start
) {
483 if (!diff_state_add_chunk(state
, true,
484 l
, equal_atoms_start
,
485 r
, equal_atoms_start
))
487 l_idx
+= equal_atoms_start
;
488 r_idx
+= equal_atoms_start
;
491 /* Add a "minus" chunk with all lines from the left. */
492 if (equal_atoms_start
+ equal_atoms_end
< l_len
) {
493 unsigned int add_len
= l_len
- equal_atoms_start
- equal_atoms_end
;
494 if (!diff_state_add_chunk(state
, true,
501 /* Add a "plus" chunk with all lines from the right. */
502 if (equal_atoms_start
+ equal_atoms_end
< r_len
) {
503 unsigned int add_len
= r_len
- equal_atoms_start
- equal_atoms_end
;
504 if (!diff_state_add_chunk(state
, true,
511 /* Add a chunk of equal lines at the end */
512 if (equal_atoms_end
) {
513 if (!diff_state_add_chunk(state
, true,
514 &l
[l_idx
], equal_atoms_end
,
515 &r
[r_idx
], equal_atoms_end
))
523 diff_run_algo(const struct diff_algo_config
*algo_config
,
524 struct diff_state
*state
)
528 if (!algo_config
|| !algo_config
->impl
529 || !state
->recursion_depth_left
530 || !state
->left
.atoms
.len
|| !state
->right
.atoms
.len
) {
531 debug("Fall back to diff_algo_none():%s%s%s\n",
532 (!algo_config
|| !algo_config
->impl
) ? " no-cfg" : "",
533 (!state
->recursion_depth_left
) ? " max-depth" : "",
534 (!state
->left
.atoms
.len
|| !state
->right
.atoms
.len
)?
536 return diff_algo_none(algo_config
, state
);
539 ARRAYLIST_FREE(state
->temp_result
);
540 ARRAYLIST_INIT(state
->temp_result
, DIFF_RESULT_ALLOC_BLOCKSIZE
);
541 rc
= algo_config
->impl(algo_config
, state
);
543 case DIFF_RC_USE_DIFF_ALGO_FALLBACK
:
544 debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
545 algo_config
->fallback_algo
);
546 rc
= diff_run_algo(algo_config
->fallback_algo
, state
);
554 /* some error happened */
558 /* Pick up any diff chunks that are still unsolved and feed to
559 * inner_algo. inner_algo will solve unsolved chunks and append to
560 * result, and subsequent solved chunks on this level are then appended
561 * to result afterwards. */
563 for (i
= 0; i
< state
->temp_result
.len
; i
++) {
564 struct diff_chunk
*c
= &state
->temp_result
.head
[i
];
566 diff_state_add_solved_chunk(state
, c
);
570 /* c is an unsolved chunk, feed to inner_algo */
571 struct diff_state inner_state
= {
572 .result
= state
->result
,
573 .recursion_depth_left
= state
->recursion_depth_left
- 1,
574 .kd_buf
= state
->kd_buf
,
575 .kd_buf_size
= state
->kd_buf_size
,
577 diff_data_init_subsection(&inner_state
.left
, &state
->left
,
578 c
->left_start
, c
->left_count
);
579 diff_data_init_subsection(&inner_state
.right
, &state
->right
,
580 c
->right_start
, c
->right_count
);
582 rc
= diff_run_algo(algo_config
->inner_algo
, &inner_state
);
583 state
->kd_buf
= inner_state
.kd_buf
;
584 state
->kd_buf_size
= inner_state
.kd_buf_size
;
585 if (rc
!= DIFF_RC_OK
)
591 ARRAYLIST_FREE(state
->temp_result
);
596 diff_atomize_file(struct diff_data
*d
,
597 const struct diff_config
*config
,
598 FILE *f
, const uint8_t *data
, off_t len
, int diff_flags
)
600 if (!config
->atomize_func
)
603 diff_data_init_root(d
, f
, data
, len
, diff_flags
);
605 return config
->atomize_func(config
->atomize_func_data
, d
);
610 diff_main(const struct diff_config
*config
, struct diff_data
*left
,
611 struct diff_data
*right
)
613 struct diff_result
*result
= malloc(sizeof(struct diff_result
));
617 *result
= (struct diff_result
){};
619 result
->right
= right
;
621 struct diff_state state
= {
623 .recursion_depth_left
= config
->max_recursion_depth
?
624 config
->max_recursion_depth
: UINT_MAX
,
628 diff_data_init_subsection(&state
.left
, left
,
631 diff_data_init_subsection(&state
.right
, right
,
635 result
->rc
= diff_run_algo(config
->algo
, &state
);
642 diff_result_free(struct diff_result
*result
)
646 ARRAYLIST_FREE(result
->chunks
);
651 diff_result_contains_printable_chunks(struct diff_result
*result
)
653 struct diff_chunk
*c
;
654 enum diff_chunk_type t
;
657 for (i
= 0; i
< result
->chunks
.len
; i
++) {
658 c
= &result
->chunks
.head
[i
];
659 t
= diff_chunk_type(c
);
660 if (t
== CHUNK_MINUS
|| t
== CHUNK_PLUS
)