1 /* Implementation of the Patience Diff algorithm invented by Bram Cohen:
2 * Divide a diff problem into smaller chunks by an LCS (Longest Common Sequence)
3 * of common-unique lines. */
5 * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
7 * Permission to use, copy, modify, and distribute this software for any
8 * purpose with or without fee is hereby granted, provided that the above
9 * copyright notice and this permission notice appear in all copies.
11 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
27 #include <arraylist.h>
28 #include <diff_main.h>
30 #include "diff_internal.h"
31 #include "diff_debug.h"
33 /* Algorithm to find unique lines:
34 * 0: stupidly iterate atoms
38 #define UNIQUE_STRATEGY 1
40 /* Per-atom state for the Patience Diff algorithm */
41 struct atom_patience
{
42 #if UNIQUE_STRATEGY == 0
46 struct diff_atom
*pos_in_other
;
47 struct diff_atom
*prev_stack
;
48 struct diff_range identical_lines
;
51 /* A diff_atom has a backpointer to the root diff_data. That points to the
52 * current diff_data, a possibly smaller section of the root. That current
53 * diff_data->algo_data is a pointer to an array of struct atom_patience. The
54 * atom's index in current diff_data gives the index in the atom_patience array.
56 #define PATIENCE(ATOM) \
57 (((struct atom_patience*)((ATOM)->root->current->algo_data))\
58 [diff_atom_idx((ATOM)->root->current, ATOM)])
60 #if UNIQUE_STRATEGY == 0
62 /* Stupid iteration and comparison of all atoms */
64 diff_atoms_mark_unique(struct diff_data
*d
, unsigned int *unique_count
)
67 unsigned int count
= 0;
68 diff_data_foreach_atom(i
, d
) {
69 PATIENCE(i
).unique_here
= true;
70 PATIENCE(i
).unique_in_both
= true;
73 diff_data_foreach_atom(i
, d
) {
76 if (!PATIENCE(i
).unique_here
)
79 diff_data_foreach_atom_from(i
+ 1, j
, d
) {
81 int r
= diff_atom_same(&same
, i
, j
);
86 if (PATIENCE(i
).unique_here
) {
87 PATIENCE(i
).unique_here
= false;
88 PATIENCE(i
).unique_in_both
= false;
91 PATIENCE(j
).unique_here
= false;
92 PATIENCE(j
).unique_in_both
= false;
97 *unique_count
= count
;
101 /* Mark those lines as PATIENCE(atom).unique_in_both = true that appear exactly
102 * once in each side. */
104 diff_atoms_mark_unique_in_both(struct diff_data
*left
, struct diff_data
*right
,
105 unsigned int *unique_in_both_count
)
107 /* Derive the final unique_in_both count without needing an explicit
108 * iteration. So this is just some optimiziation to save one iteration
110 unsigned int unique_in_both
;
113 r
= diff_atoms_mark_unique(left
, &unique_in_both
);
116 r
= diff_atoms_mark_unique(right
, NULL
);
120 debug("unique_in_both %u\n", unique_in_both
);
123 diff_data_foreach_atom(i
, left
) {
124 if (!PATIENCE(i
).unique_here
)
128 diff_data_foreach_atom(j
, right
) {
130 int r
= diff_atom_same(&same
, i
, j
);
135 if (!PATIENCE(j
).unique_here
) {
136 found_in_b
= 2; /* or more */
140 PATIENCE(j
).pos_in_other
= i
;
141 PATIENCE(i
).pos_in_other
= j
;
145 if (found_in_b
== 0 || found_in_b
> 1) {
146 PATIENCE(i
).unique_in_both
= false;
148 debug("unique_in_both %u (%d) ", unique_in_both
,
150 debug_dump_atom(left
, NULL
, i
);
154 /* Still need to unmark right[*]->patience.unique_in_both for atoms that
155 * don't exist in left */
156 diff_data_foreach_atom(i
, right
) {
157 if (!PATIENCE(i
).unique_here
158 || !PATIENCE(i
).unique_in_both
)
161 bool found_in_a
= false;
162 diff_data_foreach_atom(j
, left
) {
165 if (!PATIENCE(j
).unique_in_both
)
167 r
= diff_atom_same(&same
, i
, j
);
177 PATIENCE(i
).unique_in_both
= false;
180 if (unique_in_both_count
)
181 *unique_in_both_count
= unique_in_both
;
185 #else /* UNIQUE_STRATEGY != 0 */
187 /* Use an optimized sorting algorithm (qsort, mergesort) to find unique lines */
189 static int diff_atoms_compar(const void *_a
, const void *_b
)
191 const struct diff_atom
*a
= *(struct diff_atom
**)_a
;
192 const struct diff_atom
*b
= *(struct diff_atom
**)_b
;
196 /* If there's been an error (e.g. I/O error) in a previous compar, we
197 * have no way to abort the sort but just report the rc and stop
198 * comparing. Make sure to catch errors on either side. If atoms are
199 * from more than one diff_data, make sure the error, if any, spreads
200 * to all of them, so we can cut short all future comparisons. */
208 /* just return 'equal' to not swap more positions */
212 /* Sort by the simplistic hash */
213 if (a
->hash
< b
->hash
)
215 if (a
->hash
> b
->hash
)
218 /* If hashes are the same, the lines may still differ. Do a full cmp. */
219 rc
= diff_atom_cmp(&cmp
, a
, b
);
222 /* Mark the I/O error so that the caller can find out about it.
223 * For the case atoms are from more than one diff_data, mark in
226 if (a
->root
!= b
->root
)
234 /* Sort an array of struct diff_atom* in-place. */
235 static int diff_atoms_sort(struct diff_atom
*atoms
[],
238 #if UNIQUE_STRATEGY == 1
239 qsort(atoms
, atoms_count
, sizeof(struct diff_atom
*), diff_atoms_compar
);
241 mergesort(atoms
, atoms_count
, sizeof(struct diff_atom
*),
244 return atoms
[0]->root
->err
;
248 diff_atoms_mark_unique_in_both(struct diff_data
*left
, struct diff_data
*right
,
249 unsigned int *unique_in_both_count_p
)
253 struct diff_atom
**all_atoms
;
254 unsigned int len
= 0;
256 unsigned int unique_in_both_count
= 0;
259 all_atoms
= calloc(left
->atoms
.len
+ right
->atoms
.len
,
260 sizeof(struct diff_atom
*));
261 if (all_atoms
== NULL
)
267 right
->root
->err
= 0;
268 diff_data_foreach_atom(a
, left
) {
269 all_atoms
[len
++] = a
;
271 diff_data_foreach_atom(b
, right
) {
272 all_atoms
[len
++] = b
;
275 rc
= diff_atoms_sort(all_atoms
, len
);
279 /* Now we have a sorted array of atom pointers. All similar lines are
280 * adjacent. Walk through the array and mark those that are unique on
281 * each side, but exist once in both sources. */
282 for (i
= 0; i
< len
; i
++) {
284 unsigned int next_differing_i
;
285 unsigned int last_identical_i
;
287 unsigned int count_first_side
= 1;
288 unsigned int count_other_side
= 0;
291 debug_dump_atom(a
->root
, NULL
, a
);
293 /* Do as few diff_atom_cmp() as possible: first walk forward
294 * only using the cheap hash as indicator for differing atoms;
295 * then walk backwards until hitting an identical atom. */
296 for (next_differing_i
= i
+ 1; next_differing_i
< len
;
297 next_differing_i
++) {
298 b
= all_atoms
[next_differing_i
];
299 if (a
->hash
!= b
->hash
)
302 for (last_identical_i
= next_differing_i
- 1;
303 last_identical_i
> i
;
304 last_identical_i
--) {
305 b
= all_atoms
[last_identical_i
];
306 rc
= diff_atom_same(&same
, a
, b
);
312 next_differing_i
= last_identical_i
+ 1;
314 for (j
= i
+1; j
< next_differing_i
; j
++) {
316 /* A following atom is the same. See on which side the
317 * repetition counts. */
318 if (a
->root
== b
->root
)
323 debug_dump_atom(b
->root
, NULL
, b
);
324 debug(" count_first_side=%d count_other_side=%d\n",
325 count_first_side
, count_other_side
);
328 /* Counted a section of similar atoms, put the results back to
330 if ((count_first_side
== 1)
331 && (count_other_side
== 1)) {
333 PATIENCE(a
).unique_in_both
= true;
334 PATIENCE(a
).pos_in_other
= b
;
335 PATIENCE(b
).unique_in_both
= true;
336 PATIENCE(b
).pos_in_other
= a
;
337 unique_in_both_count
++;
340 /* j now points at the first atom after 'a' that is not
341 * identical to 'a'. j is always > i. */
344 *unique_in_both_count_p
= unique_in_both_count
;
350 #endif /* UNIQUE_STRATEGY != 0 */
352 /* binary search to find the stack to put this atom "card" on. */
354 find_target_stack(struct diff_atom
*atom
,
355 struct diff_atom
**patience_stacks
,
356 unsigned int patience_stacks_count
)
359 unsigned int hi
= patience_stacks_count
;
361 unsigned int mid
= (lo
+ hi
) >> 1;
363 if (PATIENCE(patience_stacks
[mid
]).pos_in_other
364 < PATIENCE(atom
).pos_in_other
)
372 /* Among the lines that appear exactly once in each side, find the longest
373 * streak that appear in both files in the same order (with other stuff allowed
374 * to interleave). Use patience sort for that, as in the Patience Diff
376 * See https://bramcohen.livejournal.com/73318.html and, for a much more
377 * detailed explanation,
378 * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */
380 diff_algo_patience(const struct diff_algo_config
*algo_config
,
381 struct diff_state
*state
)
384 struct diff_data
*left
= &state
->left
;
385 struct diff_data
*right
= &state
->right
;
386 struct atom_patience
*atom_patience_left
=
387 calloc(left
->atoms
.len
, sizeof(struct atom_patience
));
388 struct atom_patience
*atom_patience_right
=
389 calloc(right
->atoms
.len
, sizeof(struct atom_patience
));
390 unsigned int unique_in_both_count
;
391 struct diff_atom
**lcs
= NULL
;
393 debug("\n** %s\n", __func__
);
395 left
->root
->current
= left
;
396 right
->root
->current
= right
;
397 left
->algo_data
= atom_patience_left
;
398 right
->algo_data
= atom_patience_right
;
400 /* Find those lines that appear exactly once in 'left' and exactly once
402 rc
= diff_atoms_mark_unique_in_both(left
, right
, &unique_in_both_count
);
406 debug("unique_in_both_count %u\n", unique_in_both_count
);
412 if (!unique_in_both_count
) {
413 /* Cannot apply Patience, tell the caller to use fallback_algo
415 rc
= DIFF_RC_USE_DIFF_ALGO_FALLBACK
;
421 /* An array of Longest Common Sequence is the result of the below
423 unsigned int lcs_count
= 0;
424 struct diff_atom
*lcs_tail
= NULL
;
427 /* This subscope marks the lifetime of the atom_pointers
430 /* One chunk of storage for atom pointers */
431 struct diff_atom
**atom_pointers
;
432 atom_pointers
= recallocarray(NULL
, 0, unique_in_both_count
* 2,
433 sizeof(struct diff_atom
*));
434 if (atom_pointers
== NULL
)
436 /* Half for the list of atoms that still need to be put on
438 struct diff_atom
**uniques
= atom_pointers
;
440 /* Half for the patience sort state's "card stacks" -- we
441 * remember only each stack's topmost "card" */
442 struct diff_atom
**patience_stacks
;
443 patience_stacks
= atom_pointers
+ unique_in_both_count
;
444 unsigned int patience_stacks_count
= 0;
446 /* Take all common, unique items from 'left' ... */
448 struct diff_atom
*atom
;
449 struct diff_atom
**uniques_end
= uniques
;
450 diff_data_foreach_atom(atom
, left
) {
451 if (!PATIENCE(atom
).unique_in_both
)
457 /* ...and sort them to the order found in 'right'.
458 * The idea is to find the leftmost stack that has a higher line
459 * number and add it to the stack's top.
460 * If there is no such stack, open a new one on the right. The
461 * line number is derived from the atom*, which are array items
462 * and hence reflect the relative position in the source file.
463 * So we got the common-uniques from 'left' and sort them
464 * according to PATIENCE(atom).pos_in_other. */
466 for (i
= 0; i
< unique_in_both_count
; i
++) {
468 unsigned int target_stack
;
469 target_stack
= find_target_stack(atom
, patience_stacks
,
470 patience_stacks_count
);
471 assert(target_stack
<= patience_stacks_count
);
472 patience_stacks
[target_stack
] = atom
;
473 if (target_stack
== patience_stacks_count
)
474 patience_stacks_count
++;
476 /* Record a back reference to the next stack on the
477 * left, which will form the final longest sequence
479 PATIENCE(atom
).prev_stack
= target_stack
?
480 patience_stacks
[target_stack
- 1] : NULL
;
484 for (xx
= 0; xx
< patience_stacks_count
; xx
++) {
486 (xx
== target_stack
) ? ">" : "",
488 PATIENCE(patience_stacks
[xx
]).pos_in_other
));
494 /* backtrace through prev_stack references to form the final
495 * longest common sequence */
496 lcs_tail
= patience_stacks
[patience_stacks_count
- 1];
497 lcs_count
= patience_stacks_count
;
499 /* uniques and patience_stacks are no longer needed.
500 * Backpointers are in PATIENCE(atom).prev_stack */
504 lcs
= recallocarray(NULL
, 0, lcs_count
, sizeof(struct diff_atom
*));
505 struct diff_atom
**lcs_backtrace_pos
= &lcs
[lcs_count
- 1];
506 struct diff_atom
*atom
;
507 for (atom
= lcs_tail
; atom
; atom
= PATIENCE(atom
).prev_stack
, lcs_backtrace_pos
--) {
508 assert(lcs_backtrace_pos
>= lcs
);
509 *lcs_backtrace_pos
= atom
;
514 debug("\npatience LCS:\n");
515 for (i
= 0; i
< lcs_count
; i
++) {
516 debug("\n L "); debug_dump_atom(left
, right
, lcs
[i
]);
517 debug(" R "); debug_dump_atom(right
, left
,
518 PATIENCE(lcs
[i
]).pos_in_other
);
523 /* TODO: For each common-unique line found (now listed in lcs), swallow
524 * lines upwards and downwards that are identical on each side. Requires
525 * a way to represent atoms being glued to adjacent atoms. */
527 debug("\ntraverse LCS, possibly recursing:\n");
529 /* Now we have pinned positions in both files at which it makes sense to
530 * divide the diff problem into smaller chunks. Go into the next round:
531 * look at each section in turn, trying to again find common-unique
532 * lines in those smaller sections. As soon as no more are found, the
533 * remaining smaller sections are solved by Myers. */
534 /* left_pos and right_pos are indexes in left/right->atoms.head until
535 * which the atoms are already handled (added to result chunks). */
536 unsigned int left_pos
= 0;
537 unsigned int right_pos
= 0;
538 for (i
= 0; i
<= lcs_count
; i
++) {
539 struct diff_atom
*atom
;
540 struct diff_atom
*atom_r
;
541 /* left_idx and right_idx are indexes of the start of this
542 * section of identical lines on both sides.
543 * left_pos marks the index of the first still unhandled line,
544 * left_idx is the start of an identical section some way
545 * further down, and this loop adds an unsolved chunk of
546 * [left_pos..left_idx[ and a solved chunk of
547 * [left_idx..identical_lines.end[. */
548 unsigned int left_idx
;
549 unsigned int right_idx
;
551 debug("iteration %u of %u left_pos %u right_pos %u\n",
552 i
, lcs_count
, left_pos
, right_pos
);
556 atom_r
= PATIENCE(atom
).pos_in_other
;
557 debug("lcs[%u] = left[%u] = right[%u]\n", i
,
558 diff_atom_idx(left
, atom
), diff_atom_idx(right
, atom_r
));
559 left_idx
= diff_atom_idx(left
, atom
);
560 right_idx
= diff_atom_idx(right
, atom_r
);
562 /* There are no more identical lines until the end of
566 left_idx
= left
->atoms
.len
;
567 right_idx
= right
->atoms
.len
;
570 /* 'atom' (if not NULL) now marks an atom that matches on both
571 * sides according to patience-diff (a common-unique identical
572 * atom in both files).
573 * Handle the section before and the atom itself; the section
574 * after will be handled by the next loop iteration -- note that
575 * i loops to last element + 1 ("i <= lcs_count"), so that there
576 * will be another final iteration to pick up the last remaining
577 * items after the last LCS atom.
580 debug("iteration %u left_pos %u left_idx %u"
581 " right_pos %u right_idx %u\n",
582 i
, left_pos
, left_idx
, right_pos
, right_idx
);
584 /* Section before the matching atom */
585 struct diff_atom
*left_atom
= &left
->atoms
.head
[left_pos
];
586 unsigned int left_section_len
= left_idx
- left_pos
;
588 struct diff_atom
*right_atom
= &(right
->atoms
.head
[right_pos
]);
589 unsigned int right_section_len
= right_idx
- right_pos
;
591 if (left_section_len
&& right_section_len
) {
592 /* Record an unsolved chunk, the caller will apply
593 * inner_algo() on this chunk. */
594 if (!diff_state_add_chunk(state
, false,
595 left_atom
, left_section_len
,
599 } else if (left_section_len
&& !right_section_len
) {
600 /* Only left atoms and none on the right, they form a
601 * "minus" chunk, then. */
602 if (!diff_state_add_chunk(state
, true,
603 left_atom
, left_section_len
,
606 } else if (!left_section_len
&& right_section_len
) {
607 /* No left atoms, only atoms on the right, they form a
608 * "plus" chunk, then. */
609 if (!diff_state_add_chunk(state
, true,
611 right_atom
, right_section_len
))
614 /* else: left_section_len == 0 and right_section_len == 0, i.e.
617 /* The atom found to match on both sides forms a chunk of equals
618 * on each side. In the very last iteration of this loop, there
619 * is no matching atom, we were just cleaning out the remaining
623 ok
= diff_state_add_chunk(state
, true,
625 PATIENCE(atom
).pos_in_other
, 1);
629 left_pos
= left_idx
+ 1;
630 right_pos
= right_idx
+ 1;
631 debug("end of iteration %u left_pos %u left_idx %u"
632 " right_pos %u right_idx %u\n",
633 i
, left_pos
, left_idx
, right_pos
, right_idx
);
635 debug("** END %s\n", __func__
);
640 left
->root
->current
= NULL
;
641 right
->root
->current
= NULL
;
642 free(atom_patience_left
);
643 free(atom_patience_right
);