diffcore-delta.c

   1 #include "git-compat-util.h"
   2 #include "diffcore.h"
   3
   4 /*
   5  * Idea here is very simple.
   6  *
   7  * Almost all data we are interested in are text, but sometimes we have
   8  * to deal with binary data.  So we cut them into chunks delimited by
   9  * LF byte, or 64-byte sequence, whichever comes first, and hash them.
  10  *
  11  * For each such chunk, if the source buffer has more instances of it
  12  * than the destination buffer, the difference is the number of
  13  * bytes not copied from source to destination.  If the
  14  * counts are the same, everything was copied from source to
  15  * destination.  If the destination has more, everything was copied,
  16  * and the destination added more.
  17  *
  18  * We are doing an approximation so we do not really have to waste
  19  * memory by actually storing the sequence.  We just hash them into
  20  * somewhere around 2^16 hashbuckets and count the occurrences.
  21  */
  22
  23 /* Wild guess at the initial hash size */
  24 #define INITIAL_HASH_SIZE 9
  25
  26 /* We leave more room in smaller hash but do not let it
  27  * grow to have unused hole too much.
  28  */
  29 #define INITIAL_FREE(sz_log2) ((1<<(sz_log2))*(sz_log2-3)/(sz_log2))
  30
  31 /* A prime rather carefully chosen between 2^16..2^17, so that
  32  * HASHBASE < INITIAL_FREE(17).  We want to keep the maximum hashtable
  33  * size under the current 2<<17 maximum, which can hold this many
  34  * different values before overflowing to hashtable of size 2<<18.
  35  */
  36 #define HASHBASE 107927
  37
  38 struct spanhash {
  39         unsigned int hashval;
  40         unsigned int cnt;
  41 };
  42 struct spanhash_top {
  43         int alloc_log2;
  44         int free;
  45         struct spanhash data[FLEX_ARRAY];
  46 };
  47
  48 static struct spanhash_top *spanhash_rehash(struct spanhash_top *orig)
  49 {
  50         struct spanhash_top *new_spanhash;
  51         int i;
  52         int osz = 1 << orig->alloc_log2;
  53         int sz = osz << 1;
  54
  55         new_spanhash = xmalloc(st_add(sizeof(*orig),
  56                              st_mult(sizeof(struct spanhash), sz)));
  57         new_spanhash->alloc_log2 = orig->alloc_log2 + 1;
  58         new_spanhash->free = INITIAL_FREE(new_spanhash->alloc_log2);
  59         memset(new_spanhash->data, 0, sizeof(struct spanhash) * sz);
  60         for (i = 0; i < osz; i++) {
  61                 struct spanhash *o = &(orig->data[i]);
  62                 int bucket;
  63                 if (!o->cnt)
  64                         continue;
  65                 bucket = o->hashval & (sz - 1);
  66                 while (1) {
  67                         struct spanhash *h = &(new_spanhash->data[bucket++]);
  68                         if (!h->cnt) {
  69                                 h->hashval = o->hashval;
  70                                 h->cnt = o->cnt;
  71                                 new_spanhash->free--;
  72                                 break;
  73                         }
  74                         if (sz <= bucket)
  75                                 bucket = 0;
  76                 }
  77         }
  78         free(orig);
  79         return new_spanhash;
  80 }
  81
  82 static struct spanhash_top *add_spanhash(struct spanhash_top *top,
  83                                          unsigned int hashval, int cnt)
  84 {
  85         int bucket, lim;
  86         struct spanhash *h;
  87
  88         lim = (1 << top->alloc_log2);
  89         bucket = hashval & (lim - 1);
  90         while (1) {
  91                 h = &(top->data[bucket++]);
  92                 if (!h->cnt) {
  93                         h->hashval = hashval;
  94                         h->cnt = cnt;
  95                         top->free--;
  96                         if (top->free < 0)
  97                                 return spanhash_rehash(top);
  98                         return top;
  99                 }
 100                 if (h->hashval == hashval) {
 101                         h->cnt += cnt;
 102                         return top;
 103                 }
 104                 if (lim <= bucket)
 105                         bucket = 0;
 106         }
 107 }
 108
 109 static int spanhash_cmp(const void *a_, const void *b_)
 110 {
 111         const struct spanhash *a = a_;
 112         const struct spanhash *b = b_;
 113
 114         /* A count of zero compares at the end.. */
 115         if (!a->cnt)
 116                 return !b->cnt ? 0 : 1;
 117         if (!b->cnt)
 118                 return -1;
 119         return a->hashval < b->hashval ? -1 :
 120                 a->hashval > b->hashval ? 1 : 0;
 121 }
 122
 123 static struct spanhash_top *hash_chars(struct repository *r,
 124                                        struct diff_filespec *one)
 125 {
 126         int i, n;
 127         unsigned int accum1, accum2, hashval;
 128         struct spanhash_top *hash;
 129         unsigned char *buf = one->data;
 130         unsigned int sz = one->size;
 131         int is_text = !diff_filespec_is_binary(r, one);
 132
 133         i = INITIAL_HASH_SIZE;
 134         hash = xmalloc(st_add(sizeof(*hash),
 135                               st_mult(sizeof(struct spanhash), (size_t)1 << i)));
 136         hash->alloc_log2 = i;
 137         hash->free = INITIAL_FREE(i);
 138         memset(hash->data, 0, sizeof(struct spanhash) * ((size_t)1 << i));
 139
 140         n = 0;
 141         accum1 = accum2 = 0;
 142         while (sz) {
 143                 unsigned int c = *buf++;
 144                 unsigned int old_1 = accum1;
 145                 sz--;
 146
 147                 /* Ignore CR in CRLF sequence if text */
 148                 if (is_text && c == '\r' && sz && *buf == '\n')
 149                         continue;
 150
 151                 accum1 = (accum1 << 7) ^ (accum2 >> 25);
 152                 accum2 = (accum2 << 7) ^ (old_1 >> 25);
 153                 accum1 += c;
 154                 if (++n < 64 && c != '\n')
 155                         continue;
 156                 hashval = (accum1 + accum2 * 0x61) % HASHBASE;
 157                 hash = add_spanhash(hash, hashval, n);
 158                 n = 0;
 159                 accum1 = accum2 = 0;
 160         }
 161         if (n > 0) {
 162                 hashval = (accum1 + accum2 * 0x61) % HASHBASE;
 163                 hash = add_spanhash(hash, hashval, n);
 164         }
 165         QSORT(hash->data, (size_t)1ul << hash->alloc_log2, spanhash_cmp);
 166         return hash;
 167 }
 168
 169 int diffcore_count_changes(struct repository *r,
 170                            struct diff_filespec *src,
 171                            struct diff_filespec *dst,
 172                            void **src_count_p,
 173                            void **dst_count_p,
 174                            unsigned long *src_copied,
 175                            unsigned long *literal_added)
 176 {
 177         struct spanhash *s, *d;
 178         struct spanhash_top *src_count, *dst_count;
 179         unsigned long sc, la;
 180
 181         src_count = dst_count = NULL;
 182         if (src_count_p)
 183                 src_count = *src_count_p;
 184         if (!src_count) {
 185                 src_count = hash_chars(r, src);
 186                 if (src_count_p)
 187                         *src_count_p = src_count;
 188         }
 189         if (dst_count_p)
 190                 dst_count = *dst_count_p;
 191         if (!dst_count) {
 192                 dst_count = hash_chars(r, dst);
 193                 if (dst_count_p)
 194                         *dst_count_p = dst_count;
 195         }
 196         sc = la = 0;
 197
 198         s = src_count->data;
 199         d = dst_count->data;
 200         for (;;) {
 201                 unsigned dst_cnt, src_cnt;
 202                 if (!s->cnt)
 203                         break; /* we checked all in src */
 204                 while (d->cnt) {
 205                         if (d->hashval >= s->hashval)
 206                                 break;
 207                         la += d->cnt;
 208                         d++;
 209                 }
 210                 src_cnt = s->cnt;
 211                 dst_cnt = 0;
 212                 if (d->cnt && d->hashval == s->hashval) {
 213                         dst_cnt = d->cnt;
 214                         d++;
 215                 }
 216                 if (src_cnt < dst_cnt) {
 217                         la += dst_cnt - src_cnt;
 218                         sc += src_cnt;
 219                 }
 220                 else
 221                         sc += dst_cnt;
 222                 s++;
 223         }
 224         while (d->cnt) {
 225                 la += d->cnt;
 226                 d++;
 227         }
 228
 229         if (!src_count_p)
 230                 free(src_count);
 231         if (!dst_count_p)
 232                 free(dst_count);
 233         *src_copied = sc;
 234         *literal_added = la;
 235         return 0;
 236 }