compiler/libbz2/blocksort.c

   1
   2 /*-------------------------------------------------------------*/
   3 /*--- Block sorting machinery                               ---*/
   4 /*---                                           blocksort.c ---*/
   5 /*-------------------------------------------------------------*/
   6
   7 /* ------------------------------------------------------------------
   8    This file is part of bzip2/libbzip2, a program and library for
   9    lossless, block-sorting data compression.
  10
  11    bzip2/libbzip2 version 1.0.5 of 10 December 2007
  12    Copyright (C) 1996-2007 Julian Seward <jseward@bzip.org>
  13
  14    Please read the WARNING, DISCLAIMER and PATENTS sections in the
  15    README file.
  16
  17    This program is released under the terms of the license contained
  18    in the file LICENSE.
  19    ------------------------------------------------------------------ */
  20
  21
  22 #include "bzlib_private.h"
  23
  24 /*---------------------------------------------*/
  25 /*--- Fallback O(N log(N)^2) sorting        ---*/
  26 /*--- algorithm, for repetitive blocks      ---*/
  27 /*---------------------------------------------*/
  28
  29 /*---------------------------------------------*/
  30 static
  31 __inline__
  32 void fallbackSimpleSort ( UInt32* fmap,
  33                           UInt32* eclass,
  34                           Int32   lo,
  35                           Int32   hi )
  36 {
  37    Int32 i, j, tmp;
  38    UInt32 ec_tmp;
  39
  40    if (lo == hi) return;
  41
  42    if (hi - lo > 3) {
  43       for ( i = hi-4; i >= lo; i-- ) {
  44          tmp = fmap[i];
  45          ec_tmp = eclass[tmp];
  46          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
  47             fmap[j-4] = fmap[j];
  48          fmap[j-4] = tmp;
  49       }
  50    }
  51
  52    for ( i = hi-1; i >= lo; i-- ) {
  53       tmp = fmap[i];
  54       ec_tmp = eclass[tmp];
  55       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
  56          fmap[j-1] = fmap[j];
  57       fmap[j-1] = tmp;
  58    }
  59 }
  60
  61
  62 /*---------------------------------------------*/
  63 #define fswap(zz1, zz2) \
  64    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
  65
  66 #define fvswap(zzp1, zzp2, zzn)       \
  67 {                                     \
  68    Int32 yyp1 = (zzp1);               \
  69    Int32 yyp2 = (zzp2);               \
  70    Int32 yyn  = (zzn);                \
  71    while (yyn > 0) {                  \
  72       fswap(fmap[yyp1], fmap[yyp2]);  \
  73       yyp1++; yyp2++; yyn--;          \
  74    }                                  \
  75 }
  76
  77
  78 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
  79
  80 #define fpush(lz,hz) { stackLo[sp] = lz; \
  81                        stackHi[sp] = hz; \
  82                        sp++; }
  83
  84 #define fpop(lz,hz) { sp--;              \
  85                       lz = stackLo[sp];  \
  86                       hz = stackHi[sp]; }
  87
  88 #define FALLBACK_QSORT_SMALL_THRESH 10
  89 #define FALLBACK_QSORT_STACK_SIZE   100
  90
  91
  92 static
  93 void fallbackQSort3 ( UInt32* fmap,
  94                       UInt32* eclass,
  95                       Int32   loSt,
  96                       Int32   hiSt )
  97 {
  98    Int32 unLo, unHi, ltLo, gtHi, n, m;
  99    Int32 sp, lo, hi;
 100    UInt32 med, r, r3;
 101    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
 102    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
 103
 104    r = 0;
 105
 106    sp = 0;
 107    fpush ( loSt, hiSt );
 108
 109    while (sp > 0) {
 110
 111       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE - 1, 1004 );
 112
 113       fpop ( lo, hi );
 114       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
 115          fallbackSimpleSort ( fmap, eclass, lo, hi );
 116          continue;
 117       }
 118
 119       /* Random partitioning.  Median of 3 sometimes fails to
 120          avoid bad cases.  Median of 9 seems to help but
 121          looks rather expensive.  This too seems to work but
 122          is cheaper.  Guidance for the magic constants
 123          7621 and 32768 is taken from Sedgewick's algorithms
 124          book, chapter 35.
 125       */
 126       r = ((r * 7621) + 1) % 32768;
 127       r3 = r % 3;
 128       if (r3 == 0) med = eclass[fmap[lo]]; else
 129       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
 130                    med = eclass[fmap[hi]];
 131
 132       unLo = ltLo = lo;
 133       unHi = gtHi = hi;
 134
 135       while (1) {
 136          while (1) {
 137             if (unLo > unHi) break;
 138             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
 139             if (n == 0) {
 140                fswap(fmap[unLo], fmap[ltLo]);
 141                ltLo++; unLo++;
 142                continue;
 143             };
 144             if (n > 0) break;
 145             unLo++;
 146          }
 147          while (1) {
 148             if (unLo > unHi) break;
 149             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
 150             if (n == 0) {
 151                fswap(fmap[unHi], fmap[gtHi]);
 152                gtHi--; unHi--;
 153                continue;
 154             };
 155             if (n < 0) break;
 156             unHi--;
 157          }
 158          if (unLo > unHi) break;
 159          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
 160       }
 161
 162       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
 163
 164       if (gtHi < ltLo) continue;
 165
 166       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
 167       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
 168
 169       n = lo + unLo - ltLo - 1;
 170       m = hi - (gtHi - unHi) + 1;
 171
 172       if (n - lo > hi - m) {
 173          fpush ( lo, n );
 174          fpush ( m, hi );
 175       } else {
 176          fpush ( m, hi );
 177          fpush ( lo, n );
 178       }
 179    }
 180 }
 181
 182 #undef fmin
 183 #undef fpush
 184 #undef fpop
 185 #undef fswap
 186 #undef fvswap
 187 #undef FALLBACK_QSORT_SMALL_THRESH
 188 #undef FALLBACK_QSORT_STACK_SIZE
 189
 190
 191 /*---------------------------------------------*/
 192 /* Pre:
 193       nblock > 0
 194       eclass exists for [0 .. nblock-1]
 195       ((UChar*)eclass) [0 .. nblock-1] holds block
 196       ptr exists for [0 .. nblock-1]
 197
 198    Post:
 199       ((UChar*)eclass) [0 .. nblock-1] holds block
 200       All other areas of eclass destroyed
 201       fmap [0 .. nblock-1] holds sorted order
 202       bhtab [ 0 .. 2+(nblock/32) ] destroyed
 203 */
 204
 205 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
 206 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
 207 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
 208 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
 209 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
 210
 211 static
 212 void fallbackSort ( UInt32* fmap,
 213                     UInt32* eclass,
 214                     UInt32* bhtab,
 215                     Int32   nblock,
 216                     Int32   verb )
 217 {
 218    Int32 ftab[257];
 219    Int32 ftabCopy[256];
 220    Int32 H, i, j, k, l, r, cc, cc1;
 221    Int32 nNotDone;
 222    Int32 nBhtab;
 223    UChar* eclass8 = (UChar*)eclass;
 224
 225    /*--
 226       Initial 1-char radix sort to generate
 227       initial fmap and initial BH bits.
 228    --*/
 229    if (verb >= 4)
 230       VPrintf0 ( "        bucket sorting ...\n" );
 231    for (i = 0; i < 257;    i++) ftab[i] = 0;
 232    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
 233    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
 234    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
 235
 236    for (i = 0; i < nblock; i++) {
 237       j = eclass8[i];
 238       k = ftab[j] - 1;
 239       ftab[j] = k;
 240       fmap[k] = i;
 241    }
 242
 243    nBhtab = 2 + (nblock / 32);
 244    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
 245    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
 246
 247    /*--
 248       Inductively refine the buckets.  Kind-of an
 249       "exponential radix sort" (!), inspired by the
 250       Manber-Myers suffix array construction algorithm.
 251    --*/
 252
 253    /*-- set sentinel bits for block-end detection --*/
 254    for (i = 0; i < 32; i++) {
 255       SET_BH(nblock + 2*i);
 256       CLEAR_BH(nblock + 2*i + 1);
 257    }
 258
 259    /*-- the log(N) loop --*/
 260    H = 1;
 261    while (1) {
 262
 263       if (verb >= 4)
 264          VPrintf1 ( "        depth %6d has ", H );
 265
 266       j = 0;
 267       for (i = 0; i < nblock; i++) {
 268          if (ISSET_BH(i)) j = i;
 269          k = fmap[i] - H; if (k < 0) k += nblock;
 270          eclass[k] = j;
 271       }
 272
 273       nNotDone = 0;
 274       r = -1;
 275       while (1) {
 276
 277          /*-- find the next non-singleton bucket --*/
 278          k = r + 1;
 279          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 280          if (ISSET_BH(k)) {
 281             while (WORD_BH(k) == 0xffffffff) k += 32;
 282             while (ISSET_BH(k)) k++;
 283          }
 284          l = k - 1;
 285          if (l >= nblock) break;
 286          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 287          if (!ISSET_BH(k)) {
 288             while (WORD_BH(k) == 0x00000000) k += 32;
 289             while (!ISSET_BH(k)) k++;
 290          }
 291          r = k - 1;
 292          if (r >= nblock) break;
 293
 294          /*-- now [l, r] bracket current bucket --*/
 295          if (r > l) {
 296             nNotDone += (r - l + 1);
 297             fallbackQSort3 ( fmap, eclass, l, r );
 298
 299             /*-- scan bucket and generate header bits-- */
 300             cc = -1;
 301             for (i = l; i <= r; i++) {
 302                cc1 = eclass[fmap[i]];
 303                if (cc != cc1) { SET_BH(i); cc = cc1; };
 304             }
 305          }
 306       }
 307
 308       if (verb >= 4)
 309          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
 310
 311       H *= 2;
 312       if (H > nblock || nNotDone == 0) break;
 313    }
 314
 315    /*--
 316       Reconstruct the original block in
 317       eclass8 [0 .. nblock-1], since the
 318       previous phase destroyed it.
 319    --*/
 320    if (verb >= 4)
 321       VPrintf0 ( "        reconstructing block ...\n" );
 322    j = 0;
 323    for (i = 0; i < nblock; i++) {
 324       while (ftabCopy[j] == 0) j++;
 325       ftabCopy[j]--;
 326       eclass8[fmap[i]] = (UChar)j;
 327    }
 328    AssertH ( j < 256, 1005 );
 329 }
 330
 331 #undef       SET_BH
 332 #undef     CLEAR_BH
 333 #undef     ISSET_BH
 334 #undef      WORD_BH
 335 #undef UNALIGNED_BH
 336
 337
 338 /*---------------------------------------------*/
 339 /*--- The main, O(N^2 log(N)) sorting       ---*/
 340 /*--- algorithm.  Faster for "normal"       ---*/
 341 /*--- non-repetitive blocks.                ---*/
 342 /*---------------------------------------------*/
 343
 344 /*---------------------------------------------*/
 345 static
 346 __inline__
 347 Bool mainGtU ( UInt32  i1,
 348                UInt32  i2,
 349                UChar*  block,
 350                UInt16* quadrant,
 351                UInt32  nblock,
 352                Int32*  budget )
 353 {
 354    Int32  k;
 355    UChar  c1, c2;
 356    UInt16 s1, s2;
 357
 358    AssertD ( i1 != i2, "mainGtU" );
 359    /* 1 */
 360    c1 = block[i1]; c2 = block[i2];
 361    if (c1 != c2) return (c1 > c2);
 362    i1++; i2++;
 363    /* 2 */
 364    c1 = block[i1]; c2 = block[i2];
 365    if (c1 != c2) return (c1 > c2);
 366    i1++; i2++;
 367    /* 3 */
 368    c1 = block[i1]; c2 = block[i2];
 369    if (c1 != c2) return (c1 > c2);
 370    i1++; i2++;
 371    /* 4 */
 372    c1 = block[i1]; c2 = block[i2];
 373    if (c1 != c2) return (c1 > c2);
 374    i1++; i2++;
 375    /* 5 */
 376    c1 = block[i1]; c2 = block[i2];
 377    if (c1 != c2) return (c1 > c2);
 378    i1++; i2++;
 379    /* 6 */
 380    c1 = block[i1]; c2 = block[i2];
 381    if (c1 != c2) return (c1 > c2);
 382    i1++; i2++;
 383    /* 7 */
 384    c1 = block[i1]; c2 = block[i2];
 385    if (c1 != c2) return (c1 > c2);
 386    i1++; i2++;
 387    /* 8 */
 388    c1 = block[i1]; c2 = block[i2];
 389    if (c1 != c2) return (c1 > c2);
 390    i1++; i2++;
 391    /* 9 */
 392    c1 = block[i1]; c2 = block[i2];
 393    if (c1 != c2) return (c1 > c2);
 394    i1++; i2++;
 395    /* 10 */
 396    c1 = block[i1]; c2 = block[i2];
 397    if (c1 != c2) return (c1 > c2);
 398    i1++; i2++;
 399    /* 11 */
 400    c1 = block[i1]; c2 = block[i2];
 401    if (c1 != c2) return (c1 > c2);
 402    i1++; i2++;
 403    /* 12 */
 404    c1 = block[i1]; c2 = block[i2];
 405    if (c1 != c2) return (c1 > c2);
 406    i1++; i2++;
 407
 408    k = nblock + 8;
 409
 410    do {
 411       /* 1 */
 412       c1 = block[i1]; c2 = block[i2];
 413       if (c1 != c2) return (c1 > c2);
 414       s1 = quadrant[i1]; s2 = quadrant[i2];
 415       if (s1 != s2) return (s1 > s2);
 416       i1++; i2++;
 417       /* 2 */
 418       c1 = block[i1]; c2 = block[i2];
 419       if (c1 != c2) return (c1 > c2);
 420       s1 = quadrant[i1]; s2 = quadrant[i2];
 421       if (s1 != s2) return (s1 > s2);
 422       i1++; i2++;
 423       /* 3 */
 424       c1 = block[i1]; c2 = block[i2];
 425       if (c1 != c2) return (c1 > c2);
 426       s1 = quadrant[i1]; s2 = quadrant[i2];
 427       if (s1 != s2) return (s1 > s2);
 428       i1++; i2++;
 429       /* 4 */
 430       c1 = block[i1]; c2 = block[i2];
 431       if (c1 != c2) return (c1 > c2);
 432       s1 = quadrant[i1]; s2 = quadrant[i2];
 433       if (s1 != s2) return (s1 > s2);
 434       i1++; i2++;
 435       /* 5 */
 436       c1 = block[i1]; c2 = block[i2];
 437       if (c1 != c2) return (c1 > c2);
 438       s1 = quadrant[i1]; s2 = quadrant[i2];
 439       if (s1 != s2) return (s1 > s2);
 440       i1++; i2++;
 441       /* 6 */
 442       c1 = block[i1]; c2 = block[i2];
 443       if (c1 != c2) return (c1 > c2);
 444       s1 = quadrant[i1]; s2 = quadrant[i2];
 445       if (s1 != s2) return (s1 > s2);
 446       i1++; i2++;
 447       /* 7 */
 448       c1 = block[i1]; c2 = block[i2];
 449       if (c1 != c2) return (c1 > c2);
 450       s1 = quadrant[i1]; s2 = quadrant[i2];
 451       if (s1 != s2) return (s1 > s2);
 452       i1++; i2++;
 453       /* 8 */
 454       c1 = block[i1]; c2 = block[i2];
 455       if (c1 != c2) return (c1 > c2);
 456       s1 = quadrant[i1]; s2 = quadrant[i2];
 457       if (s1 != s2) return (s1 > s2);
 458       i1++; i2++;
 459
 460       if (i1 >= nblock) i1 -= nblock;
 461       if (i2 >= nblock) i2 -= nblock;
 462
 463       k -= 8;
 464       (*budget)--;
 465    }
 466       while (k >= 0);
 467
 468    return False;
 469 }
 470
 471
 472 /*---------------------------------------------*/
 473 /*--
 474    Knuth's increments seem to work better
 475    than Incerpi-Sedgewick here.  Possibly
 476    because the number of elems to sort is
 477    usually small, typically <= 20.
 478 --*/
 479 static
 480 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
 481                    9841, 29524, 88573, 265720,
 482                    797161, 2391484 };
 483
 484 static
 485 void mainSimpleSort ( UInt32* ptr,
 486                       UChar*  block,
 487                       UInt16* quadrant,
 488                       Int32   nblock,
 489                       Int32   lo,
 490                       Int32   hi,
 491                       Int32   d,
 492                       Int32*  budget )
 493 {
 494    Int32 i, j, h, bigN, hp;
 495    UInt32 v;
 496
 497    bigN = hi - lo + 1;
 498    if (bigN < 2) return;
 499
 500    hp = 0;
 501    while (incs[hp] < bigN) hp++;
 502    hp--;
 503
 504    for (; hp >= 0; hp--) {
 505       h = incs[hp];
 506
 507       i = lo + h;
 508       while (True) {
 509
 510          /*-- copy 1 --*/
 511          if (i > hi) break;
 512          v = ptr[i];
 513          j = i;
 514          while ( mainGtU (
 515                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 516                  ) ) {
 517             ptr[j] = ptr[j-h];
 518             j = j - h;
 519             if (j <= (lo + h - 1)) break;
 520          }
 521          ptr[j] = v;
 522          i++;
 523
 524          /*-- copy 2 --*/
 525          if (i > hi) break;
 526          v = ptr[i];
 527          j = i;
 528          while ( mainGtU (
 529                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 530                  ) ) {
 531             ptr[j] = ptr[j-h];
 532             j = j - h;
 533             if (j <= (lo + h - 1)) break;
 534          }
 535          ptr[j] = v;
 536          i++;
 537
 538          /*-- copy 3 --*/
 539          if (i > hi) break;
 540          v = ptr[i];
 541          j = i;
 542          while ( mainGtU (
 543                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 544                  ) ) {
 545             ptr[j] = ptr[j-h];
 546             j = j - h;
 547             if (j <= (lo + h - 1)) break;
 548          }
 549          ptr[j] = v;
 550          i++;
 551
 552          if (*budget < 0) return;
 553       }
 554    }
 555 }
 556
 557
 558 /*---------------------------------------------*/
 559 /*--
 560    The following is an implementation of
 561    an elegant 3-way quicksort for strings,
 562    described in a paper "Fast Algorithms for
 563    Sorting and Searching Strings", by Robert
 564    Sedgewick and Jon L. Bentley.
 565 --*/
 566
 567 #define mswap(zz1, zz2) \
 568    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 569
 570 #define mvswap(zzp1, zzp2, zzn)       \
 571 {                                     \
 572    Int32 yyp1 = (zzp1);               \
 573    Int32 yyp2 = (zzp2);               \
 574    Int32 yyn  = (zzn);                \
 575    while (yyn > 0) {                  \
 576       mswap(ptr[yyp1], ptr[yyp2]);    \
 577       yyp1++; yyp2++; yyn--;          \
 578    }                                  \
 579 }
 580
 581 static
 582 __inline__
 583 UChar mmed3 ( UChar a, UChar b, UChar c )
 584 {
 585    UChar t;
 586    if (a > b) { t = a; a = b; b = t; };
 587    if (b > c) {
 588       b = c;
 589       if (a > b) b = a;
 590    }
 591    return b;
 592 }
 593
 594 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
 595
 596 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
 597                           stackHi[sp] = hz; \
 598                           stackD [sp] = dz; \
 599                           sp++; }
 600
 601 #define mpop(lz,hz,dz) { sp--;             \
 602                          lz = stackLo[sp]; \
 603                          hz = stackHi[sp]; \
 604                          dz = stackD [sp]; }
 605
 606
 607 #define mnextsize(az) (nextHi[az]-nextLo[az])
 608
 609 #define mnextswap(az,bz)                                        \
 610    { Int32 tz;                                                  \
 611      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
 612      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
 613      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
 614
 615
 616 #define MAIN_QSORT_SMALL_THRESH 20
 617 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
 618 #define MAIN_QSORT_STACK_SIZE 100
 619
 620 static
 621 void mainQSort3 ( UInt32* ptr,
 622                   UChar*  block,
 623                   UInt16* quadrant,
 624                   Int32   nblock,
 625                   Int32   loSt,
 626                   Int32   hiSt,
 627                   Int32   dSt,
 628                   Int32*  budget )
 629 {
 630    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
 631    Int32 sp, lo, hi, d;
 632
 633    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
 634    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
 635    Int32 stackD [MAIN_QSORT_STACK_SIZE];
 636
 637    Int32 nextLo[3];
 638    Int32 nextHi[3];
 639    Int32 nextD [3];
 640
 641    sp = 0;
 642    mpush ( loSt, hiSt, dSt );
 643
 644    while (sp > 0) {
 645
 646       AssertH ( sp < MAIN_QSORT_STACK_SIZE - 2, 1001 );
 647
 648       mpop ( lo, hi, d );
 649       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
 650           d > MAIN_QSORT_DEPTH_THRESH) {
 651          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
 652          if (*budget < 0) return;
 653          continue;
 654       }
 655
 656       med = (Int32)
 657             mmed3 ( block[ptr[ lo         ]+d],
 658                     block[ptr[ hi         ]+d],
 659                     block[ptr[ (lo+hi)>>1 ]+d] );
 660
 661       unLo = ltLo = lo;
 662       unHi = gtHi = hi;
 663
 664       while (True) {
 665          while (True) {
 666             if (unLo > unHi) break;
 667             n = ((Int32)block[ptr[unLo]+d]) - med;
 668             if (n == 0) {
 669                mswap(ptr[unLo], ptr[ltLo]);
 670                ltLo++; unLo++; continue;
 671             };
 672             if (n >  0) break;
 673             unLo++;
 674          }
 675          while (True) {
 676             if (unLo > unHi) break;
 677             n = ((Int32)block[ptr[unHi]+d]) - med;
 678             if (n == 0) {
 679                mswap(ptr[unHi], ptr[gtHi]);
 680                gtHi--; unHi--; continue;
 681             };
 682             if (n <  0) break;
 683             unHi--;
 684          }
 685          if (unLo > unHi) break;
 686          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
 687       }
 688
 689       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
 690
 691       if (gtHi < ltLo) {
 692          mpush(lo, hi, d+1 );
 693          continue;
 694       }
 695
 696       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
 697       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
 698
 699       n = lo + unLo - ltLo - 1;
 700       m = hi - (gtHi - unHi) + 1;
 701
 702       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
 703       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
 704       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
 705
 706       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 707       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
 708       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 709
 710       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
 711       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
 712
 713       mpush (nextLo[0], nextHi[0], nextD[0]);
 714       mpush (nextLo[1], nextHi[1], nextD[1]);
 715       mpush (nextLo[2], nextHi[2], nextD[2]);
 716    }
 717 }
 718
 719 #undef mswap
 720 #undef mvswap
 721 #undef mpush
 722 #undef mpop
 723 #undef mmin
 724 #undef mnextsize
 725 #undef mnextswap
 726 #undef MAIN_QSORT_SMALL_THRESH
 727 #undef MAIN_QSORT_DEPTH_THRESH
 728 #undef MAIN_QSORT_STACK_SIZE
 729
 730
 731 /*---------------------------------------------*/
 732 /* Pre:
 733       nblock > N_OVERSHOOT
 734       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
 735       ((UChar*)block32) [0 .. nblock-1] holds block
 736       ptr exists for [0 .. nblock-1]
 737
 738    Post:
 739       ((UChar*)block32) [0 .. nblock-1] holds block
 740       All other areas of block32 destroyed
 741       ftab [0 .. 65536 ] destroyed
 742       ptr [0 .. nblock-1] holds sorted order
 743       if (*budget < 0), sorting was abandoned
 744 */
 745
 746 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
 747 #define SETMASK (1 << 21)
 748 #define CLEARMASK (~(SETMASK))
 749
 750 static
 751 void mainSort ( UInt32* ptr,
 752                 UChar*  block,
 753                 UInt16* quadrant,
 754                 UInt32* ftab,
 755                 Int32   nblock,
 756                 Int32   verb,
 757                 Int32*  budget )
 758 {
 759    Int32  i, j, k, ss, sb;
 760    Int32  runningOrder[256];
 761    Bool   bigDone[256];
 762    Int32  copyStart[256];
 763    Int32  copyEnd  [256];
 764    UChar  c1;
 765    Int32  numQSorted;
 766    UInt16 s;
 767    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
 768
 769    /*-- set up the 2-byte frequency table --*/
 770    for (i = 65536; i >= 0; i--) ftab[i] = 0;
 771
 772    j = block[0] << 8;
 773    i = nblock-1;
 774    for (; i >= 3; i -= 4) {
 775       quadrant[i] = 0;
 776       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 777       ftab[j]++;
 778       quadrant[i-1] = 0;
 779       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
 780       ftab[j]++;
 781       quadrant[i-2] = 0;
 782       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
 783       ftab[j]++;
 784       quadrant[i-3] = 0;
 785       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
 786       ftab[j]++;
 787    }
 788    for (; i >= 0; i--) {
 789       quadrant[i] = 0;
 790       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 791       ftab[j]++;
 792    }
 793
 794    /*-- (emphasises close relationship of block & quadrant) --*/
 795    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
 796       block   [nblock+i] = block[i];
 797       quadrant[nblock+i] = 0;
 798    }
 799
 800    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
 801
 802    /*-- Complete the initial radix sort --*/
 803    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
 804
 805    s = block[0] << 8;
 806    i = nblock-1;
 807    for (; i >= 3; i -= 4) {
 808       s = (s >> 8) | (block[i] << 8);
 809       j = ftab[s] -1;
 810       ftab[s] = j;
 811       ptr[j] = i;
 812       s = (s >> 8) | (block[i-1] << 8);
 813       j = ftab[s] -1;
 814       ftab[s] = j;
 815       ptr[j] = i-1;
 816       s = (s >> 8) | (block[i-2] << 8);
 817       j = ftab[s] -1;
 818       ftab[s] = j;
 819       ptr[j] = i-2;
 820       s = (s >> 8) | (block[i-3] << 8);
 821       j = ftab[s] -1;
 822       ftab[s] = j;
 823       ptr[j] = i-3;
 824    }
 825    for (; i >= 0; i--) {
 826       s = (s >> 8) | (block[i] << 8);
 827       j = ftab[s] -1;
 828       ftab[s] = j;
 829       ptr[j] = i;
 830    }
 831
 832    /*--
 833       Now ftab contains the first loc of every small bucket.
 834       Calculate the running order, from smallest to largest
 835       big bucket.
 836    --*/
 837    for (i = 0; i <= 255; i++) {
 838       bigDone     [i] = False;
 839       runningOrder[i] = i;
 840    }
 841
 842    {
 843       Int32 vv;
 844       Int32 h = 1;
 845       do h = 3 * h + 1; while (h <= 256);
 846       do {
 847          h = h / 3;
 848          for (i = h; i <= 255; i++) {
 849             vv = runningOrder[i];
 850             j = i;
 851             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
 852                runningOrder[j] = runningOrder[j-h];
 853                j = j - h;
 854                if (j <= (h - 1)) goto zero;
 855             }
 856             zero:
 857             runningOrder[j] = vv;
 858          }
 859       } while (h != 1);
 860    }
 861
 862    /*--
 863       The main sorting loop.
 864    --*/
 865
 866    numQSorted = 0;
 867
 868    for (i = 0; i <= 255; i++) {
 869
 870       /*--
 871          Process big buckets, starting with the least full.
 872          Basically this is a 3-step process in which we call
 873          mainQSort3 to sort the small buckets [ss, j], but
 874          also make a big effort to avoid the calls if we can.
 875       --*/
 876       ss = runningOrder[i];
 877
 878       /*--
 879          Step 1:
 880          Complete the big bucket [ss] by quicksorting
 881          any unsorted small buckets [ss, j], for j != ss.
 882          Hopefully previous pointer-scanning phases have already
 883          completed many of the small buckets [ss, j], so
 884          we don't have to sort them at all.
 885       --*/
 886       for (j = 0; j <= 255; j++) {
 887          if (j != ss) {
 888             sb = (ss << 8) + j;
 889             if ( ! (ftab[sb] & SETMASK) ) {
 890                Int32 lo = ftab[sb]   & CLEARMASK;
 891                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
 892                if (hi > lo) {
 893                   if (verb >= 4)
 894                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
 895                                 "done %d   this %d\n",
 896                                 ss, j, numQSorted, hi - lo + 1 );
 897                   mainQSort3 (
 898                      ptr, block, quadrant, nblock,
 899                      lo, hi, BZ_N_RADIX, budget
 900                   );
 901                   numQSorted += (hi - lo + 1);
 902                   if (*budget < 0) return;
 903                }
 904             }
 905             ftab[sb] |= SETMASK;
 906          }
 907       }
 908
 909       AssertH ( !bigDone[ss], 1006 );
 910
 911       /*--
 912          Step 2:
 913          Now scan this big bucket [ss] so as to synthesise the
 914          sorted order for small buckets [t, ss] for all t,
 915          including, magically, the bucket [ss,ss] too.
 916          This will avoid doing Real Work in subsequent Step 1's.
 917       --*/
 918       {
 919          for (j = 0; j <= 255; j++) {
 920             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
 921             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
 922          }
 923          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
 924             k = ptr[j]-1; if (k < 0) k += nblock;
 925             c1 = block[k];
 926             if (!bigDone[c1])
 927                ptr[ copyStart[c1]++ ] = k;
 928          }
 929          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
 930             k = ptr[j]-1; if (k < 0) k += nblock;
 931             c1 = block[k];
 932             if (!bigDone[c1])
 933                ptr[ copyEnd[c1]-- ] = k;
 934          }
 935       }
 936
 937       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
 938                 ||
 939                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
 940                    Necessity for this case is demonstrated by compressing
 941                    a sequence of approximately 48.5 million of character
 942                    251; 1.0.0/1.0.1 will then die here. */
 943                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
 944                 1007 )
 945
 946       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
 947
 948       /*--
 949          Step 3:
 950          The [ss] big bucket is now done.  Record this fact,
 951          and update the quadrant descriptors.  Remember to
 952          update quadrants in the overshoot area too, if
 953          necessary.  The "if (i < 255)" test merely skips
 954          this updating for the last bucket processed, since
 955          updating for the last bucket is pointless.
 956
 957          The quadrant array provides a way to incrementally
 958          cache sort orderings, as they appear, so as to
 959          make subsequent comparisons in fullGtU() complete
 960          faster.  For repetitive blocks this makes a big
 961          difference (but not big enough to be able to avoid
 962          the fallback sorting mechanism, exponential radix sort).
 963
 964          The precise meaning is: at all times:
 965
 966             for 0 <= i < nblock and 0 <= j <= nblock
 967
 968             if block[i] != block[j],
 969
 970                then the relative values of quadrant[i] and
 971                     quadrant[j] are meaningless.
 972
 973                else {
 974                   if quadrant[i] < quadrant[j]
 975                      then the string starting at i lexicographically
 976                      precedes the string starting at j
 977
 978                   else if quadrant[i] > quadrant[j]
 979                      then the string starting at j lexicographically
 980                      precedes the string starting at i
 981
 982                   else
 983                      the relative ordering of the strings starting
 984                      at i and j has not yet been determined.
 985                }
 986       --*/
 987       bigDone[ss] = True;
 988
 989       if (i < 255) {
 990          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
 991          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
 992          Int32 shifts   = 0;
 993
 994          while ((bbSize >> shifts) > 65534) shifts++;
 995
 996          for (j = bbSize-1; j >= 0; j--) {
 997             Int32 a2update     = ptr[bbStart + j];
 998             UInt16 qVal        = (UInt16)(j >> shifts);
 999             quadrant[a2update] = qVal;
1000             if (a2update < BZ_N_OVERSHOOT)
1001                quadrant[a2update + nblock] = qVal;
1002          }
1003          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1004       }
1005
1006    }
1007
1008    if (verb >= 4)
1009       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1010                  nblock, numQSorted, nblock - numQSorted );
1011 }
1012
1013 #undef BIGFREQ
1014 #undef SETMASK
1015 #undef CLEARMASK
1016
1017
1018 /*---------------------------------------------*/
1019 /* Pre:
1020       nblock > 0
1021       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1022       ((UChar*)arr2)  [0 .. nblock-1] holds block
1023       arr1 exists for [0 .. nblock-1]
1024
1025    Post:
1026       ((UChar*)arr2) [0 .. nblock-1] holds block
1027       All other areas of block destroyed
1028       ftab [ 0 .. 65536 ] destroyed
1029       arr1 [0 .. nblock-1] holds sorted order
1030 */
1031 void BZ2_blockSort ( EState* s )
1032 {
1033    UInt32* ptr    = s->ptr;
1034    UChar*  block  = s->block;
1035    UInt32* ftab   = s->ftab;
1036    Int32   nblock = s->nblock;
1037    Int32   verb   = s->verbosity;
1038    Int32   wfact  = s->workFactor;
1039    UInt16* quadrant;
1040    Int32   budget;
1041    Int32   budgetInit;
1042    Int32   i;
1043
1044    if (nblock < 10000) {
1045       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1046    } else {
1047       /* Calculate the location for quadrant, remembering to get
1048          the alignment right.  Assumes that &(block[0]) is at least
1049          2-byte aligned -- this should be ok since block is really
1050          the first section of arr2.
1051       */
1052       i = nblock+BZ_N_OVERSHOOT;
1053       if (i & 1) i++;
1054       quadrant = (UInt16*)(&(block[i]));
1055
1056       /* (wfact-1) / 3 puts the default-factor-30
1057          transition point at very roughly the same place as
1058          with v0.1 and v0.9.0.
1059          Not that it particularly matters any more, since the
1060          resulting compressed stream is now the same regardless
1061          of whether or not we use the main sort or fallback sort.
1062       */
1063       if (wfact < 1  ) wfact = 1;
1064       if (wfact > 100) wfact = 100;
1065       budgetInit = nblock * ((wfact-1) / 3);
1066       budget = budgetInit;
1067
1068       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1069       if (verb >= 3)
1070          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1071                     budgetInit - budget,
1072                     nblock,
1073                     (float)(budgetInit - budget) /
1074                     (float)(nblock==0 ? 1 : nblock) );
1075       if (budget < 0) {
1076          if (verb >= 2)
1077             VPrintf0 ( "    too repetitive; using fallback"
1078                        " sorting algorithm\n" );
1079          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1080       }
1081    }
1082
1083    s->origPtr = -1;
1084    for (i = 0; i < s->nblock; i++)
1085       if (ptr[i] == 0)
1086          { s->origPtr = i; break; };
1087
1088    AssertH( s->origPtr != -1, 1003 );
1089 }
1090
1091
1092 /*-------------------------------------------------------------*/
1093 /*--- end                                       blocksort.c ---*/
1094 /*-------------------------------------------------------------*/