lib/Support/bzip2/blocksort.c

   1
   2 /*-------------------------------------------------------------*/
   3 /*--- Block sorting machinery                               ---*/
   4 /*---                                           blocksort.c ---*/
   5 /*-------------------------------------------------------------*/
   6
   7 /*--
   8   This file is a part of bzip2 and/or libbzip2, a program and
   9   library for lossless, block-sorting data compression.
  10
  11   Copyright (C) 1996-2002 Julian R Seward.  All rights reserved.
  12
  13   Redistribution and use in source and binary forms, with or without
  14   modification, are permitted provided that the following conditions
  15   are met:
  16
  17   1. Redistributions of source code must retain the above copyright
  18      notice, this list of conditions and the following disclaimer.
  19
  20   2. The origin of this software must not be misrepresented; you must
  21      not claim that you wrote the original software.  If you use this
  22      software in a product, an acknowledgment in the product
  23      documentation would be appreciated but is not required.
  24
  25   3. Altered source versions must be plainly marked as such, and must
  26      not be misrepresented as being the original software.
  27
  28   4. The name of the author may not be used to endorse or promote
  29      products derived from this software without specific prior written
  30      permission.
  31
  32   THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
  33   OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  34   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  35   ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  36   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  37   DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  38   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  39   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  40   WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  41   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  42   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43
  44   Julian Seward, Cambridge, UK.
  45   jseward@acm.org
  46   bzip2/libbzip2 version 1.0 of 21 March 2000
  47
  48   This program is based on (at least) the work of:
  49      Mike Burrows
  50      David Wheeler
  51      Peter Fenwick
  52      Alistair Moffat
  53      Radford Neal
  54      Ian H. Witten
  55      Robert Sedgewick
  56      Jon L. Bentley
  57
  58   For more information on these sources, see the manual.
  59
  60   To get some idea how the block sorting algorithms in this file
  61   work, read my paper
  62      On the Performance of BWT Sorting Algorithms
  63   in Proceedings of the IEEE Data Compression Conference 2000,
  64   Snowbird, Utah, USA, 27-30 March 2000.  The main sort in this
  65   file implements the algorithm called  cache  in the paper.
  66 --*/
  67
  68
  69 #include "bzlib_private.h"
  70
  71 /*---------------------------------------------*/
  72 /*--- Fallback O(N log(N)^2) sorting        ---*/
  73 /*--- algorithm, for repetitive blocks      ---*/
  74 /*---------------------------------------------*/
  75
  76 /*---------------------------------------------*/
  77 static
  78 __inline__
  79 void fallbackSimpleSort ( UInt32* fmap,
  80                           UInt32* eclass,
  81                           Int32   lo,
  82                           Int32   hi )
  83 {
  84    Int32 i, j, tmp;
  85    UInt32 ec_tmp;
  86
  87    if (lo == hi) return;
  88
  89    if (hi - lo > 3) {
  90       for ( i = hi-4; i >= lo; i-- ) {
  91          tmp = fmap[i];
  92          ec_tmp = eclass[tmp];
  93          for ( j = i+4; j <= hi && ec_tmp > eclass[fmap[j]]; j += 4 )
  94             fmap[j-4] = fmap[j];
  95          fmap[j-4] = tmp;
  96       }
  97    }
  98
  99    for ( i = hi-1; i >= lo; i-- ) {
 100       tmp = fmap[i];
 101       ec_tmp = eclass[tmp];
 102       for ( j = i+1; j <= hi && ec_tmp > eclass[fmap[j]]; j++ )
 103          fmap[j-1] = fmap[j];
 104       fmap[j-1] = tmp;
 105    }
 106 }
 107
 108
 109 /*---------------------------------------------*/
 110 #define fswap(zz1, zz2) \
 111    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 112
 113 #define fvswap(zzp1, zzp2, zzn)       \
 114 {                                     \
 115    Int32 yyp1 = (zzp1);               \
 116    Int32 yyp2 = (zzp2);               \
 117    Int32 yyn  = (zzn);                \
 118    while (yyn > 0) {                  \
 119       fswap(fmap[yyp1], fmap[yyp2]);  \
 120       yyp1++; yyp2++; yyn--;          \
 121    }                                  \
 122 }
 123
 124
 125 #define fmin(a,b) ((a) < (b)) ? (a) : (b)
 126
 127 #define fpush(lz,hz) { stackLo[sp] = lz; \
 128                        stackHi[sp] = hz; \
 129                        sp++; }
 130
 131 #define fpop(lz,hz) { sp--;              \
 132                       lz = stackLo[sp];  \
 133                       hz = stackHi[sp]; }
 134
 135 #define FALLBACK_QSORT_SMALL_THRESH 10
 136 #define FALLBACK_QSORT_STACK_SIZE   100
 137
 138
 139 static
 140 void fallbackQSort3 ( UInt32* fmap,
 141                       UInt32* eclass,
 142                       Int32   loSt,
 143                       Int32   hiSt )
 144 {
 145    Int32 unLo, unHi, ltLo, gtHi, n, m;
 146    Int32 sp, lo, hi;
 147    UInt32 med, r, r3;
 148    Int32 stackLo[FALLBACK_QSORT_STACK_SIZE];
 149    Int32 stackHi[FALLBACK_QSORT_STACK_SIZE];
 150
 151    r = 0;
 152
 153    sp = 0;
 154    fpush ( loSt, hiSt );
 155
 156    while (sp > 0) {
 157
 158       AssertH ( sp < FALLBACK_QSORT_STACK_SIZE, 1004 );
 159
 160       fpop ( lo, hi );
 161       if (hi - lo < FALLBACK_QSORT_SMALL_THRESH) {
 162          fallbackSimpleSort ( fmap, eclass, lo, hi );
 163          continue;
 164       }
 165
 166       /* Random partitioning.  Median of 3 sometimes fails to
 167          avoid bad cases.  Median of 9 seems to help but
 168          looks rather expensive.  This too seems to work but
 169          is cheaper.  Guidance for the magic constants
 170          7621 and 32768 is taken from Sedgewick's algorithms
 171          book, chapter 35.
 172       */
 173       r = ((r * 7621) + 1) % 32768;
 174       r3 = r % 3;
 175       if (r3 == 0) med = eclass[fmap[lo]]; else
 176       if (r3 == 1) med = eclass[fmap[(lo+hi)>>1]]; else
 177                    med = eclass[fmap[hi]];
 178
 179       unLo = ltLo = lo;
 180       unHi = gtHi = hi;
 181
 182       while (1) {
 183          while (1) {
 184             if (unLo > unHi) break;
 185             n = (Int32)eclass[fmap[unLo]] - (Int32)med;
 186             if (n == 0) {
 187                fswap(fmap[unLo], fmap[ltLo]);
 188                ltLo++; unLo++;
 189                continue;
 190             };
 191             if (n > 0) break;
 192             unLo++;
 193          }
 194          while (1) {
 195             if (unLo > unHi) break;
 196             n = (Int32)eclass[fmap[unHi]] - (Int32)med;
 197             if (n == 0) {
 198                fswap(fmap[unHi], fmap[gtHi]);
 199                gtHi--; unHi--;
 200                continue;
 201             };
 202             if (n < 0) break;
 203             unHi--;
 204          }
 205          if (unLo > unHi) break;
 206          fswap(fmap[unLo], fmap[unHi]); unLo++; unHi--;
 207       }
 208
 209       AssertD ( unHi == unLo-1, "fallbackQSort3(2)" );
 210
 211       if (gtHi < ltLo) continue;
 212
 213       n = fmin(ltLo-lo, unLo-ltLo); fvswap(lo, unLo-n, n);
 214       m = fmin(hi-gtHi, gtHi-unHi); fvswap(unLo, hi-m+1, m);
 215
 216       n = lo + unLo - ltLo - 1;
 217       m = hi - (gtHi - unHi) + 1;
 218
 219       if (n - lo > hi - m) {
 220          fpush ( lo, n );
 221          fpush ( m, hi );
 222       } else {
 223          fpush ( m, hi );
 224          fpush ( lo, n );
 225       }
 226    }
 227 }
 228
 229 #undef fmin
 230 #undef fpush
 231 #undef fpop
 232 #undef fswap
 233 #undef fvswap
 234 #undef FALLBACK_QSORT_SMALL_THRESH
 235 #undef FALLBACK_QSORT_STACK_SIZE
 236
 237
 238 /*---------------------------------------------*/
 239 /* Pre:
 240       nblock > 0
 241       eclass exists for [0 .. nblock-1]
 242       ((UChar*)eclass) [0 .. nblock-1] holds block
 243       ptr exists for [0 .. nblock-1]
 244
 245    Post:
 246       ((UChar*)eclass) [0 .. nblock-1] holds block
 247       All other areas of eclass destroyed
 248       fmap [0 .. nblock-1] holds sorted order
 249       bhtab [ 0 .. 2+(nblock/32) ] destroyed
 250 */
 251
 252 #define       SET_BH(zz)  bhtab[(zz) >> 5] |= (1 << ((zz) & 31))
 253 #define     CLEAR_BH(zz)  bhtab[(zz) >> 5] &= ~(1 << ((zz) & 31))
 254 #define     ISSET_BH(zz)  (bhtab[(zz) >> 5] & (1 << ((zz) & 31)))
 255 #define      WORD_BH(zz)  bhtab[(zz) >> 5]
 256 #define UNALIGNED_BH(zz)  ((zz) & 0x01f)
 257
 258 static
 259 void fallbackSort ( UInt32* fmap,
 260                     UInt32* eclass,
 261                     UInt32* bhtab,
 262                     Int32   nblock,
 263                     Int32   verb )
 264 {
 265    Int32 ftab[257];
 266    Int32 ftabCopy[256];
 267    Int32 H, i, j, k, l, r, cc, cc1;
 268    Int32 nNotDone;
 269    Int32 nBhtab;
 270    UChar* eclass8 = (UChar*)eclass;
 271
 272    /*--
 273       Initial 1-char radix sort to generate
 274       initial fmap and initial BH bits.
 275    --*/
 276    if (verb >= 4)
 277       VPrintf0 ( "        bucket sorting ...\n" );
 278    for (i = 0; i < 257;    i++) ftab[i] = 0;
 279    for (i = 0; i < nblock; i++) ftab[eclass8[i]]++;
 280    for (i = 0; i < 256;    i++) ftabCopy[i] = ftab[i];
 281    for (i = 1; i < 257;    i++) ftab[i] += ftab[i-1];
 282
 283    for (i = 0; i < nblock; i++) {
 284       j = eclass8[i];
 285       k = ftab[j] - 1;
 286       ftab[j] = k;
 287       fmap[k] = i;
 288    }
 289
 290    nBhtab = 2 + (nblock / 32);
 291    for (i = 0; i < nBhtab; i++) bhtab[i] = 0;
 292    for (i = 0; i < 256; i++) SET_BH(ftab[i]);
 293
 294    /*--
 295       Inductively refine the buckets.  Kind-of an
 296       "exponential radix sort" (!), inspired by the
 297       Manber-Myers suffix array construction algorithm.
 298    --*/
 299
 300    /*-- set sentinel bits for block-end detection --*/
 301    for (i = 0; i < 32; i++) {
 302       SET_BH(nblock + 2*i);
 303       CLEAR_BH(nblock + 2*i + 1);
 304    }
 305
 306    /*-- the log(N) loop --*/
 307    H = 1;
 308    while (1) {
 309
 310       if (verb >= 4)
 311          VPrintf1 ( "        depth %6d has ", H );
 312
 313       j = 0;
 314       for (i = 0; i < nblock; i++) {
 315          if (ISSET_BH(i)) j = i;
 316          k = fmap[i] - H; if (k < 0) k += nblock;
 317          eclass[k] = j;
 318       }
 319
 320       nNotDone = 0;
 321       r = -1;
 322       while (1) {
 323
 324          /*-- find the next non-singleton bucket --*/
 325          k = r + 1;
 326          while (ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 327          if (ISSET_BH(k)) {
 328             while (WORD_BH(k) == 0xffffffff) k += 32;
 329             while (ISSET_BH(k)) k++;
 330          }
 331          l = k - 1;
 332          if (l >= nblock) break;
 333          while (!ISSET_BH(k) && UNALIGNED_BH(k)) k++;
 334          if (!ISSET_BH(k)) {
 335             while (WORD_BH(k) == 0x00000000) k += 32;
 336             while (!ISSET_BH(k)) k++;
 337          }
 338          r = k - 1;
 339          if (r >= nblock) break;
 340
 341          /*-- now [l, r] bracket current bucket --*/
 342          if (r > l) {
 343             nNotDone += (r - l + 1);
 344             fallbackQSort3 ( fmap, eclass, l, r );
 345
 346             /*-- scan bucket and generate header bits-- */
 347             cc = -1;
 348             for (i = l; i <= r; i++) {
 349                cc1 = eclass[fmap[i]];
 350                if (cc != cc1) { SET_BH(i); cc = cc1; };
 351             }
 352          }
 353       }
 354
 355       if (verb >= 4)
 356          VPrintf1 ( "%6d unresolved strings\n", nNotDone );
 357
 358       H *= 2;
 359       if (H > nblock || nNotDone == 0) break;
 360    }
 361
 362    /*--
 363       Reconstruct the original block in
 364       eclass8 [0 .. nblock-1], since the
 365       previous phase destroyed it.
 366    --*/
 367    if (verb >= 4)
 368       VPrintf0 ( "        reconstructing block ...\n" );
 369    j = 0;
 370    for (i = 0; i < nblock; i++) {
 371       while (ftabCopy[j] == 0) j++;
 372       ftabCopy[j]--;
 373       eclass8[fmap[i]] = (UChar)j;
 374    }
 375    AssertH ( j < 256, 1005 );
 376 }
 377
 378 #undef       SET_BH
 379 #undef     CLEAR_BH
 380 #undef     ISSET_BH
 381 #undef      WORD_BH
 382 #undef UNALIGNED_BH
 383
 384
 385 /*---------------------------------------------*/
 386 /*--- The main, O(N^2 log(N)) sorting       ---*/
 387 /*--- algorithm.  Faster for "normal"       ---*/
 388 /*--- non-repetitive blocks.                ---*/
 389 /*---------------------------------------------*/
 390
 391 /*---------------------------------------------*/
 392 static
 393 __inline__
 394 Bool mainGtU ( UInt32  i1,
 395                UInt32  i2,
 396                UChar*  block,
 397                UInt16* quadrant,
 398                UInt32  nblock,
 399                Int32*  budget )
 400 {
 401    Int32  k;
 402    UChar  c1, c2;
 403    UInt16 s1, s2;
 404
 405    AssertD ( i1 != i2, "mainGtU" );
 406    /* 1 */
 407    c1 = block[i1]; c2 = block[i2];
 408    if (c1 != c2) return (c1 > c2);
 409    i1++; i2++;
 410    /* 2 */
 411    c1 = block[i1]; c2 = block[i2];
 412    if (c1 != c2) return (c1 > c2);
 413    i1++; i2++;
 414    /* 3 */
 415    c1 = block[i1]; c2 = block[i2];
 416    if (c1 != c2) return (c1 > c2);
 417    i1++; i2++;
 418    /* 4 */
 419    c1 = block[i1]; c2 = block[i2];
 420    if (c1 != c2) return (c1 > c2);
 421    i1++; i2++;
 422    /* 5 */
 423    c1 = block[i1]; c2 = block[i2];
 424    if (c1 != c2) return (c1 > c2);
 425    i1++; i2++;
 426    /* 6 */
 427    c1 = block[i1]; c2 = block[i2];
 428    if (c1 != c2) return (c1 > c2);
 429    i1++; i2++;
 430    /* 7 */
 431    c1 = block[i1]; c2 = block[i2];
 432    if (c1 != c2) return (c1 > c2);
 433    i1++; i2++;
 434    /* 8 */
 435    c1 = block[i1]; c2 = block[i2];
 436    if (c1 != c2) return (c1 > c2);
 437    i1++; i2++;
 438    /* 9 */
 439    c1 = block[i1]; c2 = block[i2];
 440    if (c1 != c2) return (c1 > c2);
 441    i1++; i2++;
 442    /* 10 */
 443    c1 = block[i1]; c2 = block[i2];
 444    if (c1 != c2) return (c1 > c2);
 445    i1++; i2++;
 446    /* 11 */
 447    c1 = block[i1]; c2 = block[i2];
 448    if (c1 != c2) return (c1 > c2);
 449    i1++; i2++;
 450    /* 12 */
 451    c1 = block[i1]; c2 = block[i2];
 452    if (c1 != c2) return (c1 > c2);
 453    i1++; i2++;
 454
 455    k = nblock + 8;
 456
 457    do {
 458       /* 1 */
 459       c1 = block[i1]; c2 = block[i2];
 460       if (c1 != c2) return (c1 > c2);
 461       s1 = quadrant[i1]; s2 = quadrant[i2];
 462       if (s1 != s2) return (s1 > s2);
 463       i1++; i2++;
 464       /* 2 */
 465       c1 = block[i1]; c2 = block[i2];
 466       if (c1 != c2) return (c1 > c2);
 467       s1 = quadrant[i1]; s2 = quadrant[i2];
 468       if (s1 != s2) return (s1 > s2);
 469       i1++; i2++;
 470       /* 3 */
 471       c1 = block[i1]; c2 = block[i2];
 472       if (c1 != c2) return (c1 > c2);
 473       s1 = quadrant[i1]; s2 = quadrant[i2];
 474       if (s1 != s2) return (s1 > s2);
 475       i1++; i2++;
 476       /* 4 */
 477       c1 = block[i1]; c2 = block[i2];
 478       if (c1 != c2) return (c1 > c2);
 479       s1 = quadrant[i1]; s2 = quadrant[i2];
 480       if (s1 != s2) return (s1 > s2);
 481       i1++; i2++;
 482       /* 5 */
 483       c1 = block[i1]; c2 = block[i2];
 484       if (c1 != c2) return (c1 > c2);
 485       s1 = quadrant[i1]; s2 = quadrant[i2];
 486       if (s1 != s2) return (s1 > s2);
 487       i1++; i2++;
 488       /* 6 */
 489       c1 = block[i1]; c2 = block[i2];
 490       if (c1 != c2) return (c1 > c2);
 491       s1 = quadrant[i1]; s2 = quadrant[i2];
 492       if (s1 != s2) return (s1 > s2);
 493       i1++; i2++;
 494       /* 7 */
 495       c1 = block[i1]; c2 = block[i2];
 496       if (c1 != c2) return (c1 > c2);
 497       s1 = quadrant[i1]; s2 = quadrant[i2];
 498       if (s1 != s2) return (s1 > s2);
 499       i1++; i2++;
 500       /* 8 */
 501       c1 = block[i1]; c2 = block[i2];
 502       if (c1 != c2) return (c1 > c2);
 503       s1 = quadrant[i1]; s2 = quadrant[i2];
 504       if (s1 != s2) return (s1 > s2);
 505       i1++; i2++;
 506
 507       if (i1 >= nblock) i1 -= nblock;
 508       if (i2 >= nblock) i2 -= nblock;
 509
 510       k -= 8;
 511       (*budget)--;
 512    }
 513       while (k >= 0);
 514
 515    return False;
 516 }
 517
 518
 519 /*---------------------------------------------*/
 520 /*--
 521    Knuth's increments seem to work better
 522    than Incerpi-Sedgewick here.  Possibly
 523    because the number of elems to sort is
 524    usually small, typically <= 20.
 525 --*/
 526 static
 527 Int32 incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
 528                    9841, 29524, 88573, 265720,
 529                    797161, 2391484 };
 530
 531 static
 532 void mainSimpleSort ( UInt32* ptr,
 533                       UChar*  block,
 534                       UInt16* quadrant,
 535                       Int32   nblock,
 536                       Int32   lo,
 537                       Int32   hi,
 538                       Int32   d,
 539                       Int32*  budget )
 540 {
 541    Int32 i, j, h, bigN, hp;
 542    UInt32 v;
 543
 544    bigN = hi - lo + 1;
 545    if (bigN < 2) return;
 546
 547    hp = 0;
 548    while (incs[hp] < bigN) hp++;
 549    hp--;
 550
 551    for (; hp >= 0; hp--) {
 552       h = incs[hp];
 553
 554       i = lo + h;
 555       while (True) {
 556
 557          /*-- copy 1 --*/
 558          if (i > hi) break;
 559          v = ptr[i];
 560          j = i;
 561          while ( mainGtU (
 562                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 563                  ) ) {
 564             ptr[j] = ptr[j-h];
 565             j = j - h;
 566             if (j <= (lo + h - 1)) break;
 567          }
 568          ptr[j] = v;
 569          i++;
 570
 571          /*-- copy 2 --*/
 572          if (i > hi) break;
 573          v = ptr[i];
 574          j = i;
 575          while ( mainGtU (
 576                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 577                  ) ) {
 578             ptr[j] = ptr[j-h];
 579             j = j - h;
 580             if (j <= (lo + h - 1)) break;
 581          }
 582          ptr[j] = v;
 583          i++;
 584
 585          /*-- copy 3 --*/
 586          if (i > hi) break;
 587          v = ptr[i];
 588          j = i;
 589          while ( mainGtU (
 590                     ptr[j-h]+d, v+d, block, quadrant, nblock, budget
 591                  ) ) {
 592             ptr[j] = ptr[j-h];
 593             j = j - h;
 594             if (j <= (lo + h - 1)) break;
 595          }
 596          ptr[j] = v;
 597          i++;
 598
 599          if (*budget < 0) return;
 600       }
 601    }
 602 }
 603
 604
 605 /*---------------------------------------------*/
 606 /*--
 607    The following is an implementation of
 608    an elegant 3-way quicksort for strings,
 609    described in a paper "Fast Algorithms for
 610    Sorting and Searching Strings", by Robert
 611    Sedgewick and Jon L. Bentley.
 612 --*/
 613
 614 #define mswap(zz1, zz2) \
 615    { Int32 zztmp = zz1; zz1 = zz2; zz2 = zztmp; }
 616
 617 #define mvswap(zzp1, zzp2, zzn)       \
 618 {                                     \
 619    Int32 yyp1 = (zzp1);               \
 620    Int32 yyp2 = (zzp2);               \
 621    Int32 yyn  = (zzn);                \
 622    while (yyn > 0) {                  \
 623       mswap(ptr[yyp1], ptr[yyp2]);    \
 624       yyp1++; yyp2++; yyn--;          \
 625    }                                  \
 626 }
 627
 628 static
 629 __inline__
 630 UChar mmed3 ( UChar a, UChar b, UChar c )
 631 {
 632    UChar t;
 633    if (a > b) { t = a; a = b; b = t; };
 634    if (b > c) {
 635       b = c;
 636       if (a > b) b = a;
 637    }
 638    return b;
 639 }
 640
 641 #define mmin(a,b) ((a) < (b)) ? (a) : (b)
 642
 643 #define mpush(lz,hz,dz) { stackLo[sp] = lz; \
 644                           stackHi[sp] = hz; \
 645                           stackD [sp] = dz; \
 646                           sp++; }
 647
 648 #define mpop(lz,hz,dz) { sp--;             \
 649                          lz = stackLo[sp]; \
 650                          hz = stackHi[sp]; \
 651                          dz = stackD [sp]; }
 652
 653
 654 #define mnextsize(az) (nextHi[az]-nextLo[az])
 655
 656 #define mnextswap(az,bz)                                        \
 657    { Int32 tz;                                                  \
 658      tz = nextLo[az]; nextLo[az] = nextLo[bz]; nextLo[bz] = tz; \
 659      tz = nextHi[az]; nextHi[az] = nextHi[bz]; nextHi[bz] = tz; \
 660      tz = nextD [az]; nextD [az] = nextD [bz]; nextD [bz] = tz; }
 661
 662
 663 #define MAIN_QSORT_SMALL_THRESH 20
 664 #define MAIN_QSORT_DEPTH_THRESH (BZ_N_RADIX + BZ_N_QSORT)
 665 #define MAIN_QSORT_STACK_SIZE 100
 666
 667 static
 668 void mainQSort3 ( UInt32* ptr,
 669                   UChar*  block,
 670                   UInt16* quadrant,
 671                   Int32   nblock,
 672                   Int32   loSt,
 673                   Int32   hiSt,
 674                   Int32   dSt,
 675                   Int32*  budget )
 676 {
 677    Int32 unLo, unHi, ltLo, gtHi, n, m, med;
 678    Int32 sp, lo, hi, d;
 679
 680    Int32 stackLo[MAIN_QSORT_STACK_SIZE];
 681    Int32 stackHi[MAIN_QSORT_STACK_SIZE];
 682    Int32 stackD [MAIN_QSORT_STACK_SIZE];
 683
 684    Int32 nextLo[3];
 685    Int32 nextHi[3];
 686    Int32 nextD [3];
 687
 688    sp = 0;
 689    mpush ( loSt, hiSt, dSt );
 690
 691    while (sp > 0) {
 692
 693       AssertH ( sp < MAIN_QSORT_STACK_SIZE, 1001 );
 694
 695       mpop ( lo, hi, d );
 696       if (hi - lo < MAIN_QSORT_SMALL_THRESH ||
 697           d > MAIN_QSORT_DEPTH_THRESH) {
 698          mainSimpleSort ( ptr, block, quadrant, nblock, lo, hi, d, budget );
 699          if (*budget < 0) return;
 700          continue;
 701       }
 702
 703       med = (Int32)
 704             mmed3 ( block[ptr[ lo         ]+d],
 705                     block[ptr[ hi         ]+d],
 706                     block[ptr[ (lo+hi)>>1 ]+d] );
 707
 708       unLo = ltLo = lo;
 709       unHi = gtHi = hi;
 710
 711       while (True) {
 712          while (True) {
 713             if (unLo > unHi) break;
 714             n = ((Int32)block[ptr[unLo]+d]) - med;
 715             if (n == 0) {
 716                mswap(ptr[unLo], ptr[ltLo]);
 717                ltLo++; unLo++; continue;
 718             };
 719             if (n >  0) break;
 720             unLo++;
 721          }
 722          while (True) {
 723             if (unLo > unHi) break;
 724             n = ((Int32)block[ptr[unHi]+d]) - med;
 725             if (n == 0) {
 726                mswap(ptr[unHi], ptr[gtHi]);
 727                gtHi--; unHi--; continue;
 728             };
 729             if (n <  0) break;
 730             unHi--;
 731          }
 732          if (unLo > unHi) break;
 733          mswap(ptr[unLo], ptr[unHi]); unLo++; unHi--;
 734       }
 735
 736       AssertD ( unHi == unLo-1, "mainQSort3(2)" );
 737
 738       if (gtHi < ltLo) {
 739          mpush(lo, hi, d+1 );
 740          continue;
 741       }
 742
 743       n = mmin(ltLo-lo, unLo-ltLo); mvswap(lo, unLo-n, n);
 744       m = mmin(hi-gtHi, gtHi-unHi); mvswap(unLo, hi-m+1, m);
 745
 746       n = lo + unLo - ltLo - 1;
 747       m = hi - (gtHi - unHi) + 1;
 748
 749       nextLo[0] = lo;  nextHi[0] = n;   nextD[0] = d;
 750       nextLo[1] = m;   nextHi[1] = hi;  nextD[1] = d;
 751       nextLo[2] = n+1; nextHi[2] = m-1; nextD[2] = d+1;
 752
 753       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 754       if (mnextsize(1) < mnextsize(2)) mnextswap(1,2);
 755       if (mnextsize(0) < mnextsize(1)) mnextswap(0,1);
 756
 757       AssertD (mnextsize(0) >= mnextsize(1), "mainQSort3(8)" );
 758       AssertD (mnextsize(1) >= mnextsize(2), "mainQSort3(9)" );
 759
 760       mpush (nextLo[0], nextHi[0], nextD[0]);
 761       mpush (nextLo[1], nextHi[1], nextD[1]);
 762       mpush (nextLo[2], nextHi[2], nextD[2]);
 763    }
 764 }
 765
 766 #undef mswap
 767 #undef mvswap
 768 #undef mpush
 769 #undef mpop
 770 #undef mmin
 771 #undef mnextsize
 772 #undef mnextswap
 773 #undef MAIN_QSORT_SMALL_THRESH
 774 #undef MAIN_QSORT_DEPTH_THRESH
 775 #undef MAIN_QSORT_STACK_SIZE
 776
 777
 778 /*---------------------------------------------*/
 779 /* Pre:
 780       nblock > N_OVERSHOOT
 781       block32 exists for [0 .. nblock-1 +N_OVERSHOOT]
 782       ((UChar*)block32) [0 .. nblock-1] holds block
 783       ptr exists for [0 .. nblock-1]
 784
 785    Post:
 786       ((UChar*)block32) [0 .. nblock-1] holds block
 787       All other areas of block32 destroyed
 788       ftab [0 .. 65536 ] destroyed
 789       ptr [0 .. nblock-1] holds sorted order
 790       if (*budget < 0), sorting was abandoned
 791 */
 792
 793 #define BIGFREQ(b) (ftab[((b)+1) << 8] - ftab[(b) << 8])
 794 #define SETMASK (1 << 21)
 795 #define CLEARMASK (~(SETMASK))
 796
 797 static
 798 void mainSort ( UInt32* ptr,
 799                 UChar*  block,
 800                 UInt16* quadrant,
 801                 UInt32* ftab,
 802                 Int32   nblock,
 803                 Int32   verb,
 804                 Int32*  budget )
 805 {
 806    Int32  i, j, k, ss, sb;
 807    Int32  runningOrder[256];
 808    Bool   bigDone[256];
 809    Int32  copyStart[256];
 810    Int32  copyEnd  [256];
 811    UChar  c1;
 812    Int32  numQSorted;
 813    UInt16 s;
 814    if (verb >= 4) VPrintf0 ( "        main sort initialise ...\n" );
 815
 816    /*-- set up the 2-byte frequency table --*/
 817    for (i = 65536; i >= 0; i--) ftab[i] = 0;
 818
 819    j = block[0] << 8;
 820    i = nblock-1;
 821    for (; i >= 3; i -= 4) {
 822       quadrant[i] = 0;
 823       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 824       ftab[j]++;
 825       quadrant[i-1] = 0;
 826       j = (j >> 8) | ( ((UInt16)block[i-1]) << 8);
 827       ftab[j]++;
 828       quadrant[i-2] = 0;
 829       j = (j >> 8) | ( ((UInt16)block[i-2]) << 8);
 830       ftab[j]++;
 831       quadrant[i-3] = 0;
 832       j = (j >> 8) | ( ((UInt16)block[i-3]) << 8);
 833       ftab[j]++;
 834    }
 835    for (; i >= 0; i--) {
 836       quadrant[i] = 0;
 837       j = (j >> 8) | ( ((UInt16)block[i]) << 8);
 838       ftab[j]++;
 839    }
 840
 841    /*-- (emphasises close relationship of block & quadrant) --*/
 842    for (i = 0; i < BZ_N_OVERSHOOT; i++) {
 843       block   [nblock+i] = block[i];
 844       quadrant[nblock+i] = 0;
 845    }
 846
 847    if (verb >= 4) VPrintf0 ( "        bucket sorting ...\n" );
 848
 849    /*-- Complete the initial radix sort --*/
 850    for (i = 1; i <= 65536; i++) ftab[i] += ftab[i-1];
 851
 852    s = block[0] << 8;
 853    i = nblock-1;
 854    for (; i >= 3; i -= 4) {
 855       s = (s >> 8) | (block[i] << 8);
 856       j = ftab[s] -1;
 857       ftab[s] = j;
 858       ptr[j] = i;
 859       s = (s >> 8) | (block[i-1] << 8);
 860       j = ftab[s] -1;
 861       ftab[s] = j;
 862       ptr[j] = i-1;
 863       s = (s >> 8) | (block[i-2] << 8);
 864       j = ftab[s] -1;
 865       ftab[s] = j;
 866       ptr[j] = i-2;
 867       s = (s >> 8) | (block[i-3] << 8);
 868       j = ftab[s] -1;
 869       ftab[s] = j;
 870       ptr[j] = i-3;
 871    }
 872    for (; i >= 0; i--) {
 873       s = (s >> 8) | (block[i] << 8);
 874       j = ftab[s] -1;
 875       ftab[s] = j;
 876       ptr[j] = i;
 877    }
 878
 879    /*--
 880       Now ftab contains the first loc of every small bucket.
 881       Calculate the running order, from smallest to largest
 882       big bucket.
 883    --*/
 884    for (i = 0; i <= 255; i++) {
 885       bigDone     [i] = False;
 886       runningOrder[i] = i;
 887    }
 888
 889    {
 890       Int32 vv;
 891       Int32 h = 1;
 892       do h = 3 * h + 1; while (h <= 256);
 893       do {
 894          h = h / 3;
 895          for (i = h; i <= 255; i++) {
 896             vv = runningOrder[i];
 897             j = i;
 898             while ( BIGFREQ(runningOrder[j-h]) > BIGFREQ(vv) ) {
 899                runningOrder[j] = runningOrder[j-h];
 900                j = j - h;
 901                if (j <= (h - 1)) goto zero;
 902             }
 903             zero:
 904             runningOrder[j] = vv;
 905          }
 906       } while (h != 1);
 907    }
 908
 909    /*--
 910       The main sorting loop.
 911    --*/
 912
 913    numQSorted = 0;
 914
 915    for (i = 0; i <= 255; i++) {
 916
 917       /*--
 918          Process big buckets, starting with the least full.
 919          Basically this is a 3-step process in which we call
 920          mainQSort3 to sort the small buckets [ss, j], but
 921          also make a big effort to avoid the calls if we can.
 922       --*/
 923       ss = runningOrder[i];
 924
 925       /*--
 926          Step 1:
 927          Complete the big bucket [ss] by quicksorting
 928          any unsorted small buckets [ss, j], for j != ss.
 929          Hopefully previous pointer-scanning phases have already
 930          completed many of the small buckets [ss, j], so
 931          we don't have to sort them at all.
 932       --*/
 933       for (j = 0; j <= 255; j++) {
 934          if (j != ss) {
 935             sb = (ss << 8) + j;
 936             if ( ! (ftab[sb] & SETMASK) ) {
 937                Int32 lo = ftab[sb]   & CLEARMASK;
 938                Int32 hi = (ftab[sb+1] & CLEARMASK) - 1;
 939                if (hi > lo) {
 940                   if (verb >= 4)
 941                      VPrintf4 ( "        qsort [0x%x, 0x%x]   "
 942                                 "done %d   this %d\n",
 943                                 ss, j, numQSorted, hi - lo + 1 );
 944                   mainQSort3 (
 945                      ptr, block, quadrant, nblock,
 946                      lo, hi, BZ_N_RADIX, budget
 947                   );
 948                   numQSorted += (hi - lo + 1);
 949                   if (*budget < 0) return;
 950                }
 951             }
 952             ftab[sb] |= SETMASK;
 953          }
 954       }
 955
 956       AssertH ( !bigDone[ss], 1006 );
 957
 958       /*--
 959          Step 2:
 960          Now scan this big bucket [ss] so as to synthesise the
 961          sorted order for small buckets [t, ss] for all t,
 962          including, magically, the bucket [ss,ss] too.
 963          This will avoid doing Real Work in subsequent Step 1's.
 964       --*/
 965       {
 966          for (j = 0; j <= 255; j++) {
 967             copyStart[j] =  ftab[(j << 8) + ss]     & CLEARMASK;
 968             copyEnd  [j] = (ftab[(j << 8) + ss + 1] & CLEARMASK) - 1;
 969          }
 970          for (j = ftab[ss << 8] & CLEARMASK; j < copyStart[ss]; j++) {
 971             k = ptr[j]-1; if (k < 0) k += nblock;
 972             c1 = block[k];
 973             if (!bigDone[c1])
 974                ptr[ copyStart[c1]++ ] = k;
 975          }
 976          for (j = (ftab[(ss+1) << 8] & CLEARMASK) - 1; j > copyEnd[ss]; j--) {
 977             k = ptr[j]-1; if (k < 0) k += nblock;
 978             c1 = block[k];
 979             if (!bigDone[c1])
 980                ptr[ copyEnd[c1]-- ] = k;
 981          }
 982       }
 983
 984       AssertH ( (copyStart[ss]-1 == copyEnd[ss])
 985                 ||
 986                 /* Extremely rare case missing in bzip2-1.0.0 and 1.0.1.
 987                    Necessity for this case is demonstrated by compressing
 988                    a sequence of approximately 48.5 million of character
 989                    251; 1.0.0/1.0.1 will then die here. */
 990                 (copyStart[ss] == 0 && copyEnd[ss] == nblock-1),
 991                 1007 )
 992
 993       for (j = 0; j <= 255; j++) ftab[(j << 8) + ss] |= SETMASK;
 994
 995       /*--
 996          Step 3:
 997          The [ss] big bucket is now done.  Record this fact,
 998          and update the quadrant descriptors.  Remember to
 999          update quadrants in the overshoot area too, if
1000          necessary.  The "if (i < 255)" test merely skips
1001          this updating for the last bucket processed, since
1002          updating for the last bucket is pointless.
1003
1004          The quadrant array provides a way to incrementally
1005          cache sort orderings, as they appear, so as to
1006          make subsequent comparisons in fullGtU() complete
1007          faster.  For repetitive blocks this makes a big
1008          difference (but not big enough to be able to avoid
1009          the fallback sorting mechanism, exponential radix sort).
1010
1011          The precise meaning is: at all times:
1012
1013             for 0 <= i < nblock and 0 <= j <= nblock
1014
1015             if block[i] != block[j],
1016
1017                then the relative values of quadrant[i] and
1018                     quadrant[j] are meaningless.
1019
1020                else {
1021                   if quadrant[i] < quadrant[j]
1022                      then the string starting at i lexicographically
1023                      precedes the string starting at j
1024
1025                   else if quadrant[i] > quadrant[j]
1026                      then the string starting at j lexicographically
1027                      precedes the string starting at i
1028
1029                   else
1030                      the relative ordering of the strings starting
1031                      at i and j has not yet been determined.
1032                }
1033       --*/
1034       bigDone[ss] = True;
1035
1036       if (i < 255) {
1037          Int32 bbStart  = ftab[ss << 8] & CLEARMASK;
1038          Int32 bbSize   = (ftab[(ss+1) << 8] & CLEARMASK) - bbStart;
1039          Int32 shifts   = 0;
1040
1041          while ((bbSize >> shifts) > 65534) shifts++;
1042
1043          for (j = bbSize-1; j >= 0; j--) {
1044             Int32 a2update     = ptr[bbStart + j];
1045             UInt16 qVal        = (UInt16)(j >> shifts);
1046             quadrant[a2update] = qVal;
1047             if (a2update < BZ_N_OVERSHOOT)
1048                quadrant[a2update + nblock] = qVal;
1049          }
1050          AssertH ( ((bbSize-1) >> shifts) <= 65535, 1002 );
1051       }
1052
1053    }
1054
1055    if (verb >= 4)
1056       VPrintf3 ( "        %d pointers, %d sorted, %d scanned\n",
1057                  nblock, numQSorted, nblock - numQSorted );
1058 }
1059
1060 #undef BIGFREQ
1061 #undef SETMASK
1062 #undef CLEARMASK
1063
1064
1065 /*---------------------------------------------*/
1066 /* Pre:
1067       nblock > 0
1068       arr2 exists for [0 .. nblock-1 +N_OVERSHOOT]
1069       ((UChar*)arr2)  [0 .. nblock-1] holds block
1070       arr1 exists for [0 .. nblock-1]
1071
1072    Post:
1073       ((UChar*)arr2) [0 .. nblock-1] holds block
1074       All other areas of block destroyed
1075       ftab [ 0 .. 65536 ] destroyed
1076       arr1 [0 .. nblock-1] holds sorted order
1077 */
1078 void BZ2_blockSort ( EState* s )
1079 {
1080    UInt32* ptr    = s->ptr;
1081    UChar*  block  = s->block;
1082    UInt32* ftab   = s->ftab;
1083    Int32   nblock = s->nblock;
1084    Int32   verb   = s->verbosity;
1085    Int32   wfact  = s->workFactor;
1086    UInt16* quadrant;
1087    Int32   budget;
1088    Int32   budgetInit;
1089    Int32   i;
1090
1091    if (nblock < 10000) {
1092       fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1093    } else {
1094       /* Calculate the location for quadrant, remembering to get
1095          the alignment right.  Assumes that &(block[0]) is at least
1096          2-byte aligned -- this should be ok since block is really
1097          the first section of arr2.
1098       */
1099       i = nblock+BZ_N_OVERSHOOT;
1100       if (i & 1) i++;
1101       quadrant = (UInt16*)(&(block[i]));
1102
1103       /* (wfact-1) / 3 puts the default-factor-30
1104          transition point at very roughly the same place as
1105          with v0.1 and v0.9.0.
1106          Not that it particularly matters any more, since the
1107          resulting compressed stream is now the same regardless
1108          of whether or not we use the main sort or fallback sort.
1109       */
1110       if (wfact < 1  ) wfact = 1;
1111       if (wfact > 100) wfact = 100;
1112       budgetInit = nblock * ((wfact-1) / 3);
1113       budget = budgetInit;
1114
1115       mainSort ( ptr, block, quadrant, ftab, nblock, verb, &budget );
1116       if (verb >= 3)
1117          VPrintf3 ( "      %d work, %d block, ratio %5.2f\n",
1118                     budgetInit - budget,
1119                     nblock,
1120                     (float)(budgetInit - budget) /
1121                     (float)(nblock==0 ? 1 : nblock) );
1122       if (budget < 0) {
1123          if (verb >= 2)
1124             VPrintf0 ( "    too repetitive; using fallback"
1125                        " sorting algorithm\n" );
1126          fallbackSort ( s->arr1, s->arr2, ftab, nblock, verb );
1127       }
1128    }
1129
1130    s->origPtr = -1;
1131    for (i = 0; i < s->nblock; i++)
1132       if (ptr[i] == 0)
1133          { s->origPtr = i; break; };
1134
1135    AssertH( s->origPtr != -1, 1003 );
1136 }
1137
1138
1139 /*-------------------------------------------------------------*/
1140 /*--- end                                       blocksort.c ---*/
1141 /*-------------------------------------------------------------*/