Cosmetic: code corrections in QuickStep
[ode.git] / ode / src / collision_sapspace.cpp
blob76258bf7a408c2c85960735a496c74c8030e0f8e
1 /*************************************************************************
2 * *
3 * Open Dynamics Engine, Copyright (C) 2001-2003 Russell L. Smith. *
4 * All rights reserved. Email: russ@q12.org Web: www.q12.org *
5 * *
6 * This library is free software; you can redistribute it and/or *
7 * modify it under the terms of EITHER: *
8 * (1) The GNU Lesser General Public License as published by the Free *
9 * Software Foundation; either version 2.1 of the License, or (at *
10 * your option) any later version. The text of the GNU Lesser *
11 * General Public License is included with this library in the *
12 * file LICENSE.TXT. *
13 * (2) The BSD-style license that is included with this library in *
14 * the file LICENSE-BSD.TXT. *
15 * *
16 * This library is distributed in the hope that it will be useful, *
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
19 * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
20 * *
21 *************************************************************************/
24 * Sweep and Prune adaptation/tweaks for ODE by Aras Pranckevicius.
25 * Additional work by David Walters
26 * Original code:
27 * OPCODE - Optimized Collision Detection
28 * Copyright (C) 2001 Pierre Terdiman
29 * Homepage: http://www.codercorner.com/Opcode.htm
31 * This version does complete radix sort, not "classical" SAP. So, we
32 * have no temporal coherence, but are able to handle any movement
33 * velocities equally well.
36 #include <ode/common.h>
37 #include <ode/collision_space.h>
38 #include <ode/collision.h>
40 #include "config.h"
41 #include "matrix.h"
42 #include "collision_kernel.h"
43 #include "collision_space_internal.h"
45 // Reference counting helper for radix sort global data.
46 //static void RadixSortRef();
47 //static void RadixSortDeref();
50 // --------------------------------------------------------------------------
51 // Radix Sort Context
52 // --------------------------------------------------------------------------
54 struct RaixSortContext
56 public:
57 RaixSortContext(): mCurrentSize(0), mCurrentUtilization(0), mRanksValid(false), mRanksBuffer(NULL), mPrimaryRanks(NULL) {}
58 ~RaixSortContext() { FreeRanks(); }
60 // OPCODE's Radix Sorting, returns a list of indices in sorted order
61 const uint32* RadixSort( const float* input2, uint32 nb );
63 private:
64 void FreeRanks();
65 void AllocateRanks(sizeint nNewSize);
67 void ReallocateRanksIfNecessary(sizeint nNewSize);
69 private:
70 void SetCurrentSize(sizeint nValue) { mCurrentSize = nValue; }
71 sizeint GetCurrentSize() const { return mCurrentSize; }
73 void SetCurrentUtilization(sizeint nValue) { mCurrentUtilization = nValue; }
74 sizeint GetCurrentUtilization() const { return mCurrentUtilization; }
76 uint32 *GetRanks1() const { return mPrimaryRanks; }
77 uint32 *GetRanks2() const { return mRanksBuffer + ((mRanksBuffer + mCurrentSize) - mPrimaryRanks); }
78 void SwapRanks() { mPrimaryRanks = GetRanks2(); }
80 bool AreRanksValid() const { return mRanksValid; }
81 void InvalidateRanks() { mRanksValid = false; }
82 void ValidateRanks() { mRanksValid = true; }
84 private:
85 sizeint mCurrentSize; //!< Current size of the indices list
86 sizeint mCurrentUtilization; //!< Current utilization of the indices list
87 bool mRanksValid;
88 uint32* mRanksBuffer; //!< Two lists allocated sequentially in a single block
89 uint32* mPrimaryRanks;
92 void RaixSortContext::AllocateRanks(sizeint nNewSize)
94 dIASSERT(GetCurrentSize() == 0);
96 mRanksBuffer = new uint32[2 * nNewSize];
97 mPrimaryRanks = mRanksBuffer;
99 SetCurrentSize(nNewSize);
102 void RaixSortContext::FreeRanks()
104 SetCurrentSize(0);
106 delete[] mRanksBuffer;
109 void RaixSortContext::ReallocateRanksIfNecessary(sizeint nNewSize)
111 sizeint nCurUtilization = GetCurrentUtilization();
113 if (nNewSize != nCurUtilization)
115 sizeint nCurSize = GetCurrentSize();
117 if ( nNewSize > nCurSize )
119 // Free previously used ram
120 FreeRanks();
122 // Get some fresh one
123 AllocateRanks(nNewSize);
126 InvalidateRanks();
127 SetCurrentUtilization(nNewSize);
131 // --------------------------------------------------------------------------
132 // SAP space code
133 // --------------------------------------------------------------------------
135 struct dxSAPSpace : public dxSpace
137 // Constructor / Destructor
138 dxSAPSpace( dSpaceID _space, int sortaxis );
139 ~dxSAPSpace();
141 // dxSpace
142 virtual dxGeom* getGeom(int i);
143 virtual void add(dxGeom* g);
144 virtual void remove(dxGeom* g);
145 virtual void dirty(dxGeom* g);
146 virtual void computeAABB();
147 virtual void cleanGeoms();
148 virtual void collide( void *data, dNearCallback *callback );
149 virtual void collide2( void *data, dxGeom *geom, dNearCallback *callback );
151 private:
153 //--------------------------------------------------------------------------
154 // Local Declarations
155 //--------------------------------------------------------------------------
157 //! A generic couple structure
158 struct Pair
160 uint32 id0; //!< First index of the pair
161 uint32 id1; //!< Second index of the pair
163 // Default and Value Constructor
164 Pair() {}
165 Pair( uint32 i0, uint32 i1 ) : id0( i0 ), id1( i1 ) {}
168 //--------------------------------------------------------------------------
169 // Helpers
170 //--------------------------------------------------------------------------
173 * Complete box pruning.
174 * Returns a list of overlapping pairs of boxes, each box of the pair
175 * belongs to the same set.
177 * @param count [in] number of boxes.
178 * @param geoms [in] geoms of boxes.
179 * @param pairs [out] array of overlapping pairs.
181 void BoxPruning( int count, const dxGeom** geoms, dArray< Pair >& pairs );
184 //--------------------------------------------------------------------------
185 // Implementation Data
186 //--------------------------------------------------------------------------
188 // We have two lists (arrays of pointers) to dirty and clean
189 // geoms. Each geom knows it's index into the corresponding list
190 // (see macros above).
191 dArray<dxGeom*> DirtyList; // dirty geoms
192 dArray<dxGeom*> GeomList; // clean geoms
194 // For SAP, we ultimately separate "normal" geoms and the ones that have
195 // infinite AABBs. No point doing SAP on infinite ones (and it doesn't handle
196 // infinite geoms anyway).
197 dArray<dxGeom*> TmpGeomList; // temporary for normal geoms
198 dArray<dxGeom*> TmpInfGeomList; // temporary for geoms with infinite AABBs
200 // Our sorting axes. (X,Z,Y is often best). Stored *2 for minor speedup
201 // Axis indices into geom's aabb are: min=idx, max=idx+1
202 uint32 ax0idx;
203 uint32 ax1idx;
204 uint32 ax2idx;
206 // pruning position array scratch pad
207 // NOTE: this is float not dReal because of the OPCODE radix sorter
208 dArray< float > poslist;
209 RaixSortContext sortContext;
212 // Creation
213 dSpaceID dSweepAndPruneSpaceCreate( dxSpace* space, int axisorder ) {
214 return new dxSAPSpace( space, axisorder );
218 //==============================================================================
// A geom participates in collision only when its enable-related flag bits
// match the expected test value.
#define GEOM_ENABLED(g) (((g)->gflags & GEOM_ENABLE_TEST_MASK) == GEOM_ENABLE_TEST_VALUE)

// HACK: We abuse 'next' and 'tome' members of dxGeom to store indices into dirty/geom lists.
// A geom is in exactly one list at a time; the index for the other list is
// GEOM_INVALID_IDX. The casts round-trip an int through the pointer members.
#define GEOM_SET_DIRTY_IDX(g,idx) { (g)->next_ex = (dxGeom*)(sizeint)(idx); }
#define GEOM_SET_GEOM_IDX(g,idx) { (g)->tome_ex = (dxGeom**)(sizeint)(idx); }
#define GEOM_GET_DIRTY_IDX(g) ((int)(sizeint)(g)->next_ex)
#define GEOM_GET_GEOM_IDX(g) ((int)(sizeint)(g)->tome_ex)
// Sentinel meaning "not present in that list".
#define GEOM_INVALID_IDX (-1)
231 * A bit of repetitive work - similar to collideAABBs, but doesn't check
232 * if AABBs intersect (because SAP returns pairs with overlapping AABBs).
234 static void collideGeomsNoAABBs( dxGeom *g1, dxGeom *g2, void *data, dNearCallback *callback )
236 dIASSERT( (g1->gflags & GEOM_AABB_BAD)==0 );
237 dIASSERT( (g2->gflags & GEOM_AABB_BAD)==0 );
239 // no contacts if both geoms on the same body, and the body is not 0
240 if (g1->body == g2->body && g1->body) return;
242 // test if the category and collide bitfields match
243 if ( ((g1->category_bits & g2->collide_bits) ||
244 (g2->category_bits & g1->collide_bits)) == 0) {
245 return;
248 dReal *bounds1 = g1->aabb;
249 dReal *bounds2 = g2->aabb;
251 // check if either object is able to prove that it doesn't intersect the
252 // AABB of the other
253 if (g1->AABBTest (g2,bounds2) == 0) return;
254 if (g2->AABBTest (g1,bounds1) == 0) return;
256 // the objects might actually intersect - call the space callback function
257 callback (data,g1,g2);
261 dxSAPSpace::dxSAPSpace( dSpaceID _space, int axisorder ) : dxSpace( _space )
263 type = dSweepAndPruneSpaceClass;
265 // Init AABB to infinity
266 aabb[0] = -dInfinity;
267 aabb[1] = dInfinity;
268 aabb[2] = -dInfinity;
269 aabb[3] = dInfinity;
270 aabb[4] = -dInfinity;
271 aabb[5] = dInfinity;
273 ax0idx = ( ( axisorder ) & 3 ) << 1;
274 ax1idx = ( ( axisorder >> 2 ) & 3 ) << 1;
275 ax2idx = ( ( axisorder >> 4 ) & 3 ) << 1;
278 dxSAPSpace::~dxSAPSpace()
280 CHECK_NOT_LOCKED(this);
281 if ( cleanup ) {
282 // note that destroying each geom will call remove()
283 for ( ; DirtyList.size(); dGeomDestroy( DirtyList[ 0 ] ) ) {}
284 for ( ; GeomList.size(); dGeomDestroy( GeomList[ 0 ] ) ) {}
286 else {
287 // just unhook them
288 for ( ; DirtyList.size(); remove( DirtyList[ 0 ] ) ) {}
289 for ( ; GeomList.size(); remove( GeomList[ 0 ] ) ) {}
293 dxGeom* dxSAPSpace::getGeom( int i )
295 dUASSERT( i >= 0 && i < count, "index out of range" );
296 int dirtySize = DirtyList.size();
297 if( i < dirtySize )
298 return DirtyList[i];
299 else
300 return GeomList[i-dirtySize];
303 void dxSAPSpace::add( dxGeom* g )
305 CHECK_NOT_LOCKED (this);
306 dAASSERT(g);
307 dUASSERT(g->tome_ex == 0 && g->next_ex == 0, "geom is already in a space");
310 // add to dirty list
311 GEOM_SET_DIRTY_IDX( g, DirtyList.size() );
312 GEOM_SET_GEOM_IDX( g, GEOM_INVALID_IDX );
313 DirtyList.push( g );
315 dxSpace::add(g);
318 void dxSAPSpace::remove( dxGeom* g )
320 CHECK_NOT_LOCKED(this);
321 dAASSERT(g);
322 dUASSERT(g->parent_space == this,"object is not in this space");
324 // remove
325 int dirtyIdx = GEOM_GET_DIRTY_IDX(g);
326 int geomIdx = GEOM_GET_GEOM_IDX(g);
327 // must be in one list, not in both
328 dUASSERT(
329 (dirtyIdx==GEOM_INVALID_IDX && geomIdx>=0 && geomIdx<GeomList.size()) ||
330 (geomIdx==GEOM_INVALID_IDX && dirtyIdx>=0 && dirtyIdx<DirtyList.size()),
331 "geom indices messed up" );
332 if( dirtyIdx != GEOM_INVALID_IDX ) {
333 // we're in dirty list, remove
334 int dirtySize = DirtyList.size();
335 if (dirtyIdx != dirtySize-1) {
336 dxGeom* lastG = DirtyList[dirtySize-1];
337 DirtyList[dirtyIdx] = lastG;
338 GEOM_SET_DIRTY_IDX(lastG,dirtyIdx);
340 GEOM_SET_DIRTY_IDX(g,GEOM_INVALID_IDX);
341 DirtyList.setSize( dirtySize-1 );
342 } else {
343 // we're in geom list, remove
344 int geomSize = GeomList.size();
345 if (geomIdx != geomSize-1) {
346 dxGeom* lastG = GeomList[geomSize-1];
347 GeomList[geomIdx] = lastG;
348 GEOM_SET_GEOM_IDX(lastG,geomIdx);
350 GEOM_SET_GEOM_IDX(g,GEOM_INVALID_IDX);
351 GeomList.setSize( geomSize-1 );
354 dxSpace::remove(g);
357 void dxSAPSpace::dirty( dxGeom* g )
359 dAASSERT(g);
360 dUASSERT(g->parent_space == this, "object is not in this space");
362 // check if already dirtied
363 int dirtyIdx = GEOM_GET_DIRTY_IDX(g);
364 if( dirtyIdx != GEOM_INVALID_IDX )
365 return;
367 int geomIdx = GEOM_GET_GEOM_IDX(g);
368 dUASSERT( geomIdx>=0 && geomIdx<GeomList.size(), "geom indices messed up" );
370 // remove from geom list, place last in place of this
371 int geomSize = GeomList.size();
372 if (geomIdx != geomSize-1) {
373 dxGeom* lastG = GeomList[geomSize-1];
374 GeomList[geomIdx] = lastG;
375 GEOM_SET_GEOM_IDX(lastG,geomIdx);
377 GeomList.setSize( geomSize-1 );
379 // add to dirty list
380 GEOM_SET_GEOM_IDX( g, GEOM_INVALID_IDX );
381 GEOM_SET_DIRTY_IDX( g, DirtyList.size() );
382 DirtyList.push( g );
385 void dxSAPSpace::computeAABB()
387 // TODO?
390 void dxSAPSpace::cleanGeoms()
392 int dirtySize = DirtyList.size();
393 if( !dirtySize )
394 return;
396 // compute the AABBs of all dirty geoms, clear the dirty flags,
397 // remove from dirty list, place into geom list
398 lock_count++;
400 int geomSize = GeomList.size();
401 GeomList.setSize( geomSize + dirtySize ); // ensure space in geom list
403 for( int i = 0; i < dirtySize; ++i ) {
404 dxGeom* g = DirtyList[i];
405 if( IS_SPACE(g) ) {
406 ((dxSpace*)g)->cleanGeoms();
409 g->recomputeAABB();
410 dIASSERT((g->gflags & GEOM_AABB_BAD) == 0);
412 g->gflags &= ~GEOM_DIRTY;
414 // remove from dirty list, add to geom list
415 GEOM_SET_DIRTY_IDX( g, GEOM_INVALID_IDX );
416 GEOM_SET_GEOM_IDX( g, geomSize + i );
417 GeomList[geomSize+i] = g;
419 // clear dirty list
420 DirtyList.setSize( 0 );
422 lock_count--;
425 void dxSAPSpace::collide( void *data, dNearCallback *callback )
427 dAASSERT (callback);
429 lock_count++;
431 cleanGeoms();
433 // by now all geoms are in GeomList, and DirtyList must be empty
434 int geom_count = GeomList.size();
435 dUASSERT( geom_count == count, "geom counts messed up" );
437 // separate all ENABLED geoms into infinite AABBs and normal AABBs
438 TmpGeomList.setSize(0);
439 TmpInfGeomList.setSize(0);
440 int axis0max = ax0idx + 1;
441 for( int i = 0; i < geom_count; ++i ) {
442 dxGeom* g = GeomList[i];
443 if( !GEOM_ENABLED(g) ) // skip disabled ones
444 continue;
445 const dReal& amax = g->aabb[axis0max];
446 if( amax == dInfinity ) // HACK? probably not...
447 TmpInfGeomList.push( g );
448 else
449 TmpGeomList.push( g );
452 // do SAP on normal AABBs
453 dArray< Pair > overlapBoxes;
454 int tmp_geom_count = TmpGeomList.size();
455 if ( tmp_geom_count > 0 )
457 // Generate a list of overlapping boxes
458 BoxPruning( tmp_geom_count, (const dxGeom**)TmpGeomList.data(), overlapBoxes );
461 // collide overlapping
462 int overlapCount = overlapBoxes.size();
463 for( int j = 0; j < overlapCount; ++j )
465 const Pair& pair = overlapBoxes[ j ];
466 dxGeom* g1 = TmpGeomList[ pair.id0 ];
467 dxGeom* g2 = TmpGeomList[ pair.id1 ];
468 collideGeomsNoAABBs( g1, g2, data, callback );
471 int infSize = TmpInfGeomList.size();
472 int normSize = TmpGeomList.size();
473 int m, n;
475 for ( m = 0; m < infSize; ++m )
477 dxGeom* g1 = TmpInfGeomList[ m ];
479 // collide infinite ones
480 for( n = m+1; n < infSize; ++n ) {
481 dxGeom* g2 = TmpInfGeomList[n];
482 collideGeomsNoAABBs( g1, g2, data, callback );
485 // collide infinite ones with normal ones
486 for( n = 0; n < normSize; ++n ) {
487 dxGeom* g2 = TmpGeomList[n];
488 collideGeomsNoAABBs( g1, g2, data, callback );
492 lock_count--;
495 void dxSAPSpace::collide2( void *data, dxGeom *geom, dNearCallback *callback )
497 dAASSERT (geom && callback);
499 // TODO: This is just a simple N^2 implementation
501 lock_count++;
503 cleanGeoms();
504 geom->recomputeAABB();
506 // intersect bounding boxes
507 int geom_count = GeomList.size();
508 for ( int i = 0; i < geom_count; ++i ) {
509 dxGeom* g = GeomList[i];
510 if ( GEOM_ENABLED(g) )
511 collideAABBs (g,geom,data,callback);
514 lock_count--;
518 void dxSAPSpace::BoxPruning( int count, const dxGeom** geoms, dArray< Pair >& pairs )
520 // Size the poslist (+1 for infinity end cap)
521 poslist.setSize( count );
523 // 1) Build main list using the primary axis
524 // NOTE: uses floats instead of dReals because that's what radix sort wants
525 for( int i = 0; i < count; ++i )
526 poslist[ i ] = (float)TmpGeomList[i]->aabb[ ax0idx ];
528 // 2) Sort the list
529 const uint32* Sorted = sortContext.RadixSort( poslist.data(), count );
531 // 3) Prune the list
532 const uint32* const LastSorted = Sorted + count;
533 const uint32* RunningAddress = Sorted;
535 bool bExitLoop;
536 Pair IndexPair;
537 while ( Sorted < LastSorted )
539 IndexPair.id0 = *Sorted++;
541 // empty, this loop just advances RunningAddress
542 for (bExitLoop = false; poslist[*RunningAddress++] < poslist[IndexPair.id0]; )
544 if (RunningAddress == LastSorted)
546 bExitLoop = true;
547 break;
551 if ( bExitLoop || RunningAddress == LastSorted) // Not a bug!!!
553 break;
556 const float idx0ax0max = (float)geoms[IndexPair.id0]->aabb[ax0idx+1]; // To avoid wrong decisions caused by rounding errors, cast the AABB element to float similarly as we did at the function beginning
557 const dReal idx0ax1max = geoms[IndexPair.id0]->aabb[ax1idx+1];
558 const dReal idx0ax2max = geoms[IndexPair.id0]->aabb[ax2idx+1];
560 for (const uint32* RunningAddress2 = RunningAddress; poslist[ IndexPair.id1 = *RunningAddress2++ ] <= idx0ax0max; )
562 const dReal* aabb0 = geoms[ IndexPair.id0 ]->aabb;
563 const dReal* aabb1 = geoms[ IndexPair.id1 ]->aabb;
565 // Intersection?
566 if ( idx0ax1max >= aabb1[ax1idx] && aabb1[ax1idx+1] >= aabb0[ax1idx]
567 && idx0ax2max >= aabb1[ax2idx] && aabb1[ax2idx+1] >= aabb0[ax2idx] )
569 pairs.push( IndexPair );
572 if (RunningAddress2 == LastSorted)
574 break;
578 } // while ( RunningAddress < LastSorted && Sorted < LastSorted )
582 //==============================================================================
584 //------------------------------------------------------------------------------
585 // Radix Sort
586 //------------------------------------------------------------------------------
// CHECK_PASS_VALIDITY(pass) expands to declarations in the ENCLOSING scope:
//   CurCount    - pointer to the 256 counters of histogram 'pass'
//   PerformPass - false when the whole pass can be skipped
//   UniqueVal   - the 'pass'-th byte of the first input value
// Because it declares names, each expansion must live in its own block.
#define CHECK_PASS_VALIDITY(pass) \
/* Shortcut to current counters */ \
const uint32* CurCount = &mHistogram[pass<<8]; \
/* Reset flag. The sorting pass is supposed to be performed. (default) */ \
bool PerformPass = true; \
/* Check pass validity */ \
/* If all values have the same byte, sorting is useless. */ \
/* It may happen when sorting bytes or words instead of dwords. */ \
/* This routine actually sorts words faster than dwords, and bytes */ \
/* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */ \
/* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
/* Get first byte */ \
uint8 UniqueVal = *(((const uint8*)input)+pass); \
/* Check that byte's counter */ \
if(CurCount[UniqueVal]==nb) PerformPass=false;
611 // WARNING ONLY SORTS IEEE FLOATING-POINT VALUES
612 const uint32* RaixSortContext::RadixSort( const float* input2, uint32 nb )
614 union _type_cast_union
616 _type_cast_union(const float *floats): asFloats(floats) {}
617 _type_cast_union(const uint32 *uints32): asUInts32(uints32) {}
619 const float *asFloats;
620 const uint32 *asUInts32;
621 const uint8 *asUInts8;
624 const uint32* input = _type_cast_union(input2).asUInts32;
626 // Resize lists if needed
627 ReallocateRanksIfNecessary(nb);
629 // Allocate histograms & offsets on the stack
630 uint32 mHistogram[256*4];
631 uint32* mLink[256];
633 // Create histograms (counters). Counters for all passes are created in one run.
634 // Pros: read input buffer once instead of four times
635 // Cons: mHistogram is 4Kb instead of 1Kb
636 // Floating-point values are always supposed to be signed values, so there's only one code path there.
637 // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
638 // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
639 // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
640 // wouldn't work with mixed positive/negative values....
642 /* Clear counters/histograms */
643 memset(mHistogram, 0, 256*4*sizeof(uint32));
645 /* Prepare to count */
646 const uint8* p = _type_cast_union(input).asUInts8;
647 const uint8* pe = &p[nb*4];
648 uint32* h0= &mHistogram[0]; /* Histogram for first pass (LSB) */
649 uint32* h1= &mHistogram[256]; /* Histogram for second pass */
650 uint32* h2= &mHistogram[512]; /* Histogram for third pass */
651 uint32* h3= &mHistogram[768]; /* Histogram for last pass (MSB) */
653 bool AlreadySorted = true; /* Optimism... */
655 if (!AreRanksValid())
657 /* Prepare for temporal coherence */
658 const float* Running = input2;
659 float PrevVal = *Running;
661 while(p!=pe)
663 /* Read input input2 in previous sorted order */
664 float Val = *Running++;
665 /* Check whether already sorted or not */
666 if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */
667 /* Update for next iteration */
668 PrevVal = Val;
670 /* Create histograms */
671 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++;
674 /* If all input values are already sorted, we just have to return and leave the */
675 /* previous list unchanged. That way the routine may take advantage of temporal */
676 /* coherence, for example when used to sort transparent faces. */
677 if(AlreadySorted)
679 uint32* const Ranks1 = GetRanks1();
680 for(uint32 i=0;i<nb;i++) Ranks1[i] = i;
681 return Ranks1;
684 else
686 /* Prepare for temporal coherence */
687 uint32* const Ranks1 = GetRanks1();
689 uint32* Indices = Ranks1;
690 float PrevVal = input2[*Indices];
692 while(p!=pe)
694 /* Read input input2 in previous sorted order */
695 float Val = input2[*Indices++];
696 /* Check whether already sorted or not */
697 if(Val<PrevVal) { AlreadySorted = false; break; } /* Early out */
698 /* Update for next iteration */
699 PrevVal = Val;
701 /* Create histograms */
702 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++;
705 /* If all input values are already sorted, we just have to return and leave the */
706 /* previous list unchanged. That way the routine may take advantage of temporal */
707 /* coherence, for example when used to sort transparent faces. */
708 if(AlreadySorted) { return Ranks1; }
711 /* Else there has been an early out and we must finish computing the histograms */
712 while(p!=pe)
714 /* Create histograms without the previous overhead */
715 h0[*p++]++; h1[*p++]++; h2[*p++]++; h3[*p++]++;
719 // Compute #negative values involved if needed
720 uint32 NbNegativeValues = 0;
722 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
723 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
724 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
725 uint32* h3= &mHistogram[768];
726 for(uint32 i=128;i<256;i++) NbNegativeValues += h3[i]; // 768 for last histogram, 128 for negative part
728 // Radix sort, j is the pass number (0=LSB, 3=MSB)
729 for(uint32 j=0;j<4;j++)
731 // Should we care about negative values?
732 if(j!=3)
734 // Here we deal with positive values only
735 CHECK_PASS_VALIDITY(j);
737 if(PerformPass)
739 uint32* const Ranks2 = GetRanks2();
740 // Create offsets
741 mLink[0] = Ranks2;
742 for(uint32 i=1;i<256;i++) mLink[i] = mLink[i-1] + CurCount[i-1];
744 // Perform Radix Sort
745 const uint8* InputBytes = _type_cast_union(input).asUInts8;
746 InputBytes += j;
747 if (!AreRanksValid())
749 for(uint32 i=0;i<nb;i++)
751 *mLink[InputBytes[i<<2]]++ = i;
754 ValidateRanks();
756 else
758 uint32* const Ranks1 = GetRanks1();
760 uint32* Indices = Ranks1;
761 uint32* const IndicesEnd = Ranks1 + nb;
762 while(Indices!=IndicesEnd)
764 uint32 id = *Indices++;
765 *mLink[InputBytes[id<<2]]++ = id;
769 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
770 SwapRanks();
773 else
775 // This is a special case to correctly handle negative values
776 CHECK_PASS_VALIDITY(j);
778 if(PerformPass)
780 uint32* const Ranks2 = GetRanks2();
782 // Create biased offsets, in order for negative numbers to be sorted as well
783 mLink[0] = Ranks2 + NbNegativeValues; // First positive number takes place after the negative ones
784 for(uint32 i=1;i<128;i++) mLink[i] = mLink[i-1] + CurCount[i-1]; // 1 to 128 for positive numbers
786 // We must reverse the sorting order for negative numbers!
787 mLink[255] = Ranks2;
788 for(uint32 i=0;i<127;i++) mLink[254-i] = mLink[255-i] + CurCount[255-i]; // Fixing the wrong order for negative values
789 for(uint32 i=128;i<256;i++) mLink[i] += CurCount[i]; // Fixing the wrong place for negative values
791 // Perform Radix Sort
792 if (!AreRanksValid())
794 for(uint32 i=0;i<nb;i++)
796 uint32 Radix = input[i]>>24; // Radix byte, same as above. AND is useless here (uint32).
797 // ### cmp to be killed. Not good. Later.
798 if(Radix<128) *mLink[Radix]++ = i; // Number is positive, same as above
799 else *(--mLink[Radix]) = i; // Number is negative, flip the sorting order
802 ValidateRanks();
804 else
806 uint32* const Ranks1 = GetRanks1();
808 for(uint32 i=0;i<nb;i++)
810 uint32 Radix = input[Ranks1[i]]>>24; // Radix byte, same as above. AND is useless here (uint32).
811 // ### cmp to be killed. Not good. Later.
812 if(Radix<128) *mLink[Radix]++ = Ranks1[i]; // Number is positive, same as above
813 else *(--mLink[Radix]) = Ranks1[i]; // Number is negative, flip the sorting order
816 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
817 SwapRanks();
819 else
821 // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
822 if(UniqueVal>=128)
824 if (!AreRanksValid())
826 uint32* const Ranks2 = GetRanks2();
827 // ###Possible?
828 for(uint32 i=0;i<nb;i++)
830 Ranks2[i] = nb-i-1;
833 ValidateRanks();
835 else
837 uint32* const Ranks1 = GetRanks1();
838 uint32* const Ranks2 = GetRanks2();
839 for(uint32 i=0;i<nb;i++) Ranks2[i] = Ranks1[nb-i-1];
842 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
843 SwapRanks();
849 // Return indices
850 uint32* const Ranks1 = GetRanks1();
851 return Ranks1;