1 /*************************************************************************
3 * Open Dynamics Engine, Copyright (C) 2001-2003 Russell L. Smith. *
4 * All rights reserved. Email: russ@q12.org Web: www.q12.org *
6 * This library is free software; you can redistribute it and/or *
7 * modify it under the terms of EITHER: *
8 * (1) The GNU Lesser General Public License as published by the Free *
9 * Software Foundation; either version 2.1 of the License, or (at *
10 * your option) any later version. The text of the GNU Lesser *
11 * General Public License is included with this library in the *
13 * (2) The BSD-style license that is included with this library in *
14 * the file LICENSE-BSD.TXT. *
16 * This library is distributed in the hope that it will be useful, *
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
19 * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
21 *************************************************************************/
24 * Sweep and Prune adaptation/tweaks for ODE by Aras Pranckevicius.
25 * Additional work by David Walters
27 * OPCODE - Optimized Collision Detection
28 * Copyright (C) 2001 Pierre Terdiman
29 * Homepage: http://www.codercorner.com/Opcode.htm
31 * This version does complete radix sort, not "classical" SAP. So, we
32 * have no temporal coherence, but are able to handle any movement
33 * velocities equally well.
#include <float.h>
#include <string.h>

#include <ode/common.h>
#include <ode/collision_space.h>
#include <ode/collision.h>

#include "collision_kernel.h"
#include "collision_space_internal.h"
45 // Reference counting helper for radix sort global data.
46 //static void RadixSortRef();
47 //static void RadixSortDeref();
50 // --------------------------------------------------------------------------
52 // --------------------------------------------------------------------------
54 struct RaixSortContext
57 RaixSortContext(): mCurrentSize(0), mCurrentUtilization(0), mRanksValid(false), mRanksBuffer(NULL
), mPrimaryRanks(NULL
) {}
58 ~RaixSortContext() { FreeRanks(); }
60 // OPCODE's Radix Sorting, returns a list of indices in sorted order
61 const uint32
* RadixSort( const float* input2
, uint32 nb
);
65 void AllocateRanks(sizeint nNewSize
);
67 void ReallocateRanksIfNecessary(sizeint nNewSize
);
70 void SetCurrentSize(sizeint nValue
) { mCurrentSize
= nValue
; }
71 sizeint
GetCurrentSize() const { return mCurrentSize
; }
73 void SetCurrentUtilization(sizeint nValue
) { mCurrentUtilization
= nValue
; }
74 sizeint
GetCurrentUtilization() const { return mCurrentUtilization
; }
76 uint32
*GetRanks1() const { return mPrimaryRanks
; }
77 uint32
*GetRanks2() const { return mRanksBuffer
+ ((mRanksBuffer
+ mCurrentSize
) - mPrimaryRanks
); }
78 void SwapRanks() { mPrimaryRanks
= GetRanks2(); }
80 bool AreRanksValid() const { return mRanksValid
; }
81 void InvalidateRanks() { mRanksValid
= false; }
82 void ValidateRanks() { mRanksValid
= true; }
85 sizeint mCurrentSize
; //!< Current size of the indices list
86 sizeint mCurrentUtilization
; //!< Current utilization of the indices list
88 uint32
* mRanksBuffer
; //!< Two lists allocated sequentially in a single block
89 uint32
* mPrimaryRanks
;
92 void RaixSortContext::AllocateRanks(sizeint nNewSize
)
94 dIASSERT(GetCurrentSize() == 0);
96 mRanksBuffer
= new uint32
[2 * nNewSize
];
97 mPrimaryRanks
= mRanksBuffer
;
99 SetCurrentSize(nNewSize
);
102 void RaixSortContext::FreeRanks()
106 delete[] mRanksBuffer
;
109 void RaixSortContext::ReallocateRanksIfNecessary(sizeint nNewSize
)
111 sizeint nCurUtilization
= GetCurrentUtilization();
113 if (nNewSize
!= nCurUtilization
)
115 sizeint nCurSize
= GetCurrentSize();
117 if ( nNewSize
> nCurSize
)
119 // Free previously used ram
122 // Get some fresh one
123 AllocateRanks(nNewSize
);
127 SetCurrentUtilization(nNewSize
);
131 // --------------------------------------------------------------------------
133 // --------------------------------------------------------------------------
135 struct dxSAPSpace
: public dxSpace
137 // Constructor / Destructor
138 dxSAPSpace( dSpaceID _space
, int sortaxis
);
142 virtual dxGeom
* getGeom(int i
);
143 virtual void add(dxGeom
* g
);
144 virtual void remove(dxGeom
* g
);
145 virtual void dirty(dxGeom
* g
);
146 virtual void computeAABB();
147 virtual void cleanGeoms();
148 virtual void collide( void *data
, dNearCallback
*callback
);
149 virtual void collide2( void *data
, dxGeom
*geom
, dNearCallback
*callback
);
153 //--------------------------------------------------------------------------
154 // Local Declarations
155 //--------------------------------------------------------------------------
157 //! A generic couple structure
160 uint32 id0
; //!< First index of the pair
161 uint32 id1
; //!< Second index of the pair
163 // Default and Value Constructor
165 Pair( uint32 i0
, uint32 i1
) : id0( i0
), id1( i1
) {}
168 //--------------------------------------------------------------------------
170 //--------------------------------------------------------------------------
173 * Complete box pruning.
174 * Returns a list of overlapping pairs of boxes, each box of the pair
175 * belongs to the same set.
177 * @param count [in] number of boxes.
178 * @param geoms [in] geoms of boxes.
179 * @param pairs [out] array of overlapping pairs.
181 void BoxPruning( int count
, const dxGeom
** geoms
, dArray
< Pair
>& pairs
);
184 //--------------------------------------------------------------------------
185 // Implementation Data
186 //--------------------------------------------------------------------------
188 // We have two lists (arrays of pointers) to dirty and clean
189 // geoms. Each geom knows it's index into the corresponding list
190 // (see macros above).
191 dArray
<dxGeom
*> DirtyList
; // dirty geoms
192 dArray
<dxGeom
*> GeomList
; // clean geoms
194 // For SAP, we ultimately separate "normal" geoms and the ones that have
195 // infinite AABBs. No point doing SAP on infinite ones (and it doesn't handle
196 // infinite geoms anyway).
197 dArray
<dxGeom
*> TmpGeomList
; // temporary for normal geoms
198 dArray
<dxGeom
*> TmpInfGeomList
; // temporary for geoms with infinite AABBs
200 // Our sorting axes. (X,Z,Y is often best). Stored *2 for minor speedup
201 // Axis indices into geom's aabb are: min=idx, max=idx+1
206 // pruning position array scratch pad
207 // NOTE: this is float not dReal because of the OPCODE radix sorter
208 dArray
< float > poslist
;
209 RaixSortContext sortContext
;
213 dSpaceID
dSweepAndPruneSpaceCreate( dxSpace
* space
, int axisorder
) {
214 return new dxSAPSpace( space
, axisorder
);
218 //==============================================================================
// A geom takes part in collision iff its enable-related flag bits match the test value.
#define GEOM_ENABLED(g) (((g)->gflags & GEOM_ENABLE_TEST_MASK) == GEOM_ENABLE_TEST_VALUE)

// HACK: We abuse 'next' and 'tome' members of dxGeom to store indices into dirty/geom lists.
#define GEOM_SET_DIRTY_IDX(g,idx) { (g)->next_ex = (dxGeom*)(sizeint)(idx); }
#define GEOM_SET_GEOM_IDX(g,idx) { (g)->tome_ex = (dxGeom**)(sizeint)(idx); }
#define GEOM_GET_DIRTY_IDX(g) ((int)(sizeint)(g)->next_ex)
#define GEOM_GET_GEOM_IDX(g) ((int)(sizeint)(g)->tome_ex)
#define GEOM_INVALID_IDX (-1)
231 * A bit of repetitive work - similar to collideAABBs, but doesn't check
232 * if AABBs intersect (because SAP returns pairs with overlapping AABBs).
234 static void collideGeomsNoAABBs( dxGeom
*g1
, dxGeom
*g2
, void *data
, dNearCallback
*callback
)
236 dIASSERT( (g1
->gflags
& GEOM_AABB_BAD
)==0 );
237 dIASSERT( (g2
->gflags
& GEOM_AABB_BAD
)==0 );
239 // no contacts if both geoms on the same body, and the body is not 0
240 if (g1
->body
== g2
->body
&& g1
->body
) return;
242 // test if the category and collide bitfields match
243 if ( ((g1
->category_bits
& g2
->collide_bits
) ||
244 (g2
->category_bits
& g1
->collide_bits
)) == 0) {
248 dReal
*bounds1
= g1
->aabb
;
249 dReal
*bounds2
= g2
->aabb
;
251 // check if either object is able to prove that it doesn't intersect the
253 if (g1
->AABBTest (g2
,bounds2
) == 0) return;
254 if (g2
->AABBTest (g1
,bounds1
) == 0) return;
256 // the objects might actually intersect - call the space callback function
257 callback (data
,g1
,g2
);
261 dxSAPSpace::dxSAPSpace( dSpaceID _space
, int axisorder
) : dxSpace( _space
)
263 type
= dSweepAndPruneSpaceClass
;
265 // Init AABB to infinity
266 aabb
[0] = -dInfinity
;
268 aabb
[2] = -dInfinity
;
270 aabb
[4] = -dInfinity
;
273 ax0idx
= ( ( axisorder
) & 3 ) << 1;
274 ax1idx
= ( ( axisorder
>> 2 ) & 3 ) << 1;
275 ax2idx
= ( ( axisorder
>> 4 ) & 3 ) << 1;
278 dxSAPSpace::~dxSAPSpace()
280 CHECK_NOT_LOCKED(this);
282 // note that destroying each geom will call remove()
283 for ( ; DirtyList
.size(); dGeomDestroy( DirtyList
[ 0 ] ) ) {}
284 for ( ; GeomList
.size(); dGeomDestroy( GeomList
[ 0 ] ) ) {}
288 for ( ; DirtyList
.size(); remove( DirtyList
[ 0 ] ) ) {}
289 for ( ; GeomList
.size(); remove( GeomList
[ 0 ] ) ) {}
293 dxGeom
* dxSAPSpace::getGeom( int i
)
295 dUASSERT( i
>= 0 && i
< count
, "index out of range" );
296 int dirtySize
= DirtyList
.size();
300 return GeomList
[i
-dirtySize
];
303 void dxSAPSpace::add( dxGeom
* g
)
305 CHECK_NOT_LOCKED (this);
307 dUASSERT(g
->tome_ex
== 0 && g
->next_ex
== 0, "geom is already in a space");
311 GEOM_SET_DIRTY_IDX( g
, DirtyList
.size() );
312 GEOM_SET_GEOM_IDX( g
, GEOM_INVALID_IDX
);
318 void dxSAPSpace::remove( dxGeom
* g
)
320 CHECK_NOT_LOCKED(this);
322 dUASSERT(g
->parent_space
== this,"object is not in this space");
325 int dirtyIdx
= GEOM_GET_DIRTY_IDX(g
);
326 int geomIdx
= GEOM_GET_GEOM_IDX(g
);
327 // must be in one list, not in both
329 (dirtyIdx
==GEOM_INVALID_IDX
&& geomIdx
>=0 && geomIdx
<GeomList
.size()) ||
330 (geomIdx
==GEOM_INVALID_IDX
&& dirtyIdx
>=0 && dirtyIdx
<DirtyList
.size()),
331 "geom indices messed up" );
332 if( dirtyIdx
!= GEOM_INVALID_IDX
) {
333 // we're in dirty list, remove
334 int dirtySize
= DirtyList
.size();
335 if (dirtyIdx
!= dirtySize
-1) {
336 dxGeom
* lastG
= DirtyList
[dirtySize
-1];
337 DirtyList
[dirtyIdx
] = lastG
;
338 GEOM_SET_DIRTY_IDX(lastG
,dirtyIdx
);
340 GEOM_SET_DIRTY_IDX(g
,GEOM_INVALID_IDX
);
341 DirtyList
.setSize( dirtySize
-1 );
343 // we're in geom list, remove
344 int geomSize
= GeomList
.size();
345 if (geomIdx
!= geomSize
-1) {
346 dxGeom
* lastG
= GeomList
[geomSize
-1];
347 GeomList
[geomIdx
] = lastG
;
348 GEOM_SET_GEOM_IDX(lastG
,geomIdx
);
350 GEOM_SET_GEOM_IDX(g
,GEOM_INVALID_IDX
);
351 GeomList
.setSize( geomSize
-1 );
355 dUASSERT((g
->next_ex
= 0, true), "Needed for an assertion check only");
360 void dxSAPSpace::dirty( dxGeom
* g
)
363 dUASSERT(g
->parent_space
== this, "object is not in this space");
365 // check if already dirtied
366 int dirtyIdx
= GEOM_GET_DIRTY_IDX(g
);
367 if( dirtyIdx
!= GEOM_INVALID_IDX
)
370 int geomIdx
= GEOM_GET_GEOM_IDX(g
);
371 dUASSERT( geomIdx
>=0 && geomIdx
<GeomList
.size(), "geom indices messed up" );
373 // remove from geom list, place last in place of this
374 int geomSize
= GeomList
.size();
375 if (geomIdx
!= geomSize
-1) {
376 dxGeom
* lastG
= GeomList
[geomSize
-1];
377 GeomList
[geomIdx
] = lastG
;
378 GEOM_SET_GEOM_IDX(lastG
,geomIdx
);
380 GeomList
.setSize( geomSize
-1 );
383 GEOM_SET_GEOM_IDX( g
, GEOM_INVALID_IDX
);
384 GEOM_SET_DIRTY_IDX( g
, DirtyList
.size() );
388 void dxSAPSpace::computeAABB()
393 void dxSAPSpace::cleanGeoms()
395 int dirtySize
= DirtyList
.size();
399 // compute the AABBs of all dirty geoms, clear the dirty flags,
400 // remove from dirty list, place into geom list
403 int geomSize
= GeomList
.size();
404 GeomList
.setSize( geomSize
+ dirtySize
); // ensure space in geom list
406 for( int i
= 0; i
< dirtySize
; ++i
) {
407 dxGeom
* g
= DirtyList
[i
];
409 ((dxSpace
*)g
)->cleanGeoms();
413 dIASSERT((g
->gflags
& GEOM_AABB_BAD
) == 0);
415 g
->gflags
&= ~GEOM_DIRTY
;
417 // remove from dirty list, add to geom list
418 GEOM_SET_DIRTY_IDX( g
, GEOM_INVALID_IDX
);
419 GEOM_SET_GEOM_IDX( g
, geomSize
+ i
);
420 GeomList
[geomSize
+i
] = g
;
423 DirtyList
.setSize( 0 );
428 void dxSAPSpace::collide( void *data
, dNearCallback
*callback
)
436 // by now all geoms are in GeomList, and DirtyList must be empty
437 int geom_count
= GeomList
.size();
438 dUASSERT( geom_count
== count
, "geom counts messed up" );
440 // separate all ENABLED geoms into infinite AABBs and normal AABBs
441 TmpGeomList
.setSize(0);
442 TmpInfGeomList
.setSize(0);
443 int axis0max
= ax0idx
+ 1;
444 for( int i
= 0; i
< geom_count
; ++i
) {
445 dxGeom
* g
= GeomList
[i
];
446 if( !GEOM_ENABLED(g
) ) // skip disabled ones
448 const dReal
& amax
= g
->aabb
[axis0max
];
449 if( amax
== dInfinity
) // HACK? probably not...
450 TmpInfGeomList
.push( g
);
452 TmpGeomList
.push( g
);
455 // do SAP on normal AABBs
456 dArray
< Pair
> overlapBoxes
;
457 int tmp_geom_count
= TmpGeomList
.size();
458 if ( tmp_geom_count
> 0 )
460 // Generate a list of overlapping boxes
461 BoxPruning( tmp_geom_count
, (const dxGeom
**)TmpGeomList
.data(), overlapBoxes
);
464 // collide overlapping
465 int overlapCount
= overlapBoxes
.size();
466 for( int j
= 0; j
< overlapCount
; ++j
)
468 const Pair
& pair
= overlapBoxes
[ j
];
469 dxGeom
* g1
= TmpGeomList
[ pair
.id0
];
470 dxGeom
* g2
= TmpGeomList
[ pair
.id1
];
471 collideGeomsNoAABBs( g1
, g2
, data
, callback
);
474 int infSize
= TmpInfGeomList
.size();
475 int normSize
= TmpGeomList
.size();
478 for ( m
= 0; m
< infSize
; ++m
)
480 dxGeom
* g1
= TmpInfGeomList
[ m
];
482 // collide infinite ones
483 for( n
= m
+1; n
< infSize
; ++n
) {
484 dxGeom
* g2
= TmpInfGeomList
[n
];
485 collideGeomsNoAABBs( g1
, g2
, data
, callback
);
488 // collide infinite ones with normal ones
489 for( n
= 0; n
< normSize
; ++n
) {
490 dxGeom
* g2
= TmpGeomList
[n
];
491 collideGeomsNoAABBs( g1
, g2
, data
, callback
);
498 void dxSAPSpace::collide2( void *data
, dxGeom
*geom
, dNearCallback
*callback
)
500 dAASSERT (geom
&& callback
);
502 // TODO: This is just a simple N^2 implementation
507 geom
->recomputeAABB();
509 // intersect bounding boxes
510 int geom_count
= GeomList
.size();
511 for ( int i
= 0; i
< geom_count
; ++i
) {
512 dxGeom
* g
= GeomList
[i
];
513 if ( GEOM_ENABLED(g
) )
514 collideAABBs (g
,geom
,data
,callback
);
521 void dxSAPSpace::BoxPruning( int count
, const dxGeom
** geoms
, dArray
< Pair
>& pairs
)
523 // Size the poslist (+1 for infinity end cap)
524 poslist
.setSize( count
);
526 // 1) Build main list using the primary axis
527 // NOTE: uses floats instead of dReals because that's what radix sort wants
528 for( int i
= 0; i
< count
; ++i
)
529 poslist
[ i
] = (float)TmpGeomList
[i
]->aabb
[ ax0idx
];
532 const uint32
* Sorted
= sortContext
.RadixSort( poslist
.data(), count
);
535 const uint32
* const LastSorted
= Sorted
+ count
;
536 const uint32
* RunningAddress
= Sorted
;
540 while ( Sorted
< LastSorted
)
542 IndexPair
.id0
= *Sorted
++;
544 // empty, this loop just advances RunningAddress
545 for (bExitLoop
= false; poslist
[*RunningAddress
++] < poslist
[IndexPair
.id0
]; )
547 if (RunningAddress
== LastSorted
)
554 if ( bExitLoop
|| RunningAddress
== LastSorted
) // Not a bug!!!
559 const float idx0ax0max
= (float)geoms
[IndexPair
.id0
]->aabb
[ax0idx
+1]; // To avoid wrong decisions caused by rounding errors, cast the AABB element to float similarly as we did at the function beginning
560 const dReal idx0ax1max
= geoms
[IndexPair
.id0
]->aabb
[ax1idx
+1];
561 const dReal idx0ax2max
= geoms
[IndexPair
.id0
]->aabb
[ax2idx
+1];
563 for (const uint32
* RunningAddress2
= RunningAddress
; poslist
[ IndexPair
.id1
= *RunningAddress2
++ ] <= idx0ax0max
; )
565 const dReal
* aabb0
= geoms
[ IndexPair
.id0
]->aabb
;
566 const dReal
* aabb1
= geoms
[ IndexPair
.id1
]->aabb
;
569 if ( idx0ax1max
>= aabb1
[ax1idx
] && aabb1
[ax1idx
+1] >= aabb0
[ax1idx
]
570 && idx0ax2max
>= aabb1
[ax2idx
] && aabb1
[ax2idx
+1] >= aabb0
[ax2idx
] )
572 pairs
.push( IndexPair
);
575 if (RunningAddress2
== LastSorted
)
581 } // while ( RunningAddress < LastSorted && Sorted < LastSorted )
585 //==============================================================================
587 //------------------------------------------------------------------------------
589 //------------------------------------------------------------------------------
// Expands to the per-pass validity check used by RadixSort: declares
// 'CurCount' (the histogram slice for this pass) and 'PerformPass', which
// is cleared when every input value carries the same byte at this position
// (sorting on that byte would be a no-op).
#define CHECK_PASS_VALIDITY(pass)                                                           \
    /* Shortcut to current counters */                                                      \
    const uint32* CurCount = &mHistogram[pass<<8];                                          \
                                                                                            \
    /* Reset flag. The sorting pass is supposed to be performed. (default) */               \
    bool PerformPass = true;                                                                \
                                                                                            \
    /* Check pass validity */                                                               \
                                                                                            \
    /* If all values have the same byte, sorting is useless. */                             \
    /* It may happen when sorting bytes or words instead of dwords. */                      \
    /* This routine actually sorts words faster than dwords, and bytes */                   \
    /* faster than words. Standard running time (O(4*n))is reduced to O(2*n) */             \
    /* for words and O(n) for bytes. Running time for floats depends on actual values... */ \
                                                                                            \
    /* Get first byte */                                                                    \
    uint8 UniqueVal = *(((const uint8*)input)+pass);                                        \
                                                                                            \
    /* Check that byte's counter */                                                         \
    if(CurCount[UniqueVal]==nb) PerformPass=false;
614 // WARNING ONLY SORTS IEEE FLOATING-POINT VALUES
615 const uint32
* RaixSortContext::RadixSort( const float* input2
, uint32 nb
)
617 union _type_cast_union
619 _type_cast_union(const float *floats
): asFloats(floats
) {}
620 _type_cast_union(const uint32
*uints32
): asUInts32(uints32
) {}
622 const float *asFloats
;
623 const uint32
*asUInts32
;
624 const uint8
*asUInts8
;
627 const uint32
* input
= _type_cast_union(input2
).asUInts32
;
629 // Resize lists if needed
630 ReallocateRanksIfNecessary(nb
);
632 // Allocate histograms & offsets on the stack
633 uint32 mHistogram
[256*4];
636 // Create histograms (counters). Counters for all passes are created in one run.
637 // Pros: read input buffer once instead of four times
638 // Cons: mHistogram is 4Kb instead of 1Kb
639 // Floating-point values are always supposed to be signed values, so there's only one code path there.
640 // Please note the floating point comparison needed for temporal coherence! Although the resulting asm code
641 // is dreadful, this is surprisingly not such a performance hit - well, I suppose that's a big one on first
642 // generation Pentiums....We can't make comparison on integer representations because, as Chris said, it just
643 // wouldn't work with mixed positive/negative values....
645 /* Clear counters/histograms */
646 memset(mHistogram
, 0, 256*4*sizeof(uint32
));
648 /* Prepare to count */
649 const uint8
* p
= _type_cast_union(input
).asUInts8
;
650 const uint8
* pe
= &p
[nb
*4];
651 uint32
* h0
= &mHistogram
[0]; /* Histogram for first pass (LSB) */
652 uint32
* h1
= &mHistogram
[256]; /* Histogram for second pass */
653 uint32
* h2
= &mHistogram
[512]; /* Histogram for third pass */
654 uint32
* h3
= &mHistogram
[768]; /* Histogram for last pass (MSB) */
656 bool AlreadySorted
= true; /* Optimism... */
658 if (!AreRanksValid())
660 /* Prepare for temporal coherence */
661 const float* Running
= input2
;
662 float PrevVal
= *Running
;
666 /* Read input input2 in previous sorted order */
667 float Val
= *Running
++;
668 /* Check whether already sorted or not */
669 if(Val
<PrevVal
) { AlreadySorted
= false; break; } /* Early out */
670 /* Update for next iteration */
673 /* Create histograms */
674 h0
[*p
++]++; h1
[*p
++]++; h2
[*p
++]++; h3
[*p
++]++;
677 /* If all input values are already sorted, we just have to return and leave the */
678 /* previous list unchanged. That way the routine may take advantage of temporal */
679 /* coherence, for example when used to sort transparent faces. */
682 uint32
* const Ranks1
= GetRanks1();
683 for(uint32 i
=0;i
<nb
;i
++) Ranks1
[i
] = i
;
689 /* Prepare for temporal coherence */
690 uint32
* const Ranks1
= GetRanks1();
692 uint32
* Indices
= Ranks1
;
693 float PrevVal
= input2
[*Indices
];
697 /* Read input input2 in previous sorted order */
698 float Val
= input2
[*Indices
++];
699 /* Check whether already sorted or not */
700 if(Val
<PrevVal
) { AlreadySorted
= false; break; } /* Early out */
701 /* Update for next iteration */
704 /* Create histograms */
705 h0
[*p
++]++; h1
[*p
++]++; h2
[*p
++]++; h3
[*p
++]++;
708 /* If all input values are already sorted, we just have to return and leave the */
709 /* previous list unchanged. That way the routine may take advantage of temporal */
710 /* coherence, for example when used to sort transparent faces. */
711 if(AlreadySorted
) { return Ranks1
; }
714 /* Else there has been an early out and we must finish computing the histograms */
717 /* Create histograms without the previous overhead */
718 h0
[*p
++]++; h1
[*p
++]++; h2
[*p
++]++; h3
[*p
++]++;
722 // Compute #negative values involved if needed
723 uint32 NbNegativeValues
= 0;
725 // An efficient way to compute the number of negatives values we'll have to deal with is simply to sum the 128
726 // last values of the last histogram. Last histogram because that's the one for the Most Significant Byte,
727 // responsible for the sign. 128 last values because the 128 first ones are related to positive numbers.
728 uint32
* h3
= &mHistogram
[768];
729 for(uint32 i
=128;i
<256;i
++) NbNegativeValues
+= h3
[i
]; // 768 for last histogram, 128 for negative part
731 // Radix sort, j is the pass number (0=LSB, 3=MSB)
732 for(uint32 j
=0;j
<4;j
++)
734 // Should we care about negative values?
737 // Here we deal with positive values only
738 CHECK_PASS_VALIDITY(j
);
742 uint32
* const Ranks2
= GetRanks2();
745 for(uint32 i
=1;i
<256;i
++) mLink
[i
] = mLink
[i
-1] + CurCount
[i
-1];
747 // Perform Radix Sort
748 const uint8
* InputBytes
= _type_cast_union(input
).asUInts8
;
750 if (!AreRanksValid())
752 for(uint32 i
=0;i
<nb
;i
++)
754 *mLink
[InputBytes
[i
<<2]]++ = i
;
761 uint32
* const Ranks1
= GetRanks1();
763 uint32
* Indices
= Ranks1
;
764 uint32
* const IndicesEnd
= Ranks1
+ nb
;
765 while(Indices
!=IndicesEnd
)
767 uint32 id
= *Indices
++;
768 *mLink
[InputBytes
[id
<<2]]++ = id
;
772 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
778 // This is a special case to correctly handle negative values
779 CHECK_PASS_VALIDITY(j
);
783 uint32
* const Ranks2
= GetRanks2();
785 // Create biased offsets, in order for negative numbers to be sorted as well
786 mLink
[0] = Ranks2
+ NbNegativeValues
; // First positive number takes place after the negative ones
787 for(uint32 i
=1;i
<128;i
++) mLink
[i
] = mLink
[i
-1] + CurCount
[i
-1]; // 1 to 128 for positive numbers
789 // We must reverse the sorting order for negative numbers!
791 for(uint32 i
=0;i
<127;i
++) mLink
[254-i
] = mLink
[255-i
] + CurCount
[255-i
]; // Fixing the wrong order for negative values
792 for(uint32 i
=128;i
<256;i
++) mLink
[i
] += CurCount
[i
]; // Fixing the wrong place for negative values
794 // Perform Radix Sort
795 if (!AreRanksValid())
797 for(uint32 i
=0;i
<nb
;i
++)
799 uint32 Radix
= input
[i
]>>24; // Radix byte, same as above. AND is useless here (uint32).
800 // ### cmp to be killed. Not good. Later.
801 if(Radix
<128) *mLink
[Radix
]++ = i
; // Number is positive, same as above
802 else *(--mLink
[Radix
]) = i
; // Number is negative, flip the sorting order
809 uint32
* const Ranks1
= GetRanks1();
811 for(uint32 i
=0;i
<nb
;i
++)
813 uint32 Radix
= input
[Ranks1
[i
]]>>24; // Radix byte, same as above. AND is useless here (uint32).
814 // ### cmp to be killed. Not good. Later.
815 if(Radix
<128) *mLink
[Radix
]++ = Ranks1
[i
]; // Number is positive, same as above
816 else *(--mLink
[Radix
]) = Ranks1
[i
]; // Number is negative, flip the sorting order
819 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
824 // The pass is useless, yet we still have to reverse the order of current list if all values are negative.
827 if (!AreRanksValid())
829 uint32
* const Ranks2
= GetRanks2();
831 for(uint32 i
=0;i
<nb
;i
++)
840 uint32
* const Ranks1
= GetRanks1();
841 uint32
* const Ranks2
= GetRanks2();
842 for(uint32 i
=0;i
<nb
;i
++) Ranks2
[i
] = Ranks1
[nb
-i
-1];
845 // Swap pointers for next pass. Valid indices - the most recent ones - are in mRanks after the swap.
853 uint32
* const Ranks1
= GetRanks1();