ode/src/lcp.cpp

   1 /*************************************************************************
   2  *                                                                       *
   3  * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
   4  * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
   5  *                                                                       *
   6  * This library is free software; you can redistribute it and/or         *
   7  * modify it under the terms of EITHER:                                  *
   8  *   (1) The GNU Lesser General Public License as published by the Free  *
   9  *       Software Foundation; either version 2.1 of the License, or (at  *
  10  *       your option) any later version. The text of the GNU Lesser      *
  11  *       General Public License is included with this library in the     *
  12  *       file LICENSE.TXT.                                               *
  13  *   (2) The BSD-style license that is included with this library in     *
  14  *       the file LICENSE-BSD.TXT.                                       *
  15  *                                                                       *
  16  * This library is distributed in the hope that it will be useful,       *
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
  19  * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
  20  *                                                                       *
  21  *************************************************************************/
  22
  23 /*
  24
  25
  26 THE ALGORITHM
  27 -------------
  28
  29 solve A*x = b+w, with x and w subject to certain LCP conditions.
  30 each x(i),w(i) must lie on one of the three line segments in the following
  31 diagram. each line segment corresponds to one index set :
  32
  33      w(i)
  34      /|\      |           :
  35       |       |           :
  36       |       |i in N     :
  37   w>0 |       |state[i]=0 :
  38       |       |           :
  39       |       |           :  i in C
  40   w=0 +       +-----------------------+
  41       |                   :           |
  42       |                   :           |
  43   w<0 |                   :           |i in N
  44       |                   :           |state[i]=1
  45       |                   :           |
  46       |                   :           |
  47       +-------|-----------|-----------|----------> x(i)
  48              lo           0           hi
  49
  50 the Dantzig algorithm proceeds as follows:
  51   for i=1:n
  52     * if (x(i),w(i)) is not on the line, push x(i) and w(i) positive or
  53       negative towards the line. as this is done, the other (x(j),w(j))
  54       for j<i are constrained to be on the line. if any (x,w) reaches the
  55       end of a line segment then it is switched between index sets.
  56     * i is added to the appropriate index set depending on what line segment
  57       it hits.
  58
  59 we restrict lo(i) <= 0 and hi(i) >= 0. this makes the algorithm a bit
  60 simpler, because the starting point for x(i),w(i) is always on the dotted
  61 line x=0 and x will only ever increase in one direction, so it can only hit
  62 two out of the three line segments.
  63
  64
  65 NOTES
  66 -----
  67
  68 this is an implementation of "lcp_dantzig2_ldlt.m" and "lcp_dantzig_lohi.m".
  69 the implementation is split into an LCP problem object (dLCP) and an LCP
  70 driver function. most optimization occurs in the dLCP object.
  71
  72 a naive implementation of the algorithm requires either a lot of data motion
  73 or a lot of permutation-array lookup, because we are constantly re-ordering
  74 rows and columns. to avoid this and make a more optimized algorithm, a
  75 non-trivial data structure is used to represent the matrix A (this is
  76 implemented in the fast version of the dLCP object).
  77
  78 during execution of this algorithm, some indexes in A are clamped (set C),
  79 some are non-clamped (set N), and some are "don't care" (where x=0).
  80 A,x,b,w (and other problem vectors) are permuted such that the clamped
  81 indexes are first, the unclamped indexes are next, and the don't-care
  82 indexes are last. this permutation is recorded in the array `p'.
  83 initially p = 0..n-1, and as the rows and columns of A,x,b,w are swapped,
  84 the corresponding elements of p are swapped.
  85
  86 because the C and N elements are grouped together in the rows of A, we can do
  87 lots of work with a fast dot product function. if A,x,etc were not permuted
  88 and we only had a permutation array, then those dot products would be much
  89 slower as we would have a permutation array lookup in some inner loops.
  90
  91 A is accessed through an array of row pointers, so that element (i,j) of the
  92 permuted matrix is A[i][j]. this makes row swapping fast. for column swapping
  93 we still have to actually move the data.
  94
  95 during execution of this algorithm we maintain an L*D*L' factorization of
  96 the clamped submatrix of A (call it `AC') which is the top left nC*nC
  97 submatrix of A. there are two ways we could arrange the rows/columns in AC.
  98
  99 (1) AC is always permuted such that L*D*L' = AC. this causes a problem
 100 when a row/column is removed from C, because then all the rows/columns of A
 101 between the deleted index and the end of C need to be rotated downward.
 102 this results in a lot of data motion and slows things down.
 103 (2) L*D*L' is actually a factorization of a *permutation* of AC (which is
 104 itself a permutation of the underlying A). this is what we do - the
 105 permutation is recorded in the vector C. call this permutation A[C,C].
 106 when a row/column is removed from C, all we have to do is swap two
 107 rows/columns and manipulate C.
 108
 109 */
 110
 111 #include <ode/common.h>
 112 #include <ode/misc.h>
 113 #include <ode/timer.h>          // for testing
 114 #include "config.h"
 115 #include "lcp.h"
 116 #include "util.h"
 117 #include "matrix.h"
 118 #include "mat.h"                // for testing
 119 #include "threaded_solver_ldlt.h"
 120
 121 #include "fastdot_impl.h"
 122 #include "fastldltfactor_impl.h"
 123 #include "fastldltsolve_impl.h"
 124
 125
 126 //***************************************************************************
 127 // code generation parameters
 128
 129 // LCP debugging (mostly for fast dLCP) - this slows things down a lot
 130 //#define DEBUG_LCP
 131
 132 #define dLCP_FAST               // use fast dLCP object
 133
 134 #define NUB_OPTIMIZATIONS // use NUB optimizations
 135
 136
 137 // option 1 : matrix row pointers (less data copying)
 138 #define ROWPTRS
 139 #define ATYPE dReal **
 140 #define AROW(i) (m_A[i])
 141
 142 // option 2 : no matrix row pointers (slightly faster inner loops)
 143 //#define NOROWPTRS
 144 //#define ATYPE dReal *
 145 //#define AROW(i) (m_A+(i)*m_nskip)
 146
 147
 148 //***************************************************************************
 149
 150 #define dMIN(A,B)  ((A)>(B) ? (B) : (A))
 151 #define dMAX(A,B)  ((B)>(A) ? (B) : (A))
 152
 153
 154 #define LMATRIX_ALIGNMENT       dMAX(64, EFFICIENT_ALIGNMENT)
 155
 156 //***************************************************************************
 157
 158
 159 // transfer b-values to x-values
 160 template<bool zero_b>
 161 inline
 162 void transfer_b_to_x(dReal pairsbx[PBX__MAX], unsigned n)
 163 {
 164     dReal *const endbx = pairsbx + (sizeint)n * PBX__MAX;
 165     for (dReal *currbx = pairsbx; currbx != endbx; currbx += PBX__MAX) {
 166         currbx[PBX_X] = currbx[PBX_B];
 167         if (zero_b) {
 168             currbx[PBX_B] = REAL(0.0);
 169         }
 170     }
 171 }
 172
 173 // swap row/column i1 with i2 in the n*n matrix A. the leading dimension of
 174 // A is nskip. this only references and swaps the lower triangle.
 175 // if `do_fast_row_swaps' is nonzero and row pointers are being used, then
 176 // rows will be swapped by exchanging row pointers. otherwise the data will
 177 // be copied.
 178
 179 static
 180 void swapRowsAndCols (ATYPE A, unsigned n, unsigned i1, unsigned i2, unsigned nskip,
 181                              int do_fast_row_swaps)
 182 {
 183     dAASSERT (A && n > 0 && i1 >= 0 && i2 >= 0 && i1 < n && i2 < n &&
 184         nskip >= n && i1 < i2);
 185
 186 # ifdef ROWPTRS
 187     dReal *A_i1 = A[i1];
 188     dReal *A_i2 = A[i2];
 189     for (unsigned i=i1+1; i<i2; ++i) {
 190         dReal *A_i_i1 = A[i] + i1;
 191         A_i1[i] = *A_i_i1;
 192         *A_i_i1 = A_i2[i];
 193     }
 194     A_i1[i2] = A_i1[i1];
 195     A_i1[i1] = A_i2[i1];
 196     A_i2[i1] = A_i2[i2];
 197     // swap rows, by swapping row pointers
 198     if (do_fast_row_swaps) {
 199         A[i1] = A_i2;
 200         A[i2] = A_i1;
 201     }
 202     else {
 203         // Only swap till i2 column to match A plain storage variant.
 204         for (unsigned k = 0; k <= i2; ++k) {
 205             dxSwap(A_i1[k], A_i2[k]);
 206         }
 207     }
 208     // swap columns the hard way
 209     for (unsigned j = i2 + 1; j < n; ++j) {
 210         dReal *A_j = A[j];
 211         dxSwap(A_j[i1], A_j[i2]);
 212     }
 213 # else
 214     dReal *A_i1 = A + (sizeint)nskip * i1;
 215     dReal *A_i2 = A + (sizeint)nskip * i2;
 216
 217     for (unsigned k = 0; k < i1; ++k) {
 218         dxSwap(A_i1[k], A_i2[k]);
 219     }
 220
 221     dReal *A_i = A_i1 + nskip;
 222     for (unsigned i= i1 + 1; i < i2; A_i += nskip, ++i) {
 223         dxSwap(A_i2[i], A_i[i1]);
 224     }
 225
 226     dxSwap(A_i1[i1], A_i2[i2]);
 227
 228     dReal *A_j = A_i2 + nskip;
 229     for (unsigned j = i2 + 1; j < n; A_j += nskip, ++j) {
 230         dxSwap(A_j[i1], A_j[i2]);
 231     }
 232 # endif
 233 }
 234
 235
 236 // swap two indexes in the n*n LCP problem. i1 must be <= i2.
 237
 238 static
 239 void swapProblem (ATYPE A, dReal pairsbx[PBX__MAX], dReal *w, dReal pairslh[PLH__MAX],
 240                          unsigned *p, bool *state, int *findex,
 241                          unsigned n, unsigned i1, unsigned i2, unsigned nskip,
 242                          int do_fast_row_swaps)
 243 {
 244     dIASSERT (n>0 && i1 < n && i2 < n && nskip >= n && i1 <= i2);
 245
 246     if (i1 != i2) {
 247         swapRowsAndCols (A, n, i1, i2, nskip, do_fast_row_swaps);
 248
 249         dxSwap((pairsbx + (sizeint)i1 * PBX__MAX)[PBX_B], (pairsbx + (sizeint)i2 * PBX__MAX)[PBX_B]);
 250         dxSwap((pairsbx + (sizeint)i1 * PBX__MAX)[PBX_X], (pairsbx + (sizeint)i2 * PBX__MAX)[PBX_X]);
 251         dSASSERT(PBX__MAX == 2);
 252
 253         dxSwap(w[i1], w[i2]);
 254
 255         dxSwap((pairslh + (sizeint)i1 * PLH__MAX)[PLH_LO], (pairslh + (sizeint)i2 * PLH__MAX)[PLH_LO]);
 256         dxSwap((pairslh + (sizeint)i1 * PLH__MAX)[PLH_HI], (pairslh + (sizeint)i2 * PLH__MAX)[PLH_HI]);
 257         dSASSERT(PLH__MAX == 2);
 258
 259         dxSwap(p[i1], p[i2]);
 260         dxSwap(state[i1], state[i2]);
 261
 262         if (findex != NULL) {
 263             dxSwap(findex[i1], findex[i2]);
 264         }
 265     }
 266 }
 267
 268
 269 // for debugging - check that L,d is the factorization of A[C,C].
 270 // A[C,C] has size nC*nC and leading dimension nskip.
 271 // L has size nC*nC and leading dimension nskip.
 272 // d has size nC.
 273
 274 #ifdef DEBUG_LCP
 275
 276 static
 277 void checkFactorization (ATYPE A, dReal *_L, dReal *_d,
 278                                 unsigned nC, unsigned *C, unsigned nskip)
 279 {
 280     unsigned i, j;
 281     if (nC == 0) return;
 282
 283     // get A1=A, copy the lower triangle to the upper triangle, get A2=A[C,C]
 284     dMatrix A1 (nC, nC);
 285     for (i=0; i < nC; i++) {
 286         for (j = 0; j <= i; j++) A1(i, j) = A1(j, i) = AROW(i)[j];
 287     }
 288     dMatrix A2 = A1.select (nC, C, nC, C);
 289
 290     // printf ("A1=\n"); A1.print(); printf ("\n");
 291     // printf ("A2=\n"); A2.print(); printf ("\n");
 292
 293     // compute A3 = L*D*L'
 294     dMatrix L (nC, nC, _L, nskip, 1);
 295     dMatrix D (nC, nC);
 296     for (i = 0; i < nC; i++) D(i, i) = 1.0 / _d[i];
 297     L.clearUpperTriangle();
 298     for (i = 0; i < nC; i++) L(i, i) = 1;
 299     dMatrix A3 = L * D * L.transpose();
 300
 301     // printf ("L=\n"); L.print(); printf ("\n");
 302     // printf ("D=\n"); D.print(); printf ("\n");
 303     // printf ("A3=\n"); A2.print(); printf ("\n");
 304
 305     // compare A2 and A3
 306     dReal diff = A2.maxDifference (A3);
 307     if (diff > 1e-8)
 308         dDebug (0, "L*D*L' check, maximum difference = %.6e\n", diff);
 309 }
 310
 311 #endif
 312
 313
 314 // for debugging
 315
 316 #ifdef DEBUG_LCP
 317
 318 static
 319 void checkPermutations (unsigned i, unsigned n, unsigned nC, unsigned nN, unsigned *p, unsigned *C)
 320 {
 321     unsigned j,k;
 322     dIASSERT (/*nC >= 0 && nN >= 0 && */(nC + nN) == i && i < n);
 323     for (k=0; k<i; k++) dIASSERT (p[k] >= 0 && p[k] < i);
 324     for (k=i; k<n; k++) dIASSERT (p[k] == k);
 325     for (j=0; j<nC; j++) {
 326         int C_is_bad = 1;
 327         for (k=0; k<nC; k++) if (C[k]==j) C_is_bad = 0;
 328         dIASSERT (C_is_bad==0);
 329     }
 330 }
 331
 332 #endif
 333
 334 //***************************************************************************
 335 // dLCP manipulator object. this represents an n*n LCP problem.
 336 //
 337 // two index sets C and N are kept. each set holds a subset of
 338 // the variable indexes 0..n-1. an index can only be in one set.
 339 // initially both sets are empty.
 340 //
 341 // the index set C is special: solutions to A(C,C)\A(C,i) can be generated.
 342
 343 //***************************************************************************
 344 // fast implementation of dLCP. see the above definition of dLCP for
 345 // interface comments.
 346 //
 347 // `p' records the permutation of A,x,b,w,etc. p is initially 1:n and is
 348 // permuted as the other vectors/matrices are permuted.
 349 //
 350 // A,x,b,w,lo,hi,state,findex,p,c are permuted such that sets C,N have
 351 // contiguous indexes. the don't-care indexes follow N.
 352 //
 353 // an L*D*L' factorization is maintained of A(C,C), and whenever indexes are
 354 // added or removed from the set C the factorization is updated.
 355 // thus L*D*L'=A[C,C], i.e. a permuted top left nC*nC submatrix of A.
 356 // the leading dimension of the matrix L is always `nskip'.
 357 //
 358 // at the start there may be other indexes that are unbounded but are not
 359 // included in `nub'. dLCP will permute the matrix so that absolutely all
 360 // unbounded vectors are at the start. thus there may be some initial
 361 // permutation.
 362 //
 363 // the algorithms here assume certain patterns, particularly with respect to
 364 // index transfer.
 365
 366 #ifdef dLCP_FAST
 367
 368 struct dLCP {
 369     const unsigned m_n;
 370     const unsigned m_nskip;
 371     unsigned m_nub;
 372     unsigned m_nC, m_nN;                                // size of each index set
 373     ATYPE const m_A;                            // A rows
 374     dReal *const m_pairsbx, *const m_w, *const m_pairslh;       // permuted LCP problem data
 375     dReal *const m_L, *const m_d;                               // L*D*L' factorization of set C
 376     dReal *const m_Dell, *const m_ell, *const m_tmp;
 377     bool *const m_state;
 378     int *const m_findex;
 379     unsigned *const m_p, *const m_C;
 380
 381     dLCP (unsigned n, unsigned nskip, unsigned nub, dReal *Adata, dReal *pairsbx, dReal *w,
 382         dReal *pairslh, dReal *L, dReal *d,
 383         dReal *Dell, dReal *ell, dReal *tmp,
 384         bool *state, int *findex, unsigned *p, unsigned *C, dReal **Arows);
 385     unsigned getNub() const { return m_nub; }
 386     void transfer_i_to_C (unsigned i);
 387     void transfer_i_to_N (unsigned /*i*/) { m_nN++; }                   // because we can assume C and N span 1:i-1
 388     void transfer_i_from_N_to_C (unsigned i);
 389     void transfer_i_from_C_to_N (unsigned i, void *tmpbuf);
 390     static sizeint estimate_transfer_i_from_C_to_N_mem_req(unsigned nC, unsigned nskip) { return dEstimateLDLTRemoveTmpbufSize(nC, nskip); }
 391     unsigned numC() const { return m_nC; }
 392     unsigned numN() const { return m_nN; }
 393     unsigned indexC (unsigned i) const { return i; }
 394     unsigned indexN (unsigned i) const { return i+m_nC; }
 395     dReal Aii (unsigned i) const  { return AROW(i)[i]; }
 396     template<unsigned q_stride>
 397     dReal AiC_times_qC (unsigned i, dReal *q) const { return calculateLargeVectorDot<q_stride> (AROW(i), q, m_nC); }
 398     template<unsigned q_stride>
 399     dReal AiN_times_qN (unsigned i, dReal *q) const { return calculateLargeVectorDot<q_stride> (AROW(i) + m_nC, q + (sizeint)m_nC * q_stride, m_nN); }
 400     void pN_equals_ANC_times_qC (dReal *p, dReal *q);
 401     void pN_plusequals_ANi (dReal *p, unsigned i, bool dir_positive);
 402     template<unsigned p_stride>
 403     void pC_plusequals_s_times_qC (dReal *p, dReal s, dReal *q);
 404     void pN_plusequals_s_times_qN (dReal *p, dReal s, dReal *q);
 405     void solve1 (dReal *a, unsigned i, bool dir_positive, int only_transfer=0);
 406     void unpermute_X();
 407     void unpermute_W();
 408 };
 409
 410
 411 dLCP::dLCP (unsigned n, unsigned nskip, unsigned nub, dReal *Adata, dReal *pairsbx, dReal *w,
 412             dReal *pairslh, dReal *L, dReal *d,
 413             dReal *Dell, dReal *ell, dReal *tmp,
 414             bool *state, int *findex, unsigned *p, unsigned *C, dReal **Arows):
 415     m_n(n), m_nskip(nskip), m_nub(nub), m_nC(0), m_nN(0),
 416 # ifdef ROWPTRS
 417     m_A(Arows),
 418 #else
 419     m_A(Adata),
 420 #endif
 421     m_pairsbx(pairsbx), m_w(w), m_pairslh(pairslh),
 422     m_L(L), m_d(d), m_Dell(Dell), m_ell(ell), m_tmp(tmp),
 423     m_state(state), m_findex(findex), m_p(p), m_C(C)
 424 {
 425     dxtSetZero<PBX__MAX>(pairsbx + PBX_X, n);
 426
 427     {
 428 # ifdef ROWPTRS
 429         // make matrix row pointers
 430         dReal *aptr = Adata;
 431         ATYPE A = m_A;
 432         for (unsigned k=0; k<n; aptr+=nskip, ++k) A[k] = aptr;
 433 # endif
 434     }
 435
 436     {
 437         for (unsigned k=0; k != n; ++k) p[k] = k;               // initially unpermutted
 438     }
 439
 440     /*
 441     // for testing, we can do some random swaps in the area i > nub
 442     {
 443     if (nub < n) {
 444     for (unsigned k=0; k<100; k++) {
 445     unsigned i1,i2;
 446     do {
 447     i1 = dRandInt(n-nub)+nub;
 448     i2 = dRandInt(n-nub)+nub;
 449     }
 450     while (i1 > i2);
 451     //printf ("--> %d %d\n",i1,i2);
 452     swapProblem (m_A, m_pairsbx, m_w, m_pairslh, m_p, m_state, m_findex, n, i1, i2, m_nskip, 0);
 453     }
 454     }
 455     */
 456
 457     // permute the problem so that *all* the unbounded variables are at the
 458     // start, i.e. look for unbounded variables not included in `nub'. we can
 459     // potentially push up `nub' this way and get a bigger initial factorization.
 460     // note that when we swap rows/cols here we must not just swap row pointers,
 461     // as the initial factorization relies on the data being all in one chunk.
 462     // variables that have findex >= 0 are *not* considered to be unbounded even
 463     // if lo=-inf and hi=inf - this is because these limits may change during the
 464     // solution process.
 465
 466         unsigned currNub = nub;
 467     {
 468         for (unsigned k = currNub; k < n; ++k) {
 469             if (findex && findex[k] >= 0) continue;
 470             if ((pairslh + (sizeint)k * PLH__MAX)[PLH_LO] == -dInfinity && (pairslh + (sizeint)k * PLH__MAX)[PLH_HI] == dInfinity) {
 471                 swapProblem (m_A, m_pairsbx, m_w, pairslh, m_p, m_state, findex, n, currNub, k, nskip, 0);
 472                 m_nub = ++currNub;
 473             }
 474         }
 475     }
 476
 477     // if there are unbounded variables at the start, factorize A up to that
 478     // point and solve for x. this puts all indexes 0..currNub-1 into C.
 479     if (currNub > 0) {
 480         {
 481             dReal *Lrow = m_L;
 482             for (unsigned j = 0; j < currNub; Lrow += nskip, ++j) memcpy(Lrow, AROW(j), (j + 1) * sizeof(dReal));
 483         }
 484         transfer_b_to_x<false> (m_pairsbx, currNub);
 485         factorMatrixAsLDLT<1> (m_L, m_d, currNub, nskip);
 486         solveEquationSystemWithLDLT<1, PBX__MAX> (m_L, m_d, m_pairsbx + PBX_X, currNub, nskip);
 487         dSetZero (m_w, currNub);
 488         {
 489             unsigned *C = m_C;
 490             for (unsigned k = 0; k < currNub; ++k) C[k] = k;
 491         }
 492         m_nC = currNub;
 493     }
 494
 495     // permute the indexes > currNub such that all findex variables are at the end
 496     if (findex) {
 497         unsigned num_at_end = 0;
 498         for (unsigned k = m_n; k > currNub; ) {
 499             --k;
 500             if (findex[k] >= 0) {
 501                 swapProblem (m_A, m_pairsbx, m_w, m_pairslh, m_p, m_state, findex, m_n, k, m_n - 1 - num_at_end, nskip, 1);
 502                 num_at_end++;
 503             }
 504         }
 505     }
 506
 507     // print info about indexes
 508     /*
 509     {
 510     for (unsigned k=0; k<n; k++) {
 511     if (k<currNub) printf ("C");
 512     else if ((m_pairslh + (sizeint)k * PLH__MAX)[PLH_LO] == -dInfinity && (m_pairslh + (sizeint)k * PLH__MAX)[PLH_HI] == dInfinity) printf ("c");
 513     else printf (".");
 514     }
 515     printf ("\n");
 516     }
 517     */
 518 }
 519
 520
 521 void dLCP::transfer_i_to_C (unsigned i)
 522 {
 523     {
 524         const unsigned nC = m_nC;
 525
 526         if (nC > 0) {
 527             // ell,Dell were computed by solve1(). note, ell = D \ L1solve (L,A(i,C))
 528             dReal *const Ltgt = m_L + (sizeint)m_nskip * nC, *ell = m_ell;
 529             memcpy(Ltgt, ell, nC * sizeof(dReal));
 530
 531             dReal ell_Dell_dot = dxDot(m_ell, m_Dell, nC);
 532             dReal AROW_i_i = AROW(i)[i] != ell_Dell_dot ? AROW(i)[i] : dNextAfter(AROW(i)[i], dInfinity); // A hack to avoid getting a zero in the denominator
 533             m_d[nC] = dRecip (AROW_i_i - ell_Dell_dot);
 534         }
 535         else {
 536             m_d[0] = dRecip (AROW(i)[i]);
 537         }
 538
 539         swapProblem (m_A, m_pairsbx, m_w, m_pairslh, m_p, m_state, m_findex, m_n, nC, i, m_nskip, 1);
 540
 541         m_C[nC] = nC;
 542         m_nC = nC + 1; // nC value is outdated after this line
 543     }
 544
 545 # ifdef DEBUG_LCP
 546     checkFactorization (m_A, m_L, m_d, m_nC, m_C, m_nskip);
 547     if (i < (m_n-1)) checkPermutations (i+1, m_n, m_nC, m_nN, m_p, m_C);
 548 # endif
 549 }
 550
 551
 552 void dLCP::transfer_i_from_N_to_C (unsigned i)
 553 {
 554     {
 555         const unsigned nC = m_nC;
 556         if (nC > 0) {
 557             {
 558                 dReal *const aptr = AROW(i);
 559                 dReal *Dell = m_Dell;
 560                 const unsigned *C = m_C;
 561 #   ifdef NUB_OPTIMIZATIONS
 562                 // if nub>0, initial part of aptr unpermuted
 563                 const unsigned nub = m_nub;
 564                 unsigned j=0;
 565                 for ( ; j<nub; ++j) Dell[j] = aptr[j];
 566                 for ( ; j<nC; ++j) Dell[j] = aptr[C[j]];
 567 #   else
 568                 for (unsigned j=0; j<nC; ++j) Dell[j] = aptr[C[j]];
 569 #   endif
 570             }
 571             solveL1Straight<1>(m_L, m_Dell, nC, m_nskip);
 572
 573             dReal ell_Dell_dot = REAL(0.0);
 574             dReal *const Ltgt = m_L + (sizeint)m_nskip * nC;
 575             dReal *ell = m_ell, *Dell = m_Dell, *d = m_d;
 576             for (unsigned j = 0; j < nC; ++j) {
 577                 dReal ell_j, Dell_j = Dell[j];
 578                 Ltgt[j] = ell[j] = ell_j = Dell_j * d[j];
 579                 ell_Dell_dot += ell_j * Dell_j;
 580             }
 581
 582             dReal AROW_i_i = AROW(i)[i] != ell_Dell_dot ? AROW(i)[i] : dNextAfter(AROW(i)[i], dInfinity); // A hack to avoid getting a zero in the denominator
 583             m_d[nC] = dRecip (AROW_i_i - ell_Dell_dot);
 584         }
 585         else {
 586             m_d[0] = dRecip (AROW(i)[i]);
 587         }
 588
 589         swapProblem (m_A, m_pairsbx, m_w, m_pairslh, m_p, m_state, m_findex, m_n, nC, i, m_nskip, 1);
 590
 591         m_C[nC] = nC;
 592         m_nN--;
 593         m_nC = nC + 1; // nC value is outdated after this line
 594     }
 595
 596     // @@@ TO DO LATER
 597     // if we just finish here then we'll go back and re-solve for
 598     // delta_x. but actually we can be more efficient and incrementally
 599     // update delta_x here. but if we do this, we wont have ell and Dell
 600     // to use in updating the factorization later.
 601
 602 # ifdef DEBUG_LCP
 603     checkFactorization (m_A,m_L,m_d,m_nC,m_C,m_nskip);
 604 # endif
 605 }
 606
 607
 608 void dLCP::transfer_i_from_C_to_N (unsigned i, void *tmpbuf)
 609 {
 610     {
 611         unsigned *C = m_C;
 612         // remove a row/column from the factorization, and adjust the
 613         // indexes (black magic!)
 614         int last_idx = -1;
 615         const unsigned nC = m_nC;
 616         unsigned j = 0;
 617         for ( ; j < nC; ++j) {
 618             if (C[j] == nC - 1) {
 619                 last_idx = j;
 620             }
 621             if (C[j] == i) {
 622                 dxLDLTRemove (m_A, C, m_L, m_d, m_n, nC, j, m_nskip, tmpbuf);
 623                 unsigned k;
 624                 if (last_idx == -1) {
 625                     for (k = j + 1 ; k < nC; ++k) {
 626                         if (C[k] == nC - 1) {
 627                             break;
 628                         }
 629                     }
 630                     dIASSERT (k < nC);
 631                 }
 632                 else {
 633                     k = last_idx;
 634                 }
 635                 C[k] = C[j];
 636                 if (j != (nC - 1)) memmove (C + j, C + j + 1, (nC - j - 1) * sizeof(C[0]));
 637                 break;
 638             }
 639         }
 640         dIASSERT (j < nC);
 641
 642         swapProblem (m_A, m_pairsbx, m_w, m_pairslh, m_p, m_state, m_findex, m_n, i, nC - 1, m_nskip, 1);
 643
 644         m_nN++;
 645         m_nC = nC - 1; // nC value is outdated after this line
 646     }
 647
 648 # ifdef DEBUG_LCP
 649     checkFactorization (m_A, m_L, m_d, m_nC, m_C, m_nskip);
 650 # endif
 651 }
 652
 653
 654 void dLCP::pN_equals_ANC_times_qC (dReal *p, dReal *q)
 655 {
 656     // we could try to make this matrix-vector multiplication faster using
 657     // outer product matrix tricks, e.g. with the dMultidotX() functions.
 658     // but i tried it and it actually made things slower on random 100x100
 659     // problems because of the overhead involved. so we'll stick with the
 660     // simple method for now.
 661     const unsigned nC = m_nC;
 662     dReal *ptgt = p + nC;
 663     const unsigned nN = m_nN;
 664     for (unsigned i = 0; i < nN; ++i) {
 665         ptgt[i] = dxDot (AROW(i + nC), q, nC);
 666     }
 667 }
 668
 669
 670 void dLCP::pN_plusequals_ANi (dReal *p, unsigned i, bool dir_positive)
 671 {
 672     const unsigned nC = m_nC;
 673     dReal *aptr = AROW(i) + nC;
 674     dReal *ptgt = p + nC;
 675     if (dir_positive) {
 676         const unsigned nN = m_nN;
 677         for (unsigned j=0; j < nN; ++j) ptgt[j] += aptr[j];
 678     }
 679     else {
 680         const unsigned nN = m_nN;
 681         for (unsigned j=0; j < nN; ++j) ptgt[j] -= aptr[j];
 682     }
 683 }
 684
 685 template<unsigned p_stride>
 686 void dLCP::pC_plusequals_s_times_qC (dReal *p, dReal s, dReal *q)
 687 {
 688     const unsigned nC = m_nC;
 689     dReal *q_end = q + nC;
 690     for (; q != q_end; p += p_stride, ++q) {
 691         *p += s * (*q);
 692     }
 693 }
 694
 695 void dLCP::pN_plusequals_s_times_qN (dReal *p, dReal s, dReal *q)
 696 {
 697     const unsigned nC = m_nC;
 698     dReal *ptgt = p + nC, *qsrc = q + nC;
 699     const unsigned nN = m_nN;
 700     for (unsigned i = 0; i < nN; ++i) {
 701         ptgt[i] += s * qsrc[i];
 702     }
 703 }
 704
 705 void dLCP::solve1 (dReal *a, unsigned i, bool dir_positive, int only_transfer)
 706 {
 707     // the `Dell' and `ell' that are computed here are saved. if index i is
 708     // later added to the factorization then they can be reused.
 709     //
 710     // @@@ question: do we need to solve for entire delta_x??? yes, but
 711     //     only if an x goes below 0 during the step.
 712
 713     const unsigned nC = m_nC;
 714     if (nC > 0) {
 715         {
 716             dReal *Dell = m_Dell;
 717             unsigned *C = m_C;
 718             dReal *aptr = AROW(i);
 719 #   ifdef NUB_OPTIMIZATIONS
 720             // if nub>0, initial part of aptr[] is guaranteed unpermuted
 721             const unsigned nub = m_nub;
 722             unsigned j = 0;
 723             for ( ; j < nub; ++j) Dell[j] = aptr[j];
 724             for ( ; j < nC; ++j) Dell[j] = aptr[C[j]];
 725 #   else
 726             for (unsigned j = 0; j < nC; ++j) Dell[j] = aptr[C[j]];
 727 #   endif
 728         }
 729         solveL1Straight<1>(m_L, m_Dell, nC, m_nskip);
 730         {
 731             dReal *ell = m_ell, *Dell = m_Dell, *d = m_d;
 732             for (unsigned j = 0; j < nC; ++j) ell[j] = Dell[j] * d[j];
 733         }
 734
 735         if (!only_transfer) {
 736             dReal *tmp = m_tmp, *ell = m_ell;
 737             {
 738                 for (unsigned j = 0; j < nC; ++j) tmp[j] = ell[j];
 739             }
 740             solveL1Transposed<1>(m_L, tmp, nC, m_nskip);
 741             if (dir_positive) {
 742                 unsigned *C = m_C;
 743                 dReal *tmp = m_tmp;
 744                 for (unsigned j = 0; j < nC; ++j) a[C[j]] = -tmp[j];
 745             } else {
 746                 unsigned *C = m_C;
 747                 dReal *tmp = m_tmp;
 748                 for (unsigned j = 0; j < nC; ++j) a[C[j]] = tmp[j];
 749             }
 750         }
 751     }
 752 }
 753
 754
 755 void dLCP::unpermute_X()
 756 {
 757     unsigned *p = m_p;
 758     dReal *pairsbx = m_pairsbx;
 759     const unsigned n = m_n;
 760     for (unsigned j = 0; j < n; ++j) {
 761         unsigned k = p[j];
 762         if (k != j) {
 763             // p[j] = j; -- not going to be checked anymore anyway
 764             dReal x_j = (pairsbx + (sizeint)j * PBX__MAX)[PBX_X];
 765             for (;;) {
 766                 dxSwap(x_j, (pairsbx + (sizeint)k * PBX__MAX)[PBX_X]);
 767
 768                 unsigned orig_k = p[k];
 769                 p[k] = k;
 770                 if (orig_k == j) {
 771                     break;
 772                 }
 773                 k = orig_k;
 774             }
 775             (pairsbx + (sizeint)j * PBX__MAX)[PBX_X] = x_j;
 776         }
 777     }
 778 }
 779
 780 void dLCP::unpermute_W()
 781 {
 782     memcpy (m_tmp, m_w, m_n * sizeof(dReal));
 783
 784     const unsigned *p = m_p;
 785     dReal *w = m_w, *tmp = m_tmp;
 786     const unsigned n = m_n;
 787     for (unsigned j = 0; j < n; ++j) {
 788         unsigned k = p[j];
 789         w[k] = tmp[j];
 790     }
 791 }
 792
 793 #endif // dLCP_FAST
 794
 795
 796 static void dxSolveLCP_AllUnbounded (dxWorldProcessMemArena *memarena, unsigned n, dReal *A, dReal pairsbx[PBX__MAX]);
 797 static void dxSolveLCP_Generic (dxWorldProcessMemArena *memarena, unsigned n, dReal *A, dReal pairsbx[PBX__MAX],
 798                                 dReal *outer_w/*=NULL*/, unsigned nub, dReal pairslh[PLH__MAX], int *findex);
 799
 800 /*extern */
 801 void dxSolveLCP (dxWorldProcessMemArena *memarena, unsigned n, dReal *A, dReal pairsbx[PBX__MAX],
 802     dReal *outer_w/*=NULL*/, unsigned nub, dReal pairslh[PLH__MAX], int *findex)
 803 {
 804     if (nub >= n)
 805     {
 806         dxSolveLCP_AllUnbounded (memarena, n, A, pairsbx);
 807     }
 808     else
 809     {
 810         dxSolveLCP_Generic (memarena, n, A, pairsbx, outer_w, nub, pairslh, findex);
 811     }
 812 }
 813
 814 //***************************************************************************
 815 // if all the variables are unbounded then we can just factor, solve, and return
 816
 817 static
 818 void dxSolveLCP_AllUnbounded (dxWorldProcessMemArena *memarena, unsigned n, dReal *A, dReal pairsbx[PBX__MAX])
 819 {
 820     dAASSERT(A != NULL);
 821     dAASSERT(pairsbx != NULL);
 822     dAASSERT(n != 0);
 823
 824     transfer_b_to_x<true>(pairsbx, n);
 825
 826     unsigned nskip = dPAD(n);
 827     factorMatrixAsLDLT<PBX__MAX> (A, pairsbx + PBX_B, n, nskip);
 828     solveEquationSystemWithLDLT<PBX__MAX, PBX__MAX> (A, pairsbx + PBX_B, pairsbx + PBX_X, n, nskip);
 829 }
 830
 831 //***************************************************************************
 832 // an optimized Dantzig LCP driver routine for the lo-hi LCP problem.
 833
 834 static
 835 void dxSolveLCP_Generic (dxWorldProcessMemArena *memarena, unsigned n, dReal *A, dReal pairsbx[PBX__MAX],
 836     dReal *outer_w/*=NULL*/, unsigned nub, dReal pairslh[PLH__MAX], int *findex)
 837 {
 838     dAASSERT (n > 0 && A && pairsbx && pairslh && nub >= 0 && nub < n);
 839 # ifndef dNODEBUG
 840     {
 841         // check restrictions on lo and hi
 842         dReal *endlh = pairslh + (sizeint)n * PLH__MAX;
 843         for (dReal *currlh = pairslh; currlh != endlh; currlh += PLH__MAX) dIASSERT (currlh[PLH_LO] <= 0 && currlh[PLH_HI] >= 0);
 844     }
 845 # endif
 846
 847     const unsigned nskip = dPAD(n);
 848     dReal *L = memarena->AllocateOveralignedArray<dReal> ((sizeint)nskip * n, LMATRIX_ALIGNMENT);
 849     dReal *d = memarena->AllocateArray<dReal> (n);
 850     dReal *w = outer_w != NULL ? outer_w : memarena->AllocateArray<dReal> (n);
 851     dReal *delta_w = memarena->AllocateArray<dReal> (n);
 852     dReal *delta_x = memarena->AllocateArray<dReal> (n);
 853     dReal *Dell = memarena->AllocateArray<dReal> (n);
 854     dReal *ell = memarena->AllocateArray<dReal> (n);
 855 #ifdef ROWPTRS
 856     dReal **Arows = memarena->AllocateArray<dReal *> (n);
 857 #else
 858     dReal **Arows = NULL;
 859 #endif
 860     unsigned *p = memarena->AllocateArray<unsigned> (n);
 861     unsigned *C = memarena->AllocateArray<unsigned> (n);
 862
 863     // for i in N, state[i] is 0 if x(i)==lo(i) or 1 if x(i)==hi(i)
 864     bool *state = memarena->AllocateArray<bool> (n);
 865
 866     // create LCP object. note that tmp is set to delta_w to save space, this
 867     // optimization relies on knowledge of how tmp is used, so be careful!
 868     dLCP lcp(n, nskip, nub, A, pairsbx, w, pairslh, L, d, Dell, ell, delta_w, state, findex, p, C, Arows);
 869     unsigned adj_nub = lcp.getNub();
 870
 871     // loop over all indexes adj_nub..n-1. for index i, if x(i),w(i) satisfy the
 872     // LCP conditions then i is added to the appropriate index set. otherwise
 873     // x(i),w(i) is driven either +ve or -ve to force it to the valid region.
 874     // as we drive x(i), x(C) is also adjusted to keep w(C) at zero.
 875     // while driving x(i) we maintain the LCP conditions on the other variables
 876     // 0..i-1. we do this by watching out for other x(i),w(i) values going
 877     // outside the valid region, and then switching them between index sets
 878     // when that happens.
 879
 880     bool hit_first_friction_index = false;
 881     for (unsigned i = adj_nub; i < n; ++i) {
 882         bool s_error = false;
 883         // the index i is the driving index and indexes i+1..n-1 are "dont care",
 884         // i.e. when we make changes to the system those x's will be zero and we
 885         // don't care what happens to those w's. in other words, we only consider
 886         // an (i+1)*(i+1) sub-problem of A*x=b+w.
 887
 888         // if we've hit the first friction index, we have to compute the lo and
 889         // hi values based on the values of x already computed. we have been
 890         // permuting the indexes, so the values stored in the findex vector are
 891         // no longer valid. thus we have to temporarily unpermute the x vector.
 892         // for the purposes of this computation, 0*infinity = 0 ... so if the
 893         // contact constraint's normal force is 0, there should be no tangential
 894         // force applied.
 895
 896         if (!hit_first_friction_index && findex && findex[i] >= 0) {
 897             // un-permute x into delta_w, which is not being used at the moment
 898             for (unsigned j = 0; j < n; ++j) delta_w[p[j]] = (pairsbx + (sizeint)j * PBX__MAX)[PBX_X];
 899
 900             // set lo and hi values
 901             for (unsigned k = i; k < n; ++k) {
 902                 dReal *currlh = pairslh + (sizeint)k * PLH__MAX;
 903                 dReal wfk = delta_w[findex[k]];
 904                 if (wfk == 0) {
 905                     currlh[PLH_HI] = 0;
 906                     currlh[PLH_LO] = 0;
 907                 }
 908                 else {
 909                     currlh[PLH_HI] = dFabs (currlh[PLH_HI] * wfk);
 910                     currlh[PLH_LO] = -currlh[PLH_HI];
 911                 }
 912             }
 913             hit_first_friction_index = true;
 914         }
 915
 916         // thus far we have not even been computing the w values for indexes
 917         // greater than i, so compute w[i] now.
 918         dReal wPrep = lcp.AiC_times_qC<PBX__MAX> (i, pairsbx + PBX_X) + lcp.AiN_times_qN<PBX__MAX> (i, pairsbx + PBX_X);
 919
 920         dReal *currbx = pairsbx + (sizeint)i * PBX__MAX;
 921
 922         w[i] = wPrep - currbx[PBX_B];
 923
 924         // if lo=hi=0 (which can happen for tangential friction when normals are
 925         // 0) then the index will be assigned to set N with some state. however,
 926         // set C's line has zero size, so the index will always remain in set N.
 927         // with the "normal" switching logic, if w changed sign then the index
 928         // would have to switch to set C and then back to set N with an inverted
 929         // state. this is pointless, and also computationally expensive. to
 930         // prevent this from happening, we use the rule that indexes with lo=hi=0
 931         // will never be checked for set changes. this means that the state for
 932         // these indexes may be incorrect, but that doesn't matter.
 933
 934         dReal *currlh = pairslh + (sizeint)i * PLH__MAX;
 935
 936         // see if x(i),w(i) is in a valid region
 937         if (currlh[PLH_LO] == 0 && w[i] >= 0) {
 938             lcp.transfer_i_to_N (i);
 939             state[i] = false;
 940         }
 941         else if (currlh[PLH_HI] == 0 && w[i] <= 0) {
 942             lcp.transfer_i_to_N (i);
 943             state[i] = true;
 944         }
 945         else if (w[i] == 0) {
 946             // this is a degenerate case. by the time we get to this test we know
 947             // that lo != 0, which means that lo < 0 as lo is not allowed to be +ve,
 948             // and similarly that hi > 0. this means that the line segment
 949             // corresponding to set C is at least finite in extent, and we are on it.
 950             // NOTE: we must call lcp.solve1() before lcp.transfer_i_to_C()
 951             lcp.solve1 (delta_x, i, false, 1);
 952
 953             lcp.transfer_i_to_C (i);
 954         }
 955         else {
 956             // we must push x(i) and w(i)
 957             for (;;) {
 958                 // find direction to push on x(i)
 959                 bool dir_positive = (w[i] <= 0);
 960
 961                 // compute: delta_x(C) = -dir*A(C,C)\A(C,i)
 962                 lcp.solve1 (delta_x, i, dir_positive);
 963
 964                 // note that delta_x[i] = (dir_positive ? 1 : -1), but we wont bother to set it
 965
 966                 // compute: delta_w = A*delta_x ... note we only care about
 967                 // delta_w(N) and delta_w(i), the rest is ignored
 968                 lcp.pN_equals_ANC_times_qC (delta_w, delta_x);
 969                 lcp.pN_plusequals_ANi (delta_w, i, dir_positive);
 970                 delta_w[i] = dir_positive
 971                     ? lcp.AiC_times_qC<1> (i, delta_x) + lcp.Aii(i)
 972                     : lcp.AiC_times_qC<1> (i, delta_x) - lcp.Aii(i);
 973
 974                 // find largest step we can take (size=s), either to drive x(i),w(i)
 975                 // to the valid LCP region or to drive an already-valid variable
 976                 // outside the valid region.
 977
 978                 int cmd = 1;            // index switching command
 979                 unsigned si = 0;                // si = index to switch if cmd>3
 980
 981                 dReal s = delta_w[i] != REAL(0.0)
 982                     ? -w[i] / delta_w[i]
 983                     : (w[i] != REAL(0.0) ? dCopySign(dInfinity, -w[i]) : REAL(0.0));
 984
 985                 if (dir_positive) {
 986                     if (currlh[PLH_HI] < dInfinity) {
 987                         dReal s2 = (currlh[PLH_HI] - currbx[PBX_X]);    // was (hi[i]-x[i])/dirf        // step to x(i)=hi(i)
 988                         if (s2 < s) {
 989                             s = s2;
 990                             cmd = 3;
 991                         }
 992                     }
 993                 }
 994                 else {
 995                     if (currlh[PLH_LO] > -dInfinity) {
 996                         dReal s2 = (currbx[PBX_X] - currlh[PLH_LO]); // was (lo[i]-x[i])/dirf   // step to x(i)=lo(i)
 997                         if (s2 < s) {
 998                             s = s2;
 999                             cmd = 2;
1000                         }
1001                     }
1002                 }
1003
1004                 {
1005                     const unsigned numN = lcp.numN();
1006                     for (unsigned k = 0; k < numN; ++k) {
1007                         const unsigned indexN_k = lcp.indexN(k);
1008                         if (!state[indexN_k] ? delta_w[indexN_k] < 0 : delta_w[indexN_k] > 0) {
1009                             // don't bother checking if lo=hi=0
1010                             dReal *indexlh = pairslh + (sizeint)indexN_k * PLH__MAX;
1011                             if (indexlh[PLH_LO] == 0 && indexlh[PLH_HI] == 0) continue;
1012                             dReal s2 = -w[indexN_k] / delta_w[indexN_k];
1013                             if (s2 < s) {
1014                                 s = s2;
1015                                 cmd = 4;
1016                                 si = indexN_k;
1017                             }
1018                         }
1019                     }
1020                 }
1021
1022                 {
1023                     const unsigned numC = lcp.numC();
1024                     for (unsigned k = adj_nub; k < numC; ++k) {
1025                         const unsigned indexC_k = lcp.indexC(k);
1026                         dReal *indexlh = pairslh + (sizeint)indexC_k * PLH__MAX;
1027                         if (delta_x[indexC_k] < 0 && indexlh[PLH_LO] > -dInfinity) {
1028                             dReal s2 = (indexlh[PLH_LO] - (pairsbx + (sizeint)indexC_k * PBX__MAX)[PBX_X]) / delta_x[indexC_k];
1029                             if (s2 < s) {
1030                                 s = s2;
1031                                 cmd = 5;
1032                                 si = indexC_k;
1033                             }
1034                         }
1035                         if (delta_x[indexC_k] > 0 && indexlh[PLH_HI] < dInfinity) {
1036                             dReal s2 = (indexlh[PLH_HI] - (pairsbx + (sizeint)indexC_k * PBX__MAX)[PBX_X]) / delta_x[indexC_k];
1037                             if (s2 < s) {
1038                                 s = s2;
1039                                 cmd = 6;
1040                                 si = indexC_k;
1041                             }
1042                         }
1043                     }
1044                 }
1045
1046                 //static char* cmdstring[8] = {0,"->C","->NL","->NH","N->C",
1047                 //                           "C->NL","C->NH"};
1048                 //printf ("cmd=%d (%s), si=%d\n",cmd,cmdstring[cmd],(cmd>3) ? si : i);
1049
1050                 // if s <= 0 then we've got a problem. if we just keep going then
1051                 // we're going to get stuck in an infinite loop. instead, just cross
1052                 // our fingers and exit with the current solution.
1053                 if (s <= REAL(0.0)) {
1054                     dMessage (d_ERR_LCP, "LCP internal error, s <= 0 (s=%.4e)",(double)s);
1055                     if (i < n) {
1056                         dxtSetZero<PBX__MAX>(currbx + PBX_X, n - i);
1057                         dxSetZero (w + i, n - i);
1058                     }
1059                     s_error = true;
1060                     break;
1061                 }
1062
1063                 // apply x = x + s * delta_x
1064                 lcp.pC_plusequals_s_times_qC<PBX__MAX> (pairsbx + PBX_X, s, delta_x);
1065                 currbx[PBX_X] = dir_positive
1066                     ? currbx[PBX_X] + s
1067                     : currbx[PBX_X] - s;
1068
1069                 // apply w = w + s * delta_w
1070                 lcp.pN_plusequals_s_times_qN (w, s, delta_w);
1071                 w[i] += s * delta_w[i];
1072
1073                 void *tmpbuf;
1074                 // switch indexes between sets if necessary
1075                 switch (cmd) {
1076                 case 1:         // done
1077                     w[i] = 0;
1078                     lcp.transfer_i_to_C (i);
1079                     break;
1080                 case 2:         // done
1081                     currbx[PBX_X] = currlh[PLH_LO];
1082                     state[i] = false;
1083                     lcp.transfer_i_to_N (i);
1084                     break;
1085                 case 3:         // done
1086                     currbx[PBX_X] = currlh[PLH_HI];
1087                     state[i] = true;
1088                     lcp.transfer_i_to_N (i);
1089                     break;
1090                 case 4:         // keep going
1091                     w[si] = 0;
1092                     lcp.transfer_i_from_N_to_C (si);
1093                     break;
1094                 case 5:         // keep going
1095                     (pairsbx + (sizeint)si * PBX__MAX)[PBX_X] = (pairslh + (sizeint)si * PLH__MAX)[PLH_LO];
1096                     state[si] = false;
1097                     tmpbuf = memarena->PeekBufferRemainder();
1098                     lcp.transfer_i_from_C_to_N (si, tmpbuf);
1099                     break;
1100                 case 6:         // keep going
1101                     (pairsbx + (sizeint)si * PBX__MAX)[PBX_X] = (pairslh + (sizeint)si * PLH__MAX)[PLH_HI];
1102                     state[si] = true;
1103                     tmpbuf = memarena->PeekBufferRemainder();
1104                     lcp.transfer_i_from_C_to_N (si, tmpbuf);
1105                     break;
1106                 }
1107
1108                 if (cmd <= 3) break;
1109             } // for (;;)
1110         } // else
1111
1112         if (s_error) {
1113             break;
1114         }
1115     } // for (unsigned i = adj_nub; i < n; ++i)
1116
1117     // now we have to un-permute x and w
1118     if (outer_w != NULL) {
1119         lcp.unpermute_W();
1120     }
1121     lcp.unpermute_X(); // This destroys p[] and must be done last
1122 }
1123
1124 sizeint dxEstimateSolveLCPMemoryReq(unsigned n, bool outer_w_avail)
1125 {
1126     const unsigned nskip = dPAD(n);
1127
1128     sizeint res = 0;
1129
1130     res += dOVERALIGNED_SIZE(sizeof(dReal) * ((sizeint)n * nskip), LMATRIX_ALIGNMENT); // for L
1131     res += 5 * dEFFICIENT_SIZE(sizeof(dReal) * n); // for d, delta_w, delta_x, Dell, ell
1132     if (!outer_w_avail) {
1133         res += dEFFICIENT_SIZE(sizeof(dReal) * n); // for w
1134     }
1135 #ifdef ROWPTRS
1136     res += dEFFICIENT_SIZE(sizeof(dReal *) * n); // for Arows
1137 #endif
1138     res += 2 * dEFFICIENT_SIZE(sizeof(unsigned) * n); // for p, C
1139     res += dEFFICIENT_SIZE(sizeof(bool) * n); // for state
1140
1141     // Use n instead of nC as nC varies at runtime while n is greater or equal to nC
1142     sizeint lcp_transfer_req = dLCP::estimate_transfer_i_from_C_to_N_mem_req(n, nskip);
1143     res += dEFFICIENT_SIZE(lcp_transfer_req); // for dLCP::transfer_i_from_C_to_N
1144
1145     return res;
1146 }
1147
1148
1149 //***************************************************************************
1150 // accuracy and timing test
1151
1152 static sizeint EstimateTestSolveLCPMemoryReq(unsigned n)
1153 {
1154     const unsigned nskip = dPAD(n);
1155
1156     sizeint res = 0;
1157
1158     res += 2 * dEFFICIENT_SIZE(sizeof(dReal) * ((sizeint)n * nskip)); // for A, A2
1159     res += 7 * dEFFICIENT_SIZE(sizeof(dReal) * n); // for x, b, w, lo, hi, tmp1, tmp2
1160     res += dEFFICIENT_SIZE(sizeof(dReal) * PBX__MAX * n); // for pairsbx,
1161     res += dEFFICIENT_SIZE(sizeof(dReal) * PLH__MAX * n); // for pairslh
1162
1163     res += dxEstimateSolveLCPMemoryReq(n, true);
1164
1165     return res;
1166 }
1167
1168 extern "C" ODE_API int dTestSolveLCP()
1169 {
1170     const unsigned n = 100;
1171
1172     sizeint memreq = EstimateTestSolveLCPMemoryReq(n);
1173     dxWorldProcessMemArena *arena = dxAllocateTemporaryWorldProcessMemArena(memreq, NULL, NULL);
1174     if (arena == NULL) {
1175         return 0;
1176     }
1177     arena->ResetState();
1178
1179     unsigned i,nskip = dPAD(n);
1180 #ifdef dDOUBLE
1181     const dReal tol = REAL(1e-9);
1182 #endif
1183 #ifdef dSINGLE
1184     const dReal tol = REAL(1e-4);
1185 #endif
1186     printf ("dTestSolveLCP()\n");
1187
1188     dReal *A = arena->AllocateArray<dReal> (n*nskip);
1189     dReal *x = arena->AllocateArray<dReal> (n);
1190     dReal *b = arena->AllocateArray<dReal> (n);
1191     dReal *w = arena->AllocateArray<dReal> (n);
1192     dReal *lo = arena->AllocateArray<dReal> (n);
1193     dReal *hi = arena->AllocateArray<dReal> (n);
1194
1195     dReal *A2 = arena->AllocateArray<dReal> (n*nskip);
1196     dReal *pairsbx = arena->AllocateArray<dReal> (n * PBX__MAX);
1197     dReal *pairslh = arena->AllocateArray<dReal> (n * PLH__MAX);
1198
1199     dReal *tmp1 = arena->AllocateArray<dReal> (n);
1200     dReal *tmp2 = arena->AllocateArray<dReal> (n);
1201
1202     double total_time = 0;
1203     for (unsigned count=0; count < 1000; count++) {
1204         BEGIN_STATE_SAVE(arena, saveInner) {
1205
1206             // form (A,b) = a random positive definite LCP problem
1207             dMakeRandomMatrix (A2,n,n,1.0);
1208             dMultiply2 (A,A2,A2,n,n,n);
1209             dMakeRandomMatrix (x,n,1,1.0);
1210             dMultiply0 (b,A,x,n,n,1);
1211             for (i=0; i<n; i++) b[i] += (dRandReal()*REAL(0.2))-REAL(0.1);
1212
1213             // choose `nub' in the range 0..n-1
1214             unsigned nub = 50; //dRandInt (n);
1215
1216             // make limits
1217             for (i=0; i<nub; i++) lo[i] = -dInfinity;
1218             for (i=0; i<nub; i++) hi[i] = dInfinity;
1219             //for (i=nub; i<n; i++) lo[i] = 0;
1220             //for (i=nub; i<n; i++) hi[i] = dInfinity;
1221             //for (i=nub; i<n; i++) lo[i] = -dInfinity;
1222             //for (i=nub; i<n; i++) hi[i] = 0;
1223             for (i=nub; i<n; i++) lo[i] = -(dRandReal()*REAL(1.0))-REAL(0.01);
1224             for (i=nub; i<n; i++) hi[i] =  (dRandReal()*REAL(1.0))+REAL(0.01);
1225
1226             // set a few limits to lo=hi=0
1227             /*
1228             for (i=0; i<10; i++) {
1229             unsigned j = dRandInt (n-nub) + nub;
1230             lo[j] = 0;
1231             hi[j] = 0;
1232             }
1233             */
1234
1235             // solve the LCP. we must make copy of A,b,lo,hi (A2,b2,lo2,hi2) for
1236             // SolveLCP() to permute. also, we'll clear the upper triangle of A2 to
1237             // ensure that it doesn't get referenced (if it does, the answer will be
1238             // wrong).
1239
1240             memcpy (A2, A, n * nskip * sizeof(dReal));
1241             dClearUpperTriangle (A2, n);
1242             for (i = 0; i != n; ++i) {
1243                 dReal *currbx = pairsbx + i * PBX__MAX;
1244                 currbx[PBX_B] = b[i];
1245                 currbx[PBX_X] = 0;
1246             }
1247             for (i = 0; i != n; ++i) {
1248                 dReal *currlh = pairslh + i * PLH__MAX;
1249                 currlh[PLH_LO] = lo[i];
1250                 currlh[PLH_HI] = hi[i];
1251             }
1252             dSetZero (w,n);
1253
1254             dStopwatch sw;
1255             dStopwatchReset (&sw);
1256             dStopwatchStart (&sw);
1257
1258             dxSolveLCP (arena,n,A2,pairsbx,w,nub,pairslh,0);
1259
1260             dStopwatchStop (&sw);
1261             double time = dStopwatchTime(&sw);
1262             total_time += time;
1263             double average = total_time / double(count+1) * 1000.0;
1264
1265             for (i = 0; i != n; ++i) {
1266                 const dReal *currbx = pairsbx + i * PBX__MAX;
1267                 x[i] = currbx[PBX_X];
1268             }
1269
1270             // check the solution
1271
1272             dMultiply0 (tmp1,A,x,n,n,1);
1273             for (i=0; i<n; i++) tmp2[i] = b[i] + w[i];
1274             dReal diff = dMaxDifference (tmp1,tmp2,n,1);
1275             // printf ("\tA*x = b+w, maximum difference = %.6e - %s (1)\n",diff,
1276             //      diff > tol ? "FAILED" : "passed");
1277             if (diff > tol) dDebug (0,"A*x = b+w, maximum difference = %.6e",diff);
1278             unsigned n1=0,n2=0,n3=0;
1279             for (i=0; i<n; i++) {
1280                 if (x[i]==lo[i] && w[i] >= 0) {
1281                     n1++;       // ok
1282                 }
1283                 else if (x[i]==hi[i] && w[i] <= 0) {
1284                     n2++;       // ok
1285                 }
1286                 else if (x[i] >= lo[i] && x[i] <= hi[i] && w[i] == 0) {
1287                     n3++;       // ok
1288                 }
1289                 else {
1290                     dDebug (0,"FAILED: i=%d x=%.4e w=%.4e lo=%.4e hi=%.4e",i,
1291                         x[i],w[i],lo[i],hi[i]);
1292                 }
1293             }
1294
1295             // pacifier
1296             printf ("passed: NL=%3d NH=%3d C=%3d   ",n1,n2,n3);
1297             printf ("time=%10.3f ms  avg=%10.4f\n",time * 1000.0,average);
1298
1299         } END_STATE_SAVE(arena, saveInner);
1300     }
1301
1302     dxFreeTemporaryWorldProcessMemArena(arena);
1303     return 1;
1304 }