ode/src/fastldltfactor_impl.h

   1
   2
   3 /*************************************************************************
   4  *                                                                       *
   5  * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
   6  * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
   7  *                                                                       *
   8  * This library is free software; you can redistribute it and/or         *
   9  * modify it under the terms of EITHER:                                  *
  10  *   (1) The GNU Lesser General Public License as published by the Free  *
  11  *       Software Foundation; either version 2.1 of the License, or (at  *
  12  *       your option) any later version. The text of the GNU Lesser      *
  13  *       General Public License is included with this library in the     *
  14  *       file LICENSE.TXT.                                               *
  15  *   (2) The BSD-style license that is included with this library in     *
  16  *       the file LICENSE-BSD.TXT.                                       *
  17  *                                                                       *
  18  * This library is distributed in the hope that it will be useful,       *
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
  21  * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
  22  *                                                                       *
  23  *************************************************************************/
  24
  25 /*
  26  * Code style improvements and optimizations by Oleh Derevenko ????-2019
  27  * LDLT cooperative factorization code of ThreadedEquationSolverLDLT copyright (c) 2017-2019 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
  28  */
  29
  30 #ifndef _ODE_FASTLDLT_IMPL_H_
  31 #define _ODE_FASTLDLT_IMPL_H_
  32
  33
  34 #include "error.h"
  35 #include "common.h"
  36
  37
  38 static void solveL1Stripe_2 (const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip);
  39 template<unsigned int d_stride>
  40 void scaleAndFactorizeL1Stripe_2(dReal *ARow, dReal *d, unsigned rowIndex, unsigned rowSkip);
  41 template<unsigned int d_stride>
  42 inline void scaleAndFactorizeL1FirstRowStripe_2(dReal *ARow, dReal *d, unsigned rowSkip);
  43
  44 static void solveStripeL1_1 (const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip);
  45 template<unsigned int d_stride>
  46 void scaleAndFactorizeL1Stripe_1(dReal *ARow, dReal *d, unsigned rowIndex);
  47 template<unsigned int d_stride>
  48 inline void scaleAndFactorizeL1FirstRowStripe_1(dReal *ARow, dReal *d);
  49
  50
  51 template<unsigned int d_stride>
  52 void factorMatrixAsLDLT(dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip)
  53 {
  54     if (rowCount < 1) return;
  55
  56     dReal *ARow = A;
  57     unsigned blockStartRow = 0;
  58
  59     const unsigned blockStep = 2;
  60     const unsigned lastRowIndex = rowCount >= blockStep ? rowCount - blockStep + 1 : 0;
  61
  62     /* compute blocks of 2 rows */
  63     bool subsequentPass = false;
  64     for (; blockStartRow < lastRowIndex; subsequentPass = true, ARow += blockStep * rowSkip, blockStartRow += blockStep)
  65     {
  66         if (subsequentPass)
  67         {
  68             /* solve L*(D*l)=a, l is scaled elements in 2 x i block at A(i,0) */
  69             solveL1Stripe_2(A, ARow, blockStartRow, rowSkip);
  70             scaleAndFactorizeL1Stripe_2<d_stride>(ARow, d, blockStartRow, rowSkip);
  71         }
  72         else
  73         {
  74             scaleAndFactorizeL1FirstRowStripe_2<d_stride>(ARow, d, rowSkip);
  75         }
  76         dSASSERT(blockStep == 2);
  77         /* done factorizing 2 x 2 block */
  78     }
  79
  80     /* compute the (less than 2) rows at the bottom */
  81     if (!subsequentPass || blockStartRow == lastRowIndex)
  82     {
  83         dSASSERT(blockStep == 2); // for the blockStartRow == lastRowIndex comparison above
  84
  85         if (subsequentPass)
  86         {
  87             solveStripeL1_1(A, ARow, blockStartRow, rowSkip);
  88             scaleAndFactorizeL1Stripe_1<d_stride>(ARow, d, blockStartRow);
  89         }
  90         else
  91         {
  92             scaleAndFactorizeL1FirstRowStripe_1<d_stride>(ARow, d);
  93         }
  94         dSASSERT(blockStep == 2);
  95         /* done factorizing 1 x 1 block */
  96     }
  97 }
  98
  99 /* solve L*X=B, with B containing 2 right hand sides.
 100  * L is an n*n lower triangular matrix with ones on the diagonal.
 101  * L is stored by rows and its leading dimension is rowSkip.
 102  * B is an n*2 matrix that contains the right hand sides.
 103  * B is stored by columns and its leading dimension is also rowSkip.
 104  * B is overwritten with X.
 105  * this processes blocks of 2*2.
 106  * if this is in the factorizer source file, n must be a multiple of 2.
 107  */
 108 static
 109 void solveL1Stripe_2(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
 110 {
 111     dIASSERT(rowCount != 0);
 112     dIASSERT(rowCount % 2 == 0);
 113
 114     /* compute all 2 x 2 blocks of X */
 115     unsigned blockStartRow = 0;
 116     for (bool exitLoop = false, subsequentPass = false; !exitLoop; subsequentPass = true, exitLoop = (blockStartRow += 2) == rowCount)
 117     {
 118         const dReal *ptrLElement;
 119         dReal *ptrBElement;
 120
 121         /* declare variables - Z matrix */
 122         dReal Z11, Z12, Z21, Z22;
 123
 124         /* compute all 2 x 2 block of X, from rows i..i+2-1 */
 125         if (subsequentPass)
 126         {
 127             ptrLElement = L + blockStartRow * rowSkip;
 128             ptrBElement = B;
 129
 130             /* set Z matrix to 0 */
 131             Z11 = 0; Z12 = 0; Z21 = 0; Z22 = 0;
 132
 133             /* the inner loop that computes outer products and adds them to Z */
 134             // The iteration starts with even number and decreases it by 2. So, it must end in zero
 135             for (unsigned columnCounter = blockStartRow; ;)
 136             {
 137                 /* declare p and q vectors, etc */
 138                 dReal p1, q1, p2, q2;
 139
 140                 /* compute outer product and add it to the Z matrix */
 141                 p1 = ptrLElement[0];
 142                 q1 = ptrBElement[0];
 143                 Z11 += p1 * q1;
 144                 q2 = ptrBElement[rowSkip];
 145                 Z12 += p1 * q2;
 146                 p2 = ptrLElement[rowSkip];
 147                 Z21 += p2 * q1;
 148                 Z22 += p2 * q2;
 149
 150                 /* compute outer product and add it to the Z matrix */
 151                 p1 = ptrLElement[1];
 152                 q1 = ptrBElement[1];
 153                 Z11 += p1 * q1;
 154                 q2 = ptrBElement[1 + rowSkip];
 155                 Z12 += p1 * q2;
 156                 p2 = ptrLElement[1 + rowSkip];
 157                 Z21 += p2 * q1;
 158                 Z22 += p2 * q2;
 159
 160                 if (columnCounter > 6)
 161                 {
 162                     columnCounter -= 6;
 163
 164                     /* advance pointers */
 165                     ptrLElement += 6;
 166                     ptrBElement += 6;
 167
 168                     /* compute outer product and add it to the Z matrix */
 169                     p1 = ptrLElement[-4];
 170                     q1 = ptrBElement[-4];
 171                     Z11 += p1 * q1;
 172                     q2 = ptrBElement[-4 + rowSkip];
 173                     Z12 += p1 * q2;
 174                     p2 = ptrLElement[-4 + rowSkip];
 175                     Z21 += p2 * q1;
 176                     Z22 += p2 * q2;
 177
 178                     /* compute outer product and add it to the Z matrix */
 179                     p1 = ptrLElement[-3];
 180                     q1 = ptrBElement[-3];
 181                     Z11 += p1 * q1;
 182                     q2 = ptrBElement[-3 + rowSkip];
 183                     Z12 += p1 * q2;
 184                     p2 = ptrLElement[-3 + rowSkip];
 185                     Z21 += p2 * q1;
 186                     Z22 += p2 * q2;
 187
 188                     /* compute outer product and add it to the Z matrix */
 189                     p1 = ptrLElement[-2];
 190                     q1 = ptrBElement[-2];
 191                     Z11 += p1 * q1;
 192                     q2 = ptrBElement[-2 + rowSkip];
 193                     Z12 += p1 * q2;
 194                     p2 = ptrLElement[-2 + rowSkip];
 195                     Z21 += p2 * q1;
 196                     Z22 += p2 * q2;
 197
 198                     /* compute outer product and add it to the Z matrix */
 199                     p1 = ptrLElement[-1];
 200                     q1 = ptrBElement[-1];
 201                     Z11 += p1 * q1;
 202                     q2 = ptrBElement[-1 + rowSkip];
 203                     Z12 += p1 * q2;
 204                     p2 = ptrLElement[-1 + rowSkip];
 205                     Z21 += p2 * q1;
 206                     Z22 += p2 * q2;
 207                 }
 208                 else
 209                 {
 210                     /* advance pointers */
 211                     ptrLElement += 2;
 212                     ptrBElement += 2;
 213
 214                     if ((columnCounter -= 2) == 0)
 215                     {
 216                         break;
 217                     }
 218                 }
 219                 /* end of inner loop */
 220             }
 221         }
 222         else
 223         {
 224             ptrLElement = L/* + blockStartRow * rowSkip*/; dIASSERT(blockStartRow == 0);
 225             ptrBElement = B;
 226
 227             /* set Z matrix to 0 */
 228             Z11 = 0; Z12 = 0; Z21 = 0; Z22 = 0;
 229         }
 230
 231         /* finish computing the X(i) block */
 232
 233         dReal Y11 = ptrBElement[0] - Z11;
 234         dReal Y12 = ptrBElement[rowSkip] - Z12;
 235
 236         dReal p2 = ptrLElement[rowSkip];
 237
 238         ptrBElement[0] = Y11;
 239         ptrBElement[rowSkip] = Y12;
 240
 241         dReal Y21 = ptrBElement[1] - Z21 - p2 * Y11;
 242         dReal Y22 = ptrBElement[1 + rowSkip] - Z22 - p2 * Y12;
 243
 244         ptrBElement[1] = Y21;
 245         ptrBElement[1 + rowSkip] = Y22;
 246         /* end of outer loop */
 247     }
 248 }
 249
 250 template<unsigned int d_stride>
 251 void scaleAndFactorizeL1Stripe_2(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip)
 252 {
 253     dIASSERT(factorizationRow != 0);
 254     dIASSERT(factorizationRow % 2 == 0);
 255
 256     dReal *ptrAElement = ARow;
 257     dReal *ptrDElement = d;
 258
 259     /* scale the elements in a 2 x i block at A(i,0), and also */
 260     /* compute Z = the outer product matrix that we'll need. */
 261     dReal Z11 = 0, Z21 = 0, Z22 = 0;
 262
 263     for (unsigned columnCounter = factorizationRow; ; )
 264     {
 265         dReal p1, q1, p2, q2, dd;
 266
 267         p1 = ptrAElement[0];
 268         p2 = ptrAElement[rowSkip];
 269         dd = ptrDElement[0 * d_stride];
 270         q1 = p1 * dd;
 271         q2 = p2 * dd;
 272         ptrAElement[0] = q1;
 273         ptrAElement[rowSkip] = q2;
 274         Z11 += p1 * q1;
 275         Z21 += p2 * q1;
 276         Z22 += p2 * q2;
 277
 278         p1 = ptrAElement[1];
 279         p2 = ptrAElement[1 + rowSkip];
 280         dd = ptrDElement[1 * d_stride];
 281         q1 = p1 * dd;
 282         q2 = p2 * dd;
 283         ptrAElement[1] = q1;
 284         ptrAElement[1 + rowSkip] = q2;
 285         Z11 += p1 * q1;
 286         Z21 += p2 * q1;
 287         Z22 += p2 * q2;
 288
 289         if (columnCounter > 6)
 290         {
 291             columnCounter -= 6;
 292
 293             ptrAElement += 6;
 294             ptrDElement += 6 * d_stride;
 295
 296             p1 = ptrAElement[-4];
 297             p2 = ptrAElement[-4 + rowSkip];
 298             dd = ptrDElement[-4 * (int)d_stride];
 299             q1 = p1 * dd;
 300             q2 = p2 * dd;
 301             ptrAElement[-4] = q1;
 302             ptrAElement[-4 + rowSkip] = q2;
 303             Z11 += p1 * q1;
 304             Z21 += p2 * q1;
 305             Z22 += p2 * q2;
 306
 307             p1 = ptrAElement[-3];
 308             p2 = ptrAElement[-3 + rowSkip];
 309             dd = ptrDElement[-3 * (int)d_stride];
 310             q1 = p1 * dd;
 311             q2 = p2 * dd;
 312             ptrAElement[-3] = q1;
 313             ptrAElement[-3 + rowSkip] = q2;
 314             Z11 += p1 * q1;
 315             Z21 += p2 * q1;
 316             Z22 += p2 * q2;
 317
 318             p1 = ptrAElement[-2];
 319             p2 = ptrAElement[-2 + rowSkip];
 320             dd = ptrDElement[-2 * (int)d_stride];
 321             q1 = p1 * dd;
 322             q2 = p2 * dd;
 323             ptrAElement[-2] = q1;
 324             ptrAElement[-2 + rowSkip] = q2;
 325             Z11 += p1 * q1;
 326             Z21 += p2 * q1;
 327             Z22 += p2 * q2;
 328
 329             p1 = ptrAElement[-1];
 330             p2 = ptrAElement[-1 + rowSkip];
 331             dd = ptrDElement[-1 * (int)d_stride];
 332             q1 = p1 * dd;
 333             q2 = p2 * dd;
 334             ptrAElement[-1] = q1;
 335             ptrAElement[-1 + rowSkip] = q2;
 336             Z11 += p1 * q1;
 337             Z21 += p2 * q1;
 338             Z22 += p2 * q2;
 339         }
 340         else
 341         {
 342             ptrAElement += 2;
 343             ptrDElement += 2 * d_stride;
 344
 345             if ((columnCounter -= 2) == 0)
 346             {
 347                 break;
 348             }
 349         }
 350     }
 351
 352     /* solve for diagonal 2 x 2 block at A(i,i) */
 353     dReal Y11 = ptrAElement[0] - Z11;
 354     dReal Y21 = ptrAElement[rowSkip] - Z21;
 355     dReal Y22 = ptrAElement[1 + rowSkip] - Z22;
 356
 357     /* factorize 2 x 2 block Y, ptrDElement */
 358     /* factorize row 1 */
 359     dReal dd = dRecip(Y11);
 360
 361     ptrDElement[0 * d_stride] = dd;
 362     dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
 363
 364     /* factorize row 2 */
 365     dReal q2 = Y21 * dd;
 366     ptrAElement[rowSkip] = q2;
 367
 368     dReal sum = Y21 * q2;
 369     ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
 370 }
 371
 372 template<unsigned int d_stride>
 373 void scaleAndFactorizeL1FirstRowStripe_2(dReal *ARow, dReal *d, unsigned rowSkip)
 374 {
 375     dReal *ptrAElement = ARow;
 376     dReal *ptrDElement = d;
 377
 378     /* solve for diagonal 2 x 2 block at A(0,0) */
 379     dReal Y11 = ptrAElement[0]/* - Z11*/;
 380     dReal Y21 = ptrAElement[rowSkip]/* - Z21*/;
 381     dReal Y22 = ptrAElement[1 + rowSkip]/* - Z22*/;
 382
 383     /* factorize 2 x 2 block Y, ptrDElement */
 384     /* factorize row 1 */
 385     dReal dd = dRecip(Y11);
 386
 387     ptrDElement[0 * d_stride] = dd;
 388     dIASSERT(ptrDElement == d/* + (sizeint)factorizationRow * d_stride*/);
 389
 390     /* factorize row 2 */
 391     dReal q2 = Y21 * dd;
 392     ptrAElement[rowSkip] = q2;
 393
 394     dReal sum = Y21 * q2;
 395     ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
 396 }
 397
 398
 399 /* solve L*X=B, with B containing 1 right hand sides.
 400  * L is an n*n lower triangular matrix with ones on the diagonal.
 401  * L is stored by rows and its leading dimension is lskip.
 402  * B is an n*1 matrix that contains the right hand sides.
 403  * B is stored by columns and its leading dimension is also lskip.
 404  * B is overwritten with X.
 405  * this processes blocks of 2*2.
 406  * if this is in the factorizer source file, n must be a multiple of 2.
 407  */
 408 static
 409 void solveStripeL1_1(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
 410 {
 411     dIASSERT(rowCount != 0);
 412     dIASSERT(rowCount % 2 == 0);
 413
 414     /* compute all 2 x 1 blocks of X */
 415     unsigned blockStartRow = 0;
 416     for (bool exitLoop = false, subsequentPass = false; !exitLoop; subsequentPass = true, exitLoop = (blockStartRow += 2) == rowCount)
 417     {
 418         const dReal *ptrLElement;
 419         dReal *ptrBElement;
 420
 421         /* declare variables - Z matrix */
 422         dReal Z11, Z21;
 423
 424         if (subsequentPass)
 425         {
 426             ptrLElement = L + (sizeint)blockStartRow * rowSkip;
 427             ptrBElement = B;
 428
 429             /* set the Z matrix to 0 */
 430             Z11 = 0; Z21 = 0;
 431
 432             /* compute all 2 x 1 block of X, from rows i..i+2-1 */
 433
 434             /* the inner loop that computes outer products and adds them to Z */
 435             // The iteration starts with even number and decreases it by 2. So, it must end in zero
 436             for (unsigned columnCounter = blockStartRow; ; )
 437             {
 438                 /* declare p and q vectors, etc */
 439                 dReal p1, q1, p2;
 440
 441                 /* compute outer product and add it to the Z matrix */
 442                 p1 = ptrLElement[0];
 443                 q1 = ptrBElement[0];
 444                 Z11 += p1 * q1;
 445                 p2 = ptrLElement[rowSkip];
 446                 Z21 += p2 * q1;
 447
 448                 /* compute outer product and add it to the Z matrix */
 449                 p1 = ptrLElement[1];
 450                 q1 = ptrBElement[1];
 451                 Z11 += p1 * q1;
 452                 p2 = ptrLElement[1 + rowSkip];
 453                 Z21 += p2 * q1;
 454
 455                 if (columnCounter > 6)
 456                 {
 457                     columnCounter -= 6;
 458
 459                     /* advance pointers */
 460                     ptrLElement += 6;
 461                     ptrBElement += 6;
 462
 463                     /* compute outer product and add it to the Z matrix */
 464                     p1 = ptrLElement[-4];
 465                     q1 = ptrBElement[-4];
 466                     Z11 += p1 * q1;
 467                     p2 = ptrLElement[-4 + rowSkip];
 468                     Z21 += p2 * q1;
 469
 470                     /* compute outer product and add it to the Z matrix */
 471                     p1 = ptrLElement[-3];
 472                     q1 = ptrBElement[-3];
 473                     Z11 += p1 * q1;
 474                     p2 = ptrLElement[-3 + rowSkip];
 475                     Z21 += p2 * q1;
 476
 477                     /* compute outer product and add it to the Z matrix */
 478                     p1 = ptrLElement[-2];
 479                     q1 = ptrBElement[-2];
 480                     Z11 += p1 * q1;
 481                     p2 = ptrLElement[-2 + rowSkip];
 482                     Z21 += p2 * q1;
 483
 484                     /* compute outer product and add it to the Z matrix */
 485                     p1 = ptrLElement[-1];
 486                     q1 = ptrBElement[-1];
 487                     Z11 += p1 * q1;
 488                     p2 = ptrLElement[-1 + rowSkip];
 489                     Z21 += p2 * q1;
 490                 }
 491                 else
 492                 {
 493                     /* advance pointers */
 494                     ptrLElement += 2;
 495                     ptrBElement += 2;
 496
 497                     if ((columnCounter -= 2) == 0)
 498                     {
 499                         break;
 500                     }
 501                 }
 502                 /* end of inner loop */
 503             }
 504         }
 505         else
 506         {
 507             ptrLElement = L/* + (sizeint)blockStartRow * rowSkip*/; dIASSERT(blockStartRow == 0);
 508             ptrBElement = B;
 509
 510             /* set the Z matrix to 0 */
 511             Z11 = 0; Z21 = 0;
 512         }
 513
 514         /* finish computing the X(i) block */
 515         dReal p2 = ptrLElement[rowSkip];
 516
 517         dReal Y11 = ptrBElement[0] - Z11;
 518         dReal Y21 = ptrBElement[1] - Z21 - p2 * Y11;
 519
 520         ptrBElement[0] = Y11;
 521         ptrBElement[1] = Y21;
 522         /* end of outer loop */
 523     }
 524 }
 525
 526 template<unsigned int d_stride>
 527 void scaleAndFactorizeL1Stripe_1(dReal *ARow, dReal *d, unsigned factorizationRow)
 528 {
 529     dReal *ptrAElement = ARow;
 530     dReal *ptrDElement = d;
 531
 532     /* scale the elements in a 1 x i block at A(i,0), and also */
 533     /* compute Z = the outer product matrix that we'll need. */
 534     dReal Z11 = 0, Z22 = 0;
 535
 536     for (unsigned columnCounter = factorizationRow; ; )
 537     {
 538         dReal p1, p2, q1, q2, dd1, dd2;
 539
 540         p1 = ptrAElement[0];
 541         p2 = ptrAElement[1];
 542         dd1 = ptrDElement[0 * d_stride];
 543         dd2 = ptrDElement[1 * d_stride];
 544         q1 = p1 * dd1;
 545         q2 = p2 * dd2;
 546         ptrAElement[0] = q1;
 547         ptrAElement[1] = q2;
 548         Z11 += p1 * q1;
 549         Z22 += p2 * q2;
 550
 551         if (columnCounter > 6)
 552         {
 553             columnCounter -= 6;
 554
 555             ptrAElement += 6;
 556             ptrDElement += 6 * d_stride;
 557
 558             p1 = ptrAElement[-4];
 559             p2 = ptrAElement[-3];
 560             dd1 = ptrDElement[-4 * (int)d_stride];
 561             dd2 = ptrDElement[-3 * (int)d_stride];
 562             q1 = p1 * dd1;
 563             q2 = p2 * dd2;
 564             ptrAElement[-4] = q1;
 565             ptrAElement[-3] = q2;
 566             Z11 += p1 * q1;
 567             Z22 += p2 * q2;
 568
 569             p1 = ptrAElement[-2];
 570             p2 = ptrAElement[-1];
 571             dd1 = ptrDElement[-2 * (int)d_stride];
 572             dd2 = ptrDElement[-1 * (int)d_stride];
 573             q1 = p1 * dd1;
 574             q2 = p2 * dd2;
 575             ptrAElement[-2] = q1;
 576             ptrAElement[-1] = q2;
 577             Z11 += p1 * q1;
 578             Z22 += p2 * q2;
 579         }
 580         else
 581         {
 582             ptrAElement += 2;
 583             ptrDElement += 2 * d_stride;
 584
 585             if ((columnCounter -= 2) == 0)
 586             {
 587                 break;
 588             }
 589         }
 590     }
 591
 592     dReal Y11 = ptrAElement[0] - (Z11 + Z22);
 593
 594     /* solve for diagonal 1 x 1 block at A(i,i) */
 595     dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
 596     /* factorize 1 x 1 block Y, ptrDElement */
 597     /* factorize row 1 */
 598     ptrDElement[0 * d_stride] = dRecip(Y11);
 599 }
 600
 601 template<unsigned int d_stride>
 602 void scaleAndFactorizeL1FirstRowStripe_1(dReal *ARow, dReal *d)
 603 {
 604     dReal *ptrAElement = ARow;
 605     dReal *ptrDElement = d;
 606
 607     dReal Y11 = ptrAElement[0];
 608
 609     /* solve for diagonal 1 x 1 block at A(0,0) */
 610     /* factorize 1 x 1 block Y, ptrDElement */
 611     /* factorize row 1 */
 612     ptrDElement[0 * d_stride] = dRecip(Y11);
 613 }
 614
 615
 616
 617
 618 template<unsigned int block_step, unsigned int b_rows>
 619 /*static */
 620 void ThreadedEquationSolverLDLT::participateSolvingL1Stripe_X(const dReal *L, dReal *B, unsigned blockCount, unsigned rowSkip,
 621     volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
 622     FactorizationSolveL1StripeCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex)
 623 {
 624     const unsigned lookaheadRange = 64;
 625     BlockProcessingState blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
 626
 627     unsigned completedBlocks = refBlockCompletionProgress;
 628     unsigned currentBlock = completedBlocks;
 629     dIASSERT(completedBlocks <= blockCount);
 630
 631     for (bool exitLoop = completedBlocks == blockCount; !exitLoop; exitLoop = false)
 632     {
 633         bool goForLockedBlockPrimaryCalculation = false, goForLockedBlockDuplicateCalculation = false;
 634         bool goAssigningTheResult = false, stayWithinTheBlock = false;
 635
 636         dReal Z[block_step][b_rows];
 637         dReal Y[block_step][b_rows];
 638
 639         dReal *ptrBElement;
 640
 641         CellContextInstance previousContextInstance;
 642         unsigned completedColumnBlock;
 643
 644         for (cellindexint testDescriptor = blockProgressDescriptors[currentBlock]; ; )
 645         {
 646             if (testDescriptor == INVALID_CELLDESCRIPTOR)
 647             {
 648                 // Invalid descriptor is the indication that the row has been fully calculated
 649                 // Test if this was the last row and break out if so.
 650                 if (currentBlock + 1 == blockCount)
 651                 {
 652                     exitLoop = true;
 653                     break;
 654                 }
 655
 656                 // Treat detected row advancement as a row processed
 657                 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
 658                 break;
 659             }
 660
 661             CooperativeAtomics::AtomicReadReorderBarrier();
 662             // It is necessary to read up to date completedBblocks value after the descriptor retrieval
 663             // as otherwise the logic below breaks
 664             completedBlocks = refBlockCompletionProgress;
 665
 666             if (!GET_CELLDESCRIPTOR_ISLOCKED(testDescriptor))
 667             {
 668                 completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
 669                 dIASSERT(completedColumnBlock < currentBlock || (completedColumnBlock == currentBlock && currentBlock == 0)); // Otherwise, why would the calculation have had stopped if the final column is reachable???
 670                 dIASSERT(completedColumnBlock <= completedBlocks); // Since the descriptor is not locked
 671
 672                 if (completedColumnBlock == completedBlocks && currentBlock != completedBlocks)
 673                 {
 674                     dIASSERT(completedBlocks < currentBlock);
 675                     break;
 676                 }
 677
 678                 if (CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], testDescriptor, MARK_CELLDESCRIPTOR_LOCKED(testDescriptor)))
 679                 {
 680                     if (completedColumnBlock != 0)
 681                     {
 682                         CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
 683                         previousContextInstance = contextInstance;
 684
 685                         const FactorizationSolveL1StripeCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
 686                         sourceContext.loadPrecalculatedZs(Z);
 687                     }
 688                     else
 689                     {
 690                         previousContextInstance = CCI__MIN;
 691                         FactorizationSolveL1StripeCellContext::initializePrecalculatedZs(Z);
 692                     }
 693
 694                     goForLockedBlockPrimaryCalculation = true;
 695                     break;
 696                 }
 697
 698                 if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
 699                 {
 700                     break;
 701                 }
 702
 703                 testDescriptor = blockProgressDescriptors[currentBlock];
 704             }
 705             else
 706             {
 707                 if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
 708                 {
 709                     break;
 710                 }
 711
 712                 cellindexint verificativeDescriptor;
 713                 bool verificationFailure = false;
 714
 715                 completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
 716                 dIASSERT(completedColumnBlock != currentBlock || currentBlock == 0); // There is no reason for computations to stop at the very end other than being the initial value at the very first block
 717
 718                 if (completedColumnBlock != 0)
 719                 {
 720                     CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
 721                     const FactorizationSolveL1StripeCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
 722                     sourceContext.loadPrecalculatedZs(Z);
 723                 }
 724                 else
 725                 {
 726                     FactorizationSolveL1StripeCellContext::initializePrecalculatedZs(Z);
 727                 }
 728
 729                 if (completedColumnBlock != 0 && completedColumnBlock <= currentBlock)
 730                 {
 731                     // Make sure the descriptor is re-read after the precalculates
 732                     CooperativeAtomics::AtomicReadReorderBarrier();
 733                 }
 734
 735                 if (completedColumnBlock <= currentBlock)
 736                 {
 737                     verificativeDescriptor = blockProgressDescriptors[currentBlock];
 738                     verificationFailure = verificativeDescriptor != testDescriptor;
 739                 }
 740
 741                 if (!verificationFailure)
 742                 {
 743                     dIASSERT(completedColumnBlock <= currentBlock + 1);
 744
 745                     goForLockedBlockDuplicateCalculation = true;
 746                     break;
 747                 }
 748
 749                 testDescriptor = verificativeDescriptor;
 750             }
 751         }
 752
 753         if (exitLoop)
 754         {
 755             break;
 756         }
 757
 758         if (goForLockedBlockPrimaryCalculation)
 759         {
 760             blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
 761
 762             // Declare and assign the variables at the top to not interfere with any branching -- the compiler is going to eliminate them anyway.
 763             bool handleComputationTakenOver = false, rowEndReached = false;
 764
 765             const dReal *ptrLElement;
 766             unsigned finalColumnBlock;
 767
 768             if (currentBlock != 0)
 769             {
 770                 /* compute all 2 x 2 block of X, from rows i..i+2-1 */
 771                 ptrLElement = L + (currentBlock * rowSkip + completedColumnBlock) * block_step;
 772                 ptrBElement = B + completedColumnBlock * block_step;
 773
 774                 /* the inner loop that computes outer products and adds them to Z */
 775                 finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
 776                 dIASSERT(completedColumnBlock != finalColumnBlock/* || currentBlock == 0*/);
 777
 778                 // The iteration starts with even number and decreases it by 2. So, it must end in zero
 779                 for (unsigned columnCounter = finalColumnBlock - completedColumnBlock; ; )
 780                 {
 781                     /* declare p and q vectors, etc */
 782                     dReal p[block_step], q[b_rows];
 783
 784                     /* compute outer product and add it to the Z matrix */
 785                     p[0] = ptrLElement[0];
 786                     q[0] = ptrBElement[0];
 787                     Z[0][0] += p[0] * q[0];
 788                     if (b_rows >= 2)
 789                     {
 790                         q[1] = ptrBElement[rowSkip];
 791                         Z[0][1] += p[0] * q[1];
 792                     }
 793                     p[1] = ptrLElement[rowSkip];
 794                     Z[1][0] += p[1] * q[0];
 795                     if (b_rows >= 2)
 796                     {
 797                         Z[1][1] += p[1] * q[1];
 798                     }
 799
 800                     /* compute outer product and add it to the Z matrix */
 801                     p[0] = ptrLElement[1];
 802                     q[0] = ptrBElement[1];
 803                     Z[0][0] += p[0] * q[0];
 804                     if (b_rows >= 2)
 805                     {
 806                         q[1] = ptrBElement[1 + rowSkip];
 807                         Z[0][1] += p[0] * q[1];
 808                     }
 809                     p[1] = ptrLElement[1 + rowSkip];
 810                     Z[1][0] += p[1] * q[0];
 811                     if (b_rows >= 2)
 812                     {
 813                         Z[1][1] += p[1] * q[1];
 814                     }
 815
 816                     dSASSERT(block_step == 2);
 817                     dSASSERT(b_rows >= 1 && b_rows <= 2);
 818
 819                     if (columnCounter > 2)
 820                     {
 821                         /* compute outer product and add it to the Z matrix */
 822                         p[0] = ptrLElement[2];
 823                         q[0] = ptrBElement[2];
 824                         Z[0][0] += p[0] * q[0];
 825                         if (b_rows >= 2)
 826                         {
 827                             q[1] = ptrBElement[2 + rowSkip];
 828                             Z[0][1] += p[0] * q[1];
 829                         }
 830                         p[1] = ptrLElement[2 + rowSkip];
 831                         Z[1][0] += p[1] * q[0];
 832                         if (b_rows >= 2)
 833                         {
 834                             Z[1][1] += p[1] * q[1];
 835                         }
 836
 837                         /* compute outer product and add it to the Z matrix */
 838                         p[0] = ptrLElement[3];
 839                         q[0] = ptrBElement[3];
 840                         Z[0][0] += p[0] * q[0];
 841                         if (b_rows >= 2)
 842                         {
 843                             q[1] = ptrBElement[3 + rowSkip];
 844                             Z[0][1] += p[0] * q[1];
 845                         }
 846                         p[1] = ptrLElement[3 + rowSkip];
 847                         Z[1][0] += p[1] * q[0];
 848                         if (b_rows >= 2)
 849                         {
 850                             Z[1][1] += p[1] * q[1];
 851                         }
 852
 853                         dSASSERT(block_step == 2);
 854                         dSASSERT(b_rows >= 1 && b_rows <= 2);
 855
 856                         /* advance pointers */
 857                         ptrLElement += 2 * block_step;
 858                         ptrBElement += 2 * block_step;
 859                         columnCounter -= 2;
 860                     }
 861                     else
 862                     {
 863                         /* advance pointers */
 864                         ptrLElement += block_step;
 865                         ptrBElement += block_step;
 866                         /* end of inner loop */
 867
 868                         if (--columnCounter == 0)
 869                         {
 870                             if (finalColumnBlock == currentBlock)
 871                             {
 872                                 rowEndReached = true;
 873                                 break;
 874                             }
 875
 876                             // Take a look if any more rows have been completed...
 877                             completedBlocks = refBlockCompletionProgress;
 878                             dIASSERT(completedBlocks >= finalColumnBlock);
 879
 880                             if (completedBlocks == finalColumnBlock)
 881                             {
 882                                 break;
 883                             }
 884
 885                             // ...continue if so.
 886                             unsigned columnCompletedSoFar = finalColumnBlock;
 887                             finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
 888                             columnCounter = finalColumnBlock - columnCompletedSoFar;
 889                         }
 890                     }
 891                 }
 892             }
 893             else
 894             {
 895                 ptrLElement = L/* + (currentBlock * rowSkip + completedColumnBlock) * block_step*/;
 896                 ptrBElement = B/* + completedColumnBlock * block_step*/;
 897
 898                 rowEndReached = true;
 899             }
 900
 901             if (rowEndReached)
 902             {
 903                 // Check whether there is still a need to proceed or if the computation has been taken over by another thread
 904                 cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
 905
 906                 if (blockProgressDescriptors[currentBlock] == oldDescriptor)
 907                 {
 908                     /* finish computing the X(i) block */
 909                     Y[0][0] = ptrBElement[0] - Z[0][0];
 910                     if (b_rows >= 2)
 911                     {
 912                         Y[0][1] = ptrBElement[rowSkip] - Z[0][1];
 913                     }
 914
 915                     dReal p2 = ptrLElement[rowSkip];
 916
 917                     Y[1][0] = ptrBElement[1] - Z[1][0] - p2 * Y[0][0];
 918                     if (b_rows >= 2)
 919                     {
 920                         Y[1][1] = ptrBElement[1 + rowSkip] - Z[1][1] - p2 * Y[0][1];
 921                     }
 922
 923                     dSASSERT(block_step == 2);
 924                     dSASSERT(b_rows >= 1 && b_rows <= 2);
 925
 926                     // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
 927                     CooperativeAtomics::AtomicReadReorderBarrier();
 928
 929                     // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
 930                     // and the computation result is valid.
 931                     if (blockProgressDescriptors[currentBlock] == oldDescriptor)
 932                     {
 933                         // Assign the results to the result context (possibly in parallel with other threads
 934                         // that could and ought to be assigning exactly the same values)
 935                         FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
 936                         resultContext.storePrecalculatedZs(Y);
 937
 938                         // Assign the result assignment progress descriptor
 939                         cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true);
 940                         CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor); // the result is to be ignored
 941
 942                         // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
 943                         goAssigningTheResult = true;
 944                     }
 945                     else
 946                     {
 947                         // Otherwise, go on competing for copying the results
 948                         handleComputationTakenOver = true;
 949                     }
 950                 }
 951                 else
 952                 {
 953                     handleComputationTakenOver = true;
 954                 }
 955             }
 956             else
 957             {
 958                 // If the final column has not been reached yet, store current values to the context.
 959                 // Select the other context instance as the previous one might be read by other threads.
 960                 CellContextInstance nextContextInstance = buildNextContextInstance(previousContextInstance);
 961                 FactorizationSolveL1StripeCellContext &destinationContext = buildBlockContextRef(cellContexts, currentBlock, nextContextInstance);
 962                 destinationContext.storePrecalculatedZs(Z);
 963
 964                 // Unlock the row until more columns can be used
 965                 cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
 966                 cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(finalColumnBlock, nextContextInstance, false);
 967                 // The descriptor might have been updated by a competing thread
 968                 if (!CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor))
 969                 {
 970                     // Adjust the ptrBElement to point to the result area...
 971                     ptrBElement = B + currentBlock * block_step;
 972                     // ...and go on handling the case
 973                     handleComputationTakenOver = true;
 974                 }
 975             }
 976
 977             if (handleComputationTakenOver)
 978             {
 979                 cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
 980                 // This can only happen if the row was (has become) the uppermost not fully completed one
 981                 // and the competing thread is at final stage of calculation (i.e., it has reached the currentBlock column).
 982                 if (existingDescriptor != INVALID_CELLDESCRIPTOR)
 983                 {
 984                     // If not fully completed this must be the final stage of the result assignment into the matrix
 985                     dIASSERT(existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true));
 986
 987                     // Go on competing copying the result as anyway the block is the topmost not completed one
 988                     // and since there was competition for it, there is no other work that can be done right now.
 989                     const FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
 990                     resultContext.loadPrecalculatedZs(Y);
 991
 992                     goAssigningTheResult = true;
 993                 }
 994                 else
 995                 {
 996                     // everything is over -- just go handling next blocks
 997                 }
 998             }
 999         }
1000         else if (goForLockedBlockDuplicateCalculation)
1001         {
1002             blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
1003
1004             bool skipToHandlingSubsequentRows = false, skiptoCopyingResult = false;
1005
1006             /* declare variables */
1007             const dReal *ptrLElement;
1008
1009             if (completedColumnBlock < currentBlock)
1010             {
1011                 /* compute all 2 x 2 block of X, from rows i..i+2-1 */
1012                 ptrLElement = L + (currentBlock * rowSkip + completedColumnBlock) * block_step;
1013                 ptrBElement = B + completedColumnBlock * block_step;
1014
1015                 /* the inner loop that computes outer products and adds them to Z */
1016                 // The iteration starts with even number and decreases it by 2. So, it must end in zero
1017                 const unsigned finalColumnBlock = currentBlock;
1018                 dIASSERT(currentBlock == completedBlocks); // Why would we be competing for a row otherwise?
1019
1020                 unsigned lastCompletedColumn = completedColumnBlock;
1021                 unsigned columnCounter = finalColumnBlock - completedColumnBlock;
1022                 for (bool exitInnerLoop = false; !exitInnerLoop; exitInnerLoop = --columnCounter == 0)
1023                 {
1024                     /* declare p and q vectors, etc */
1025                     dReal p[block_step], q[b_rows];
1026
1027                     /* compute outer product and add it to the Z matrix */
1028                     p[0] = ptrLElement[0];
1029                     q[0] = ptrBElement[0];
1030                     Z[0][0] += p[0] * q[0];
1031                     if (b_rows >= 2)
1032                     {
1033                         q[1] = ptrBElement[rowSkip];
1034                         Z[0][1] += p[0] * q[1];
1035                     }
1036                     p[1] = ptrLElement[rowSkip];
1037                     Z[1][0] += p[1] * q[0];
1038                     if (b_rows >= 2)
1039                     {
1040                         Z[1][1] += p[1] * q[1];
1041                     }
1042
1043                     /* compute outer product and add it to the Z matrix */
1044                     p[0] = ptrLElement[1];
1045                     q[0] = ptrBElement[1];
1046                     Z[0][0] += p[0] * q[0];
1047                     if (b_rows >= 2)
1048                     {
1049                         q[1] = ptrBElement[1 + rowSkip];
1050                         Z[0][1] += p[0] * q[1];
1051                     }
1052                     p[1] = ptrLElement[1 + rowSkip];
1053                     Z[1][0] += p[1] * q[0];
1054                     if (b_rows >= 2)
1055                     {
1056                         Z[1][1] += p[1] * q[1];
1057                     }
1058
1059                     dSASSERT(block_step == 2);
1060                     dSASSERT(b_rows >= 1 && b_rows <= 2);
1061
1062                     // Check if the primary solver thread has not made any progress
1063                     cellindexint descriptorVerification = blockProgressDescriptors[currentBlock];
1064                     unsigned newCompletedColumn = GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification);
1065
1066                     if (newCompletedColumn != lastCompletedColumn)
1067                     {
1068                         // Check, this is the first change the current thread detects.
1069                         // There is absolutely no reason in code for the computation to stop/resume twice
1070                         // while the current thread is competing.
1071                         dIASSERT(lastCompletedColumn == completedColumnBlock);
1072
1073                         if (descriptorVerification == INVALID_CELLDESCRIPTOR)
1074                         {
1075                             skipToHandlingSubsequentRows = true;
1076                             break;
1077                         }
1078
1079                         if (newCompletedColumn == currentBlock + 1)
1080                         {
1081                             skiptoCopyingResult = true;
1082                             break;
1083                         }
1084
1085                         // Check if the current thread is behind
1086                         if (newCompletedColumn > finalColumnBlock - columnCounter)
1087                         {
1088                             // If so, go starting over one more time
1089                             blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
1090                             stayWithinTheBlock = true;
1091                             skipToHandlingSubsequentRows = true;
1092                             break;
1093                         }
1094
1095                         // If current thread is ahead, just save new completed column for further comparisons and go on calculating
1096                         lastCompletedColumn = newCompletedColumn;
1097                     }
1098
1099                     /* advance pointers */
1100                     ptrLElement += block_step;
1101                     ptrBElement += block_step;
1102                     /* end of inner loop */
1103                 }
1104             }
1105             else if (completedColumnBlock > currentBlock)
1106             {
1107                 dIASSERT(completedColumnBlock == currentBlock + 1);
1108
1109                 skiptoCopyingResult = true;
1110             }
1111             else
1112             {
1113                 dIASSERT(currentBlock == 0); // Execution can get here within the very first block only
1114
1115                 /* assign the pointers appropriately and go on computing the results */
1116                 ptrLElement = L/* + (currentBlock * rowSkip + completedColumnBlock) * block_step*/;
1117                 ptrBElement = B/* + completedColumnBlock * block_step*/;
1118             }
1119
1120             if (!skipToHandlingSubsequentRows)
1121             {
1122                 if (!skiptoCopyingResult)
1123                 {
1124                     /* finish computing the X(i) block */
1125                     Y[0][0] = ptrBElement[0] - Z[0][0];
1126                     if (b_rows >= 2)
1127                     {
1128                         Y[0][1] = ptrBElement[rowSkip] - Z[0][1];
1129                     }
1130
1131                     dReal p2 = ptrLElement[rowSkip];
1132
1133                     Y[1][0] = ptrBElement[1] - Z[1][0] - p2 * Y[0][0];
1134                     if (b_rows >= 2)
1135                     {
1136                         Y[1][1] = ptrBElement[1 + rowSkip] - Z[1][1] - p2 * Y[0][1];
1137                     }
1138
1139                     dSASSERT(block_step == 2);
1140                     dSASSERT(b_rows >= 1 && b_rows <= 2);
1141
1142                     // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
1143                     CooperativeAtomics::AtomicReadReorderBarrier();
1144
1145                     cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
1146
1147                     if (existingDescriptor == INVALID_CELLDESCRIPTOR)
1148                     {
1149                         // Everything is over -- proceed to subsequent rows
1150                         skipToHandlingSubsequentRows = true;
1151                     }
1152                     else if (existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true))
1153                     {
1154                         // The values computed above may not be valid. Copy the values already in the result context.
1155                         skiptoCopyingResult = true;
1156                     }
1157                     else
1158                     {
1159                         // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
1160                         // and the computation result is valid.
1161                         cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true); // put the computation at the top so that the evaluation result from the expression above is reused
1162
1163                         // Assign the results to the result context (possibly in parallel with other threads
1164                         // that could and ought to be assigning exactly the same values)
1165                         FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
1166                         resultContext.storePrecalculatedZs(Y);
1167
1168                         // Assign the result assignment progress descriptor
1169                         CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], existingDescriptor, newDescriptor); // the result is to be ignored
1170
1171                         // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
1172                     }
1173                 }
1174
1175                 if (!skipToHandlingSubsequentRows)
1176                 {
1177                     if (skiptoCopyingResult)
1178                     {
1179                         // Extract the result values stored in the result context
1180                         const FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
1181                         resultContext.loadPrecalculatedZs(Y);
1182
1183                         ptrBElement = B + currentBlock * block_step;
1184                     }
1185
1186                     goAssigningTheResult = true;
1187                 }
1188             }
1189         }
1190
1191         if (goAssigningTheResult)
1192         {
1193             cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
1194             // Check if the assignment has not been completed yet
1195             if (existingDescriptor != INVALID_CELLDESCRIPTOR)
1196             {
1197                 // Assign the computation results to their places in the matrix
1198                 ptrBElement[0] = Y[0][0];
1199                 ptrBElement[1] = Y[1][0];
1200                 if (b_rows >= 2)
1201                 {
1202                     ptrBElement[rowSkip] = Y[0][1];
1203                     ptrBElement[1 + rowSkip] = Y[1][1];
1204                 }
1205
1206                 dSASSERT(block_step == 2);
1207                 dSASSERT(b_rows >= 1 && b_rows <= 2);
1208
1209                 ThrsafeIncrementIntUpToLimit(&refBlockCompletionProgress, currentBlock + 1);
1210                 dIASSERT(refBlockCompletionProgress >= currentBlock + 1);
1211
1212                 // And assign the completed status no matter what
1213                 CooperativeAtomics::AtomicStoreCellindexint(&blockProgressDescriptors[currentBlock], INVALID_CELLDESCRIPTOR);
1214             }
1215             else
1216             {
1217                 // everything is over -- just go handling next blocks
1218             }
1219         }
1220
1221         if (!stayWithinTheBlock)
1222         {
1223             completedBlocks = refBlockCompletionProgress;
1224
1225             if (completedBlocks == blockCount)
1226             {
1227                 break;
1228             }
1229
1230             currentBlock += 1;
1231
1232             bool lookaheadBoundaryReached = false;
1233
1234             if (currentBlock == blockCount || completedBlocks == 0)
1235             {
1236                 lookaheadBoundaryReached = true;
1237             }
1238             else if (currentBlock >= completedBlocks + lookaheadRange)
1239             {
1240                 lookaheadBoundaryReached = blockProcessingState > BPS_NO_BLOCKS_PROCESSED;
1241             }
1242             else if (currentBlock < completedBlocks)
1243             {
1244                 // Treat detected row advancement as a row processed
1245                 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
1246
1247                 currentBlock = completedBlocks;
1248             }
1249
1250             if (lookaheadBoundaryReached)
1251             {
1252                 dIASSERT(blockProcessingState != BPS_COMPETING_FOR_A_BLOCK); // Why did not we compete???
1253
1254                 // If no row has been processed in the previous pass, compete for the next row to avoid cycling uselessly
1255                 if (blockProcessingState <= BPS_NO_BLOCKS_PROCESSED)
1256                 {
1257                     // Abandon job if too few blocks remain
1258                     if (blockCount - completedBlocks <= ownThreadIndex)
1259                     {
1260                         break;
1261                     }
1262
1263                     blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
1264                 }
1265                 else
1266                 {
1267                     // If there was some progress, just continue to the next pass
1268                     blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
1269                 }
1270
1271                 currentBlock = completedBlocks;
1272             }
1273         }
1274     }
1275 }
1276
1277
1278 template<unsigned int a_rows, unsigned int d_stride>
1279 /*static */
1280 void ThreadedEquationSolverLDLT::participateScalingAndFactorizingL1Stripe_X(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip,
1281     FactorizationFactorizeL1StripeContext *factorizationContext, unsigned ownThreadIndex)
1282 {
1283     dIASSERT(factorizationRow != 0);
1284     dIASSERT(factorizationRow % 2 == 0);
1285
1286     /* scale the elements in a 2 x i block at A(i,0), and also */
1287     /* compute Z = the outer product matrix that we'll need. */
1288     dReal sameZ[a_rows] = { REAL(0.0), }, mixedZ[dMACRO_MAX(a_rows - 1, 1)] = { REAL(0.0), };
1289     bool doneAnything = false;
1290
1291     const unsigned blockSize = deriveScalingAndFactorizingL1StripeBlockSize(a_rows);
1292
1293     const unsigned blockCount = deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(factorizationRow, blockSize);
1294     dIASSERT(blockCount != 0);
1295
1296     unsigned blockIndex;
1297     while ((blockIndex = ThrsafeIncrementIntUpToLimit(&factorizationContext->m_nextColumnIndex, blockCount)) != blockCount)
1298     {
1299         doneAnything = true;
1300         unsigned blockStartRow = blockIndex * blockSize;
1301
1302         dReal *ptrAElement = ARow + blockStartRow;
1303         dReal *ptrDElement = d + blockStartRow * d_stride;
1304         for (unsigned columnCounter = blockIndex != blockCount - 1 ? blockSize : factorizationRow - blockStartRow; ; )
1305         {
1306             dReal p1, q1, p2, q2, dd;
1307
1308             p1 = ptrAElement[0];
1309             if (a_rows >= 2)
1310             {
1311                 p2 = ptrAElement[rowSkip];
1312             }
1313             dd = ptrDElement[0 * d_stride];
1314             q1 = p1 * dd;
1315             if (a_rows >= 2)
1316             {
1317                 q2 = p2 * dd;
1318             }
1319             ptrAElement[0] = q1;
1320             if (a_rows >= 2)
1321             {
1322                 ptrAElement[rowSkip] = q2;
1323             }
1324             sameZ[0] += p1 * q1;
1325             if (a_rows >= 2)
1326             {
1327                 sameZ[1] += p2 * q2;
1328                 mixedZ[0] += p2 * q1;
1329             }
1330
1331             p1 = ptrAElement[1];
1332             if (a_rows >= 2)
1333             {
1334                 p2 = ptrAElement[1 + rowSkip];
1335             }
1336             dd = ptrDElement[1 * d_stride];
1337             q1 = p1 * dd;
1338             if (a_rows >= 2)
1339             {
1340                 q2 = p2 * dd;
1341             }
1342             ptrAElement[1] = q1;
1343             if (a_rows >= 2)
1344             {
1345                 ptrAElement[1 + rowSkip] = q2;
1346             }
1347             sameZ[0] += p1 * q1;
1348             if (a_rows >= 2)
1349             {
1350                 sameZ[1] += p2 * q2;
1351                 mixedZ[0] += p2 * q1;
1352             }
1353
1354             if (columnCounter > 6)
1355             {
1356                 columnCounter -= 6;
1357
1358                 ptrAElement += 6;
1359                 ptrDElement += 6 * d_stride;
1360
1361                 p1 = ptrAElement[-4];
1362                 if (a_rows >= 2)
1363                 {
1364                     p2 = ptrAElement[-4 + rowSkip];
1365                 }
1366                 dd = ptrDElement[-4 * (int)d_stride];
1367                 q1 = p1 * dd;
1368                 if (a_rows >= 2)
1369                 {
1370                     q2 = p2 * dd;
1371                 }
1372                 ptrAElement[-4] = q1;
1373                 if (a_rows >= 2)
1374                 {
1375                     ptrAElement[-4 + rowSkip] = q2;
1376                 }
1377                 sameZ[0] += p1 * q1;
1378                 if (a_rows >= 2)
1379                 {
1380                     sameZ[1] += p2 * q2;
1381                     mixedZ[0] += p2 * q1;
1382                 }
1383
1384                 p1 = ptrAElement[-3];
1385                 if (a_rows >= 2)
1386                 {
1387                     p2 = ptrAElement[-3 + rowSkip];
1388                 }
1389                 dd = ptrDElement[-3 * (int)d_stride];
1390                 q1 = p1 * dd;
1391                 if (a_rows >= 2)
1392                 {
1393                     q2 = p2 * dd;
1394                 }
1395                 ptrAElement[-3] = q1;
1396                 if (a_rows >= 2)
1397                 {
1398                     ptrAElement[-3 + rowSkip] = q2;
1399                 }
1400                 sameZ[0] += p1 * q1;
1401                 if (a_rows >= 2)
1402                 {
1403                     sameZ[1] += p2 * q2;
1404                     mixedZ[0] += p2 * q1;
1405                 }
1406
1407                 p1 = ptrAElement[-2];
1408                 if (a_rows >= 2)
1409                 {
1410                     p2 = ptrAElement[-2 + rowSkip];
1411                 }
1412                 dd = ptrDElement[-2 * (int)d_stride];
1413                 q1 = p1 * dd;
1414                 if (a_rows >= 2)
1415                 {
1416                     q2 = p2 * dd;
1417                 }
1418                 ptrAElement[-2] = q1;
1419                 if (a_rows >= 2)
1420                 {
1421                     ptrAElement[-2 + rowSkip] = q2;
1422                 }
1423                 sameZ[0] += p1 * q1;
1424                 if (a_rows >= 2)
1425                 {
1426                     sameZ[1] += p2 * q2;
1427                     mixedZ[0] += p2 * q1;
1428                 }
1429
1430                 p1 = ptrAElement[-1];
1431                 if (a_rows >= 2)
1432                 {
1433                     p2 = ptrAElement[-1 + rowSkip];
1434                 }
1435                 dd = ptrDElement[-1 * (int)d_stride];
1436                 q1 = p1 * dd;
1437                 if (a_rows >= 2)
1438                 {
1439                     q2 = p2 * dd;
1440                 }
1441                 ptrAElement[-1] = q1;
1442                 if (a_rows >= 2)
1443                 {
1444                     ptrAElement[-1 + rowSkip] = q2;
1445                 }
1446                 sameZ[0] += p1 * q1;
1447                 if (a_rows >= 2)
1448                 {
1449                     sameZ[1] += p2 * q2;
1450                     mixedZ[0] += p2 * q1;
1451                 }
1452             }
1453             else
1454             {
1455                 ptrAElement += 2;
1456                 ptrDElement += 2 * d_stride;
1457
1458                 if ((columnCounter -= 2) == 0)
1459                 {
1460                     break;
1461                 }
1462             }
1463         }
1464     }
1465
1466     if (doneAnything)
1467     {
1468         unsigned partialSumThreadIndex;
1469         for (bool exitLoop = false; !exitLoop; exitLoop = CooperativeAtomics::AtomicCompareExchangeUint32(&factorizationContext->m_sumThreadIndex, partialSumThreadIndex, ownThreadIndex + 1))
1470         {
1471             partialSumThreadIndex = factorizationContext->m_sumThreadIndex;
1472
1473             if (partialSumThreadIndex != 0)
1474             {
1475                 const FactorizationFactorizeL1StripeThreadContext &partialSumContext = factorizationContext->m_threadContexts[partialSumThreadIndex - 1];
1476                 factorizationContext->m_threadContexts[ownThreadIndex].assignDataSum<a_rows>(sameZ, mixedZ, partialSumContext);
1477             }
1478             else
1479             {
1480                 factorizationContext->m_threadContexts[ownThreadIndex].assignDataAlone<a_rows>(sameZ, mixedZ);
1481             }
1482         }
1483     }
1484
1485     unsigned threadExitIndex = CooperativeAtomics::AtomicDecrementUint32(&factorizationContext->m_threadsRunning);
1486     dIASSERT(threadExitIndex + 1U != 0);
1487
1488     if (threadExitIndex == 0)
1489     {
1490         // Let the last thread retrieve the sum and perform final computations
1491         unsigned sumThreadIndex = factorizationContext->m_sumThreadIndex;
1492         dIASSERT(sumThreadIndex != 0); // The rowIndex was asserted to be not zero, so at least one thread must have done something
1493
1494         const FactorizationFactorizeL1StripeThreadContext &sumContext = factorizationContext->m_threadContexts[sumThreadIndex - 1];
1495         sumContext.retrieveData<a_rows>(sameZ, mixedZ);
1496
1497         dReal *ptrAElement = ARow + factorizationRow;
1498         dReal *ptrDElement = d + factorizationRow * d_stride;
1499
1500         /* solve for diagonal 2 x 2 block at A(i,i) */
1501         dReal Y11, Y21, Y22;
1502
1503         Y11 = ptrAElement[0] - sameZ[0];
1504         if (a_rows >= 2)
1505         {
1506             Y21 = ptrAElement[rowSkip] - mixedZ[0];
1507             Y22 = ptrAElement[1 + rowSkip] - sameZ[1];
1508         }
1509
1510         /* factorize 2 x 2 block Y, ptrDElement */
1511         /* factorize row 1 */
1512         dReal dd = dRecip(Y11);
1513
1514         ptrDElement[0 * d_stride] = dd;
1515         dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
1516
1517         if (a_rows >= 2)
1518         {
1519             /* factorize row 2 */
1520             dReal q2 = Y21 * dd;
1521             ptrAElement[rowSkip] = q2;
1522
1523             dReal sum = Y21 * q2;
1524             ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
1525         }
1526     }
1527 }
1528
1529
1530 #endif // #ifndef _ODE_FASTLDLT_IMPL_H_