ode/src/fastltsolve_impl.h

   1
   2
   3 /*************************************************************************
   4  *                                                                       *
   5  * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
   6  * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
   7  *                                                                       *
   8  * This library is free software; you can redistribute it and/or         *
   9  * modify it under the terms of EITHER:                                  *
  10  *   (1) The GNU Lesser General Public License as published by the Free  *
  11  *       Software Foundation; either version 2.1 of the License, or (at  *
  12  *       your option) any later version. The text of the GNU Lesser      *
  13  *       General Public License is included with this library in the     *
  14  *       file LICENSE.TXT.                                               *
  15  *   (2) The BSD-style license that is included with this library in     *
  16  *       the file LICENSE-BSD.TXT.                                       *
  17  *                                                                       *
  18  * This library is distributed in the hope that it will be useful,       *
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
  21  * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
  22  *                                                                       *
  23  *************************************************************************/
  24
  25 /*
  26  * Code style improvements and optimizations by Oleh Derevenko ????-2024
  27  * L1Transposed cooperative solving code of ThreadedEquationSolverLDLT copyright (c) 2017-2024 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
  28  */
  29
  30
  31 #ifndef _ODE_FASTLTSOLVE_IMPL_H_
  32 #define _ODE_FASTLTSOLVE_IMPL_H_
  33
  34
  35 /* solve L^T * x=b, with b containing 1 right hand side.
  36  * L is an n*n lower triangular matrix with ones on the diagonal.
  37  * L is stored by rows and its leading dimension is rowSkip.
  38  * b is an n*1 matrix that contains the right hand side.
  39  * b is overwritten with x.
  40  * this processes blocks of 4.
  41  */
  42
  43 template<unsigned int b_stride>
  44 void solveL1Transposed(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
  45 {
  46     dIASSERT(rowCount != 0);
  47
  48     /* special handling for L and B because we're solving L1 *transpose* */
  49     const dReal *lastLElement = L + (sizeint)(rowCount - 1) * (rowSkip + 1);
  50     dReal *lastBElement = B + (sizeint)(rowCount - 1) * b_stride;
  51
  52     /* compute rows at end that are not a multiple of block size */
  53     const unsigned loopX1RowCount = rowCount % 4;
  54
  55     unsigned blockStartRow = loopX1RowCount;
  56     bool subsequentPass  = false;
  57
  58     /* compute rightmost bottom X(i) block */
  59     if (loopX1RowCount != 0)
  60     {
  61         subsequentPass = true;
  62
  63         const dReal *ptrLElement = lastLElement;
  64         dReal *ptrBElement = lastBElement;
  65
  66         dReal Y11 = ptrBElement[0 * b_stride]/* - Z11*/;
  67         // ptrBElement[0 * b_stride] = Y11; -- unchanged
  68
  69         if (loopX1RowCount >= 2)
  70         {
  71             dReal p2 = ptrLElement[-1];
  72             dReal Y21 = ptrBElement[-1 * (int)b_stride]/* - Z21 */- p2 * Y11;
  73             ptrBElement[-1 * (int)b_stride] = Y21;
  74
  75             if (loopX1RowCount > 2)
  76             {
  77                 dReal p3 = ptrLElement[-2];
  78                 dReal p3_1 = (ptrLElement - rowSkip)[-2];
  79                 dReal Y31 = ptrBElement[-2 * (int)b_stride]/* - Z31 */- p3 * Y11 - p3_1 * Y21;
  80                 ptrBElement[-2 * (int)b_stride] = Y31;
  81             }
  82         }
  83     }
  84
  85     /* compute all 4 x 1 blocks of X */
  86     for (; !subsequentPass || blockStartRow < rowCount; subsequentPass = true, blockStartRow += 4)
  87     {
  88         /* compute all 4 x 1 block of X, from rows i..i+4-1 */
  89
  90         /* declare variables - Z matrix, p and q vectors, etc */
  91         const dReal *ptrLElement;
  92         dReal *ptrBElement;
  93
  94         dReal Z41, Z31, Z21, Z11;
  95
  96         if (subsequentPass)
  97         {
  98             ptrLElement = lastLElement - blockStartRow;
  99             ptrBElement = lastBElement;
 100
 101             /* set the Z matrix to 0 */
 102             Z41 = 0; Z31 = 0; Z21 = 0; Z11 = 0;
 103
 104             unsigned rowCounter = blockStartRow;
 105
 106             if (rowCounter % 2 != 0)
 107             {
 108                 dReal q1, p4, p3, p2, p1;
 109
 110                 /* load p and q values */
 111                 q1 = ptrBElement[0 * (int)b_stride];
 112                 p4 = ptrLElement[-3];
 113                 p3 = ptrLElement[-2];
 114                 p2 = ptrLElement[-1];
 115                 p1 = ptrLElement[0];
 116                 ptrLElement -= rowSkip;
 117
 118                 /* compute outer product and add it to the Z matrix */
 119                 Z41 += p4 * q1;
 120                 Z31 += p3 * q1;
 121                 Z21 += p2 * q1;
 122                 Z11 += p1 * q1;
 123
 124                 ptrBElement -= 1 * b_stride;
 125                 rowCounter -= 1;
 126             }
 127
 128             if (rowCounter % 4 != 0)
 129             {
 130                 dReal q1, p4, p3, p2, p1;
 131
 132                 /* load p and q values */
 133                 q1 = ptrBElement[0 * (int)b_stride];
 134                 p4 = ptrLElement[-3];
 135                 p3 = ptrLElement[-2];
 136                 p2 = ptrLElement[-1];
 137                 p1 = ptrLElement[0];
 138                 ptrLElement -= rowSkip;
 139
 140                 /* compute outer product and add it to the Z matrix */
 141                 Z41 += p4 * q1;
 142                 Z31 += p3 * q1;
 143                 Z21 += p2 * q1;
 144                 Z11 += p1 * q1;
 145
 146                 /* load p and q values */
 147                 q1 = ptrBElement[-1 * (int)b_stride];
 148                 p4 = ptrLElement[-3];
 149                 p3 = ptrLElement[-2];
 150                 p2 = ptrLElement[-1];
 151                 p1 = ptrLElement[0];
 152                 ptrLElement -= rowSkip;
 153
 154                 /* compute outer product and add it to the Z matrix */
 155                 Z41 += p4 * q1;
 156                 Z31 += p3 * q1;
 157                 Z21 += p2 * q1;
 158                 Z11 += p1 * q1;
 159
 160                 ptrBElement -= 2 * b_stride;
 161                 rowCounter -= 2;
 162             }
 163
 164             /* the inner loop that computes outer products and adds them to Z */
 165             for (bool exitLoop = rowCounter == 0; !exitLoop; exitLoop = false)
 166             {
 167                 dReal q1, p4, p3, p2, p1;
 168
 169                 /* load p and q values */
 170                 q1 = ptrBElement[0 * (int)b_stride];
 171                 p4 = ptrLElement[-3];
 172                 p3 = ptrLElement[-2];
 173                 p2 = ptrLElement[-1];
 174                 p1 = ptrLElement[0];
 175                 ptrLElement -= rowSkip;
 176
 177                 /* compute outer product and add it to the Z matrix */
 178                 Z41 += p4 * q1;
 179                 Z31 += p3 * q1;
 180                 Z21 += p2 * q1;
 181                 Z11 += p1 * q1;
 182
 183                 /* load p and q values */
 184                 q1 = ptrBElement[-1 * (int)b_stride];
 185                 p4 = ptrLElement[-3];
 186                 p3 = ptrLElement[-2];
 187                 p2 = ptrLElement[-1];
 188                 p1 = ptrLElement[0];
 189                 ptrLElement -= rowSkip;
 190
 191                 /* compute outer product and add it to the Z matrix */
 192                 Z41 += p4 * q1;
 193                 Z31 += p3 * q1;
 194                 Z21 += p2 * q1;
 195                 Z11 += p1 * q1;
 196
 197                 /* load p and q values */
 198                 q1 = ptrBElement[-2 * (int)b_stride];
 199                 p4 = ptrLElement[-3];
 200                 p3 = ptrLElement[-2];
 201                 p2 = ptrLElement[-1];
 202                 p1 = ptrLElement[0];
 203                 ptrLElement -= rowSkip;
 204
 205                 /* compute outer product and add it to the Z matrix */
 206                 Z41 += p4 * q1;
 207                 Z31 += p3 * q1;
 208                 Z21 += p2 * q1;
 209                 Z11 += p1 * q1;
 210
 211                 /* load p and q values */
 212                 q1 = ptrBElement[-3 * (int)b_stride];
 213                 p4 = ptrLElement[-3];
 214                 p3 = ptrLElement[-2];
 215                 p2 = ptrLElement[-1];
 216                 p1 = ptrLElement[0];
 217                 ptrLElement -= rowSkip;
 218
 219                 /* compute outer product and add it to the Z matrix */
 220                 Z41 += p4 * q1;
 221                 Z31 += p3 * q1;
 222                 Z21 += p2 * q1;
 223                 Z11 += p1 * q1;
 224
 225                 if (rowCounter > 12)
 226                 {
 227                     rowCounter -= 12;
 228
 229                     ptrBElement -= 12 * b_stride;
 230
 231                     /* load p and q values */
 232                     q1 = ptrBElement[8 * b_stride];
 233                     p4 = ptrLElement[-3];
 234                     p3 = ptrLElement[-2];
 235                     p2 = ptrLElement[-1];
 236                     p1 = ptrLElement[0];
 237                     ptrLElement -= rowSkip;
 238
 239                     /* compute outer product and add it to the Z matrix */
 240                     Z41 += p4 * q1;
 241                     Z31 += p3 * q1;
 242                     Z21 += p2 * q1;
 243                     Z11 += p1 * q1;
 244
 245                     /* load p and q values */
 246                     q1 = ptrBElement[7 * b_stride];
 247                     p4 = ptrLElement[-3];
 248                     p3 = ptrLElement[-2];
 249                     p2 = ptrLElement[-1];
 250                     p1 = ptrLElement[0];
 251                     ptrLElement -= rowSkip;
 252
 253                     /* compute outer product and add it to the Z matrix */
 254                     Z41 += p4 * q1;
 255                     Z31 += p3 * q1;
 256                     Z21 += p2 * q1;
 257                     Z11 += p1 * q1;
 258
 259                     /* load p and q values */
 260                     q1 = ptrBElement[6 * b_stride];
 261                     p4 = ptrLElement[-3];
 262                     p3 = ptrLElement[-2];
 263                     p2 = ptrLElement[-1];
 264                     p1 = ptrLElement[0];
 265                     ptrLElement -= rowSkip;
 266
 267                     /* compute outer product and add it to the Z matrix */
 268                     Z41 += p4 * q1;
 269                     Z31 += p3 * q1;
 270                     Z21 += p2 * q1;
 271                     Z11 += p1 * q1;
 272
 273                     /* load p and q values */
 274                     q1 = ptrBElement[5 * b_stride];
 275                     p4 = ptrLElement[-3];
 276                     p3 = ptrLElement[-2];
 277                     p2 = ptrLElement[-1];
 278                     p1 = ptrLElement[0];
 279                     ptrLElement -= rowSkip;
 280
 281                     /* compute outer product and add it to the Z matrix */
 282                     Z41 += p4 * q1;
 283                     Z31 += p3 * q1;
 284                     Z21 += p2 * q1;
 285                     Z11 += p1 * q1;
 286
 287                     /* load p and q values */
 288                     q1 = ptrBElement[4 * b_stride];
 289                     p4 = ptrLElement[-3];
 290                     p3 = ptrLElement[-2];
 291                     p2 = ptrLElement[-1];
 292                     p1 = ptrLElement[0];
 293                     ptrLElement -= rowSkip;
 294
 295                     /* compute outer product and add it to the Z matrix */
 296                     Z41 += p4 * q1;
 297                     Z31 += p3 * q1;
 298                     Z21 += p2 * q1;
 299                     Z11 += p1 * q1;
 300
 301                     /* load p and q values */
 302                     q1 = ptrBElement[3 * b_stride];
 303                     p4 = ptrLElement[-3];
 304                     p3 = ptrLElement[-2];
 305                     p2 = ptrLElement[-1];
 306                     p1 = ptrLElement[0];
 307                     ptrLElement -= rowSkip;
 308
 309                     /* compute outer product and add it to the Z matrix */
 310                     Z41 += p4 * q1;
 311                     Z31 += p3 * q1;
 312                     Z21 += p2 * q1;
 313                     Z11 += p1 * q1;
 314
 315                     /* load p and q values */
 316                     q1 = ptrBElement[2 * b_stride];
 317                     p4 = ptrLElement[-3];
 318                     p3 = ptrLElement[-2];
 319                     p2 = ptrLElement[-1];
 320                     p1 = ptrLElement[0];
 321                     ptrLElement -= rowSkip;
 322
 323                     /* compute outer product and add it to the Z matrix */
 324                     Z41 += p4 * q1;
 325                     Z31 += p3 * q1;
 326                     Z21 += p2 * q1;
 327                     Z11 += p1 * q1;
 328
 329                     /* load p and q values */
 330                     q1 = ptrBElement[1 * b_stride];
 331                     p4 = ptrLElement[-3];
 332                     p3 = ptrLElement[-2];
 333                     p2 = ptrLElement[-1];
 334                     p1 = ptrLElement[0];
 335                     ptrLElement -= rowSkip;
 336
 337                     /* compute outer product and add it to the Z matrix */
 338                     Z41 += p4 * q1;
 339                     Z31 += p3 * q1;
 340                     Z21 += p2 * q1;
 341                     Z11 += p1 * q1;
 342                 }
 343                 else
 344                 {
 345                     ptrBElement -= 4 * b_stride;
 346
 347                     if ((rowCounter -= 4) == 0)
 348                     {
 349                         break;
 350                     }
 351                 }
 352                 /* end of inner loop */
 353             }
 354         }
 355         else
 356         {
 357             ptrLElement = lastLElement/* - blockStartRow*/; dIASSERT(blockStartRow == 0);
 358             ptrBElement = lastBElement;
 359
 360             /* set the Z matrix to 0 */
 361             Z41 = 0; Z31 = 0; Z21 = 0; Z11 = 0;
 362         }
 363
 364         /* finish computing the X(i) block */
 365         dReal Y11, Y21, Y31, Y41;
 366         {
 367             Y11 = ptrBElement[0 * b_stride] - Z11;
 368             ptrBElement[0 * b_stride] = Y11;
 369         }
 370         {
 371             dReal p2 = ptrLElement[-1];
 372             Y21 = ptrBElement[-1 * (int)b_stride] - Z21 - p2 * Y11;
 373             ptrBElement[-1 * (int)b_stride] = Y21;
 374         }
 375         {
 376             dReal p3 = ptrLElement[-2];
 377             dReal p3_1 = (ptrLElement - rowSkip)[-2];
 378             Y31 = ptrBElement[-2 * (int)b_stride] - Z31 - p3 * Y11 - p3_1 * Y21;
 379             ptrBElement[-2 * (int)b_stride] = Y31;
 380         }
 381         {
 382             dReal p4 = ptrLElement[-3];
 383             dReal p4_1 = (ptrLElement - rowSkip)[-3];
 384             dReal p4_2 = (ptrLElement - rowSkip * 2)[-3];
 385             Y41 = ptrBElement[-3 * (int)b_stride] - Z41 - p4 * Y11 - p4_1 * Y21 - p4_2 * Y31;
 386             ptrBElement[-3 * (int)b_stride] = Y41;
 387         }
 388         /* end of outer loop */
 389     }
 390 }
 391
 392
 393
 394 template<unsigned int block_step>
 395 /*static */
 396 sizeint ThreadedEquationSolverLDLT::estimateCooperativelySolvingL1TransposedMemoryRequirement(unsigned rowCount, SolvingL1TransposedMemoryEstimates &ref_solvingMemoryEstimates)
 397 {
 398     unsigned blockCount = deriveSolvingL1TransposedBlockCount(rowCount, block_step);
 399     sizeint descriptorSizeRequired = dEFFICIENT_SIZE(sizeof(cellindexint) * blockCount);
 400     sizeint contextSizeRequired = dEFFICIENT_SIZE(sizeof(SolveL1TransposedCellContext) * (CCI__MAX + 1) * blockCount);
 401     ref_solvingMemoryEstimates.assignData(descriptorSizeRequired, contextSizeRequired);
 402
 403     sizeint totalSizeRequired = descriptorSizeRequired + contextSizeRequired;
 404     return totalSizeRequired;
 405 }
 406
 407 template<unsigned int block_step>
 408 /*static */
 409 void ThreadedEquationSolverLDLT::initializeCooperativelySolveL1TransposedMemoryStructures(unsigned rowCount,
 410     atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1TransposedCellContext *dUNUSED(cellContexts))
 411 {
 412     unsigned blockCount = deriveSolvingL1TransposedBlockCount(rowCount, block_step);
 413
 414     out_blockCompletionProgress = 0;
 415     memset(blockProgressDescriptors, 0, blockCount * sizeof(*blockProgressDescriptors));
 416 }
 417
 418 template<unsigned int block_step, unsigned int b_stride>
 419 /*static */
 420 void ThreadedEquationSolverLDLT::participateSolvingL1Transposed(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip,
 421     volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
 422     SolveL1TransposedCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex)
 423 {
 424     const unsigned lookaheadRange = 32;
 425     const unsigned blockCount = deriveSolvingL1TransposedBlockCount(rowCount, block_step);
 426     /* compute rows at end that are not a multiple of block size */
 427     const unsigned loopX1RowCount = rowCount % block_step;
 428
 429     /* special handling for L and B because we're solving L1 *transpose* */
 430     const dReal *lastLElement = L + (rowCount - 1) * ((sizeint)rowSkip + 1);
 431     dReal *lastBElement = B + (rowCount - 1) * (sizeint)b_stride;
 432
 433     /* elements adjusted as if the last block was full block_step elements */
 434     unsigned x1AdjustmentElements = (block_step - loopX1RowCount) % block_step;
 435     const dReal *columnAdjustedLastLElement = lastLElement + x1AdjustmentElements;
 436     const dReal *fullyAdjustedLastLElement = columnAdjustedLastLElement + (sizeint)rowSkip * x1AdjustmentElements;
 437     dReal *adjustedLastBElement = lastBElement + b_stride * x1AdjustmentElements;
 438
 439     BlockProcessingState blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
 440
 441     unsigned completedBlocks = refBlockCompletionProgress;
 442     unsigned currentBlock = completedBlocks;
 443     dIASSERT(completedBlocks <= blockCount);
 444
 445     for (bool exitLoop = completedBlocks == blockCount; !exitLoop; exitLoop = false)
 446     {
 447         bool goForLockedBlockPrimaryCalculation = false, goForLockedBlockDuplicateCalculation = false;
 448         bool goAssigningTheResult = false, stayWithinTheBlock = false;
 449
 450         dReal Z[block_step];
 451         dReal Y[block_step];
 452
 453         dReal *ptrBElement;
 454
 455         CellContextInstance previousContextInstance;
 456         unsigned completedRowBlock;
 457         bool partialBlock;
 458
 459         for (cellindexint testDescriptor = blockProgressDescriptors[currentBlock]; ; )
 460         {
 461             if (testDescriptor == INVALID_CELLDESCRIPTOR)
 462             {
 463                 // Invalid descriptor is the indication that the row has been fully calculated
 464                 // Test if this was the last row and break out if so.
 465                 if (currentBlock + 1 == blockCount)
 466                 {
 467                     exitLoop = true;
 468                     break;
 469                 }
 470
 471                 // Treat detected row advancement as a row processed
 472                 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
 473                 break;
 474             }
 475
 476             CooperativeAtomics::AtomicReadReorderBarrier();
 477             // It is necessary to read up to date completedBblocks value after the descriptor retrieval
 478             // as otherwise the logic below breaks
 479             completedBlocks = refBlockCompletionProgress;
 480
 481             if (!GET_CELLDESCRIPTOR_ISLOCKED(testDescriptor))
 482             {
 483                 completedRowBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
 484                 dIASSERT(completedRowBlock < currentBlock || (completedRowBlock == currentBlock && currentBlock == 0)); // Otherwise, why would the calculation have had stopped if the final column is reachable???
 485                 dIASSERT(completedRowBlock <= completedBlocks); // Since the descriptor is not locked
 486
 487                 if (completedRowBlock == completedBlocks && currentBlock != completedBlocks)
 488                 {
 489                     dIASSERT(completedBlocks < currentBlock);
 490                     break;
 491                 }
 492
 493                 if (CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], testDescriptor, MARK_CELLDESCRIPTOR_LOCKED(testDescriptor)))
 494                 {
 495                     if (completedRowBlock != 0)
 496                     {
 497                         CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
 498                         previousContextInstance = contextInstance;
 499
 500                         const SolveL1TransposedCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
 501                         sourceContext.loadPrecalculatedZs(Z);
 502                     }
 503                     else
 504                     {
 505                         previousContextInstance = CCI__MIN;
 506                         SolveL1TransposedCellContext::initializePrecalculatedZs(Z);
 507                     }
 508
 509                     goForLockedBlockPrimaryCalculation = true;
 510                     break;
 511                 }
 512
 513                 if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
 514                 {
 515                     break;
 516                 }
 517
 518                 testDescriptor = blockProgressDescriptors[currentBlock];
 519             }
 520             else
 521             {
 522                 if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
 523                 {
 524                     break;
 525                 }
 526
 527                 cellindexint verificativeDescriptor;
 528                 bool verificationFailure = false;
 529
 530                 completedRowBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
 531                 dIASSERT(completedRowBlock != currentBlock || currentBlock == 0); // There is no reason for computations to stop at the very end other than being the initial value at the very first block
 532
 533                 if (completedRowBlock != 0)
 534                 {
 535                     CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
 536                     const SolveL1TransposedCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
 537                     sourceContext.loadPrecalculatedZs(Z);
 538                 }
 539                 else
 540                 {
 541                     SolveL1TransposedCellContext::initializePrecalculatedZs(Z);
 542                 }
 543
 544                 if (completedRowBlock != 0 && completedRowBlock <= currentBlock)
 545                 {
 546                     // Make sure the descriptor is re-read after the precalculates
 547                     CooperativeAtomics::AtomicReadReorderBarrier();
 548                 }
 549
 550                 if (completedRowBlock <= currentBlock)
 551                 {
 552                     verificativeDescriptor = blockProgressDescriptors[currentBlock];
 553                     verificationFailure = verificativeDescriptor != testDescriptor;
 554                 }
 555
 556                 if (!verificationFailure)
 557                 {
 558                     dIASSERT(completedRowBlock <= currentBlock + 1);
 559
 560                     goForLockedBlockDuplicateCalculation = true;
 561                     break;
 562                 }
 563
 564                 testDescriptor = verificativeDescriptor;
 565             }
 566         }
 567
 568         if (exitLoop)
 569         {
 570             break;
 571         }
 572
 573         if (goForLockedBlockPrimaryCalculation)
 574         {
 575             blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
 576
 577             // Declare and assign the variables at the top to not interfere with any branching -- the compiler is going to eliminate them anyway.
 578             bool handleComputationTakenOver = false, columnEndReached = false;
 579
 580             const dReal *ptrLElement;
 581             unsigned finalRowBlock;
 582
 583             /* check if this is not the partial block of fewer rows */
 584             if (currentBlock != 0 || loopX1RowCount == 0)
 585             {
 586                 partialBlock = false;
 587
 588                 ptrLElement = completedRowBlock != 0
 589                     ? fullyAdjustedLastLElement - currentBlock * block_step - (sizeint)(completedRowBlock * block_step) * rowSkip
 590                     : columnAdjustedLastLElement - currentBlock * block_step;
 591                 ptrBElement = completedRowBlock != 0
 592                     ? adjustedLastBElement - (sizeint)(completedRowBlock * block_step) * b_stride
 593                     : lastBElement;
 594
 595                 finalRowBlock = dMACRO_MIN(currentBlock, completedBlocks);
 596                 dIASSERT(finalRowBlock != completedRowBlock || finalRowBlock == 0);
 597
 598                 unsigned rowCounter = finalRowBlock - completedRowBlock;
 599                 bool exitLoop = rowCounter == 0;
 600
 601                 if (exitLoop)
 602                 {
 603                     columnEndReached = true;
 604                 }
 605                 else if (completedRowBlock == 0 && currentBlock != 0 && loopX1RowCount != 0)
 606                 {
 607                     if ((loopX1RowCount & 1) != 0)
 608                     {
 609                         dReal q1, p4, p3, p2, p1;
 610
 611                         /* load p and q values */
 612                         q1 = ptrBElement[0 * (int)b_stride];
 613                         p4 = ptrLElement[-3];
 614                         p3 = ptrLElement[-2];
 615                         p2 = ptrLElement[-1];
 616                         p1 = ptrLElement[0];
 617                         ptrLElement -= rowSkip;
 618
 619                         /* compute outer product and add it to the Z matrix */
 620                         Z[3] += p4 * q1;
 621                         Z[2] += p3 * q1;
 622                         Z[1] += p2 * q1;
 623                         Z[0] += p1 * q1;
 624
 625                         ptrBElement -= 1 * b_stride;
 626                     }
 627
 628                     if ((loopX1RowCount & 2) != 0)
 629                     {
 630                         dReal q1, p4, p3, p2, p1;
 631
 632                         /* load p and q values */
 633                         q1 = ptrBElement[0 * (int)b_stride];
 634                         p4 = ptrLElement[-3];
 635                         p3 = ptrLElement[-2];
 636                         p2 = ptrLElement[-1];
 637                         p1 = ptrLElement[0];
 638                         ptrLElement -= rowSkip;
 639
 640                         /* compute outer product and add it to the Z matrix */
 641                         Z[3] += p4 * q1;
 642                         Z[2] += p3 * q1;
 643                         Z[1] += p2 * q1;
 644                         Z[0] += p1 * q1;
 645
 646                         /* load p and q values */
 647                         q1 = ptrBElement[-1 * (int)b_stride];
 648                         p4 = ptrLElement[-3];
 649                         p3 = ptrLElement[-2];
 650                         p2 = ptrLElement[-1];
 651                         p1 = ptrLElement[0];
 652                         ptrLElement -= rowSkip;
 653
 654                         /* compute outer product and add it to the Z matrix */
 655                         Z[3] += p4 * q1;
 656                         Z[2] += p3 * q1;
 657                         Z[1] += p2 * q1;
 658                         Z[0] += p1 * q1;
 659
 660                         ptrBElement -= 2 * b_stride;
 661                     }
 662                     dSASSERT(block_step == 4);
 663
 664                     if (--rowCounter == 0)
 665                     {
 666                         do
 667                         {
 668                             if (finalRowBlock == currentBlock)
 669                             {
 670                                 columnEndReached = true;
 671                                 exitLoop = true;
 672                                 break;
 673                             }
 674
 675                             // Take a look if any more columns have been completed...
 676                             completedBlocks = refBlockCompletionProgress;
 677                             dIASSERT(completedBlocks >= finalRowBlock);
 678
 679                             if (completedBlocks == finalRowBlock)
 680                             {
 681                                 exitLoop = true;
 682                                 break;
 683                             }
 684
 685                             // ...continue if so.
 686                             unsigned rowCompletedSoFar = finalRowBlock;
 687                             finalRowBlock = dMACRO_MIN(currentBlock, completedBlocks);
 688                             rowCounter = finalRowBlock - rowCompletedSoFar;
 689                         }
 690                         while (false);
 691                     }
 692                 }
 693
 694                 for (; !exitLoop; exitLoop = false)
 695                 {
 696                     dReal q1, p4, p3, p2, p1;
 697
 698                     /* load p and q values */
 699                     q1 = ptrBElement[0 * (int)b_stride];
 700                     p4 = ptrLElement[-3];
 701                     p3 = ptrLElement[-2];
 702                     p2 = ptrLElement[-1];
 703                     p1 = ptrLElement[0];
 704                     ptrLElement -= rowSkip;
 705
 706                     /* compute outer product and add it to the Z matrix */
 707                     Z[3] += p4 * q1;
 708                     Z[2] += p3 * q1;
 709                     Z[1] += p2 * q1;
 710                     Z[0] += p1 * q1;
 711
 712                     /* load p and q values */
 713                     q1 = ptrBElement[-1 * (int)b_stride];
 714                     p4 = ptrLElement[-3];
 715                     p3 = ptrLElement[-2];
 716                     p2 = ptrLElement[-1];
 717                     p1 = ptrLElement[0];
 718                     ptrLElement -= rowSkip;
 719
 720                     /* compute outer product and add it to the Z matrix */
 721                     Z[3] += p4 * q1;
 722                     Z[2] += p3 * q1;
 723                     Z[1] += p2 * q1;
 724                     Z[0] += p1 * q1;
 725
 726                     /* load p and q values */
 727                     q1 = ptrBElement[-2 * (int)b_stride];
 728                     p4 = ptrLElement[-3];
 729                     p3 = ptrLElement[-2];
 730                     p2 = ptrLElement[-1];
 731                     p1 = ptrLElement[0];
 732                     ptrLElement -= rowSkip;
 733
 734                     /* compute outer product and add it to the Z matrix */
 735                     Z[3] += p4 * q1;
 736                     Z[2] += p3 * q1;
 737                     Z[1] += p2 * q1;
 738                     Z[0] += p1 * q1;
 739
 740                     /* load p and q values */
 741                     q1 = ptrBElement[-3 * (int)b_stride];
 742                     p4 = ptrLElement[-3];
 743                     p3 = ptrLElement[-2];
 744                     p2 = ptrLElement[-1];
 745                     p1 = ptrLElement[0];
 746                     ptrLElement -= rowSkip;
 747
 748                     /* compute outer product and add it to the Z matrix */
 749                     Z[3] += p4 * q1;
 750                     Z[2] += p3 * q1;
 751                     Z[1] += p2 * q1;
 752                     Z[0] += p1 * q1;
 753                     dSASSERT(block_step == 4);
 754
 755                     if (rowCounter > 3)
 756                     {
 757                         rowCounter -= 3;
 758
 759                         ptrBElement -= 3 * block_step * b_stride;
 760
 761                         /* load p and q values */
 762                         q1 = ptrBElement[8 * b_stride];
 763                         p4 = ptrLElement[-3];
 764                         p3 = ptrLElement[-2];
 765                         p2 = ptrLElement[-1];
 766                         p1 = ptrLElement[0];
 767                         ptrLElement -= rowSkip;
 768
 769                         /* compute outer product and add it to the Z matrix */
 770                         Z[3] += p4 * q1;
 771                         Z[2] += p3 * q1;
 772                         Z[1] += p2 * q1;
 773                         Z[0] += p1 * q1;
 774
 775                         /* load p and q values */
 776                         q1 = ptrBElement[7 * b_stride];
 777                         p4 = ptrLElement[-3];
 778                         p3 = ptrLElement[-2];
 779                         p2 = ptrLElement[-1];
 780                         p1 = ptrLElement[0];
 781                         ptrLElement -= rowSkip;
 782
 783                         /* compute outer product and add it to the Z matrix */
 784                         Z[3] += p4 * q1;
 785                         Z[2] += p3 * q1;
 786                         Z[1] += p2 * q1;
 787                         Z[0] += p1 * q1;
 788
 789                         /* load p and q values */
 790                         q1 = ptrBElement[6 * b_stride];
 791                         p4 = ptrLElement[-3];
 792                         p3 = ptrLElement[-2];
 793                         p2 = ptrLElement[-1];
 794                         p1 = ptrLElement[0];
 795                         ptrLElement -= rowSkip;
 796
 797                         /* compute outer product and add it to the Z matrix */
 798                         Z[3] += p4 * q1;
 799                         Z[2] += p3 * q1;
 800                         Z[1] += p2 * q1;
 801                         Z[0] += p1 * q1;
 802
 803                         /* load p and q values */
 804                         q1 = ptrBElement[5 * b_stride];
 805                         p4 = ptrLElement[-3];
 806                         p3 = ptrLElement[-2];
 807                         p2 = ptrLElement[-1];
 808                         p1 = ptrLElement[0];
 809                         ptrLElement -= rowSkip;
 810
 811                         /* compute outer product and add it to the Z matrix */
 812                         Z[3] += p4 * q1;
 813                         Z[2] += p3 * q1;
 814                         Z[1] += p2 * q1;
 815                         Z[0] += p1 * q1;
 816
 817                         /* load p and q values */
 818                         q1 = ptrBElement[4 * b_stride];
 819                         p4 = ptrLElement[-3];
 820                         p3 = ptrLElement[-2];
 821                         p2 = ptrLElement[-1];
 822                         p1 = ptrLElement[0];
 823                         ptrLElement -= rowSkip;
 824
 825                         /* compute outer product and add it to the Z matrix */
 826                         Z[3] += p4 * q1;
 827                         Z[2] += p3 * q1;
 828                         Z[1] += p2 * q1;
 829                         Z[0] += p1 * q1;
 830
 831                         /* load p and q values */
 832                         q1 = ptrBElement[3 * b_stride];
 833                         p4 = ptrLElement[-3];
 834                         p3 = ptrLElement[-2];
 835                         p2 = ptrLElement[-1];
 836                         p1 = ptrLElement[0];
 837                         ptrLElement -= rowSkip;
 838
 839                         /* compute outer product and add it to the Z matrix */
 840                         Z[3] += p4 * q1;
 841                         Z[2] += p3 * q1;
 842                         Z[1] += p2 * q1;
 843                         Z[0] += p1 * q1;
 844
 845                         /* load p and q values */
 846                         q1 = ptrBElement[2 * b_stride];
 847                         p4 = ptrLElement[-3];
 848                         p3 = ptrLElement[-2];
 849                         p2 = ptrLElement[-1];
 850                         p1 = ptrLElement[0];
 851                         ptrLElement -= rowSkip;
 852
 853                         /* compute outer product and add it to the Z matrix */
 854                         Z[3] += p4 * q1;
 855                         Z[2] += p3 * q1;
 856                         Z[1] += p2 * q1;
 857                         Z[0] += p1 * q1;
 858
 859                         /* load p and q values */
 860                         q1 = ptrBElement[1 * b_stride];
 861                         p4 = ptrLElement[-3];
 862                         p3 = ptrLElement[-2];
 863                         p2 = ptrLElement[-1];
 864                         p1 = ptrLElement[0];
 865                         ptrLElement -= rowSkip;
 866
 867                         /* compute outer product and add it to the Z matrix */
 868                         Z[3] += p4 * q1;
 869                         Z[2] += p3 * q1;
 870                         Z[1] += p2 * q1;
 871                         Z[0] += p1 * q1;
 872                         dSASSERT(block_step == 4);
 873                     }
 874                     else
 875                     {
 876                         ptrBElement -= block_step * b_stride;
 877
 878                         if (--rowCounter == 0)
 879                         {
 880                             if (finalRowBlock == currentBlock)
 881                             {
 882                                 columnEndReached = true;
 883                                 break;
 884                             }
 885
 886                             // Take a look if any more columns have been completed...
 887                             completedBlocks = refBlockCompletionProgress;
 888                             dIASSERT(completedBlocks >= finalRowBlock);
 889
 890                             if (completedBlocks == finalRowBlock)
 891                             {
 892                                 break;
 893                             }
 894
 895                             // ...continue if so.
 896                             unsigned rowCompletedSoFar = finalRowBlock;
 897                             finalRowBlock = dMACRO_MIN(currentBlock, completedBlocks);
 898                             rowCounter = finalRowBlock - rowCompletedSoFar;
 899                         }
 900                     }
 901                     /* end of inner loop */
 902                 }
 903             }
 904             else /* compute rightmost bottom X(i) block */
 905             {
 906                 partialBlock = true;
 907
 908                 ptrLElement = lastLElement;
 909                 ptrBElement = lastBElement;
 910                 dIASSERT(completedRowBlock == 0);
 911
 912                 columnEndReached = true;
 913             }
 914
 915             if (columnEndReached)
 916             {
 917                 // Check whether there is still a need to proceed or if the computation has been taken over by another thread
 918                 cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedRowBlock, previousContextInstance, true);
 919
 920                 if (blockProgressDescriptors[currentBlock] == oldDescriptor)
 921                 {
 922                     if (partialBlock)
 923                     {
 924                         Y[0] = ptrBElement[0 * b_stride]/* - Z[0]*/;
 925
 926                         if (loopX1RowCount >= 2)
 927                         {
 928                             dReal p2 = ptrLElement[-1];
 929                             Y[1] = ptrBElement[-1 * (int)b_stride]/* - Z[1] */- p2 * Y[0];
 930
 931                             if (loopX1RowCount > 2)
 932                             {
 933                                 dReal p3 = ptrLElement[-2];
 934                                 dReal p3_1 = (ptrLElement - rowSkip)[-2];
 935                                 Y[2] = ptrBElement[-2 * (int)b_stride]/* - Z[2] */- p3 * Y[0] - p3_1 * Y[1];
 936                             }
 937                         }
 938
 939                         dSASSERT(block_step == 4);
 940                     }
 941                     else
 942                     {
 943                         Y[0] = ptrBElement[0 * b_stride] - Z[0];
 944
 945                         dReal p2 = ptrLElement[-1];
 946                         Y[1] = ptrBElement[-1 * (int)b_stride] - Z[1] - p2 * Y[0];
 947
 948                         dReal p3 = ptrLElement[-2];
 949                         dReal p3_1 = (ptrLElement - rowSkip)[-2];
 950                         Y[2] = ptrBElement[-2 * (int)b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];
 951
 952                         dReal p4 = ptrLElement[-3];
 953                         dReal p4_1 = (ptrLElement - rowSkip)[-3];
 954                         dReal p4_2 = (ptrLElement - rowSkip * 2)[-3];
 955                         Y[3] = ptrBElement[-3 * (int)b_stride] - Z[3] - p4 * Y[0] - p4_1 * Y[1] - p4_2 * Y[2];
 956
 957                         dSASSERT(block_step == 4);
 958                     }
 959
 960                     // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
 961                     CooperativeAtomics::AtomicReadReorderBarrier();
 962
 963                     // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
 964                     // and the computation result is valid.
 965                     if (blockProgressDescriptors[currentBlock] == oldDescriptor)
 966                     {
 967                         // Assign the results to the result context (possibly in parallel with other threads
 968                         // that could and ought to be assigning exactly the same values)
 969                         SolveL1TransposedCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
 970                         resultContext.storePrecalculatedZs(Y);
 971
 972                         // Assign the result assignment progress descriptor
 973                         cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true);
 974                         CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor); // the result is to be ignored
 975
 976                         // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
 977                         goAssigningTheResult = true;
 978                     }
 979                     else
 980                     {
 981                         // Otherwise, go on competing for copying the results
 982                         handleComputationTakenOver = true;
 983                     }
 984                 }
 985                 else
 986                 {
 987                     handleComputationTakenOver = true;
 988                 }
 989             }
 990             else
 991             {
 992                 // If the final column has not been reached yet, store current values to the context.
 993                 // Select the other context instance as the previous one might be read by other threads.
 994                 CellContextInstance nextContextInstance = buildNextContextInstance(previousContextInstance);
 995                 SolveL1TransposedCellContext &destinationContext = buildBlockContextRef(cellContexts, currentBlock, nextContextInstance);
 996                 destinationContext.storePrecalculatedZs(Z);
 997
 998                 // Unlock the row until more columns can be used
 999                 cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedRowBlock, previousContextInstance, true);
1000                 cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(finalRowBlock, nextContextInstance, false);
1001                 // The descriptor might have been updated by a competing thread
1002                 if (!CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor))
1003                 {
1004                     // Adjust the ptrBElement to point to the result area...
1005                     ptrBElement = adjustedLastBElement - (sizeint)(currentBlock * block_step) * b_stride;
1006                     dIASSERT(currentBlock != 0 || adjustedLastBElement == lastBElement);
1007                     // ...and go on handling the case
1008                     handleComputationTakenOver = true;
1009                 }
1010             }
1011
1012             if (handleComputationTakenOver)
1013             {
1014                 cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
1015                 // This can only happen if the row was (has become) the uppermost not fully completed one
1016                 // and the competing thread is at final stage of calculation (i.e., it has reached the currentBlock column).
1017                 if (existingDescriptor != INVALID_CELLDESCRIPTOR)
1018                 {
1019                     // If not fully completed this must be the final stage of the result assignment into the matrix
1020                     dIASSERT(existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true));
1021
1022                     // Go on competing copying the result as anyway the block is the topmost not completed one
1023                     // and since there was competition for it, there is no other work that can be done right now.
1024                     const SolveL1TransposedCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
1025                     resultContext.loadPrecalculatedZs(Y);
1026
1027                     goAssigningTheResult = true;
1028                 }
1029                 else
1030                 {
1031                     // everything is over -- just go handling next blocks
1032                 }
1033             }
1034         }
1035         else if (goForLockedBlockDuplicateCalculation)
1036         {
1037             blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
1038
1039             bool skipToHandlingSubsequentRows = false, skiptoCopyingResult = false;
1040
1041             /* declare variables */
1042             const dReal *ptrLElement;
1043
1044             if (completedRowBlock < currentBlock)
1045             {
1046                 partialBlock = false;
1047
1048                 ptrLElement = completedRowBlock != 0
1049                     ? fullyAdjustedLastLElement - currentBlock * block_step - (sizeint)(completedRowBlock * block_step) * rowSkip
1050                     : columnAdjustedLastLElement - currentBlock * block_step;
1051                 ptrBElement = completedRowBlock != 0
1052                     ? adjustedLastBElement - (sizeint)(completedRowBlock * block_step) * b_stride
1053                     : lastBElement;
1054
1055                 unsigned finalRowBlock = currentBlock/*std::min(currentBlock, completedBlocks)*/;
1056                 dIASSERT(currentBlock == completedBlocks); // Why would we be competing for a row otherwise?
1057
1058                 bool exitInnerLoop = false;
1059                 unsigned lastCompletedRow = completedRowBlock;
1060                 unsigned rowCounter = finalRowBlock - completedRowBlock;
1061
1062                 if (completedRowBlock == 0/* && currentBlock != 0 */&& loopX1RowCount != 0)
1063                 {
1064                     if ((loopX1RowCount & 1) != 0)
1065                     {
1066                         dReal q1, p4, p3, p2, p1;
1067
1068                         /* load p and q values */
1069                         q1 = ptrBElement[0 * (int)b_stride];
1070                         p4 = ptrLElement[-3];
1071                         p3 = ptrLElement[-2];
1072                         p2 = ptrLElement[-1];
1073                         p1 = ptrLElement[0];
1074                         ptrLElement -= rowSkip;
1075
1076                         /* compute outer product and add it to the Z matrix */
1077                         Z[3] += p4 * q1;
1078                         Z[2] += p3 * q1;
1079                         Z[1] += p2 * q1;
1080                         Z[0] += p1 * q1;
1081
1082                         ptrBElement -= 1 * b_stride;
1083                     }
1084
1085                     if ((loopX1RowCount & 2) != 0)
1086                     {
1087                         dReal q1, p4, p3, p2, p1;
1088
1089                         /* load p and q values */
1090                         q1 = ptrBElement[0 * (int)b_stride];
1091                         p4 = ptrLElement[-3];
1092                         p3 = ptrLElement[-2];
1093                         p2 = ptrLElement[-1];
1094                         p1 = ptrLElement[0];
1095                         ptrLElement -= rowSkip;
1096
1097                         /* compute outer product and add it to the Z matrix */
1098                         Z[3] += p4 * q1;
1099                         Z[2] += p3 * q1;
1100                         Z[1] += p2 * q1;
1101                         Z[0] += p1 * q1;
1102
1103                         /* load p and q values */
1104                         q1 = ptrBElement[-1 * (int)b_stride];
1105                         p4 = ptrLElement[-3];
1106                         p3 = ptrLElement[-2];
1107                         p2 = ptrLElement[-1];
1108                         p1 = ptrLElement[0];
1109                         ptrLElement -= rowSkip;
1110
1111                         /* compute outer product and add it to the Z matrix */
1112                         Z[3] += p4 * q1;
1113                         Z[2] += p3 * q1;
1114                         Z[1] += p2 * q1;
1115                         Z[0] += p1 * q1;
1116
1117                         ptrBElement -= 2 * b_stride;
1118                     }
1119                     dSASSERT(block_step == 4);
1120
1121                     if (--rowCounter == 0)
1122                     {
1123                         exitInnerLoop = true;
1124                     }
1125                 }
1126
1127                 for (; !exitInnerLoop; exitInnerLoop = --rowCounter == 0)
1128                 {
1129                     dReal q1, p4, p3, p2, p1;
1130
1131                     /* load p and q values */
1132                     q1 = ptrBElement[0 * (int)b_stride];
1133                     p4 = ptrLElement[-3];
1134                     p3 = ptrLElement[-2];
1135                     p2 = ptrLElement[-1];
1136                     p1 = ptrLElement[0];
1137                     ptrLElement -= rowSkip;
1138
1139                     /* compute outer product and add it to the Z matrix */
1140                     Z[3] += p4 * q1;
1141                     Z[2] += p3 * q1;
1142                     Z[1] += p2 * q1;
1143                     Z[0] += p1 * q1;
1144
1145                     /* load p and q values */
1146                     q1 = ptrBElement[-1 * (int)b_stride];
1147                     p4 = ptrLElement[-3];
1148                     p3 = ptrLElement[-2];
1149                     p2 = ptrLElement[-1];
1150                     p1 = ptrLElement[0];
1151                     ptrLElement -= rowSkip;
1152
1153                     /* compute outer product and add it to the Z matrix */
1154                     Z[3] += p4 * q1;
1155                     Z[2] += p3 * q1;
1156                     Z[1] += p2 * q1;
1157                     Z[0] += p1 * q1;
1158
1159                     /* load p and q values */
1160                     q1 = ptrBElement[-2 * (int)b_stride];
1161                     p4 = ptrLElement[-3];
1162                     p3 = ptrLElement[-2];
1163                     p2 = ptrLElement[-1];
1164                     p1 = ptrLElement[0];
1165                     ptrLElement -= rowSkip;
1166
1167                     /* compute outer product and add it to the Z matrix */
1168                     Z[3] += p4 * q1;
1169                     Z[2] += p3 * q1;
1170                     Z[1] += p2 * q1;
1171                     Z[0] += p1 * q1;
1172
1173                     /* load p and q values */
1174                     q1 = ptrBElement[-3 * (int)b_stride];
1175                     p4 = ptrLElement[-3];
1176                     p3 = ptrLElement[-2];
1177                     p2 = ptrLElement[-1];
1178                     p1 = ptrLElement[0];
1179                     ptrLElement -= rowSkip;
1180
1181                     /* compute outer product and add it to the Z matrix */
1182                     Z[3] += p4 * q1;
1183                     Z[2] += p3 * q1;
1184                     Z[1] += p2 * q1;
1185                     Z[0] += p1 * q1;
1186                     dSASSERT(block_step == 4);
1187
1188                     // Check if the primary solver thread has not made any progress
1189                     cellindexint descriptorVerification = blockProgressDescriptors[currentBlock];
1190                     unsigned newCompletedRow = GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification);
1191
1192                     if (newCompletedRow != lastCompletedRow)
1193                     {
1194                         // Check, this is the first change the current thread detects.
1195                         // There is absolutely no reason in code for the computation to stop/resume twice
1196                         // while the current thread is competing.
1197                         dIASSERT(lastCompletedRow == completedRowBlock);
1198
1199                         if (descriptorVerification == INVALID_CELLDESCRIPTOR)
1200                         {
1201                             skipToHandlingSubsequentRows = true;
1202                             break;
1203                         }
1204
1205                         if (newCompletedRow == currentBlock + 1)
1206                         {
1207                             skiptoCopyingResult = true;
1208                             break;
1209                         }
1210
1211                         // Check if the current thread is behind
1212                         if (newCompletedRow > finalRowBlock - rowCounter)
1213                         {
1214                             // If so, go starting over one more time
1215                             blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
1216                             stayWithinTheBlock = true;
1217                             skipToHandlingSubsequentRows = true;
1218                             break;
1219                         }
1220
1221                         // If current thread is ahead, just save new completed column for further comparisons and go on calculating
1222                         lastCompletedRow = newCompletedRow;
1223                     }
1224
1225                     /* advance pointers */
1226                     ptrBElement -= block_step * b_stride;
1227                     /* end of inner loop */
1228                 }
1229             }
1230             else if (completedRowBlock > currentBlock)
1231             {
1232                 dIASSERT(completedRowBlock == currentBlock + 1);
1233
1234                 partialBlock = currentBlock == 0 && loopX1RowCount != 0;
1235
1236                 skiptoCopyingResult = true;
1237             }
1238             else
1239             {
1240                 dIASSERT(currentBlock == 0); // Execution can get here within the very first block only
1241
1242                 partialBlock = /*currentBlock == 0 && */loopX1RowCount != 0;
1243
1244                 /* just assign the pointers appropriately and go on computing the results */
1245                 ptrLElement = lastLElement;
1246                 ptrBElement = lastBElement;
1247             }
1248
1249             if (!skipToHandlingSubsequentRows)
1250             {
1251                 if (!skiptoCopyingResult)
1252                 {
1253                     if (partialBlock)
1254                     {
1255                         Y[0] = ptrBElement[0 * b_stride]/* - Z[0]*/;
1256
1257                         if (loopX1RowCount >= 2)
1258                         {
1259                             dReal p2 = ptrLElement[-1];
1260                             Y[1] = ptrBElement[-1 * (int)b_stride]/* - Z[1] */- p2 * Y[0];
1261
1262                             if (loopX1RowCount > 2)
1263                             {
1264                                 dReal p3 = ptrLElement[-2];
1265                                 dReal p3_1 = (ptrLElement - rowSkip)[-2];
1266                                 Y[2] = ptrBElement[-2 * (int)b_stride]/* - Z[2] */- p3 * Y[0] - p3_1 * Y[1];
1267                             }
1268                         }
1269
1270                         dSASSERT(block_step == 4);
1271                     }
1272                     else
1273                     {
1274                         Y[0] = ptrBElement[0 * b_stride] - Z[0];
1275
1276                         dReal p2 = ptrLElement[-1];
1277                         Y[1] = ptrBElement[-1 * (int)b_stride] - Z[1] - p2 * Y[0];
1278
1279                         dReal p3 = ptrLElement[-2];
1280                         dReal p3_1 = (ptrLElement - rowSkip)[-2];
1281                         Y[2] = ptrBElement[-2 * (int)b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];
1282
1283                         dReal p4 = ptrLElement[-3];
1284                         dReal p4_1 = (ptrLElement - rowSkip)[-3];
1285                         dReal p4_2 = (ptrLElement - rowSkip * 2)[-3];
1286                         Y[3] = ptrBElement[-3 * (int)b_stride] - Z[3] - p4 * Y[0] - p4_1 * Y[1] - p4_2 * Y[2];
1287
1288                         dSASSERT(block_step == 4);
1289                     }
1290
1291                     // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
1292                     CooperativeAtomics::AtomicReadReorderBarrier();
1293
1294                     cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
1295
1296                     if (existingDescriptor == INVALID_CELLDESCRIPTOR)
1297                     {
1298                         // Everything is over -- proceed to subsequent rows
1299                         skipToHandlingSubsequentRows = true;
1300                     }
1301                     else if (existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true))
1302                     {
1303                         // The values computed above may not be valid. Copy the values already in the result context.
1304                         skiptoCopyingResult = true;
1305                     }
1306                     else
1307                     {
1308                         // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
1309                         // and the computation result is valid.
1310                         cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true); // put the computation at the top so that the evaluation result from the expression above is reused
1311
1312                         // Assign the results to the result context (possibly in parallel with other threads
1313                         // that could and ought to be assigning exactly the same values)
1314                         SolveL1TransposedCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
1315                         resultContext.storePrecalculatedZs(Y);
1316
1317                         // Assign the result assignment progress descriptor
1318                         CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], existingDescriptor, newDescriptor); // the result is to be ignored
1319
1320                         // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
1321                     }
1322                 }
1323
1324                 if (!skipToHandlingSubsequentRows)
1325                 {
1326                     if (skiptoCopyingResult)
1327                     {
1328                         // Extract the result values stored in the result context
1329                         const SolveL1TransposedCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
1330                         resultContext.loadPrecalculatedZs(Y);
1331
1332                         ptrBElement = currentBlock != 0 ? adjustedLastBElement - (sizeint)(currentBlock * block_step) * b_stride : lastBElement;
1333                     }
1334
1335                     goAssigningTheResult = true;
1336                 }
1337             }
1338         }
1339
1340         if (goAssigningTheResult)
1341         {
1342             cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
1343             // Check if the assignment has not been completed yet
1344             if (existingDescriptor != INVALID_CELLDESCRIPTOR)
1345             {
1346                 // Assign the computation results to B vector
1347                 if (partialBlock)
1348                 {
1349                     // ptrBElement[0 * b_stride] = Y[0]; -- unchanged
1350
1351                     if (loopX1RowCount >= 2)
1352                     {
1353                         ptrBElement[-1 * (int)b_stride] = Y[1];
1354
1355                         if (loopX1RowCount > 2)
1356                         {
1357                             ptrBElement[-2 * (int)b_stride] = Y[2];
1358                         }
1359                     }
1360                     dSASSERT(block_step == 4);
1361                 }
1362                 else
1363                 {
1364                     ptrBElement[0 * b_stride] = Y[0];
1365                     ptrBElement[-1 * (int)b_stride] = Y[1];
1366                     ptrBElement[-2 * (int)b_stride] = Y[2];
1367                     ptrBElement[-3 * (int)b_stride] = Y[3];
1368                     dSASSERT(block_step == 4);
1369                 }
1370
1371                 ThrsafeIncrementIntUpToLimit(&refBlockCompletionProgress, currentBlock + 1);
1372                 dIASSERT(refBlockCompletionProgress >= currentBlock + 1);
1373
1374                 // And assign the completed status no matter what
1375                 CooperativeAtomics::AtomicStoreCellindexint(&blockProgressDescriptors[currentBlock], INVALID_CELLDESCRIPTOR);
1376             }
1377             else
1378             {
1379                 // everything is over -- just go handling next blocks
1380             }
1381         }
1382
1383         if (!stayWithinTheBlock)
1384         {
1385             completedBlocks = refBlockCompletionProgress;
1386
1387             if (completedBlocks == blockCount)
1388             {
1389                 break;
1390             }
1391
1392             currentBlock += 1;
1393
1394             bool lookaheadBoundaryReached = false;
1395
1396             if (currentBlock == blockCount || completedBlocks == 0)
1397             {
1398                 lookaheadBoundaryReached = true;
1399             }
1400             else if (currentBlock >= completedBlocks + lookaheadRange)
1401             {
1402                 lookaheadBoundaryReached = blockProcessingState > BPS_NO_BLOCKS_PROCESSED;
1403             }
1404             else if (currentBlock < completedBlocks)
1405             {
1406                 // Treat detected row advancement as a row processed
1407                 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
1408
1409                 currentBlock = completedBlocks;
1410             }
1411
1412             if (lookaheadBoundaryReached)
1413             {
1414                 dIASSERT(blockProcessingState != BPS_COMPETING_FOR_A_BLOCK); // Why did not we compete???
1415
1416                 // If no row has been processed in the previous pass, compete for the next row to avoid cycling uselessly
1417                 if (blockProcessingState <= BPS_NO_BLOCKS_PROCESSED)
1418                 {
1419                     // Abandon job if too few blocks remain
1420                     if (blockCount - completedBlocks <= ownThreadIndex)
1421                     {
1422                         break;
1423                     }
1424
1425                     blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
1426                 }
1427                 else
1428                 {
1429                     // If there was some progress, just continue to the next pass
1430                     blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
1431                 }
1432
1433                 currentBlock = completedBlocks;
1434             }
1435         }
1436     }
1437 }
1438
1439
1440 #endif