Cosmetic: Copyright years were updated
[ode.git] / ode / src / fastltsolve_impl.h
blob86ed950a5ecc84f28c4e629549dd3e478fe216df
3 /*************************************************************************
4 * *
5 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
6 * All rights reserved. Email: russ@q12.org Web: www.q12.org *
7 * *
8 * This library is free software; you can redistribute it and/or *
9 * modify it under the terms of EITHER: *
10 * (1) The GNU Lesser General Public License as published by the Free *
11 * Software Foundation; either version 2.1 of the License, or (at *
12 * your option) any later version. The text of the GNU Lesser *
13 * General Public License is included with this library in the *
14 * file LICENSE.TXT. *
15 * (2) The BSD-style license that is included with this library in *
16 * the file LICENSE-BSD.TXT. *
17 * *
18 * This library is distributed in the hope that it will be useful, *
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
21 * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
22 * *
23 *************************************************************************/
26 * Code style improvements and optimizations by Oleh Derevenko ????-2024
27 * L1Transposed cooperative solving code of ThreadedEquationSolverLDLT copyright (c) 2017-2024 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
31 #ifndef _ODE_FASTLTSOLVE_IMPL_H_
32 #define _ODE_FASTLTSOLVE_IMPL_H_
35 /* solve L^T * x=b, with b containing 1 right hand side.
36 * L is an n*n lower triangular matrix with ones on the diagonal.
37 * L is stored by rows and its leading dimension is rowSkip.
38 * b is an n*1 matrix that contains the right hand side.
39 * b is overwritten with x.
40 * this processes blocks of 4.
43 template<unsigned int b_stride>
44 void solveL1Transposed(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
46 dIASSERT(rowCount != 0);
48 /* special handling for L and B because we're solving L1 *transpose* */
49 const dReal *lastLElement = L + (sizeint)(rowCount - 1) * (rowSkip + 1);
50 dReal *lastBElement = B + (sizeint)(rowCount - 1) * b_stride;
52 /* compute rows at end that are not a multiple of block size */
53 const unsigned loopX1RowCount = rowCount % 4;
55 unsigned blockStartRow = loopX1RowCount;
56 bool subsequentPass = false;
58 /* compute rightmost bottom X(i) block */
59 if (loopX1RowCount != 0)
61 subsequentPass = true;
63 const dReal *ptrLElement = lastLElement;
64 dReal *ptrBElement = lastBElement;
66 dReal Y11 = ptrBElement[0 * b_stride]/* - Z11*/;
67 // ptrBElement[0 * b_stride] = Y11; -- unchanged
69 if (loopX1RowCount >= 2)
71 dReal p2 = ptrLElement[-1];
72 dReal Y21 = ptrBElement[-1 * (int)b_stride]/* - Z21 */- p2 * Y11;
73 ptrBElement[-1 * (int)b_stride] = Y21;
75 if (loopX1RowCount > 2)
77 dReal p3 = ptrLElement[-2];
78 dReal p3_1 = (ptrLElement - rowSkip)[-2];
79 dReal Y31 = ptrBElement[-2 * (int)b_stride]/* - Z31 */- p3 * Y11 - p3_1 * Y21;
80 ptrBElement[-2 * (int)b_stride] = Y31;
85 /* compute all 4 x 1 blocks of X */
86 for (; !subsequentPass || blockStartRow < rowCount; subsequentPass = true, blockStartRow += 4)
88 /* compute all 4 x 1 block of X, from rows i..i+4-1 */
90 /* declare variables - Z matrix, p and q vectors, etc */
91 const dReal *ptrLElement;
92 dReal *ptrBElement;
94 dReal Z41, Z31, Z21, Z11;
96 if (subsequentPass)
98 ptrLElement = lastLElement - blockStartRow;
99 ptrBElement = lastBElement;
101 /* set the Z matrix to 0 */
102 Z41 = 0; Z31 = 0; Z21 = 0; Z11 = 0;
104 unsigned rowCounter = blockStartRow;
106 if (rowCounter % 2 != 0)
108 dReal q1, p4, p3, p2, p1;
110 /* load p and q values */
111 q1 = ptrBElement[0 * (int)b_stride];
112 p4 = ptrLElement[-3];
113 p3 = ptrLElement[-2];
114 p2 = ptrLElement[-1];
115 p1 = ptrLElement[0];
116 ptrLElement -= rowSkip;
118 /* compute outer product and add it to the Z matrix */
119 Z41 += p4 * q1;
120 Z31 += p3 * q1;
121 Z21 += p2 * q1;
122 Z11 += p1 * q1;
124 ptrBElement -= 1 * b_stride;
125 rowCounter -= 1;
128 if (rowCounter % 4 != 0)
130 dReal q1, p4, p3, p2, p1;
132 /* load p and q values */
133 q1 = ptrBElement[0 * (int)b_stride];
134 p4 = ptrLElement[-3];
135 p3 = ptrLElement[-2];
136 p2 = ptrLElement[-1];
137 p1 = ptrLElement[0];
138 ptrLElement -= rowSkip;
140 /* compute outer product and add it to the Z matrix */
141 Z41 += p4 * q1;
142 Z31 += p3 * q1;
143 Z21 += p2 * q1;
144 Z11 += p1 * q1;
146 /* load p and q values */
147 q1 = ptrBElement[-1 * (int)b_stride];
148 p4 = ptrLElement[-3];
149 p3 = ptrLElement[-2];
150 p2 = ptrLElement[-1];
151 p1 = ptrLElement[0];
152 ptrLElement -= rowSkip;
154 /* compute outer product and add it to the Z matrix */
155 Z41 += p4 * q1;
156 Z31 += p3 * q1;
157 Z21 += p2 * q1;
158 Z11 += p1 * q1;
160 ptrBElement -= 2 * b_stride;
161 rowCounter -= 2;
164 /* the inner loop that computes outer products and adds them to Z */
165 for (bool exitLoop = rowCounter == 0; !exitLoop; exitLoop = false)
167 dReal q1, p4, p3, p2, p1;
169 /* load p and q values */
170 q1 = ptrBElement[0 * (int)b_stride];
171 p4 = ptrLElement[-3];
172 p3 = ptrLElement[-2];
173 p2 = ptrLElement[-1];
174 p1 = ptrLElement[0];
175 ptrLElement -= rowSkip;
177 /* compute outer product and add it to the Z matrix */
178 Z41 += p4 * q1;
179 Z31 += p3 * q1;
180 Z21 += p2 * q1;
181 Z11 += p1 * q1;
183 /* load p and q values */
184 q1 = ptrBElement[-1 * (int)b_stride];
185 p4 = ptrLElement[-3];
186 p3 = ptrLElement[-2];
187 p2 = ptrLElement[-1];
188 p1 = ptrLElement[0];
189 ptrLElement -= rowSkip;
191 /* compute outer product and add it to the Z matrix */
192 Z41 += p4 * q1;
193 Z31 += p3 * q1;
194 Z21 += p2 * q1;
195 Z11 += p1 * q1;
197 /* load p and q values */
198 q1 = ptrBElement[-2 * (int)b_stride];
199 p4 = ptrLElement[-3];
200 p3 = ptrLElement[-2];
201 p2 = ptrLElement[-1];
202 p1 = ptrLElement[0];
203 ptrLElement -= rowSkip;
205 /* compute outer product and add it to the Z matrix */
206 Z41 += p4 * q1;
207 Z31 += p3 * q1;
208 Z21 += p2 * q1;
209 Z11 += p1 * q1;
211 /* load p and q values */
212 q1 = ptrBElement[-3 * (int)b_stride];
213 p4 = ptrLElement[-3];
214 p3 = ptrLElement[-2];
215 p2 = ptrLElement[-1];
216 p1 = ptrLElement[0];
217 ptrLElement -= rowSkip;
219 /* compute outer product and add it to the Z matrix */
220 Z41 += p4 * q1;
221 Z31 += p3 * q1;
222 Z21 += p2 * q1;
223 Z11 += p1 * q1;
225 if (rowCounter > 12)
227 rowCounter -= 12;
229 ptrBElement -= 12 * b_stride;
231 /* load p and q values */
232 q1 = ptrBElement[8 * b_stride];
233 p4 = ptrLElement[-3];
234 p3 = ptrLElement[-2];
235 p2 = ptrLElement[-1];
236 p1 = ptrLElement[0];
237 ptrLElement -= rowSkip;
239 /* compute outer product and add it to the Z matrix */
240 Z41 += p4 * q1;
241 Z31 += p3 * q1;
242 Z21 += p2 * q1;
243 Z11 += p1 * q1;
245 /* load p and q values */
246 q1 = ptrBElement[7 * b_stride];
247 p4 = ptrLElement[-3];
248 p3 = ptrLElement[-2];
249 p2 = ptrLElement[-1];
250 p1 = ptrLElement[0];
251 ptrLElement -= rowSkip;
253 /* compute outer product and add it to the Z matrix */
254 Z41 += p4 * q1;
255 Z31 += p3 * q1;
256 Z21 += p2 * q1;
257 Z11 += p1 * q1;
259 /* load p and q values */
260 q1 = ptrBElement[6 * b_stride];
261 p4 = ptrLElement[-3];
262 p3 = ptrLElement[-2];
263 p2 = ptrLElement[-1];
264 p1 = ptrLElement[0];
265 ptrLElement -= rowSkip;
267 /* compute outer product and add it to the Z matrix */
268 Z41 += p4 * q1;
269 Z31 += p3 * q1;
270 Z21 += p2 * q1;
271 Z11 += p1 * q1;
273 /* load p and q values */
274 q1 = ptrBElement[5 * b_stride];
275 p4 = ptrLElement[-3];
276 p3 = ptrLElement[-2];
277 p2 = ptrLElement[-1];
278 p1 = ptrLElement[0];
279 ptrLElement -= rowSkip;
281 /* compute outer product and add it to the Z matrix */
282 Z41 += p4 * q1;
283 Z31 += p3 * q1;
284 Z21 += p2 * q1;
285 Z11 += p1 * q1;
287 /* load p and q values */
288 q1 = ptrBElement[4 * b_stride];
289 p4 = ptrLElement[-3];
290 p3 = ptrLElement[-2];
291 p2 = ptrLElement[-1];
292 p1 = ptrLElement[0];
293 ptrLElement -= rowSkip;
295 /* compute outer product and add it to the Z matrix */
296 Z41 += p4 * q1;
297 Z31 += p3 * q1;
298 Z21 += p2 * q1;
299 Z11 += p1 * q1;
301 /* load p and q values */
302 q1 = ptrBElement[3 * b_stride];
303 p4 = ptrLElement[-3];
304 p3 = ptrLElement[-2];
305 p2 = ptrLElement[-1];
306 p1 = ptrLElement[0];
307 ptrLElement -= rowSkip;
309 /* compute outer product and add it to the Z matrix */
310 Z41 += p4 * q1;
311 Z31 += p3 * q1;
312 Z21 += p2 * q1;
313 Z11 += p1 * q1;
315 /* load p and q values */
316 q1 = ptrBElement[2 * b_stride];
317 p4 = ptrLElement[-3];
318 p3 = ptrLElement[-2];
319 p2 = ptrLElement[-1];
320 p1 = ptrLElement[0];
321 ptrLElement -= rowSkip;
323 /* compute outer product and add it to the Z matrix */
324 Z41 += p4 * q1;
325 Z31 += p3 * q1;
326 Z21 += p2 * q1;
327 Z11 += p1 * q1;
329 /* load p and q values */
330 q1 = ptrBElement[1 * b_stride];
331 p4 = ptrLElement[-3];
332 p3 = ptrLElement[-2];
333 p2 = ptrLElement[-1];
334 p1 = ptrLElement[0];
335 ptrLElement -= rowSkip;
337 /* compute outer product and add it to the Z matrix */
338 Z41 += p4 * q1;
339 Z31 += p3 * q1;
340 Z21 += p2 * q1;
341 Z11 += p1 * q1;
343 else
345 ptrBElement -= 4 * b_stride;
347 if ((rowCounter -= 4) == 0)
349 break;
352 /* end of inner loop */
355 else
357 ptrLElement = lastLElement/* - blockStartRow*/; dIASSERT(blockStartRow == 0);
358 ptrBElement = lastBElement;
360 /* set the Z matrix to 0 */
361 Z41 = 0; Z31 = 0; Z21 = 0; Z11 = 0;
364 /* finish computing the X(i) block */
365 dReal Y11, Y21, Y31, Y41;
367 Y11 = ptrBElement[0 * b_stride] - Z11;
368 ptrBElement[0 * b_stride] = Y11;
371 dReal p2 = ptrLElement[-1];
372 Y21 = ptrBElement[-1 * (int)b_stride] - Z21 - p2 * Y11;
373 ptrBElement[-1 * (int)b_stride] = Y21;
376 dReal p3 = ptrLElement[-2];
377 dReal p3_1 = (ptrLElement - rowSkip)[-2];
378 Y31 = ptrBElement[-2 * (int)b_stride] - Z31 - p3 * Y11 - p3_1 * Y21;
379 ptrBElement[-2 * (int)b_stride] = Y31;
382 dReal p4 = ptrLElement[-3];
383 dReal p4_1 = (ptrLElement - rowSkip)[-3];
384 dReal p4_2 = (ptrLElement - rowSkip * 2)[-3];
385 Y41 = ptrBElement[-3 * (int)b_stride] - Z41 - p4 * Y11 - p4_1 * Y21 - p4_2 * Y31;
386 ptrBElement[-3 * (int)b_stride] = Y41;
388 /* end of outer loop */
394 template<unsigned int block_step>
395 /*static */
396 sizeint ThreadedEquationSolverLDLT::estimateCooperativelySolvingL1TransposedMemoryRequirement(unsigned rowCount, SolvingL1TransposedMemoryEstimates &ref_solvingMemoryEstimates)
398 unsigned blockCount = deriveSolvingL1TransposedBlockCount(rowCount, block_step);
399 sizeint descriptorSizeRequired = dEFFICIENT_SIZE(sizeof(cellindexint) * blockCount);
400 sizeint contextSizeRequired = dEFFICIENT_SIZE(sizeof(SolveL1TransposedCellContext) * (CCI__MAX + 1) * blockCount);
401 ref_solvingMemoryEstimates.assignData(descriptorSizeRequired, contextSizeRequired);
403 sizeint totalSizeRequired = descriptorSizeRequired + contextSizeRequired;
404 return totalSizeRequired;
407 template<unsigned int block_step>
408 /*static */
409 void ThreadedEquationSolverLDLT::initializeCooperativelySolveL1TransposedMemoryStructures(unsigned rowCount,
410 atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1TransposedCellContext *dUNUSED(cellContexts))
412 unsigned blockCount = deriveSolvingL1TransposedBlockCount(rowCount, block_step);
414 out_blockCompletionProgress = 0;
415 memset(blockProgressDescriptors, 0, blockCount * sizeof(*blockProgressDescriptors));
418 template<unsigned int block_step, unsigned int b_stride>
419 /*static */
420 void ThreadedEquationSolverLDLT::participateSolvingL1Transposed(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip,
421 volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
422 SolveL1TransposedCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex)
424 const unsigned lookaheadRange = 32;
425 const unsigned blockCount = deriveSolvingL1TransposedBlockCount(rowCount, block_step);
426 /* compute rows at end that are not a multiple of block size */
427 const unsigned loopX1RowCount = rowCount % block_step;
429 /* special handling for L and B because we're solving L1 *transpose* */
430 const dReal *lastLElement = L + (rowCount - 1) * ((sizeint)rowSkip + 1);
431 dReal *lastBElement = B + (rowCount - 1) * (sizeint)b_stride;
433 /* elements adjusted as if the last block was full block_step elements */
434 unsigned x1AdjustmentElements = (block_step - loopX1RowCount) % block_step;
435 const dReal *columnAdjustedLastLElement = lastLElement + x1AdjustmentElements;
436 const dReal *fullyAdjustedLastLElement = columnAdjustedLastLElement + (sizeint)rowSkip * x1AdjustmentElements;
437 dReal *adjustedLastBElement = lastBElement + b_stride * x1AdjustmentElements;
439 BlockProcessingState blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
441 unsigned completedBlocks = refBlockCompletionProgress;
442 unsigned currentBlock = completedBlocks;
443 dIASSERT(completedBlocks <= blockCount);
445 for (bool exitLoop = completedBlocks == blockCount; !exitLoop; exitLoop = false)
447 bool goForLockedBlockPrimaryCalculation = false, goForLockedBlockDuplicateCalculation = false;
448 bool goAssigningTheResult = false, stayWithinTheBlock = false;
450 dReal Z[block_step];
451 dReal Y[block_step];
453 dReal *ptrBElement;
455 CellContextInstance previousContextInstance;
456 unsigned completedRowBlock;
457 bool partialBlock;
459 for (cellindexint testDescriptor = blockProgressDescriptors[currentBlock]; ; )
461 if (testDescriptor == INVALID_CELLDESCRIPTOR)
463 // Invalid descriptor is the indication that the row has been fully calculated
464 // Test if this was the last row and break out if so.
465 if (currentBlock + 1 == blockCount)
467 exitLoop = true;
468 break;
471 // Treat detected row advancement as a row processed
472 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
473 break;
476 CooperativeAtomics::AtomicReadReorderBarrier();
477 // It is necessary to read up to date completedBblocks value after the descriptor retrieval
478 // as otherwise the logic below breaks
479 completedBlocks = refBlockCompletionProgress;
481 if (!GET_CELLDESCRIPTOR_ISLOCKED(testDescriptor))
483 completedRowBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
484 dIASSERT(completedRowBlock < currentBlock || (completedRowBlock == currentBlock && currentBlock == 0)); // Otherwise, why would the calculation have had stopped if the final column is reachable???
485 dIASSERT(completedRowBlock <= completedBlocks); // Since the descriptor is not locked
487 if (completedRowBlock == completedBlocks && currentBlock != completedBlocks)
489 dIASSERT(completedBlocks < currentBlock);
490 break;
493 if (CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], testDescriptor, MARK_CELLDESCRIPTOR_LOCKED(testDescriptor)))
495 if (completedRowBlock != 0)
497 CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
498 previousContextInstance = contextInstance;
500 const SolveL1TransposedCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
501 sourceContext.loadPrecalculatedZs(Z);
503 else
505 previousContextInstance = CCI__MIN;
506 SolveL1TransposedCellContext::initializePrecalculatedZs(Z);
509 goForLockedBlockPrimaryCalculation = true;
510 break;
513 if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
515 break;
518 testDescriptor = blockProgressDescriptors[currentBlock];
520 else
522 if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
524 break;
527 cellindexint verificativeDescriptor;
528 bool verificationFailure = false;
530 completedRowBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
531 dIASSERT(completedRowBlock != currentBlock || currentBlock == 0); // There is no reason for computations to stop at the very end other than being the initial value at the very first block
533 if (completedRowBlock != 0)
535 CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
536 const SolveL1TransposedCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
537 sourceContext.loadPrecalculatedZs(Z);
539 else
541 SolveL1TransposedCellContext::initializePrecalculatedZs(Z);
544 if (completedRowBlock != 0 && completedRowBlock <= currentBlock)
546 // Make sure the descriptor is re-read after the precalculates
547 CooperativeAtomics::AtomicReadReorderBarrier();
550 if (completedRowBlock <= currentBlock)
552 verificativeDescriptor = blockProgressDescriptors[currentBlock];
553 verificationFailure = verificativeDescriptor != testDescriptor;
556 if (!verificationFailure)
558 dIASSERT(completedRowBlock <= currentBlock + 1);
560 goForLockedBlockDuplicateCalculation = true;
561 break;
564 testDescriptor = verificativeDescriptor;
568 if (exitLoop)
570 break;
573 if (goForLockedBlockPrimaryCalculation)
575 blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
577 // Declare and assign the variables at the top to not interfere with any branching -- the compiler is going to eliminate them anyway.
578 bool handleComputationTakenOver = false, columnEndReached = false;
580 const dReal *ptrLElement;
581 unsigned finalRowBlock;
583 /* check if this is not the partial block of fewer rows */
584 if (currentBlock != 0 || loopX1RowCount == 0)
586 partialBlock = false;
588 ptrLElement = completedRowBlock != 0
589 ? fullyAdjustedLastLElement - currentBlock * block_step - (sizeint)(completedRowBlock * block_step) * rowSkip
590 : columnAdjustedLastLElement - currentBlock * block_step;
591 ptrBElement = completedRowBlock != 0
592 ? adjustedLastBElement - (sizeint)(completedRowBlock * block_step) * b_stride
593 : lastBElement;
595 finalRowBlock = dMACRO_MIN(currentBlock, completedBlocks);
596 dIASSERT(finalRowBlock != completedRowBlock || finalRowBlock == 0);
598 unsigned rowCounter = finalRowBlock - completedRowBlock;
599 bool exitLoop = rowCounter == 0;
601 if (exitLoop)
603 columnEndReached = true;
605 else if (completedRowBlock == 0 && currentBlock != 0 && loopX1RowCount != 0)
607 if ((loopX1RowCount & 1) != 0)
609 dReal q1, p4, p3, p2, p1;
611 /* load p and q values */
612 q1 = ptrBElement[0 * (int)b_stride];
613 p4 = ptrLElement[-3];
614 p3 = ptrLElement[-2];
615 p2 = ptrLElement[-1];
616 p1 = ptrLElement[0];
617 ptrLElement -= rowSkip;
619 /* compute outer product and add it to the Z matrix */
620 Z[3] += p4 * q1;
621 Z[2] += p3 * q1;
622 Z[1] += p2 * q1;
623 Z[0] += p1 * q1;
625 ptrBElement -= 1 * b_stride;
628 if ((loopX1RowCount & 2) != 0)
630 dReal q1, p4, p3, p2, p1;
632 /* load p and q values */
633 q1 = ptrBElement[0 * (int)b_stride];
634 p4 = ptrLElement[-3];
635 p3 = ptrLElement[-2];
636 p2 = ptrLElement[-1];
637 p1 = ptrLElement[0];
638 ptrLElement -= rowSkip;
640 /* compute outer product and add it to the Z matrix */
641 Z[3] += p4 * q1;
642 Z[2] += p3 * q1;
643 Z[1] += p2 * q1;
644 Z[0] += p1 * q1;
646 /* load p and q values */
647 q1 = ptrBElement[-1 * (int)b_stride];
648 p4 = ptrLElement[-3];
649 p3 = ptrLElement[-2];
650 p2 = ptrLElement[-1];
651 p1 = ptrLElement[0];
652 ptrLElement -= rowSkip;
654 /* compute outer product and add it to the Z matrix */
655 Z[3] += p4 * q1;
656 Z[2] += p3 * q1;
657 Z[1] += p2 * q1;
658 Z[0] += p1 * q1;
660 ptrBElement -= 2 * b_stride;
662 dSASSERT(block_step == 4);
664 if (--rowCounter == 0)
668 if (finalRowBlock == currentBlock)
670 columnEndReached = true;
671 exitLoop = true;
672 break;
675 // Take a look if any more columns have been completed...
676 completedBlocks = refBlockCompletionProgress;
677 dIASSERT(completedBlocks >= finalRowBlock);
679 if (completedBlocks == finalRowBlock)
681 exitLoop = true;
682 break;
685 // ...continue if so.
686 unsigned rowCompletedSoFar = finalRowBlock;
687 finalRowBlock = dMACRO_MIN(currentBlock, completedBlocks);
688 rowCounter = finalRowBlock - rowCompletedSoFar;
690 while (false);
694 for (; !exitLoop; exitLoop = false)
696 dReal q1, p4, p3, p2, p1;
698 /* load p and q values */
699 q1 = ptrBElement[0 * (int)b_stride];
700 p4 = ptrLElement[-3];
701 p3 = ptrLElement[-2];
702 p2 = ptrLElement[-1];
703 p1 = ptrLElement[0];
704 ptrLElement -= rowSkip;
706 /* compute outer product and add it to the Z matrix */
707 Z[3] += p4 * q1;
708 Z[2] += p3 * q1;
709 Z[1] += p2 * q1;
710 Z[0] += p1 * q1;
712 /* load p and q values */
713 q1 = ptrBElement[-1 * (int)b_stride];
714 p4 = ptrLElement[-3];
715 p3 = ptrLElement[-2];
716 p2 = ptrLElement[-1];
717 p1 = ptrLElement[0];
718 ptrLElement -= rowSkip;
720 /* compute outer product and add it to the Z matrix */
721 Z[3] += p4 * q1;
722 Z[2] += p3 * q1;
723 Z[1] += p2 * q1;
724 Z[0] += p1 * q1;
726 /* load p and q values */
727 q1 = ptrBElement[-2 * (int)b_stride];
728 p4 = ptrLElement[-3];
729 p3 = ptrLElement[-2];
730 p2 = ptrLElement[-1];
731 p1 = ptrLElement[0];
732 ptrLElement -= rowSkip;
734 /* compute outer product and add it to the Z matrix */
735 Z[3] += p4 * q1;
736 Z[2] += p3 * q1;
737 Z[1] += p2 * q1;
738 Z[0] += p1 * q1;
740 /* load p and q values */
741 q1 = ptrBElement[-3 * (int)b_stride];
742 p4 = ptrLElement[-3];
743 p3 = ptrLElement[-2];
744 p2 = ptrLElement[-1];
745 p1 = ptrLElement[0];
746 ptrLElement -= rowSkip;
748 /* compute outer product and add it to the Z matrix */
749 Z[3] += p4 * q1;
750 Z[2] += p3 * q1;
751 Z[1] += p2 * q1;
752 Z[0] += p1 * q1;
753 dSASSERT(block_step == 4);
755 if (rowCounter > 3)
757 rowCounter -= 3;
759 ptrBElement -= 3 * block_step * b_stride;
761 /* load p and q values */
762 q1 = ptrBElement[8 * b_stride];
763 p4 = ptrLElement[-3];
764 p3 = ptrLElement[-2];
765 p2 = ptrLElement[-1];
766 p1 = ptrLElement[0];
767 ptrLElement -= rowSkip;
769 /* compute outer product and add it to the Z matrix */
770 Z[3] += p4 * q1;
771 Z[2] += p3 * q1;
772 Z[1] += p2 * q1;
773 Z[0] += p1 * q1;
775 /* load p and q values */
776 q1 = ptrBElement[7 * b_stride];
777 p4 = ptrLElement[-3];
778 p3 = ptrLElement[-2];
779 p2 = ptrLElement[-1];
780 p1 = ptrLElement[0];
781 ptrLElement -= rowSkip;
783 /* compute outer product and add it to the Z matrix */
784 Z[3] += p4 * q1;
785 Z[2] += p3 * q1;
786 Z[1] += p2 * q1;
787 Z[0] += p1 * q1;
789 /* load p and q values */
790 q1 = ptrBElement[6 * b_stride];
791 p4 = ptrLElement[-3];
792 p3 = ptrLElement[-2];
793 p2 = ptrLElement[-1];
794 p1 = ptrLElement[0];
795 ptrLElement -= rowSkip;
797 /* compute outer product and add it to the Z matrix */
798 Z[3] += p4 * q1;
799 Z[2] += p3 * q1;
800 Z[1] += p2 * q1;
801 Z[0] += p1 * q1;
803 /* load p and q values */
804 q1 = ptrBElement[5 * b_stride];
805 p4 = ptrLElement[-3];
806 p3 = ptrLElement[-2];
807 p2 = ptrLElement[-1];
808 p1 = ptrLElement[0];
809 ptrLElement -= rowSkip;
811 /* compute outer product and add it to the Z matrix */
812 Z[3] += p4 * q1;
813 Z[2] += p3 * q1;
814 Z[1] += p2 * q1;
815 Z[0] += p1 * q1;
817 /* load p and q values */
818 q1 = ptrBElement[4 * b_stride];
819 p4 = ptrLElement[-3];
820 p3 = ptrLElement[-2];
821 p2 = ptrLElement[-1];
822 p1 = ptrLElement[0];
823 ptrLElement -= rowSkip;
825 /* compute outer product and add it to the Z matrix */
826 Z[3] += p4 * q1;
827 Z[2] += p3 * q1;
828 Z[1] += p2 * q1;
829 Z[0] += p1 * q1;
831 /* load p and q values */
832 q1 = ptrBElement[3 * b_stride];
833 p4 = ptrLElement[-3];
834 p3 = ptrLElement[-2];
835 p2 = ptrLElement[-1];
836 p1 = ptrLElement[0];
837 ptrLElement -= rowSkip;
839 /* compute outer product and add it to the Z matrix */
840 Z[3] += p4 * q1;
841 Z[2] += p3 * q1;
842 Z[1] += p2 * q1;
843 Z[0] += p1 * q1;
845 /* load p and q values */
846 q1 = ptrBElement[2 * b_stride];
847 p4 = ptrLElement[-3];
848 p3 = ptrLElement[-2];
849 p2 = ptrLElement[-1];
850 p1 = ptrLElement[0];
851 ptrLElement -= rowSkip;
853 /* compute outer product and add it to the Z matrix */
854 Z[3] += p4 * q1;
855 Z[2] += p3 * q1;
856 Z[1] += p2 * q1;
857 Z[0] += p1 * q1;
859 /* load p and q values */
860 q1 = ptrBElement[1 * b_stride];
861 p4 = ptrLElement[-3];
862 p3 = ptrLElement[-2];
863 p2 = ptrLElement[-1];
864 p1 = ptrLElement[0];
865 ptrLElement -= rowSkip;
867 /* compute outer product and add it to the Z matrix */
868 Z[3] += p4 * q1;
869 Z[2] += p3 * q1;
870 Z[1] += p2 * q1;
871 Z[0] += p1 * q1;
872 dSASSERT(block_step == 4);
874 else
876 ptrBElement -= block_step * b_stride;
878 if (--rowCounter == 0)
880 if (finalRowBlock == currentBlock)
882 columnEndReached = true;
883 break;
886 // Take a look if any more columns have been completed...
887 completedBlocks = refBlockCompletionProgress;
888 dIASSERT(completedBlocks >= finalRowBlock);
890 if (completedBlocks == finalRowBlock)
892 break;
895 // ...continue if so.
896 unsigned rowCompletedSoFar = finalRowBlock;
897 finalRowBlock = dMACRO_MIN(currentBlock, completedBlocks);
898 rowCounter = finalRowBlock - rowCompletedSoFar;
901 /* end of inner loop */
904 else /* compute rightmost bottom X(i) block */
906 partialBlock = true;
908 ptrLElement = lastLElement;
909 ptrBElement = lastBElement;
910 dIASSERT(completedRowBlock == 0);
912 columnEndReached = true;
915 if (columnEndReached)
917 // Check whether there is still a need to proceed or if the computation has been taken over by another thread
918 cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedRowBlock, previousContextInstance, true);
920 if (blockProgressDescriptors[currentBlock] == oldDescriptor)
922 if (partialBlock)
924 Y[0] = ptrBElement[0 * b_stride]/* - Z[0]*/;
926 if (loopX1RowCount >= 2)
928 dReal p2 = ptrLElement[-1];
929 Y[1] = ptrBElement[-1 * (int)b_stride]/* - Z[1] */- p2 * Y[0];
931 if (loopX1RowCount > 2)
933 dReal p3 = ptrLElement[-2];
934 dReal p3_1 = (ptrLElement - rowSkip)[-2];
935 Y[2] = ptrBElement[-2 * (int)b_stride]/* - Z[2] */- p3 * Y[0] - p3_1 * Y[1];
939 dSASSERT(block_step == 4);
941 else
943 Y[0] = ptrBElement[0 * b_stride] - Z[0];
945 dReal p2 = ptrLElement[-1];
946 Y[1] = ptrBElement[-1 * (int)b_stride] - Z[1] - p2 * Y[0];
948 dReal p3 = ptrLElement[-2];
949 dReal p3_1 = (ptrLElement - rowSkip)[-2];
950 Y[2] = ptrBElement[-2 * (int)b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];
952 dReal p4 = ptrLElement[-3];
953 dReal p4_1 = (ptrLElement - rowSkip)[-3];
954 dReal p4_2 = (ptrLElement - rowSkip * 2)[-3];
955 Y[3] = ptrBElement[-3 * (int)b_stride] - Z[3] - p4 * Y[0] - p4_1 * Y[1] - p4_2 * Y[2];
957 dSASSERT(block_step == 4);
960 // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
961 CooperativeAtomics::AtomicReadReorderBarrier();
963 // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
964 // and the computation result is valid.
965 if (blockProgressDescriptors[currentBlock] == oldDescriptor)
967 // Assign the results to the result context (possibly in parallel with other threads
968 // that could and ought to be assigning exactly the same values)
969 SolveL1TransposedCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
970 resultContext.storePrecalculatedZs(Y);
972 // Assign the result assignment progress descriptor
973 cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true);
974 CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor); // the result is to be ignored
976 // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
977 goAssigningTheResult = true;
979 else
981 // Otherwise, go on competing for copying the results
982 handleComputationTakenOver = true;
985 else
987 handleComputationTakenOver = true;
990 else
992 // If the final column has not been reached yet, store current values to the context.
993 // Select the other context instance as the previous one might be read by other threads.
994 CellContextInstance nextContextInstance = buildNextContextInstance(previousContextInstance);
995 SolveL1TransposedCellContext &destinationContext = buildBlockContextRef(cellContexts, currentBlock, nextContextInstance);
996 destinationContext.storePrecalculatedZs(Z);
998 // Unlock the row until more columns can be used
999 cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedRowBlock, previousContextInstance, true);
1000 cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(finalRowBlock, nextContextInstance, false);
1001 // The descriptor might have been updated by a competing thread
1002 if (!CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor))
1004 // Adjust the ptrBElement to point to the result area...
1005 ptrBElement = adjustedLastBElement - (sizeint)(currentBlock * block_step) * b_stride;
1006 dIASSERT(currentBlock != 0 || adjustedLastBElement == lastBElement);
1007 // ...and go on handling the case
1008 handleComputationTakenOver = true;
1012 if (handleComputationTakenOver)
1014 cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
1015 // This can only happen if the row was (has become) the uppermost not fully completed one
1016 // and the competing thread is at final stage of calculation (i.e., it has reached the currentBlock column).
1017 if (existingDescriptor != INVALID_CELLDESCRIPTOR)
1019 // If not fully completed this must be the final stage of the result assignment into the matrix
1020 dIASSERT(existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true));
1022 // Go on competing copying the result as anyway the block is the topmost not completed one
1023 // and since there was competition for it, there is no other work that can be done right now.
1024 const SolveL1TransposedCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
1025 resultContext.loadPrecalculatedZs(Y);
1027 goAssigningTheResult = true;
1029 else
1031 // everything is over -- just go handling next blocks
1035 else if (goForLockedBlockDuplicateCalculation)
1037 blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
1039 bool skipToHandlingSubsequentRows = false, skiptoCopyingResult = false;
1041 /* declare variables */
1042 const dReal *ptrLElement;
1044 if (completedRowBlock < currentBlock)
1046 partialBlock = false;
1048 ptrLElement = completedRowBlock != 0
1049 ? fullyAdjustedLastLElement - currentBlock * block_step - (sizeint)(completedRowBlock * block_step) * rowSkip
1050 : columnAdjustedLastLElement - currentBlock * block_step;
1051 ptrBElement = completedRowBlock != 0
1052 ? adjustedLastBElement - (sizeint)(completedRowBlock * block_step) * b_stride
1053 : lastBElement;
1055 unsigned finalRowBlock = currentBlock/*std::min(currentBlock, completedBlocks)*/;
1056 dIASSERT(currentBlock == completedBlocks); // Why would we be competing for a row otherwise?
1058 bool exitInnerLoop = false;
1059 unsigned lastCompletedRow = completedRowBlock;
1060 unsigned rowCounter = finalRowBlock - completedRowBlock;
1062 if (completedRowBlock == 0/* && currentBlock != 0 */&& loopX1RowCount != 0)
1064 if ((loopX1RowCount & 1) != 0)
1066 dReal q1, p4, p3, p2, p1;
1068 /* load p and q values */
1069 q1 = ptrBElement[0 * (int)b_stride];
1070 p4 = ptrLElement[-3];
1071 p3 = ptrLElement[-2];
1072 p2 = ptrLElement[-1];
1073 p1 = ptrLElement[0];
1074 ptrLElement -= rowSkip;
1076 /* compute outer product and add it to the Z matrix */
1077 Z[3] += p4 * q1;
1078 Z[2] += p3 * q1;
1079 Z[1] += p2 * q1;
1080 Z[0] += p1 * q1;
1082 ptrBElement -= 1 * b_stride;
1085 if ((loopX1RowCount & 2) != 0)
1087 dReal q1, p4, p3, p2, p1;
1089 /* load p and q values */
1090 q1 = ptrBElement[0 * (int)b_stride];
1091 p4 = ptrLElement[-3];
1092 p3 = ptrLElement[-2];
1093 p2 = ptrLElement[-1];
1094 p1 = ptrLElement[0];
1095 ptrLElement -= rowSkip;
1097 /* compute outer product and add it to the Z matrix */
1098 Z[3] += p4 * q1;
1099 Z[2] += p3 * q1;
1100 Z[1] += p2 * q1;
1101 Z[0] += p1 * q1;
1103 /* load p and q values */
1104 q1 = ptrBElement[-1 * (int)b_stride];
1105 p4 = ptrLElement[-3];
1106 p3 = ptrLElement[-2];
1107 p2 = ptrLElement[-1];
1108 p1 = ptrLElement[0];
1109 ptrLElement -= rowSkip;
1111 /* compute outer product and add it to the Z matrix */
1112 Z[3] += p4 * q1;
1113 Z[2] += p3 * q1;
1114 Z[1] += p2 * q1;
1115 Z[0] += p1 * q1;
1117 ptrBElement -= 2 * b_stride;
1119 dSASSERT(block_step == 4);
1121 if (--rowCounter == 0)
1123 exitInnerLoop = true;
1127 for (; !exitInnerLoop; exitInnerLoop = --rowCounter == 0)
1129 dReal q1, p4, p3, p2, p1;
1131 /* load p and q values */
1132 q1 = ptrBElement[0 * (int)b_stride];
1133 p4 = ptrLElement[-3];
1134 p3 = ptrLElement[-2];
1135 p2 = ptrLElement[-1];
1136 p1 = ptrLElement[0];
1137 ptrLElement -= rowSkip;
1139 /* compute outer product and add it to the Z matrix */
1140 Z[3] += p4 * q1;
1141 Z[2] += p3 * q1;
1142 Z[1] += p2 * q1;
1143 Z[0] += p1 * q1;
1145 /* load p and q values */
1146 q1 = ptrBElement[-1 * (int)b_stride];
1147 p4 = ptrLElement[-3];
1148 p3 = ptrLElement[-2];
1149 p2 = ptrLElement[-1];
1150 p1 = ptrLElement[0];
1151 ptrLElement -= rowSkip;
1153 /* compute outer product and add it to the Z matrix */
1154 Z[3] += p4 * q1;
1155 Z[2] += p3 * q1;
1156 Z[1] += p2 * q1;
1157 Z[0] += p1 * q1;
1159 /* load p and q values */
1160 q1 = ptrBElement[-2 * (int)b_stride];
1161 p4 = ptrLElement[-3];
1162 p3 = ptrLElement[-2];
1163 p2 = ptrLElement[-1];
1164 p1 = ptrLElement[0];
1165 ptrLElement -= rowSkip;
1167 /* compute outer product and add it to the Z matrix */
1168 Z[3] += p4 * q1;
1169 Z[2] += p3 * q1;
1170 Z[1] += p2 * q1;
1171 Z[0] += p1 * q1;
1173 /* load p and q values */
1174 q1 = ptrBElement[-3 * (int)b_stride];
1175 p4 = ptrLElement[-3];
1176 p3 = ptrLElement[-2];
1177 p2 = ptrLElement[-1];
1178 p1 = ptrLElement[0];
1179 ptrLElement -= rowSkip;
1181 /* compute outer product and add it to the Z matrix */
1182 Z[3] += p4 * q1;
1183 Z[2] += p3 * q1;
1184 Z[1] += p2 * q1;
1185 Z[0] += p1 * q1;
1186 dSASSERT(block_step == 4);
1188 // Check if the primary solver thread has not made any progress
1189 cellindexint descriptorVerification = blockProgressDescriptors[currentBlock];
1190 unsigned newCompletedRow = GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification);
1192 if (newCompletedRow != lastCompletedRow)
1194 // Check, this is the first change the current thread detects.
1195 // There is absolutely no reason in code for the computation to stop/resume twice
1196 // while the current thread is competing.
1197 dIASSERT(lastCompletedRow == completedRowBlock);
1199 if (descriptorVerification == INVALID_CELLDESCRIPTOR)
1201 skipToHandlingSubsequentRows = true;
1202 break;
1205 if (newCompletedRow == currentBlock + 1)
1207 skiptoCopyingResult = true;
1208 break;
1211 // Check if the current thread is behind
1212 if (newCompletedRow > finalRowBlock - rowCounter)
1214 // If so, go starting over one more time
1215 blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
1216 stayWithinTheBlock = true;
1217 skipToHandlingSubsequentRows = true;
1218 break;
1221 // If current thread is ahead, just save new completed column for further comparisons and go on calculating
1222 lastCompletedRow = newCompletedRow;
1225 /* advance pointers */
1226 ptrBElement -= block_step * b_stride;
1227 /* end of inner loop */
1230 else if (completedRowBlock > currentBlock)
1232 dIASSERT(completedRowBlock == currentBlock + 1);
1234 partialBlock = currentBlock == 0 && loopX1RowCount != 0;
1236 skiptoCopyingResult = true;
1238 else
1240 dIASSERT(currentBlock == 0); // Execution can get here within the very first block only
1242 partialBlock = /*currentBlock == 0 && */loopX1RowCount != 0;
1244 /* just assign the pointers appropriately and go on computing the results */
1245 ptrLElement = lastLElement;
1246 ptrBElement = lastBElement;
1249 if (!skipToHandlingSubsequentRows)
1251 if (!skiptoCopyingResult)
1253 if (partialBlock)
1255 Y[0] = ptrBElement[0 * b_stride]/* - Z[0]*/;
1257 if (loopX1RowCount >= 2)
1259 dReal p2 = ptrLElement[-1];
1260 Y[1] = ptrBElement[-1 * (int)b_stride]/* - Z[1] */- p2 * Y[0];
1262 if (loopX1RowCount > 2)
1264 dReal p3 = ptrLElement[-2];
1265 dReal p3_1 = (ptrLElement - rowSkip)[-2];
1266 Y[2] = ptrBElement[-2 * (int)b_stride]/* - Z[2] */- p3 * Y[0] - p3_1 * Y[1];
1270 dSASSERT(block_step == 4);
1272 else
1274 Y[0] = ptrBElement[0 * b_stride] - Z[0];
1276 dReal p2 = ptrLElement[-1];
1277 Y[1] = ptrBElement[-1 * (int)b_stride] - Z[1] - p2 * Y[0];
1279 dReal p3 = ptrLElement[-2];
1280 dReal p3_1 = (ptrLElement - rowSkip)[-2];
1281 Y[2] = ptrBElement[-2 * (int)b_stride] - Z[2] - p3 * Y[0] - p3_1 * Y[1];
1283 dReal p4 = ptrLElement[-3];
1284 dReal p4_1 = (ptrLElement - rowSkip)[-3];
1285 dReal p4_2 = (ptrLElement - rowSkip * 2)[-3];
1286 Y[3] = ptrBElement[-3 * (int)b_stride] - Z[3] - p4 * Y[0] - p4_1 * Y[1] - p4_2 * Y[2];
1288 dSASSERT(block_step == 4);
1291 // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
1292 CooperativeAtomics::AtomicReadReorderBarrier();
1294 cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
1296 if (existingDescriptor == INVALID_CELLDESCRIPTOR)
1298 // Everything is over -- proceed to subsequent rows
1299 skipToHandlingSubsequentRows = true;
1301 else if (existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true))
1303 // The values computed above may not be valid. Copy the values already in the result context.
1304 skiptoCopyingResult = true;
1306 else
1308 // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
1309 // and the computation result is valid.
1310 cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true); // put the computation at the top so that the evaluation result from the expression above is reused
1312 // Assign the results to the result context (possibly in parallel with other threads
1313 // that could and ought to be assigning exactly the same values)
1314 SolveL1TransposedCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
1315 resultContext.storePrecalculatedZs(Y);
1317 // Assign the result assignment progress descriptor
1318 CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], existingDescriptor, newDescriptor); // the result is to be ignored
1320 // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
1324 if (!skipToHandlingSubsequentRows)
1326 if (skiptoCopyingResult)
1328 // Extract the result values stored in the result context
1329 const SolveL1TransposedCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
1330 resultContext.loadPrecalculatedZs(Y);
1332 ptrBElement = currentBlock != 0 ? adjustedLastBElement - (sizeint)(currentBlock * block_step) * b_stride : lastBElement;
1335 goAssigningTheResult = true;
1340 if (goAssigningTheResult)
1342 cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
1343 // Check if the assignment has not been completed yet
1344 if (existingDescriptor != INVALID_CELLDESCRIPTOR)
1346 // Assign the computation results to B vector
1347 if (partialBlock)
1349 // ptrBElement[0 * b_stride] = Y[0]; -- unchanged
1351 if (loopX1RowCount >= 2)
1353 ptrBElement[-1 * (int)b_stride] = Y[1];
1355 if (loopX1RowCount > 2)
1357 ptrBElement[-2 * (int)b_stride] = Y[2];
1360 dSASSERT(block_step == 4);
1362 else
1364 ptrBElement[0 * b_stride] = Y[0];
1365 ptrBElement[-1 * (int)b_stride] = Y[1];
1366 ptrBElement[-2 * (int)b_stride] = Y[2];
1367 ptrBElement[-3 * (int)b_stride] = Y[3];
1368 dSASSERT(block_step == 4);
1371 ThrsafeIncrementIntUpToLimit(&refBlockCompletionProgress, currentBlock + 1);
1372 dIASSERT(refBlockCompletionProgress >= currentBlock + 1);
1374 // And assign the completed status no matter what
1375 CooperativeAtomics::AtomicStoreCellindexint(&blockProgressDescriptors[currentBlock], INVALID_CELLDESCRIPTOR);
1377 else
1379 // everything is over -- just go handling next blocks
1383 if (!stayWithinTheBlock)
1385 completedBlocks = refBlockCompletionProgress;
1387 if (completedBlocks == blockCount)
1389 break;
1392 currentBlock += 1;
1394 bool lookaheadBoundaryReached = false;
1396 if (currentBlock == blockCount || completedBlocks == 0)
1398 lookaheadBoundaryReached = true;
1400 else if (currentBlock >= completedBlocks + lookaheadRange)
1402 lookaheadBoundaryReached = blockProcessingState > BPS_NO_BLOCKS_PROCESSED;
1404 else if (currentBlock < completedBlocks)
1406 // Treat detected row advancement as a row processed
1407 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
1409 currentBlock = completedBlocks;
1412 if (lookaheadBoundaryReached)
1414 dIASSERT(blockProcessingState != BPS_COMPETING_FOR_A_BLOCK); // Why did not we compete???
1416 // If no row has been processed in the previous pass, compete for the next row to avoid cycling uselessly
1417 if (blockProcessingState <= BPS_NO_BLOCKS_PROCESSED)
1419 // Abandon job if too few blocks remain
1420 if (blockCount - completedBlocks <= ownThreadIndex)
1422 break;
1425 blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
1427 else
1429 // If there was some progress, just continue to the next pass
1430 blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
1433 currentBlock = completedBlocks;
1440 #endif