3 /*************************************************************************
5 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
6 * All rights reserved. Email: russ@q12.org Web: www.q12.org *
8 * This library is free software; you can redistribute it and/or *
9 * modify it under the terms of EITHER: *
10 * (1) The GNU Lesser General Public License as published by the Free *
11 * Software Foundation; either version 2.1 of the License, or (at *
12 * your option) any later version. The text of the GNU Lesser *
13 * General Public License is included with this library in the *
15 * (2) The BSD-style license that is included with this library in *
16 * the file LICENSE-BSD.TXT. *
18 * This library is distributed in the hope that it will be useful, *
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
21 * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
23 *************************************************************************/
26 * Code style improvements and optimizations by Oleh Derevenko ????-2024
27 * L1Transposed cooperative solving code of ThreadedEquationSolverLDLT copyright (c) 2017-2024 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
31 #ifndef _ODE_FASTLTSOLVE_IMPL_H_
32 #define _ODE_FASTLTSOLVE_IMPL_H_
35 /* solve L^T * x=b, with b containing 1 right hand side.
36 * L is an n*n lower triangular matrix with ones on the diagonal.
37 * L is stored by rows and its leading dimension is rowSkip.
38 * b is an n*1 matrix that contains the right hand side.
39 * b is overwritten with x.
40 * this processes blocks of 4.
43 template<unsigned int b_stride
>
44 void solveL1Transposed(const dReal
*L
, dReal
*B
, unsigned rowCount
, unsigned rowSkip
)
46 dIASSERT(rowCount
!= 0);
48 /* special handling for L and B because we're solving L1 *transpose* */
49 const dReal
*lastLElement
= L
+ (sizeint
)(rowCount
- 1) * (rowSkip
+ 1);
50 dReal
*lastBElement
= B
+ (sizeint
)(rowCount
- 1) * b_stride
;
52 /* compute rows at end that are not a multiple of block size */
53 const unsigned loopX1RowCount
= rowCount
% 4;
55 unsigned blockStartRow
= loopX1RowCount
;
56 bool subsequentPass
= false;
58 /* compute rightmost bottom X(i) block */
59 if (loopX1RowCount
!= 0)
61 subsequentPass
= true;
63 const dReal
*ptrLElement
= lastLElement
;
64 dReal
*ptrBElement
= lastBElement
;
66 dReal Y11
= ptrBElement
[0 * b_stride
]/* - Z11*/;
67 // ptrBElement[0 * b_stride] = Y11; -- unchanged
69 if (loopX1RowCount
>= 2)
71 dReal p2
= ptrLElement
[-1];
72 dReal Y21
= ptrBElement
[-1 * (int)b_stride
]/* - Z21 */- p2
* Y11
;
73 ptrBElement
[-1 * (int)b_stride
] = Y21
;
75 if (loopX1RowCount
> 2)
77 dReal p3
= ptrLElement
[-2];
78 dReal p3_1
= (ptrLElement
- rowSkip
)[-2];
79 dReal Y31
= ptrBElement
[-2 * (int)b_stride
]/* - Z31 */- p3
* Y11
- p3_1
* Y21
;
80 ptrBElement
[-2 * (int)b_stride
] = Y31
;
85 /* compute all 4 x 1 blocks of X */
86 for (; !subsequentPass
|| blockStartRow
< rowCount
; subsequentPass
= true, blockStartRow
+= 4)
88 /* compute all 4 x 1 block of X, from rows i..i+4-1 */
90 /* declare variables - Z matrix, p and q vectors, etc */
91 const dReal
*ptrLElement
;
94 dReal Z41
, Z31
, Z21
, Z11
;
98 ptrLElement
= lastLElement
- blockStartRow
;
99 ptrBElement
= lastBElement
;
101 /* set the Z matrix to 0 */
102 Z41
= 0; Z31
= 0; Z21
= 0; Z11
= 0;
104 unsigned rowCounter
= blockStartRow
;
106 if (rowCounter
% 2 != 0)
108 dReal q1
, p4
, p3
, p2
, p1
;
110 /* load p and q values */
111 q1
= ptrBElement
[0 * (int)b_stride
];
112 p4
= ptrLElement
[-3];
113 p3
= ptrLElement
[-2];
114 p2
= ptrLElement
[-1];
116 ptrLElement
-= rowSkip
;
118 /* compute outer product and add it to the Z matrix */
124 ptrBElement
-= 1 * b_stride
;
128 if (rowCounter
% 4 != 0)
130 dReal q1
, p4
, p3
, p2
, p1
;
132 /* load p and q values */
133 q1
= ptrBElement
[0 * (int)b_stride
];
134 p4
= ptrLElement
[-3];
135 p3
= ptrLElement
[-2];
136 p2
= ptrLElement
[-1];
138 ptrLElement
-= rowSkip
;
140 /* compute outer product and add it to the Z matrix */
146 /* load p and q values */
147 q1
= ptrBElement
[-1 * (int)b_stride
];
148 p4
= ptrLElement
[-3];
149 p3
= ptrLElement
[-2];
150 p2
= ptrLElement
[-1];
152 ptrLElement
-= rowSkip
;
154 /* compute outer product and add it to the Z matrix */
160 ptrBElement
-= 2 * b_stride
;
164 /* the inner loop that computes outer products and adds them to Z */
165 for (bool exitLoop
= rowCounter
== 0; !exitLoop
; exitLoop
= false)
167 dReal q1
, p4
, p3
, p2
, p1
;
169 /* load p and q values */
170 q1
= ptrBElement
[0 * (int)b_stride
];
171 p4
= ptrLElement
[-3];
172 p3
= ptrLElement
[-2];
173 p2
= ptrLElement
[-1];
175 ptrLElement
-= rowSkip
;
177 /* compute outer product and add it to the Z matrix */
183 /* load p and q values */
184 q1
= ptrBElement
[-1 * (int)b_stride
];
185 p4
= ptrLElement
[-3];
186 p3
= ptrLElement
[-2];
187 p2
= ptrLElement
[-1];
189 ptrLElement
-= rowSkip
;
191 /* compute outer product and add it to the Z matrix */
197 /* load p and q values */
198 q1
= ptrBElement
[-2 * (int)b_stride
];
199 p4
= ptrLElement
[-3];
200 p3
= ptrLElement
[-2];
201 p2
= ptrLElement
[-1];
203 ptrLElement
-= rowSkip
;
205 /* compute outer product and add it to the Z matrix */
211 /* load p and q values */
212 q1
= ptrBElement
[-3 * (int)b_stride
];
213 p4
= ptrLElement
[-3];
214 p3
= ptrLElement
[-2];
215 p2
= ptrLElement
[-1];
217 ptrLElement
-= rowSkip
;
219 /* compute outer product and add it to the Z matrix */
229 ptrBElement
-= 12 * b_stride
;
231 /* load p and q values */
232 q1
= ptrBElement
[8 * b_stride
];
233 p4
= ptrLElement
[-3];
234 p3
= ptrLElement
[-2];
235 p2
= ptrLElement
[-1];
237 ptrLElement
-= rowSkip
;
239 /* compute outer product and add it to the Z matrix */
245 /* load p and q values */
246 q1
= ptrBElement
[7 * b_stride
];
247 p4
= ptrLElement
[-3];
248 p3
= ptrLElement
[-2];
249 p2
= ptrLElement
[-1];
251 ptrLElement
-= rowSkip
;
253 /* compute outer product and add it to the Z matrix */
259 /* load p and q values */
260 q1
= ptrBElement
[6 * b_stride
];
261 p4
= ptrLElement
[-3];
262 p3
= ptrLElement
[-2];
263 p2
= ptrLElement
[-1];
265 ptrLElement
-= rowSkip
;
267 /* compute outer product and add it to the Z matrix */
273 /* load p and q values */
274 q1
= ptrBElement
[5 * b_stride
];
275 p4
= ptrLElement
[-3];
276 p3
= ptrLElement
[-2];
277 p2
= ptrLElement
[-1];
279 ptrLElement
-= rowSkip
;
281 /* compute outer product and add it to the Z matrix */
287 /* load p and q values */
288 q1
= ptrBElement
[4 * b_stride
];
289 p4
= ptrLElement
[-3];
290 p3
= ptrLElement
[-2];
291 p2
= ptrLElement
[-1];
293 ptrLElement
-= rowSkip
;
295 /* compute outer product and add it to the Z matrix */
301 /* load p and q values */
302 q1
= ptrBElement
[3 * b_stride
];
303 p4
= ptrLElement
[-3];
304 p3
= ptrLElement
[-2];
305 p2
= ptrLElement
[-1];
307 ptrLElement
-= rowSkip
;
309 /* compute outer product and add it to the Z matrix */
315 /* load p and q values */
316 q1
= ptrBElement
[2 * b_stride
];
317 p4
= ptrLElement
[-3];
318 p3
= ptrLElement
[-2];
319 p2
= ptrLElement
[-1];
321 ptrLElement
-= rowSkip
;
323 /* compute outer product and add it to the Z matrix */
329 /* load p and q values */
330 q1
= ptrBElement
[1 * b_stride
];
331 p4
= ptrLElement
[-3];
332 p3
= ptrLElement
[-2];
333 p2
= ptrLElement
[-1];
335 ptrLElement
-= rowSkip
;
337 /* compute outer product and add it to the Z matrix */
345 ptrBElement
-= 4 * b_stride
;
347 if ((rowCounter
-= 4) == 0)
352 /* end of inner loop */
357 ptrLElement
= lastLElement
/* - blockStartRow*/; dIASSERT(blockStartRow
== 0);
358 ptrBElement
= lastBElement
;
360 /* set the Z matrix to 0 */
361 Z41
= 0; Z31
= 0; Z21
= 0; Z11
= 0;
364 /* finish computing the X(i) block */
365 dReal Y11
, Y21
, Y31
, Y41
;
367 Y11
= ptrBElement
[0 * b_stride
] - Z11
;
368 ptrBElement
[0 * b_stride
] = Y11
;
371 dReal p2
= ptrLElement
[-1];
372 Y21
= ptrBElement
[-1 * (int)b_stride
] - Z21
- p2
* Y11
;
373 ptrBElement
[-1 * (int)b_stride
] = Y21
;
376 dReal p3
= ptrLElement
[-2];
377 dReal p3_1
= (ptrLElement
- rowSkip
)[-2];
378 Y31
= ptrBElement
[-2 * (int)b_stride
] - Z31
- p3
* Y11
- p3_1
* Y21
;
379 ptrBElement
[-2 * (int)b_stride
] = Y31
;
382 dReal p4
= ptrLElement
[-3];
383 dReal p4_1
= (ptrLElement
- rowSkip
)[-3];
384 dReal p4_2
= (ptrLElement
- rowSkip
* 2)[-3];
385 Y41
= ptrBElement
[-3 * (int)b_stride
] - Z41
- p4
* Y11
- p4_1
* Y21
- p4_2
* Y31
;
386 ptrBElement
[-3 * (int)b_stride
] = Y41
;
388 /* end of outer loop */
394 template<unsigned int block_step
>
396 sizeint
ThreadedEquationSolverLDLT::estimateCooperativelySolvingL1TransposedMemoryRequirement(unsigned rowCount
, SolvingL1TransposedMemoryEstimates
&ref_solvingMemoryEstimates
)
398 unsigned blockCount
= deriveSolvingL1TransposedBlockCount(rowCount
, block_step
);
399 sizeint descriptorSizeRequired
= dEFFICIENT_SIZE(sizeof(cellindexint
) * blockCount
);
400 sizeint contextSizeRequired
= dEFFICIENT_SIZE(sizeof(SolveL1TransposedCellContext
) * (CCI__MAX
+ 1) * blockCount
);
401 ref_solvingMemoryEstimates
.assignData(descriptorSizeRequired
, contextSizeRequired
);
403 sizeint totalSizeRequired
= descriptorSizeRequired
+ contextSizeRequired
;
404 return totalSizeRequired
;
407 template<unsigned int block_step
>
409 void ThreadedEquationSolverLDLT::initializeCooperativelySolveL1TransposedMemoryStructures(unsigned rowCount
,
410 atomicord32
&out_blockCompletionProgress
, cellindexint
*blockProgressDescriptors
, SolveL1TransposedCellContext
*dUNUSED(cellContexts
))
412 unsigned blockCount
= deriveSolvingL1TransposedBlockCount(rowCount
, block_step
);
414 out_blockCompletionProgress
= 0;
415 memset(blockProgressDescriptors
, 0, blockCount
* sizeof(*blockProgressDescriptors
));
418 template<unsigned int block_step
, unsigned int b_stride
>
420 void ThreadedEquationSolverLDLT::participateSolvingL1Transposed(const dReal
*L
, dReal
*B
, unsigned rowCount
, unsigned rowSkip
,
421 volatile atomicord32
&refBlockCompletionProgress
/*=0*/, volatile cellindexint
*blockProgressDescriptors
/*=[blockCount]*/,
422 SolveL1TransposedCellContext
*cellContexts
/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex
)
424 const unsigned lookaheadRange
= 32;
425 const unsigned blockCount
= deriveSolvingL1TransposedBlockCount(rowCount
, block_step
);
426 /* compute rows at end that are not a multiple of block size */
427 const unsigned loopX1RowCount
= rowCount
% block_step
;
429 /* special handling for L and B because we're solving L1 *transpose* */
430 const dReal
*lastLElement
= L
+ (rowCount
- 1) * ((sizeint
)rowSkip
+ 1);
431 dReal
*lastBElement
= B
+ (rowCount
- 1) * (sizeint
)b_stride
;
433 /* elements adjusted as if the last block was full block_step elements */
434 unsigned x1AdjustmentElements
= (block_step
- loopX1RowCount
) % block_step
;
435 const dReal
*columnAdjustedLastLElement
= lastLElement
+ x1AdjustmentElements
;
436 const dReal
*fullyAdjustedLastLElement
= columnAdjustedLastLElement
+ (sizeint
)rowSkip
* x1AdjustmentElements
;
437 dReal
*adjustedLastBElement
= lastBElement
+ b_stride
* x1AdjustmentElements
;
439 BlockProcessingState blockProcessingState
= BPS_NO_BLOCKS_PROCESSED
;
441 unsigned completedBlocks
= refBlockCompletionProgress
;
442 unsigned currentBlock
= completedBlocks
;
443 dIASSERT(completedBlocks
<= blockCount
);
445 for (bool exitLoop
= completedBlocks
== blockCount
; !exitLoop
; exitLoop
= false)
447 bool goForLockedBlockPrimaryCalculation
= false, goForLockedBlockDuplicateCalculation
= false;
448 bool goAssigningTheResult
= false, stayWithinTheBlock
= false;
455 CellContextInstance previousContextInstance
;
456 unsigned completedRowBlock
;
459 for (cellindexint testDescriptor
= blockProgressDescriptors
[currentBlock
]; ; )
461 if (testDescriptor
== INVALID_CELLDESCRIPTOR
)
463 // Invalid descriptor is the indication that the row has been fully calculated
464 // Test if this was the last row and break out if so.
465 if (currentBlock
+ 1 == blockCount
)
471 // Treat detected row advancement as a row processed
472 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
476 CooperativeAtomics::AtomicReadReorderBarrier();
477 // It is necessary to read up to date completedBblocks value after the descriptor retrieval
478 // as otherwise the logic below breaks
479 completedBlocks
= refBlockCompletionProgress
;
481 if (!GET_CELLDESCRIPTOR_ISLOCKED(testDescriptor
))
483 completedRowBlock
= GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor
);
484 dIASSERT(completedRowBlock
< currentBlock
|| (completedRowBlock
== currentBlock
&& currentBlock
== 0)); // Otherwise, why would the calculation have had stopped if the final column is reachable???
485 dIASSERT(completedRowBlock
<= completedBlocks
); // Since the descriptor is not locked
487 if (completedRowBlock
== completedBlocks
&& currentBlock
!= completedBlocks
)
489 dIASSERT(completedBlocks
< currentBlock
);
493 if (CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors
[currentBlock
], testDescriptor
, MARK_CELLDESCRIPTOR_LOCKED(testDescriptor
)))
495 if (completedRowBlock
!= 0)
497 CellContextInstance contextInstance
= GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor
);
498 previousContextInstance
= contextInstance
;
500 const SolveL1TransposedCellContext
&sourceContext
= buildBlockContextRef(cellContexts
, currentBlock
, contextInstance
);
501 sourceContext
.loadPrecalculatedZs(Z
);
505 previousContextInstance
= CCI__MIN
;
506 SolveL1TransposedCellContext::initializePrecalculatedZs(Z
);
509 goForLockedBlockPrimaryCalculation
= true;
513 if (blockProcessingState
!= BPS_COMPETING_FOR_A_BLOCK
)
518 testDescriptor
= blockProgressDescriptors
[currentBlock
];
522 if (blockProcessingState
!= BPS_COMPETING_FOR_A_BLOCK
)
527 cellindexint verificativeDescriptor
;
528 bool verificationFailure
= false;
530 completedRowBlock
= GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor
);
531 dIASSERT(completedRowBlock
!= currentBlock
|| currentBlock
== 0); // There is no reason for computations to stop at the very end other than being the initial value at the very first block
533 if (completedRowBlock
!= 0)
535 CellContextInstance contextInstance
= GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor
);
536 const SolveL1TransposedCellContext
&sourceContext
= buildBlockContextRef(cellContexts
, currentBlock
, contextInstance
);
537 sourceContext
.loadPrecalculatedZs(Z
);
541 SolveL1TransposedCellContext::initializePrecalculatedZs(Z
);
544 if (completedRowBlock
!= 0 && completedRowBlock
<= currentBlock
)
546 // Make sure the descriptor is re-read after the precalculates
547 CooperativeAtomics::AtomicReadReorderBarrier();
550 if (completedRowBlock
<= currentBlock
)
552 verificativeDescriptor
= blockProgressDescriptors
[currentBlock
];
553 verificationFailure
= verificativeDescriptor
!= testDescriptor
;
556 if (!verificationFailure
)
558 dIASSERT(completedRowBlock
<= currentBlock
+ 1);
560 goForLockedBlockDuplicateCalculation
= true;
564 testDescriptor
= verificativeDescriptor
;
573 if (goForLockedBlockPrimaryCalculation
)
575 blockProcessingState
= BPS_SOME_BLOCKS_PROCESSED
;
577 // Declare and assign the variables at the top to not interfere with any branching -- the compiler is going to eliminate them anyway.
578 bool handleComputationTakenOver
= false, columnEndReached
= false;
580 const dReal
*ptrLElement
;
581 unsigned finalRowBlock
;
583 /* check if this is not the partial block of fewer rows */
584 if (currentBlock
!= 0 || loopX1RowCount
== 0)
586 partialBlock
= false;
588 ptrLElement
= completedRowBlock
!= 0
589 ? fullyAdjustedLastLElement
- currentBlock
* block_step
- (sizeint
)(completedRowBlock
* block_step
) * rowSkip
590 : columnAdjustedLastLElement
- currentBlock
* block_step
;
591 ptrBElement
= completedRowBlock
!= 0
592 ? adjustedLastBElement
- (sizeint
)(completedRowBlock
* block_step
) * b_stride
595 finalRowBlock
= dMACRO_MIN(currentBlock
, completedBlocks
);
596 dIASSERT(finalRowBlock
!= completedRowBlock
|| finalRowBlock
== 0);
598 unsigned rowCounter
= finalRowBlock
- completedRowBlock
;
599 bool exitLoop
= rowCounter
== 0;
603 columnEndReached
= true;
605 else if (completedRowBlock
== 0 && currentBlock
!= 0 && loopX1RowCount
!= 0)
607 if ((loopX1RowCount
& 1) != 0)
609 dReal q1
, p4
, p3
, p2
, p1
;
611 /* load p and q values */
612 q1
= ptrBElement
[0 * (int)b_stride
];
613 p4
= ptrLElement
[-3];
614 p3
= ptrLElement
[-2];
615 p2
= ptrLElement
[-1];
617 ptrLElement
-= rowSkip
;
619 /* compute outer product and add it to the Z matrix */
625 ptrBElement
-= 1 * b_stride
;
628 if ((loopX1RowCount
& 2) != 0)
630 dReal q1
, p4
, p3
, p2
, p1
;
632 /* load p and q values */
633 q1
= ptrBElement
[0 * (int)b_stride
];
634 p4
= ptrLElement
[-3];
635 p3
= ptrLElement
[-2];
636 p2
= ptrLElement
[-1];
638 ptrLElement
-= rowSkip
;
640 /* compute outer product and add it to the Z matrix */
646 /* load p and q values */
647 q1
= ptrBElement
[-1 * (int)b_stride
];
648 p4
= ptrLElement
[-3];
649 p3
= ptrLElement
[-2];
650 p2
= ptrLElement
[-1];
652 ptrLElement
-= rowSkip
;
654 /* compute outer product and add it to the Z matrix */
660 ptrBElement
-= 2 * b_stride
;
662 dSASSERT(block_step
== 4);
664 if (--rowCounter
== 0)
668 if (finalRowBlock
== currentBlock
)
670 columnEndReached
= true;
675 // Take a look if any more columns have been completed...
676 completedBlocks
= refBlockCompletionProgress
;
677 dIASSERT(completedBlocks
>= finalRowBlock
);
679 if (completedBlocks
== finalRowBlock
)
685 // ...continue if so.
686 unsigned rowCompletedSoFar
= finalRowBlock
;
687 finalRowBlock
= dMACRO_MIN(currentBlock
, completedBlocks
);
688 rowCounter
= finalRowBlock
- rowCompletedSoFar
;
694 for (; !exitLoop
; exitLoop
= false)
696 dReal q1
, p4
, p3
, p2
, p1
;
698 /* load p and q values */
699 q1
= ptrBElement
[0 * (int)b_stride
];
700 p4
= ptrLElement
[-3];
701 p3
= ptrLElement
[-2];
702 p2
= ptrLElement
[-1];
704 ptrLElement
-= rowSkip
;
706 /* compute outer product and add it to the Z matrix */
712 /* load p and q values */
713 q1
= ptrBElement
[-1 * (int)b_stride
];
714 p4
= ptrLElement
[-3];
715 p3
= ptrLElement
[-2];
716 p2
= ptrLElement
[-1];
718 ptrLElement
-= rowSkip
;
720 /* compute outer product and add it to the Z matrix */
726 /* load p and q values */
727 q1
= ptrBElement
[-2 * (int)b_stride
];
728 p4
= ptrLElement
[-3];
729 p3
= ptrLElement
[-2];
730 p2
= ptrLElement
[-1];
732 ptrLElement
-= rowSkip
;
734 /* compute outer product and add it to the Z matrix */
740 /* load p and q values */
741 q1
= ptrBElement
[-3 * (int)b_stride
];
742 p4
= ptrLElement
[-3];
743 p3
= ptrLElement
[-2];
744 p2
= ptrLElement
[-1];
746 ptrLElement
-= rowSkip
;
748 /* compute outer product and add it to the Z matrix */
753 dSASSERT(block_step
== 4);
759 ptrBElement
-= 3 * block_step
* b_stride
;
761 /* load p and q values */
762 q1
= ptrBElement
[8 * b_stride
];
763 p4
= ptrLElement
[-3];
764 p3
= ptrLElement
[-2];
765 p2
= ptrLElement
[-1];
767 ptrLElement
-= rowSkip
;
769 /* compute outer product and add it to the Z matrix */
775 /* load p and q values */
776 q1
= ptrBElement
[7 * b_stride
];
777 p4
= ptrLElement
[-3];
778 p3
= ptrLElement
[-2];
779 p2
= ptrLElement
[-1];
781 ptrLElement
-= rowSkip
;
783 /* compute outer product and add it to the Z matrix */
789 /* load p and q values */
790 q1
= ptrBElement
[6 * b_stride
];
791 p4
= ptrLElement
[-3];
792 p3
= ptrLElement
[-2];
793 p2
= ptrLElement
[-1];
795 ptrLElement
-= rowSkip
;
797 /* compute outer product and add it to the Z matrix */
803 /* load p and q values */
804 q1
= ptrBElement
[5 * b_stride
];
805 p4
= ptrLElement
[-3];
806 p3
= ptrLElement
[-2];
807 p2
= ptrLElement
[-1];
809 ptrLElement
-= rowSkip
;
811 /* compute outer product and add it to the Z matrix */
817 /* load p and q values */
818 q1
= ptrBElement
[4 * b_stride
];
819 p4
= ptrLElement
[-3];
820 p3
= ptrLElement
[-2];
821 p2
= ptrLElement
[-1];
823 ptrLElement
-= rowSkip
;
825 /* compute outer product and add it to the Z matrix */
831 /* load p and q values */
832 q1
= ptrBElement
[3 * b_stride
];
833 p4
= ptrLElement
[-3];
834 p3
= ptrLElement
[-2];
835 p2
= ptrLElement
[-1];
837 ptrLElement
-= rowSkip
;
839 /* compute outer product and add it to the Z matrix */
845 /* load p and q values */
846 q1
= ptrBElement
[2 * b_stride
];
847 p4
= ptrLElement
[-3];
848 p3
= ptrLElement
[-2];
849 p2
= ptrLElement
[-1];
851 ptrLElement
-= rowSkip
;
853 /* compute outer product and add it to the Z matrix */
859 /* load p and q values */
860 q1
= ptrBElement
[1 * b_stride
];
861 p4
= ptrLElement
[-3];
862 p3
= ptrLElement
[-2];
863 p2
= ptrLElement
[-1];
865 ptrLElement
-= rowSkip
;
867 /* compute outer product and add it to the Z matrix */
872 dSASSERT(block_step
== 4);
876 ptrBElement
-= block_step
* b_stride
;
878 if (--rowCounter
== 0)
880 if (finalRowBlock
== currentBlock
)
882 columnEndReached
= true;
886 // Take a look if any more columns have been completed...
887 completedBlocks
= refBlockCompletionProgress
;
888 dIASSERT(completedBlocks
>= finalRowBlock
);
890 if (completedBlocks
== finalRowBlock
)
895 // ...continue if so.
896 unsigned rowCompletedSoFar
= finalRowBlock
;
897 finalRowBlock
= dMACRO_MIN(currentBlock
, completedBlocks
);
898 rowCounter
= finalRowBlock
- rowCompletedSoFar
;
901 /* end of inner loop */
904 else /* compute rightmost bottom X(i) block */
908 ptrLElement
= lastLElement
;
909 ptrBElement
= lastBElement
;
910 dIASSERT(completedRowBlock
== 0);
912 columnEndReached
= true;
915 if (columnEndReached
)
917 // Check whether there is still a need to proceed or if the computation has been taken over by another thread
918 cellindexint oldDescriptor
= MAKE_CELLDESCRIPTOR(completedRowBlock
, previousContextInstance
, true);
920 if (blockProgressDescriptors
[currentBlock
] == oldDescriptor
)
924 Y
[0] = ptrBElement
[0 * b_stride
]/* - Z[0]*/;
926 if (loopX1RowCount
>= 2)
928 dReal p2
= ptrLElement
[-1];
929 Y
[1] = ptrBElement
[-1 * (int)b_stride
]/* - Z[1] */- p2
* Y
[0];
931 if (loopX1RowCount
> 2)
933 dReal p3
= ptrLElement
[-2];
934 dReal p3_1
= (ptrLElement
- rowSkip
)[-2];
935 Y
[2] = ptrBElement
[-2 * (int)b_stride
]/* - Z[2] */- p3
* Y
[0] - p3_1
* Y
[1];
939 dSASSERT(block_step
== 4);
943 Y
[0] = ptrBElement
[0 * b_stride
] - Z
[0];
945 dReal p2
= ptrLElement
[-1];
946 Y
[1] = ptrBElement
[-1 * (int)b_stride
] - Z
[1] - p2
* Y
[0];
948 dReal p3
= ptrLElement
[-2];
949 dReal p3_1
= (ptrLElement
- rowSkip
)[-2];
950 Y
[2] = ptrBElement
[-2 * (int)b_stride
] - Z
[2] - p3
* Y
[0] - p3_1
* Y
[1];
952 dReal p4
= ptrLElement
[-3];
953 dReal p4_1
= (ptrLElement
- rowSkip
)[-3];
954 dReal p4_2
= (ptrLElement
- rowSkip
* 2)[-3];
955 Y
[3] = ptrBElement
[-3 * (int)b_stride
] - Z
[3] - p4
* Y
[0] - p4_1
* Y
[1] - p4_2
* Y
[2];
957 dSASSERT(block_step
== 4);
960 // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
961 CooperativeAtomics::AtomicReadReorderBarrier();
963 // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
964 // and the computation result is valid.
965 if (blockProgressDescriptors
[currentBlock
] == oldDescriptor
)
967 // Assign the results to the result context (possibly in parallel with other threads
968 // that could and ought to be assigning exactly the same values)
969 SolveL1TransposedCellContext
&resultContext
= buildResultContextRef(cellContexts
, currentBlock
, blockCount
);
970 resultContext
.storePrecalculatedZs(Y
);
972 // Assign the result assignment progress descriptor
973 cellindexint newDescriptor
= MAKE_CELLDESCRIPTOR(currentBlock
+ 1, CCI__MIN
, true);
974 CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors
[currentBlock
], oldDescriptor
, newDescriptor
); // the result is to be ignored
976 // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
977 goAssigningTheResult
= true;
981 // Otherwise, go on competing for copying the results
982 handleComputationTakenOver
= true;
987 handleComputationTakenOver
= true;
992 // If the final column has not been reached yet, store current values to the context.
993 // Select the other context instance as the previous one might be read by other threads.
994 CellContextInstance nextContextInstance
= buildNextContextInstance(previousContextInstance
);
995 SolveL1TransposedCellContext
&destinationContext
= buildBlockContextRef(cellContexts
, currentBlock
, nextContextInstance
);
996 destinationContext
.storePrecalculatedZs(Z
);
998 // Unlock the row until more columns can be used
999 cellindexint oldDescriptor
= MAKE_CELLDESCRIPTOR(completedRowBlock
, previousContextInstance
, true);
1000 cellindexint newDescriptor
= MAKE_CELLDESCRIPTOR(finalRowBlock
, nextContextInstance
, false);
1001 // The descriptor might have been updated by a competing thread
1002 if (!CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors
[currentBlock
], oldDescriptor
, newDescriptor
))
1004 // Adjust the ptrBElement to point to the result area...
1005 ptrBElement
= adjustedLastBElement
- (sizeint
)(currentBlock
* block_step
) * b_stride
;
1006 dIASSERT(currentBlock
!= 0 || adjustedLastBElement
== lastBElement
);
1007 // ...and go on handling the case
1008 handleComputationTakenOver
= true;
1012 if (handleComputationTakenOver
)
1014 cellindexint existingDescriptor
= blockProgressDescriptors
[currentBlock
];
1015 // This can only happen if the row was (has become) the uppermost not fully completed one
1016 // and the competing thread is at final stage of calculation (i.e., it has reached the currentBlock column).
1017 if (existingDescriptor
!= INVALID_CELLDESCRIPTOR
)
1019 // If not fully completed this must be the final stage of the result assignment into the matrix
1020 dIASSERT(existingDescriptor
== MAKE_CELLDESCRIPTOR(currentBlock
+ 1, CCI__MIN
, true));
1022 // Go on competing copying the result as anyway the block is the topmost not completed one
1023 // and since there was competition for it, there is no other work that can be done right now.
1024 const SolveL1TransposedCellContext
&resultContext
= buildResultContextRef(cellContexts
, currentBlock
, blockCount
);
1025 resultContext
.loadPrecalculatedZs(Y
);
1027 goAssigningTheResult
= true;
1031 // everything is over -- just go handling next blocks
1035 else if (goForLockedBlockDuplicateCalculation
)
1037 blockProcessingState
= BPS_SOME_BLOCKS_PROCESSED
;
1039 bool skipToHandlingSubsequentRows
= false, skiptoCopyingResult
= false;
1041 /* declare variables */
1042 const dReal
*ptrLElement
;
1044 if (completedRowBlock
< currentBlock
)
1046 partialBlock
= false;
1048 ptrLElement
= completedRowBlock
!= 0
1049 ? fullyAdjustedLastLElement
- currentBlock
* block_step
- (sizeint
)(completedRowBlock
* block_step
) * rowSkip
1050 : columnAdjustedLastLElement
- currentBlock
* block_step
;
1051 ptrBElement
= completedRowBlock
!= 0
1052 ? adjustedLastBElement
- (sizeint
)(completedRowBlock
* block_step
) * b_stride
1055 unsigned finalRowBlock
= currentBlock
/*std::min(currentBlock, completedBlocks)*/;
1056 dIASSERT(currentBlock
== completedBlocks
); // Why would we be competing for a row otherwise?
1058 bool exitInnerLoop
= false;
1059 unsigned lastCompletedRow
= completedRowBlock
;
1060 unsigned rowCounter
= finalRowBlock
- completedRowBlock
;
1062 if (completedRowBlock
== 0/* && currentBlock != 0 */&& loopX1RowCount
!= 0)
1064 if ((loopX1RowCount
& 1) != 0)
1066 dReal q1
, p4
, p3
, p2
, p1
;
1068 /* load p and q values */
1069 q1
= ptrBElement
[0 * (int)b_stride
];
1070 p4
= ptrLElement
[-3];
1071 p3
= ptrLElement
[-2];
1072 p2
= ptrLElement
[-1];
1073 p1
= ptrLElement
[0];
1074 ptrLElement
-= rowSkip
;
1076 /* compute outer product and add it to the Z matrix */
1082 ptrBElement
-= 1 * b_stride
;
1085 if ((loopX1RowCount
& 2) != 0)
1087 dReal q1
, p4
, p3
, p2
, p1
;
1089 /* load p and q values */
1090 q1
= ptrBElement
[0 * (int)b_stride
];
1091 p4
= ptrLElement
[-3];
1092 p3
= ptrLElement
[-2];
1093 p2
= ptrLElement
[-1];
1094 p1
= ptrLElement
[0];
1095 ptrLElement
-= rowSkip
;
1097 /* compute outer product and add it to the Z matrix */
1103 /* load p and q values */
1104 q1
= ptrBElement
[-1 * (int)b_stride
];
1105 p4
= ptrLElement
[-3];
1106 p3
= ptrLElement
[-2];
1107 p2
= ptrLElement
[-1];
1108 p1
= ptrLElement
[0];
1109 ptrLElement
-= rowSkip
;
1111 /* compute outer product and add it to the Z matrix */
1117 ptrBElement
-= 2 * b_stride
;
1119 dSASSERT(block_step
== 4);
1121 if (--rowCounter
== 0)
1123 exitInnerLoop
= true;
1127 for (; !exitInnerLoop
; exitInnerLoop
= --rowCounter
== 0)
1129 dReal q1
, p4
, p3
, p2
, p1
;
1131 /* load p and q values */
1132 q1
= ptrBElement
[0 * (int)b_stride
];
1133 p4
= ptrLElement
[-3];
1134 p3
= ptrLElement
[-2];
1135 p2
= ptrLElement
[-1];
1136 p1
= ptrLElement
[0];
1137 ptrLElement
-= rowSkip
;
1139 /* compute outer product and add it to the Z matrix */
1145 /* load p and q values */
1146 q1
= ptrBElement
[-1 * (int)b_stride
];
1147 p4
= ptrLElement
[-3];
1148 p3
= ptrLElement
[-2];
1149 p2
= ptrLElement
[-1];
1150 p1
= ptrLElement
[0];
1151 ptrLElement
-= rowSkip
;
1153 /* compute outer product and add it to the Z matrix */
1159 /* load p and q values */
1160 q1
= ptrBElement
[-2 * (int)b_stride
];
1161 p4
= ptrLElement
[-3];
1162 p3
= ptrLElement
[-2];
1163 p2
= ptrLElement
[-1];
1164 p1
= ptrLElement
[0];
1165 ptrLElement
-= rowSkip
;
1167 /* compute outer product and add it to the Z matrix */
1173 /* load p and q values */
1174 q1
= ptrBElement
[-3 * (int)b_stride
];
1175 p4
= ptrLElement
[-3];
1176 p3
= ptrLElement
[-2];
1177 p2
= ptrLElement
[-1];
1178 p1
= ptrLElement
[0];
1179 ptrLElement
-= rowSkip
;
1181 /* compute outer product and add it to the Z matrix */
1186 dSASSERT(block_step
== 4);
1188 // Check if the primary solver thread has not made any progress
1189 cellindexint descriptorVerification
= blockProgressDescriptors
[currentBlock
];
1190 unsigned newCompletedRow
= GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification
);
1192 if (newCompletedRow
!= lastCompletedRow
)
1194 // Check, this is the first change the current thread detects.
1195 // There is absolutely no reason in code for the computation to stop/resume twice
1196 // while the current thread is competing.
1197 dIASSERT(lastCompletedRow
== completedRowBlock
);
1199 if (descriptorVerification
== INVALID_CELLDESCRIPTOR
)
1201 skipToHandlingSubsequentRows
= true;
1205 if (newCompletedRow
== currentBlock
+ 1)
1207 skiptoCopyingResult
= true;
1211 // Check if the current thread is behind
1212 if (newCompletedRow
> finalRowBlock
- rowCounter
)
1214 // If so, go starting over one more time
1215 blockProcessingState
= BPS_COMPETING_FOR_A_BLOCK
;
1216 stayWithinTheBlock
= true;
1217 skipToHandlingSubsequentRows
= true;
1221 // If current thread is ahead, just save new completed column for further comparisons and go on calculating
1222 lastCompletedRow
= newCompletedRow
;
1225 /* advance pointers */
1226 ptrBElement
-= block_step
* b_stride
;
1227 /* end of inner loop */
1230 else if (completedRowBlock
> currentBlock
)
1232 dIASSERT(completedRowBlock
== currentBlock
+ 1);
1234 partialBlock
= currentBlock
== 0 && loopX1RowCount
!= 0;
1236 skiptoCopyingResult
= true;
1240 dIASSERT(currentBlock
== 0); // Execution can get here within the very first block only
1242 partialBlock
= /*currentBlock == 0 && */loopX1RowCount
!= 0;
1244 /* just assign the pointers appropriately and go on computing the results */
1245 ptrLElement
= lastLElement
;
1246 ptrBElement
= lastBElement
;
1249 if (!skipToHandlingSubsequentRows
)
1251 if (!skiptoCopyingResult
)
1255 Y
[0] = ptrBElement
[0 * b_stride
]/* - Z[0]*/;
1257 if (loopX1RowCount
>= 2)
1259 dReal p2
= ptrLElement
[-1];
1260 Y
[1] = ptrBElement
[-1 * (int)b_stride
]/* - Z[1] */- p2
* Y
[0];
1262 if (loopX1RowCount
> 2)
1264 dReal p3
= ptrLElement
[-2];
1265 dReal p3_1
= (ptrLElement
- rowSkip
)[-2];
1266 Y
[2] = ptrBElement
[-2 * (int)b_stride
]/* - Z[2] */- p3
* Y
[0] - p3_1
* Y
[1];
1270 dSASSERT(block_step
== 4);
1274 Y
[0] = ptrBElement
[0 * b_stride
] - Z
[0];
1276 dReal p2
= ptrLElement
[-1];
1277 Y
[1] = ptrBElement
[-1 * (int)b_stride
] - Z
[1] - p2
* Y
[0];
1279 dReal p3
= ptrLElement
[-2];
1280 dReal p3_1
= (ptrLElement
- rowSkip
)[-2];
1281 Y
[2] = ptrBElement
[-2 * (int)b_stride
] - Z
[2] - p3
* Y
[0] - p3_1
* Y
[1];
1283 dReal p4
= ptrLElement
[-3];
1284 dReal p4_1
= (ptrLElement
- rowSkip
)[-3];
1285 dReal p4_2
= (ptrLElement
- rowSkip
* 2)[-3];
1286 Y
[3] = ptrBElement
[-3 * (int)b_stride
] - Z
[3] - p4
* Y
[0] - p4_1
* Y
[1] - p4_2
* Y
[2];
1288 dSASSERT(block_step
== 4);
1291 // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
1292 CooperativeAtomics::AtomicReadReorderBarrier();
1294 cellindexint existingDescriptor
= blockProgressDescriptors
[currentBlock
];
1296 if (existingDescriptor
== INVALID_CELLDESCRIPTOR
)
1298 // Everything is over -- proceed to subsequent rows
1299 skipToHandlingSubsequentRows
= true;
1301 else if (existingDescriptor
== MAKE_CELLDESCRIPTOR(currentBlock
+ 1, CCI__MIN
, true))
1303 // The values computed above may not be valid. Copy the values already in the result context.
1304 skiptoCopyingResult
= true;
1308 // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
1309 // and the computation result is valid.
1310 cellindexint newDescriptor
= MAKE_CELLDESCRIPTOR(currentBlock
+ 1, CCI__MIN
, true); // put the computation at the top so that the evaluation result from the expression above is reused
1312 // Assign the results to the result context (possibly in parallel with other threads
1313 // that could and ought to be assigning exactly the same values)
1314 SolveL1TransposedCellContext
&resultContext
= buildResultContextRef(cellContexts
, currentBlock
, blockCount
);
1315 resultContext
.storePrecalculatedZs(Y
);
1317 // Assign the result assignment progress descriptor
1318 CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors
[currentBlock
], existingDescriptor
, newDescriptor
); // the result is to be ignored
1320 // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
1324 if (!skipToHandlingSubsequentRows
)
1326 if (skiptoCopyingResult
)
1328 // Extract the result values stored in the result context
1329 const SolveL1TransposedCellContext
&resultContext
= buildResultContextRef(cellContexts
, currentBlock
, blockCount
);
1330 resultContext
.loadPrecalculatedZs(Y
);
1332 ptrBElement
= currentBlock
!= 0 ? adjustedLastBElement
- (sizeint
)(currentBlock
* block_step
) * b_stride
: lastBElement
;
1335 goAssigningTheResult
= true;
1340 if (goAssigningTheResult
)
1342 cellindexint existingDescriptor
= blockProgressDescriptors
[currentBlock
];
1343 // Check if the assignment has not been completed yet
1344 if (existingDescriptor
!= INVALID_CELLDESCRIPTOR
)
1346 // Assign the computation results to B vector
1349 // ptrBElement[0 * b_stride] = Y[0]; -- unchanged
1351 if (loopX1RowCount
>= 2)
1353 ptrBElement
[-1 * (int)b_stride
] = Y
[1];
1355 if (loopX1RowCount
> 2)
1357 ptrBElement
[-2 * (int)b_stride
] = Y
[2];
1360 dSASSERT(block_step
== 4);
1364 ptrBElement
[0 * b_stride
] = Y
[0];
1365 ptrBElement
[-1 * (int)b_stride
] = Y
[1];
1366 ptrBElement
[-2 * (int)b_stride
] = Y
[2];
1367 ptrBElement
[-3 * (int)b_stride
] = Y
[3];
1368 dSASSERT(block_step
== 4);
1371 ThrsafeIncrementIntUpToLimit(&refBlockCompletionProgress
, currentBlock
+ 1);
1372 dIASSERT(refBlockCompletionProgress
>= currentBlock
+ 1);
1374 // And assign the completed status no matter what
1375 CooperativeAtomics::AtomicStoreCellindexint(&blockProgressDescriptors
[currentBlock
], INVALID_CELLDESCRIPTOR
);
1379 // everything is over -- just go handling next blocks
1383 if (!stayWithinTheBlock
)
1385 completedBlocks
= refBlockCompletionProgress
;
1387 if (completedBlocks
== blockCount
)
1394 bool lookaheadBoundaryReached
= false;
1396 if (currentBlock
== blockCount
|| completedBlocks
== 0)
1398 lookaheadBoundaryReached
= true;
1400 else if (currentBlock
>= completedBlocks
+ lookaheadRange
)
1402 lookaheadBoundaryReached
= blockProcessingState
> BPS_NO_BLOCKS_PROCESSED
;
1404 else if (currentBlock
< completedBlocks
)
1406 // Treat detected row advancement as a row processed
1407 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
1409 currentBlock
= completedBlocks
;
1412 if (lookaheadBoundaryReached
)
1414 dIASSERT(blockProcessingState
!= BPS_COMPETING_FOR_A_BLOCK
); // Why did not we compete???
1416 // If no row has been processed in the previous pass, compete for the next row to avoid cycling uselessly
1417 if (blockProcessingState
<= BPS_NO_BLOCKS_PROCESSED
)
1419 // Abandon job if too few blocks remain
1420 if (blockCount
- completedBlocks
<= ownThreadIndex
)
1425 blockProcessingState
= BPS_COMPETING_FOR_A_BLOCK
;
1429 // If there was some progress, just continue to the next pass
1430 blockProcessingState
= BPS_NO_BLOCKS_PROCESSED
;
1433 currentBlock
= completedBlocks
;