3 /*************************************************************************
5 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
6 * All rights reserved. Email: russ@q12.org Web: www.q12.org *
8 * This library is free software; you can redistribute it and/or *
9 * modify it under the terms of EITHER: *
10 * (1) The GNU Lesser General Public License as published by the Free *
11 * Software Foundation; either version 2.1 of the License, or (at *
12 * your option) any later version. The text of the GNU Lesser *
13 * General Public License is included with this library in the *
15 * (2) The BSD-style license that is included with this library in *
16 * the file LICENSE-BSD.TXT. *
18 * This library is distributed in the hope that it will be useful, *
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
21 * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
23 *************************************************************************/
26 * Code style improvements and optimizations by Oleh Derevenko ????-2019
27 * LDLT cooperative factorization code of ThreadedEquationSolverLDLT copyright (c) 2017-2019 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
30 #ifndef _ODE_FASTLDLT_IMPL_H_
31 #define _ODE_FASTLDLT_IMPL_H_
38 static void solveL1Stripe_2 (const dReal
*L
, dReal
*B
, unsigned rowCount
, unsigned rowSkip
);
39 template<unsigned int d_stride
>
40 void scaleAndFactorizeL1Stripe_2(dReal
*ARow
, dReal
*d
, unsigned rowIndex
, unsigned rowSkip
);
41 template<unsigned int d_stride
>
42 inline void scaleAndFactorizeL1FirstRowStripe_2(dReal
*ARow
, dReal
*d
, unsigned rowSkip
);
44 static void solveStripeL1_1 (const dReal
*L
, dReal
*B
, unsigned rowCount
, unsigned rowSkip
);
45 template<unsigned int d_stride
>
46 void scaleAndFactorizeL1Stripe_1(dReal
*ARow
, dReal
*d
, unsigned rowIndex
);
47 template<unsigned int d_stride
>
48 inline void scaleAndFactorizeL1FirstRowStripe_1(dReal
*ARow
, dReal
*d
);
51 template<unsigned int d_stride
>
52 void factorMatrixAsLDLT(dReal
*A
, dReal
*d
, unsigned rowCount
, unsigned rowSkip
)
54 if (rowCount
< 1) return;
57 unsigned blockStartRow
= 0;
59 const unsigned blockStep
= 2;
60 const unsigned lastRowIndex
= rowCount
>= blockStep
? rowCount
- blockStep
+ 1 : 0;
62 /* compute blocks of 2 rows */
63 bool subsequentPass
= false;
64 for (; blockStartRow
< lastRowIndex
; subsequentPass
= true, ARow
+= blockStep
* rowSkip
, blockStartRow
+= blockStep
)
68 /* solve L*(D*l)=a, l is scaled elements in 2 x i block at A(i,0) */
69 solveL1Stripe_2(A
, ARow
, blockStartRow
, rowSkip
);
70 scaleAndFactorizeL1Stripe_2
<d_stride
>(ARow
, d
, blockStartRow
, rowSkip
);
74 scaleAndFactorizeL1FirstRowStripe_2
<d_stride
>(ARow
, d
, rowSkip
);
76 dSASSERT(blockStep
== 2);
77 /* done factorizing 2 x 2 block */
80 /* compute the (less than 2) rows at the bottom */
81 if (!subsequentPass
|| blockStartRow
== lastRowIndex
)
83 dSASSERT(blockStep
== 2); // for the blockStartRow == lastRowIndex comparison above
87 solveStripeL1_1(A
, ARow
, blockStartRow
, rowSkip
);
88 scaleAndFactorizeL1Stripe_1
<d_stride
>(ARow
, d
, blockStartRow
);
92 scaleAndFactorizeL1FirstRowStripe_1
<d_stride
>(ARow
, d
);
94 dSASSERT(blockStep
== 2);
95 /* done factorizing 1 x 1 block */
99 /* solve L*X=B, with B containing 2 right hand sides.
100 * L is an n*n lower triangular matrix with ones on the diagonal.
101 * L is stored by rows and its leading dimension is rowSkip.
102 * B is an n*2 matrix that contains the right hand sides.
103 * B is stored by columns and its leading dimension is also rowSkip.
104 * B is overwritten with X.
105 * this processes blocks of 2*2.
106 * if this is in the factorizer source file, n must be a multiple of 2.
109 void solveL1Stripe_2(const dReal
*L
, dReal
*B
, unsigned rowCount
, unsigned rowSkip
)
111 dIASSERT(rowCount
!= 0);
112 dIASSERT(rowCount
% 2 == 0);
114 /* compute all 2 x 2 blocks of X */
115 unsigned blockStartRow
= 0;
116 for (bool exitLoop
= false, subsequentPass
= false; !exitLoop
; subsequentPass
= true, exitLoop
= (blockStartRow
+= 2) == rowCount
)
118 const dReal
*ptrLElement
;
121 /* declare variables - Z matrix */
122 dReal Z11
, Z12
, Z21
, Z22
;
124 /* compute all 2 x 2 block of X, from rows i..i+2-1 */
127 ptrLElement
= L
+ blockStartRow
* rowSkip
;
130 /* set Z matrix to 0 */
131 Z11
= 0; Z12
= 0; Z21
= 0; Z22
= 0;
133 /* the inner loop that computes outer products and adds them to Z */
134 // The iteration starts with even number and decreases it by 2. So, it must end in zero
135 for (unsigned columnCounter
= blockStartRow
; ;)
137 /* declare p and q vectors, etc */
138 dReal p1
, q1
, p2
, q2
;
140 /* compute outer product and add it to the Z matrix */
144 q2
= ptrBElement
[rowSkip
];
146 p2
= ptrLElement
[rowSkip
];
150 /* compute outer product and add it to the Z matrix */
154 q2
= ptrBElement
[1 + rowSkip
];
156 p2
= ptrLElement
[1 + rowSkip
];
160 if (columnCounter
> 6)
164 /* advance pointers */
168 /* compute outer product and add it to the Z matrix */
169 p1
= ptrLElement
[-4];
170 q1
= ptrBElement
[-4];
172 q2
= ptrBElement
[-4 + rowSkip
];
174 p2
= ptrLElement
[-4 + rowSkip
];
178 /* compute outer product and add it to the Z matrix */
179 p1
= ptrLElement
[-3];
180 q1
= ptrBElement
[-3];
182 q2
= ptrBElement
[-3 + rowSkip
];
184 p2
= ptrLElement
[-3 + rowSkip
];
188 /* compute outer product and add it to the Z matrix */
189 p1
= ptrLElement
[-2];
190 q1
= ptrBElement
[-2];
192 q2
= ptrBElement
[-2 + rowSkip
];
194 p2
= ptrLElement
[-2 + rowSkip
];
198 /* compute outer product and add it to the Z matrix */
199 p1
= ptrLElement
[-1];
200 q1
= ptrBElement
[-1];
202 q2
= ptrBElement
[-1 + rowSkip
];
204 p2
= ptrLElement
[-1 + rowSkip
];
210 /* advance pointers */
214 if ((columnCounter
-= 2) == 0)
219 /* end of inner loop */
224 ptrLElement
= L
/* + blockStartRow * rowSkip*/; dIASSERT(blockStartRow
== 0);
227 /* set Z matrix to 0 */
228 Z11
= 0; Z12
= 0; Z21
= 0; Z22
= 0;
231 /* finish computing the X(i) block */
233 dReal Y11
= ptrBElement
[0] - Z11
;
234 dReal Y12
= ptrBElement
[rowSkip
] - Z12
;
236 dReal p2
= ptrLElement
[rowSkip
];
238 ptrBElement
[0] = Y11
;
239 ptrBElement
[rowSkip
] = Y12
;
241 dReal Y21
= ptrBElement
[1] - Z21
- p2
* Y11
;
242 dReal Y22
= ptrBElement
[1 + rowSkip
] - Z22
- p2
* Y12
;
244 ptrBElement
[1] = Y21
;
245 ptrBElement
[1 + rowSkip
] = Y22
;
246 /* end of outer loop */
250 template<unsigned int d_stride
>
251 void scaleAndFactorizeL1Stripe_2(dReal
*ARow
, dReal
*d
, unsigned factorizationRow
, unsigned rowSkip
)
253 dIASSERT(factorizationRow
!= 0);
254 dIASSERT(factorizationRow
% 2 == 0);
256 dReal
*ptrAElement
= ARow
;
257 dReal
*ptrDElement
= d
;
259 /* scale the elements in a 2 x i block at A(i,0), and also */
260 /* compute Z = the outer product matrix that we'll need. */
261 dReal Z11
= 0, Z21
= 0, Z22
= 0;
263 for (unsigned columnCounter
= factorizationRow
; ; )
265 dReal p1
, q1
, p2
, q2
, dd
;
268 p2
= ptrAElement
[rowSkip
];
269 dd
= ptrDElement
[0 * d_stride
];
273 ptrAElement
[rowSkip
] = q2
;
279 p2
= ptrAElement
[1 + rowSkip
];
280 dd
= ptrDElement
[1 * d_stride
];
284 ptrAElement
[1 + rowSkip
] = q2
;
289 if (columnCounter
> 6)
294 ptrDElement
+= 6 * d_stride
;
296 p1
= ptrAElement
[-4];
297 p2
= ptrAElement
[-4 + rowSkip
];
298 dd
= ptrDElement
[-4 * (int)d_stride
];
301 ptrAElement
[-4] = q1
;
302 ptrAElement
[-4 + rowSkip
] = q2
;
307 p1
= ptrAElement
[-3];
308 p2
= ptrAElement
[-3 + rowSkip
];
309 dd
= ptrDElement
[-3 * (int)d_stride
];
312 ptrAElement
[-3] = q1
;
313 ptrAElement
[-3 + rowSkip
] = q2
;
318 p1
= ptrAElement
[-2];
319 p2
= ptrAElement
[-2 + rowSkip
];
320 dd
= ptrDElement
[-2 * (int)d_stride
];
323 ptrAElement
[-2] = q1
;
324 ptrAElement
[-2 + rowSkip
] = q2
;
329 p1
= ptrAElement
[-1];
330 p2
= ptrAElement
[-1 + rowSkip
];
331 dd
= ptrDElement
[-1 * (int)d_stride
];
334 ptrAElement
[-1] = q1
;
335 ptrAElement
[-1 + rowSkip
] = q2
;
343 ptrDElement
+= 2 * d_stride
;
345 if ((columnCounter
-= 2) == 0)
352 /* solve for diagonal 2 x 2 block at A(i,i) */
353 dReal Y11
= ptrAElement
[0] - Z11
;
354 dReal Y21
= ptrAElement
[rowSkip
] - Z21
;
355 dReal Y22
= ptrAElement
[1 + rowSkip
] - Z22
;
357 /* factorize 2 x 2 block Y, ptrDElement */
358 /* factorize row 1 */
359 dReal dd
= dRecip(Y11
);
361 ptrDElement
[0 * d_stride
] = dd
;
362 dIASSERT(ptrDElement
== d
+ (sizeint
)factorizationRow
* d_stride
);
364 /* factorize row 2 */
366 ptrAElement
[rowSkip
] = q2
;
368 dReal sum
= Y21
* q2
;
369 ptrDElement
[1 * d_stride
] = dRecip(Y22
- sum
);
372 template<unsigned int d_stride
>
373 void scaleAndFactorizeL1FirstRowStripe_2(dReal
*ARow
, dReal
*d
, unsigned rowSkip
)
375 dReal
*ptrAElement
= ARow
;
376 dReal
*ptrDElement
= d
;
378 /* solve for diagonal 2 x 2 block at A(0,0) */
379 dReal Y11
= ptrAElement
[0]/* - Z11*/;
380 dReal Y21
= ptrAElement
[rowSkip
]/* - Z21*/;
381 dReal Y22
= ptrAElement
[1 + rowSkip
]/* - Z22*/;
383 /* factorize 2 x 2 block Y, ptrDElement */
384 /* factorize row 1 */
385 dReal dd
= dRecip(Y11
);
387 ptrDElement
[0 * d_stride
] = dd
;
388 dIASSERT(ptrDElement
== d
/* + (sizeint)factorizationRow * d_stride*/);
390 /* factorize row 2 */
392 ptrAElement
[rowSkip
] = q2
;
394 dReal sum
= Y21
* q2
;
395 ptrDElement
[1 * d_stride
] = dRecip(Y22
- sum
);
399 /* solve L*X=B, with B containing 1 right hand sides.
400 * L is an n*n lower triangular matrix with ones on the diagonal.
401 * L is stored by rows and its leading dimension is lskip.
402 * B is an n*1 matrix that contains the right hand sides.
403 * B is stored by columns and its leading dimension is also lskip.
404 * B is overwritten with X.
405 * this processes blocks of 2*2.
406 * if this is in the factorizer source file, n must be a multiple of 2.
409 void solveStripeL1_1(const dReal
*L
, dReal
*B
, unsigned rowCount
, unsigned rowSkip
)
411 dIASSERT(rowCount
!= 0);
412 dIASSERT(rowCount
% 2 == 0);
414 /* compute all 2 x 1 blocks of X */
415 unsigned blockStartRow
= 0;
416 for (bool exitLoop
= false, subsequentPass
= false; !exitLoop
; subsequentPass
= true, exitLoop
= (blockStartRow
+= 2) == rowCount
)
418 const dReal
*ptrLElement
;
421 /* declare variables - Z matrix */
426 ptrLElement
= L
+ (sizeint
)blockStartRow
* rowSkip
;
429 /* set the Z matrix to 0 */
432 /* compute all 2 x 1 block of X, from rows i..i+2-1 */
434 /* the inner loop that computes outer products and adds them to Z */
435 // The iteration starts with even number and decreases it by 2. So, it must end in zero
436 for (unsigned columnCounter
= blockStartRow
; ; )
438 /* declare p and q vectors, etc */
441 /* compute outer product and add it to the Z matrix */
445 p2
= ptrLElement
[rowSkip
];
448 /* compute outer product and add it to the Z matrix */
452 p2
= ptrLElement
[1 + rowSkip
];
455 if (columnCounter
> 6)
459 /* advance pointers */
463 /* compute outer product and add it to the Z matrix */
464 p1
= ptrLElement
[-4];
465 q1
= ptrBElement
[-4];
467 p2
= ptrLElement
[-4 + rowSkip
];
470 /* compute outer product and add it to the Z matrix */
471 p1
= ptrLElement
[-3];
472 q1
= ptrBElement
[-3];
474 p2
= ptrLElement
[-3 + rowSkip
];
477 /* compute outer product and add it to the Z matrix */
478 p1
= ptrLElement
[-2];
479 q1
= ptrBElement
[-2];
481 p2
= ptrLElement
[-2 + rowSkip
];
484 /* compute outer product and add it to the Z matrix */
485 p1
= ptrLElement
[-1];
486 q1
= ptrBElement
[-1];
488 p2
= ptrLElement
[-1 + rowSkip
];
493 /* advance pointers */
497 if ((columnCounter
-= 2) == 0)
502 /* end of inner loop */
507 ptrLElement
= L
/* + (sizeint)blockStartRow * rowSkip*/; dIASSERT(blockStartRow
== 0);
510 /* set the Z matrix to 0 */
514 /* finish computing the X(i) block */
515 dReal p2
= ptrLElement
[rowSkip
];
517 dReal Y11
= ptrBElement
[0] - Z11
;
518 dReal Y21
= ptrBElement
[1] - Z21
- p2
* Y11
;
520 ptrBElement
[0] = Y11
;
521 ptrBElement
[1] = Y21
;
522 /* end of outer loop */
526 template<unsigned int d_stride
>
527 void scaleAndFactorizeL1Stripe_1(dReal
*ARow
, dReal
*d
, unsigned factorizationRow
)
529 dReal
*ptrAElement
= ARow
;
530 dReal
*ptrDElement
= d
;
532 /* scale the elements in a 1 x i block at A(i,0), and also */
533 /* compute Z = the outer product matrix that we'll need. */
534 dReal Z11
= 0, Z22
= 0;
536 for (unsigned columnCounter
= factorizationRow
; ; )
538 dReal p1
, p2
, q1
, q2
, dd1
, dd2
;
542 dd1
= ptrDElement
[0 * d_stride
];
543 dd2
= ptrDElement
[1 * d_stride
];
551 if (columnCounter
> 6)
556 ptrDElement
+= 6 * d_stride
;
558 p1
= ptrAElement
[-4];
559 p2
= ptrAElement
[-3];
560 dd1
= ptrDElement
[-4 * (int)d_stride
];
561 dd2
= ptrDElement
[-3 * (int)d_stride
];
564 ptrAElement
[-4] = q1
;
565 ptrAElement
[-3] = q2
;
569 p1
= ptrAElement
[-2];
570 p2
= ptrAElement
[-1];
571 dd1
= ptrDElement
[-2 * (int)d_stride
];
572 dd2
= ptrDElement
[-1 * (int)d_stride
];
575 ptrAElement
[-2] = q1
;
576 ptrAElement
[-1] = q2
;
583 ptrDElement
+= 2 * d_stride
;
585 if ((columnCounter
-= 2) == 0)
592 dReal Y11
= ptrAElement
[0] - (Z11
+ Z22
);
594 /* solve for diagonal 1 x 1 block at A(i,i) */
595 dIASSERT(ptrDElement
== d
+ (sizeint
)factorizationRow
* d_stride
);
596 /* factorize 1 x 1 block Y, ptrDElement */
597 /* factorize row 1 */
598 ptrDElement
[0 * d_stride
] = dRecip(Y11
);
601 template<unsigned int d_stride
>
602 void scaleAndFactorizeL1FirstRowStripe_1(dReal
*ARow
, dReal
*d
)
604 dReal
*ptrAElement
= ARow
;
605 dReal
*ptrDElement
= d
;
607 dReal Y11
= ptrAElement
[0];
609 /* solve for diagonal 1 x 1 block at A(0,0) */
610 /* factorize 1 x 1 block Y, ptrDElement */
611 /* factorize row 1 */
612 ptrDElement
[0 * d_stride
] = dRecip(Y11
);
618 template<unsigned int block_step
, unsigned int b_rows
>
620 void ThreadedEquationSolverLDLT::participateSolvingL1Stripe_X(const dReal
*L
, dReal
*B
, unsigned blockCount
, unsigned rowSkip
,
621 volatile atomicord32
&refBlockCompletionProgress
/*=0*/, volatile cellindexint
*blockProgressDescriptors
/*=[blockCount]*/,
622 FactorizationSolveL1StripeCellContext
*cellContexts
/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex
)
624 const unsigned lookaheadRange
= 64;
625 BlockProcessingState blockProcessingState
= BPS_NO_BLOCKS_PROCESSED
;
627 unsigned completedBlocks
= refBlockCompletionProgress
;
628 unsigned currentBlock
= completedBlocks
;
629 dIASSERT(completedBlocks
<= blockCount
);
631 for (bool exitLoop
= completedBlocks
== blockCount
; !exitLoop
; exitLoop
= false)
633 bool goForLockedBlockPrimaryCalculation
= false, goForLockedBlockDuplicateCalculation
= false;
634 bool goAssigningTheResult
= false, stayWithinTheBlock
= false;
636 dReal Z
[block_step
][b_rows
];
637 dReal Y
[block_step
][b_rows
];
641 CellContextInstance previousContextInstance
;
642 unsigned completedColumnBlock
;
644 for (cellindexint testDescriptor
= blockProgressDescriptors
[currentBlock
]; ; )
646 if (testDescriptor
== INVALID_CELLDESCRIPTOR
)
648 // Invalid descriptor is the indication that the row has been fully calculated
649 // Test if this was the last row and break out if so.
650 if (currentBlock
+ 1 == blockCount
)
656 // Treat detected row advancement as a row processed
657 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
661 CooperativeAtomics::AtomicReadReorderBarrier();
662 // It is necessary to read up to date completedBblocks value after the descriptor retrieval
663 // as otherwise the logic below breaks
664 completedBlocks
= refBlockCompletionProgress
;
666 if (!GET_CELLDESCRIPTOR_ISLOCKED(testDescriptor
))
668 completedColumnBlock
= GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor
);
669 dIASSERT(completedColumnBlock
< currentBlock
|| (completedColumnBlock
== currentBlock
&& currentBlock
== 0)); // Otherwise, why would the calculation have had stopped if the final column is reachable???
670 dIASSERT(completedColumnBlock
<= completedBlocks
); // Since the descriptor is not locked
672 if (completedColumnBlock
== completedBlocks
&& currentBlock
!= completedBlocks
)
674 dIASSERT(completedBlocks
< currentBlock
);
678 if (CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors
[currentBlock
], testDescriptor
, MARK_CELLDESCRIPTOR_LOCKED(testDescriptor
)))
680 if (completedColumnBlock
!= 0)
682 CellContextInstance contextInstance
= GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor
);
683 previousContextInstance
= contextInstance
;
685 const FactorizationSolveL1StripeCellContext
&sourceContext
= buildBlockContextRef(cellContexts
, currentBlock
, contextInstance
);
686 sourceContext
.loadPrecalculatedZs(Z
);
690 previousContextInstance
= CCI__MIN
;
691 FactorizationSolveL1StripeCellContext::initializePrecalculatedZs(Z
);
694 goForLockedBlockPrimaryCalculation
= true;
698 if (blockProcessingState
!= BPS_COMPETING_FOR_A_BLOCK
)
703 testDescriptor
= blockProgressDescriptors
[currentBlock
];
707 if (blockProcessingState
!= BPS_COMPETING_FOR_A_BLOCK
)
712 cellindexint verificativeDescriptor
;
713 bool verificationFailure
= false;
715 completedColumnBlock
= GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor
);
716 dIASSERT(completedColumnBlock
!= currentBlock
|| currentBlock
== 0); // There is no reason for computations to stop at the very end other than being the initial value at the very first block
718 if (completedColumnBlock
!= 0)
720 CellContextInstance contextInstance
= GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor
);
721 const FactorizationSolveL1StripeCellContext
&sourceContext
= buildBlockContextRef(cellContexts
, currentBlock
, contextInstance
);
722 sourceContext
.loadPrecalculatedZs(Z
);
726 FactorizationSolveL1StripeCellContext::initializePrecalculatedZs(Z
);
729 if (completedColumnBlock
!= 0 && completedColumnBlock
<= currentBlock
)
731 // Make sure the descriptor is re-read after the precalculates
732 CooperativeAtomics::AtomicReadReorderBarrier();
735 if (completedColumnBlock
<= currentBlock
)
737 verificativeDescriptor
= blockProgressDescriptors
[currentBlock
];
738 verificationFailure
= verificativeDescriptor
!= testDescriptor
;
741 if (!verificationFailure
)
743 dIASSERT(completedColumnBlock
<= currentBlock
+ 1);
745 goForLockedBlockDuplicateCalculation
= true;
749 testDescriptor
= verificativeDescriptor
;
758 if (goForLockedBlockPrimaryCalculation
)
760 blockProcessingState
= BPS_SOME_BLOCKS_PROCESSED
;
762 // Declare and assign the variables at the top to not interfere with any branching -- the compiler is going to eliminate them anyway.
763 bool handleComputationTakenOver
= false, rowEndReached
= false;
765 const dReal
*ptrLElement
;
766 unsigned finalColumnBlock
;
768 if (currentBlock
!= 0)
770 /* compute all 2 x 2 block of X, from rows i..i+2-1 */
771 ptrLElement
= L
+ (currentBlock
* rowSkip
+ completedColumnBlock
) * block_step
;
772 ptrBElement
= B
+ completedColumnBlock
* block_step
;
774 /* the inner loop that computes outer products and adds them to Z */
775 finalColumnBlock
= dMACRO_MIN(currentBlock
, completedBlocks
);
776 dIASSERT(completedColumnBlock
!= finalColumnBlock
/* || currentBlock == 0*/);
778 // The iteration starts with even number and decreases it by 2. So, it must end in zero
779 for (unsigned columnCounter
= finalColumnBlock
- completedColumnBlock
; ; )
781 /* declare p and q vectors, etc */
782 dReal p
[block_step
], q
[b_rows
];
784 /* compute outer product and add it to the Z matrix */
785 p
[0] = ptrLElement
[0];
786 q
[0] = ptrBElement
[0];
787 Z
[0][0] += p
[0] * q
[0];
790 q
[1] = ptrBElement
[rowSkip
];
791 Z
[0][1] += p
[0] * q
[1];
793 p
[1] = ptrLElement
[rowSkip
];
794 Z
[1][0] += p
[1] * q
[0];
797 Z
[1][1] += p
[1] * q
[1];
800 /* compute outer product and add it to the Z matrix */
801 p
[0] = ptrLElement
[1];
802 q
[0] = ptrBElement
[1];
803 Z
[0][0] += p
[0] * q
[0];
806 q
[1] = ptrBElement
[1 + rowSkip
];
807 Z
[0][1] += p
[0] * q
[1];
809 p
[1] = ptrLElement
[1 + rowSkip
];
810 Z
[1][0] += p
[1] * q
[0];
813 Z
[1][1] += p
[1] * q
[1];
816 dSASSERT(block_step
== 2);
817 dSASSERT(b_rows
>= 1 && b_rows
<= 2);
819 if (columnCounter
> 2)
821 /* compute outer product and add it to the Z matrix */
822 p
[0] = ptrLElement
[2];
823 q
[0] = ptrBElement
[2];
824 Z
[0][0] += p
[0] * q
[0];
827 q
[1] = ptrBElement
[2 + rowSkip
];
828 Z
[0][1] += p
[0] * q
[1];
830 p
[1] = ptrLElement
[2 + rowSkip
];
831 Z
[1][0] += p
[1] * q
[0];
834 Z
[1][1] += p
[1] * q
[1];
837 /* compute outer product and add it to the Z matrix */
838 p
[0] = ptrLElement
[3];
839 q
[0] = ptrBElement
[3];
840 Z
[0][0] += p
[0] * q
[0];
843 q
[1] = ptrBElement
[3 + rowSkip
];
844 Z
[0][1] += p
[0] * q
[1];
846 p
[1] = ptrLElement
[3 + rowSkip
];
847 Z
[1][0] += p
[1] * q
[0];
850 Z
[1][1] += p
[1] * q
[1];
853 dSASSERT(block_step
== 2);
854 dSASSERT(b_rows
>= 1 && b_rows
<= 2);
856 /* advance pointers */
857 ptrLElement
+= 2 * block_step
;
858 ptrBElement
+= 2 * block_step
;
863 /* advance pointers */
864 ptrLElement
+= block_step
;
865 ptrBElement
+= block_step
;
866 /* end of inner loop */
868 if (--columnCounter
== 0)
870 if (finalColumnBlock
== currentBlock
)
872 rowEndReached
= true;
876 // Take a look if any more rows have been completed...
877 completedBlocks
= refBlockCompletionProgress
;
878 dIASSERT(completedBlocks
>= finalColumnBlock
);
880 if (completedBlocks
== finalColumnBlock
)
885 // ...continue if so.
886 unsigned columnCompletedSoFar
= finalColumnBlock
;
887 finalColumnBlock
= dMACRO_MIN(currentBlock
, completedBlocks
);
888 columnCounter
= finalColumnBlock
- columnCompletedSoFar
;
895 ptrLElement
= L
/* + (currentBlock * rowSkip + completedColumnBlock) * block_step*/;
896 ptrBElement
= B
/* + completedColumnBlock * block_step*/;
898 rowEndReached
= true;
903 // Check whether there is still a need to proceed or if the computation has been taken over by another thread
904 cellindexint oldDescriptor
= MAKE_CELLDESCRIPTOR(completedColumnBlock
, previousContextInstance
, true);
906 if (blockProgressDescriptors
[currentBlock
] == oldDescriptor
)
908 /* finish computing the X(i) block */
909 Y
[0][0] = ptrBElement
[0] - Z
[0][0];
912 Y
[0][1] = ptrBElement
[rowSkip
] - Z
[0][1];
915 dReal p2
= ptrLElement
[rowSkip
];
917 Y
[1][0] = ptrBElement
[1] - Z
[1][0] - p2
* Y
[0][0];
920 Y
[1][1] = ptrBElement
[1 + rowSkip
] - Z
[1][1] - p2
* Y
[0][1];
923 dSASSERT(block_step
== 2);
924 dSASSERT(b_rows
>= 1 && b_rows
<= 2);
926 // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
927 CooperativeAtomics::AtomicReadReorderBarrier();
929 // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
930 // and the computation result is valid.
931 if (blockProgressDescriptors
[currentBlock
] == oldDescriptor
)
933 // Assign the results to the result context (possibly in parallel with other threads
934 // that could and ought to be assigning exactly the same values)
935 FactorizationSolveL1StripeCellContext
&resultContext
= buildResultContextRef(cellContexts
, currentBlock
, blockCount
);
936 resultContext
.storePrecalculatedZs(Y
);
938 // Assign the result assignment progress descriptor
939 cellindexint newDescriptor
= MAKE_CELLDESCRIPTOR(currentBlock
+ 1, CCI__MIN
, true);
940 CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors
[currentBlock
], oldDescriptor
, newDescriptor
); // the result is to be ignored
942 // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
943 goAssigningTheResult
= true;
947 // Otherwise, go on competing for copying the results
948 handleComputationTakenOver
= true;
953 handleComputationTakenOver
= true;
958 // If the final column has not been reached yet, store current values to the context.
959 // Select the other context instance as the previous one might be read by other threads.
960 CellContextInstance nextContextInstance
= buildNextContextInstance(previousContextInstance
);
961 FactorizationSolveL1StripeCellContext
&destinationContext
= buildBlockContextRef(cellContexts
, currentBlock
, nextContextInstance
);
962 destinationContext
.storePrecalculatedZs(Z
);
964 // Unlock the row until more columns can be used
965 cellindexint oldDescriptor
= MAKE_CELLDESCRIPTOR(completedColumnBlock
, previousContextInstance
, true);
966 cellindexint newDescriptor
= MAKE_CELLDESCRIPTOR(finalColumnBlock
, nextContextInstance
, false);
967 // The descriptor might have been updated by a competing thread
968 if (!CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors
[currentBlock
], oldDescriptor
, newDescriptor
))
970 // Adjust the ptrBElement to point to the result area...
971 ptrBElement
= B
+ currentBlock
* block_step
;
972 // ...and go on handling the case
973 handleComputationTakenOver
= true;
977 if (handleComputationTakenOver
)
979 cellindexint existingDescriptor
= blockProgressDescriptors
[currentBlock
];
980 // This can only happen if the row was (has become) the uppermost not fully completed one
981 // and the competing thread is at final stage of calculation (i.e., it has reached the currentBlock column).
982 if (existingDescriptor
!= INVALID_CELLDESCRIPTOR
)
984 // If not fully completed this must be the final stage of the result assignment into the matrix
985 dIASSERT(existingDescriptor
== MAKE_CELLDESCRIPTOR(currentBlock
+ 1, CCI__MIN
, true));
987 // Go on competing copying the result as anyway the block is the topmost not completed one
988 // and since there was competition for it, there is no other work that can be done right now.
989 const FactorizationSolveL1StripeCellContext
&resultContext
= buildResultContextRef(cellContexts
, currentBlock
, blockCount
);
990 resultContext
.loadPrecalculatedZs(Y
);
992 goAssigningTheResult
= true;
996 // everything is over -- just go handling next blocks
1000 else if (goForLockedBlockDuplicateCalculation
)
1002 blockProcessingState
= BPS_SOME_BLOCKS_PROCESSED
;
1004 bool skipToHandlingSubsequentRows
= false, skiptoCopyingResult
= false;
1006 /* declare variables */
1007 const dReal
*ptrLElement
;
1009 if (completedColumnBlock
< currentBlock
)
1011 /* compute all 2 x 2 block of X, from rows i..i+2-1 */
1012 ptrLElement
= L
+ (currentBlock
* rowSkip
+ completedColumnBlock
) * block_step
;
1013 ptrBElement
= B
+ completedColumnBlock
* block_step
;
1015 /* the inner loop that computes outer products and adds them to Z */
1016 // The iteration starts with even number and decreases it by 2. So, it must end in zero
1017 const unsigned finalColumnBlock
= currentBlock
;
1018 dIASSERT(currentBlock
== completedBlocks
); // Why would we be competing for a row otherwise?
1020 unsigned lastCompletedColumn
= completedColumnBlock
;
1021 unsigned columnCounter
= finalColumnBlock
- completedColumnBlock
;
1022 for (bool exitInnerLoop
= false; !exitInnerLoop
; exitInnerLoop
= --columnCounter
== 0)
1024 /* declare p and q vectors, etc */
1025 dReal p
[block_step
], q
[b_rows
];
1027 /* compute outer product and add it to the Z matrix */
1028 p
[0] = ptrLElement
[0];
1029 q
[0] = ptrBElement
[0];
1030 Z
[0][0] += p
[0] * q
[0];
1033 q
[1] = ptrBElement
[rowSkip
];
1034 Z
[0][1] += p
[0] * q
[1];
1036 p
[1] = ptrLElement
[rowSkip
];
1037 Z
[1][0] += p
[1] * q
[0];
1040 Z
[1][1] += p
[1] * q
[1];
1043 /* compute outer product and add it to the Z matrix */
1044 p
[0] = ptrLElement
[1];
1045 q
[0] = ptrBElement
[1];
1046 Z
[0][0] += p
[0] * q
[0];
1049 q
[1] = ptrBElement
[1 + rowSkip
];
1050 Z
[0][1] += p
[0] * q
[1];
1052 p
[1] = ptrLElement
[1 + rowSkip
];
1053 Z
[1][0] += p
[1] * q
[0];
1056 Z
[1][1] += p
[1] * q
[1];
1059 dSASSERT(block_step
== 2);
1060 dSASSERT(b_rows
>= 1 && b_rows
<= 2);
1062 // Check if the primary solver thread has not made any progress
1063 cellindexint descriptorVerification
= blockProgressDescriptors
[currentBlock
];
1064 unsigned newCompletedColumn
= GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification
);
1066 if (newCompletedColumn
!= lastCompletedColumn
)
1068 // Check, this is the first change the current thread detects.
1069 // There is absolutely no reason in code for the computation to stop/resume twice
1070 // while the current thread is competing.
1071 dIASSERT(lastCompletedColumn
== completedColumnBlock
);
1073 if (descriptorVerification
== INVALID_CELLDESCRIPTOR
)
1075 skipToHandlingSubsequentRows
= true;
1079 if (newCompletedColumn
== currentBlock
+ 1)
1081 skiptoCopyingResult
= true;
1085 // Check if the current thread is behind
1086 if (newCompletedColumn
> finalColumnBlock
- columnCounter
)
1088 // If so, go starting over one more time
1089 blockProcessingState
= BPS_COMPETING_FOR_A_BLOCK
;
1090 stayWithinTheBlock
= true;
1091 skipToHandlingSubsequentRows
= true;
1095 // If current thread is ahead, just save new completed column for further comparisons and go on calculating
1096 lastCompletedColumn
= newCompletedColumn
;
1099 /* advance pointers */
1100 ptrLElement
+= block_step
;
1101 ptrBElement
+= block_step
;
1102 /* end of inner loop */
1105 else if (completedColumnBlock
> currentBlock
)
1107 dIASSERT(completedColumnBlock
== currentBlock
+ 1);
1109 skiptoCopyingResult
= true;
1113 dIASSERT(currentBlock
== 0); // Execution can get here within the very first block only
1115 /* assign the pointers appropriately and go on computing the results */
1116 ptrLElement
= L
/* + (currentBlock * rowSkip + completedColumnBlock) * block_step*/;
1117 ptrBElement
= B
/* + completedColumnBlock * block_step*/;
1120 if (!skipToHandlingSubsequentRows
)
1122 if (!skiptoCopyingResult
)
1124 /* finish computing the X(i) block */
1125 Y
[0][0] = ptrBElement
[0] - Z
[0][0];
1128 Y
[0][1] = ptrBElement
[rowSkip
] - Z
[0][1];
1131 dReal p2
= ptrLElement
[rowSkip
];
1133 Y
[1][0] = ptrBElement
[1] - Z
[1][0] - p2
* Y
[0][0];
1136 Y
[1][1] = ptrBElement
[1 + rowSkip
] - Z
[1][1] - p2
* Y
[0][1];
1139 dSASSERT(block_step
== 2);
1140 dSASSERT(b_rows
>= 1 && b_rows
<= 2);
1142 // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
1143 CooperativeAtomics::AtomicReadReorderBarrier();
1145 cellindexint existingDescriptor
= blockProgressDescriptors
[currentBlock
];
1147 if (existingDescriptor
== INVALID_CELLDESCRIPTOR
)
1149 // Everything is over -- proceed to subsequent rows
1150 skipToHandlingSubsequentRows
= true;
1152 else if (existingDescriptor
== MAKE_CELLDESCRIPTOR(currentBlock
+ 1, CCI__MIN
, true))
1154 // The values computed above may not be valid. Copy the values already in the result context.
1155 skiptoCopyingResult
= true;
1159 // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
1160 // and the computation result is valid.
1161 cellindexint newDescriptor
= MAKE_CELLDESCRIPTOR(currentBlock
+ 1, CCI__MIN
, true); // put the computation at the top so that the evaluation result from the expression above is reused
1163 // Assign the results to the result context (possibly in parallel with other threads
1164 // that could and ought to be assigning exactly the same values)
1165 FactorizationSolveL1StripeCellContext
&resultContext
= buildResultContextRef(cellContexts
, currentBlock
, blockCount
);
1166 resultContext
.storePrecalculatedZs(Y
);
1168 // Assign the result assignment progress descriptor
1169 CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors
[currentBlock
], existingDescriptor
, newDescriptor
); // the result is to be ignored
1171 // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
1175 if (!skipToHandlingSubsequentRows
)
1177 if (skiptoCopyingResult
)
1179 // Extract the result values stored in the result context
1180 const FactorizationSolveL1StripeCellContext
&resultContext
= buildResultContextRef(cellContexts
, currentBlock
, blockCount
);
1181 resultContext
.loadPrecalculatedZs(Y
);
1183 ptrBElement
= B
+ currentBlock
* block_step
;
1186 goAssigningTheResult
= true;
1191 if (goAssigningTheResult
)
1193 cellindexint existingDescriptor
= blockProgressDescriptors
[currentBlock
];
1194 // Check if the assignment has not been completed yet
1195 if (existingDescriptor
!= INVALID_CELLDESCRIPTOR
)
1197 // Assign the computation results to their places in the matrix
1198 ptrBElement
[0] = Y
[0][0];
1199 ptrBElement
[1] = Y
[1][0];
1202 ptrBElement
[rowSkip
] = Y
[0][1];
1203 ptrBElement
[1 + rowSkip
] = Y
[1][1];
1206 dSASSERT(block_step
== 2);
1207 dSASSERT(b_rows
>= 1 && b_rows
<= 2);
1209 ThrsafeIncrementIntUpToLimit(&refBlockCompletionProgress
, currentBlock
+ 1);
1210 dIASSERT(refBlockCompletionProgress
>= currentBlock
+ 1);
1212 // And assign the completed status no matter what
1213 CooperativeAtomics::AtomicStoreCellindexint(&blockProgressDescriptors
[currentBlock
], INVALID_CELLDESCRIPTOR
);
1217 // everything is over -- just go handling next blocks
1221 if (!stayWithinTheBlock
)
1223 completedBlocks
= refBlockCompletionProgress
;
1225 if (completedBlocks
== blockCount
)
1232 bool lookaheadBoundaryReached
= false;
1234 if (currentBlock
== blockCount
|| completedBlocks
== 0)
1236 lookaheadBoundaryReached
= true;
1238 else if (currentBlock
>= completedBlocks
+ lookaheadRange
)
1240 lookaheadBoundaryReached
= blockProcessingState
> BPS_NO_BLOCKS_PROCESSED
;
1242 else if (currentBlock
< completedBlocks
)
1244 // Treat detected row advancement as a row processed
1245 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
1247 currentBlock
= completedBlocks
;
1250 if (lookaheadBoundaryReached
)
1252 dIASSERT(blockProcessingState
!= BPS_COMPETING_FOR_A_BLOCK
); // Why did not we compete???
1254 // If no row has been processed in the previous pass, compete for the next row to avoid cycling uselessly
1255 if (blockProcessingState
<= BPS_NO_BLOCKS_PROCESSED
)
1257 // Abandon job if too few blocks remain
1258 if (blockCount
- completedBlocks
<= ownThreadIndex
)
1263 blockProcessingState
= BPS_COMPETING_FOR_A_BLOCK
;
1267 // If there was some progress, just continue to the next pass
1268 blockProcessingState
= BPS_NO_BLOCKS_PROCESSED
;
1271 currentBlock
= completedBlocks
;
1278 template<unsigned int a_rows
, unsigned int d_stride
>
1280 void ThreadedEquationSolverLDLT::participateScalingAndFactorizingL1Stripe_X(dReal
*ARow
, dReal
*d
, unsigned factorizationRow
, unsigned rowSkip
,
1281 FactorizationFactorizeL1StripeContext
*factorizationContext
, unsigned ownThreadIndex
)
1283 dIASSERT(factorizationRow
!= 0);
1284 dIASSERT(factorizationRow
% 2 == 0);
1286 /* scale the elements in a 2 x i block at A(i,0), and also */
1287 /* compute Z = the outer product matrix that we'll need. */
1288 dReal sameZ
[a_rows
] = { REAL(0.0), }, mixedZ
[dMACRO_MAX(a_rows
- 1, 1)] = { REAL(0.0), };
1289 bool doneAnything
= false;
1291 const unsigned blockSize
= deriveScalingAndFactorizingL1StripeBlockSize(a_rows
);
1293 const unsigned blockCount
= deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(factorizationRow
, blockSize
);
1294 dIASSERT(blockCount
!= 0);
1296 unsigned blockIndex
;
1297 while ((blockIndex
= ThrsafeIncrementIntUpToLimit(&factorizationContext
->m_nextColumnIndex
, blockCount
)) != blockCount
)
1299 doneAnything
= true;
1300 unsigned blockStartRow
= blockIndex
* blockSize
;
1302 dReal
*ptrAElement
= ARow
+ blockStartRow
;
1303 dReal
*ptrDElement
= d
+ blockStartRow
* d_stride
;
1304 for (unsigned columnCounter
= blockIndex
!= blockCount
- 1 ? blockSize
: factorizationRow
- blockStartRow
; ; )
1306 dReal p1
, q1
, p2
, q2
, dd
;
1308 p1
= ptrAElement
[0];
1311 p2
= ptrAElement
[rowSkip
];
1313 dd
= ptrDElement
[0 * d_stride
];
1319 ptrAElement
[0] = q1
;
1322 ptrAElement
[rowSkip
] = q2
;
1324 sameZ
[0] += p1
* q1
;
1327 sameZ
[1] += p2
* q2
;
1328 mixedZ
[0] += p2
* q1
;
1331 p1
= ptrAElement
[1];
1334 p2
= ptrAElement
[1 + rowSkip
];
1336 dd
= ptrDElement
[1 * d_stride
];
1342 ptrAElement
[1] = q1
;
1345 ptrAElement
[1 + rowSkip
] = q2
;
1347 sameZ
[0] += p1
* q1
;
1350 sameZ
[1] += p2
* q2
;
1351 mixedZ
[0] += p2
* q1
;
1354 if (columnCounter
> 6)
1359 ptrDElement
+= 6 * d_stride
;
1361 p1
= ptrAElement
[-4];
1364 p2
= ptrAElement
[-4 + rowSkip
];
1366 dd
= ptrDElement
[-4 * (int)d_stride
];
1372 ptrAElement
[-4] = q1
;
1375 ptrAElement
[-4 + rowSkip
] = q2
;
1377 sameZ
[0] += p1
* q1
;
1380 sameZ
[1] += p2
* q2
;
1381 mixedZ
[0] += p2
* q1
;
1384 p1
= ptrAElement
[-3];
1387 p2
= ptrAElement
[-3 + rowSkip
];
1389 dd
= ptrDElement
[-3 * (int)d_stride
];
1395 ptrAElement
[-3] = q1
;
1398 ptrAElement
[-3 + rowSkip
] = q2
;
1400 sameZ
[0] += p1
* q1
;
1403 sameZ
[1] += p2
* q2
;
1404 mixedZ
[0] += p2
* q1
;
1407 p1
= ptrAElement
[-2];
1410 p2
= ptrAElement
[-2 + rowSkip
];
1412 dd
= ptrDElement
[-2 * (int)d_stride
];
1418 ptrAElement
[-2] = q1
;
1421 ptrAElement
[-2 + rowSkip
] = q2
;
1423 sameZ
[0] += p1
* q1
;
1426 sameZ
[1] += p2
* q2
;
1427 mixedZ
[0] += p2
* q1
;
1430 p1
= ptrAElement
[-1];
1433 p2
= ptrAElement
[-1 + rowSkip
];
1435 dd
= ptrDElement
[-1 * (int)d_stride
];
1441 ptrAElement
[-1] = q1
;
1444 ptrAElement
[-1 + rowSkip
] = q2
;
1446 sameZ
[0] += p1
* q1
;
1449 sameZ
[1] += p2
* q2
;
1450 mixedZ
[0] += p2
* q1
;
1456 ptrDElement
+= 2 * d_stride
;
1458 if ((columnCounter
-= 2) == 0)
1468 unsigned partialSumThreadIndex
;
1469 for (bool exitLoop
= false; !exitLoop
; exitLoop
= CooperativeAtomics::AtomicCompareExchangeUint32(&factorizationContext
->m_sumThreadIndex
, partialSumThreadIndex
, ownThreadIndex
+ 1))
1471 partialSumThreadIndex
= factorizationContext
->m_sumThreadIndex
;
1473 if (partialSumThreadIndex
!= 0)
1475 const FactorizationFactorizeL1StripeThreadContext
&partialSumContext
= factorizationContext
->m_threadContexts
[partialSumThreadIndex
- 1];
1476 factorizationContext
->m_threadContexts
[ownThreadIndex
].assignDataSum
<a_rows
>(sameZ
, mixedZ
, partialSumContext
);
1480 factorizationContext
->m_threadContexts
[ownThreadIndex
].assignDataAlone
<a_rows
>(sameZ
, mixedZ
);
1485 unsigned threadExitIndex
= CooperativeAtomics::AtomicDecrementUint32(&factorizationContext
->m_threadsRunning
);
1486 dIASSERT(threadExitIndex
+ 1U != 0);
1488 if (threadExitIndex
== 0)
1490 // Let the last thread retrieve the sum and perform final computations
1491 unsigned sumThreadIndex
= factorizationContext
->m_sumThreadIndex
;
1492 dIASSERT(sumThreadIndex
!= 0); // The rowIndex was asserted to be not zero, so at least one thread must have done something
1494 const FactorizationFactorizeL1StripeThreadContext
&sumContext
= factorizationContext
->m_threadContexts
[sumThreadIndex
- 1];
1495 sumContext
.retrieveData
<a_rows
>(sameZ
, mixedZ
);
1497 dReal
*ptrAElement
= ARow
+ factorizationRow
;
1498 dReal
*ptrDElement
= d
+ factorizationRow
* d_stride
;
1500 /* solve for diagonal 2 x 2 block at A(i,i) */
1501 dReal Y11
, Y21
, Y22
;
1503 Y11
= ptrAElement
[0] - sameZ
[0];
1506 Y21
= ptrAElement
[rowSkip
] - mixedZ
[0];
1507 Y22
= ptrAElement
[1 + rowSkip
] - sameZ
[1];
1510 /* factorize 2 x 2 block Y, ptrDElement */
1511 /* factorize row 1 */
1512 dReal dd
= dRecip(Y11
);
1514 ptrDElement
[0 * d_stride
] = dd
;
1515 dIASSERT(ptrDElement
== d
+ (sizeint
)factorizationRow
* d_stride
);
1519 /* factorize row 2 */
1520 dReal q2
= Y21
* dd
;
1521 ptrAElement
[rowSkip
] = q2
;
1523 dReal sum
= Y21
* q2
;
1524 ptrDElement
[1 * d_stride
] = dRecip(Y22
- sum
);
1530 #endif // #ifndef _ODE_FASTLDLT_IMPL_H_