Cosmetic: Cosmetic code corrections in QuickStep
[ode.git] / ode / src / fastldltfactor_impl.h
blob8f633d3f632873244d63883682208fd1e9f249e6
3 /*************************************************************************
4 * *
5 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
6 * All rights reserved. Email: russ@q12.org Web: www.q12.org *
7 * *
8 * This library is free software; you can redistribute it and/or *
9 * modify it under the terms of EITHER: *
10 * (1) The GNU Lesser General Public License as published by the Free *
11 * Software Foundation; either version 2.1 of the License, or (at *
12 * your option) any later version. The text of the GNU Lesser *
13 * General Public License is included with this library in the *
14 * file LICENSE.TXT. *
15 * (2) The BSD-style license that is included with this library in *
16 * the file LICENSE-BSD.TXT. *
17 * *
18 * This library is distributed in the hope that it will be useful, *
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
21 * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
22 * *
23 *************************************************************************/
26 * Code style improvements and optimizations by Oleh Derevenko ????-2019
27 * LDLT cooperative factorization code of ThreadedEquationSolverLDLT copyright (c) 2017-2019 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
30 #ifndef _ODE_FASTLDLT_IMPL_H_
31 #define _ODE_FASTLDLT_IMPL_H_
34 #include "error.h"
35 #include "common.h"
38 static void solveL1Stripe_2 (const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip);
39 template<unsigned int d_stride>
40 void scaleAndFactorizeL1Stripe_2(dReal *ARow, dReal *d, unsigned rowIndex, unsigned rowSkip);
41 template<unsigned int d_stride>
42 inline void scaleAndFactorizeL1FirstRowStripe_2(dReal *ARow, dReal *d, unsigned rowSkip);
44 static void solveStripeL1_1 (const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip);
45 template<unsigned int d_stride>
46 void scaleAndFactorizeL1Stripe_1(dReal *ARow, dReal *d, unsigned rowIndex);
47 template<unsigned int d_stride>
48 inline void scaleAndFactorizeL1FirstRowStripe_1(dReal *ARow, dReal *d);
51 template<unsigned int d_stride>
52 void factorMatrixAsLDLT(dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip)
54 if (rowCount < 1) return;
56 dReal *ARow = A;
57 unsigned blockStartRow = 0;
59 const unsigned blockStep = 2;
60 const unsigned lastRowIndex = rowCount >= blockStep ? rowCount - blockStep + 1 : 0;
62 /* compute blocks of 2 rows */
63 bool subsequentPass = false;
64 for (; blockStartRow < lastRowIndex; subsequentPass = true, ARow += blockStep * rowSkip, blockStartRow += blockStep)
66 if (subsequentPass)
68 /* solve L*(D*l)=a, l is scaled elements in 2 x i block at A(i,0) */
69 solveL1Stripe_2(A, ARow, blockStartRow, rowSkip);
70 scaleAndFactorizeL1Stripe_2<d_stride>(ARow, d, blockStartRow, rowSkip);
72 else
74 scaleAndFactorizeL1FirstRowStripe_2<d_stride>(ARow, d, rowSkip);
76 dSASSERT(blockStep == 2);
77 /* done factorizing 2 x 2 block */
80 /* compute the (less than 2) rows at the bottom */
81 if (!subsequentPass || blockStartRow == lastRowIndex)
83 dSASSERT(blockStep == 2); // for the blockStartRow == lastRowIndex comparison above
85 if (subsequentPass)
87 solveStripeL1_1(A, ARow, blockStartRow, rowSkip);
88 scaleAndFactorizeL1Stripe_1<d_stride>(ARow, d, blockStartRow);
90 else
92 scaleAndFactorizeL1FirstRowStripe_1<d_stride>(ARow, d);
94 dSASSERT(blockStep == 2);
95 /* done factorizing 1 x 1 block */
99 /* solve L*X=B, with B containing 2 right hand sides.
100 * L is an n*n lower triangular matrix with ones on the diagonal.
101 * L is stored by rows and its leading dimension is rowSkip.
102 * B is an n*2 matrix that contains the right hand sides.
103 * B is stored by columns and its leading dimension is also rowSkip.
104 * B is overwritten with X.
105 * this processes blocks of 2*2.
106 * if this is in the factorizer source file, n must be a multiple of 2.
108 static
109 void solveL1Stripe_2(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
111 dIASSERT(rowCount != 0);
112 dIASSERT(rowCount % 2 == 0);
114 /* compute all 2 x 2 blocks of X */
115 unsigned blockStartRow = 0;
116 for (bool exitLoop = false, subsequentPass = false; !exitLoop; subsequentPass = true, exitLoop = (blockStartRow += 2) == rowCount)
118 const dReal *ptrLElement;
119 dReal *ptrBElement;
121 /* declare variables - Z matrix */
122 dReal Z11, Z12, Z21, Z22;
124 /* compute all 2 x 2 block of X, from rows i..i+2-1 */
125 if (subsequentPass)
127 ptrLElement = L + blockStartRow * rowSkip;
128 ptrBElement = B;
130 /* set Z matrix to 0 */
131 Z11 = 0; Z12 = 0; Z21 = 0; Z22 = 0;
133 /* the inner loop that computes outer products and adds them to Z */
134 // The iteration starts with even number and decreases it by 2. So, it must end in zero
135 for (unsigned columnCounter = blockStartRow; ;)
137 /* declare p and q vectors, etc */
138 dReal p1, q1, p2, q2;
140 /* compute outer product and add it to the Z matrix */
141 p1 = ptrLElement[0];
142 q1 = ptrBElement[0];
143 Z11 += p1 * q1;
144 q2 = ptrBElement[rowSkip];
145 Z12 += p1 * q2;
146 p2 = ptrLElement[rowSkip];
147 Z21 += p2 * q1;
148 Z22 += p2 * q2;
150 /* compute outer product and add it to the Z matrix */
151 p1 = ptrLElement[1];
152 q1 = ptrBElement[1];
153 Z11 += p1 * q1;
154 q2 = ptrBElement[1 + rowSkip];
155 Z12 += p1 * q2;
156 p2 = ptrLElement[1 + rowSkip];
157 Z21 += p2 * q1;
158 Z22 += p2 * q2;
160 if (columnCounter > 6)
162 columnCounter -= 6;
164 /* advance pointers */
165 ptrLElement += 6;
166 ptrBElement += 6;
168 /* compute outer product and add it to the Z matrix */
169 p1 = ptrLElement[-4];
170 q1 = ptrBElement[-4];
171 Z11 += p1 * q1;
172 q2 = ptrBElement[-4 + rowSkip];
173 Z12 += p1 * q2;
174 p2 = ptrLElement[-4 + rowSkip];
175 Z21 += p2 * q1;
176 Z22 += p2 * q2;
178 /* compute outer product and add it to the Z matrix */
179 p1 = ptrLElement[-3];
180 q1 = ptrBElement[-3];
181 Z11 += p1 * q1;
182 q2 = ptrBElement[-3 + rowSkip];
183 Z12 += p1 * q2;
184 p2 = ptrLElement[-3 + rowSkip];
185 Z21 += p2 * q1;
186 Z22 += p2 * q2;
188 /* compute outer product and add it to the Z matrix */
189 p1 = ptrLElement[-2];
190 q1 = ptrBElement[-2];
191 Z11 += p1 * q1;
192 q2 = ptrBElement[-2 + rowSkip];
193 Z12 += p1 * q2;
194 p2 = ptrLElement[-2 + rowSkip];
195 Z21 += p2 * q1;
196 Z22 += p2 * q2;
198 /* compute outer product and add it to the Z matrix */
199 p1 = ptrLElement[-1];
200 q1 = ptrBElement[-1];
201 Z11 += p1 * q1;
202 q2 = ptrBElement[-1 + rowSkip];
203 Z12 += p1 * q2;
204 p2 = ptrLElement[-1 + rowSkip];
205 Z21 += p2 * q1;
206 Z22 += p2 * q2;
208 else
210 /* advance pointers */
211 ptrLElement += 2;
212 ptrBElement += 2;
214 if ((columnCounter -= 2) == 0)
216 break;
219 /* end of inner loop */
222 else
224 ptrLElement = L/* + blockStartRow * rowSkip*/; dIASSERT(blockStartRow == 0);
225 ptrBElement = B;
227 /* set Z matrix to 0 */
228 Z11 = 0; Z12 = 0; Z21 = 0; Z22 = 0;
231 /* finish computing the X(i) block */
233 dReal Y11 = ptrBElement[0] - Z11;
234 dReal Y12 = ptrBElement[rowSkip] - Z12;
236 dReal p2 = ptrLElement[rowSkip];
238 ptrBElement[0] = Y11;
239 ptrBElement[rowSkip] = Y12;
241 dReal Y21 = ptrBElement[1] - Z21 - p2 * Y11;
242 dReal Y22 = ptrBElement[1 + rowSkip] - Z22 - p2 * Y12;
244 ptrBElement[1] = Y21;
245 ptrBElement[1 + rowSkip] = Y22;
246 /* end of outer loop */
250 template<unsigned int d_stride>
251 void scaleAndFactorizeL1Stripe_2(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip)
253 dIASSERT(factorizationRow != 0);
254 dIASSERT(factorizationRow % 2 == 0);
256 dReal *ptrAElement = ARow;
257 dReal *ptrDElement = d;
259 /* scale the elements in a 2 x i block at A(i,0), and also */
260 /* compute Z = the outer product matrix that we'll need. */
261 dReal Z11 = 0, Z21 = 0, Z22 = 0;
263 for (unsigned columnCounter = factorizationRow; ; )
265 dReal p1, q1, p2, q2, dd;
267 p1 = ptrAElement[0];
268 p2 = ptrAElement[rowSkip];
269 dd = ptrDElement[0 * d_stride];
270 q1 = p1 * dd;
271 q2 = p2 * dd;
272 ptrAElement[0] = q1;
273 ptrAElement[rowSkip] = q2;
274 Z11 += p1 * q1;
275 Z21 += p2 * q1;
276 Z22 += p2 * q2;
278 p1 = ptrAElement[1];
279 p2 = ptrAElement[1 + rowSkip];
280 dd = ptrDElement[1 * d_stride];
281 q1 = p1 * dd;
282 q2 = p2 * dd;
283 ptrAElement[1] = q1;
284 ptrAElement[1 + rowSkip] = q2;
285 Z11 += p1 * q1;
286 Z21 += p2 * q1;
287 Z22 += p2 * q2;
289 if (columnCounter > 6)
291 columnCounter -= 6;
293 ptrAElement += 6;
294 ptrDElement += 6 * d_stride;
296 p1 = ptrAElement[-4];
297 p2 = ptrAElement[-4 + rowSkip];
298 dd = ptrDElement[-4 * (int)d_stride];
299 q1 = p1 * dd;
300 q2 = p2 * dd;
301 ptrAElement[-4] = q1;
302 ptrAElement[-4 + rowSkip] = q2;
303 Z11 += p1 * q1;
304 Z21 += p2 * q1;
305 Z22 += p2 * q2;
307 p1 = ptrAElement[-3];
308 p2 = ptrAElement[-3 + rowSkip];
309 dd = ptrDElement[-3 * (int)d_stride];
310 q1 = p1 * dd;
311 q2 = p2 * dd;
312 ptrAElement[-3] = q1;
313 ptrAElement[-3 + rowSkip] = q2;
314 Z11 += p1 * q1;
315 Z21 += p2 * q1;
316 Z22 += p2 * q2;
318 p1 = ptrAElement[-2];
319 p2 = ptrAElement[-2 + rowSkip];
320 dd = ptrDElement[-2 * (int)d_stride];
321 q1 = p1 * dd;
322 q2 = p2 * dd;
323 ptrAElement[-2] = q1;
324 ptrAElement[-2 + rowSkip] = q2;
325 Z11 += p1 * q1;
326 Z21 += p2 * q1;
327 Z22 += p2 * q2;
329 p1 = ptrAElement[-1];
330 p2 = ptrAElement[-1 + rowSkip];
331 dd = ptrDElement[-1 * (int)d_stride];
332 q1 = p1 * dd;
333 q2 = p2 * dd;
334 ptrAElement[-1] = q1;
335 ptrAElement[-1 + rowSkip] = q2;
336 Z11 += p1 * q1;
337 Z21 += p2 * q1;
338 Z22 += p2 * q2;
340 else
342 ptrAElement += 2;
343 ptrDElement += 2 * d_stride;
345 if ((columnCounter -= 2) == 0)
347 break;
352 /* solve for diagonal 2 x 2 block at A(i,i) */
353 dReal Y11 = ptrAElement[0] - Z11;
354 dReal Y21 = ptrAElement[rowSkip] - Z21;
355 dReal Y22 = ptrAElement[1 + rowSkip] - Z22;
357 /* factorize 2 x 2 block Y, ptrDElement */
358 /* factorize row 1 */
359 dReal dd = dRecip(Y11);
361 ptrDElement[0 * d_stride] = dd;
362 dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
364 /* factorize row 2 */
365 dReal q2 = Y21 * dd;
366 ptrAElement[rowSkip] = q2;
368 dReal sum = Y21 * q2;
369 ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
372 template<unsigned int d_stride>
373 void scaleAndFactorizeL1FirstRowStripe_2(dReal *ARow, dReal *d, unsigned rowSkip)
375 dReal *ptrAElement = ARow;
376 dReal *ptrDElement = d;
378 /* solve for diagonal 2 x 2 block at A(0,0) */
379 dReal Y11 = ptrAElement[0]/* - Z11*/;
380 dReal Y21 = ptrAElement[rowSkip]/* - Z21*/;
381 dReal Y22 = ptrAElement[1 + rowSkip]/* - Z22*/;
383 /* factorize 2 x 2 block Y, ptrDElement */
384 /* factorize row 1 */
385 dReal dd = dRecip(Y11);
387 ptrDElement[0 * d_stride] = dd;
388 dIASSERT(ptrDElement == d/* + (sizeint)factorizationRow * d_stride*/);
390 /* factorize row 2 */
391 dReal q2 = Y21 * dd;
392 ptrAElement[rowSkip] = q2;
394 dReal sum = Y21 * q2;
395 ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
399 /* solve L*X=B, with B containing 1 right hand sides.
400 * L is an n*n lower triangular matrix with ones on the diagonal.
401 * L is stored by rows and its leading dimension is lskip.
402 * B is an n*1 matrix that contains the right hand sides.
403 * B is stored by columns and its leading dimension is also lskip.
404 * B is overwritten with X.
405 * this processes blocks of 2*2.
406 * if this is in the factorizer source file, n must be a multiple of 2.
408 static
409 void solveStripeL1_1(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip)
411 dIASSERT(rowCount != 0);
412 dIASSERT(rowCount % 2 == 0);
414 /* compute all 2 x 1 blocks of X */
415 unsigned blockStartRow = 0;
416 for (bool exitLoop = false, subsequentPass = false; !exitLoop; subsequentPass = true, exitLoop = (blockStartRow += 2) == rowCount)
418 const dReal *ptrLElement;
419 dReal *ptrBElement;
421 /* declare variables - Z matrix */
422 dReal Z11, Z21;
424 if (subsequentPass)
426 ptrLElement = L + (sizeint)blockStartRow * rowSkip;
427 ptrBElement = B;
429 /* set the Z matrix to 0 */
430 Z11 = 0; Z21 = 0;
432 /* compute all 2 x 1 block of X, from rows i..i+2-1 */
434 /* the inner loop that computes outer products and adds them to Z */
435 // The iteration starts with even number and decreases it by 2. So, it must end in zero
436 for (unsigned columnCounter = blockStartRow; ; )
438 /* declare p and q vectors, etc */
439 dReal p1, q1, p2;
441 /* compute outer product and add it to the Z matrix */
442 p1 = ptrLElement[0];
443 q1 = ptrBElement[0];
444 Z11 += p1 * q1;
445 p2 = ptrLElement[rowSkip];
446 Z21 += p2 * q1;
448 /* compute outer product and add it to the Z matrix */
449 p1 = ptrLElement[1];
450 q1 = ptrBElement[1];
451 Z11 += p1 * q1;
452 p2 = ptrLElement[1 + rowSkip];
453 Z21 += p2 * q1;
455 if (columnCounter > 6)
457 columnCounter -= 6;
459 /* advance pointers */
460 ptrLElement += 6;
461 ptrBElement += 6;
463 /* compute outer product and add it to the Z matrix */
464 p1 = ptrLElement[-4];
465 q1 = ptrBElement[-4];
466 Z11 += p1 * q1;
467 p2 = ptrLElement[-4 + rowSkip];
468 Z21 += p2 * q1;
470 /* compute outer product and add it to the Z matrix */
471 p1 = ptrLElement[-3];
472 q1 = ptrBElement[-3];
473 Z11 += p1 * q1;
474 p2 = ptrLElement[-3 + rowSkip];
475 Z21 += p2 * q1;
477 /* compute outer product and add it to the Z matrix */
478 p1 = ptrLElement[-2];
479 q1 = ptrBElement[-2];
480 Z11 += p1 * q1;
481 p2 = ptrLElement[-2 + rowSkip];
482 Z21 += p2 * q1;
484 /* compute outer product and add it to the Z matrix */
485 p1 = ptrLElement[-1];
486 q1 = ptrBElement[-1];
487 Z11 += p1 * q1;
488 p2 = ptrLElement[-1 + rowSkip];
489 Z21 += p2 * q1;
491 else
493 /* advance pointers */
494 ptrLElement += 2;
495 ptrBElement += 2;
497 if ((columnCounter -= 2) == 0)
499 break;
502 /* end of inner loop */
505 else
507 ptrLElement = L/* + (sizeint)blockStartRow * rowSkip*/; dIASSERT(blockStartRow == 0);
508 ptrBElement = B;
510 /* set the Z matrix to 0 */
511 Z11 = 0; Z21 = 0;
514 /* finish computing the X(i) block */
515 dReal p2 = ptrLElement[rowSkip];
517 dReal Y11 = ptrBElement[0] - Z11;
518 dReal Y21 = ptrBElement[1] - Z21 - p2 * Y11;
520 ptrBElement[0] = Y11;
521 ptrBElement[1] = Y21;
522 /* end of outer loop */
526 template<unsigned int d_stride>
527 void scaleAndFactorizeL1Stripe_1(dReal *ARow, dReal *d, unsigned factorizationRow)
529 dReal *ptrAElement = ARow;
530 dReal *ptrDElement = d;
532 /* scale the elements in a 1 x i block at A(i,0), and also */
533 /* compute Z = the outer product matrix that we'll need. */
534 dReal Z11 = 0, Z22 = 0;
536 for (unsigned columnCounter = factorizationRow; ; )
538 dReal p1, p2, q1, q2, dd1, dd2;
540 p1 = ptrAElement[0];
541 p2 = ptrAElement[1];
542 dd1 = ptrDElement[0 * d_stride];
543 dd2 = ptrDElement[1 * d_stride];
544 q1 = p1 * dd1;
545 q2 = p2 * dd2;
546 ptrAElement[0] = q1;
547 ptrAElement[1] = q2;
548 Z11 += p1 * q1;
549 Z22 += p2 * q2;
551 if (columnCounter > 6)
553 columnCounter -= 6;
555 ptrAElement += 6;
556 ptrDElement += 6 * d_stride;
558 p1 = ptrAElement[-4];
559 p2 = ptrAElement[-3];
560 dd1 = ptrDElement[-4 * (int)d_stride];
561 dd2 = ptrDElement[-3 * (int)d_stride];
562 q1 = p1 * dd1;
563 q2 = p2 * dd2;
564 ptrAElement[-4] = q1;
565 ptrAElement[-3] = q2;
566 Z11 += p1 * q1;
567 Z22 += p2 * q2;
569 p1 = ptrAElement[-2];
570 p2 = ptrAElement[-1];
571 dd1 = ptrDElement[-2 * (int)d_stride];
572 dd2 = ptrDElement[-1 * (int)d_stride];
573 q1 = p1 * dd1;
574 q2 = p2 * dd2;
575 ptrAElement[-2] = q1;
576 ptrAElement[-1] = q2;
577 Z11 += p1 * q1;
578 Z22 += p2 * q2;
580 else
582 ptrAElement += 2;
583 ptrDElement += 2 * d_stride;
585 if ((columnCounter -= 2) == 0)
587 break;
592 dReal Y11 = ptrAElement[0] - (Z11 + Z22);
594 /* solve for diagonal 1 x 1 block at A(i,i) */
595 dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
596 /* factorize 1 x 1 block Y, ptrDElement */
597 /* factorize row 1 */
598 ptrDElement[0 * d_stride] = dRecip(Y11);
601 template<unsigned int d_stride>
602 void scaleAndFactorizeL1FirstRowStripe_1(dReal *ARow, dReal *d)
604 dReal *ptrAElement = ARow;
605 dReal *ptrDElement = d;
607 dReal Y11 = ptrAElement[0];
609 /* solve for diagonal 1 x 1 block at A(0,0) */
610 /* factorize 1 x 1 block Y, ptrDElement */
611 /* factorize row 1 */
612 ptrDElement[0 * d_stride] = dRecip(Y11);
618 template<unsigned int block_step, unsigned int b_rows>
619 /*static */
620 void ThreadedEquationSolverLDLT::participateSolvingL1Stripe_X(const dReal *L, dReal *B, unsigned blockCount, unsigned rowSkip,
621 volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
622 FactorizationSolveL1StripeCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex)
624 const unsigned lookaheadRange = 64;
625 BlockProcessingState blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
627 unsigned completedBlocks = refBlockCompletionProgress;
628 unsigned currentBlock = completedBlocks;
629 dIASSERT(completedBlocks <= blockCount);
631 for (bool exitLoop = completedBlocks == blockCount; !exitLoop; exitLoop = false)
633 bool goForLockedBlockPrimaryCalculation = false, goForLockedBlockDuplicateCalculation = false;
634 bool goAssigningTheResult = false, stayWithinTheBlock = false;
636 dReal Z[block_step][b_rows];
637 dReal Y[block_step][b_rows];
639 dReal *ptrBElement;
641 CellContextInstance previousContextInstance;
642 unsigned completedColumnBlock;
644 for (cellindexint testDescriptor = blockProgressDescriptors[currentBlock]; ; )
646 if (testDescriptor == INVALID_CELLDESCRIPTOR)
648 // Invalid descriptor is the indication that the row has been fully calculated
649 // Test if this was the last row and break out if so.
650 if (currentBlock + 1 == blockCount)
652 exitLoop = true;
653 break;
656 // Treat detected row advancement as a row processed
657 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
658 break;
661 CooperativeAtomics::AtomicReadReorderBarrier();
662 // It is necessary to read up to date completedBblocks value after the descriptor retrieval
663 // as otherwise the logic below breaks
664 completedBlocks = refBlockCompletionProgress;
666 if (!GET_CELLDESCRIPTOR_ISLOCKED(testDescriptor))
668 completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
669 dIASSERT(completedColumnBlock < currentBlock || (completedColumnBlock == currentBlock && currentBlock == 0)); // Otherwise, why would the calculation have had stopped if the final column is reachable???
670 dIASSERT(completedColumnBlock <= completedBlocks); // Since the descriptor is not locked
672 if (completedColumnBlock == completedBlocks && currentBlock != completedBlocks)
674 dIASSERT(completedBlocks < currentBlock);
675 break;
678 if (CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], testDescriptor, MARK_CELLDESCRIPTOR_LOCKED(testDescriptor)))
680 if (completedColumnBlock != 0)
682 CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
683 previousContextInstance = contextInstance;
685 const FactorizationSolveL1StripeCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
686 sourceContext.loadPrecalculatedZs(Z);
688 else
690 previousContextInstance = CCI__MIN;
691 FactorizationSolveL1StripeCellContext::initializePrecalculatedZs(Z);
694 goForLockedBlockPrimaryCalculation = true;
695 break;
698 if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
700 break;
703 testDescriptor = blockProgressDescriptors[currentBlock];
705 else
707 if (blockProcessingState != BPS_COMPETING_FOR_A_BLOCK)
709 break;
712 cellindexint verificativeDescriptor;
713 bool verificationFailure = false;
715 completedColumnBlock = GET_CELLDESCRIPTOR_COLUMNINDEX(testDescriptor);
716 dIASSERT(completedColumnBlock != currentBlock || currentBlock == 0); // There is no reason for computations to stop at the very end other than being the initial value at the very first block
718 if (completedColumnBlock != 0)
720 CellContextInstance contextInstance = GET_CELLDESCRIPTOR_CONTEXTINSTANCE(testDescriptor);
721 const FactorizationSolveL1StripeCellContext &sourceContext = buildBlockContextRef(cellContexts, currentBlock, contextInstance);
722 sourceContext.loadPrecalculatedZs(Z);
724 else
726 FactorizationSolveL1StripeCellContext::initializePrecalculatedZs(Z);
729 if (completedColumnBlock != 0 && completedColumnBlock <= currentBlock)
731 // Make sure the descriptor is re-read after the precalculates
732 CooperativeAtomics::AtomicReadReorderBarrier();
735 if (completedColumnBlock <= currentBlock)
737 verificativeDescriptor = blockProgressDescriptors[currentBlock];
738 verificationFailure = verificativeDescriptor != testDescriptor;
741 if (!verificationFailure)
743 dIASSERT(completedColumnBlock <= currentBlock + 1);
745 goForLockedBlockDuplicateCalculation = true;
746 break;
749 testDescriptor = verificativeDescriptor;
753 if (exitLoop)
755 break;
758 if (goForLockedBlockPrimaryCalculation)
760 blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
762 // Declare and assign the variables at the top to not interfere with any branching -- the compiler is going to eliminate them anyway.
763 bool handleComputationTakenOver = false, rowEndReached = false;
765 const dReal *ptrLElement;
766 unsigned finalColumnBlock;
768 if (currentBlock != 0)
770 /* compute all 2 x 2 block of X, from rows i..i+2-1 */
771 ptrLElement = L + (currentBlock * rowSkip + completedColumnBlock) * block_step;
772 ptrBElement = B + completedColumnBlock * block_step;
774 /* the inner loop that computes outer products and adds them to Z */
775 finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
776 dIASSERT(completedColumnBlock != finalColumnBlock/* || currentBlock == 0*/);
778 // The iteration starts with even number and decreases it by 2. So, it must end in zero
779 for (unsigned columnCounter = finalColumnBlock - completedColumnBlock; ; )
781 /* declare p and q vectors, etc */
782 dReal p[block_step], q[b_rows];
784 /* compute outer product and add it to the Z matrix */
785 p[0] = ptrLElement[0];
786 q[0] = ptrBElement[0];
787 Z[0][0] += p[0] * q[0];
788 if (b_rows >= 2)
790 q[1] = ptrBElement[rowSkip];
791 Z[0][1] += p[0] * q[1];
793 p[1] = ptrLElement[rowSkip];
794 Z[1][0] += p[1] * q[0];
795 if (b_rows >= 2)
797 Z[1][1] += p[1] * q[1];
800 /* compute outer product and add it to the Z matrix */
801 p[0] = ptrLElement[1];
802 q[0] = ptrBElement[1];
803 Z[0][0] += p[0] * q[0];
804 if (b_rows >= 2)
806 q[1] = ptrBElement[1 + rowSkip];
807 Z[0][1] += p[0] * q[1];
809 p[1] = ptrLElement[1 + rowSkip];
810 Z[1][0] += p[1] * q[0];
811 if (b_rows >= 2)
813 Z[1][1] += p[1] * q[1];
816 dSASSERT(block_step == 2);
817 dSASSERT(b_rows >= 1 && b_rows <= 2);
819 if (columnCounter > 2)
821 /* compute outer product and add it to the Z matrix */
822 p[0] = ptrLElement[2];
823 q[0] = ptrBElement[2];
824 Z[0][0] += p[0] * q[0];
825 if (b_rows >= 2)
827 q[1] = ptrBElement[2 + rowSkip];
828 Z[0][1] += p[0] * q[1];
830 p[1] = ptrLElement[2 + rowSkip];
831 Z[1][0] += p[1] * q[0];
832 if (b_rows >= 2)
834 Z[1][1] += p[1] * q[1];
837 /* compute outer product and add it to the Z matrix */
838 p[0] = ptrLElement[3];
839 q[0] = ptrBElement[3];
840 Z[0][0] += p[0] * q[0];
841 if (b_rows >= 2)
843 q[1] = ptrBElement[3 + rowSkip];
844 Z[0][1] += p[0] * q[1];
846 p[1] = ptrLElement[3 + rowSkip];
847 Z[1][0] += p[1] * q[0];
848 if (b_rows >= 2)
850 Z[1][1] += p[1] * q[1];
853 dSASSERT(block_step == 2);
854 dSASSERT(b_rows >= 1 && b_rows <= 2);
856 /* advance pointers */
857 ptrLElement += 2 * block_step;
858 ptrBElement += 2 * block_step;
859 columnCounter -= 2;
861 else
863 /* advance pointers */
864 ptrLElement += block_step;
865 ptrBElement += block_step;
866 /* end of inner loop */
868 if (--columnCounter == 0)
870 if (finalColumnBlock == currentBlock)
872 rowEndReached = true;
873 break;
876 // Take a look if any more rows have been completed...
877 completedBlocks = refBlockCompletionProgress;
878 dIASSERT(completedBlocks >= finalColumnBlock);
880 if (completedBlocks == finalColumnBlock)
882 break;
885 // ...continue if so.
886 unsigned columnCompletedSoFar = finalColumnBlock;
887 finalColumnBlock = dMACRO_MIN(currentBlock, completedBlocks);
888 columnCounter = finalColumnBlock - columnCompletedSoFar;
893 else
895 ptrLElement = L/* + (currentBlock * rowSkip + completedColumnBlock) * block_step*/;
896 ptrBElement = B/* + completedColumnBlock * block_step*/;
898 rowEndReached = true;
901 if (rowEndReached)
903 // Check whether there is still a need to proceed or if the computation has been taken over by another thread
904 cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
906 if (blockProgressDescriptors[currentBlock] == oldDescriptor)
908 /* finish computing the X(i) block */
909 Y[0][0] = ptrBElement[0] - Z[0][0];
910 if (b_rows >= 2)
912 Y[0][1] = ptrBElement[rowSkip] - Z[0][1];
915 dReal p2 = ptrLElement[rowSkip];
917 Y[1][0] = ptrBElement[1] - Z[1][0] - p2 * Y[0][0];
918 if (b_rows >= 2)
920 Y[1][1] = ptrBElement[1 + rowSkip] - Z[1][1] - p2 * Y[0][1];
923 dSASSERT(block_step == 2);
924 dSASSERT(b_rows >= 1 && b_rows <= 2);
926 // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
927 CooperativeAtomics::AtomicReadReorderBarrier();
929 // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
930 // and the computation result is valid.
931 if (blockProgressDescriptors[currentBlock] == oldDescriptor)
933 // Assign the results to the result context (possibly in parallel with other threads
934 // that could and ought to be assigning exactly the same values)
935 FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
936 resultContext.storePrecalculatedZs(Y);
938 // Assign the result assignment progress descriptor
939 cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true);
940 CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor); // the result is to be ignored
942 // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
943 goAssigningTheResult = true;
945 else
947 // Otherwise, go on competing for copying the results
948 handleComputationTakenOver = true;
951 else
953 handleComputationTakenOver = true;
956 else
958 // If the final column has not been reached yet, store current values to the context.
959 // Select the other context instance as the previous one might be read by other threads.
960 CellContextInstance nextContextInstance = buildNextContextInstance(previousContextInstance);
961 FactorizationSolveL1StripeCellContext &destinationContext = buildBlockContextRef(cellContexts, currentBlock, nextContextInstance);
962 destinationContext.storePrecalculatedZs(Z);
964 // Unlock the row until more columns can be used
965 cellindexint oldDescriptor = MAKE_CELLDESCRIPTOR(completedColumnBlock, previousContextInstance, true);
966 cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(finalColumnBlock, nextContextInstance, false);
967 // The descriptor might have been updated by a competing thread
968 if (!CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], oldDescriptor, newDescriptor))
970 // Adjust the ptrBElement to point to the result area...
971 ptrBElement = B + currentBlock * block_step;
972 // ...and go on handling the case
973 handleComputationTakenOver = true;
977 if (handleComputationTakenOver)
979 cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
980 // This can only happen if the row was (has become) the uppermost not fully completed one
981 // and the competing thread is at final stage of calculation (i.e., it has reached the currentBlock column).
982 if (existingDescriptor != INVALID_CELLDESCRIPTOR)
984 // If not fully completed this must be the final stage of the result assignment into the matrix
985 dIASSERT(existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true));
987 // Go on competing copying the result as anyway the block is the topmost not completed one
988 // and since there was competition for it, there is no other work that can be done right now.
989 const FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
990 resultContext.loadPrecalculatedZs(Y);
992 goAssigningTheResult = true;
994 else
996 // everything is over -- just go handling next blocks
1000 else if (goForLockedBlockDuplicateCalculation)
1002 blockProcessingState = BPS_SOME_BLOCKS_PROCESSED;
1004 bool skipToHandlingSubsequentRows = false, skiptoCopyingResult = false;
1006 /* declare variables */
1007 const dReal *ptrLElement;
1009 if (completedColumnBlock < currentBlock)
1011 /* compute all 2 x 2 block of X, from rows i..i+2-1 */
1012 ptrLElement = L + (currentBlock * rowSkip + completedColumnBlock) * block_step;
1013 ptrBElement = B + completedColumnBlock * block_step;
1015 /* the inner loop that computes outer products and adds them to Z */
1016 // The iteration starts with even number and decreases it by 2. So, it must end in zero
1017 const unsigned finalColumnBlock = currentBlock;
1018 dIASSERT(currentBlock == completedBlocks); // Why would we be competing for a row otherwise?
1020 unsigned lastCompletedColumn = completedColumnBlock;
1021 unsigned columnCounter = finalColumnBlock - completedColumnBlock;
1022 for (bool exitInnerLoop = false; !exitInnerLoop; exitInnerLoop = --columnCounter == 0)
1024 /* declare p and q vectors, etc */
1025 dReal p[block_step], q[b_rows];
1027 /* compute outer product and add it to the Z matrix */
1028 p[0] = ptrLElement[0];
1029 q[0] = ptrBElement[0];
1030 Z[0][0] += p[0] * q[0];
1031 if (b_rows >= 2)
1033 q[1] = ptrBElement[rowSkip];
1034 Z[0][1] += p[0] * q[1];
1036 p[1] = ptrLElement[rowSkip];
1037 Z[1][0] += p[1] * q[0];
1038 if (b_rows >= 2)
1040 Z[1][1] += p[1] * q[1];
1043 /* compute outer product and add it to the Z matrix */
1044 p[0] = ptrLElement[1];
1045 q[0] = ptrBElement[1];
1046 Z[0][0] += p[0] * q[0];
1047 if (b_rows >= 2)
1049 q[1] = ptrBElement[1 + rowSkip];
1050 Z[0][1] += p[0] * q[1];
1052 p[1] = ptrLElement[1 + rowSkip];
1053 Z[1][0] += p[1] * q[0];
1054 if (b_rows >= 2)
1056 Z[1][1] += p[1] * q[1];
1059 dSASSERT(block_step == 2);
1060 dSASSERT(b_rows >= 1 && b_rows <= 2);
1062 // Check if the primary solver thread has not made any progress
1063 cellindexint descriptorVerification = blockProgressDescriptors[currentBlock];
1064 unsigned newCompletedColumn = GET_CELLDESCRIPTOR_COLUMNINDEX(descriptorVerification);
1066 if (newCompletedColumn != lastCompletedColumn)
1068 // Check, this is the first change the current thread detects.
1069 // There is absolutely no reason in code for the computation to stop/resume twice
1070 // while the current thread is competing.
1071 dIASSERT(lastCompletedColumn == completedColumnBlock);
1073 if (descriptorVerification == INVALID_CELLDESCRIPTOR)
1075 skipToHandlingSubsequentRows = true;
1076 break;
1079 if (newCompletedColumn == currentBlock + 1)
1081 skiptoCopyingResult = true;
1082 break;
1085 // Check if the current thread is behind
1086 if (newCompletedColumn > finalColumnBlock - columnCounter)
1088 // If so, go starting over one more time
1089 blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
1090 stayWithinTheBlock = true;
1091 skipToHandlingSubsequentRows = true;
1092 break;
1095 // If current thread is ahead, just save new completed column for further comparisons and go on calculating
1096 lastCompletedColumn = newCompletedColumn;
1099 /* advance pointers */
1100 ptrLElement += block_step;
1101 ptrBElement += block_step;
1102 /* end of inner loop */
1105 else if (completedColumnBlock > currentBlock)
1107 dIASSERT(completedColumnBlock == currentBlock + 1);
1109 skiptoCopyingResult = true;
1111 else
1113 dIASSERT(currentBlock == 0); // Execution can get here within the very first block only
1115 /* assign the pointers appropriately and go on computing the results */
1116 ptrLElement = L/* + (currentBlock * rowSkip + completedColumnBlock) * block_step*/;
1117 ptrBElement = B/* + completedColumnBlock * block_step*/;
1120 if (!skipToHandlingSubsequentRows)
1122 if (!skiptoCopyingResult)
1124 /* finish computing the X(i) block */
1125 Y[0][0] = ptrBElement[0] - Z[0][0];
1126 if (b_rows >= 2)
1128 Y[0][1] = ptrBElement[rowSkip] - Z[0][1];
1131 dReal p2 = ptrLElement[rowSkip];
1133 Y[1][0] = ptrBElement[1] - Z[1][0] - p2 * Y[0][0];
1134 if (b_rows >= 2)
1136 Y[1][1] = ptrBElement[1 + rowSkip] - Z[1][1] - p2 * Y[0][1];
1139 dSASSERT(block_step == 2);
1140 dSASSERT(b_rows >= 1 && b_rows <= 2);
1142 // Use atomic memory barrier to make sure memory reads of ptrBElement[] and blockProgressDescriptors[] are not swapped
1143 CooperativeAtomics::AtomicReadReorderBarrier();
1145 cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
1147 if (existingDescriptor == INVALID_CELLDESCRIPTOR)
1149 // Everything is over -- proceed to subsequent rows
1150 skipToHandlingSubsequentRows = true;
1152 else if (existingDescriptor == MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true))
1154 // The values computed above may not be valid. Copy the values already in the result context.
1155 skiptoCopyingResult = true;
1157 else
1159 // The descriptor has not been altered yet - this means the ptrBElement[] values used above were not modified yet
1160 // and the computation result is valid.
1161 cellindexint newDescriptor = MAKE_CELLDESCRIPTOR(currentBlock + 1, CCI__MIN, true); // put the computation at the top so that the evaluation result from the expression above is reused
1163 // Assign the results to the result context (possibly in parallel with other threads
1164 // that could and ought to be assigning exactly the same values)
1165 FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
1166 resultContext.storePrecalculatedZs(Y);
1168 // Assign the result assignment progress descriptor
1169 CooperativeAtomics::AtomicCompareExchangeCellindexint(&blockProgressDescriptors[currentBlock], existingDescriptor, newDescriptor); // the result is to be ignored
1171 // Whether succeeded or not, the result is valid, so go on trying to assign it to the matrix
1175 if (!skipToHandlingSubsequentRows)
1177 if (skiptoCopyingResult)
1179 // Extract the result values stored in the result context
1180 const FactorizationSolveL1StripeCellContext &resultContext = buildResultContextRef(cellContexts, currentBlock, blockCount);
1181 resultContext.loadPrecalculatedZs(Y);
1183 ptrBElement = B + currentBlock * block_step;
1186 goAssigningTheResult = true;
1191 if (goAssigningTheResult)
1193 cellindexint existingDescriptor = blockProgressDescriptors[currentBlock];
1194 // Check if the assignment has not been completed yet
1195 if (existingDescriptor != INVALID_CELLDESCRIPTOR)
1197 // Assign the computation results to their places in the matrix
1198 ptrBElement[0] = Y[0][0];
1199 ptrBElement[1] = Y[1][0];
1200 if (b_rows >= 2)
1202 ptrBElement[rowSkip] = Y[0][1];
1203 ptrBElement[1 + rowSkip] = Y[1][1];
1206 dSASSERT(block_step == 2);
1207 dSASSERT(b_rows >= 1 && b_rows <= 2);
1209 ThrsafeIncrementIntUpToLimit(&refBlockCompletionProgress, currentBlock + 1);
1210 dIASSERT(refBlockCompletionProgress >= currentBlock + 1);
1212 // And assign the completed status no matter what
1213 CooperativeAtomics::AtomicStoreCellindexint(&blockProgressDescriptors[currentBlock], INVALID_CELLDESCRIPTOR);
1215 else
1217 // everything is over -- just go handling next blocks
1221 if (!stayWithinTheBlock)
1223 completedBlocks = refBlockCompletionProgress;
1225 if (completedBlocks == blockCount)
1227 break;
1230 currentBlock += 1;
1232 bool lookaheadBoundaryReached = false;
1234 if (currentBlock == blockCount || completedBlocks == 0)
1236 lookaheadBoundaryReached = true;
1238 else if (currentBlock >= completedBlocks + lookaheadRange)
1240 lookaheadBoundaryReached = blockProcessingState > BPS_NO_BLOCKS_PROCESSED;
1242 else if (currentBlock < completedBlocks)
1244 // Treat detected row advancement as a row processed
1245 // blockProcessingState = BPS_SOME_BLOCKS_PROCESSED; <-- performs better without it
1247 currentBlock = completedBlocks;
1250 if (lookaheadBoundaryReached)
1252 dIASSERT(blockProcessingState != BPS_COMPETING_FOR_A_BLOCK); // Why did not we compete???
1254 // If no row has been processed in the previous pass, compete for the next row to avoid cycling uselessly
1255 if (blockProcessingState <= BPS_NO_BLOCKS_PROCESSED)
1257 // Abandon job if too few blocks remain
1258 if (blockCount - completedBlocks <= ownThreadIndex)
1260 break;
1263 blockProcessingState = BPS_COMPETING_FOR_A_BLOCK;
1265 else
1267 // If there was some progress, just continue to the next pass
1268 blockProcessingState = BPS_NO_BLOCKS_PROCESSED;
1271 currentBlock = completedBlocks;
1278 template<unsigned int a_rows, unsigned int d_stride>
1279 /*static */
1280 void ThreadedEquationSolverLDLT::participateScalingAndFactorizingL1Stripe_X(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip,
1281 FactorizationFactorizeL1StripeContext *factorizationContext, unsigned ownThreadIndex)
1283 dIASSERT(factorizationRow != 0);
1284 dIASSERT(factorizationRow % 2 == 0);
1286 /* scale the elements in a 2 x i block at A(i,0), and also */
1287 /* compute Z = the outer product matrix that we'll need. */
1288 dReal sameZ[a_rows] = { REAL(0.0), }, mixedZ[dMACRO_MAX(a_rows - 1, 1)] = { REAL(0.0), };
1289 bool doneAnything = false;
1291 const unsigned blockSize = deriveScalingAndFactorizingL1StripeBlockSize(a_rows);
1293 const unsigned blockCount = deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(factorizationRow, blockSize);
1294 dIASSERT(blockCount != 0);
1296 unsigned blockIndex;
1297 while ((blockIndex = ThrsafeIncrementIntUpToLimit(&factorizationContext->m_nextColumnIndex, blockCount)) != blockCount)
1299 doneAnything = true;
1300 unsigned blockStartRow = blockIndex * blockSize;
1302 dReal *ptrAElement = ARow + blockStartRow;
1303 dReal *ptrDElement = d + blockStartRow * d_stride;
1304 for (unsigned columnCounter = blockIndex != blockCount - 1 ? blockSize : factorizationRow - blockStartRow; ; )
1306 dReal p1, q1, p2, q2, dd;
1308 p1 = ptrAElement[0];
1309 if (a_rows >= 2)
1311 p2 = ptrAElement[rowSkip];
1313 dd = ptrDElement[0 * d_stride];
1314 q1 = p1 * dd;
1315 if (a_rows >= 2)
1317 q2 = p2 * dd;
1319 ptrAElement[0] = q1;
1320 if (a_rows >= 2)
1322 ptrAElement[rowSkip] = q2;
1324 sameZ[0] += p1 * q1;
1325 if (a_rows >= 2)
1327 sameZ[1] += p2 * q2;
1328 mixedZ[0] += p2 * q1;
1331 p1 = ptrAElement[1];
1332 if (a_rows >= 2)
1334 p2 = ptrAElement[1 + rowSkip];
1336 dd = ptrDElement[1 * d_stride];
1337 q1 = p1 * dd;
1338 if (a_rows >= 2)
1340 q2 = p2 * dd;
1342 ptrAElement[1] = q1;
1343 if (a_rows >= 2)
1345 ptrAElement[1 + rowSkip] = q2;
1347 sameZ[0] += p1 * q1;
1348 if (a_rows >= 2)
1350 sameZ[1] += p2 * q2;
1351 mixedZ[0] += p2 * q1;
1354 if (columnCounter > 6)
1356 columnCounter -= 6;
1358 ptrAElement += 6;
1359 ptrDElement += 6 * d_stride;
1361 p1 = ptrAElement[-4];
1362 if (a_rows >= 2)
1364 p2 = ptrAElement[-4 + rowSkip];
1366 dd = ptrDElement[-4 * (int)d_stride];
1367 q1 = p1 * dd;
1368 if (a_rows >= 2)
1370 q2 = p2 * dd;
1372 ptrAElement[-4] = q1;
1373 if (a_rows >= 2)
1375 ptrAElement[-4 + rowSkip] = q2;
1377 sameZ[0] += p1 * q1;
1378 if (a_rows >= 2)
1380 sameZ[1] += p2 * q2;
1381 mixedZ[0] += p2 * q1;
1384 p1 = ptrAElement[-3];
1385 if (a_rows >= 2)
1387 p2 = ptrAElement[-3 + rowSkip];
1389 dd = ptrDElement[-3 * (int)d_stride];
1390 q1 = p1 * dd;
1391 if (a_rows >= 2)
1393 q2 = p2 * dd;
1395 ptrAElement[-3] = q1;
1396 if (a_rows >= 2)
1398 ptrAElement[-3 + rowSkip] = q2;
1400 sameZ[0] += p1 * q1;
1401 if (a_rows >= 2)
1403 sameZ[1] += p2 * q2;
1404 mixedZ[0] += p2 * q1;
1407 p1 = ptrAElement[-2];
1408 if (a_rows >= 2)
1410 p2 = ptrAElement[-2 + rowSkip];
1412 dd = ptrDElement[-2 * (int)d_stride];
1413 q1 = p1 * dd;
1414 if (a_rows >= 2)
1416 q2 = p2 * dd;
1418 ptrAElement[-2] = q1;
1419 if (a_rows >= 2)
1421 ptrAElement[-2 + rowSkip] = q2;
1423 sameZ[0] += p1 * q1;
1424 if (a_rows >= 2)
1426 sameZ[1] += p2 * q2;
1427 mixedZ[0] += p2 * q1;
1430 p1 = ptrAElement[-1];
1431 if (a_rows >= 2)
1433 p2 = ptrAElement[-1 + rowSkip];
1435 dd = ptrDElement[-1 * (int)d_stride];
1436 q1 = p1 * dd;
1437 if (a_rows >= 2)
1439 q2 = p2 * dd;
1441 ptrAElement[-1] = q1;
1442 if (a_rows >= 2)
1444 ptrAElement[-1 + rowSkip] = q2;
1446 sameZ[0] += p1 * q1;
1447 if (a_rows >= 2)
1449 sameZ[1] += p2 * q2;
1450 mixedZ[0] += p2 * q1;
1453 else
1455 ptrAElement += 2;
1456 ptrDElement += 2 * d_stride;
1458 if ((columnCounter -= 2) == 0)
1460 break;
1466 if (doneAnything)
1468 unsigned partialSumThreadIndex;
1469 for (bool exitLoop = false; !exitLoop; exitLoop = CooperativeAtomics::AtomicCompareExchangeUint32(&factorizationContext->m_sumThreadIndex, partialSumThreadIndex, ownThreadIndex + 1))
1471 partialSumThreadIndex = factorizationContext->m_sumThreadIndex;
1473 if (partialSumThreadIndex != 0)
1475 const FactorizationFactorizeL1StripeThreadContext &partialSumContext = factorizationContext->m_threadContexts[partialSumThreadIndex - 1];
1476 factorizationContext->m_threadContexts[ownThreadIndex].assignDataSum<a_rows>(sameZ, mixedZ, partialSumContext);
1478 else
1480 factorizationContext->m_threadContexts[ownThreadIndex].assignDataAlone<a_rows>(sameZ, mixedZ);
1485 unsigned threadExitIndex = CooperativeAtomics::AtomicDecrementUint32(&factorizationContext->m_threadsRunning);
1486 dIASSERT(threadExitIndex + 1U != 0);
1488 if (threadExitIndex == 0)
1490 // Let the last thread retrieve the sum and perform final computations
1491 unsigned sumThreadIndex = factorizationContext->m_sumThreadIndex;
1492 dIASSERT(sumThreadIndex != 0); // The rowIndex was asserted to be not zero, so at least one thread must have done something
1494 const FactorizationFactorizeL1StripeThreadContext &sumContext = factorizationContext->m_threadContexts[sumThreadIndex - 1];
1495 sumContext.retrieveData<a_rows>(sameZ, mixedZ);
1497 dReal *ptrAElement = ARow + factorizationRow;
1498 dReal *ptrDElement = d + factorizationRow * d_stride;
1500 /* solve for diagonal 2 x 2 block at A(i,i) */
1501 dReal Y11, Y21, Y22;
1503 Y11 = ptrAElement[0] - sameZ[0];
1504 if (a_rows >= 2)
1506 Y21 = ptrAElement[rowSkip] - mixedZ[0];
1507 Y22 = ptrAElement[1 + rowSkip] - sameZ[1];
1510 /* factorize 2 x 2 block Y, ptrDElement */
1511 /* factorize row 1 */
1512 dReal dd = dRecip(Y11);
1514 ptrDElement[0 * d_stride] = dd;
1515 dIASSERT(ptrDElement == d + (sizeint)factorizationRow * d_stride);
1517 if (a_rows >= 2)
1519 /* factorize row 2 */
1520 dReal q2 = Y21 * dd;
1521 ptrAElement[rowSkip] = q2;
1523 dReal sum = Y21 * q2;
1524 ptrDElement[1 * d_stride] = dRecip(Y22 - sum);
1530 #endif // #ifndef _ODE_FASTLDLT_IMPL_H_