1 /*************************************************************************
3 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
4 * All rights reserved. Email: russ@q12.org Web: www.q12.org *
6 * This library is free software; you can redistribute it and/or *
7 * modify it under the terms of EITHER: *
8 * (1) The GNU Lesser General Public License as published by the Free *
9 * Software Foundation; either version 2.1 of the License, or (at *
10 * your option) any later version. The text of the GNU Lesser *
11 * General Public License is included with this library in the *
12 * file LICENSE.TXT. *
13 * (2) The BSD-style license that is included with this library in *
14 * the file LICENSE-BSD.TXT. *
16 * This library is distributed in the hope that it will be useful, *
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
19 * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
21 *************************************************************************/
24 * Equation System Threaded Solver
25 * Copyright (c) 2017-2024 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
30 #ifndef _ODE_THREADED_SOLVER_LDLT_H_
31 #define _ODE_THREADED_SOLVER_LDLT_H_
34 #include "coop_matrix_types.h"
35 #include <ode/threading.h>
// Forward declarations: the declarations below use these types only through pointers,
// so their full definitions are not needed in this header.
38 class dxThreadingBase
;
39 class dxResourceRequirementDescriptor
;
40 class dxRequiredResourceContainer
;
43 class ThreadedEquationSolverLDLT
// Declaration only; the definition is not visible in this excerpt.
// Presumably records in summaryRequirementsDescriptor what cooperativelyFactorLDLT() needs
// for the given thread and row counts -- confirm against the implementation.
46 static void estimateCooperativeFactoringLDLTResourceRequirements(dxResourceRequirementDescriptor
*summaryRequirementsDescriptor
,
47 unsigned allowedThreadCount
, unsigned rowCount
);
// Declaration only. A and d are non-const pointers, so the callee may write through both.
// Presumably factors the matrix A (rowCount rows, rowSkip row stride) as L*D*L^T using up to
// allowedThreadCount threads -- confirm against the implementation.
48 static void cooperativelyFactorLDLT(dxRequiredResourceContainer
*resourceContainer
, unsigned allowedThreadCount
,
49 dReal
*A
, dReal
*d
, unsigned rowCount
, unsigned rowSkip
);
// Declaration only; the definition is not visible in this excerpt.
// Presumably the resource estimate matching cooperativelySolveL1Straight() -- confirm.
51 static void estimateCooperativeSolvingL1StraightResourceRequirements(dxResourceRequirementDescriptor
*summaryRequirementsDescriptor
,
52 unsigned allowedThreadCount
, unsigned rowCount
);
// Declaration only. L is read-only (const) while b is writable, so b is the in/out operand.
// Presumably solves L*x = b in place for a unit lower-triangular L -- confirm against the implementation.
53 static void cooperativelySolveL1Straight(dxRequiredResourceContainer
*resourceContainer
, unsigned allowedThreadCount
,
54 const dReal
*L
, dReal
*b
, unsigned rowCount
, unsigned rowSkip
);
// Declaration only; the definition is not visible in this excerpt.
// Presumably the resource estimate matching cooperativelySolveL1Transposed() -- confirm.
56 static void estimateCooperativeSolvingL1TransposedResourceRequirements(dxResourceRequirementDescriptor
*summaryRequirementsDescriptor
,
57 unsigned allowedThreadCount
, unsigned rowCount
);
// Declaration only. L is read-only (const) while b is writable, so b is the in/out operand.
// Presumably solves L^T*x = b in place -- confirm against the implementation.
58 static void cooperativelySolveL1Transposed(dxRequiredResourceContainer
*resourceContainer
, unsigned allowedThreadCount
,
59 const dReal
*L
, dReal
*b
, unsigned rowCount
, unsigned rowSkip
);
// Declaration only; the definition is not visible in this excerpt.
// Presumably the resource estimate matching cooperativelyScaleVector() -- confirm.
61 static void estimateCooperativeScalingVectorResourceRequirements(dxResourceRequirementDescriptor
*summaryRequirementsDescriptor
,
62 unsigned allowedThreadCount
, unsigned elementCount
);
// Declaration only. vectorData is writable and scaleData is read-only (const).
// Presumably scales each of the elementCount entries of vectorData by the matching scaleData entry -- confirm.
63 static void cooperativelyScaleVector(dxRequiredResourceContainer
*resourceContainer
, unsigned allowedThreadCount
,
64 dReal
*vectorData
, const dReal
*scaleData
, unsigned elementCount
);
// Declaration only; the definition is not visible in this excerpt.
// Presumably the resource estimate matching cooperativelySolveLDLT() -- confirm.
66 static void estimateCooperativeSolvingLDLTResourceRequirements(dxResourceRequirementDescriptor
*summaryRequirementsDescriptor
,
67 unsigned allowedThreadCount
, unsigned rowCount
);
// Declaration only. L and d are read-only (const); b is the only writable operand.
// Presumably solves (L*D*L^T)*x = b in place using a previously computed factorization -- confirm.
68 static void cooperativelySolveLDLT(dxRequiredResourceContainer
*resourceContainer
, unsigned allowedThreadCount
,
69 const dReal
*L
, const dReal
*d
, dReal
*b
, unsigned rowCount
, unsigned rowSkip
);
74 ALLOCATION_DEFAULT_ALIGNMENT
= COOP_THREAD_DATA_ALIGNMENT_SIZE
,
// Forward declarations of helper structs whose definitions appear further down in this file.
78 struct FactorizationSolveL1StripeCellContext
;
79 struct FactorizationFactorizeL1StripeThreadContext
;
84 FLDLT_COOPERATIVE_BLOCK_COUNT_MINIMUM
= 5,
88 FSL1S_REGULAR_B_ROWS
= FSL1S_BLOCK_SIZE
,
89 FSL1S_FINAL_B_ROWS
= 1,
91 FFL1S_REGULAR_A_ROWS
= FSL1S_BLOCK_SIZE
,
92 FFL1S_FINAL_A_ROWS
= 1,
93 FFL1S_REGULAR_BLOCK_SIZE
= 16, // A suitable by magnitude number being a power of 2 and (naturally) not being divisible by 6
94 FFL1S_FINAL_BLOCK_SIZE
= 32, // A suitable by magnitude number being a power of 2 and (naturally) not being divisible by 6
// Declaration only; the definition is not visible in this excerpt.
// Returns an unsigned thread count; presumably the allowedThreadCount clamped to what is
// worthwhile for rowCount rows -- confirm against the implementation.
97 static unsigned restrictFactoringLDLTAllowedThreadCount(
98 dxThreadingBase
*threading
, unsigned allowedThreadCount
, unsigned rowCount
);
// Declaration only. Presumably the worker behind the public estimate call, invoked once the
// thread count has been validated -- confirm.
99 static void doEstimateCooperativeFactoringLDLTResourceRequirementsValidated(
100 dxResourceRequirementDescriptor
*summaryRequirementsDescriptor
,
101 unsigned allowedThreadCount
, unsigned rowCount
);
// Declaration only. Takes the same arguments as cooperativelyFactorLDLT(); presumably its
// implementation for an already validated thread count -- confirm.
102 static void doCooperativelyFactorLDLTValidated(
103 dxRequiredResourceContainer
*resourceContainer
, unsigned allowedThreadCount
,
104 dReal
*A
, dReal
*d
, unsigned rowCount
, unsigned rowSkip
);
// Returns how many blocks of blockStep rows are needed to cover rowCount rows,
// i.e. rowCount / blockStep rounded up.
// The quotient-plus-remainder form gives the same result as
// (rowCount + blockStep - 1) / blockStep but cannot wrap around when rowCount
// is close to the maximum unsigned value.
// blockStep must be non-zero.
static unsigned deriveSolvingL1StripeBlockCount(unsigned rowCount, unsigned blockStep)
{
    return rowCount / blockStep + (rowCount % blockStep != 0 ? 1U : 0U);
}
112 struct FactorizationSolvingL1StripeMemoryEstimates
114 void assignData(sizeint descriptorSizeRequired
, sizeint contextSizeRequired
)
116 m_descriptorSizeRequired
= descriptorSizeRequired
;
117 m_contextSizeRequired
= contextSizeRequired
;
120 sizeint m_descriptorSizeRequired
;
121 sizeint m_contextSizeRequired
;
124 static unsigned deriveSolvingL1StripeThreadCount(unsigned blockCount
, unsigned allowedThreadCount
)
126 dIASSERT(allowedThreadCount
>= 1);
128 unsigned maximumCount
= blockCount
/ 2;
129 return maximumCount
>= allowedThreadCount
? allowedThreadCount
: dMACRO_MAX(maximumCount
, 1U);
132 static sizeint
estimateCooperativelySolvingL1Stripe_XMemoryRequirement(unsigned blockCount
,
133 FactorizationSolvingL1StripeMemoryEstimates
&ref_memoryEstimates
)
135 sizeint descriptorSizeRequired
= dOVERALIGNED_SIZE(sizeof(cellindexint
) * blockCount
, COOP_THREAD_DATA_ALIGNMENT_SIZE
);
136 sizeint contextSizeRequired
= dOVERALIGNED_SIZE(sizeof(FactorizationSolveL1StripeCellContext
) * (CCI__MAX
+ 1) * blockCount
, COOP_THREAD_DATA_ALIGNMENT_SIZE
);
137 ref_memoryEstimates
.assignData(descriptorSizeRequired
, contextSizeRequired
);
139 sizeint totalSizeRequired
= descriptorSizeRequired
+ contextSizeRequired
;
140 return totalSizeRequired
;
143 static void *markCooperativelySolvingL1Stripe_XMemoryStructuresOut(void *buffer
,
144 const FactorizationSolvingL1StripeMemoryEstimates
&memoryEstimates
,
145 cellindexint
*&out_blockProgressDescriptors
, FactorizationSolveL1StripeCellContext
*&out_cellContexts
)
147 void *currentLocation
= buffer
;
149 out_blockProgressDescriptors
= (cellindexint
*)currentLocation
; currentLocation
= (uint8
*)currentLocation
+ memoryEstimates
.m_descriptorSizeRequired
;
150 out_cellContexts
= (FactorizationSolveL1StripeCellContext
*)currentLocation
; currentLocation
= (uint8
*)currentLocation
+ memoryEstimates
.m_contextSizeRequired
;
152 return currentLocation
;
155 static void initializeCooperativelySolvingL1Stripe_XMemoryStructures(unsigned blockCount
,
156 atomicord32
&out_blockCompletionProgress
, cellindexint
*blockProgressDescriptors
, FactorizationSolveL1StripeCellContext
*dUNUSED(cellContexts
))
158 out_blockCompletionProgress
= 0;
159 memset(blockProgressDescriptors
, 0, blockCount
* sizeof(*blockProgressDescriptors
));
// Declaration only; the definition is not visible in this excerpt.
// Per-thread entry point for stripe solving (ownThreadIndex identifies the caller).
// The inline annotations state what the caller must provide: a progress counter starting at 0,
// blockCount progress descriptors, and (CCI__MAX + 1) x blockCount cell contexts.
162 template<unsigned int block_step
, unsigned int b_rows
>
163 static void participateSolvingL1Stripe_X(const dReal
*L
, dReal
*B
, unsigned blockCount
, unsigned rowSkip
,
164 volatile atomicord32
&refBlockCompletionProgress
/*=0*/, volatile cellindexint
*blockProgressDescriptors
/*=[blockCount]*/,
165 FactorizationSolveL1StripeCellContext
*cellContexts
/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex
);
167 static unsigned deriveScalingAndFactorizingL1StripeBlockCountFromSolvingBlockIndex(unsigned solvingBlockIndex
, unsigned solvingBlockStep
, unsigned blockARows
)
169 unsigned factorizingBlockSize
= deriveScalingAndFactorizingL1StripeBlockSize(blockARows
);
170 return deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(solvingBlockIndex
* solvingBlockStep
, factorizingBlockSize
);
// Returns how many blocks of factorizationBlockSize elements are needed to cover
// factorizationRowIndex elements, i.e. the quotient rounded up.
// The quotient-plus-remainder form gives the same result as
// (factorizationRowIndex + factorizationBlockSize - 1) / factorizationBlockSize but cannot
// wrap around for values close to the maximum unsigned value.
// factorizationBlockSize must be non-zero.
static unsigned deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(unsigned factorizationRowIndex, unsigned factorizationBlockSize)
{
    return factorizationRowIndex / factorizationBlockSize
        + (factorizationRowIndex % factorizationBlockSize != 0 ? 1U : 0U);
}
// Selects the factorizing block size for the given number of A rows:
// FFL1S_FINAL_BLOCK_SIZE when blockARows is 1, FFL1S_REGULAR_BLOCK_SIZE otherwise.
// blockARows is asserted to be 1 or 2.
// NOTE(review): the return statement is not visible in this excerpt; presumably `result`
// is returned -- confirm against the full source.
178 static unsigned deriveScalingAndFactorizingL1StripeBlockSize(unsigned blockARows
)
180 unsigned result
= blockARows
!= 1 ? FFL1S_REGULAR_BLOCK_SIZE
: FFL1S_FINAL_BLOCK_SIZE
;
181 dIASSERT(blockARows
>= 1 && blockARows
<= 2);
187 static unsigned deriveScalingAndFactorizingL1StripeThreadCount(unsigned blockCount
, unsigned allowedThreadCount
)
189 dIASSERT(blockCount
!= 0);
190 dIASSERT(allowedThreadCount
>= 1);
192 return dMACRO_MIN(blockCount
, allowedThreadCount
);
// Forward declaration; the struct is defined further down in this file.
195 struct FactorizationFactorizeL1StripeContext
;
197 struct FactorizationScalingAndFactorizingL1StripeMemoryEstimates
199 void assignData(sizeint contextSizeRequired
)
201 m_contextSizeRequired
= contextSizeRequired
;
204 sizeint m_contextSizeRequired
;
207 static sizeint
estimateCooperativelyScalingAndFactorizingL1Stripe_XMemoryRequirement(unsigned factorizingMaximumThreads
,
208 FactorizationScalingAndFactorizingL1StripeMemoryEstimates
&ref_memoryEstimates
)
210 dIASSERT(factorizingMaximumThreads
!= 0);
212 sizeint contextSizeRequired
= dOVERALIGNED_SIZE(sizeof(FactorizationFactorizeL1StripeContext
) + sizeof(FactorizationFactorizeL1StripeThreadContext
) * (factorizingMaximumThreads
- 1), COOP_THREAD_DATA_ALIGNMENT_SIZE
);
213 ref_memoryEstimates
.assignData(contextSizeRequired
);
215 sizeint totalSizeRequired
= contextSizeRequired
;
216 return totalSizeRequired
;
219 static void *markCooperativelyScalingAndFactorizingL1Stripe_XMemoryStructuresOut(void *buffer
,
220 const FactorizationScalingAndFactorizingL1StripeMemoryEstimates
&memoryEstimates
, FactorizationFactorizeL1StripeContext
*&out_factorizationContext
)
222 void *currentLocation
= buffer
;
224 out_factorizationContext
= (FactorizationFactorizeL1StripeContext
*)currentLocation
; currentLocation
= (uint8
*)currentLocation
+ memoryEstimates
.m_contextSizeRequired
;
226 return currentLocation
;
229 static void initializeCooperativelyScalingAndFactorizingL1Stripe_XMemoryStructures(
230 FactorizationFactorizeL1StripeContext
*factorizationContext
, unsigned threadCount
)
232 factorizationContext
->initialize(threadCount
);
// Declaration only; the definition is not visible in this excerpt.
// Per-thread entry point for the scaling-and-factorizing step (ownThreadIndex identifies
// the caller); ARow and d are writable, and the threads share factorizationContext.
236 template<unsigned int a_rows
, unsigned int d_stride
>
237 static void participateScalingAndFactorizingL1Stripe_X(dReal
*ARow
, dReal
*d
, unsigned factorizationRow
, unsigned rowSkip
,
238 FactorizationFactorizeL1StripeContext
*factorizationContext
, unsigned ownThreadIndex
);
// State shared by the cooperative LDLT factorization steps; the factotLDLT_* functions
// declared below receive it by reference.
// NOTE(review): parts of this struct are not visible in this excerpt. The initializer list
// shows no initialization of m_ARow, m_d or m_rowSkip, and the declarations of m_ARow,
// m_rowCount and m_rowSkip are absent, yet they are used below -- confirm against the full source.
241 struct FactorLDLTWorkerContext
// Copies the arguments into the corresponding members; the solving block index starts at 0.
// ref_solvingBlockCompletionProgress is bound to a reference member, so the referenced
// counter must outlive this context.
243 FactorLDLTWorkerContext(dxThreadingBase
*threading
, unsigned allowedThreadCount
,
244 dReal
*A
, dReal
*d
, unsigned totalBlockCount
, unsigned rowCount
, unsigned rowSkip
,
245 atomicord32
&ref_solvingBlockCompletionProgress
, cellindexint
*solvingBlockProgressDescriptors
,
246 FactorizationSolveL1StripeCellContext
*solvingCellContexts
,
247 FactorizationFactorizeL1StripeContext
*factorizingFactorizationContext
,
248 dCallReleaseeID calculationFinishReleasee
):
249 m_threading(threading
),
250 m_allowedThreadCount(allowedThreadCount
),
254 m_solvingBlockIndex(0),
255 m_totalBlockCount(totalBlockCount
),
256 m_rowCount(rowCount
),
258 m_refSolvingBlockCompletionProgress(ref_solvingBlockCompletionProgress
),
259 m_solvingBlockProgressDescriptors(solvingBlockProgressDescriptors
),
260 m_solvingCellContexts(solvingCellContexts
),
261 m_factorizingFactorizationContext(factorizingFactorizationContext
),
262 m_calculationFinishReleasee(calculationFinishReleasee
)
// Moves the context on to the next solving block: advances m_ARow by
// FSL1S_BLOCK_SIZE * m_rowSkip and increments m_solvingBlockIndex.
266 void incrementForNextBlock()
268 const unsigned blockStep
= FSL1S_BLOCK_SIZE
;
270 m_ARow
+= blockStep
* m_rowSkip
;
271 m_solvingBlockIndex
+= 1;
274 dxThreadingBase
*m_threading
;
275 unsigned m_allowedThreadCount
;
// Index of the solving block currently being processed (starts at 0, see the constructor).
279 unsigned m_solvingBlockIndex
;
280 unsigned m_totalBlockCount
;
// Reference member: refers to a counter owned by the caller of the constructor.
283 atomicord32
&m_refSolvingBlockCompletionProgress
;
284 cellindexint
*m_solvingBlockProgressDescriptors
;
285 FactorizationSolveL1StripeCellContext
*m_solvingCellContexts
;
286 FactorizationFactorizeL1StripeContext
*m_factorizingFactorizationContext
;
287 dCallReleaseeID m_calculationFinishReleasee
;
// Declarations only; none of the definitions are visible in this excerpt.
// Each *_callback takes an opaque context pointer, a call instance index and a releasee;
// each is paired with a typed function taking FactorLDLTWorkerContext by reference.
// Presumably the callback unpacks callContext and forwards to the typed function -- confirm.
// NOTE(review): "factot" looks like a misspelling of "factor"; renaming would also have to
// touch the definitions and every use, so it is only flagged here.

// Solving step for a regular block (per thread).
290 static int factotLDLT_solvingComplete_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
291 static void factotLDLT_solvingComplete(FactorLDLTWorkerContext
&ref_context
, unsigned ownThreadIndex
);
// Synchronization point after the regular solving step (no thread index).
293 static int factotLDLT_solvingCompleteSync_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
294 static void factotLDLT_solvingCompleteSync(FactorLDLTWorkerContext
&ref_workerContext
);
// Scaling-and-factorizing step for a regular block (per thread).
296 static int factotLDLT_scalingAndFactorizingComplete_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
297 static void factotLDLT_scalingAndFactorizingComplete(FactorLDLTWorkerContext
&ref_workerContext
, unsigned ownThreadIndex
);
// Synchronization point after the regular scaling-and-factorizing step (no thread index).
299 static int factotLDLT_scalingAndFactorizingCompleteSync_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
300 static void factotLDLT_scalingAndFactorizingCompleteSync(FactorLDLTWorkerContext
&ref_workerContext
);
// Solving step for the final block (per thread).
302 static int factotLDLT_solvingFinal_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
303 static void factotLDLT_solvingFinal(FactorLDLTWorkerContext
&ref_context
, unsigned ownThreadIndex
);
// Synchronization point after the final solving step (no thread index).
305 static int factotLDLT_solvingFinalSync_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
306 static void factotLDLT_solvingFinalSync(FactorLDLTWorkerContext
&ref_workerContext
);
// Scaling-and-factorizing step for the final block (per thread).
308 static int factotLDLT_scalingAndFactorizingFinal_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
309 static void factotLDLT_scalingAndFactorizingFinal(FactorLDLTWorkerContext
&ref_workerContext
, unsigned ownThreadIndex
);
// Completion callback; it has no typed counterpart among these declarations.
311 static int factotLDLT_completion_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
// Per-cell storage for stripe solving: an FSL1S_BLOCK_SIZE x FSL1S_REGULAR_B_ROWS array (m_c)
// plus helpers that initialize, load and store a caller-side Z array of block_step x b_rows.
// NOTE(review): the statements that actually move data are not visible in this excerpt;
// only the static assertions of each helper are. The descriptions below are inferred from
// the names and signatures -- confirm against the full source.
314 struct FactorizationSolveL1StripeCellContext
// Static helper operating on the caller's Z only (presumably resets it).
// The assertions restrict it to block_step == 2 and b_rows of 1 or 2.
316 template<unsigned int block_step
, unsigned int b_rows
>
317 static void initializePrecalculatedZs(dReal (&Z
)[block_step
][b_rows
])
329 dSASSERT(block_step
== 2);
330 dSASSERT(b_rows
>= 1 && b_rows
<= 2);
// Const member taking a writable Z (presumably copies m_c into Z).
// The first two assertions guarantee that Z is not larger than m_c in either dimension.
333 template<unsigned int block_step
, unsigned int b_rows
>
334 void loadPrecalculatedZs(dReal (&Z
)[block_step
][b_rows
]) const
336 dSASSERT(block_step
<= dARRAY_SIZE(m_c
));
337 dSASSERT(b_rows
<= dARRAY_SIZE(m_c
[0]));
349 dSASSERT(block_step
== 2);
350 dSASSERT(b_rows
>= 1 && b_rows
<= 2);
// Non-const member taking a read-only Z (presumably copies Z into m_c).
// The first two assertions guarantee that Z fits into m_c in both dimensions.
353 template<unsigned int block_step
, unsigned int b_rows
>
354 void storePrecalculatedZs(const dReal (&Z
)[block_step
][b_rows
])
356 dSASSERT(block_step
<= dARRAY_SIZE(m_c
));
357 dSASSERT(b_rows
<= dARRAY_SIZE(m_c
[0]));
369 dSASSERT(block_step
== 2);
370 dSASSERT(b_rows
>= 1 && b_rows
<= 2);
// Backing storage for the helpers above.
373 dReal m_c
[FSL1S_BLOCK_SIZE
][FSL1S_REGULAR_B_ROWS
];
374 // dReal m_reserved[4];
377 static FactorizationSolveL1StripeCellContext
&buildBlockContextRef(FactorizationSolveL1StripeCellContext
*cellContexts
, unsigned blockIndex
, CellContextInstance contextInstance
)
379 return cellContexts
[blockIndex
* CCI__MAX
+ contextInstance
];
382 static FactorizationSolveL1StripeCellContext
&buildResultContextRef(FactorizationSolveL1StripeCellContext
*cellContexts
, unsigned blockIndex
, unsigned blockCount
)
384 return cellContexts
[blockCount
* CCI__MAX
+ blockIndex
];
// Per-thread partial results of the factorizing step: "same" and "mixed" Z sums.
// NOTE(review): some lines of the member templates are not visible in this excerpt.
// The accesses to index 1 and to the mixed value are presumably guarded by a check that
// a_rows is 2 -- confirm against the full source.
388 struct FactorizationFactorizeL1StripeThreadContext
// Stores the element-wise sum of the supplied values and those held by partialSumContext.
390 template<unsigned int a_rows
>
391 void assignDataSum(const dReal (&sameZ
)[a_rows
], const dReal (&mixedZ
)[dMACRO_MAX(a_rows
- 1, 1)],
392 const FactorizationFactorizeL1StripeThreadContext
&partialSumContext
)
394 m_sameZ
[0] = sameZ
[0] + partialSumContext
.m_sameZ
[0];
397 m_sameZ
[1] = sameZ
[1] + partialSumContext
.m_sameZ
[1];
398 m_mixedZ
[0] = mixedZ
[0] + partialSumContext
.m_mixedZ
[0];
// Stores the supplied values as they are, with nothing added.
402 template<unsigned int a_rows
>
403 void assignDataAlone(const dReal (&sameZ
)[a_rows
], const dReal (&mixedZ
)[dMACRO_MAX(a_rows
- 1, 1)])
405 m_sameZ
[0] = sameZ
[0];
408 m_sameZ
[1] = sameZ
[1];
409 m_mixedZ
[0] = mixedZ
[0];
// Copies the stored values out to the caller; a_rows is asserted to be 1 or 2.
413 template<unsigned int a_rows
>
414 void retrieveData(dReal (&out_sameZ
)[a_rows
], dReal (&out_mixedZ
)[dMACRO_MAX(a_rows
- 1, 1)]) const
416 out_sameZ
[0] = m_sameZ
[0];
419 out_sameZ
[1] = m_sameZ
[1];
420 out_mixedZ
[0] = m_mixedZ
[0];
422 dAASSERT(a_rows
>= 1 && a_rows
<= 2);
// Storage is sized for the regular (largest) row count.
425 dReal m_sameZ
[FFL1S_REGULAR_A_ROWS
];
426 dReal m_mixedZ
[dMACRO_MAX(FFL1S_REGULAR_A_ROWS
- 1, 1)];
427 dReal m_reserved
[1]; // [5]; // for alignment
430 struct FactorizationFactorizeL1StripeContext
432 void initialize(unsigned threadCount
)
434 m_threadsRunning
= threadCount
;
435 m_nextColumnIndex
= 0;
436 m_sumThreadIndex
= 0;
439 atomicord32 m_threadsRunning
;
440 atomicord32 m_nextColumnIndex
;
441 volatile atomicord32 m_sumThreadIndex
;
442 atomicord32 m_reserved
[1]; // [13]; // for alignment
443 FactorizationFactorizeL1StripeThreadContext m_threadContexts
[1]; // =[threadCount]
// Forward declaration; the struct is defined further down in this file.
447 struct SolveL1StraightCellContext
;
451 SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM
= 8,
// Declaration only; the definition is not visible in this excerpt.
// Returns an unsigned thread count; presumably the allowedThreadCount clamped to what is
// worthwhile for rowCount rows -- confirm against the implementation.
457 static unsigned restrictSolvingL1StraightAllowedThreadCount(
458 dxThreadingBase
*threading
, unsigned allowedThreadCount
, unsigned rowCount
);
// Declaration only. Presumably the worker behind the public estimate call, invoked once the
// thread count has been validated -- confirm.
459 static void doEstimateCooperativeSolvingL1StraightResourceRequirementsValidated(
460 dxResourceRequirementDescriptor
*summaryRequirementsDescriptor
,
461 unsigned allowedThreadCount
, unsigned rowCount
);
// Declaration only. Takes the same arguments as cooperativelySolveL1Straight(); presumably
// its implementation for an already validated thread count -- confirm.
462 static void doCooperativelySolveL1StraightValidated(
463 dxRequiredResourceContainer
*resourceContainer
, unsigned allowedThreadCount
,
464 const dReal
*L
, dReal
*b
, unsigned rowCount
, unsigned rowSkip
);
// Returns how many blocks of blockStep rows are needed to cover rowCount rows,
// i.e. rowCount / blockStep rounded up.
// The quotient-plus-remainder form gives the same result as
// (rowCount + blockStep - 1) / blockStep but cannot wrap around when rowCount
// is close to the maximum unsigned value.
// blockStep must be non-zero.
static unsigned deriveSolvingL1StraightBlockCount(unsigned rowCount, unsigned blockStep)
{
    return rowCount / blockStep + (rowCount % blockStep != 0 ? 1U : 0U);
}
471 struct SolvingL1StraightMemoryEstimates
473 void assignData(sizeint descriptorSizeRequired
, sizeint contextSizeRequired
)
475 m_descriptorSizeRequired
= descriptorSizeRequired
;
476 m_contextSizeRequired
= contextSizeRequired
;
479 sizeint m_descriptorSizeRequired
;
480 sizeint m_contextSizeRequired
;
483 static unsigned deriveSolvingL1StraightThreadCount(unsigned blockCount
, unsigned allowedThreadCount
)
485 dIASSERT(allowedThreadCount
>= 1);
487 unsigned maximumCount
= 1 + blockCount
/ SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM
;
488 return maximumCount
>= allowedThreadCount
? allowedThreadCount
: dMACRO_MAX(maximumCount
, 1U);
// Declaration only; the definition is not visible in this excerpt.
// Returns a byte size and fills ref_solvingMemoryEstimates; presumably the straight-solve
// analogue of estimateCooperativelySolvingL1Stripe_XMemoryRequirement() -- confirm.
491 template<unsigned int block_step
>
492 static sizeint
estimateCooperativelySolvingL1StraightMemoryRequirement(unsigned rowCount
, SolvingL1StraightMemoryEstimates
&ref_solvingMemoryEstimates
);
494 static void *markCooperativelySolvingL1StraightMemoryStructuresOut(void *buffer
,
495 const SolvingL1StraightMemoryEstimates
&solvingMemoryEstimates
,
496 cellindexint
*&out_blockProgressDescriptors
, SolveL1StraightCellContext
*&out_cellContexts
)
498 void *currentLocation
= buffer
;
500 out_blockProgressDescriptors
= (cellindexint
*)currentLocation
; currentLocation
= (uint8
*)currentLocation
+ solvingMemoryEstimates
.m_descriptorSizeRequired
;
501 out_cellContexts
= (SolveL1StraightCellContext
*)currentLocation
; currentLocation
= (uint8
*)currentLocation
+ solvingMemoryEstimates
.m_contextSizeRequired
;
502 return currentLocation
;
// Declaration only; the definition is not visible in this excerpt.
// Presumably resets the completion counter and the progress descriptors before a solve,
// like its stripe counterpart -- confirm against the implementation.
505 template<unsigned int block_step
>
506 static void initializeCooperativelySolveL1StraightMemoryStructures(unsigned rowCount
,
507 atomicord32
&out_blockCompletionProgress
, cellindexint
*blockProgressDescriptors
, SolveL1StraightCellContext
*cellContexts
);
// Declaration only. Per-thread entry point for the straight solve (ownThreadIndex identifies
// the caller). The inline annotations state what the caller must provide: a progress counter
// starting at 0, one progress descriptor per block, and (CCI__MAX + 1) cell contexts per block.
508 template<unsigned int block_step
, unsigned int b_stride
>
509 static void participateSolvingL1Straight(const dReal
*L
, dReal
*B
, unsigned rowCount
, unsigned rowSkip
,
510 volatile atomicord32
&refBlockCompletionProgress
/*=0*/, volatile cellindexint
*blockProgressDescriptors
/*=[blockCount]*/,
511 SolveL1StraightCellContext
*cellContexts
/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex
);
// Argument bundle handed to the straight-solve worker (solveL1Straight_worker takes it by reference).
// NOTE(review): parts of this struct are not visible in this excerpt -- the assignments
// for L, b and rowSkip and several member declarations are absent. Confirm against the full source.
514 struct SolveL1StraightWorkerContext
// Stores the arguments into the members. The completion counter is kept by address,
// so the referenced object must outlive this context.
516 void init(const dReal
*L
, dReal
*b
, unsigned rowCount
, unsigned rowSkip
,
517 atomicord32
&ref_blockCompletionProgress
, cellindexint
*blockProgressDescriptors
, SolveL1StraightCellContext
*cellContexts
)
521 m_rowCount
= rowCount
;
523 m_ptrBlockCompletionProgress
= &ref_blockCompletionProgress
;
524 m_blockProgressDescriptors
= blockProgressDescriptors
;
525 m_cellContexts
= cellContexts
;
// Address of the caller-owned completion counter (set by init()).
532 atomicord32
*m_ptrBlockCompletionProgress
;
533 cellindexint
*m_blockProgressDescriptors
;
534 SolveL1StraightCellContext
*m_cellContexts
;
// Declarations only; the definitions are not visible in this excerpt.
// The worker callback takes an opaque context pointer; presumably it unpacks it and
// forwards to solveL1Straight_worker() -- confirm.
537 static int solveL1Straight_worker_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
538 static void solveL1Straight_worker(SolveL1StraightWorkerContext
&ref_context
, unsigned ownThreadIndex
);
// Completion callback; it has no typed counterpart among these declarations.
540 static int solveL1Straight_completion_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
543 struct SolveL1StraightCellContext
545 template<unsigned int block_step
>
546 static void initializePrecalculatedZs(dReal (&Z
)[block_step
])
548 std::fill(Z
, Z
+ block_step
, REAL(0.0));
551 template<unsigned int block_step
>
552 void loadPrecalculatedZs(dReal (&Z
)[block_step
]) const
554 dSASSERT(block_step
<= dARRAY_SIZE(m_c
));
556 std::copy(m_c
, m_c
+ block_step
, Z
);
559 template<unsigned int block_step
>
560 void storePrecalculatedZs(const dReal (&Z
)[block_step
])
562 dSASSERT(block_step
<= dARRAY_SIZE(m_c
));
564 std::copy(Z
, Z
+ block_step
, m_c
);
567 dReal m_c
[SL1S_BLOCK_SIZE
];
571 static SolveL1StraightCellContext
&buildBlockContextRef(SolveL1StraightCellContext
*cellContexts
, unsigned blockIndex
, CellContextInstance contextInstance
)
573 return cellContexts
[blockIndex
* CCI__MAX
+ contextInstance
];
576 static SolveL1StraightCellContext
&buildResultContextRef(SolveL1StraightCellContext
*cellContexts
, unsigned blockIndex
, unsigned blockCount
)
578 return cellContexts
[blockCount
* CCI__MAX
+ blockIndex
];
// Forward declaration; the struct is defined further down in this file.
583 struct SolveL1TransposedCellContext
;
587 SL1T_COOPERATIVE_BLOCK_COUNT_MINIMUM
= SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM
,
589 SL1T_B_STRIDE
= SL1S_B_STRIDE
,
// Declaration only; the definition is not visible in this excerpt.
// Returns an unsigned thread count; presumably the allowedThreadCount clamped to what is
// worthwhile for rowCount rows -- confirm against the implementation.
593 static unsigned restrictSolvingL1TransposedAllowedThreadCount(
594 dxThreadingBase
*threading
, unsigned allowedThreadCount
, unsigned rowCount
);
// Declaration only. Presumably the worker behind the public estimate call, invoked once the
// thread count has been validated -- confirm.
595 static void doEstimateCooperativeSolvingL1TransposedResourceRequirementsValidated(
596 dxResourceRequirementDescriptor
*summaryRequirementsDescriptor
,
597 unsigned allowedThreadCount
, unsigned rowCount
);
// Declaration only. Takes the same arguments as cooperativelySolveL1Transposed(); presumably
// its implementation for an already validated thread count -- confirm.
598 static void doCooperativelySolveL1TransposedValidated(
599 dxRequiredResourceContainer
*resourceContainer
, unsigned allowedThreadCount
,
600 const dReal
*L
, dReal
*b
, unsigned rowCount
, unsigned rowSkip
);
// Returns how many blocks of blockStep rows are needed to cover rowCount rows,
// i.e. rowCount / blockStep rounded up.
// The quotient-plus-remainder form gives the same result as
// (rowCount + blockStep - 1) / blockStep but cannot wrap around when rowCount
// is close to the maximum unsigned value.
// blockStep must be non-zero.
static unsigned deriveSolvingL1TransposedBlockCount(unsigned rowCount, unsigned blockStep)
{
    return rowCount / blockStep + (rowCount % blockStep != 0 ? 1U : 0U);
}
607 struct SolvingL1TransposedMemoryEstimates
609 void assignData(sizeint descriptorSizeRequired
, sizeint contextSizeRequired
)
611 m_descriptorSizeRequired
= descriptorSizeRequired
;
612 m_contextSizeRequired
= contextSizeRequired
;
615 sizeint m_descriptorSizeRequired
;
616 sizeint m_contextSizeRequired
;
619 static unsigned deriveSolvingL1TransposedThreadCount(unsigned blockCount
, unsigned allowedThreadCount
)
621 dSASSERT(SL1T_COOPERATIVE_BLOCK_COUNT_MINIMUM
+ 0 == SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM
);
623 return deriveSolvingL1StraightThreadCount(blockCount
, allowedThreadCount
);
// Declaration only; the definition is not visible in this excerpt.
// Returns a byte size and fills ref_solvingMemoryEstimates; presumably the transposed-solve
// analogue of estimateCooperativelySolvingL1Stripe_XMemoryRequirement() -- confirm.
626 template<unsigned int block_step
>
627 static sizeint
estimateCooperativelySolvingL1TransposedMemoryRequirement(unsigned rowCount
, SolvingL1TransposedMemoryEstimates
&ref_solvingMemoryEstimates
);
629 static void *markCooperativelySolvingL1TransposedMemoryStructuresOut(void *buffer
,
630 const SolvingL1TransposedMemoryEstimates
&solvingMemoryEstimates
,
631 cellindexint
*&out_blockProgressDescriptors
, SolveL1TransposedCellContext
*&out_cellContexts
)
633 void *currentLocation
= buffer
;
635 out_blockProgressDescriptors
= (cellindexint
*)currentLocation
; currentLocation
= (uint8
*)currentLocation
+ solvingMemoryEstimates
.m_descriptorSizeRequired
;
636 out_cellContexts
= (SolveL1TransposedCellContext
*)currentLocation
; currentLocation
= (uint8
*)currentLocation
+ solvingMemoryEstimates
.m_contextSizeRequired
;
637 return currentLocation
;
// Declaration only; the definition is not visible in this excerpt.
// Returns a buffer pointer and reports its size through out_sizeAllocated; the two out
// pointers presumably point into that buffer -- confirm, including who releases the buffer.
// NOTE(review): the straight-solve section declares no matching allocate function; check
// whether this one is still used.
640 template<unsigned int block_step
>
641 static void *allocateCooperativelySolveL1TransposedMemoryStructures(sizeint
&out_sizeAllocated
, unsigned rowCount
,
642 cellindexint
*&out_blockProgressDescriptors
, SolveL1TransposedCellContext
*&out_cellContexts
);
// Declaration only. Presumably resets the completion counter and the progress descriptors
// before a solve, like its stripe counterpart -- confirm.
643 template<unsigned int block_step
>
644 static void initializeCooperativelySolveL1TransposedMemoryStructures(unsigned rowCount
,
645 atomicord32
&out_blockCompletionProgress
, cellindexint
*blockProgressDescriptors
, SolveL1TransposedCellContext
*cellContexts
);
// Declaration only. Per-thread entry point for the transposed solve (ownThreadIndex identifies
// the caller). The inline annotations state what the caller must provide: a progress counter
// starting at 0, one progress descriptor per block, and (CCI__MAX + 1) cell contexts per block.
646 template<unsigned int block_step
, unsigned int b_stride
>
647 static void participateSolvingL1Transposed(const dReal
*L
, dReal
*B
, unsigned rowCount
, unsigned rowSkip
,
648 volatile atomicord32
&refBlockCompletionProgress
/*=0*/, volatile cellindexint
*blockProgressDescriptors
/*=[blockCount]*/,
649 SolveL1TransposedCellContext
*cellContexts
/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex
);
// Argument bundle handed to the transposed-solve worker (solveL1Transposed_worker takes it by reference).
// NOTE(review): parts of this struct are not visible in this excerpt -- the assignments
// for L, b and rowSkip and several member declarations are absent. Confirm against the full source.
652 struct SolveL1TransposedWorkerContext
// Stores the arguments into the members. The completion counter is kept by address,
// so the referenced object must outlive this context.
654 void init(const dReal
*L
, dReal
*b
, unsigned rowCount
, unsigned rowSkip
,
655 atomicord32
&ref_blockCompletionProgress
, cellindexint
*blockProgressDescriptors
, SolveL1TransposedCellContext
*cellContexts
)
659 m_rowCount
= rowCount
;
661 m_ptrBlockCompletionProgress
= &ref_blockCompletionProgress
;
662 m_blockProgressDescriptors
= blockProgressDescriptors
;
663 m_cellContexts
= cellContexts
;
// Address of the caller-owned completion counter (set by init()).
670 atomicord32
*m_ptrBlockCompletionProgress
;
671 cellindexint
*m_blockProgressDescriptors
;
672 SolveL1TransposedCellContext
*m_cellContexts
;
// Declarations only; the definitions are not visible in this excerpt.
// The worker callback takes an opaque context pointer; presumably it unpacks it and
// forwards to solveL1Transposed_worker() -- confirm.
675 static int solveL1Transposed_worker_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
676 static void solveL1Transposed_worker(SolveL1TransposedWorkerContext
&ref_context
, unsigned ownThreadIndex
);
// Completion callback; it has no typed counterpart among these declarations.
678 static int solveL1Transposed_completion_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
681 struct SolveL1TransposedCellContext
683 template<unsigned int block_step
>
684 static void initializePrecalculatedZs(dReal (&Z
)[block_step
])
686 std::fill(Z
, Z
+ block_step
, REAL(0.0));
689 template<unsigned int block_step
>
690 void loadPrecalculatedZs(dReal (&Z
)[block_step
]) const
692 dSASSERT(block_step
<= dARRAY_SIZE(m_c
));
694 std::copy(m_c
, m_c
+ block_step
, Z
);
697 template<unsigned int block_step
>
698 void storePrecalculatedZs(const dReal (&Z
)[block_step
])
700 dSASSERT(block_step
<= dARRAY_SIZE(m_c
));
702 std::copy(Z
, Z
+ block_step
, m_c
);
705 dReal m_c
[SL1T_BLOCK_SIZE
];
708 static SolveL1TransposedCellContext
&buildBlockContextRef(SolveL1TransposedCellContext
*cellContexts
, unsigned blockIndex
, CellContextInstance contextInstance
)
710 return cellContexts
[blockIndex
* CCI__MAX
+ contextInstance
];
713 static SolveL1TransposedCellContext
&buildResultContextRef(SolveL1TransposedCellContext
*cellContexts
, unsigned blockIndex
, unsigned blockCount
)
715 return cellContexts
[blockCount
* CCI__MAX
+ blockIndex
];
725 SV_COOPERATIVE_BLOCK_COUNT_MINIMUM
= 3,
// Declaration only; the definition is not visible in this excerpt.
// Returns an unsigned thread count; presumably the allowedThreadCount clamped to what is
// worthwhile for elementCount elements -- confirm against the implementation.
728 static unsigned restrictScalingVectorAllowedThreadCount(
729 dxThreadingBase
*threading
, unsigned allowedThreadCount
, unsigned elementCount
);
// Declaration only. Presumably the worker behind the public estimate call, invoked once the
// thread count has been validated -- confirm.
730 static void doEstimateCooperativeScalingVectorResourceRequirementsValidated(
731 dxResourceRequirementDescriptor
*summaryRequirementsDescriptor
,
732 unsigned allowedThreadCount
, unsigned elementCount
);
// Declaration only. Takes the same arguments as cooperativelyScaleVector(); presumably its
// implementation for an already validated thread count -- confirm.
733 static void doCooperativelyScaleVectorValidated(dxRequiredResourceContainer
*resourceContainer
, unsigned allowedThreadCount
,
734 dReal
*vectorData
, const dReal
*scaleData
, unsigned elementCount
);
// Returns how many blocks of blockStep elements are needed to cover elementCount elements,
// i.e. elementCount / blockStep rounded up.
// The quotient-plus-remainder form gives the same result as
// (elementCount + blockStep - 1) / blockStep but cannot wrap around when elementCount
// is close to the maximum unsigned value.
// blockStep must be non-zero.
static unsigned deriveScalingVectorBlockCount(unsigned elementCount, unsigned blockStep)
{
    return elementCount / blockStep + (elementCount % blockStep != 0 ? 1U : 0U);
}
741 static unsigned deriveScalingVectorThreadCount(unsigned lastBlockIndex
, unsigned allowedThreadCount
)
743 dIASSERT(allowedThreadCount
>= 1);
745 unsigned maximumCount
= lastBlockIndex
;
746 return maximumCount
>= allowedThreadCount
? allowedThreadCount
: dMACRO_MAX(maximumCount
, 1U);
749 static void initializeCooperativelyScaleVectorMemoryStructures(atomicord32
&out_blockCompletionProgress
)
751 out_blockCompletionProgress
= 0;
// Declaration only; the definition is not visible in this excerpt.
// Per-thread entry point for vector scaling: ptrAStart is writable, ptrDStart is read-only.
// The inline annotation states that the progress counter must start at 0.
753 template<unsigned int block_step
, unsigned int a_stride
, unsigned int d_stride
>
754 static void participateScalingVector(dReal
*ptrAStart
, const dReal
*ptrDStart
, const unsigned elementCount
,
755 volatile atomicord32
&refBlockCompletionProgress
/*=0*/);
// Argument bundle handed to the vector scaling worker (scaleVector_worker takes it by reference).
// NOTE(review): the declaration of m_vectorData is not visible in this excerpt although
// init() assigns it -- confirm against the full source.
758 struct ScaleVectorWorkerContext
// Stores the arguments into the members. The completion counter is kept by address,
// so the referenced object must outlive this context.
760 void init(dReal
*vectorData
, const dReal
*scaleData
, unsigned elementCount
,
761 atomicord32
&ref_blockCompletionProgress
)
763 m_vectorData
= vectorData
;
764 m_scaleData
= scaleData
;
765 m_elementCount
= elementCount
;
766 m_ptrBlockCompletionProgress
= &ref_blockCompletionProgress
;
770 const dReal
*m_scaleData
;
771 unsigned m_elementCount
;
// Address of the caller-owned completion counter (set by init()).
772 atomicord32
*m_ptrBlockCompletionProgress
;
// Declarations only; the definitions are not visible in this excerpt.
// The worker callback takes an opaque context pointer; presumably it unpacks it and
// forwards to scaleVector_worker(), which takes no thread index -- confirm.
775 static int scaleVector_worker_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
776 static void scaleVector_worker(ScaleVectorWorkerContext
&ref_context
);
// Completion callback; it has no typed counterpart among these declarations.
778 static int scaleVector_completion_callback(void *callContext
, dcallindex_t callInstanceIndex
, dCallReleaseeID callThisReleasee
);
782 enum SolvingLDLTStage
786 SLDLTS_SOLVING_STRAIGHT
= SLDLTS__MIN
,
787 SLDLTS_SCALING_VECTOR
,
788 SLDLTS_SOLVING_TRANSPOSED
,
795 SLDLT_B_STRIDE
= SL1S_B_STRIDE
,
796 SLDLT_D_STRIDE
= FLDLT_D_STRIDE
,
// Declaration only; the definition is not visible in this excerpt.
// Returns an unsigned thread count and additionally reports a bit mask through the last
// parameter; presumably one bit per SolvingLDLTStage telling whether that stage has enough
// blocks to run cooperatively -- confirm against the implementation.
// NOTE(review): "Sifficiency" looks like a misspelling of "Sufficiency"; it is only a
// parameter name, but the definition should be changed together with it.
799 static unsigned restrictSolvingLDLTAllowedThreadCount(
800 dxThreadingBase
*threading
, unsigned allowedThreadCount
, unsigned rowCount
, unsigned &out_stageBlockCountSifficiencyMask
);
// Declaration only. Takes the arguments of cooperativelySolveLDLT() plus the stage mask
// produced by restrictSolvingLDLTAllowedThreadCount(); presumably the implementation for an
// already validated thread count -- confirm.
802 static void doCooperativelySolveLDLTValidated(
803 dxRequiredResourceContainer
*resourceContainer
, unsigned allowedThreadCount
, unsigned stageBlockCountSifficiencyMask
,
804 const dReal
*L
, const dReal
*d
, dReal
*b
, unsigned rowCount
, unsigned rowSkip
);