Cosmetic: Copyright years were updated
[ode.git] / ode / src / threaded_solver_ldlt.h
blobeff2ad597bddfa95a6a906e7b965e06f1ae178a8
1 /*************************************************************************
2 * *
3 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
4 * All rights reserved. Email: russ@q12.org Web: www.q12.org *
5 * *
6 * This library is free software; you can redistribute it and/or *
7 * modify it under the terms of EITHER: *
8 * (1) The GNU Lesser General Public License as published by the Free *
9 * Software Foundation; either version 2.1 of the License, or (at *
10 * your option) any later version. The text of the GNU Lesser *
11 * General Public License is included with this library in the *
12 * file LICENSE.TXT. *
13 * (2) The BSD-style license that is included with this library in *
14 * the file LICENSE-BSD.TXT. *
15 * *
16 * This library is distributed in the hope that it will be useful, *
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
19 * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
20 * *
21 *************************************************************************/
24 * Equation System Threaded Solver
25 * Copyright (c) 2017-2024 Oleh Derevenko, odar@eleks.com (change all "a" to "e")
30 #ifndef _ODE_THREADED_SOLVER_LDLT_H_
31 #define _ODE_THREADED_SOLVER_LDLT_H_
34 #include "coop_matrix_types.h"
35 #include <ode/threading.h>
38 class dxThreadingBase;
39 class dxResourceRequirementDescriptor;
40 class dxRequiredResourceContainer;
43 class ThreadedEquationSolverLDLT
45 public:
46 static void estimateCooperativeFactoringLDLTResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
47 unsigned allowedThreadCount, unsigned rowCount);
48 static void cooperativelyFactorLDLT(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
49 dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip);
51 static void estimateCooperativeSolvingL1StraightResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
52 unsigned allowedThreadCount, unsigned rowCount);
53 static void cooperativelySolveL1Straight(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
54 const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
56 static void estimateCooperativeSolvingL1TransposedResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
57 unsigned allowedThreadCount, unsigned rowCount);
58 static void cooperativelySolveL1Transposed(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
59 const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
61 static void estimateCooperativeScalingVectorResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
62 unsigned allowedThreadCount, unsigned elementCount);
63 static void cooperativelyScaleVector(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
64 dReal *vectorData, const dReal *scaleData, unsigned elementCount);
66 static void estimateCooperativeSolvingLDLTResourceRequirements(dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
67 unsigned allowedThreadCount, unsigned rowCount);
68 static void cooperativelySolveLDLT(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
69 const dReal *L, const dReal *d, dReal *b, unsigned rowCount, unsigned rowSkip);
71 public:
72 enum
74 ALLOCATION_DEFAULT_ALIGNMENT = COOP_THREAD_DATA_ALIGNMENT_SIZE,
77 private:
78 struct FactorizationSolveL1StripeCellContext;
79 struct FactorizationFactorizeL1StripeThreadContext;
// Compile-time constants used by the factorization helpers of this class.
// NOTE(review): the prefixes presumably stand for "Factor LDLT" (FLDLT_),
// "Factorization Solve L1 Stripe" (FSL1S_) and "Factorization Factorize
// L1 Stripe" (FFL1S_) -- confirm against the implementation file.
enum
{
    FLDLT_D_STRIDE = 1,
    FLDLT_COOPERATIVE_BLOCK_COUNT_MINIMUM = 5,

    FSL1S_BLOCK_SIZE = 2,

    // Row counts are tied to the solving block size for regular blocks;
    // the final block works on a single row.
    FSL1S_REGULAR_B_ROWS = FSL1S_BLOCK_SIZE,
    FSL1S_FINAL_B_ROWS = 1,

    FFL1S_REGULAR_A_ROWS = FSL1S_BLOCK_SIZE,
    FFL1S_FINAL_A_ROWS = 1,
    FFL1S_REGULAR_BLOCK_SIZE = 16, // A power of 2 of suitable magnitude (which, naturally, is not divisible by 6)
    FFL1S_FINAL_BLOCK_SIZE = 32, // A power of 2 of suitable magnitude (which, naturally, is not divisible by 6)
};
97 static unsigned restrictFactoringLDLTAllowedThreadCount(
98 dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount);
99 static void doEstimateCooperativeFactoringLDLTResourceRequirementsValidated(
100 dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
101 unsigned allowedThreadCount, unsigned rowCount);
102 static void doCooperativelyFactorLDLTValidated(
103 dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
104 dReal *A, dReal *d, unsigned rowCount, unsigned rowSkip);
// Returns how many blocks of blockStep rows are needed to cover rowCount
// rows, i.e. rowCount / blockStep rounded up.
//
// The result is built from the quotient and a remainder flag instead of
// (rowCount + blockStep - 1) / blockStep so that the intermediate sum cannot
// wrap around for rowCount values close to UINT_MAX.
// blockStep must be non-zero.
static unsigned deriveSolvingL1StripeBlockCount(unsigned rowCount, unsigned blockStep)
{
    return rowCount / blockStep + (rowCount % blockStep != 0 ? 1U : 0U);
}
112 struct FactorizationSolvingL1StripeMemoryEstimates
114 void assignData(sizeint descriptorSizeRequired, sizeint contextSizeRequired)
116 m_descriptorSizeRequired = descriptorSizeRequired;
117 m_contextSizeRequired = contextSizeRequired;
120 sizeint m_descriptorSizeRequired;
121 sizeint m_contextSizeRequired;
124 static unsigned deriveSolvingL1StripeThreadCount(unsigned blockCount, unsigned allowedThreadCount)
126 dIASSERT(allowedThreadCount >= 1);
128 unsigned maximumCount = blockCount / 2;
129 return maximumCount >= allowedThreadCount ? allowedThreadCount : dMACRO_MAX(maximumCount, 1U);
132 static sizeint estimateCooperativelySolvingL1Stripe_XMemoryRequirement(unsigned blockCount,
133 FactorizationSolvingL1StripeMemoryEstimates &ref_memoryEstimates)
135 sizeint descriptorSizeRequired = dOVERALIGNED_SIZE(sizeof(cellindexint) * blockCount, COOP_THREAD_DATA_ALIGNMENT_SIZE);
136 sizeint contextSizeRequired = dOVERALIGNED_SIZE(sizeof(FactorizationSolveL1StripeCellContext) * (CCI__MAX + 1) * blockCount, COOP_THREAD_DATA_ALIGNMENT_SIZE);
137 ref_memoryEstimates.assignData(descriptorSizeRequired, contextSizeRequired);
139 sizeint totalSizeRequired = descriptorSizeRequired + contextSizeRequired;
140 return totalSizeRequired;
143 static void *markCooperativelySolvingL1Stripe_XMemoryStructuresOut(void *buffer,
144 const FactorizationSolvingL1StripeMemoryEstimates &memoryEstimates,
145 cellindexint *&out_blockProgressDescriptors, FactorizationSolveL1StripeCellContext *&out_cellContexts)
147 void *currentLocation = buffer;
149 out_blockProgressDescriptors = (cellindexint *)currentLocation; currentLocation = (uint8 *)currentLocation + memoryEstimates.m_descriptorSizeRequired;
150 out_cellContexts = (FactorizationSolveL1StripeCellContext *)currentLocation; currentLocation = (uint8 *)currentLocation + memoryEstimates.m_contextSizeRequired;
152 return currentLocation;
155 static void initializeCooperativelySolvingL1Stripe_XMemoryStructures(unsigned blockCount,
156 atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, FactorizationSolveL1StripeCellContext *dUNUSED(cellContexts))
158 out_blockCompletionProgress = 0;
159 memset(blockProgressDescriptors, 0, blockCount * sizeof(*blockProgressDescriptors));
162 template<unsigned int block_step, unsigned int b_rows>
163 static void participateSolvingL1Stripe_X(const dReal *L, dReal *B, unsigned blockCount, unsigned rowSkip,
164 volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
165 FactorizationSolveL1StripeCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex);
167 static unsigned deriveScalingAndFactorizingL1StripeBlockCountFromSolvingBlockIndex(unsigned solvingBlockIndex, unsigned solvingBlockStep, unsigned blockARows)
169 unsigned factorizingBlockSize = deriveScalingAndFactorizingL1StripeBlockSize(blockARows);
170 return deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(solvingBlockIndex * solvingBlockStep, factorizingBlockSize);
// Returns factorizationRowIndex / factorizationBlockSize rounded up.
//
// The quotient-plus-remainder form avoids the unsigned wraparound that
// (factorizationRowIndex + factorizationBlockSize - 1) would suffer for
// very large row indices. factorizationBlockSize must be non-zero.
static unsigned deriveScalingAndFactorizingL1StripeBlockCountFromFactorizationRow(unsigned factorizationRowIndex, unsigned factorizationBlockSize)
{
    return factorizationRowIndex / factorizationBlockSize
        + (factorizationRowIndex % factorizationBlockSize != 0 ? 1U : 0U);
}
178 static unsigned deriveScalingAndFactorizingL1StripeBlockSize(unsigned blockARows)
180 unsigned result = blockARows != 1 ? FFL1S_REGULAR_BLOCK_SIZE : FFL1S_FINAL_BLOCK_SIZE;
181 dIASSERT(blockARows >= 1 && blockARows <= 2);
183 return result;
187 static unsigned deriveScalingAndFactorizingL1StripeThreadCount(unsigned blockCount, unsigned allowedThreadCount)
189 dIASSERT(blockCount != 0);
190 dIASSERT(allowedThreadCount >= 1);
192 return dMACRO_MIN(blockCount, allowedThreadCount);
195 struct FactorizationFactorizeL1StripeContext;
197 struct FactorizationScalingAndFactorizingL1StripeMemoryEstimates
199 void assignData(sizeint contextSizeRequired)
201 m_contextSizeRequired = contextSizeRequired;
204 sizeint m_contextSizeRequired;
207 static sizeint estimateCooperativelyScalingAndFactorizingL1Stripe_XMemoryRequirement(unsigned factorizingMaximumThreads,
208 FactorizationScalingAndFactorizingL1StripeMemoryEstimates &ref_memoryEstimates)
210 dIASSERT(factorizingMaximumThreads != 0);
212 sizeint contextSizeRequired = dOVERALIGNED_SIZE(sizeof(FactorizationFactorizeL1StripeContext) + sizeof(FactorizationFactorizeL1StripeThreadContext) * (factorizingMaximumThreads - 1), COOP_THREAD_DATA_ALIGNMENT_SIZE);
213 ref_memoryEstimates.assignData(contextSizeRequired);
215 sizeint totalSizeRequired = contextSizeRequired;
216 return totalSizeRequired;
219 static void *markCooperativelyScalingAndFactorizingL1Stripe_XMemoryStructuresOut(void *buffer,
220 const FactorizationScalingAndFactorizingL1StripeMemoryEstimates &memoryEstimates, FactorizationFactorizeL1StripeContext *&out_factorizationContext)
222 void *currentLocation = buffer;
224 out_factorizationContext = (FactorizationFactorizeL1StripeContext *)currentLocation; currentLocation = (uint8 *)currentLocation + memoryEstimates.m_contextSizeRequired;
226 return currentLocation;
229 static void initializeCooperativelyScalingAndFactorizingL1Stripe_XMemoryStructures(
230 FactorizationFactorizeL1StripeContext *factorizationContext, unsigned threadCount)
232 factorizationContext->initialize(threadCount);
236 template<unsigned int a_rows, unsigned int d_stride>
237 static void participateScalingAndFactorizingL1Stripe_X(dReal *ARow, dReal *d, unsigned factorizationRow, unsigned rowSkip,
238 FactorizationFactorizeL1StripeContext *factorizationContext, unsigned ownThreadIndex);
240 private:
241 struct FactorLDLTWorkerContext
243 FactorLDLTWorkerContext(dxThreadingBase *threading, unsigned allowedThreadCount,
244 dReal *A, dReal *d, unsigned totalBlockCount, unsigned rowCount, unsigned rowSkip,
245 atomicord32 &ref_solvingBlockCompletionProgress, cellindexint *solvingBlockProgressDescriptors,
246 FactorizationSolveL1StripeCellContext *solvingCellContexts,
247 FactorizationFactorizeL1StripeContext *factorizingFactorizationContext,
248 dCallReleaseeID calculationFinishReleasee):
249 m_threading(threading),
250 m_allowedThreadCount(allowedThreadCount),
251 m_A(A),
252 m_ARow(A),
253 m_d(d),
254 m_solvingBlockIndex(0),
255 m_totalBlockCount(totalBlockCount),
256 m_rowCount(rowCount),
257 m_rowSkip(rowSkip),
258 m_refSolvingBlockCompletionProgress(ref_solvingBlockCompletionProgress),
259 m_solvingBlockProgressDescriptors(solvingBlockProgressDescriptors),
260 m_solvingCellContexts(solvingCellContexts),
261 m_factorizingFactorizationContext(factorizingFactorizationContext),
262 m_calculationFinishReleasee(calculationFinishReleasee)
266 void incrementForNextBlock()
268 const unsigned blockStep = FSL1S_BLOCK_SIZE;
270 m_ARow += blockStep * m_rowSkip;
271 m_solvingBlockIndex += 1;
274 dxThreadingBase *m_threading;
275 unsigned m_allowedThreadCount;
276 dReal *m_A;
277 dReal *m_ARow;
278 dReal *m_d;
279 unsigned m_solvingBlockIndex;
280 unsigned m_totalBlockCount;
281 unsigned m_rowCount;
282 unsigned m_rowSkip;
283 atomicord32 &m_refSolvingBlockCompletionProgress;
284 cellindexint *m_solvingBlockProgressDescriptors;
285 FactorizationSolveL1StripeCellContext *m_solvingCellContexts;
286 FactorizationFactorizeL1StripeContext *m_factorizingFactorizationContext;
287 dCallReleaseeID m_calculationFinishReleasee;
290 static int factotLDLT_solvingComplete_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
291 static void factotLDLT_solvingComplete(FactorLDLTWorkerContext &ref_context, unsigned ownThreadIndex);
293 static int factotLDLT_solvingCompleteSync_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
294 static void factotLDLT_solvingCompleteSync(FactorLDLTWorkerContext &ref_workerContext);
296 static int factotLDLT_scalingAndFactorizingComplete_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
297 static void factotLDLT_scalingAndFactorizingComplete(FactorLDLTWorkerContext &ref_workerContext, unsigned ownThreadIndex);
299 static int factotLDLT_scalingAndFactorizingCompleteSync_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
300 static void factotLDLT_scalingAndFactorizingCompleteSync(FactorLDLTWorkerContext &ref_workerContext);
302 static int factotLDLT_solvingFinal_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
303 static void factotLDLT_solvingFinal(FactorLDLTWorkerContext &ref_context, unsigned ownThreadIndex);
305 static int factotLDLT_solvingFinalSync_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
306 static void factotLDLT_solvingFinalSync(FactorLDLTWorkerContext &ref_workerContext);
308 static int factotLDLT_scalingAndFactorizingFinal_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
309 static void factotLDLT_scalingAndFactorizingFinal(FactorLDLTWorkerContext &ref_workerContext, unsigned ownThreadIndex);
311 static int factotLDLT_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
313 private:
314 struct FactorizationSolveL1StripeCellContext
316 template<unsigned int block_step, unsigned int b_rows>
317 static void initializePrecalculatedZs(dReal (&Z)[block_step][b_rows])
319 Z[0][0] = 0;
320 if (b_rows >= 2)
322 Z[0][1] = 0;
324 Z[1][0] = 0;
325 if (b_rows >= 2)
327 Z[1][1] = 0;
329 dSASSERT(block_step == 2);
330 dSASSERT(b_rows >= 1 && b_rows <= 2);
333 template<unsigned int block_step, unsigned int b_rows>
334 void loadPrecalculatedZs(dReal (&Z)[block_step][b_rows]) const
336 dSASSERT(block_step <= dARRAY_SIZE(m_c));
337 dSASSERT(b_rows <= dARRAY_SIZE(m_c[0]));
339 Z[0][0] = m_c[0][0];
340 if (b_rows >= 2)
342 Z[0][1] = m_c[0][1];
344 Z[1][0] = m_c[1][0];
345 if (b_rows >= 2)
347 Z[1][1] = m_c[1][1];
349 dSASSERT(block_step == 2);
350 dSASSERT(b_rows >= 1 && b_rows <= 2);
353 template<unsigned int block_step, unsigned int b_rows>
354 void storePrecalculatedZs(const dReal (&Z)[block_step][b_rows])
356 dSASSERT(block_step <= dARRAY_SIZE(m_c));
357 dSASSERT(b_rows <= dARRAY_SIZE(m_c[0]));
359 m_c[0][0] = Z[0][0];
360 if (b_rows >= 2)
362 m_c[0][1] = Z[0][1];
364 m_c[1][0] = Z[1][0];
365 if (b_rows >= 2)
367 m_c[1][1] = Z[1][1];
369 dSASSERT(block_step == 2);
370 dSASSERT(b_rows >= 1 && b_rows <= 2);
373 dReal m_c[FSL1S_BLOCK_SIZE][FSL1S_REGULAR_B_ROWS];
374 // dReal m_reserved[4];
377 static FactorizationSolveL1StripeCellContext &buildBlockContextRef(FactorizationSolveL1StripeCellContext *cellContexts, unsigned blockIndex, CellContextInstance contextInstance)
379 return cellContexts[blockIndex * CCI__MAX + contextInstance];
382 static FactorizationSolveL1StripeCellContext &buildResultContextRef(FactorizationSolveL1StripeCellContext *cellContexts, unsigned blockIndex, unsigned blockCount)
384 return cellContexts[blockCount * CCI__MAX + blockIndex];
387 private:
388 struct FactorizationFactorizeL1StripeThreadContext
390 template<unsigned int a_rows>
391 void assignDataSum(const dReal (&sameZ)[a_rows], const dReal (&mixedZ)[dMACRO_MAX(a_rows - 1, 1)],
392 const FactorizationFactorizeL1StripeThreadContext &partialSumContext)
394 m_sameZ[0] = sameZ[0] + partialSumContext.m_sameZ[0];
395 if (a_rows >= 2)
397 m_sameZ[1] = sameZ[1] + partialSumContext.m_sameZ[1];
398 m_mixedZ[0] = mixedZ[0] + partialSumContext.m_mixedZ[0];
402 template<unsigned int a_rows>
403 void assignDataAlone(const dReal (&sameZ)[a_rows], const dReal (&mixedZ)[dMACRO_MAX(a_rows - 1, 1)])
405 m_sameZ[0] = sameZ[0];
406 if (a_rows >= 2)
408 m_sameZ[1] = sameZ[1];
409 m_mixedZ[0] = mixedZ[0];
413 template<unsigned int a_rows>
414 void retrieveData(dReal (&out_sameZ)[a_rows], dReal (&out_mixedZ)[dMACRO_MAX(a_rows - 1, 1)]) const
416 out_sameZ[0] = m_sameZ[0];
417 if (a_rows >= 2)
419 out_sameZ[1] = m_sameZ[1];
420 out_mixedZ[0] = m_mixedZ[0];
422 dAASSERT(a_rows >= 1 && a_rows <= 2);
425 dReal m_sameZ[FFL1S_REGULAR_A_ROWS];
426 dReal m_mixedZ[dMACRO_MAX(FFL1S_REGULAR_A_ROWS - 1, 1)];
427 dReal m_reserved[1]; // [5]; // for alignment
430 struct FactorizationFactorizeL1StripeContext
432 void initialize(unsigned threadCount)
434 m_threadsRunning = threadCount;
435 m_nextColumnIndex = 0;
436 m_sumThreadIndex = 0;
439 atomicord32 m_threadsRunning;
440 atomicord32 m_nextColumnIndex;
441 volatile atomicord32 m_sumThreadIndex;
442 atomicord32 m_reserved[1]; // [13]; // for alignment
443 FactorizationFactorizeL1StripeThreadContext m_threadContexts[1]; // =[threadCount]
446 private:
447 struct SolveL1StraightCellContext;
// Compile-time constants for the straight L1 solving helpers of this class.
enum
{
    SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM = 8,

    SL1S_B_STRIDE = 1,
    SL1S_BLOCK_SIZE = 4,
};
457 static unsigned restrictSolvingL1StraightAllowedThreadCount(
458 dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount);
459 static void doEstimateCooperativeSolvingL1StraightResourceRequirementsValidated(
460 dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
461 unsigned allowedThreadCount, unsigned rowCount);
462 static void doCooperativelySolveL1StraightValidated(
463 dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
464 const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
// Returns how many blocks of blockStep rows are needed to cover rowCount
// rows, i.e. rowCount / blockStep rounded up.
//
// The quotient-plus-remainder form cannot wrap around for rowCount values
// close to UINT_MAX, unlike (rowCount + blockStep - 1) / blockStep.
// blockStep must be non-zero.
static unsigned deriveSolvingL1StraightBlockCount(unsigned rowCount, unsigned blockStep)
{
    return rowCount / blockStep + (rowCount % blockStep != 0 ? 1U : 0U);
}
471 struct SolvingL1StraightMemoryEstimates
473 void assignData(sizeint descriptorSizeRequired, sizeint contextSizeRequired)
475 m_descriptorSizeRequired = descriptorSizeRequired;
476 m_contextSizeRequired = contextSizeRequired;
479 sizeint m_descriptorSizeRequired;
480 sizeint m_contextSizeRequired;
483 static unsigned deriveSolvingL1StraightThreadCount(unsigned blockCount, unsigned allowedThreadCount)
485 dIASSERT(allowedThreadCount >= 1);
487 unsigned maximumCount = 1 + blockCount / SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM;
488 return maximumCount >= allowedThreadCount ? allowedThreadCount : dMACRO_MAX(maximumCount, 1U);
491 template<unsigned int block_step>
492 static sizeint estimateCooperativelySolvingL1StraightMemoryRequirement(unsigned rowCount, SolvingL1StraightMemoryEstimates &ref_solvingMemoryEstimates);
494 static void *markCooperativelySolvingL1StraightMemoryStructuresOut(void *buffer,
495 const SolvingL1StraightMemoryEstimates &solvingMemoryEstimates,
496 cellindexint *&out_blockProgressDescriptors, SolveL1StraightCellContext *&out_cellContexts)
498 void *currentLocation = buffer;
500 out_blockProgressDescriptors = (cellindexint *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_descriptorSizeRequired;
501 out_cellContexts = (SolveL1StraightCellContext *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_contextSizeRequired;
502 return currentLocation;
505 template<unsigned int block_step>
506 static void initializeCooperativelySolveL1StraightMemoryStructures(unsigned rowCount,
507 atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1StraightCellContext *cellContexts);
508 template<unsigned int block_step, unsigned int b_stride>
509 static void participateSolvingL1Straight(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip,
510 volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
511 SolveL1StraightCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex);
513 private:
514 struct SolveL1StraightWorkerContext
516 void init(const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip,
517 atomicord32 &ref_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1StraightCellContext *cellContexts)
519 m_L = L;
520 m_b = b;
521 m_rowCount = rowCount;
522 m_rowSkip = rowSkip;
523 m_ptrBlockCompletionProgress = &ref_blockCompletionProgress;
524 m_blockProgressDescriptors = blockProgressDescriptors;
525 m_cellContexts = cellContexts;
528 const dReal *m_L;
529 dReal *m_b;
530 unsigned m_rowCount;
531 unsigned m_rowSkip;
532 atomicord32 *m_ptrBlockCompletionProgress;
533 cellindexint *m_blockProgressDescriptors;
534 SolveL1StraightCellContext *m_cellContexts;
537 static int solveL1Straight_worker_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
538 static void solveL1Straight_worker(SolveL1StraightWorkerContext &ref_context, unsigned ownThreadIndex);
540 static int solveL1Straight_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
542 private:
543 struct SolveL1StraightCellContext
545 template<unsigned int block_step>
546 static void initializePrecalculatedZs(dReal (&Z)[block_step])
548 std::fill(Z, Z + block_step, REAL(0.0));
551 template<unsigned int block_step>
552 void loadPrecalculatedZs(dReal (&Z)[block_step]) const
554 dSASSERT(block_step <= dARRAY_SIZE(m_c));
556 std::copy(m_c, m_c + block_step, Z);
559 template<unsigned int block_step>
560 void storePrecalculatedZs(const dReal (&Z)[block_step])
562 dSASSERT(block_step <= dARRAY_SIZE(m_c));
564 std::copy(Z, Z + block_step, m_c);
567 dReal m_c[SL1S_BLOCK_SIZE];
571 static SolveL1StraightCellContext &buildBlockContextRef(SolveL1StraightCellContext *cellContexts, unsigned blockIndex, CellContextInstance contextInstance)
573 return cellContexts[blockIndex * CCI__MAX + contextInstance];
576 static SolveL1StraightCellContext &buildResultContextRef(SolveL1StraightCellContext *cellContexts, unsigned blockIndex, unsigned blockCount)
578 return cellContexts[blockCount * CCI__MAX + blockIndex];
582 private:
583 struct SolveL1TransposedCellContext;
585 enum
587 SL1T_COOPERATIVE_BLOCK_COUNT_MINIMUM = SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM,
589 SL1T_B_STRIDE = SL1S_B_STRIDE,
590 SL1T_BLOCK_SIZE = 4,
593 static unsigned restrictSolvingL1TransposedAllowedThreadCount(
594 dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount);
595 static void doEstimateCooperativeSolvingL1TransposedResourceRequirementsValidated(
596 dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
597 unsigned allowedThreadCount, unsigned rowCount);
598 static void doCooperativelySolveL1TransposedValidated(
599 dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
600 const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip);
// Returns how many blocks of blockStep rows are needed to cover rowCount
// rows, i.e. rowCount / blockStep rounded up.
//
// The quotient-plus-remainder form cannot wrap around for rowCount values
// close to UINT_MAX, unlike (rowCount + blockStep - 1) / blockStep.
// blockStep must be non-zero.
static unsigned deriveSolvingL1TransposedBlockCount(unsigned rowCount, unsigned blockStep)
{
    return rowCount / blockStep + (rowCount % blockStep != 0 ? 1U : 0U);
}
607 struct SolvingL1TransposedMemoryEstimates
609 void assignData(sizeint descriptorSizeRequired, sizeint contextSizeRequired)
611 m_descriptorSizeRequired = descriptorSizeRequired;
612 m_contextSizeRequired = contextSizeRequired;
615 sizeint m_descriptorSizeRequired;
616 sizeint m_contextSizeRequired;
619 static unsigned deriveSolvingL1TransposedThreadCount(unsigned blockCount, unsigned allowedThreadCount)
621 dSASSERT(SL1T_COOPERATIVE_BLOCK_COUNT_MINIMUM + 0 == SL1S_COOPERATIVE_BLOCK_COUNT_MINIMUM);
623 return deriveSolvingL1StraightThreadCount(blockCount, allowedThreadCount);
626 template<unsigned int block_step>
627 static sizeint estimateCooperativelySolvingL1TransposedMemoryRequirement(unsigned rowCount, SolvingL1TransposedMemoryEstimates &ref_solvingMemoryEstimates);
629 static void *markCooperativelySolvingL1TransposedMemoryStructuresOut(void *buffer,
630 const SolvingL1TransposedMemoryEstimates &solvingMemoryEstimates,
631 cellindexint *&out_blockProgressDescriptors, SolveL1TransposedCellContext *&out_cellContexts)
633 void *currentLocation = buffer;
635 out_blockProgressDescriptors = (cellindexint *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_descriptorSizeRequired;
636 out_cellContexts = (SolveL1TransposedCellContext *)currentLocation; currentLocation = (uint8 *)currentLocation + solvingMemoryEstimates.m_contextSizeRequired;
637 return currentLocation;
640 template<unsigned int block_step>
641 static void *allocateCooperativelySolveL1TransposedMemoryStructures(sizeint &out_sizeAllocated, unsigned rowCount,
642 cellindexint *&out_blockProgressDescriptors, SolveL1TransposedCellContext *&out_cellContexts);
643 template<unsigned int block_step>
644 static void initializeCooperativelySolveL1TransposedMemoryStructures(unsigned rowCount,
645 atomicord32 &out_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1TransposedCellContext *cellContexts);
646 template<unsigned int block_step, unsigned int b_stride>
647 static void participateSolvingL1Transposed(const dReal *L, dReal *B, unsigned rowCount, unsigned rowSkip,
648 volatile atomicord32 &refBlockCompletionProgress/*=0*/, volatile cellindexint *blockProgressDescriptors/*=[blockCount]*/,
649 SolveL1TransposedCellContext *cellContexts/*=[CCI__MAX x blockCount] + [blockCount]*/, unsigned ownThreadIndex);
651 private:
652 struct SolveL1TransposedWorkerContext
654 void init(const dReal *L, dReal *b, unsigned rowCount, unsigned rowSkip,
655 atomicord32 &ref_blockCompletionProgress, cellindexint *blockProgressDescriptors, SolveL1TransposedCellContext *cellContexts)
657 m_L = L;
658 m_b = b;
659 m_rowCount = rowCount;
660 m_rowSkip = rowSkip;
661 m_ptrBlockCompletionProgress = &ref_blockCompletionProgress;
662 m_blockProgressDescriptors = blockProgressDescriptors;
663 m_cellContexts = cellContexts;
666 const dReal *m_L;
667 dReal *m_b;
668 unsigned m_rowCount;
669 unsigned m_rowSkip;
670 atomicord32 *m_ptrBlockCompletionProgress;
671 cellindexint *m_blockProgressDescriptors;
672 SolveL1TransposedCellContext *m_cellContexts;
675 static int solveL1Transposed_worker_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
676 static void solveL1Transposed_worker(SolveL1TransposedWorkerContext &ref_context, unsigned ownThreadIndex);
678 static int solveL1Transposed_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
680 private:
681 struct SolveL1TransposedCellContext
683 template<unsigned int block_step>
684 static void initializePrecalculatedZs(dReal (&Z)[block_step])
686 std::fill(Z, Z + block_step, REAL(0.0));
689 template<unsigned int block_step>
690 void loadPrecalculatedZs(dReal (&Z)[block_step]) const
692 dSASSERT(block_step <= dARRAY_SIZE(m_c));
694 std::copy(m_c, m_c + block_step, Z);
697 template<unsigned int block_step>
698 void storePrecalculatedZs(const dReal (&Z)[block_step])
700 dSASSERT(block_step <= dARRAY_SIZE(m_c));
702 std::copy(Z, Z + block_step, m_c);
705 dReal m_c[SL1T_BLOCK_SIZE];
708 static SolveL1TransposedCellContext &buildBlockContextRef(SolveL1TransposedCellContext *cellContexts, unsigned blockIndex, CellContextInstance contextInstance)
710 return cellContexts[blockIndex * CCI__MAX + contextInstance];
713 static SolveL1TransposedCellContext &buildResultContextRef(SolveL1TransposedCellContext *cellContexts, unsigned blockIndex, unsigned blockCount)
715 return cellContexts[blockCount * CCI__MAX + blockIndex];
718 private:
// Compile-time constants for the vector scaling helpers of this class.
enum
{
    SV_A_STRIDE = 1,
    SV_D_STRIDE = 1,

    SV_BLOCK_SIZE = 128,
    SV_COOPERATIVE_BLOCK_COUNT_MINIMUM = 3,
};
728 static unsigned restrictScalingVectorAllowedThreadCount(
729 dxThreadingBase *threading, unsigned allowedThreadCount, unsigned elementCount);
730 static void doEstimateCooperativeScalingVectorResourceRequirementsValidated(
731 dxResourceRequirementDescriptor *summaryRequirementsDescriptor,
732 unsigned allowedThreadCount, unsigned elementCount);
733 static void doCooperativelyScaleVectorValidated(dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount,
734 dReal *vectorData, const dReal *scaleData, unsigned elementCount);
// Returns how many blocks of blockStep elements are needed to cover
// elementCount elements, i.e. elementCount / blockStep rounded up.
//
// The quotient-plus-remainder form cannot wrap around for elementCount
// values close to UINT_MAX, unlike (elementCount + blockStep - 1) / blockStep.
// blockStep must be non-zero.
static unsigned deriveScalingVectorBlockCount(unsigned elementCount, unsigned blockStep)
{
    return elementCount / blockStep + (elementCount % blockStep != 0 ? 1U : 0U);
}
741 static unsigned deriveScalingVectorThreadCount(unsigned lastBlockIndex, unsigned allowedThreadCount)
743 dIASSERT(allowedThreadCount >= 1);
745 unsigned maximumCount = lastBlockIndex;
746 return maximumCount >= allowedThreadCount ? allowedThreadCount : dMACRO_MAX(maximumCount, 1U);
749 static void initializeCooperativelyScaleVectorMemoryStructures(atomicord32 &out_blockCompletionProgress)
751 out_blockCompletionProgress = 0;
753 template<unsigned int block_step, unsigned int a_stride, unsigned int d_stride>
754 static void participateScalingVector(dReal *ptrAStart, const dReal *ptrDStart, const unsigned elementCount,
755 volatile atomicord32 &refBlockCompletionProgress/*=0*/);
757 private:
758 struct ScaleVectorWorkerContext
760 void init(dReal *vectorData, const dReal *scaleData, unsigned elementCount,
761 atomicord32 &ref_blockCompletionProgress)
763 m_vectorData = vectorData;
764 m_scaleData = scaleData;
765 m_elementCount = elementCount;
766 m_ptrBlockCompletionProgress = &ref_blockCompletionProgress;
769 dReal *m_vectorData;
770 const dReal *m_scaleData;
771 unsigned m_elementCount;
772 atomicord32 *m_ptrBlockCompletionProgress;
775 static int scaleVector_worker_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
776 static void scaleVector_worker(ScaleVectorWorkerContext &ref_context);
778 static int scaleVector_completion_callback(void *callContext, dcallindex_t callInstanceIndex, dCallReleaseeID callThisReleasee);
781 private:
// Stages of cooperative LDLT solving, in declaration order.
// SLDLTS__MIN equals the first stage and SLDLTS__MAX is one past the last,
// so the two bracket the valid stage values.
enum SolvingLDLTStage
{
    SLDLTS__MIN,

    SLDLTS_SOLVING_STRAIGHT = SLDLTS__MIN,
    SLDLTS_SCALING_VECTOR,
    SLDLTS_SOLVING_TRANSPOSED,

    SLDLTS__MAX,
};
793 enum
795 SLDLT_B_STRIDE = SL1S_B_STRIDE,
796 SLDLT_D_STRIDE = FLDLT_D_STRIDE,
799 static unsigned restrictSolvingLDLTAllowedThreadCount(
800 dxThreadingBase *threading, unsigned allowedThreadCount, unsigned rowCount, unsigned &out_stageBlockCountSifficiencyMask);
802 static void doCooperativelySolveLDLTValidated(
803 dxRequiredResourceContainer *resourceContainer, unsigned allowedThreadCount, unsigned stageBlockCountSifficiencyMask,
804 const dReal *L, const dReal *d, dReal *b, unsigned rowCount, unsigned rowSkip);
808 #endif