3 /*************************************************************************
5 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
6 * All rights reserved. Email: russ@q12.org Web: www.q12.org *
8 * This library is free software; you can redistribute it and/or *
9 * modify it under the terms of EITHER: *
10 * (1) The GNU Lesser General Public License as published by the Free *
11 * Software Foundation; either version 2.1 of the License, or (at *
12 * your option) any later version. The text of the GNU Lesser *
13 * General Public License is included with this library in the *
15 * (2) The BSD-style license that is included with this library in *
16 * the file LICENSE-BSD.TXT. *
18 * This library is distributed in the hope that it will be useful, *
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
21 * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
23 *************************************************************************/
25 #ifndef _ODE_FASTSOLVE_IMPL_H_
26 #define _ODE_FASTSOLVE_IMPL_H_
29 /* solve L*X=B, with B containing 1 right hand sides.
30 * L is an n*n lower triangular matrix with ones on the diagonal.
31 * L is stored by rows and its leading dimension is lskip.
32 * B is an n*1 matrix that contains the right hand sides.
33 * B is stored by columns and its leading dimension is also lskip.
34 * B is overwritten with X.
35 * this processes blocks of 4*4.
36 * if this is in the factorizer source file, n must be a multiple of 4.
39 template<unsigned b_stride
>
40 void dxtSolveL1 (const dReal
*L
, dReal
*B
, unsigned n
, unsigned lskip1
)
42 /* declare variables - Z matrix, p and q vectors, etc */
43 dReal Z11
, Z21
, Z31
, Z41
, p1
, q1
, p2
, p3
, p4
, *ex
;
45 unsigned lskip2
, lskip3
, i
, j
;
46 /* compute lskip values */
49 /* compute all 4 x 1 blocks of X */
52 unsigned n4
= loop4
? n
- 4 : 0;
53 for (; loop4
; loop4
= (i
+= 4) <= n4
) {
54 /* compute all 4 x 1 block of X, from rows i..i+4-1 */
55 /* set the Z matrix to 0 */
62 /* the inner loop that computes outer products and adds them to Z */
63 for (j
= i
; j
>= 12; j
-= 12) {
64 /* load p and q values */
70 /* compute outer product and add it to the Z matrix */
75 /* load p and q values */
81 /* compute outer product and add it to the Z matrix */
86 /* load p and q values */
92 /* compute outer product and add it to the Z matrix */
97 /* load p and q values */
103 /* compute outer product and add it to the Z matrix */
108 /* load p and q values */
114 /* compute outer product and add it to the Z matrix */
119 /* load p and q values */
125 /* compute outer product and add it to the Z matrix */
130 /* load p and q values */
136 /* compute outer product and add it to the Z matrix */
141 /* load p and q values */
147 /* compute outer product and add it to the Z matrix */
152 /* load p and q values */
158 /* compute outer product and add it to the Z matrix */
163 /* load p and q values */
169 /* compute outer product and add it to the Z matrix */
174 /* load p and q values */
176 q1
=ex
[10 * b_stride
];
180 /* compute outer product and add it to the Z matrix */
185 /* load p and q values */
187 q1
=ex
[11 * b_stride
];
191 /* compute outer product and add it to the Z matrix */
196 /* advance pointers */
199 /* end of inner loop */
201 /* compute left-over iterations */
203 /* load p and q values */
209 /* compute outer product and add it to the Z matrix */
214 /* advance pointers */
218 /* finish computing the X(i) block */
219 Z11
= ex
[0 * b_stride
] - Z11
;
220 ex
[0 * b_stride
] = Z11
;
222 Z21
= ex
[1 * b_stride
] - Z21
- p1
*Z11
;
223 ex
[1 * b_stride
] = Z21
;
226 Z31
= ex
[2 * b_stride
] - Z31
- p1
*Z11
- p2
*Z21
;
227 ex
[2 * b_stride
] = Z31
;
231 Z41
= ex
[3 * b_stride
] - Z41
- p1
*Z11
- p2
*Z21
- p3
*Z31
;
232 ex
[3 * b_stride
] = Z41
;
233 /* end of outer loop */
235 /* compute rows at end that are not a multiple of block size */
237 /* compute all 1 x 1 block of X, from rows i..i+1-1 */
238 /* set the Z matrix to 0 */
242 /* the inner loop that computes outer products and adds them to Z */
243 for (j
= i
; j
>= 12; j
-= 12) {
244 /* load p and q values */
247 /* compute outer product and add it to the Z matrix */
249 /* load p and q values */
252 /* compute outer product and add it to the Z matrix */
254 /* load p and q values */
257 /* compute outer product and add it to the Z matrix */
259 /* load p and q values */
262 /* compute outer product and add it to the Z matrix */
264 /* load p and q values */
267 /* compute outer product and add it to the Z matrix */
269 /* load p and q values */
272 /* compute outer product and add it to the Z matrix */
274 /* load p and q values */
277 /* compute outer product and add it to the Z matrix */
279 /* load p and q values */
282 /* compute outer product and add it to the Z matrix */
284 /* load p and q values */
287 /* compute outer product and add it to the Z matrix */
289 /* load p and q values */
292 /* compute outer product and add it to the Z matrix */
294 /* load p and q values */
296 q1
=ex
[10 * b_stride
];
297 /* compute outer product and add it to the Z matrix */
299 /* load p and q values */
301 q1
=ex
[11 * b_stride
];
302 /* compute outer product and add it to the Z matrix */
304 /* advance pointers */
307 /* end of inner loop */
309 /* compute left-over iterations */
311 /* load p and q values */
314 /* compute outer product and add it to the Z matrix */
316 /* advance pointers */
320 /* finish computing the X(i) block */
321 Z11
= ex
[0 * b_stride
] - Z11
;
322 ex
[0 * b_stride
] = Z11
;