Cosmetic: Newlines changed to be Unix style (LF) for consistency
[ode.git] / ode / src / fastsolve_impl.h
blob41ee42d9bfdf971e28739503b08bf1821bfadcd1
3 /*************************************************************************
4 * *
5 * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith. *
6 * All rights reserved. Email: russ@q12.org Web: www.q12.org *
7 * *
8 * This library is free software; you can redistribute it and/or *
9 * modify it under the terms of EITHER: *
10 * (1) The GNU Lesser General Public License as published by the Free *
11 * Software Foundation; either version 2.1 of the License, or (at *
12 * your option) any later version. The text of the GNU Lesser *
13 * General Public License is included with this library in the *
14 * file LICENSE.TXT. *
15 * (2) The BSD-style license that is included with this library in *
16 * the file LICENSE-BSD.TXT. *
17 * *
18 * This library is distributed in the hope that it will be useful, *
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files *
21 * LICENSE.TXT and LICENSE-BSD.TXT for more details. *
22 * *
23 *************************************************************************/
25 #ifndef _ODE_FASTSOLVE_IMPL_H_
26 #define _ODE_FASTSOLVE_IMPL_H_
29 /* solve L*X=B, with B containing 1 right hand sides.
30 * L is an n*n lower triangular matrix with ones on the diagonal.
31 * L is stored by rows and its leading dimension is lskip.
32 * B is an n*1 matrix that contains the right hand sides.
33 * B is stored by columns and its leading dimension is also lskip.
34 * B is overwritten with X.
35 * this processes blocks of 4*4.
36 * if this is in the factorizer source file, n must be a multiple of 4.
39 template<unsigned b_stride>
40 void dxtSolveL1 (const dReal *L, dReal *B, unsigned n, unsigned lskip1)
42 /* declare variables - Z matrix, p and q vectors, etc */
43 dReal Z11, Z21, Z31, Z41, p1, q1, p2, p3, p4, *ex;
44 const dReal *ell;
45 unsigned lskip2, lskip3, i, j;
46 /* compute lskip values */
47 lskip2 = 2 * lskip1;
48 lskip3 = 3 * lskip1;
49 /* compute all 4 x 1 blocks of X */
50 i = 0;
51 bool loop4 = n >= 4;
52 unsigned n4 = loop4 ? n - 4 : 0;
53 for (; loop4; loop4 = (i += 4) <= n4) {
54 /* compute all 4 x 1 block of X, from rows i..i+4-1 */
55 /* set the Z matrix to 0 */
56 Z11=0;
57 Z21=0;
58 Z31=0;
59 Z41=0;
60 ell = L + i * lskip1;
61 ex = B;
62 /* the inner loop that computes outer products and adds them to Z */
63 for (j = i; j >= 12; j -= 12) {
64 /* load p and q values */
65 p1=ell[0];
66 q1=ex[0 * b_stride];
67 p2=ell[lskip1];
68 p3=ell[lskip2];
69 p4=ell[lskip3];
70 /* compute outer product and add it to the Z matrix */
71 Z11 += p1 * q1;
72 Z21 += p2 * q1;
73 Z31 += p3 * q1;
74 Z41 += p4 * q1;
75 /* load p and q values */
76 p1=ell[1];
77 q1=ex[1 * b_stride];
78 p2=ell[1+lskip1];
79 p3=ell[1+lskip2];
80 p4=ell[1+lskip3];
81 /* compute outer product and add it to the Z matrix */
82 Z11 += p1 * q1;
83 Z21 += p2 * q1;
84 Z31 += p3 * q1;
85 Z41 += p4 * q1;
86 /* load p and q values */
87 p1=ell[2];
88 q1=ex[2 * b_stride];
89 p2=ell[2+lskip1];
90 p3=ell[2+lskip2];
91 p4=ell[2+lskip3];
92 /* compute outer product and add it to the Z matrix */
93 Z11 += p1 * q1;
94 Z21 += p2 * q1;
95 Z31 += p3 * q1;
96 Z41 += p4 * q1;
97 /* load p and q values */
98 p1=ell[3];
99 q1=ex[3 * b_stride];
100 p2=ell[3+lskip1];
101 p3=ell[3+lskip2];
102 p4=ell[3+lskip3];
103 /* compute outer product and add it to the Z matrix */
104 Z11 += p1 * q1;
105 Z21 += p2 * q1;
106 Z31 += p3 * q1;
107 Z41 += p4 * q1;
108 /* load p and q values */
109 p1=ell[4];
110 q1=ex[4 * b_stride];
111 p2=ell[4+lskip1];
112 p3=ell[4+lskip2];
113 p4=ell[4+lskip3];
114 /* compute outer product and add it to the Z matrix */
115 Z11 += p1 * q1;
116 Z21 += p2 * q1;
117 Z31 += p3 * q1;
118 Z41 += p4 * q1;
119 /* load p and q values */
120 p1=ell[5];
121 q1=ex[5 * b_stride];
122 p2=ell[5+lskip1];
123 p3=ell[5+lskip2];
124 p4=ell[5+lskip3];
125 /* compute outer product and add it to the Z matrix */
126 Z11 += p1 * q1;
127 Z21 += p2 * q1;
128 Z31 += p3 * q1;
129 Z41 += p4 * q1;
130 /* load p and q values */
131 p1=ell[6];
132 q1=ex[6 * b_stride];
133 p2=ell[6+lskip1];
134 p3=ell[6+lskip2];
135 p4=ell[6+lskip3];
136 /* compute outer product and add it to the Z matrix */
137 Z11 += p1 * q1;
138 Z21 += p2 * q1;
139 Z31 += p3 * q1;
140 Z41 += p4 * q1;
141 /* load p and q values */
142 p1=ell[7];
143 q1=ex[7 * b_stride];
144 p2=ell[7+lskip1];
145 p3=ell[7+lskip2];
146 p4=ell[7+lskip3];
147 /* compute outer product and add it to the Z matrix */
148 Z11 += p1 * q1;
149 Z21 += p2 * q1;
150 Z31 += p3 * q1;
151 Z41 += p4 * q1;
152 /* load p and q values */
153 p1=ell[8];
154 q1=ex[8 * b_stride];
155 p2=ell[8+lskip1];
156 p3=ell[8+lskip2];
157 p4=ell[8+lskip3];
158 /* compute outer product and add it to the Z matrix */
159 Z11 += p1 * q1;
160 Z21 += p2 * q1;
161 Z31 += p3 * q1;
162 Z41 += p4 * q1;
163 /* load p and q values */
164 p1=ell[9];
165 q1=ex[9 * b_stride];
166 p2=ell[9+lskip1];
167 p3=ell[9+lskip2];
168 p4=ell[9+lskip3];
169 /* compute outer product and add it to the Z matrix */
170 Z11 += p1 * q1;
171 Z21 += p2 * q1;
172 Z31 += p3 * q1;
173 Z41 += p4 * q1;
174 /* load p and q values */
175 p1=ell[10];
176 q1=ex[10 * b_stride];
177 p2=ell[10+lskip1];
178 p3=ell[10+lskip2];
179 p4=ell[10+lskip3];
180 /* compute outer product and add it to the Z matrix */
181 Z11 += p1 * q1;
182 Z21 += p2 * q1;
183 Z31 += p3 * q1;
184 Z41 += p4 * q1;
185 /* load p and q values */
186 p1=ell[11];
187 q1=ex[11 * b_stride];
188 p2=ell[11+lskip1];
189 p3=ell[11+lskip2];
190 p4=ell[11+lskip3];
191 /* compute outer product and add it to the Z matrix */
192 Z11 += p1 * q1;
193 Z21 += p2 * q1;
194 Z31 += p3 * q1;
195 Z41 += p4 * q1;
196 /* advance pointers */
197 ell += 12;
198 ex += 12 * b_stride;
199 /* end of inner loop */
201 /* compute left-over iterations */
202 for (; j > 0; --j) {
203 /* load p and q values */
204 p1=ell[0];
205 q1=ex[0 * b_stride];
206 p2=ell[lskip1];
207 p3=ell[lskip2];
208 p4=ell[lskip3];
209 /* compute outer product and add it to the Z matrix */
210 Z11 += p1 * q1;
211 Z21 += p2 * q1;
212 Z31 += p3 * q1;
213 Z41 += p4 * q1;
214 /* advance pointers */
215 ell += 1;
216 ex += 1 * b_stride;
218 /* finish computing the X(i) block */
219 Z11 = ex[0 * b_stride] - Z11;
220 ex[0 * b_stride] = Z11;
221 p1 = ell[lskip1];
222 Z21 = ex[1 * b_stride] - Z21 - p1*Z11;
223 ex[1 * b_stride] = Z21;
224 p1 = ell[lskip2];
225 p2 = ell[1+lskip2];
226 Z31 = ex[2 * b_stride] - Z31 - p1*Z11 - p2*Z21;
227 ex[2 * b_stride] = Z31;
228 p1 = ell[lskip3];
229 p2 = ell[1+lskip3];
230 p3 = ell[2+lskip3];
231 Z41 = ex[3 * b_stride] - Z41 - p1*Z11 - p2*Z21 - p3*Z31;
232 ex[3 * b_stride] = Z41;
233 /* end of outer loop */
235 /* compute rows at end that are not a multiple of block size */
236 for (; i < n; ++i) {
237 /* compute all 1 x 1 block of X, from rows i..i+1-1 */
238 /* set the Z matrix to 0 */
239 Z11=0;
240 ell = L + i*lskip1;
241 ex = B;
242 /* the inner loop that computes outer products and adds them to Z */
243 for (j = i; j >= 12; j -= 12) {
244 /* load p and q values */
245 p1=ell[0];
246 q1=ex[0 * b_stride];
247 /* compute outer product and add it to the Z matrix */
248 Z11 += p1 * q1;
249 /* load p and q values */
250 p1=ell[1];
251 q1=ex[1 * b_stride];
252 /* compute outer product and add it to the Z matrix */
253 Z11 += p1 * q1;
254 /* load p and q values */
255 p1=ell[2];
256 q1=ex[2 * b_stride];
257 /* compute outer product and add it to the Z matrix */
258 Z11 += p1 * q1;
259 /* load p and q values */
260 p1=ell[3];
261 q1=ex[3 * b_stride];
262 /* compute outer product and add it to the Z matrix */
263 Z11 += p1 * q1;
264 /* load p and q values */
265 p1=ell[4];
266 q1=ex[4 * b_stride];
267 /* compute outer product and add it to the Z matrix */
268 Z11 += p1 * q1;
269 /* load p and q values */
270 p1=ell[5];
271 q1=ex[5 * b_stride];
272 /* compute outer product and add it to the Z matrix */
273 Z11 += p1 * q1;
274 /* load p and q values */
275 p1=ell[6];
276 q1=ex[6 * b_stride];
277 /* compute outer product and add it to the Z matrix */
278 Z11 += p1 * q1;
279 /* load p and q values */
280 p1=ell[7];
281 q1=ex[7 * b_stride];
282 /* compute outer product and add it to the Z matrix */
283 Z11 += p1 * q1;
284 /* load p and q values */
285 p1=ell[8];
286 q1=ex[8 * b_stride];
287 /* compute outer product and add it to the Z matrix */
288 Z11 += p1 * q1;
289 /* load p and q values */
290 p1=ell[9];
291 q1=ex[9 * b_stride];
292 /* compute outer product and add it to the Z matrix */
293 Z11 += p1 * q1;
294 /* load p and q values */
295 p1=ell[10];
296 q1=ex[10 * b_stride];
297 /* compute outer product and add it to the Z matrix */
298 Z11 += p1 * q1;
299 /* load p and q values */
300 p1=ell[11];
301 q1=ex[11 * b_stride];
302 /* compute outer product and add it to the Z matrix */
303 Z11 += p1 * q1;
304 /* advance pointers */
305 ell += 12;
306 ex += 12 * b_stride;
307 /* end of inner loop */
309 /* compute left-over iterations */
310 for (; j > 0; --j) {
311 /* load p and q values */
312 p1=ell[0];
313 q1=ex[0 * b_stride];
314 /* compute outer product and add it to the Z matrix */
315 Z11 += p1 * q1;
316 /* advance pointers */
317 ell += 1;
318 ex += 1 * b_stride;
320 /* finish computing the X(i) block */
321 Z11 = ex[0 * b_stride] - Z11;
322 ex[0 * b_stride] = Z11;
327 #endif