ode/src/fastsolve_impl.h

   1
   2
   3 /*************************************************************************
   4  *                                                                       *
   5  * Open Dynamics Engine, Copyright (C) 2001,2002 Russell L. Smith.       *
   6  * All rights reserved.  Email: russ@q12.org   Web: www.q12.org          *
   7  *                                                                       *
   8  * This library is free software; you can redistribute it and/or         *
   9  * modify it under the terms of EITHER:                                  *
  10  *   (1) The GNU Lesser General Public License as published by the Free  *
  11  *       Software Foundation; either version 2.1 of the License, or (at  *
  12  *       your option) any later version. The text of the GNU Lesser      *
  13  *       General Public License is included with this library in the     *
  14  *       file LICENSE.TXT.                                               *
  15  *   (2) The BSD-style license that is included with this library in     *
  16  *       the file LICENSE-BSD.TXT.                                       *
  17  *                                                                       *
  18  * This library is distributed in the hope that it will be useful,       *
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of        *
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the files    *
  21  * LICENSE.TXT and LICENSE-BSD.TXT for more details.                     *
  22  *                                                                       *
  23  *************************************************************************/
  24
  25 #ifndef _ODE_FASTSOLVE_IMPL_H_
  26 #define _ODE_FASTSOLVE_IMPL_H_
  27
  28
  29 /* solve L*X=B, with B containing 1 right hand sides.
  30  * L is an n*n lower triangular matrix with ones on the diagonal.
  31  * L is stored by rows and its leading dimension is lskip.
  32  * B is an n*1 matrix that contains the right hand sides.
  33  * B is stored by columns and its leading dimension is also lskip.
  34  * B is overwritten with X.
  35  * this processes blocks of 4*4.
  36  * if this is in the factorizer source file, n must be a multiple of 4.
  37  */
  38
  39 template<unsigned b_stride>
  40 void dxtSolveL1 (const dReal *L, dReal *B, unsigned n, unsigned lskip1)
  41 {
  42     /* declare variables - Z matrix, p and q vectors, etc */
  43     dReal Z11, Z21, Z31, Z41, p1, q1, p2, p3, p4, *ex;
  44     const dReal *ell;
  45     unsigned lskip2, lskip3, i, j;
  46     /* compute lskip values */
  47     lskip2 = 2 * lskip1;
  48     lskip3 = 3 * lskip1;
  49     /* compute all 4 x 1 blocks of X */
  50     i = 0;
  51     bool loop4 = n >= 4;
  52     unsigned n4 = loop4 ? n - 4 : 0;
  53     for (; loop4; loop4 = (i += 4) <= n4) {
  54         /* compute all 4 x 1 block of X, from rows i..i+4-1 */
  55         /* set the Z matrix to 0 */
  56         Z11=0;
  57         Z21=0;
  58         Z31=0;
  59         Z41=0;
  60         ell = L + i * lskip1;
  61         ex = B;
  62         /* the inner loop that computes outer products and adds them to Z */
  63         for (j = i; j >= 12; j -= 12) {
  64             /* load p and q values */
  65             p1=ell[0];
  66             q1=ex[0 * b_stride];
  67             p2=ell[lskip1];
  68             p3=ell[lskip2];
  69             p4=ell[lskip3];
  70             /* compute outer product and add it to the Z matrix */
  71             Z11 += p1 * q1;
  72             Z21 += p2 * q1;
  73             Z31 += p3 * q1;
  74             Z41 += p4 * q1;
  75             /* load p and q values */
  76             p1=ell[1];
  77             q1=ex[1 * b_stride];
  78             p2=ell[1+lskip1];
  79             p3=ell[1+lskip2];
  80             p4=ell[1+lskip3];
  81             /* compute outer product and add it to the Z matrix */
  82             Z11 += p1 * q1;
  83             Z21 += p2 * q1;
  84             Z31 += p3 * q1;
  85             Z41 += p4 * q1;
  86             /* load p and q values */
  87             p1=ell[2];
  88             q1=ex[2 * b_stride];
  89             p2=ell[2+lskip1];
  90             p3=ell[2+lskip2];
  91             p4=ell[2+lskip3];
  92             /* compute outer product and add it to the Z matrix */
  93             Z11 += p1 * q1;
  94             Z21 += p2 * q1;
  95             Z31 += p3 * q1;
  96             Z41 += p4 * q1;
  97             /* load p and q values */
  98             p1=ell[3];
  99             q1=ex[3 * b_stride];
 100             p2=ell[3+lskip1];
 101             p3=ell[3+lskip2];
 102             p4=ell[3+lskip3];
 103             /* compute outer product and add it to the Z matrix */
 104             Z11 += p1 * q1;
 105             Z21 += p2 * q1;
 106             Z31 += p3 * q1;
 107             Z41 += p4 * q1;
 108             /* load p and q values */
 109             p1=ell[4];
 110             q1=ex[4 * b_stride];
 111             p2=ell[4+lskip1];
 112             p3=ell[4+lskip2];
 113             p4=ell[4+lskip3];
 114             /* compute outer product and add it to the Z matrix */
 115             Z11 += p1 * q1;
 116             Z21 += p2 * q1;
 117             Z31 += p3 * q1;
 118             Z41 += p4 * q1;
 119             /* load p and q values */
 120             p1=ell[5];
 121             q1=ex[5 * b_stride];
 122             p2=ell[5+lskip1];
 123             p3=ell[5+lskip2];
 124             p4=ell[5+lskip3];
 125             /* compute outer product and add it to the Z matrix */
 126             Z11 += p1 * q1;
 127             Z21 += p2 * q1;
 128             Z31 += p3 * q1;
 129             Z41 += p4 * q1;
 130             /* load p and q values */
 131             p1=ell[6];
 132             q1=ex[6 * b_stride];
 133             p2=ell[6+lskip1];
 134             p3=ell[6+lskip2];
 135             p4=ell[6+lskip3];
 136             /* compute outer product and add it to the Z matrix */
 137             Z11 += p1 * q1;
 138             Z21 += p2 * q1;
 139             Z31 += p3 * q1;
 140             Z41 += p4 * q1;
 141             /* load p and q values */
 142             p1=ell[7];
 143             q1=ex[7 * b_stride];
 144             p2=ell[7+lskip1];
 145             p3=ell[7+lskip2];
 146             p4=ell[7+lskip3];
 147             /* compute outer product and add it to the Z matrix */
 148             Z11 += p1 * q1;
 149             Z21 += p2 * q1;
 150             Z31 += p3 * q1;
 151             Z41 += p4 * q1;
 152             /* load p and q values */
 153             p1=ell[8];
 154             q1=ex[8 * b_stride];
 155             p2=ell[8+lskip1];
 156             p3=ell[8+lskip2];
 157             p4=ell[8+lskip3];
 158             /* compute outer product and add it to the Z matrix */
 159             Z11 += p1 * q1;
 160             Z21 += p2 * q1;
 161             Z31 += p3 * q1;
 162             Z41 += p4 * q1;
 163             /* load p and q values */
 164             p1=ell[9];
 165             q1=ex[9 * b_stride];
 166             p2=ell[9+lskip1];
 167             p3=ell[9+lskip2];
 168             p4=ell[9+lskip3];
 169             /* compute outer product and add it to the Z matrix */
 170             Z11 += p1 * q1;
 171             Z21 += p2 * q1;
 172             Z31 += p3 * q1;
 173             Z41 += p4 * q1;
 174             /* load p and q values */
 175             p1=ell[10];
 176             q1=ex[10 * b_stride];
 177             p2=ell[10+lskip1];
 178             p3=ell[10+lskip2];
 179             p4=ell[10+lskip3];
 180             /* compute outer product and add it to the Z matrix */
 181             Z11 += p1 * q1;
 182             Z21 += p2 * q1;
 183             Z31 += p3 * q1;
 184             Z41 += p4 * q1;
 185             /* load p and q values */
 186             p1=ell[11];
 187             q1=ex[11 * b_stride];
 188             p2=ell[11+lskip1];
 189             p3=ell[11+lskip2];
 190             p4=ell[11+lskip3];
 191             /* compute outer product and add it to the Z matrix */
 192             Z11 += p1 * q1;
 193             Z21 += p2 * q1;
 194             Z31 += p3 * q1;
 195             Z41 += p4 * q1;
 196             /* advance pointers */
 197             ell += 12;
 198             ex += 12 * b_stride;
 199             /* end of inner loop */
 200         }
 201         /* compute left-over iterations */
 202         for (; j > 0; --j) {
 203             /* load p and q values */
 204             p1=ell[0];
 205             q1=ex[0 * b_stride];
 206             p2=ell[lskip1];
 207             p3=ell[lskip2];
 208             p4=ell[lskip3];
 209             /* compute outer product and add it to the Z matrix */
 210             Z11 += p1 * q1;
 211             Z21 += p2 * q1;
 212             Z31 += p3 * q1;
 213             Z41 += p4 * q1;
 214             /* advance pointers */
 215             ell += 1;
 216             ex += 1 * b_stride;
 217         }
 218         /* finish computing the X(i) block */
 219         Z11 = ex[0 * b_stride] - Z11;
 220         ex[0 * b_stride] = Z11;
 221         p1 = ell[lskip1];
 222         Z21 = ex[1 * b_stride] - Z21 - p1*Z11;
 223         ex[1 * b_stride] = Z21;
 224         p1 = ell[lskip2];
 225         p2 = ell[1+lskip2];
 226         Z31 = ex[2 * b_stride] - Z31 - p1*Z11 - p2*Z21;
 227         ex[2 * b_stride] = Z31;
 228         p1 = ell[lskip3];
 229         p2 = ell[1+lskip3];
 230         p3 = ell[2+lskip3];
 231         Z41 = ex[3 * b_stride] - Z41 - p1*Z11 - p2*Z21 - p3*Z31;
 232         ex[3 * b_stride] = Z41;
 233         /* end of outer loop */
 234     }
 235     /* compute rows at end that are not a multiple of block size */
 236     for (; i < n; ++i) {
 237         /* compute all 1 x 1 block of X, from rows i..i+1-1 */
 238         /* set the Z matrix to 0 */
 239         Z11=0;
 240         ell = L + i*lskip1;
 241         ex = B;
 242         /* the inner loop that computes outer products and adds them to Z */
 243         for (j = i; j >= 12; j -= 12) {
 244             /* load p and q values */
 245             p1=ell[0];
 246             q1=ex[0 * b_stride];
 247             /* compute outer product and add it to the Z matrix */
 248             Z11 += p1 * q1;
 249             /* load p and q values */
 250             p1=ell[1];
 251             q1=ex[1 * b_stride];
 252             /* compute outer product and add it to the Z matrix */
 253             Z11 += p1 * q1;
 254             /* load p and q values */
 255             p1=ell[2];
 256             q1=ex[2 * b_stride];
 257             /* compute outer product and add it to the Z matrix */
 258             Z11 += p1 * q1;
 259             /* load p and q values */
 260             p1=ell[3];
 261             q1=ex[3 * b_stride];
 262             /* compute outer product and add it to the Z matrix */
 263             Z11 += p1 * q1;
 264             /* load p and q values */
 265             p1=ell[4];
 266             q1=ex[4 * b_stride];
 267             /* compute outer product and add it to the Z matrix */
 268             Z11 += p1 * q1;
 269             /* load p and q values */
 270             p1=ell[5];
 271             q1=ex[5 * b_stride];
 272             /* compute outer product and add it to the Z matrix */
 273             Z11 += p1 * q1;
 274             /* load p and q values */
 275             p1=ell[6];
 276             q1=ex[6 * b_stride];
 277             /* compute outer product and add it to the Z matrix */
 278             Z11 += p1 * q1;
 279             /* load p and q values */
 280             p1=ell[7];
 281             q1=ex[7 * b_stride];
 282             /* compute outer product and add it to the Z matrix */
 283             Z11 += p1 * q1;
 284             /* load p and q values */
 285             p1=ell[8];
 286             q1=ex[8 * b_stride];
 287             /* compute outer product and add it to the Z matrix */
 288             Z11 += p1 * q1;
 289             /* load p and q values */
 290             p1=ell[9];
 291             q1=ex[9 * b_stride];
 292             /* compute outer product and add it to the Z matrix */
 293             Z11 += p1 * q1;
 294             /* load p and q values */
 295             p1=ell[10];
 296             q1=ex[10 * b_stride];
 297             /* compute outer product and add it to the Z matrix */
 298             Z11 += p1 * q1;
 299             /* load p and q values */
 300             p1=ell[11];
 301             q1=ex[11 * b_stride];
 302             /* compute outer product and add it to the Z matrix */
 303             Z11 += p1 * q1;
 304             /* advance pointers */
 305             ell += 12;
 306             ex += 12 * b_stride;
 307             /* end of inner loop */
 308         }
 309         /* compute left-over iterations */
 310         for (; j > 0; --j) {
 311             /* load p and q values */
 312             p1=ell[0];
 313             q1=ex[0 * b_stride];
 314             /* compute outer product and add it to the Z matrix */
 315             Z11 += p1 * q1;
 316             /* advance pointers */
 317             ell += 1;
 318             ex += 1 * b_stride;
 319         }
 320         /* finish computing the X(i) block */
 321         Z11 = ex[0 * b_stride] - Z11;
 322         ex[0 * b_stride] = Z11;
 323     }
 324 }
 325
 326
 327 #endif