include/gmx_sse2_single.h

   1 /*
   2  *                This source code is part of
   3  *
   4  *                 G   R   O   M   A   C   S
   5  *
   6  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   7  * Copyright (c) 2001-2009, The GROMACS Development Team
   8  *
   9  * Gromacs is a library for molecular simulation and trajectory analysis,
  10  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
  11  * a full list of developers and information, check out http://www.gromacs.org
  12  *
  13  * This program is free software; you can redistribute it and/or modify it under
  14  * the terms of the GNU Lesser General Public License as published by the Free
  15  * Software Foundation; either version 2 of the License, or (at your option) any
  16  * later version.
  17  * As a special exception, you may use this file as part of a free software
  18  * library without restriction.  Specifically, if other files instantiate
  19  * templates or use macros or inline functions from this file, or you compile
  20  * this file and link it with other files to produce an executable, this
  21  * file does not by itself cause the resulting executable to be covered by
  22  * the GNU Lesser General Public License.
  23  *
  24  * In plain-speak: do not worry about classes/macros/templates either - only
  25  * changes to the library have to be LGPL, not an application linking with it.
  26  *
  27  * To help fund GROMACS development, we humbly ask that you cite
  28  * the papers people have written on it - you can find them on the website!
  29  */
  30 #ifdef HAVE_CONFIG_H
  31 #include <config.h>
  32 #endif
  33
  34 /* We require SSE2 now! */
  35
  36 #include <math.h>
  37
  38
  39 #include <xmmintrin.h> /* SSE */
  40 #include <emmintrin.h> /* SSE2 */
  41
  42 #ifdef GMX_SSE3
  43 #  include <pmmintrin.h> /* SSE3 */
  44 #endif
  45 #ifdef GMX_SSE4
  46 #  include <smmintrin.h> /* SSE4.1 */
  47 #endif
  48
  49 #include <stdio.h>
  50
  51 /***************************************************
  52  *                                                 *
  53  * COMPILER RANT WARNING:                          *
  54  *                                                 *
  55  * Ideally, this header would be filled with       *
  56  * simple static inline functions. Unfortunately,  *
  57  * many vendors provide really braindead compilers *
  58  * that either cannot handle more than 1-2 SSE     *
  59  * function parameters, and some cannot handle     *
  60  * pointers to SSE __m128 datatypes as parameters  *
  61  * at all. Thus, for portability we have had to    *
  62  * implement all but the simplest routines as      *
  63  * macros instead...                               *
  64  *                                                 *
  65  ***************************************************/
  66
  67
  68 /***************************************************
  69  *                                                 *
  70  *   Wrappers/replacements for some instructions   *
  71  *   not available in all SSE versions.            *
  72  *                                                 *
  73  ***************************************************/
  74
  75 #ifdef GMX_SSE4
  76 #  define gmx_mm_extract_epi32(x, imm) _mm_extract_epi32(x,imm)
  77 #else
  78 #  define gmx_mm_extract_epi32(x, imm) _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
  79 #endif
  80
  81 /*
  82  * Some compilers require a cast to change the interpretation
  83  * of a register from FP to Int and vice versa, and not all of
  84  * the provide instructions to do this. Roll our own wrappers...
  85  */
  86
  87 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
  88 #  define gmx_mm_castsi128_ps(a) _mm_castsi128_ps(a)
  89 #  define gmx_mm_castps_si128(a) _mm_castps_si128(a)
  90 #elif defined(__GNUC__)
  91 #  define gmx_mm_castsi128_ps(a) ((__m128)(a))
  92 #  define gmx_mm_castps_si128(a) ((__m128i)(a))
  93 #else
  94 static __m128  gmx_mm_castsi128_ps(__m128i a) { return *(__m128 *) &a;  }
  95 static __m128i gmx_mm_castps_si128(__m128 a)  { return *(__m128i *) &a; }
  96 #endif
  97
  98
  99
 100 /* IO functions, just for debugging */
 101
 102 static void
 103 printxmm(const char *s,__m128 xmm)
 104 {
 105         float f[4];
 106
 107         _mm_storeu_ps(f,xmm);
 108         printf("%s: %8.5g %8.5g %8.5g %8.5g\n",s,f[0],f[1],f[2],f[3]);
 109 }
 110
 111
 112 static void
 113 printxmmsum(const char *s,__m128 xmm)
 114 {
 115         float f[4];
 116
 117         _mm_storeu_ps(f,xmm);
 118         printf("%s (sum): %15.10g\n",s,f[0]+f[1]+f[2]+f[3]);
 119 }
 120
 121
 122 static void
 123 printxmmi(const char *s,__m128i xmmi)
 124 {
 125     int i[4];
 126
 127     _mm_storeu_si128((__m128i *)i,xmmi);
 128     printf("%10s: %2d %2d %2d %2d\n",s,i[0],i[1],i[2],i[3]);
 129 }
 130
 131
 132 /************************
 133  *                      *
 134  * Simple math routines *
 135  *                      *
 136  ************************/
 137
 138 static inline __m128
 139 gmx_mm_invsqrt_ps(__m128 x)
 140 {
 141     const __m128 half  = {0.5,0.5,0.5,0.5};
 142     const __m128 three = {3.0,3.0,3.0,3.0};
 143
 144     __m128 lu = _mm_rsqrt_ps(x);
 145
 146     return _mm_mul_ps(half,_mm_mul_ps(_mm_sub_ps(three,_mm_mul_ps(_mm_mul_ps(lu,lu),x)),lu));
 147 }
 148
 149 static inline __m128
 150 gmx_mm_inv_ps(__m128 x)
 151 {
 152         const __m128 two = {2.0f,2.0f,2.0f,2.0f};
 153
 154     __m128 lu = _mm_rcp_ps(x);
 155
 156         return _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,x)));
 157 }
 158
 159
 160 static inline __m128
 161 gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
 162 {
 163     return _mm_add_ps( _mm_add_ps( _mm_mul_ps(dx,dx), _mm_mul_ps(dy,dy) ), _mm_mul_ps(dz,dz) );
 164 }
 165
 166 /* Normal sum of four xmm registers */
 167 static inline __m128
 168 gmx_mm_sum4_ps(__m128 t0, __m128 t1, __m128 t2, __m128 t3)
 169 {
 170     t0 = _mm_add_ps(t0,t1);
 171     t2 = _mm_add_ps(t2,t3);
 172     return _mm_add_ps(t0,t2);
 173 }
 174
 175
 176 static __m128
 177 gmx_mm_log_ps(__m128 x)
 178 {
 179         const __m128 exp_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000) );
 180         const __m128 one_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000) );
 181         const __m128 off_ps  = gmx_mm_castsi128_ps( _mm_set_epi32(0x3FBF8000, 0x3FBF8000, 0x3FBF8000, 0x3FBF8000) );
 182         const __m128 mant_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF) );
 183         const __m128 base_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x43800000, 0x43800000, 0x43800000, 0x43800000) );
 184         const __m128 loge_ps = gmx_mm_castsi128_ps( _mm_set_epi32(0x3F317218, 0x3F317218, 0x3F317218, 0x3F317218) );
 185
 186         const __m128 D5      = gmx_mm_castsi128_ps( _mm_set_epi32(0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5, 0xBD0D0CC5) );
 187         const __m128 D4      = gmx_mm_castsi128_ps( _mm_set_epi32(0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD, 0x3EA2ECDD) );
 188         const __m128 D3      = gmx_mm_castsi128_ps( _mm_set_epi32(0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9, 0xBF9dA2C9) );
 189         const __m128 D2      = gmx_mm_castsi128_ps( _mm_set_epi32(0x4026537B, 0x4026537B, 0x4026537B, 0x4026537B) );
 190         const __m128 D1      = gmx_mm_castsi128_ps( _mm_set_epi32(0xC054bFAD, 0xC054bFAD, 0xC054bFAD, 0xC054bFAD) );
 191         const __m128 D0      = gmx_mm_castsi128_ps( _mm_set_epi32(0x4047691A, 0x4047691A, 0x4047691A, 0x4047691A) );
 192
 193         __m128  xmm0,xmm1,xmm2;
 194
 195         xmm0  = x;
 196         xmm1  = xmm0;
 197         xmm1  = _mm_and_ps(xmm1, exp_ps);
 198         xmm1 = gmx_mm_castsi128_ps( _mm_srli_epi32( gmx_mm_castps_si128(xmm1),8) );
 199
 200         xmm1  = _mm_or_ps(xmm1, one_ps);
 201         xmm1  = _mm_sub_ps(xmm1, off_ps);
 202
 203         xmm1  = _mm_mul_ps(xmm1, base_ps);
 204         xmm0  = _mm_and_ps(xmm0, mant_ps);
 205         xmm0  = _mm_or_ps(xmm0, one_ps);
 206
 207         xmm2  = _mm_mul_ps(xmm0, D5);
 208         xmm2  = _mm_add_ps(xmm2, D4);
 209         xmm2  = _mm_mul_ps(xmm2,xmm0);
 210         xmm2  = _mm_add_ps(xmm2, D3);
 211         xmm2  = _mm_mul_ps(xmm2,xmm0);
 212         xmm2  = _mm_add_ps(xmm2, D2);
 213         xmm2  = _mm_mul_ps(xmm2,xmm0);
 214         xmm2  = _mm_add_ps(xmm2, D1);
 215         xmm2  = _mm_mul_ps(xmm2,xmm0);
 216         xmm2  = _mm_add_ps(xmm2, D0);
 217         xmm0  = _mm_sub_ps(xmm0, one_ps);
 218         xmm0  = _mm_mul_ps(xmm0,xmm2);
 219         xmm1  = _mm_add_ps(xmm1,xmm0);
 220
 221         x     = xmm1;
 222         x  = _mm_mul_ps(x, loge_ps);
 223
 224     return x;
 225 }
 226
 227
 228 /* This exp-routine has a relative precision of 2^-22.33 bits (essentially single precision :-) ) */
 229 static __m128
 230 gmx_mm_exp_ps(__m128 x)
 231 {
 232     const __m128i half = _mm_set_epi32(0x3F000000, 0x3F000000, 0x3F000000, 0x3F000000);   // 0.5e+0f
 233     const __m128i base = _mm_set_epi32(0x0000007F, 0x0000007F, 0x0000007F, 0x0000007F);   // 127
 234         const __m128i CC   = _mm_set_epi32(0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B);   // log2(e)
 235
 236     const __m128i D5   = _mm_set_epi32(0x3AF61905, 0x3AF61905, 0x3AF61905, 0x3AF61905);   // 1.8775767e-3f
 237     const __m128i D4   = _mm_set_epi32(0x3C134806, 0x3C134806, 0x3C134806, 0x3C134806);   // 8.9893397e-3f
 238     const __m128i D3   = _mm_set_epi32(0x3D64AA23, 0x3D64AA23, 0x3D64AA23, 0x3D64AA23);   // 5.5826318e-2f
 239     const __m128i D2   = _mm_set_epi32(0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4, 0x3E75EAD4);   // 2.4015361e-1f
 240     const __m128i D1   = _mm_set_epi32(0x3F31727B, 0x3F31727B, 0x3F31727B, 0x3F31727B);   // 6.9315308e-1f
 241     const __m128i D0   = _mm_set_epi32(0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF, 0x3F7FFFFF);   // 9.9999994e-1f
 242
 243         __m128 xmm0,xmm1;
 244         __m128i xmm2;
 245
 246         xmm0 = _mm_mul_ps(x,(__m128) CC);
 247         xmm1 = _mm_sub_ps(xmm0, (__m128) half);
 248         xmm2 = _mm_cvtps_epi32(xmm1);
 249         xmm1 = _mm_cvtepi32_ps(xmm2);
 250
 251         xmm2 = _mm_add_epi32(xmm2,(__m128i) base);
 252         xmm2 = _mm_slli_epi32(xmm2,23);
 253
 254         xmm0 = _mm_sub_ps(xmm0,xmm1);
 255         xmm1 = _mm_mul_ps(xmm0,(__m128) D5);
 256         xmm1 = _mm_add_ps(xmm1,(__m128) D4);
 257         xmm1 = _mm_mul_ps(xmm1,xmm0);
 258         xmm1 = _mm_add_ps(xmm1,(__m128) D3);
 259         xmm1 = _mm_mul_ps(xmm1,xmm0);
 260         xmm1 = _mm_add_ps(xmm1,(__m128) D2);
 261         xmm1 = _mm_mul_ps(xmm1,xmm0);
 262         xmm1 = _mm_add_ps(xmm1,(__m128) D1);
 263         xmm1 = _mm_mul_ps(xmm1,xmm0);
 264         xmm1 = _mm_add_ps(xmm1,(__m128) D0);
 265         xmm1 = _mm_mul_ps(xmm1,(__m128) xmm2);
 266
 267         /* 18 instructions currently */
 268         return xmm1;
 269 }
 270
 271
 272
 273 #define GMX_MM_SINCOS_PS(x,sinval,cosval)                                                                    \
 274 {                                                                                                            \
 275         const __m128 _sincosf_two_over_pi = {2.0/M_PI,2.0/M_PI,2.0/M_PI,2.0/M_PI};                               \
 276     const __m128 _sincosf_half        = {0.5,0.5,0.5,0.5};                                                   \
 277     const __m128 _sincosf_one         = {1.0,1.0,1.0,1.0};                                                   \
 278                                                                                                              \
 279         const __m128i _sincosf_izero      = _mm_set1_epi32(0);                                                   \
 280     const __m128i _sincosf_ione       = _mm_set1_epi32(1);                                                   \
 281     const __m128i _sincosf_itwo       = _mm_set1_epi32(2);                                                   \
 282     const __m128i _sincosf_ithree     = _mm_set1_epi32(3);                                                   \
 283                                                                                                              \
 284         const __m128 _sincosf_kc1 = {1.57079625129,1.57079625129,1.57079625129,1.57079625129};                   \
 285     const __m128 _sincosf_kc2 = {7.54978995489e-8,7.54978995489e-8,7.54978995489e-8,7.54978995489e-8};       \
 286         const __m128 _sincosf_cc0 = {-0.0013602249,-0.0013602249,-0.0013602249,-0.0013602249};                   \
 287     const __m128 _sincosf_cc1 = {0.0416566950,0.0416566950,0.0416566950,0.0416566950};                       \
 288     const __m128 _sincosf_cc2 = {-0.4999990225,-0.4999990225,-0.4999990225,-0.4999990225};                   \
 289         const __m128 _sincosf_sc0 = {-0.0001950727,-0.0001950727,-0.0001950727,-0.0001950727};                   \
 290     const __m128 _sincosf_sc1 = {0.0083320758,0.0083320758,0.0083320758,0.0083320758};                       \
 291     const __m128 _sincosf_sc2 = {-0.1666665247,-0.1666665247,-0.1666665247,-0.1666665247};                   \
 292                                                                                                              \
 293         __m128 _sincosf_signbit           = gmx_mm_castsi128_ps( _mm_set1_epi32(0x80000000) );                   \
 294     __m128 _sincosf_tiny              = gmx_mm_castsi128_ps( _mm_set1_epi32(0x3e400000) );                   \
 295                                                                                                              \
 296         __m128 _sincosf_xl;                                                                                      \
 297     __m128 _sincosf_xl2;                                                                                     \
 298     __m128 _sincosf_xl3;                                                                                     \
 299     __m128 _sincosf_qf;                                                                                      \
 300     __m128 _sincosf_absxl;                                                                                   \
 301     __m128 _sincosf_p1;                                                                                      \
 302     __m128 _sincosf_cx;                                                                                      \
 303     __m128 _sincosf_sx;                                                                                      \
 304     __m128 _sincosf_ts;                                                                                      \
 305     __m128 _sincosf_tc;                                                                                      \
 306     __m128 _sincosf_tsn;                                                                                     \
 307     __m128 _sincosf_tcn;                                                                                     \
 308         __m128i _sincosf_q;                                                                                      \
 309     __m128i _sincosf_offsetSin;                                                                              \
 310     __m128i _sincosf_offsetCos;                                                                              \
 311     __m128 _sincosf_sinMask;                                                                                 \
 312     __m128 _sincosf_cosMask;                                                                                 \
 313     __m128 _sincosf_isTiny;                                                                                  \
 314     __m128 _sincosf_ct0;                                                                                     \
 315     __m128 _sincosf_ct1;                                                                                     \
 316     __m128 _sincosf_ct2;                                                                                     \
 317     __m128 _sincosf_st1;                                                                                     \
 318     __m128 _sincosf_st2;                                                                                     \
 319                                                                                                              \
 320     _sincosf_xl        = _mm_mul_ps(x,_sincosf_two_over_pi);                                                 \
 321                                                                                                              \
 322     _sincosf_xl        = _mm_add_ps(_sincosf_xl,_mm_or_ps(_mm_and_ps(_sincosf_xl,_sincosf_signbit),_sincosf_half)); \
 323                                                                                                                  \
 324     _sincosf_q         = _mm_cvttps_epi32(_sincosf_xl);                                                      \
 325     _sincosf_qf        = _mm_cvtepi32_ps(_sincosf_q);                                                        \
 326                                                                                                                  \
 327     _sincosf_offsetSin   = _mm_and_si128(_sincosf_q,_sincosf_ithree);                                        \
 328     _sincosf_offsetCos   = _mm_add_epi32(_sincosf_offsetSin,_sincosf_ione);                                  \
 329                                                                                                              \
 330     _sincosf_p1 = _mm_mul_ps(_sincosf_qf,_sincosf_kc1);                                                      \
 331     _sincosf_xl = _mm_mul_ps(_sincosf_qf,_sincosf_kc2);                                                      \
 332     _sincosf_p1 = _mm_sub_ps(x,_sincosf_p1);                                                                 \
 333     _sincosf_xl = _mm_sub_ps(_sincosf_p1,_sincosf_xl);                                                       \
 334                                                                                                              \
 335     _sincosf_absxl  = _mm_andnot_ps(_sincosf_signbit,_sincosf_xl);                                           \
 336     _sincosf_isTiny = _mm_cmpgt_ps(_sincosf_tiny,_sincosf_absxl);                                            \
 337                                                                                                              \
 338     _sincosf_xl2    = _mm_mul_ps(_sincosf_xl,_sincosf_xl);                                                   \
 339     _sincosf_xl3    = _mm_mul_ps(_sincosf_xl2,_sincosf_xl);                                                  \
 340                                                                                                                  \
 341         _sincosf_ct1    = _mm_mul_ps(_sincosf_cc0,_sincosf_xl2);                                                 \
 342         _sincosf_ct1    = _mm_add_ps(_sincosf_ct1,_sincosf_cc1);                                                 \
 343         _sincosf_st1    = _mm_mul_ps(_sincosf_sc0,_sincosf_xl2);                                                 \
 344         _sincosf_st1    = _mm_add_ps(_sincosf_st1,_sincosf_sc1);                                                 \
 345         _sincosf_ct2    = _mm_mul_ps(_sincosf_ct1,_sincosf_xl2);                                                 \
 346         _sincosf_ct2    = _mm_add_ps(_sincosf_ct2,_sincosf_cc2);                                                 \
 347         _sincosf_st2    = _mm_mul_ps(_sincosf_st1,_sincosf_xl2);                                                 \
 348         _sincosf_st2    = _mm_add_ps(_sincosf_st2,_sincosf_sc2);                                                 \
 349                                                                                                                  \
 350         _sincosf_cx     = _mm_mul_ps(_sincosf_ct2,_sincosf_xl2);                                                 \
 351     _sincosf_cx     = _mm_add_ps(_sincosf_cx,_sincosf_one);                                                  \
 352                                                                                                              \
 353     _sincosf_sx     = _mm_mul_ps(_sincosf_st2,_sincosf_xl3);                                                 \
 354     _sincosf_sx     = _mm_add_ps(_sincosf_sx,_sincosf_xl);                                                   \
 355                                                                                                              \
 356     _sincosf_sinMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_ione), _sincosf_izero) ); \
 357     _sincosf_cosMask = gmx_mm_castsi128_ps( _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_ione), _sincosf_izero) ); \
 358                                                                                                              \
 359     _sincosf_ts     = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_cx) ); \
 360     _sincosf_tc     = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_sx) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_cx) ); \
 361                                                                                                                  \
 362     _sincosf_sinMask = gmx_mm_castsi128_ps(  _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetSin,_sincosf_itwo), _sincosf_izero) );\
 363     _sincosf_tsn    = _mm_xor_ps(_sincosf_signbit,_sincosf_ts);                                              \
 364     _sincosf_ts     = _mm_or_ps( _mm_and_ps(_sincosf_sinMask,_sincosf_ts) , _mm_andnot_ps(_sincosf_sinMask,_sincosf_tsn) ); \
 365                                                                                                                  \
 366     _sincosf_cosMask = gmx_mm_castsi128_ps(  _mm_cmpeq_epi32( _mm_and_si128(_sincosf_offsetCos,_sincosf_itwo), _sincosf_izero) ); \
 367     _sincosf_tcn    = _mm_xor_ps(_sincosf_signbit,_sincosf_tc);                                              \
 368     _sincosf_tc     = _mm_or_ps( _mm_and_ps(_sincosf_cosMask,_sincosf_tc) , _mm_andnot_ps(_sincosf_cosMask,_sincosf_tcn) ); \
 369                                                                                                                  \
 370     sinval = _sincosf_ts;                                                                                    \
 371     cosval = _sincosf_tc;                                                                                    \
 372 }
 373
 374
 375
 376 /* Load a single value from 1-4 places, merge into xmm register */
 377
 378 #define GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1) \
 379 {                                                         \
 380     __m128 _txmm2,_txmm3,_txmm4;                          \
 381     xmm1           = _mm_load_ss(ptr1);                   \
 382     _txmm2         = _mm_load_ss(ptr2);                   \
 383     _txmm3         = _mm_load_ss(ptr3);                   \
 384     _txmm4         = _mm_load_ss(ptr4);                   \
 385     xmm1           = _mm_unpacklo_ps(xmm1,_txmm3);        \
 386     _txmm2         = _mm_unpacklo_ps(_txmm2,_txmm4);      \
 387     xmm1           = _mm_unpacklo_ps(xmm1,_txmm2);        \
 388 }
 389
 390
 391 #define GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,xmm1) \
 392 {                                                    \
 393     __m128 _txmm2,_txmm3;                            \
 394     xmm1           = _mm_load_ss(ptr1);              \
 395     _txmm2         = _mm_load_ss(ptr2);              \
 396     _txmm3         = _mm_load_ss(ptr3);              \
 397     xmm1           = _mm_unpacklo_ps(xmm1,_txmm3);   \
 398     xmm1           = _mm_unpacklo_ps(xmm1,_txmm2);   \
 399 }
 400
 401
 402 #define GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,xmm1)    \
 403 {                                                  \
 404     __m128 _txmm2;                                 \
 405     xmm1           = _mm_load_ss(ptr1);            \
 406     _txmm2         = _mm_load_ss(ptr2);            \
 407     xmm1           = _mm_unpacklo_ps(xmm1,_txmm2); \
 408 }
 409
 410
 411 #define GMX_MM_LOAD_1VALUE_PS(ptr1,xmm1) \
 412 {                                         \
 413       xmm1           = _mm_load_ss(ptr1); \
 414 }
 415
 416 /* Store data in an xmm register into 1-4 different places */
 417 #define GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1)             \
 418 {                                                                      \
 419     __m128 _txmm2,_txmm3,_txmm4;                                       \
 420     _txmm3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);               \
 421     _txmm2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1));     \
 422     _txmm4       = _mm_shuffle_ps(_txmm3,_txmm3,_MM_SHUFFLE(1,1,1,1)); \
 423     _mm_store_ss(ptr1,xmm1);                                           \
 424     _mm_store_ss(ptr2,_txmm2);                                         \
 425     _mm_store_ss(ptr3,_txmm3);                                         \
 426     _mm_store_ss(ptr4,_txmm4);                                         \
 427 }
 428
 429
 430 #define GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,xmm1)              \
 431 {                                                                  \
 432     __m128 _txmm2,_txmm3;                                          \
 433     _txmm3       = _mm_movehl_ps(_mm_setzero_ps(),xmm1);           \
 434     _txmm2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
 435     _mm_store_ss(ptr1,xmm1);                                       \
 436     _mm_store_ss(ptr2,_txmm2);                                     \
 437     _mm_store_ss(ptr3,_txmm3);                                     \
 438 }
 439
 440
 441 #define GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,xmm1)                   \
 442 {                                                                  \
 443     __m128 _txmm2;                                                 \
 444     _txmm2       = _mm_shuffle_ps(xmm1,xmm1,_MM_SHUFFLE(1,1,1,1)); \
 445     _mm_store_ss(ptr1,xmm1);                                       \
 446     _mm_store_ss(ptr2,_txmm2);                                     \
 447 }
 448
 449
 450 #define GMX_MM_STORE_1VALUE_PS(ptr1,xmm1) \
 451 {                                          \
 452     _mm_store_ss(ptr1,xmm1);               \
 453 }
 454
 455
 456 /* Similar to store, but increments value in memory */
 457 #define GMX_MM_INCREMENT_8VALUES_PS(ptr1,ptr2,ptr3,ptr4,ptr5,ptr6,ptr7,ptr8,xmm1,xmm2)    \
 458 {                                                                  \
 459     __m128 _tincr1,_tincr2;                                        \
 460     GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1);          \
 461     GMX_MM_LOAD_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2);          \
 462     _tincr1 = _mm_add_ps(_tincr1,xmm1);                            \
 463     _tincr2 = _mm_add_ps(_tincr2,xmm2);                            \
 464     GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr1);         \
 465     GMX_MM_STORE_4VALUES_PS(ptr5,ptr6,ptr7,ptr8,_tincr2);         \
 466 }
 467
 468 #define GMX_MM_INCREMENT_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,xmm1)    \
 469 {                                                                 \
 470     __m128 _tincr;                                                \
 471     GMX_MM_LOAD_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr);          \
 472     _tincr = _mm_add_ps(_tincr,xmm1);                             \
 473     GMX_MM_STORE_4VALUES_PS(ptr1,ptr2,ptr3,ptr4,_tincr);         \
 474 }
 475
 476 #define GMX_MM_INCREMENT_3VALUES_PS(ptr1,ptr2,ptr3,xmm1)         \
 477 {                                                                 \
 478     __m128 _tincr;                                                \
 479     GMX_MM_LOAD_3VALUES_PS(ptr1,ptr2,ptr3,_tincr);               \
 480     _tincr = _mm_add_ps(_tincr,xmm1);                             \
 481     GMX_MM_STORE_3VALUES_PS(ptr1,ptr2,ptr3,_tincr);              \
 482 }
 483
 484 #define GMX_MM_INCREMENT_2VALUES_PS(ptr1,ptr2,xmm1)         \
 485 {                                                            \
 486     __m128 _tincr;                                           \
 487     GMX_MM_LOAD_2VALUES_PS(ptr1,ptr2,_tincr);               \
 488     _tincr = _mm_add_ps(_tincr,xmm1);                        \
 489     GMX_MM_STORE_2VALUES_PS(ptr1,ptr2,_tincr);              \
 490 }
 491
 492 #define GMX_MM_INCREMENT_1VALUE_PS(ptr1,xmm1)         \
 493 {                                                      \
 494     __m128 _tincr;                                     \
 495     GMX_MM_LOAD_1VALUE_PS(ptr1,_tincr);               \
 496     _tincr = _mm_add_ss(_tincr,xmm1);                  \
 497     GMX_MM_STORE_1VALUE_PS(ptr1,_tincr);              \
 498 }
 499
 500
 501
 502 /* Routines to load pairs from 1-4 places, put in two separate xmm registers. Useful to load LJ parameters! */
 503 #define GMX_MM_LOAD_4PAIRS_PS(ptr1,ptr2,ptr3,ptr4,c6,c12)    \
 504 {                                                             \
 505     __m128 _tmp1,_tmp2,_tmp3,_tmp4;                           \
 506     _tmp1  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1));  \
 507     _tmp2  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2));  \
 508     _tmp3  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3));  \
 509     _tmp4  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4));  \
 510     _tmp1  = _mm_unpacklo_ps(_tmp1,_tmp3);                    \
 511     _tmp2  = _mm_unpacklo_ps(_tmp2,_tmp4);                    \
 512     c6     = _mm_unpacklo_ps(_tmp1,_tmp2);                    \
 513     c12    = _mm_unpackhi_ps(_tmp1,_tmp2);                    \
 514 }
 515
 516 #define GMX_MM_LOAD_3PAIRS_PS(ptr1,ptr2,ptr3,c6,c12)        \
 517 {                                                            \
 518     __m128 _tmp1,_tmp2,_tmp3;                                \
 519     _tmp1  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
 520     _tmp2  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
 521     _tmp3  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3)); \
 522     _tmp1  = _mm_unpacklo_ps(_tmp1,_tmp3);                   \
 523     _tmp2  = _mm_unpacklo_ps(_tmp2,_mm_setzero_ps());        \
 524     c6     = _mm_unpacklo_ps(_tmp1,_tmp2);                   \
 525     c12    = _mm_unpackhi_ps(_tmp1,_tmp2);                   \
 526 }
 527
 528
 529 #define GMX_MM_LOAD_2PAIRS_PS(ptr1,ptr2,c6,c12)             \
 530 {                                                            \
 531     __m128 _tmp1,_tmp2;                                      \
 532     _tmp1  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1)); \
 533     _tmp2  = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2)); \
 534     c6     = _mm_unpacklo_ps(_tmp1,_tmp2);                   \
 535     c12    = _mm_movehl_ps(c12,c6);                          \
 536 }
 537
 538 #define GMX_MM_LOAD_1PAIR_PS(ptr1,c6,c12)                   \
 539 {                                                            \
 540     c6     = _mm_load_ss(ptr1);                              \
 541     c12    = _mm_load_ss(ptr1+1);                            \
 542 }
 543
 544
 545 /* Routines to load 1-4 rvecs from 1-4 places.
 546  * We mainly use these to load coordinates. The extra routines
 547  * are very efficient for the water-water loops, since we e.g.
 548  * know that a TIP4p water has 4 atoms, so we should load 12 floats+shuffle.
 549  */
 550 #define GMX_MM_LOAD_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) {             \
 551          jx1            = _mm_load_ss(ptr1);                                \
 552      jy1            = _mm_load_ss((ptr1)+1);                            \
 553      jz1            = _mm_load_ss((ptr1)+2);                            \
 554 }
 555
 556 #define GMX_MM_LOAD_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) {      \
 557          jx1            = _mm_load_ss(ptr1);                                      \
 558      jy1            = _mm_load_ss((ptr1)+1);                                  \
 559      jz1            = _mm_load_ss((ptr1)+2);                                  \
 560          jx2            = _mm_load_ss((ptr1)+3);                                  \
 561      jy2            = _mm_load_ss((ptr1)+4);                                  \
 562      jz2            = _mm_load_ss((ptr1)+5);                                  \
 563 }
 564
 565
 566 #define GMX_MM_LOAD_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 567          jx1            = _mm_load_ss(ptr1);                                    \
 568      jy1            = _mm_load_ss((ptr1)+1);                                \
 569      jz1            = _mm_load_ss((ptr1)+2);                                \
 570          jx2            = _mm_load_ss((ptr1)+3);                                \
 571      jy2            = _mm_load_ss((ptr1)+4);                                \
 572      jz2            = _mm_load_ss((ptr1)+5);                                \
 573          jx3            = _mm_load_ss((ptr1)+6);                                \
 574      jy3            = _mm_load_ss((ptr1)+7);                                \
 575      jz3            = _mm_load_ss((ptr1)+8);                                \
 576 }
 577
 578
 579 #define GMX_MM_LOAD_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 580          jx1            = _mm_load_ss(ptr1);                                    \
 581      jy1            = _mm_load_ss((ptr1)+1);                                \
 582      jz1            = _mm_load_ss((ptr1)+2);                                \
 583          jx2            = _mm_load_ss((ptr1)+3);                                \
 584      jy2            = _mm_load_ss((ptr1)+4);                                \
 585      jz2            = _mm_load_ss((ptr1)+5);                                \
 586          jx3            = _mm_load_ss((ptr1)+6);                                \
 587      jy3            = _mm_load_ss((ptr1)+7);                                \
 588      jz3            = _mm_load_ss((ptr1)+8);                                \
 589          jx4            = _mm_load_ss((ptr1)+9);                                \
 590      jy4            = _mm_load_ss((ptr1)+10);                               \
 591      jz4            = _mm_load_ss((ptr1)+11);                               \
 592 }
 593
 594
 595 #define GMX_MM_LOAD_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) {  \
 596       __m128 _tmp1,_tmp2;                                           \
 597       _tmp1           = _mm_load_ss(ptr1);                          \
 598           _tmp2           = _mm_load_ss(ptr2);                          \
 599       _tmp1           = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));      \
 600       _tmp2           = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));      \
 601       jx1             = _mm_unpacklo_ps(_tmp1,_tmp2);               \
 602       jy1             = _mm_unpackhi_ps(_tmp1,_tmp2);               \
 603           jx1             = _mm_unpacklo_ps(_tmp1,_tmp2);               \
 604       jz1             = _mm_movehl_ps(jz1,jy1);                     \
 605 }
 606
 607 #define GMX_MM_LOAD_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
 608      __m128 _tmp1, _tmp2;                                                      \
 609          _tmp1          = _mm_loadu_ps(ptr1);                                      \
 610      jy1            = _mm_loadu_ps(ptr2);                                      \
 611      jy2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));        \
 612      _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4));        \
 613      jx1            = _mm_unpacklo_ps(_tmp1,jy1);                              \
 614      jz1            = _mm_unpackhi_ps(_tmp1,jy1);                              \
 615      jy2            = _mm_unpacklo_ps(jy2,_tmp2);                              \
 616      jy1            = _mm_movehl_ps(jx1,jx1);                                  \
 617      jx2            = _mm_movehl_ps(jz1,jz1);                                  \
 618      jz2            = _mm_movehl_ps(jy2,jy2);                                  \
 619 }
 620
 621
 622 #define GMX_MM_LOAD_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 623      __m128 _tmp1, _tmp2, _tmp3;                                                           \
 624          _tmp1          = _mm_loadu_ps(ptr1);                                                  \
 625      jy1            = _mm_loadu_ps(ptr2);                                                  \
 626      _tmp2          = _mm_loadu_ps(ptr1+4);                                                \
 627      jz2            = _mm_loadu_ps(ptr2+4);                                                \
 628      jz3            = _mm_load_ss(ptr1+8);                                                 \
 629      _tmp3          = _mm_load_ss(ptr2+8);                                                 \
 630      jx1            = _mm_unpacklo_ps(_tmp1,jy1);                                          \
 631      jz1            = _mm_unpackhi_ps(_tmp1,jy1);                                          \
 632      jy2            = _mm_unpacklo_ps(_tmp2,jz2);                                          \
 633      jx3            = _mm_unpackhi_ps(_tmp2,jz2);                                          \
 634      jy1            = _mm_movehl_ps(jx1,jx1);                                              \
 635      jx2            = _mm_movehl_ps(jz1,jz1);                                              \
 636      jz2            = _mm_movehl_ps(jy2,jy2);                                              \
 637      jy3            = _mm_movehl_ps(jx3,jx3);                                              \
 638      jz3            = _mm_unpacklo_ps(jz3,_tmp3);                                          \
 639 }
 640
 641
 642 #define GMX_MM_LOAD_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 643      __m128 _tmp1, _tmp2, _tmp3,_tmp4;                                                                 \
 644          _tmp1          = _mm_loadu_ps(ptr1);                                                              \
 645      jy1            = _mm_loadu_ps(ptr2);                                                              \
 646      _tmp2          = _mm_loadu_ps(ptr1+4);                                                            \
 647      jz2            = _mm_loadu_ps(ptr2+4);                                                            \
 648      _tmp3          = _mm_loadu_ps(ptr1+8);                                                            \
 649      _tmp4          = _mm_loadu_ps(ptr2+8);                                                            \
 650      jx1            = _mm_unpacklo_ps(_tmp1,jy1);                                                      \
 651      jz1            = _mm_unpackhi_ps(_tmp1,jy1);                                                      \
 652      jy2            = _mm_unpacklo_ps(_tmp2,jz2);                                                      \
 653      jx3            = _mm_unpackhi_ps(_tmp2,jz2);                                                      \
 654      jz3            = _mm_unpacklo_ps(_tmp3,_tmp4);                                                    \
 655      jy4            = _mm_unpackhi_ps(_tmp3,_tmp4);                                                    \
 656      jy1            = _mm_movehl_ps(jx1,jx1);                                                          \
 657      jx2            = _mm_movehl_ps(jz1,jz1);                                                          \
 658      jz2            = _mm_movehl_ps(jy2,jy2);                                                          \
 659      jy3            = _mm_movehl_ps(jx3,jx3);                                                          \
 660      jx4            = _mm_movehl_ps(jz3,jz3);                                                          \
 661      jz4            = _mm_movehl_ps(jy4,jy4);                                                          \
 662 }
 663
 664
 665 #define GMX_MM_LOAD_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
 666      __m128 _tmp1,_tmp3,_tmp4;                                         \
 667          jx1            = _mm_load_ss(ptr1);                               \
 668      jy1            = _mm_load_ss(ptr2);                               \
 669      jz1            = _mm_load_ss(ptr3);                               \
 670          jx1            = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1));             \
 671      jy1            = _mm_loadh_pi(jy1,(__m64 *)(ptr2+1));             \
 672      jz1            = _mm_loadh_pi(jz1,(__m64 *)(ptr3+1));             \
 673      _tmp1          = _mm_unpacklo_ps(jx1,jy1);                        \
 674      _tmp3          = _mm_unpackhi_ps(jx1,jy1);                        \
 675      _tmp4          = _mm_unpackhi_ps(jz1,jz1);                        \
 676      jx1            = _mm_movelh_ps(_tmp1,jz1);                        \
 677      jy1            = _mm_movelh_ps(_tmp3,_tmp4);                      \
 678      jz1            = _mm_movehl_ps(_tmp4,_tmp3);                      \
 679 }
 680
 681
 682 #define GMX_MM_LOAD_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
 683      __m128 _tmp1, _tmp2;                                                           \
 684          jx1            = _mm_loadu_ps(ptr1);                                           \
 685      jy1            = _mm_loadu_ps(ptr2);                                           \
 686      jz1            = _mm_loadu_ps(ptr3);                                           \
 687      jx2            = _mm_setzero_ps();                                             \
 688      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                            \
 689      _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));             \
 690      jz2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4));             \
 691      _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));             \
 692      _tmp1          = _mm_unpacklo_ps(_tmp1,_tmp2);                                 \
 693      jz2            = _mm_unpacklo_ps(jz2,_mm_setzero_ps());                        \
 694      jy2            = _mm_unpacklo_ps(_tmp1,jz2);                                   \
 695      jz2            = _mm_unpackhi_ps(_tmp1,jz2);                                   \
 696 }
 697
 698
 699 #define GMX_MM_LOAD_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 700      __m128 _tmp1, _tmp2;                                                                       \
 701          jx1            = _mm_loadu_ps(ptr1);                                                       \
 702      jy1            = _mm_loadu_ps(ptr2);                                                       \
 703      jz1            = _mm_loadu_ps(ptr3);                                                       \
 704      jx2            = _mm_setzero_ps();                                                         \
 705      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                        \
 706      jy2            = _mm_loadu_ps(ptr1+4);                                                     \
 707      jz2            = _mm_loadu_ps(ptr2+4);                                                     \
 708      jx3            = _mm_loadu_ps(ptr3+4);                                                     \
 709      jy3            = _mm_setzero_ps();                                                         \
 710      _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                        \
 711      jz3            = _mm_load_ss(ptr1+8);                                                      \
 712      _tmp1          = _mm_load_ss(ptr2+8);                                                      \
 713      _tmp2          = _mm_load_ss(ptr3+8);                                                      \
 714      jz3            = _mm_unpacklo_ps(jz3,_tmp2);                                               \
 715      _tmp1          = _mm_unpacklo_ps(_tmp1,_mm_setzero_ps());                                  \
 716      jz3            = _mm_unpacklo_ps(jz3,_tmp1);                                               \
 717 }
 718
 719
 720 #define GMX_MM_LOAD_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 721          jx1            = _mm_loadu_ps(ptr1);                                                                   \
 722      jy1            = _mm_loadu_ps(ptr2);                                                                   \
 723      jz1            = _mm_loadu_ps(ptr3);                                                                   \
 724      jx2            = _mm_setzero_ps();                                                                     \
 725      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                                    \
 726      jy2            = _mm_loadu_ps(ptr1+4);                                                                 \
 727      jz2            = _mm_loadu_ps(ptr2+4);                                                                 \
 728      jx3            = _mm_loadu_ps(ptr3+4);                                                                 \
 729      jy3            = _mm_setzero_ps();                                                                     \
 730      _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                                    \
 731      jz3            = _mm_loadu_ps(ptr1+8);                                                                 \
 732      jx4            = _mm_loadu_ps(ptr2+8);                                                                 \
 733      jy4            = _mm_loadu_ps(ptr3+8);                                                                 \
 734      jz4            = _mm_setzero_ps();                                                                     \
 735      _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4);                                                                    \
 736 }
 737
 738
 739
 740 #define GMX_MM_LOAD_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) {  \
 741      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                                   \
 742          jx1            = _mm_load_ss(ptr1);                                     \
 743      _tmp1          = _mm_load_ss(ptr2);                                     \
 744      jy1            = _mm_load_ss(ptr3);                                     \
 745      jz1            = _mm_load_ss(ptr4);                                     \
 746          jx1            = _mm_loadh_pi(jx1,(__m64 *)(ptr1+1));                   \
 747      _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2+1));                 \
 748      jy1            = _mm_loadh_pi(jy1,(__m64 *)(ptr3+1));                   \
 749      jz1            = _mm_loadh_pi(jz1,(__m64 *)(ptr4+1));                   \
 750      _tmp2          = _mm_unpacklo_ps(jx1,_tmp1);                            \
 751      _tmp3          = _mm_unpacklo_ps(jy1,jz1);                              \
 752      _tmp4          = _mm_unpackhi_ps(jx1,_tmp1);                            \
 753      _tmp5          = _mm_unpackhi_ps(jy1,jz1);                              \
 754      jx1            = _mm_movelh_ps(_tmp2,_tmp3);                            \
 755      jy1            = _mm_movelh_ps(_tmp4,_tmp5);                            \
 756      jz1            = _mm_movehl_ps(_tmp5,_tmp4);                            \
 757 }
 758
 759
 760 #define GMX_MM_LOAD_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
 761      __m128 _tmp1, _tmp2;                                                                \
 762          jx1            = _mm_loadu_ps(ptr1);                                                \
 763      jy1            = _mm_loadu_ps(ptr2);                                                \
 764      jz1            = _mm_loadu_ps(ptr3);                                                \
 765      jx2            = _mm_loadu_ps(ptr4);                                                \
 766      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                 \
 767      jy2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));                  \
 768      jz2            = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr2+4));                  \
 769      _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));                  \
 770      _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr4+4));                  \
 771      _tmp1          = _mm_unpacklo_ps(jy2,_tmp1);                                        \
 772      _tmp2          = _mm_unpacklo_ps(jz2,_tmp2);                                        \
 773      jy2            = _mm_unpacklo_ps(_tmp1,_tmp2);                                      \
 774      jz2            = _mm_unpackhi_ps(_tmp1,_tmp2);                                      \
 775 }
 776
 777
 778 #define GMX_MM_LOAD_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 779      __m128 _tmp1, _tmp2, _tmp3;                                                                     \
 780          jx1            = _mm_loadu_ps(ptr1);                                                            \
 781      jy1            = _mm_loadu_ps(ptr2);                                                            \
 782      jz1            = _mm_loadu_ps(ptr3);                                                            \
 783      jx2            = _mm_loadu_ps(ptr4);                                                            \
 784      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                             \
 785      jy2            = _mm_loadu_ps(ptr1+4);                                                          \
 786      jz2            = _mm_loadu_ps(ptr2+4);                                                          \
 787      jx3            = _mm_loadu_ps(ptr3+4);                                                          \
 788      jy3            = _mm_loadu_ps(ptr4+4);                                                          \
 789      _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                             \
 790      jz3            = _mm_load_ss(ptr1+8);                                                           \
 791      _tmp1          = _mm_load_ss(ptr2+8);                                                           \
 792      _tmp2          = _mm_load_ss(ptr3+8);                                                           \
 793      _tmp3          = _mm_load_ss(ptr4+8);                                                           \
 794      jz3            = _mm_unpacklo_ps(jz3,_tmp2);                                                    \
 795      _tmp1          = _mm_unpacklo_ps(_tmp1,_tmp3);                                                  \
 796      jz3            = _mm_unpacklo_ps(jz3,_tmp1);                                                    \
 797 }
 798
 799
 800 #define GMX_MM_LOAD_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 801          jx1            = _mm_loadu_ps(ptr1);                                                                        \
 802      jy1            = _mm_loadu_ps(ptr2);                                                                        \
 803      jz1            = _mm_loadu_ps(ptr3);                                                                        \
 804      jx2            = _mm_loadu_ps(ptr4);                                                                        \
 805      _MM_TRANSPOSE4_PS(jx1,jy1,jz1,jx2);                                                                         \
 806      jy2            = _mm_loadu_ps(ptr1+4);                                                                      \
 807      jz2            = _mm_loadu_ps(ptr2+4);                                                                      \
 808      jx3            = _mm_loadu_ps(ptr3+4);                                                                      \
 809      jy3            = _mm_loadu_ps(ptr4+4);                                                                      \
 810      _MM_TRANSPOSE4_PS(jy2,jz2,jx3,jy3);                                                                         \
 811      jz3            = _mm_loadu_ps(ptr1+8);                                                                      \
 812      jx4            = _mm_loadu_ps(ptr2+8);                                                                      \
 813      jy4            = _mm_loadu_ps(ptr3+8);                                                                      \
 814      jz4            = _mm_loadu_ps(ptr4+8);                                                                      \
 815      _MM_TRANSPOSE4_PS(jz3,jx4,jy4,jz4);                                                                         \
 816 }
 817
 818
/* Routines to increment rvecs in memory, typically used for j-particle force updates. */
 820 #define GMX_MM_INCREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) {      \
 821      __m128 _tmp1;                                                    \
 822      jy1            = _mm_unpacklo_ps(jy1,jz1);                       \
 823      jx1            = _mm_movelh_ps(jx1,jy1);                         \
 824      _tmp1          = _mm_load_ss(ptr1);                              \
 825      _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));          \
 826      _tmp1          = _mm_add_ps(_tmp1,jx1);                          \
 827      _mm_store_ss(ptr1,_tmp1);                                        \
 828      _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                          \
 829 }
 830
 831
 832 #define GMX_MM_INCREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
 833      __m128 _tmp1, _tmp2;                                                     \
 834      _tmp1          = _mm_loadu_ps(ptr1);                                     \
 835      _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));       \
 836      jx1            = _mm_unpacklo_ps(jx1,jy1);                               \
 837      jz1            = _mm_unpacklo_ps(jz1,jx2);                               \
 838      jy2            = _mm_unpacklo_ps(jy2,jz2);                               \
 839      jx1            = _mm_movelh_ps(jx1,jz1);                                 \
 840      _tmp1          = _mm_add_ps(_tmp1,jx1);                                  \
 841      _tmp2          = _mm_add_ps(_tmp2,jy2);                                  \
 842      _mm_storeu_ps(ptr1,_tmp1);                                               \
 843      _mm_storel_pi((__m64 *)(ptr1+4),_tmp2);                                  \
 844 }
 845
 846
 847 #define GMX_MM_INCREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 848      __m128 _tmp1, _tmp2, _tmp3;                                                          \
 849      _tmp1          = _mm_loadu_ps(ptr1);                                                 \
 850      _tmp2          = _mm_loadu_ps(ptr1+4);                                               \
 851      _tmp3          = _mm_load_ss(ptr1+8);                                                \
 852      jx1            = _mm_unpacklo_ps(jx1,jy1);                                           \
 853      jz1            = _mm_unpacklo_ps(jz1,jx2);                                           \
 854      jy2            = _mm_unpacklo_ps(jy2,jz2);                                           \
 855      jx3            = _mm_unpacklo_ps(jx3,jy3);                                           \
 856      jx1            = _mm_movelh_ps(jx1,jz1);                                             \
 857      jy2            = _mm_movelh_ps(jy2,jx3);                                             \
 858      _tmp1           = _mm_add_ps(_tmp1,jx1);                                             \
 859      _tmp2           = _mm_add_ps(_tmp2,jy2);                                             \
 860      _tmp3           = _mm_add_ss(_tmp3,jz3);                                             \
 861      _mm_storeu_ps(ptr1,_tmp1);                                                           \
 862      _mm_storeu_ps(ptr1+4,_tmp2);                                                         \
 863      _mm_store_ss(ptr1+8,_tmp3);                                                          \
 864 }
 865
 866
 867 #define GMX_MM_INCREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 868      __m128 _tmp1, _tmp2, _tmp3;                                                                      \
 869      _tmp1          = _mm_loadu_ps(ptr1);                                                             \
 870      _tmp2          = _mm_loadu_ps(ptr1+4);                                                           \
 871      _tmp3          = _mm_loadu_ps(ptr1+8);                                                           \
 872      jx1            = _mm_unpacklo_ps(jx1,jy1);                                                       \
 873      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                       \
 874      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                       \
 875      jx3            = _mm_unpacklo_ps(jx3,jy3);                                                       \
 876      jz3            = _mm_unpacklo_ps(jz3,jx4);                                                       \
 877      jy4            = _mm_unpacklo_ps(jy4,jz4);                                                       \
 878      jx1            = _mm_movelh_ps(jx1,jz1);                                                         \
 879      jy2            = _mm_movelh_ps(jy2,jx3);                                                         \
 880      jz3            = _mm_movelh_ps(jz3,jy4);                                                         \
 881      _tmp1          = _mm_add_ps(_tmp1,jx1);                                                          \
 882      _tmp2          = _mm_add_ps(_tmp2,jy2);                                                          \
 883      _tmp3          = _mm_add_ps(_tmp3,jz3);                                                          \
 884      _mm_storeu_ps(ptr1,_tmp1);                                                                       \
 885      _mm_storeu_ps(ptr1+4,_tmp2);                                                                     \
 886      _mm_storeu_ps(ptr1+8,_tmp3);                                                                     \
 887 }
 888
 889
 890 #define GMX_MM_INCREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) {        \
 891      __m128 _tmp1,_tmp2,_tmp3,_tmp4;                                          \
 892      _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1));         \
 893      _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2));                    \
 894      _tmp2          = _mm_load_ss(ptr1+2);                                    \
 895      _tmp3          = _mm_load_ss(ptr2+2);                                    \
 896      jx1            = _mm_unpacklo_ps(jx1,jy1);                               \
 897      _tmp4          = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1));           \
 898      _tmp1          = _mm_add_ps(_tmp1,jx1);                                  \
 899      _mm_storel_pi((__m64 *)(ptr1),_tmp1);                                    \
 900      _mm_storeh_pi((__m64 *)(ptr2),_tmp1);                                    \
 901      _mm_store_ss(ptr1+2,_mm_add_ss(_tmp2,jz1));                              \
 902          _mm_store_ss(ptr2+2,_mm_add_ss(_tmp3,_tmp4));                            \
 903 }
 904
 905
 906 #define GMX_MM_INCREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) {  \
 907      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                                           \
 908      _tmp1          = _mm_loadu_ps(ptr1);                                            \
 909      _tmp2          = _mm_loadu_ps(ptr2);                                            \
 910      _tmp3          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));              \
 911      _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr2+4));                         \
 912      jx1            = _mm_unpacklo_ps(jx1,jy1);                                      \
 913      jz1            = _mm_unpacklo_ps(jz1,jx2);                                      \
 914      jy2            = _mm_unpacklo_ps(jy2,jz2);                                      \
 915      _tmp4          = _mm_movelh_ps(jx1,jz1);                                        \
 916      _tmp5          = _mm_movehl_ps(jz1,jx1);                                        \
 917      _tmp1          = _mm_add_ps(_tmp1,_tmp4);                                       \
 918      _tmp2          = _mm_add_ps(_tmp2,_tmp5);                                       \
 919      _tmp3          = _mm_add_ps(_tmp3,jy2);                                         \
 920      _mm_storeu_ps(ptr1,_tmp1);                                                      \
 921      _mm_storeu_ps(ptr2,_tmp2);                                                      \
 922      _mm_storel_pi((__m64 *)(ptr1+4),_tmp3);                                         \
 923          _mm_storeh_pi((__m64 *)(ptr2+4),_tmp3);                                         \
 924 }
 925
 926
 927 #define GMX_MM_INCREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
 928      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;                \
 929      _tmp1          = _mm_loadu_ps(ptr1);                                                       \
 930      _tmp2          = _mm_loadu_ps(ptr1+4);                                                     \
 931      _tmp3          = _mm_load_ss(ptr1+8);                                                      \
 932      _tmp4          = _mm_loadu_ps(ptr2);                                                       \
 933      _tmp5          = _mm_loadu_ps(ptr2+4);                                                     \
 934      _tmp6          = _mm_load_ss(ptr2+8);                                                      \
 935      jx1            = _mm_unpacklo_ps(jx1,jy1);                                                 \
 936      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                 \
 937      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                 \
 938      jx3            = _mm_unpacklo_ps(jx3,jy3);                                                 \
 939      _tmp7          = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));                             \
 940      _tmp8          = _mm_movelh_ps(jx1,jz1);                                                   \
 941      _tmp9          = _mm_movehl_ps(jz1,jx1);                                                   \
 942      _tmp10         = _mm_movelh_ps(jy2,jx3);                                                   \
 943      _tmp11         = _mm_movehl_ps(jx3,jy2);                                                   \
 944      _tmp1          = _mm_add_ps(_tmp1,_tmp8);                                                  \
 945      _tmp2          = _mm_add_ps(_tmp2,_tmp10);                                                 \
 946      _tmp3          = _mm_add_ss(_tmp3,jz3);                                                    \
 947      _tmp4          = _mm_add_ps(_tmp4,_tmp9);                                                  \
 948      _tmp5          = _mm_add_ps(_tmp5,_tmp11);                                                 \
 949      _tmp6          = _mm_add_ss(_tmp6,_tmp7);                                                  \
 950      _mm_storeu_ps(ptr1,_tmp1);                                                                 \
 951      _mm_storeu_ps(ptr1+4,_tmp2);                                                               \
 952      _mm_store_ss(ptr1+8,_tmp3);                                                                \
 953      _mm_storeu_ps(ptr2,_tmp4);                                                                 \
 954      _mm_storeu_ps(ptr2+4,_tmp5);                                                               \
 955      _mm_store_ss(ptr2+8,_tmp6);                                                                \
 956 }
 957
 958
 959 #define GMX_MM_INCREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
 960      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13;              \
 961      _tmp1          = _mm_loadu_ps(ptr1);                                                                   \
 962      _tmp2          = _mm_loadu_ps(ptr1+4);                                                                 \
 963      _tmp3          = _mm_loadu_ps(ptr1+8);                                                                 \
 964      _tmp4          = _mm_loadu_ps(ptr2);                                                                   \
 965      _tmp5          = _mm_loadu_ps(ptr2+4);                                                                 \
 966      _tmp6          = _mm_loadu_ps(ptr2+8);                                                                 \
 967      jx1            = _mm_unpacklo_ps(jx1,jy1);                                                             \
 968      jz1            = _mm_unpacklo_ps(jz1,jx2);                                                             \
 969      jy2            = _mm_unpacklo_ps(jy2,jz2);                                                             \
 970      jx3            = _mm_unpacklo_ps(jx3,jy3);                                                             \
 971      jz3            = _mm_unpacklo_ps(jz3,jx4);                                                             \
 972      jy4            = _mm_unpacklo_ps(jy4,jz4);                                                             \
 973      _tmp8          = _mm_movelh_ps(jx1,jz1);                                                               \
 974      _tmp9          = _mm_movehl_ps(jz1,jx1);                                                               \
 975      _tmp10         = _mm_movelh_ps(jy2,jx3);                                                               \
 976      _tmp11         = _mm_movehl_ps(jx3,jy2);                                                               \
 977      _tmp12         = _mm_movelh_ps(jz3,jy4);                                                               \
 978      _tmp13         = _mm_movehl_ps(jy4,jz3);                                                               \
 979      _tmp1          = _mm_add_ps(_tmp1,_tmp8);                                                              \
 980      _tmp2          = _mm_add_ps(_tmp2,_tmp10);                                                             \
 981      _tmp3          = _mm_add_ps(_tmp3,_tmp12);                                                             \
 982      _tmp4          = _mm_add_ps(_tmp4,_tmp9);                                                              \
 983      _tmp5          = _mm_add_ps(_tmp5,_tmp11);                                                             \
 984      _tmp6          = _mm_add_ps(_tmp6,_tmp13);                                                             \
 985      _mm_storeu_ps(ptr1,_tmp1);                                                                             \
 986      _mm_storeu_ps(ptr1+4,_tmp2);                                                                           \
 987      _mm_storeu_ps(ptr1+8,_tmp3);                                                                           \
 988      _mm_storeu_ps(ptr2,_tmp4);                                                                             \
 989      _mm_storeu_ps(ptr2+4,_tmp5);                                                                           \
 990      _mm_storeu_ps(ptr2+8,_tmp6);                                                                           \
 991 }
 992
 993
 994 #define GMX_MM_INCREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) {   \
 995      __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7;                        \
 996      _tmp1          = _mm_load_ss(ptr1);                                      \
 997      _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));                  \
 998      _tmp2          = _mm_load_ss(ptr2);                                      \
 999      _tmp2          = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));                  \
1000      _tmp3          = _mm_load_ss(ptr3);                                      \
1001      _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1));                  \
1002      _tmp4          = _mm_unpacklo_ps(jy1,jz1);                               \
1003      _tmp5          = _mm_unpackhi_ps(jy1,jz1);                               \
1004      _tmp6          = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1));         \
1005      _tmp7          = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2));           \
1006      jx1            = _mm_movelh_ps(jx1,_tmp4);                               \
1007      _tmp7          = _mm_movelh_ps(_tmp7,_tmp5);                             \
1008      _tmp1          = _mm_add_ps(_tmp1,jx1);                                  \
1009      _tmp2          = _mm_add_ps(_tmp2,_tmp6);                                \
1010      _tmp3          = _mm_add_ps(_tmp3,_tmp7);                                \
1011      _mm_store_ss(ptr1,_tmp1);                                                \
1012      _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                                  \
1013      _mm_store_ss(ptr2,_tmp2);                                                \
1014      _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2);                                  \
1015      _mm_store_ss(ptr3,_tmp3);                                                \
1016      _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3);                                  \
1017 }
1018
1019
/* Add two xyz triplets held in SSE registers to three float arrays in memory.
 *
 * ptr1..ptr3  : float pointers; 6 consecutive floats are read, incremented and
 *               written back at each one. Each pointer expression is evaluated
 *               several times, so it must not have side effects.
 * jx1 ... jz2 : __m128 lvalues. Lane 0 of each is added to ptr1[0..5], lane 1
 *               to ptr2[0..5] and lane 2 to ptr3[0..5], in the order
 *               x1,y1,z1,x2,y2,z2. Lane 3 is ignored.
 *
 * Side effect: jx1, jz1 and jy2 are overwritten with interleaved intermediates,
 * so they must be variables whose contents are no longer needed.
 */
#define GMX_MM_INCREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    /* First four floats (x1,y1,z1,x2) of every destination */ \
    _tmp1          = _mm_loadu_ps(ptr1); \
    _tmp2          = _mm_loadu_ps(ptr2); \
    _tmp3          = _mm_loadu_ps(ptr3); \
    /* Remaining (y2,z2) pairs: ptr1 and ptr2 share _tmp4, ptr3 uses the low   */ \
    /* half of _tmp5. Seed with zeros so no indeterminate register is read.    */ \
    _tmp4          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp4          = _mm_loadh_pi(_tmp4,(__m64 *)((ptr2)+4)); \
    _tmp5          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    /* Interleave component pairs; unpacklo serves ptr1/ptr2, unpackhi ptr3 */ \
    _tmp6          = _mm_unpackhi_ps(jx1,jy1); \
    jx1            = _mm_unpacklo_ps(jx1,jy1); \
    _tmp7          = _mm_unpackhi_ps(jz1,jx2); \
    jz1            = _mm_unpacklo_ps(jz1,jx2); \
    _tmp8          = _mm_unpackhi_ps(jy2,jz2); \
    jy2            = _mm_unpacklo_ps(jy2,jz2); \
    /* Assemble (x1,y1,z1,x2) for ptr1, ptr2 and ptr3 respectively */ \
    _tmp9          = _mm_movelh_ps(jx1,jz1); \
    _tmp10         = _mm_movehl_ps(jz1,jx1); \
    _tmp6          = _mm_movelh_ps(_tmp6,_tmp7); \
    /* Accumulate */ \
    _tmp1          = _mm_add_ps(_tmp1,_tmp9); \
    _tmp2          = _mm_add_ps(_tmp2,_tmp10); \
    _tmp3          = _mm_add_ps(_tmp3,_tmp6); \
    _tmp4          = _mm_add_ps(_tmp4,jy2); \
    _tmp5          = _mm_add_ps(_tmp5,_tmp8); \
    /* Write the updated values back; only the low half of _tmp5 is stored */ \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp4); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp5); \
}
1049
1050
/* Add three xyz triplets held in SSE registers to three float arrays in memory.
 *
 * ptr1..ptr3  : float pointers; 9 consecutive floats are read, incremented and
 *               written back at each one. Each pointer expression is evaluated
 *               several times, so it must not have side effects.
 * jx1 ... jz3 : __m128 lvalues. Lane 0 of each is added to ptr1[0..8], lane 1
 *               to ptr2[0..8] and lane 2 to ptr3[0..8], in the order
 *               x1,y1,z1,x2,y2,z2,x3,y3,z3. Lane 3 is ignored.
 *
 * Side effect: jx1, jz1, jy2 and jx3 are overwritten with interleaved
 * intermediates, so they must be variables whose contents are no longer needed.
 */
#define GMX_MM_INCREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    /* Fetch 4 + 4 + 1 floats from every destination */ \
    _tmp1          = _mm_loadu_ps(ptr1); \
    _tmp2          = _mm_loadu_ps((ptr1)+4); \
    _tmp3          = _mm_load_ss((ptr1)+8); \
    _tmp4          = _mm_loadu_ps(ptr2); \
    _tmp5          = _mm_loadu_ps((ptr2)+4); \
    _tmp6          = _mm_load_ss((ptr2)+8); \
    _tmp7          = _mm_loadu_ps(ptr3); \
    _tmp8          = _mm_loadu_ps((ptr3)+4); \
    _tmp9          = _mm_load_ss((ptr3)+8); \
    /* Interleave component pairs; unpacklo serves ptr1/ptr2, unpackhi ptr3 */ \
    _tmp10         = _mm_unpackhi_ps(jx1,jy1); \
    jx1            = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11         = _mm_unpackhi_ps(jz1,jx2); \
    jz1            = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12         = _mm_unpackhi_ps(jy2,jz2); \
    jy2            = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13         = _mm_unpackhi_ps(jx3,jy3); \
    jx3            = _mm_unpacklo_ps(jx3,jy3); \
    /* Bring lane 1 and lane 2 of jz3 down to lane 0 for the scalar adds */ \
    _tmp14         = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp15         = _mm_movehl_ps(jz3,jz3); \
    /* Assemble the two 4-float chunks for each destination */ \
    _tmp16         = _mm_movelh_ps(jx1,jz1); \
    _tmp17         = _mm_movehl_ps(jz1,jx1); \
    _tmp10         = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18         = _mm_movelh_ps(jy2,jx3); \
    _tmp19         = _mm_movehl_ps(jx3,jy2); \
    _tmp12         = _mm_movelh_ps(_tmp12,_tmp13); \
    /* Accumulate */ \
    _tmp1          = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2          = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3          = _mm_add_ss(_tmp3,jz3); \
    _tmp4          = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5          = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6          = _mm_add_ss(_tmp6,_tmp14); \
    _tmp7          = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8          = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9          = _mm_add_ss(_tmp9,_tmp15); \
    /* Write the updated values back */ \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
}
1098
1099
/* Add four xyz triplets held in SSE registers to three float arrays in memory.
 *
 * ptr1..ptr3  : float pointers; 12 consecutive floats are read, incremented and
 *               written back at each one using unaligned loads/stores. Each
 *               pointer expression is evaluated several times, so it must not
 *               have side effects.
 * jx1 ... jz4 : __m128 lvalues. Lane 0 of each is added to ptr1[0..11], lane 1
 *               to ptr2[0..11] and lane 2 to ptr3[0..11], in the order
 *               x1,y1,z1,x2,...,z4. Lane 3 is ignored.
 *
 * Side effect: jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten with interleaved
 * intermediates, so they must be variables whose contents are no longer needed.
 */
#define GMX_MM_INCREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11; \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21; \
    /* Fetch the 12 floats currently stored behind each pointer */ \
    _tmp1          = _mm_loadu_ps(ptr1); \
    _tmp2          = _mm_loadu_ps((ptr1)+4); \
    _tmp3          = _mm_loadu_ps((ptr1)+8); \
    _tmp4          = _mm_loadu_ps(ptr2); \
    _tmp5          = _mm_loadu_ps((ptr2)+4); \
    _tmp6          = _mm_loadu_ps((ptr2)+8); \
    _tmp7          = _mm_loadu_ps(ptr3); \
    _tmp8          = _mm_loadu_ps((ptr3)+4); \
    _tmp9          = _mm_loadu_ps((ptr3)+8); \
    /* Interleave component pairs; unpacklo serves ptr1/ptr2, unpackhi ptr3 */ \
    _tmp10         = _mm_unpackhi_ps(jx1,jy1); \
    jx1            = _mm_unpacklo_ps(jx1,jy1); \
    _tmp11         = _mm_unpackhi_ps(jz1,jx2); \
    jz1            = _mm_unpacklo_ps(jz1,jx2); \
    _tmp12         = _mm_unpackhi_ps(jy2,jz2); \
    jy2            = _mm_unpacklo_ps(jy2,jz2); \
    _tmp13         = _mm_unpackhi_ps(jx3,jy3); \
    jx3            = _mm_unpacklo_ps(jx3,jy3); \
    _tmp14         = _mm_unpackhi_ps(jz3,jx4); \
    jz3            = _mm_unpacklo_ps(jz3,jx4); \
    _tmp15         = _mm_unpackhi_ps(jy4,jz4); \
    jy4            = _mm_unpacklo_ps(jy4,jz4); \
    /* Assemble the three 4-float chunks for each destination */ \
    _tmp16         = _mm_movelh_ps(jx1,jz1); \
    _tmp17         = _mm_movehl_ps(jz1,jx1); \
    _tmp10         = _mm_movelh_ps(_tmp10,_tmp11); \
    _tmp18         = _mm_movelh_ps(jy2,jx3); \
    _tmp19         = _mm_movehl_ps(jx3,jy2); \
    _tmp12         = _mm_movelh_ps(_tmp12,_tmp13); \
    _tmp20         = _mm_movelh_ps(jz3,jy4); \
    _tmp21         = _mm_movehl_ps(jy4,jz3); \
    _tmp14         = _mm_movelh_ps(_tmp14,_tmp15); \
    /* Accumulate */ \
    _tmp1          = _mm_add_ps(_tmp1,_tmp16); \
    _tmp2          = _mm_add_ps(_tmp2,_tmp18); \
    _tmp3          = _mm_add_ps(_tmp3,_tmp20); \
    _tmp4          = _mm_add_ps(_tmp4,_tmp17); \
    _tmp5          = _mm_add_ps(_tmp5,_tmp19); \
    _tmp6          = _mm_add_ps(_tmp6,_tmp21); \
    _tmp7          = _mm_add_ps(_tmp7,_tmp10); \
    _tmp8          = _mm_add_ps(_tmp8,_tmp12); \
    _tmp9          = _mm_add_ps(_tmp9,_tmp14); \
    /* Write the updated values back */ \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_storeu_ps((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_storeu_ps((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_storeu_ps((ptr3)+8,_tmp9); \
}
1152
1153
1154
/* Add one xyz triplet held in SSE registers to four float arrays in memory.
 *
 * ptr1..ptr4  : float pointers; 3 consecutive floats are read, incremented and
 *               written back at each one. Each pointer expression is evaluated
 *               several times, so it must not have side effects.
 * jx1,jy1,jz1 : __m128 values. Lane 0 is added to ptr1[0..2], lane 1 to
 *               ptr2[0..2], lane 2 to ptr3[0..2] and lane 3 to ptr4[0..2].
 *
 * The j-register arguments are only read, never modified.
 */
#define GMX_MM_INCREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    /* Load every triplet as (x, 0, y, z): x in lane 0, y and z in the high half */ \
    _tmp1          = _mm_load_ss(ptr1); \
    _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1)); \
    _tmp2          = _mm_load_ss(ptr2); \
    _tmp2          = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1)); \
    _tmp3          = _mm_load_ss(ptr3); \
    _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1)); \
    _tmp4          = _mm_load_ss(ptr4); \
    _tmp4          = _mm_loadh_pi(_tmp4,(__m64 *)((ptr4)+1)); \
    /* Pair up y and z, then build one (x, -, y, z) increment per pointer;     */ \
    /* lane 1 of each increment is a don't-care value that is never stored.    */ \
    _tmp5          = _mm_unpacklo_ps(jy1,jz1); \
    _tmp6          = _mm_unpackhi_ps(jy1,jz1); \
    _tmp7          = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
    _tmp8          = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
    _tmp9          = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
    _tmp10         = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
    /* Accumulate */ \
    _tmp1          = _mm_add_ps(_tmp1,_tmp7); \
    _tmp2          = _mm_add_ps(_tmp2,_tmp8); \
    _tmp3          = _mm_add_ps(_tmp3,_tmp9); \
    _tmp4          = _mm_add_ps(_tmp4,_tmp10); \
    /* Store lane 0 (x) and the high half (y,z) back to each destination */ \
    _mm_store_ss(ptr1,_tmp1); \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1); \
    _mm_store_ss(ptr2,_tmp2); \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2); \
    _mm_store_ss(ptr3,_tmp3); \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3); \
    _mm_store_ss(ptr4,_tmp4); \
    _mm_storeh_pi((__m64 *)((ptr4)+1),_tmp4); \
}
1184
1185
/* Add two xyz triplets held in SSE registers to four float arrays in memory.
 *
 * ptr1..ptr4  : float pointers; 6 consecutive floats are read, incremented and
 *               written back at each one. Each pointer expression is evaluated
 *               several times, so it must not have side effects.
 * jx1 ... jz2 : __m128 lvalues. Lane 0 of each is added to ptr1[0..5], lane 1
 *               to ptr2[0..5], lane 2 to ptr3[0..5] and lane 3 to ptr4[0..5],
 *               in the order x1,y1,z1,x2,y2,z2.
 *
 * Side effect: jx1, jz1 and jy2 are overwritten with interleaved intermediates,
 * so they must be variables whose contents are no longer needed.
 */
#define GMX_MM_INCREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
    /* First four floats (x1,y1,z1,x2) of every destination */ \
    _tmp1          = _mm_loadu_ps(ptr1); \
    _tmp2          = _mm_loadu_ps(ptr2); \
    _tmp3          = _mm_loadu_ps(ptr3); \
    _tmp4          = _mm_loadu_ps(ptr4); \
    /* Remaining (y2,z2) pairs: ptr1|ptr2 packed in _tmp5, ptr3|ptr4 in _tmp6 */ \
    _tmp5          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4)); \
    _tmp5          = _mm_loadh_pi(_tmp5,(__m64 *)((ptr2)+4)); \
    _tmp6          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4)); \
    _tmp6          = _mm_loadh_pi(_tmp6,(__m64 *)((ptr4)+4)); \
    /* Interleave component pairs; unpacklo serves ptr1/ptr2, unpackhi ptr3/ptr4 */ \
    _tmp7          = _mm_unpackhi_ps(jx1,jy1); \
    jx1            = _mm_unpacklo_ps(jx1,jy1); \
    _tmp8          = _mm_unpackhi_ps(jz1,jx2); \
    jz1            = _mm_unpacklo_ps(jz1,jx2); \
    _tmp9          = _mm_unpackhi_ps(jy2,jz2); \
    jy2            = _mm_unpacklo_ps(jy2,jz2); \
    /* Assemble (x1,y1,z1,x2) for ptr1..ptr4 */ \
    _tmp10         = _mm_movelh_ps(jx1,jz1); \
    _tmp11         = _mm_movehl_ps(jz1,jx1); \
    _tmp12         = _mm_movelh_ps(_tmp7,_tmp8); \
    _tmp13         = _mm_movehl_ps(_tmp8,_tmp7); \
    /* Accumulate */ \
    _tmp1          = _mm_add_ps(_tmp1,_tmp10); \
    _tmp2          = _mm_add_ps(_tmp2,_tmp11); \
    _tmp3          = _mm_add_ps(_tmp3,_tmp12); \
    _tmp4          = _mm_add_ps(_tmp4,_tmp13); \
    _tmp5          = _mm_add_ps(_tmp5,jy2); \
    _tmp6          = _mm_add_ps(_tmp6,_tmp9); \
    /* Write the updated values back */ \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps(ptr2,_tmp2); \
    _mm_storeu_ps(ptr3,_tmp3); \
    _mm_storeu_ps(ptr4,_tmp4); \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp5); \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp5); \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp6); \
    _mm_storeh_pi((__m64 *)((ptr4)+4),_tmp6); \
}
1221
1222
/* Add three xyz triplets held in SSE registers to four float arrays in memory.
 *
 * ptr1..ptr4  : float pointers; 9 consecutive floats are read, incremented and
 *               written back at each one. Each pointer expression is evaluated
 *               several times, so it must not have side effects.
 * jx1 ... jz3 : __m128 lvalues. Lane 0 of each is added to ptr1[0..8], lane 1
 *               to ptr2[0..8], lane 2 to ptr3[0..8] and lane 3 to ptr4[0..8],
 *               in the order x1,y1,z1,x2,y2,z2,x3,y3,z3.
 *
 * Side effect: jx1, jz1, jy2 and jx3 are overwritten with interleaved
 * intermediates, so they must be variables whose contents are no longer needed.
 */
#define GMX_MM_INCREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10; \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19; \
    __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25; \
    /* Fetch 4 + 4 + 1 floats from every destination */ \
    _tmp1          = _mm_loadu_ps(ptr1); \
    _tmp2          = _mm_loadu_ps((ptr1)+4); \
    _tmp3          = _mm_load_ss((ptr1)+8); \
    _tmp4          = _mm_loadu_ps(ptr2); \
    _tmp5          = _mm_loadu_ps((ptr2)+4); \
    _tmp6          = _mm_load_ss((ptr2)+8); \
    _tmp7          = _mm_loadu_ps(ptr3); \
    _tmp8          = _mm_loadu_ps((ptr3)+4); \
    _tmp9          = _mm_load_ss((ptr3)+8); \
    _tmp10         = _mm_loadu_ps(ptr4); \
    _tmp11         = _mm_loadu_ps((ptr4)+4); \
    _tmp12         = _mm_load_ss((ptr4)+8); \
    /* Interleave component pairs; unpacklo serves ptr1/ptr2, unpackhi ptr3/ptr4 */ \
    _tmp13         = _mm_unpackhi_ps(jx1,jy1); \
    jx1            = _mm_unpacklo_ps(jx1,jy1); \
    _tmp14         = _mm_unpackhi_ps(jz1,jx2); \
    jz1            = _mm_unpacklo_ps(jz1,jx2); \
    _tmp15         = _mm_unpackhi_ps(jy2,jz2); \
    jy2            = _mm_unpacklo_ps(jy2,jz2); \
    _tmp16         = _mm_unpackhi_ps(jx3,jy3); \
    jx3            = _mm_unpacklo_ps(jx3,jy3); \
    /* Bring lanes 1, 2 and 3 of jz3 down to lane 0 for the scalar adds */ \
    _tmp17         = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1)); \
    _tmp18         = _mm_movehl_ps(jz3,jz3); \
    _tmp19         = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1)); \
    /* Assemble the two 4-float chunks for each destination */ \
    _tmp20         = _mm_movelh_ps(jx1,jz1); \
    _tmp21         = _mm_movehl_ps(jz1,jx1); \
    _tmp22         = _mm_movelh_ps(_tmp13,_tmp14); \
    _tmp14         = _mm_movehl_ps(_tmp14,_tmp13); \
    _tmp23         = _mm_movelh_ps(jy2,jx3); \
    _tmp24         = _mm_movehl_ps(jx3,jy2); \
    _tmp25         = _mm_movelh_ps(_tmp15,_tmp16); \
    _tmp16         = _mm_movehl_ps(_tmp16,_tmp15); \
    /* Accumulate */ \
    _tmp1          = _mm_add_ps(_tmp1,_tmp20); \
    _tmp2          = _mm_add_ps(_tmp2,_tmp23); \
    _tmp3          = _mm_add_ss(_tmp3,jz3); \
    _tmp4          = _mm_add_ps(_tmp4,_tmp21); \
    _tmp5          = _mm_add_ps(_tmp5,_tmp24); \
    _tmp6          = _mm_add_ss(_tmp6,_tmp17); \
    _tmp7          = _mm_add_ps(_tmp7,_tmp22); \
    _tmp8          = _mm_add_ps(_tmp8,_tmp25); \
    _tmp9          = _mm_add_ss(_tmp9,_tmp18); \
    _tmp10         = _mm_add_ps(_tmp10,_tmp14); \
    _tmp11         = _mm_add_ps(_tmp11,_tmp16); \
    _tmp12         = _mm_add_ss(_tmp12,_tmp19); \
    /* Write the updated values back */ \
    _mm_storeu_ps(ptr1,_tmp1); \
    _mm_storeu_ps((ptr1)+4,_tmp2); \
    _mm_store_ss((ptr1)+8,_tmp3); \
    _mm_storeu_ps(ptr2,_tmp4); \
    _mm_storeu_ps((ptr2)+4,_tmp5); \
    _mm_store_ss((ptr2)+8,_tmp6); \
    _mm_storeu_ps(ptr3,_tmp7); \
    _mm_storeu_ps((ptr3)+4,_tmp8); \
    _mm_store_ss((ptr3)+8,_tmp9); \
    _mm_storeu_ps(ptr4,_tmp10); \
    _mm_storeu_ps((ptr4)+4,_tmp11); \
    _mm_store_ss((ptr4)+8,_tmp12); \
}
1283
1284
/* Add four x/y/z triplets per pointer to the twelve floats stored at each of
 * ptr1..ptr4.
 *
 * Element k (0..3) of every j register is the contribution for pointer k+1.
 * The twelve registers are transposed with unpack/move shuffles into three
 * 4-float rows per pointer, {x1,y1,z1,x2}, {y2,z2,x3,y3} and {z3,x4,y4,z4},
 * which are added to ptrN[0..3], ptrN[4..7] and ptrN[8..11] using unaligned
 * loads and stores.
 *
 * Side effects: jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten with shuffle
 * intermediates; the remaining j arguments are only read.
 * All loads are issued before the first store, so if two pointers refer to
 * overlapping memory only the later store survives.
 * Pointer arguments are parenthesized in the expansion but are expanded
 * several times: pass expressions without side effects.
 */
#define GMX_MM_INCREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;                                      \
    __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22;                             \
    __m128 _tmp23,_tmp24;                                                                                            \
    /* Current memory contents: three rows of four floats per pointer */                                             \
    _tmp1          = _mm_loadu_ps(ptr1);                                                                             \
    _tmp2          = _mm_loadu_ps((ptr1)+4);                                                                         \
    _tmp3          = _mm_loadu_ps((ptr1)+8);                                                                         \
    _tmp4          = _mm_loadu_ps(ptr2);                                                                             \
    _tmp5          = _mm_loadu_ps((ptr2)+4);                                                                         \
    _tmp6          = _mm_loadu_ps((ptr2)+8);                                                                         \
    _tmp7          = _mm_loadu_ps(ptr3);                                                                             \
    _tmp8          = _mm_loadu_ps((ptr3)+4);                                                                         \
    _tmp9          = _mm_loadu_ps((ptr3)+8);                                                                         \
    _tmp10         = _mm_loadu_ps(ptr4);                                                                             \
    _tmp11         = _mm_loadu_ps((ptr4)+4);                                                                         \
    _tmp12         = _mm_loadu_ps((ptr4)+8);                                                                         \
    /* Interleave register pairs: "lo" holds elements 0,1 and "hi" holds elements 2,3 */                             \
    _tmp13         = _mm_unpackhi_ps(jx1,jy1);                                                                       \
    jx1            = _mm_unpacklo_ps(jx1,jy1);                                                                       \
    _tmp14         = _mm_unpackhi_ps(jz1,jx2);                                                                       \
    jz1            = _mm_unpacklo_ps(jz1,jx2);                                                                       \
    _tmp15         = _mm_unpackhi_ps(jy2,jz2);                                                                       \
    jy2            = _mm_unpacklo_ps(jy2,jz2);                                                                       \
    _tmp16         = _mm_unpackhi_ps(jx3,jy3);                                                                       \
    jx3            = _mm_unpacklo_ps(jx3,jy3);                                                                       \
    _tmp17         = _mm_unpackhi_ps(jz3,jx4);                                                                       \
    jz3            = _mm_unpacklo_ps(jz3,jx4);                                                                       \
    _tmp18         = _mm_unpackhi_ps(jy4,jz4);                                                                       \
    jy4            = _mm_unpacklo_ps(jy4,jz4);                                                                       \
    /* Combine the halves into one 4-float row per pointer */                                                        \
    _tmp19         = _mm_movelh_ps(jx1,jz1);                                                                         \
    jz1            = _mm_movehl_ps(jz1,jx1);                                                                         \
    _tmp20         = _mm_movelh_ps(_tmp13,_tmp14);                                                                   \
    _tmp14         = _mm_movehl_ps(_tmp14,_tmp13);                                                                   \
    _tmp21         = _mm_movelh_ps(jy2,jx3);                                                                         \
    jx3            = _mm_movehl_ps(jx3,jy2);                                                                         \
    _tmp22         = _mm_movelh_ps(_tmp15,_tmp16);                                                                   \
    _tmp16         = _mm_movehl_ps(_tmp16,_tmp15);                                                                   \
    _tmp23         = _mm_movelh_ps(jz3,jy4);                                                                         \
    jy4            = _mm_movehl_ps(jy4,jz3);                                                                         \
    _tmp24         = _mm_movelh_ps(_tmp17,_tmp18);                                                                   \
    _tmp18         = _mm_movehl_ps(_tmp18,_tmp17);                                                                   \
    /* Accumulate */                                                                                                 \
    _tmp1          = _mm_add_ps(_tmp1,_tmp19);                                                                       \
    _tmp2          = _mm_add_ps(_tmp2,_tmp21);                                                                       \
    _tmp3          = _mm_add_ps(_tmp3,_tmp23);                                                                       \
    _tmp4          = _mm_add_ps(_tmp4,jz1);                                                                          \
    _tmp5          = _mm_add_ps(_tmp5,jx3);                                                                          \
    _tmp6          = _mm_add_ps(_tmp6,jy4);                                                                          \
    _tmp7          = _mm_add_ps(_tmp7,_tmp20);                                                                       \
    _tmp8          = _mm_add_ps(_tmp8,_tmp22);                                                                       \
    _tmp9          = _mm_add_ps(_tmp9,_tmp24);                                                                       \
    _tmp10         = _mm_add_ps(_tmp10,_tmp14);                                                                      \
    _tmp11         = _mm_add_ps(_tmp11,_tmp16);                                                                      \
    _tmp12         = _mm_add_ps(_tmp12,_tmp18);                                                                      \
    /* Write back */                                                                                                 \
    _mm_storeu_ps(ptr1,_tmp1);                                                                                       \
    _mm_storeu_ps((ptr1)+4,_tmp2);                                                                                   \
    _mm_storeu_ps((ptr1)+8,_tmp3);                                                                                   \
    _mm_storeu_ps(ptr2,_tmp4);                                                                                       \
    _mm_storeu_ps((ptr2)+4,_tmp5);                                                                                   \
    _mm_storeu_ps((ptr2)+8,_tmp6);                                                                                   \
    _mm_storeu_ps(ptr3,_tmp7);                                                                                       \
    _mm_storeu_ps((ptr3)+4,_tmp8);                                                                                   \
    _mm_storeu_ps((ptr3)+8,_tmp9);                                                                                   \
    _mm_storeu_ps(ptr4,_tmp10);                                                                                      \
    _mm_storeu_ps((ptr4)+4,_tmp11);                                                                                  \
    _mm_storeu_ps((ptr4)+8,_tmp12);                                                                                  \
}
1350
1351
1352
/* Subtract element 0 of jx1, jy1 and jz1 from ptr1[0], ptr1[1] and ptr1[2].
 *
 * Exactly three floats are read and written: ptr1[0] with a scalar access
 * and ptr1[1..2] with a 64-bit half-register access.
 *
 * Side effects: jy1 and jx1 are overwritten with shuffle intermediates;
 * jz1 is only read.
 * ptr1 is parenthesized in the expansion but is expanded several times:
 * pass an expression without side effects.
 */
#define GMX_MM_DECREMENT_1RVEC_1POINTER_PS(ptr1,jx1,jy1,jz1) {     \
    __m128 _tmp1;                                                    \
    /* Build {x, (unused), y, z} to match the load layout below */   \
    jy1            = _mm_unpacklo_ps(jy1,jz1);                       \
    jx1            = _mm_movelh_ps(jx1,jy1);                         \
    _tmp1          = _mm_load_ss(ptr1);                              \
    _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1));        \
    _tmp1          = _mm_sub_ps(_tmp1,jx1);                          \
    _mm_store_ss(ptr1,_tmp1);                                        \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1);                        \
}
1363
1364
/* Subtract element 0 of jx1,jy1,jz1,jx2,jy2,jz2 from ptr1[0..5].
 *
 * ptr1[0..3] is accessed with one unaligned 4-float load/store and
 * ptr1[4..5] with a 64-bit half-register load/store, so exactly six floats
 * are read and written.
 *
 * Side effects: jx1, jz1 and jy2 are overwritten with shuffle
 * intermediates; jy1, jx2 and jz2 are only read.
 * ptr1 is parenthesized in the expansion but is expanded several times:
 * pass an expression without side effects.
 */
#define GMX_MM_DECREMENT_2RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1, _tmp2;                                                      \
    _tmp1          = _mm_loadu_ps(ptr1);                                      \
    _tmp2          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4));      \
    /* Pack element 0 of each register into {x1,y1,z1,x2} and {y2,z2,..} */   \
    jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
    jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
    jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
    jx1            = _mm_movelh_ps(jx1,jz1);                                  \
    _tmp1          = _mm_sub_ps(_tmp1,jx1);                                   \
    _tmp2          = _mm_sub_ps(_tmp2,jy2);                                   \
    _mm_storeu_ps(ptr1,_tmp1);                                                \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp2);                                 \
}
1378
1379
/* Subtract element 0 of jx1..jz3 (nine registers) from ptr1[0..8].
 *
 * ptr1[0..3] and ptr1[4..7] are accessed with unaligned 4-float
 * loads/stores and ptr1[8] with a scalar load/store, so exactly nine floats
 * are read and written.
 *
 * Side effects: jx1, jz1, jy2 and jx3 are overwritten with shuffle
 * intermediates; the other j arguments are only read.
 * ptr1 is parenthesized in the expansion but is expanded several times:
 * pass an expression without side effects.
 */
#define GMX_MM_DECREMENT_3RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1, _tmp2, _tmp3;                                                           \
    _tmp1          = _mm_loadu_ps(ptr1);                                                  \
    _tmp2          = _mm_loadu_ps((ptr1)+4);                                              \
    _tmp3          = _mm_load_ss((ptr1)+8);                                               \
    /* Pack element 0 of each register into {x1,y1,z1,x2} and {y2,z2,x3,y3} */            \
    jx1            = _mm_unpacklo_ps(jx1,jy1);                                            \
    jz1            = _mm_unpacklo_ps(jz1,jx2);                                            \
    jy2            = _mm_unpacklo_ps(jy2,jz2);                                            \
    jx3            = _mm_unpacklo_ps(jx3,jy3);                                            \
    jx1            = _mm_movelh_ps(jx1,jz1);                                              \
    jy2            = _mm_movelh_ps(jy2,jx3);                                              \
    _tmp1          = _mm_sub_ps(_tmp1,jx1);                                               \
    _tmp2          = _mm_sub_ps(_tmp2,jy2);                                               \
    _tmp3          = _mm_sub_ss(_tmp3,jz3);                                               \
    _mm_storeu_ps(ptr1,_tmp1);                                                            \
    _mm_storeu_ps((ptr1)+4,_tmp2);                                                        \
    _mm_store_ss((ptr1)+8,_tmp3);                                                         \
}
1398
1399
/* Subtract element 0 of jx1..jz4 (twelve registers) from ptr1[0..11].
 *
 * The twelve floats are accessed with three unaligned 4-float loads and
 * stores.
 *
 * Side effects: jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten with
 * shuffle intermediates; the other j arguments are only read.
 * ptr1 is parenthesized in the expansion but is expanded several times:
 * pass an expression without side effects.
 */
#define GMX_MM_DECREMENT_4RVECS_1POINTER_PS(ptr1,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
    __m128 _tmp1, _tmp2, _tmp3;                                                                       \
    _tmp1          = _mm_loadu_ps(ptr1);                                                              \
    _tmp2          = _mm_loadu_ps((ptr1)+4);                                                          \
    _tmp3          = _mm_loadu_ps((ptr1)+8);                                                          \
    /* Pack element 0 of each register into three rows of four floats */                              \
    jx1            = _mm_unpacklo_ps(jx1,jy1);                                                        \
    jz1            = _mm_unpacklo_ps(jz1,jx2);                                                        \
    jy2            = _mm_unpacklo_ps(jy2,jz2);                                                        \
    jx3            = _mm_unpacklo_ps(jx3,jy3);                                                        \
    jz3            = _mm_unpacklo_ps(jz3,jx4);                                                        \
    jy4            = _mm_unpacklo_ps(jy4,jz4);                                                        \
    jx1            = _mm_movelh_ps(jx1,jz1);                                                          \
    jy2            = _mm_movelh_ps(jy2,jx3);                                                          \
    jz3            = _mm_movelh_ps(jz3,jy4);                                                          \
    _tmp1          = _mm_sub_ps(_tmp1,jx1);                                                           \
    _tmp2          = _mm_sub_ps(_tmp2,jy2);                                                           \
    _tmp3          = _mm_sub_ps(_tmp3,jz3);                                                           \
    _mm_storeu_ps(ptr1,_tmp1);                                                                        \
    _mm_storeu_ps((ptr1)+4,_tmp2);                                                                    \
    _mm_storeu_ps((ptr1)+8,_tmp3);                                                                    \
}
1421
1422
/* Subtract one x/y/z triplet from the three floats at each of two pointers.
 *
 * Element 0 of jx1/jy1/jz1 is subtracted from ptr1[0..2] and element 1 from
 * ptr2[0..2].  The x,y pair of each pointer is accessed with a 64-bit
 * half-register load/store and the z value with a scalar load/store, so
 * exactly three floats per pointer are read and written.
 *
 * Side effects: jx1 is overwritten with the interleaved x/y values;
 * jy1 and jz1 are only read.
 * Pointer arguments are parenthesized in the expansion but are expanded
 * several times: pass expressions without side effects.
 */
#define GMX_MM_DECREMENT_1RVEC_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1) {        \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4;                                           \
    _tmp1          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1));          \
    _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr2));                     \
    _tmp2          = _mm_load_ss((ptr1)+2);                                   \
    _tmp3          = _mm_load_ss((ptr2)+2);                                   \
    jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
    /* Bring element 1 of jz1 down to element 0 for the scalar subtract */    \
    _tmp4          = _mm_shuffle_ps(jz1,jz1,_MM_SHUFFLE(0,0,0,1));            \
    _tmp1          = _mm_sub_ps(_tmp1,jx1);                                   \
    _mm_storel_pi((__m64 *)(ptr1),_tmp1);                                     \
    _mm_storeh_pi((__m64 *)(ptr2),_tmp1);                                     \
    _mm_store_ss((ptr1)+2,_mm_sub_ss(_tmp2,jz1));                             \
    _mm_store_ss((ptr2)+2,_mm_sub_ss(_tmp3,_tmp4));                           \
}
1437
1438
/* Subtract two x/y/z triplets from the six floats at each of two pointers.
 *
 * Element 0 of every j register is subtracted from ptr1[0..5] and element 1
 * from ptr2[0..5].  Floats 0..3 of each pointer are accessed with an
 * unaligned 4-float load/store; floats 4..5 of both pointers share one
 * register (low half ptr1, high half ptr2) and are accessed with 64-bit
 * half-register loads/stores.
 *
 * Side effects: jx1, jz1 and jy2 are overwritten with shuffle
 * intermediates; jy1, jx2 and jz2 are only read.
 * Pointer arguments are parenthesized in the expansion but are expanded
 * several times: pass expressions without side effects.
 */
#define GMX_MM_DECREMENT_2RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5;                                           \
    _tmp1          = _mm_loadu_ps(ptr1);                                            \
    _tmp2          = _mm_loadu_ps(ptr2);                                            \
    _tmp3          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4));            \
    _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)((ptr2)+4));                       \
    jx1            = _mm_unpacklo_ps(jx1,jy1);                                      \
    jz1            = _mm_unpacklo_ps(jz1,jx2);                                      \
    jy2            = _mm_unpacklo_ps(jy2,jz2);                                      \
    /* _tmp4 = {x1,y1,z1,x2} for ptr1, _tmp5 = the same for ptr2 */                 \
    _tmp4          = _mm_movelh_ps(jx1,jz1);                                        \
    _tmp5          = _mm_movehl_ps(jz1,jx1);                                        \
    _tmp1          = _mm_sub_ps(_tmp1,_tmp4);                                       \
    _tmp2          = _mm_sub_ps(_tmp2,_tmp5);                                       \
    _tmp3          = _mm_sub_ps(_tmp3,jy2);                                         \
    _mm_storeu_ps(ptr1,_tmp1);                                                      \
    _mm_storeu_ps(ptr2,_tmp2);                                                      \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp3);                                       \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp3);                                       \
}
1458
1459
/* Subtract three x/y/z triplets from the nine floats at each of two pointers.
 *
 * Element 0 of every j register is subtracted from ptr1[0..8] and element 1
 * from ptr2[0..8].  Floats 0..7 of each pointer are accessed with two
 * unaligned 4-float loads/stores and float 8 with a scalar load/store, so
 * exactly nine floats per pointer are read and written.
 *
 * Side effects: jx1, jz1, jy2 and jx3 are overwritten with shuffle
 * intermediates; the other j arguments are only read.
 * Pointer arguments are parenthesized in the expansion but are expanded
 * several times: pass expressions without side effects.
 */
#define GMX_MM_DECREMENT_3RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) {\
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;                \
    _tmp1          = _mm_loadu_ps(ptr1);                                                       \
    _tmp2          = _mm_loadu_ps((ptr1)+4);                                                   \
    _tmp3          = _mm_load_ss((ptr1)+8);                                                    \
    _tmp4          = _mm_loadu_ps(ptr2);                                                       \
    _tmp5          = _mm_loadu_ps((ptr2)+4);                                                   \
    _tmp6          = _mm_load_ss((ptr2)+8);                                                    \
    jx1            = _mm_unpacklo_ps(jx1,jy1);                                                 \
    jz1            = _mm_unpacklo_ps(jz1,jx2);                                                 \
    jy2            = _mm_unpacklo_ps(jy2,jz2);                                                 \
    jx3            = _mm_unpacklo_ps(jx3,jy3);                                                 \
    /* Bring element 1 of jz3 down to element 0 for the scalar subtract on ptr2 */             \
    _tmp7          = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));                             \
    /* movelh gives the ptr1 row, movehl the matching ptr2 row */                              \
    _tmp8          = _mm_movelh_ps(jx1,jz1);                                                   \
    _tmp9          = _mm_movehl_ps(jz1,jx1);                                                   \
    _tmp10         = _mm_movelh_ps(jy2,jx3);                                                   \
    _tmp11         = _mm_movehl_ps(jx3,jy2);                                                   \
    _tmp1          = _mm_sub_ps(_tmp1,_tmp8);                                                  \
    _tmp2          = _mm_sub_ps(_tmp2,_tmp10);                                                 \
    _tmp3          = _mm_sub_ss(_tmp3,jz3);                                                    \
    _tmp4          = _mm_sub_ps(_tmp4,_tmp9);                                                  \
    _tmp5          = _mm_sub_ps(_tmp5,_tmp11);                                                 \
    _tmp6          = _mm_sub_ss(_tmp6,_tmp7);                                                  \
    _mm_storeu_ps(ptr1,_tmp1);                                                                 \
    _mm_storeu_ps((ptr1)+4,_tmp2);                                                             \
    _mm_store_ss((ptr1)+8,_tmp3);                                                              \
    _mm_storeu_ps(ptr2,_tmp4);                                                                 \
    _mm_storeu_ps((ptr2)+4,_tmp5);                                                             \
    _mm_store_ss((ptr2)+8,_tmp6);                                                              \
}
1490
1491
/* Subtract four x/y/z triplets from the twelve floats at each of two pointers.
 *
 * Element 0 of every j register is subtracted from ptr1[0..11] and element 1
 * from ptr2[0..11], using three unaligned 4-float loads and stores per
 * pointer.
 *
 * Side effects: jx1, jz1, jy2, jx3, jz3 and jy4 are overwritten with
 * shuffle intermediates; the other j arguments are only read.
 * Pointer arguments are parenthesized in the expansion but are expanded
 * several times: pass expressions without side effects.
 */
#define GMX_MM_DECREMENT_4RVECS_2POINTERS_PS(ptr1,ptr2,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) {\
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12;                     \
    _tmp1          = _mm_loadu_ps(ptr1);                                                                   \
    _tmp2          = _mm_loadu_ps((ptr1)+4);                                                               \
    _tmp3          = _mm_loadu_ps((ptr1)+8);                                                               \
    _tmp4          = _mm_loadu_ps(ptr2);                                                                   \
    _tmp5          = _mm_loadu_ps((ptr2)+4);                                                               \
    _tmp6          = _mm_loadu_ps((ptr2)+8);                                                               \
    jx1            = _mm_unpacklo_ps(jx1,jy1);                                                             \
    jz1            = _mm_unpacklo_ps(jz1,jx2);                                                             \
    jy2            = _mm_unpacklo_ps(jy2,jz2);                                                             \
    jx3            = _mm_unpacklo_ps(jx3,jy3);                                                             \
    jz3            = _mm_unpacklo_ps(jz3,jx4);                                                             \
    jy4            = _mm_unpacklo_ps(jy4,jz4);                                                             \
    /* movelh gives the ptr1 row, movehl the matching ptr2 row */                                          \
    _tmp7          = _mm_movelh_ps(jx1,jz1);                                                               \
    _tmp8          = _mm_movehl_ps(jz1,jx1);                                                               \
    _tmp9          = _mm_movelh_ps(jy2,jx3);                                                               \
    _tmp10         = _mm_movehl_ps(jx3,jy2);                                                               \
    _tmp11         = _mm_movelh_ps(jz3,jy4);                                                               \
    _tmp12         = _mm_movehl_ps(jy4,jz3);                                                               \
    _tmp1          = _mm_sub_ps(_tmp1,_tmp7);                                                              \
    _tmp2          = _mm_sub_ps(_tmp2,_tmp9);                                                              \
    _tmp3          = _mm_sub_ps(_tmp3,_tmp11);                                                             \
    _tmp4          = _mm_sub_ps(_tmp4,_tmp8);                                                              \
    _tmp5          = _mm_sub_ps(_tmp5,_tmp10);                                                             \
    _tmp6          = _mm_sub_ps(_tmp6,_tmp12);                                                             \
    _mm_storeu_ps(ptr1,_tmp1);                                                                             \
    _mm_storeu_ps((ptr1)+4,_tmp2);                                                                         \
    _mm_storeu_ps((ptr1)+8,_tmp3);                                                                         \
    _mm_storeu_ps(ptr2,_tmp4);                                                                             \
    _mm_storeu_ps((ptr2)+4,_tmp5);                                                                         \
    _mm_storeu_ps((ptr2)+8,_tmp6);                                                                         \
}
1525
1526
/* Subtract one x/y/z triplet from the three floats at each of three pointers.
 *
 * Element 0 of jx1/jy1/jz1 is subtracted from ptr1[0..2], element 1 from
 * ptr2[0..2] and element 2 from ptr3[0..2].  Each pointer is accessed with
 * a scalar load/store for float 0 and a 64-bit half-register load/store for
 * floats 1..2, so exactly three floats per pointer are read and written.
 * Register element 1 of every working value is padding and is never stored.
 *
 * Side effects: jx1 is overwritten with a shuffle intermediate; jy1 and jz1
 * are only read.
 * Pointer arguments are parenthesized in the expansion but are expanded
 * several times: pass expressions without side effects.
 */
#define GMX_MM_DECREMENT_1RVEC_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7;                       \
    _tmp1          = _mm_load_ss(ptr1);                                     \
    _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)((ptr1)+1));               \
    _tmp2          = _mm_load_ss(ptr2);                                     \
    _tmp2          = _mm_loadh_pi(_tmp2,(__m64 *)((ptr2)+1));               \
    _tmp3          = _mm_load_ss(ptr3);                                     \
    _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)((ptr3)+1));               \
    /* _tmp4 = y/z pairs of elements 0,1; _tmp5 = y/z pairs of 2,3 */       \
    _tmp4          = _mm_unpacklo_ps(jy1,jz1);                              \
    _tmp5          = _mm_unpackhi_ps(jy1,jz1);                              \
    /* ptr2 row: x of element 1 in slot 0, y/z of element 1 on top */       \
    _tmp6          = _mm_shuffle_ps(jx1,_tmp4,_MM_SHUFFLE(3,2,0,1));        \
    /* ptr3 row: x of element 2 in slot 0, y/z added by movelh below */     \
    _tmp7          = _mm_shuffle_ps(jx1,jx1,_MM_SHUFFLE(0,0,0,2));          \
    jx1            = _mm_movelh_ps(jx1,_tmp4);                              \
    _tmp7          = _mm_movelh_ps(_tmp7,_tmp5);                            \
    _tmp1          = _mm_sub_ps(_tmp1,jx1);                                 \
    _tmp2          = _mm_sub_ps(_tmp2,_tmp6);                               \
    _tmp3          = _mm_sub_ps(_tmp3,_tmp7);                               \
    _mm_store_ss(ptr1,_tmp1);                                               \
    _mm_storeh_pi((__m64 *)((ptr1)+1),_tmp1);                               \
    _mm_store_ss(ptr2,_tmp2);                                               \
    _mm_storeh_pi((__m64 *)((ptr2)+1),_tmp2);                               \
    _mm_store_ss(ptr3,_tmp3);                                               \
    _mm_storeh_pi((__m64 *)((ptr3)+1),_tmp3);                               \
}
1551
1552
/* Subtract two x/y/z triplets from the six floats at each of three pointers.
 *
 * Element 0 of every j register is subtracted from ptr1[0..5], element 1
 * from ptr2[0..5] and element 2 from ptr3[0..5].  Floats 0..3 of each
 * pointer are accessed with an unaligned 4-float load/store; floats 4..5
 * are accessed with 64-bit half-register loads/stores (ptr1 and ptr2 share
 * _tmp4, ptr3 uses the low half of _tmp5).
 *
 * The half-register loads start from _mm_setzero_ps() so that no
 * uninitialized register contents are read or fed through the subtraction.
 *
 * Side effects: jx1, jz1 and jy2 are overwritten with shuffle
 * intermediates; jy1, jx2 and jz2 are only read.
 * Pointer arguments are parenthesized in the expansion but are expanded
 * several times: pass expressions without side effects.
 */
#define GMX_MM_DECREMENT_2RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;                 \
    _tmp1          = _mm_loadu_ps(ptr1);                                                 \
    _tmp2          = _mm_loadu_ps(ptr2);                                                 \
    _tmp3          = _mm_loadu_ps(ptr3);                                                 \
    _tmp4          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr1)+4));                 \
    _tmp4          = _mm_loadh_pi(_tmp4,(__m64 *)((ptr2)+4));                            \
    _tmp5          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)((ptr3)+4));                 \
    /* "lo" results carry elements 0,1 (ptr1,ptr2); "hi" results carry element 2 */      \
    _tmp6          = _mm_unpackhi_ps(jx1,jy1);                                           \
    jx1            = _mm_unpacklo_ps(jx1,jy1);                                           \
    _tmp7          = _mm_unpackhi_ps(jz1,jx2);                                           \
    jz1            = _mm_unpacklo_ps(jz1,jx2);                                           \
    _tmp8          = _mm_unpackhi_ps(jy2,jz2);                                           \
    jy2            = _mm_unpacklo_ps(jy2,jz2);                                           \
    _tmp9          = _mm_movelh_ps(jx1,jz1);                                             \
    _tmp10         = _mm_movehl_ps(jz1,jx1);                                             \
    _tmp6          = _mm_movelh_ps(_tmp6,_tmp7);                                         \
    _tmp1          = _mm_sub_ps(_tmp1,_tmp9);                                            \
    _tmp2          = _mm_sub_ps(_tmp2,_tmp10);                                           \
    _tmp3          = _mm_sub_ps(_tmp3,_tmp6);                                            \
    _tmp4          = _mm_sub_ps(_tmp4,jy2);                                              \
    _tmp5          = _mm_sub_ps(_tmp5,_tmp8);                                            \
    _mm_storeu_ps(ptr1,_tmp1);                                                           \
    _mm_storeu_ps(ptr2,_tmp2);                                                           \
    _mm_storeu_ps(ptr3,_tmp3);                                                           \
    _mm_storel_pi((__m64 *)((ptr1)+4),_tmp4);                                            \
    _mm_storeh_pi((__m64 *)((ptr2)+4),_tmp4);                                            \
    _mm_storel_pi((__m64 *)((ptr3)+4),_tmp5);                                            \
}
1582
1583
/* Subtract three x/y/z triplets from the nine floats at each of three
 * pointers.
 *
 * Element 0 of every j register is subtracted from ptr1[0..8], element 1
 * from ptr2[0..8] and element 2 from ptr3[0..8].  Floats 0..7 of each
 * pointer are accessed with two unaligned 4-float loads/stores and float 8
 * with a scalar load/store, so exactly nine floats per pointer are read and
 * written.
 *
 * Side effects: jx1, jz1, jy2 and jx3 are overwritten with shuffle
 * intermediates; the other j arguments are only read.
 * Pointer arguments are parenthesized in the expansion but are expanded
 * several times: pass expressions without side effects.
 */
#define GMX_MM_DECREMENT_3RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
    __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;      \
    __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19;    \
    _tmp1          = _mm_loadu_ps(ptr1);                                      \
    _tmp2          = _mm_loadu_ps((ptr1)+4);                                  \
    _tmp3          = _mm_load_ss((ptr1)+8);                                   \
    _tmp4          = _mm_loadu_ps(ptr2);                                      \
    _tmp5          = _mm_loadu_ps((ptr2)+4);                                  \
    _tmp6          = _mm_load_ss((ptr2)+8);                                   \
    _tmp7          = _mm_loadu_ps(ptr3);                                      \
    _tmp8          = _mm_loadu_ps((ptr3)+4);                                  \
    _tmp9          = _mm_load_ss((ptr3)+8);                                   \
    /* "lo" results carry elements 0,1; "hi" results carry element 2 */       \
    _tmp10         = _mm_unpackhi_ps(jx1,jy1);                                \
    jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
    _tmp11         = _mm_unpackhi_ps(jz1,jx2);                                \
    jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
    _tmp12         = _mm_unpackhi_ps(jy2,jz2);                                \
    jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
    _tmp13         = _mm_unpackhi_ps(jx3,jy3);                                \
    jx3            = _mm_unpacklo_ps(jx3,jy3);                                \
    /* Bring elements 1 and 2 of jz3 down to element 0 for scalar subtracts */\
    _tmp14         = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));            \
    _tmp15         = _mm_movehl_ps(jz3,jz3);                                  \
    _tmp16         = _mm_movelh_ps(jx1,jz1);                                  \
    _tmp17         = _mm_movehl_ps(jz1,jx1);                                  \
    _tmp10         = _mm_movelh_ps(_tmp10,_tmp11);                            \
    _tmp18         = _mm_movelh_ps(jy2,jx3);                                  \
    _tmp19         = _mm_movehl_ps(jx3,jy2);                                  \
    _tmp12         = _mm_movelh_ps(_tmp12,_tmp13);                            \
    _tmp1          = _mm_sub_ps(_tmp1,_tmp16);                                \
    _tmp2          = _mm_sub_ps(_tmp2,_tmp18);                                \
    _tmp3          = _mm_sub_ss(_tmp3,jz3);                                   \
    _tmp4          = _mm_sub_ps(_tmp4,_tmp17);                                \
    _tmp5          = _mm_sub_ps(_tmp5,_tmp19);                                \
    _tmp6          = _mm_sub_ss(_tmp6,_tmp14);                                \
    _tmp7          = _mm_sub_ps(_tmp7,_tmp10);                                \
    _tmp8          = _mm_sub_ps(_tmp8,_tmp12);                                \
    _tmp9          = _mm_sub_ss(_tmp9,_tmp15);                                \
    _mm_storeu_ps(ptr1,_tmp1);                                                \
    _mm_storeu_ps((ptr1)+4,_tmp2);                                            \
    _mm_store_ss((ptr1)+8,_tmp3);                                             \
    _mm_storeu_ps(ptr2,_tmp4);                                                \
    _mm_storeu_ps((ptr2)+4,_tmp5);                                            \
    _mm_store_ss((ptr2)+8,_tmp6);                                             \
    _mm_storeu_ps(ptr3,_tmp7);                                                \
    _mm_storeu_ps((ptr3)+4,_tmp8);                                            \
    _mm_store_ss((ptr3)+8,_tmp9);                                             \
}
1631
1632
/* Subtract four x/y/z triplets, held transposed in SSE registers, from the
 * float data stored at three separate addresses.
 *
 * ptr1, ptr2, ptr3:
 *     float pointers. Twelve consecutive floats are read from and written
 *     back to each of them with unaligned loads/stores (offsets 0, 4 and 8).
 * jx1 ... jz4:
 *     __m128 lvalues. Element 0 of each register is subtracted from the data
 *     at ptr1, element 1 from the data at ptr2 and element 2 from the data at
 *     ptr3; within one pointer the memory order is jx1,jy1,jz1,jx2,...,jz4.
 *     Element 3 of the registers is not used.
 *
 * Side effects on the arguments: the interleaving step overwrites jx1, jz1,
 * jy2, jx3, jz3 and jy4 with unpacked data, so their values must not be relied
 * upon after the macro has been expanded.
 *
 * The pointer arguments are pasted unparenthesized and several times into the
 * expansion (e.g. ptr1+4); pass plain pointer variables without side effects.
 * The expansion is a brace block that declares its own _tmpN temporaries.
 */
1633 #define GMX_MM_DECREMENT_4RVECS_3POINTERS_PS(ptr1,ptr2,ptr3,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1634     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;                                  \
1635     __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21;                                \
1636     _tmp1          = _mm_loadu_ps(ptr1);                                      \
1637     _tmp2          = _mm_loadu_ps(ptr1+4);                                    \
1638     _tmp3          = _mm_loadu_ps(ptr1+8);                                    \
1639     _tmp4          = _mm_loadu_ps(ptr2);                                      \
1640     _tmp5          = _mm_loadu_ps(ptr2+4);                                    \
1641     _tmp6          = _mm_loadu_ps(ptr2+8);                                    \
1642     _tmp7          = _mm_loadu_ps(ptr3);                                      \
1643     _tmp8          = _mm_loadu_ps(ptr3+4);                                    \
1644     _tmp9          = _mm_loadu_ps(ptr3+8);                                    \
1645     _tmp10         = _mm_unpackhi_ps(jx1,jy1);                                \
1646     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
1647     _tmp11         = _mm_unpackhi_ps(jz1,jx2);                                \
1648     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
1649     _tmp12         = _mm_unpackhi_ps(jy2,jz2);                                \
1650     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
1651     _tmp13         = _mm_unpackhi_ps(jx3,jy3);                                \
1652     jx3            = _mm_unpacklo_ps(jx3,jy3);                                \
1653     _tmp14         = _mm_unpackhi_ps(jz3,jx4);                                \
1654     jz3            = _mm_unpacklo_ps(jz3,jx4);                                \
1655     _tmp15         = _mm_unpackhi_ps(jy4,jz4);                                \
1656     jy4            = _mm_unpacklo_ps(jy4,jz4);                                \
1657     _tmp16         = _mm_movelh_ps(jx1,jz1);                                  \
1658     _tmp17         = _mm_movehl_ps(jz1,jx1);                                  \
1659     _tmp10         = _mm_movelh_ps(_tmp10,_tmp11);                            \
1660     _tmp18         = _mm_movelh_ps(jy2,jx3);                                  \
1661     _tmp19         = _mm_movehl_ps(jx3,jy2);                                  \
1662     _tmp12         = _mm_movelh_ps(_tmp12,_tmp13);                            \
1663     _tmp20         = _mm_movelh_ps(jz3,jy4);                                  \
1664     _tmp21         = _mm_movehl_ps(jy4,jz3);                                  \
1665     _tmp14         = _mm_movelh_ps(_tmp14,_tmp15);                            \
1666     _tmp1          = _mm_sub_ps(_tmp1,_tmp16);                                \
1667     _tmp2          = _mm_sub_ps(_tmp2,_tmp18);                                \
1668     _tmp3          = _mm_sub_ps(_tmp3,_tmp20);                                \
1669     _tmp4          = _mm_sub_ps(_tmp4,_tmp17);                                \
1670     _tmp5          = _mm_sub_ps(_tmp5,_tmp19);                                \
1671     _tmp6          = _mm_sub_ps(_tmp6,_tmp21);                                \
1672     _tmp7          = _mm_sub_ps(_tmp7,_tmp10);                                \
1673     _tmp8          = _mm_sub_ps(_tmp8,_tmp12);                                \
1674     _tmp9          = _mm_sub_ps(_tmp9,_tmp14);                                \
1675     _mm_storeu_ps(ptr1,_tmp1);                                                \
1676     _mm_storeu_ps(ptr1+4,_tmp2);                                              \
1677     _mm_storeu_ps(ptr1+8,_tmp3);                                              \
1678     _mm_storeu_ps(ptr2,_tmp4);                                                \
1679     _mm_storeu_ps(ptr2+4,_tmp5);                                              \
1680     _mm_storeu_ps(ptr2+8,_tmp6);                                              \
1681     _mm_storeu_ps(ptr3,_tmp7);                                                \
1682     _mm_storeu_ps(ptr3+4,_tmp8);                                              \
1683     _mm_storeu_ps(ptr3+8,_tmp9);                                              \
1684 }
1685
1686
1687
1688
/* Subtract one x/y/z triplet per pointer, held transposed in SSE registers,
 * from the float data stored at four separate addresses.
 *
 * ptr1 ... ptr4:
 *     float pointers. Exactly three floats are read from and written back to
 *     each of them: element 0 with a scalar load/store and elements 1-2 as a
 *     64-bit half register through a (__m64 *) cast of ptr+1.
 * jx1, jy1, jz1:
 *     __m128 values. Element k of jx1/jy1/jz1 is subtracted from the
 *     x/y/z float at pointer k+1 (element 0 -> ptr1 ... element 3 -> ptr4).
 *     These arguments are only read, never modified.
 *
 * Lane 1 of the working registers holds a don't-care value (zero minus
 * element 0 of jx1); it is never stored.
 *
 * The pointer arguments are pasted unparenthesized and several times into the
 * expansion (e.g. ptr1+1); pass plain pointer variables without side effects.
 * The expansion is a brace block that declares its own _tmpN temporaries.
 */
1689 #define GMX_MM_DECREMENT_1RVEC_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1) { \
1690     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;         \
1691     _tmp1          = _mm_load_ss(ptr1);                              \
1692     _tmp1          = _mm_loadh_pi(_tmp1,(__m64 *)(ptr1+1));          \
1693     _tmp2          = _mm_load_ss(ptr2);                              \
1694     _tmp2          = _mm_loadh_pi(_tmp2,(__m64 *)(ptr2+1));          \
1695     _tmp3          = _mm_load_ss(ptr3);                              \
1696     _tmp3          = _mm_loadh_pi(_tmp3,(__m64 *)(ptr3+1));          \
1697     _tmp4          = _mm_load_ss(ptr4);                              \
1698     _tmp4          = _mm_loadh_pi(_tmp4,(__m64 *)(ptr4+1));          \
1699     _tmp5          = _mm_unpacklo_ps(jy1,jz1);                       \
1700     _tmp6          = _mm_unpackhi_ps(jy1,jz1);                       \
1701     _tmp7          = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(1,0,0,0)); \
1702     _tmp8          = _mm_shuffle_ps(jx1,_tmp5,_MM_SHUFFLE(3,2,0,1)); \
1703     _tmp9          = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(1,0,0,2)); \
1704     _tmp10         = _mm_shuffle_ps(jx1,_tmp6,_MM_SHUFFLE(3,2,0,3)); \
1705     _tmp1          = _mm_sub_ps(_tmp1,_tmp7);                        \
1706     _tmp2          = _mm_sub_ps(_tmp2,_tmp8);                        \
1707     _tmp3          = _mm_sub_ps(_tmp3,_tmp9);                        \
1708     _tmp4          = _mm_sub_ps(_tmp4,_tmp10);                       \
1709     _mm_store_ss(ptr1,_tmp1);                                        \
1710     _mm_storeh_pi((__m64 *)(ptr1+1),_tmp1);                          \
1711     _mm_store_ss(ptr2,_tmp2);                                        \
1712     _mm_storeh_pi((__m64 *)(ptr2+1),_tmp2);                          \
1713     _mm_store_ss(ptr3,_tmp3);                                        \
1714     _mm_storeh_pi((__m64 *)(ptr3+1),_tmp3);                          \
1715     _mm_store_ss(ptr4,_tmp4);                                        \
1716     _mm_storeh_pi((__m64 *)(ptr4+1),_tmp4);                          \
1717 }
1718
1719
1720
/* Subtract two x/y/z triplets per pointer, held transposed in SSE registers,
 * from the float data stored at four separate addresses.
 *
 * ptr1 ... ptr4:
 *     float pointers. Six consecutive floats are read from and written back
 *     to each of them: floats 0-3 with an unaligned 128-bit load/store and
 *     floats 4-5 as a 64-bit half register through a (__m64 *) cast of ptr+4.
 * jx1, jy1, jz1, jx2, jy2, jz2:
 *     __m128 lvalues. Element k of each register is subtracted from the data
 *     at pointer k+1, in the memory order jx1,jy1,jz1,jx2,jy2,jz2.
 *
 * Side effects on the arguments: jx1, jz1 and jy2 are overwritten with
 * unpacked data, so their values must not be relied upon afterwards.
 *
 * The pointer arguments are pasted unparenthesized and several times into the
 * expansion (e.g. ptr1+4); pass plain pointer variables without side effects.
 * The expansion is a brace block that declares its own _tmpN temporaries.
 */
1721 #define GMX_MM_DECREMENT_2RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2) { \
1722     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11,_tmp12,_tmp13; \
1723     _tmp1          = _mm_loadu_ps(ptr1);                                       \
1724     _tmp2          = _mm_loadu_ps(ptr2);                                       \
1725     _tmp3          = _mm_loadu_ps(ptr3);                                       \
1726     _tmp4          = _mm_loadu_ps(ptr4);                                       \
1727     _tmp5          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr1+4));         \
1728     _tmp5          = _mm_loadh_pi(_tmp5,(__m64 *)(ptr2+4));                    \
1729     _tmp6          = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(ptr3+4));         \
1730     _tmp6          = _mm_loadh_pi(_tmp6,(__m64 *)(ptr4+4));                    \
1731     _tmp7          = _mm_unpackhi_ps(jx1,jy1);                                 \
1732     jx1            = _mm_unpacklo_ps(jx1,jy1);                                 \
1733     _tmp8          = _mm_unpackhi_ps(jz1,jx2);                                 \
1734     jz1            = _mm_unpacklo_ps(jz1,jx2);                                 \
1735     _tmp9          = _mm_unpackhi_ps(jy2,jz2);                                 \
1736     jy2            = _mm_unpacklo_ps(jy2,jz2);                                 \
1737     _tmp10         = _mm_movelh_ps(jx1,jz1);                                   \
1738     _tmp11         = _mm_movehl_ps(jz1,jx1);                                   \
1739     _tmp12         = _mm_movelh_ps(_tmp7,_tmp8);                               \
1740     _tmp13         = _mm_movehl_ps(_tmp8,_tmp7);                               \
1741     _tmp1          = _mm_sub_ps(_tmp1,_tmp10);                                 \
1742     _tmp2          = _mm_sub_ps(_tmp2,_tmp11);                                 \
1743     _tmp3          = _mm_sub_ps(_tmp3,_tmp12);                                 \
1744     _tmp4          = _mm_sub_ps(_tmp4,_tmp13);                                 \
1745     _tmp5          = _mm_sub_ps(_tmp5,jy2);                                    \
1746     _tmp6          = _mm_sub_ps(_tmp6,_tmp9);                                  \
1747     _mm_storeu_ps(ptr1,_tmp1);                                                 \
1748     _mm_storeu_ps(ptr2,_tmp2);                                                 \
1749     _mm_storeu_ps(ptr3,_tmp3);                                                 \
1750     _mm_storeu_ps(ptr4,_tmp4);                                                 \
1751     _mm_storel_pi((__m64 *)(ptr1+4),_tmp5);                                    \
1752     _mm_storeh_pi((__m64 *)(ptr2+4),_tmp5);                                    \
1753     _mm_storel_pi((__m64 *)(ptr3+4),_tmp6);                                    \
1754     _mm_storeh_pi((__m64 *)(ptr4+4),_tmp6);                                    \
1755 }
1756
1757
/* Subtract three x/y/z triplets per pointer, held transposed in SSE
 * registers, from the float data stored at four separate addresses.
 *
 * ptr1 ... ptr4:
 *     float pointers. Nine consecutive floats are read from and written back
 *     to each of them: floats 0-7 with two unaligned 128-bit loads/stores and
 *     float 8 with a scalar load/store, so nothing past ptr+8 is touched.
 * jx1 ... jz3:
 *     __m128 lvalues. Element k of each register is subtracted from the data
 *     at pointer k+1, in the memory order jx1,jy1,jz1,jx2,...,jz3.
 *
 * Side effects on the arguments: jx1, jz1, jy2 and jx3 are overwritten with
 * unpacked data, so their values must not be relied upon afterwards. jz3 is
 * only read; its four elements are moved to lane 0 of separate temporaries
 * for the scalar subtractions.
 *
 * The pointer arguments are pasted unparenthesized and several times into the
 * expansion (e.g. ptr1+4); pass plain pointer variables without side effects.
 * The expansion is a brace block that declares its own _tmpN temporaries.
 */
1758 #define GMX_MM_DECREMENT_3RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3) { \
1759     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10;       \
1760     __m128 _tmp11,_tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19;     \
1761     __m128 _tmp20,_tmp21,_tmp22,_tmp23,_tmp24,_tmp25;                          \
1762     _tmp1          = _mm_loadu_ps(ptr1);                                       \
1763     _tmp2          = _mm_loadu_ps(ptr1+4);                                     \
1764     _tmp3          = _mm_load_ss(ptr1+8);                                      \
1765     _tmp4          = _mm_loadu_ps(ptr2);                                       \
1766     _tmp5          = _mm_loadu_ps(ptr2+4);                                     \
1767     _tmp6          = _mm_load_ss(ptr2+8);                                      \
1768     _tmp7          = _mm_loadu_ps(ptr3);                                       \
1769     _tmp8          = _mm_loadu_ps(ptr3+4);                                     \
1770     _tmp9          = _mm_load_ss(ptr3+8);                                      \
1771     _tmp10         = _mm_loadu_ps(ptr4);                                       \
1772     _tmp11         = _mm_loadu_ps(ptr4+4);                                     \
1773     _tmp12         = _mm_load_ss(ptr4+8);                                      \
1774     _tmp13         = _mm_unpackhi_ps(jx1,jy1);                                 \
1775     jx1            = _mm_unpacklo_ps(jx1,jy1);                                 \
1776     _tmp14         = _mm_unpackhi_ps(jz1,jx2);                                 \
1777     jz1            = _mm_unpacklo_ps(jz1,jx2);                                 \
1778     _tmp15         = _mm_unpackhi_ps(jy2,jz2);                                 \
1779     jy2            = _mm_unpacklo_ps(jy2,jz2);                                 \
1780     _tmp16         = _mm_unpackhi_ps(jx3,jy3);                                 \
1781     jx3            = _mm_unpacklo_ps(jx3,jy3);                                 \
1782     _tmp17         = _mm_shuffle_ps(jz3,jz3,_MM_SHUFFLE(0,0,0,1));             \
1783     _tmp18         = _mm_movehl_ps(jz3,jz3);                                   \
1784     _tmp19         = _mm_shuffle_ps(_tmp18,_tmp18,_MM_SHUFFLE(0,0,0,1));       \
1785     _tmp20         = _mm_movelh_ps(jx1,jz1);                                   \
1786     _tmp21         = _mm_movehl_ps(jz1,jx1);                                   \
1787     _tmp22         = _mm_movelh_ps(_tmp13,_tmp14);                             \
1788     _tmp14         = _mm_movehl_ps(_tmp14,_tmp13);                             \
1789     _tmp23         = _mm_movelh_ps(jy2,jx3);                                   \
1790     _tmp24         = _mm_movehl_ps(jx3,jy2);                                   \
1791     _tmp25         = _mm_movelh_ps(_tmp15,_tmp16);                             \
1792     _tmp16         = _mm_movehl_ps(_tmp16,_tmp15);                             \
1793     _tmp1          = _mm_sub_ps(_tmp1,_tmp20);                                 \
1794     _tmp2          = _mm_sub_ps(_tmp2,_tmp23);                                 \
1795     _tmp3          = _mm_sub_ss(_tmp3,jz3);                                    \
1796     _tmp4          = _mm_sub_ps(_tmp4,_tmp21);                                 \
1797     _tmp5          = _mm_sub_ps(_tmp5,_tmp24);                                 \
1798     _tmp6          = _mm_sub_ss(_tmp6,_tmp17);                                 \
1799     _tmp7          = _mm_sub_ps(_tmp7,_tmp22);                                 \
1800     _tmp8          = _mm_sub_ps(_tmp8,_tmp25);                                 \
1801     _tmp9          = _mm_sub_ss(_tmp9,_tmp18);                                 \
1802     _tmp10         = _mm_sub_ps(_tmp10,_tmp14);                                \
1803     _tmp11         = _mm_sub_ps(_tmp11,_tmp16);                                \
1804     _tmp12         = _mm_sub_ss(_tmp12,_tmp19);                                \
1805     _mm_storeu_ps(ptr1,_tmp1);                                                 \
1806     _mm_storeu_ps(ptr1+4,_tmp2);                                               \
1807     _mm_store_ss(ptr1+8,_tmp3);                                                \
1808     _mm_storeu_ps(ptr2,_tmp4);                                                 \
1809     _mm_storeu_ps(ptr2+4,_tmp5);                                               \
1810     _mm_store_ss(ptr2+8,_tmp6);                                                \
1811     _mm_storeu_ps(ptr3,_tmp7);                                                 \
1812     _mm_storeu_ps(ptr3+4,_tmp8);                                               \
1813     _mm_store_ss(ptr3+8,_tmp9);                                                \
1814     _mm_storeu_ps(ptr4,_tmp10);                                                \
1815     _mm_storeu_ps(ptr4+4,_tmp11);                                              \
1816     _mm_store_ss(ptr4+8,_tmp12);                                               \
1817 }
1818
1819
/* Subtract four x/y/z triplets per pointer, held transposed in SSE
 * registers, from the float data stored at four separate addresses.
 *
 * ptr1 ... ptr4:
 *     float pointers. Twelve consecutive floats are read from and written
 *     back to each of them with unaligned 128-bit loads/stores (offsets 0, 4
 *     and 8).
 * jx1 ... jz4:
 *     __m128 lvalues. Element k of each register is subtracted from the data
 *     at pointer k+1, in the memory order jx1,jy1,jz1,jx2,...,jz4.
 *
 * Side effects on the arguments: jx1, jz1, jy2, jx3, jz3 and jy4 are used as
 * scratch space (jz1, jx3 and jy4 are overwritten twice), so their values
 * must not be relied upon afterwards.
 *
 * The pointer arguments are pasted unparenthesized and several times into the
 * expansion (e.g. ptr1+4); pass plain pointer variables without side effects.
 * The expansion is a brace block that declares its own _tmpN temporaries.
 */
1820 #define GMX_MM_DECREMENT_4RVECS_4POINTERS_PS(ptr1,ptr2,ptr3,ptr4,jx1,jy1,jz1,jx2,jy2,jz2,jx3,jy3,jz3,jx4,jy4,jz4) { \
1821     __m128 _tmp1,_tmp2,_tmp3,_tmp4,_tmp5,_tmp6,_tmp7,_tmp8,_tmp9,_tmp10,_tmp11;         \
1822     __m128 _tmp12,_tmp13,_tmp14,_tmp15,_tmp16,_tmp17,_tmp18,_tmp19,_tmp20,_tmp21,_tmp22;\
1823     __m128 _tmp23,_tmp24;                                                     \
1824     _tmp1          = _mm_loadu_ps(ptr1);                                      \
1825     _tmp2          = _mm_loadu_ps(ptr1+4);                                    \
1826     _tmp3          = _mm_loadu_ps(ptr1+8);                                    \
1827     _tmp4          = _mm_loadu_ps(ptr2);                                      \
1828     _tmp5          = _mm_loadu_ps(ptr2+4);                                    \
1829     _tmp6          = _mm_loadu_ps(ptr2+8);                                    \
1830     _tmp7          = _mm_loadu_ps(ptr3);                                      \
1831     _tmp8          = _mm_loadu_ps(ptr3+4);                                    \
1832     _tmp9          = _mm_loadu_ps(ptr3+8);                                    \
1833     _tmp10         = _mm_loadu_ps(ptr4);                                      \
1834     _tmp11         = _mm_loadu_ps(ptr4+4);                                    \
1835     _tmp12         = _mm_loadu_ps(ptr4+8);                                    \
1836     _tmp13         = _mm_unpackhi_ps(jx1,jy1);                                \
1837     jx1            = _mm_unpacklo_ps(jx1,jy1);                                \
1838     _tmp14         = _mm_unpackhi_ps(jz1,jx2);                                \
1839     jz1            = _mm_unpacklo_ps(jz1,jx2);                                \
1840     _tmp15         = _mm_unpackhi_ps(jy2,jz2);                                \
1841     jy2            = _mm_unpacklo_ps(jy2,jz2);                                \
1842     _tmp16         = _mm_unpackhi_ps(jx3,jy3);                                \
1843     jx3            = _mm_unpacklo_ps(jx3,jy3);                                \
1844     _tmp17         = _mm_unpackhi_ps(jz3,jx4);                                \
1845     jz3            = _mm_unpacklo_ps(jz3,jx4);                                \
1846     _tmp18         = _mm_unpackhi_ps(jy4,jz4);                                \
1847     jy4            = _mm_unpacklo_ps(jy4,jz4);                                \
1848     _tmp19         = _mm_movelh_ps(jx1,jz1);                                  \
1849     jz1            = _mm_movehl_ps(jz1,jx1);                                  \
1850     _tmp20         = _mm_movelh_ps(_tmp13,_tmp14);                            \
1851     _tmp14         = _mm_movehl_ps(_tmp14,_tmp13);                            \
1852     _tmp21         = _mm_movelh_ps(jy2,jx3);                                  \
1853     jx3            = _mm_movehl_ps(jx3,jy2);                                  \
1854     _tmp22         = _mm_movelh_ps(_tmp15,_tmp16);                            \
1855     _tmp16         = _mm_movehl_ps(_tmp16,_tmp15);                            \
1856     _tmp23         = _mm_movelh_ps(jz3,jy4);                                  \
1857     jy4            = _mm_movehl_ps(jy4,jz3);                                  \
1858     _tmp24         = _mm_movelh_ps(_tmp17,_tmp18);                            \
1859     _tmp18         = _mm_movehl_ps(_tmp18,_tmp17);                            \
1860     _tmp1          = _mm_sub_ps(_tmp1,_tmp19);                                \
1861     _tmp2          = _mm_sub_ps(_tmp2,_tmp21);                                \
1862     _tmp3          = _mm_sub_ps(_tmp3,_tmp23);                                \
1863     _tmp4          = _mm_sub_ps(_tmp4,jz1);                                   \
1864     _tmp5          = _mm_sub_ps(_tmp5,jx3);                                   \
1865     _tmp6          = _mm_sub_ps(_tmp6,jy4);                                   \
1866     _tmp7          = _mm_sub_ps(_tmp7,_tmp20);                                \
1867     _tmp8          = _mm_sub_ps(_tmp8,_tmp22);                                \
1868     _tmp9          = _mm_sub_ps(_tmp9,_tmp24);                                \
1869     _tmp10         = _mm_sub_ps(_tmp10,_tmp14);                               \
1870     _tmp11         = _mm_sub_ps(_tmp11,_tmp16);                               \
1871     _tmp12         = _mm_sub_ps(_tmp12,_tmp18);                               \
1872     _mm_storeu_ps(ptr1,_tmp1);                                                \
1873     _mm_storeu_ps(ptr1+4,_tmp2);                                              \
1874     _mm_storeu_ps(ptr1+8,_tmp3);                                              \
1875     _mm_storeu_ps(ptr2,_tmp4);                                                \
1876     _mm_storeu_ps(ptr2+4,_tmp5);                                              \
1877     _mm_storeu_ps(ptr2+8,_tmp6);                                              \
1878     _mm_storeu_ps(ptr3,_tmp7);                                                \
1879     _mm_storeu_ps(ptr3+4,_tmp8);                                              \
1880     _mm_storeu_ps(ptr3+8,_tmp9);                                              \
1881     _mm_storeu_ps(ptr4,_tmp10);                                               \
1882     _mm_storeu_ps(ptr4+4,_tmp11);                                             \
1883     _mm_storeu_ps(ptr4+8,_tmp12);                                             \
1884 }
1885
1886
1887
1888
1889
1890
/* Routine to be called with rswitch/rcut at the beginning of a kernel
 * to set up the 6 constants used for analytic 5th order switch calculations.
 *
 * rswitch, rcut:
 *     __m128 inputs; only their difference d = rcut - rswitch is used.
 * switch_C3, switch_C4, switch_C5:
 *     __m128 lvalues, assigned -10/d^3, 15/d^4 and -6/d^5.
 * switch_D2, switch_D3, switch_D4:
 *     __m128 lvalues, assigned -30/d^3, 60/d^4 and -30/d^5.
 *
 * 1/d is obtained from gmx_mm_inv_ps(), so the accuracy of the constants
 * follows that routine. All temporaries carry the _swsetup_ prefix so that
 * they cannot capture a caller variable passed as an argument (for instance
 * a cut-off stored in a variable called "d").
 * The expansion is a brace block, usable wherever a statement is allowed.
 */
#define GMX_MM_SETUP_SWITCH5_PS(rswitch,rcut,switch_C3,switch_C4,switch_C5,switch_D2,switch_D3,switch_D4) {  \
    const __m128 _swsetup_cm6   = _mm_set1_ps( -6.0f);                                                       \
    const __m128 _swsetup_cm10  = _mm_set1_ps(-10.0f);                                                       \
    const __m128 _swsetup_c15   = _mm_set1_ps( 15.0f);                                                       \
    const __m128 _swsetup_cm30  = _mm_set1_ps(-30.0f);                                                       \
    const __m128 _swsetup_c60   = _mm_set1_ps( 60.0f);                                                       \
                                                                                                             \
    __m128 _swsetup_d,_swsetup_dinv,_swsetup_dinv2,_swsetup_dinv3,_swsetup_dinv4,_swsetup_dinv5;             \
                                                                                                             \
    _swsetup_d      = _mm_sub_ps(rcut,rswitch);                                                              \
    _swsetup_dinv   = gmx_mm_inv_ps(_swsetup_d);                                                             \
    _swsetup_dinv2  = _mm_mul_ps(_swsetup_dinv,_swsetup_dinv);                                               \
    _swsetup_dinv3  = _mm_mul_ps(_swsetup_dinv2,_swsetup_dinv);                                              \
    _swsetup_dinv4  = _mm_mul_ps(_swsetup_dinv2,_swsetup_dinv2);                                             \
    _swsetup_dinv5  = _mm_mul_ps(_swsetup_dinv3,_swsetup_dinv2);                                             \
                                                                                                             \
    switch_C3 = _mm_mul_ps(_swsetup_cm10,_swsetup_dinv3);                                                    \
    switch_C4 = _mm_mul_ps(_swsetup_c15,_swsetup_dinv4);                                                     \
    switch_C5 = _mm_mul_ps(_swsetup_cm6,_swsetup_dinv5);                                                     \
    switch_D2 = _mm_mul_ps(_swsetup_cm30,_swsetup_dinv3);                                                    \
    switch_D3 = _mm_mul_ps(_swsetup_c60,_swsetup_dinv4);                                                     \
    switch_D4 = _mm_mul_ps(_swsetup_cm30,_swsetup_dinv5);                                                    \
}
1917
1918
/* Evaluate the 5th order switch polynomial and its companion polynomial for
 * four distances at once.
 *
 * r, rswitch, rcut:
 *     __m128 inputs. r is clamped to [rswitch,rcut] and shifted, giving
 *     t = min(max(r,rswitch),rcut) - rswitch.
 * sw, dsw:
 *     __m128 lvalues, assigned
 *         sw  = 1 + sw_C3*t^3 + sw_C4*t^4 + sw_C5*t^5
 *         dsw =     sw_D2*t^2 + sw_D3*t^3 + sw_D4*t^4
 *     With the constants produced by GMX_MM_SETUP_SWITCH5_PS, dsw is the
 *     derivative of sw with respect to r inside the switching range.
 * sw_C3 ... sw_D4:
 *     __m128 polynomial coefficients; read only.
 *
 * All temporaries carry the _sw_ prefix so that they cannot capture a caller
 * variable passed as an argument (for instance a distance called "d" or a
 * coefficient called "d2").
 * The expansion is a brace block, usable wherever a statement is allowed.
 */
#define GMX_MM_EVALUATE_SWITCH5_PS(r,rswitch,rcut,sw,dsw,sw_C3,sw_C4,sw_C5,sw_D2,sw_D3,sw_D4) { \
    const __m128 _sw_one = _mm_set1_ps(1.0f);                                                   \
    __m128 _sw_d,_sw_d2;                                                                        \
    _sw_d  = _mm_max_ps(r,rswitch);                                                             \
    _sw_d  = _mm_min_ps(_sw_d,rcut);                                                            \
    _sw_d  = _mm_sub_ps(_sw_d,rswitch);                                                         \
    _sw_d2 = _mm_mul_ps(_sw_d,_sw_d);                                                           \
    sw     = _mm_mul_ps(_sw_d,sw_C5);                                                           \
    dsw    = _mm_mul_ps(_sw_d,sw_D4);                                                           \
    sw     = _mm_add_ps(sw,sw_C4);                                                              \
    dsw    = _mm_add_ps(dsw,sw_D3);                                                             \
    sw     = _mm_mul_ps(sw,_sw_d);                                                              \
    dsw    = _mm_mul_ps(dsw,_sw_d);                                                             \
    sw     = _mm_add_ps(sw,sw_C3);                                                              \
    dsw    = _mm_add_ps(dsw,sw_D2);                                                             \
    sw     = _mm_mul_ps(sw,_mm_mul_ps(_sw_d,_sw_d2));                                           \
    dsw    = _mm_mul_ps(dsw,_sw_d2);                                                            \
    sw     = _mm_add_ps(sw,_sw_one);                                                            \
}
1938
1939
/* Plain Coulomb interaction for four pairs at once.
 *
 * Computes qq*rinv, adds it to the running total in *vctot and returns the
 * same value. The return value is fscaltmp: multiply it with rinvsq to get
 * fscal.
 */
static inline __m128
gmx_mm_interaction_coulomb_ps(__m128 rinv, __m128 qq,__m128 *vctot)
{
    const __m128 pair_energy = _mm_mul_ps(qq,rinv);

    *vctot = _mm_add_ps(*vctot,pair_energy);

    return pair_energy;
}
1948
1949
/* Potential-only plain Coulomb interaction for four pairs at once:
 * adds qq*rinv to the running total in *vctot and returns nothing.
 */
static inline void
gmx_mm_interaction_coulomb_noforce_ps(__m128 rinv, __m128 qq,__m128 *vctot)
{
    const __m128 pair_energy = _mm_mul_ps(qq,rinv);

    *vctot = _mm_add_ps(*vctot,pair_energy);
}
1957
/* Reaction-field Coulomb interaction for four pairs at once.
 *
 * Adds the potential qq*(rinv + krf*rsq - crf) to the running total in
 * *vctot and returns qq*(rinv - 2*krf*rsq). The return value is fscaltmp:
 * multiply it with rinvsq to get fscal.
 */
static inline __m128
gmx_mm_interaction_coulombrf_ps(const __m128 rinv, const __m128 rsq, const __m128 krf, const __m128 crf, const __m128 qq,__m128 *vctot)
{
    const __m128 k_rsq       = _mm_mul_ps(krf,rsq);
    const __m128 pair_energy = _mm_mul_ps(qq, _mm_sub_ps(_mm_add_ps(rinv,k_rsq),crf));
    const __m128 two_k_rsq   = _mm_mul_ps(_mm_set1_ps(2.0f),k_rsq);

    *vctot = _mm_add_ps(*vctot,pair_energy);

    return _mm_mul_ps(qq, _mm_sub_ps(rinv,two_k_rsq));
}
1971
1972
/* Potential-only reaction-field Coulomb interaction for four pairs at once:
 * adds qq*(rinv + krf*rsq - crf) to the running total in *vctot.
 */
static inline void
gmx_mm_interaction_coulombrf_noforce_ps(__m128 rinv, __m128 rsq, __m128 krf, __m128 crf, __m128 qq,__m128 *vctot)
{
    const __m128 k_rsq       = _mm_mul_ps(krf,rsq);
    const __m128 bracket     = _mm_sub_ps(_mm_add_ps(rinv,k_rsq),crf);
    const __m128 pair_energy = _mm_mul_ps(qq,bracket);

    *vctot = _mm_add_ps(*vctot,pair_energy);
}
1983
1984
1985 /* GB */
1986
1987
1988
1989
1990 /* GB + RF */
1991
1992
/* Lennard-Jones interaction for four pairs at once.
 *
 * With rinv6 = rinvsq^3, adds c12*rinv6^2 - c6*rinv6 to the running total in
 * *vvdwtot and returns 12*c12*rinv6^2 - 6*c6*rinv6. The return value is
 * fscaltmp: multiply it with rinvsq to get fscal.
 */
static inline __m128
gmx_mm_int_lj_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    const __m128 rinv6  = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
    const __m128 rinv12 = _mm_mul_ps(rinv6,rinv6);
    const __m128 term6  = _mm_mul_ps(c6,rinv6);
    const __m128 term12 = _mm_mul_ps(c12,rinv12);

    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_sub_ps(term12,term6));

    return _mm_sub_ps(_mm_mul_ps(_mm_set1_ps(12.0f),term12),
                      _mm_mul_ps(_mm_set1_ps(6.0f),term6));
}
2009
2010
/* Potential-only Lennard-Jones interaction for four pairs at once:
 * with rinv6 = rinvsq^3, adds c12*rinv6^2 - c6*rinv6 to the running total
 * in *vvdwtot.
 */
static inline void
gmx_mm_int_lj_potonly_ps(__m128 rinvsq, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    const __m128 rinv6  = _mm_mul_ps(_mm_mul_ps(rinvsq,rinvsq),rinvsq);
    const __m128 rinv12 = _mm_mul_ps(rinv6,rinv6);
    const __m128 term6  = _mm_mul_ps(c6,rinv6);
    const __m128 term12 = _mm_mul_ps(c12,rinv12);

    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_sub_ps(term12,term6));
}
2023
2024
2025
/* Tabulated Coulomb interaction for four pairs at once.
 *
 * The table position is r*tabscale; its truncated integer part selects a
 * group of four floats (Y,F,G,H) at VFtab + 4*index, its fractional part eps
 * is the interpolation variable. The four groups are fetched with aligned
 * loads, so VFtab has to be 16-byte aligned.
 *
 * Adds qq*(Y + eps*Fp), with Fp = F + G*eps + H*eps^2, to the running total
 * in *vctot and returns qq*(Fp + G*eps + 2*H*eps^2)*tabscale.
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_4_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  tabpos,eps,eps_sq;
    __m128  colY,colF,colG,colH;
    __m128  g_eps,h_eps2,f_interp,pair_energy,pair_force;
    __m128i cell;
    int     idx0,idx1,idx2,idx3;

    /* Split the scaled distance into table cell and fractional offset */
    tabpos = _mm_mul_ps(r,tabscale);
    cell   = _mm_cvttps_epi32(tabpos);
    eps    = _mm_sub_ps(tabpos, _mm_cvtepi32_ps(cell));
    eps_sq = _mm_mul_ps(eps,eps);

    /* One table cell index per pair */
    idx0 = gmx_mm_extract_epi32(cell,0);
    idx1 = gmx_mm_extract_epi32(cell,1);
    idx2 = gmx_mm_extract_epi32(cell,2);
    idx3 = gmx_mm_extract_epi32(cell,3);

    /* Fetch one (Y,F,G,H) group per pair, then transpose so that each
     * register holds the same coefficient for all four pairs.
     */
    colY = _mm_load_ps(VFtab + 4*idx0);
    colF = _mm_load_ps(VFtab + 4*idx1);
    colG = _mm_load_ps(VFtab + 4*idx2);
    colH = _mm_load_ps(VFtab + 4*idx3);
    _MM_TRANSPOSE4_PS(colY,colF,colG,colH);

    h_eps2   = _mm_mul_ps(colH,eps_sq);
    g_eps    = _mm_mul_ps(colG,eps);
    f_interp = _mm_add_ps(colF, _mm_add_ps(g_eps,h_eps2));

    pair_energy = _mm_mul_ps(qq, _mm_add_ps(colY, _mm_mul_ps(eps,f_interp)));
    *vctot      = _mm_add_ps(*vctot,pair_energy);

    pair_force  = _mm_mul_ps(qq, _mm_add_ps(f_interp, _mm_add_ps(g_eps, _mm_add_ps(h_eps2,h_eps2))));

    return _mm_mul_ps(pair_force,tabscale);
}
2059
2060
2061
/* Tabulated Lennard-Jones interaction for four pairs at once.
 *
 * The table position is r*tabscale; its truncated integer part is the table
 * index, its fractional part eps the interpolation variable. Every index owns
 * 4*(offset+2) floats in VFtab; the (Y,F,G,H) group used with c6 starts
 * 4*offset floats into that record and the group used with c12 follows
 * directly after it. Groups are fetched with aligned loads, so VFtab has to
 * be 16-byte aligned.
 *
 * For a few cases, like TIP4p waters, there are particles with LJ-only
 * interactions in a loop where the table data might contain both coulomb and
 * LJ. To handle this case, use offset 0 if the data is an LJ-only table, and
 * 1 if it is actually a mixed coul+lj table.
 *
 * With Fp = F + G*eps + H*eps^2 per group, adds
 * c6*(Y6 + eps*Fp6) + c12*(Y12 + eps*Fp12) to the running total in *vvdwtot
 * and returns (c6*(Fp6 + G6*eps + 2*H6*eps^2) +
 * c12*(Fp12 + G12*eps + 2*H12*eps^2))*tabscale.
 * Return force should be multiplied by -rinv to get fscal.
 */
static inline __m128
gmx_mm_int_4_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128  tabpos,eps,eps_sq;
    __m128  Y6,F6,G6,H6,Y12,F12,G12,H12;
    __m128  g6_eps,h6_eps2,fp6,g12_eps,h12_eps2,fp12;
    __m128  energy6,energy12,force6,force12;
    __m128i cell;
    int     idx0,idx1,idx2,idx3;
    int     stride;
    float  *tab6;
    float  *tab12;

    /* Split the scaled distance into table cell and fractional offset */
    tabpos = _mm_mul_ps(r,tabscale);
    cell   = _mm_cvttps_epi32(tabpos);
    eps    = _mm_sub_ps(tabpos, _mm_cvtepi32_ps(cell));
    eps_sq = _mm_mul_ps(eps,eps);

    /* One table cell index per pair */
    idx0 = gmx_mm_extract_epi32(cell,0);
    idx1 = gmx_mm_extract_epi32(cell,1);
    idx2 = gmx_mm_extract_epi32(cell,2);
    idx3 = gmx_mm_extract_epi32(cell,3);

    /* Record size per index and start of the c6 / c12 groups in a record */
    stride = 4*(offset+2);
    tab6   = VFtab + 4*offset;
    tab12  = tab6 + 4;

    Y6  = _mm_load_ps(tab6  + stride*idx0);
    F6  = _mm_load_ps(tab6  + stride*idx1);
    G6  = _mm_load_ps(tab6  + stride*idx2);
    H6  = _mm_load_ps(tab6  + stride*idx3);
    Y12 = _mm_load_ps(tab12 + stride*idx0);
    F12 = _mm_load_ps(tab12 + stride*idx1);
    G12 = _mm_load_ps(tab12 + stride*idx2);
    H12 = _mm_load_ps(tab12 + stride*idx3);

    /* After transposition each register holds one coefficient for all pairs */
    _MM_TRANSPOSE4_PS(Y6,F6,G6,H6);
    _MM_TRANSPOSE4_PS(Y12,F12,G12,H12);

    h6_eps2  = _mm_mul_ps(H6,eps_sq);
    g6_eps   = _mm_mul_ps(G6,eps);
    fp6      = _mm_add_ps(F6, _mm_add_ps(g6_eps,h6_eps2));
    h12_eps2 = _mm_mul_ps(H12,eps_sq);
    g12_eps  = _mm_mul_ps(G12,eps);
    fp12     = _mm_add_ps(F12, _mm_add_ps(g12_eps,h12_eps2));

    energy6  = _mm_mul_ps(c6,  _mm_add_ps(Y6,  _mm_mul_ps(eps,fp6)));
    energy12 = _mm_mul_ps(c12, _mm_add_ps(Y12, _mm_mul_ps(eps,fp12)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(energy6,energy12));

    force6   = _mm_mul_ps(c6,  _mm_add_ps(fp6,  _mm_add_ps(g6_eps,  _mm_add_ps(h6_eps2,h6_eps2))));
    force12  = _mm_mul_ps(c12, _mm_add_ps(fp12, _mm_add_ps(g12_eps, _mm_add_ps(h12_eps2,h12_eps2))));

    return _mm_mul_ps( _mm_add_ps(force6,force12),tabscale);
}
2110
2111
/* Tabulated Coulomb + Lennard-Jones interaction for four pairs at once.
 *
 * VFtab is read as 12 floats per table point: four coefficients (Y,F,G,H)
 * for Coulomb, then four for dispersion, then four for repulsion. Each lane
 * of r picks its table point by truncating r*tabscale to an integer; the
 * truncated-away remainder eps is the interpolation variable. For each of
 * the three interactions
 *
 *     V  = Y + eps*(F + G*eps + H*eps^2)
 *     FF = F + 2*G*eps + 3*H*eps^2
 *
 * qq*V_coul is added to *vctot and c6*V_disp + c12*V_rep is added to
 * *vvdwtot. The return value is tabscale*(qq*FF_coul + c6*FF_disp + c12*FF_rep).
 *
 * Table rows are fetched with aligned loads (_mm_load_ps).
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_4_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rscaled,frac,frac2;
    __m128       cY,cF,cG,cH;                 /* Coulomb coefficients    */
    __m128       dY,dF,dG,dH;                 /* dispersion coefficients */
    __m128       rY,rF,rG,rH;                 /* repulsion coefficients  */
    __m128       e_coul,e_disp,e_rep;
    __m128       f_coul,f_disp,f_rep;
    __m128i      bin;
    int          ia,ib,ic,id;
    const float *pa,*pb,*pc,*pd;

    /* Split r*tabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,tabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    /* One table point (12 floats) per lane */
    ia = gmx_mm_extract_epi32(bin,0);
    ib = gmx_mm_extract_epi32(bin,1);
    ic = gmx_mm_extract_epi32(bin,2);
    id = gmx_mm_extract_epi32(bin,3);
    pa = VFtab + 12*ia;
    pb = VFtab + 12*ib;
    pc = VFtab + 12*ic;
    pd = VFtab + 12*id;

    /* Load one row per pair, then transpose so that each register holds
     * the same coefficient for all four pairs.
     */
    cY = _mm_load_ps(pa);
    cF = _mm_load_ps(pb);
    cG = _mm_load_ps(pc);
    cH = _mm_load_ps(pd);
    dY = _mm_load_ps(pa + 4);
    dF = _mm_load_ps(pb + 4);
    dG = _mm_load_ps(pc + 4);
    dH = _mm_load_ps(pd + 4);
    rY = _mm_load_ps(pa + 8);
    rF = _mm_load_ps(pb + 8);
    rG = _mm_load_ps(pc + 8);
    rH = _mm_load_ps(pd + 8);
    _MM_TRANSPOSE4_PS(cY,cF,cG,cH);
    _MM_TRANSPOSE4_PS(dY,dF,dG,dH);
    _MM_TRANSPOSE4_PS(rY,rF,rG,rH);

    /* From here on: xH = H*eps^2, xG = G*eps, xF = F + G*eps + H*eps^2 */
    cH = _mm_mul_ps(cH,frac2);
    cG = _mm_mul_ps(cG,frac);
    cF = _mm_add_ps(cF, _mm_add_ps(cG,cH));
    dH = _mm_mul_ps(dH,frac2);
    dG = _mm_mul_ps(dG,frac);
    dF = _mm_add_ps(dF, _mm_add_ps(dG,dH));
    rH = _mm_mul_ps(rH,frac2);
    rG = _mm_mul_ps(rG,frac);
    rF = _mm_add_ps(rF, _mm_add_ps(rG,rH));

    /* Accumulate potentials */
    e_coul   = _mm_mul_ps(qq, _mm_add_ps(cY, _mm_mul_ps(frac,cF)));
    *vctot   = _mm_add_ps(*vctot,e_coul);

    e_disp   = _mm_mul_ps(c6,  _mm_add_ps(dY, _mm_mul_ps(frac,dF)));
    e_rep    = _mm_mul_ps(c12, _mm_add_ps(rY, _mm_mul_ps(frac,rF)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(e_disp,e_rep));

    /* Derivative terms: F + 2*G*eps + 3*H*eps^2, scaled by the parameters */
    f_coul = _mm_mul_ps(qq,  _mm_add_ps(cF, _mm_add_ps(cG, _mm_add_ps(cH,cH))));
    f_disp = _mm_mul_ps(c6,  _mm_add_ps(dF, _mm_add_ps(dG, _mm_add_ps(dH,dH))));
    f_rep  = _mm_mul_ps(c12, _mm_add_ps(rF, _mm_add_ps(rG, _mm_add_ps(rH,rH))));

    return _mm_mul_ps(_mm_add_ps(f_coul, _mm_add_ps(f_disp,f_rep)), tabscale);
}
2171
2172
2173
/* Tabulated Coulomb interaction for three pairs (lanes 0-2 of r).
 *
 * VFtab is read as four floats (Y,F,G,H) per table point. Each of the first
 * three lanes picks its table point by truncating r*tabscale; the remainder
 * eps is the interpolation variable. The row that would belong to lane 3 is
 * replaced by zeros, so lane 3 works with all-zero coefficients.
 *
 *     V  = Y + eps*(F + G*eps + H*eps^2)
 *     FF = F + 2*G*eps + 3*H*eps^2
 *
 * qq*V is added to *vctot and qq*FF*tabscale is returned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_3_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rscaled,frac,frac2;
    __m128  tY,tF,tG,tH;
    __m128  geps,heps2,fp,energy,deriv;
    __m128i bin;
    int     ia,ib,ic;

    /* Split r*tabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,tabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);
    ib = gmx_mm_extract_epi32(bin,1);
    ic = gmx_mm_extract_epi32(bin,2);

    /* Three real rows plus a zero row, transposed into Y/F/G/H registers */
    tY = _mm_load_ps(VFtab + 4*ia);
    tF = _mm_load_ps(VFtab + 4*ib);
    tG = _mm_load_ps(VFtab + 4*ic);
    tH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(tY,tF,tG,tH);

    heps2  = _mm_mul_ps(tH,frac2);
    geps   = _mm_mul_ps(tG,frac);
    fp     = _mm_add_ps(tF, _mm_add_ps(geps,heps2));

    energy = _mm_mul_ps(qq, _mm_add_ps(tY, _mm_mul_ps(frac,fp)));
    *vctot = _mm_add_ps(*vctot,energy);

    deriv  = _mm_mul_ps(qq, _mm_add_ps(fp, _mm_add_ps(geps, _mm_add_ps(heps2,heps2))));

    return _mm_mul_ps(deriv,tabscale);
}
2206
2207
2208
/* Tabulated Lennard-Jones interaction for three pairs (lanes 0-2 of r).
 *
 * For a few cases, like TIP4p waters, there are particles with LJ-only
 * interactions in a loop where the table data might contain both coulomb
 * and LJ. To handle this case, we use an offset value of 0 if the data is
 * an LJ-only table, and 1 if it is actually a mixed coul+lj table.
 *
 * A table point therefore spans 4*(offset+2) floats, and the four dispersion
 * coefficients (Y,F,G,H) begin 4*offset floats into it, immediately followed
 * by the four repulsion coefficients. The row that would belong to lane 3 is
 * replaced by zeros, so lane 3 works with all-zero coefficients.
 *
 *     V  = Y + eps*(F + G*eps + H*eps^2)
 *     FF = F + 2*G*eps + 3*H*eps^2
 *
 * c6*V_disp + c12*V_rep is added to *vvdwtot and
 * tabscale*(c6*FF_disp + c12*FF_rep) is returned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_3_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rscaled,frac,frac2;
    __m128       dY,dF,dG,dH;                 /* dispersion coefficients */
    __m128       rY,rF,rG,rH;                 /* repulsion coefficients  */
    __m128       e_disp,e_rep,f_disp,f_rep;
    __m128i      bin;
    int          ia,ib,ic;
    int          stride,first;
    const float *pa,*pb,*pc;

    /* Split r*tabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,tabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);
    ib = gmx_mm_extract_epi32(bin,1);
    ic = gmx_mm_extract_epi32(bin,2);

    /* offset counts whole 4-float coefficient blocks, so the dispersion
     * data starts at 4*offset. Using the bare offset would read from inside
     * the Coulomb block of a mixed table and hand _mm_load_ps an address
     * that is not a multiple of four floats.
     */
    stride = 4*(offset+2);
    first  = 4*offset;
    pa     = VFtab + stride*ia + first;
    pb     = VFtab + stride*ib + first;
    pc     = VFtab + stride*ic + first;

    dY = _mm_load_ps(pa);
    dF = _mm_load_ps(pb);
    dG = _mm_load_ps(pc);
    dH = _mm_setzero_ps();
    rY = _mm_load_ps(pa + 4);
    rF = _mm_load_ps(pb + 4);
    rG = _mm_load_ps(pc + 4);
    rH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(dY,dF,dG,dH);
    _MM_TRANSPOSE4_PS(rY,rF,rG,rH);

    /* From here on: xH = H*eps^2, xG = G*eps, xF = F + G*eps + H*eps^2 */
    dH = _mm_mul_ps(dH,frac2);
    dG = _mm_mul_ps(dG,frac);
    dF = _mm_add_ps(dF, _mm_add_ps(dG,dH));
    rH = _mm_mul_ps(rH,frac2);
    rG = _mm_mul_ps(rG,frac);
    rF = _mm_add_ps(rF, _mm_add_ps(rG,rH));

    e_disp   = _mm_mul_ps(c6,  _mm_add_ps(dY, _mm_mul_ps(frac,dF)));
    e_rep    = _mm_mul_ps(c12, _mm_add_ps(rY, _mm_mul_ps(frac,rF)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(e_disp,e_rep));

    f_disp = _mm_mul_ps(c6,  _mm_add_ps(dF, _mm_add_ps(dG, _mm_add_ps(dH,dH))));
    f_rep  = _mm_mul_ps(c12, _mm_add_ps(rF, _mm_add_ps(rG, _mm_add_ps(rH,rH))));

    return _mm_mul_ps(_mm_add_ps(f_disp,f_rep), tabscale);
}
2256
2257
/* Tabulated Coulomb + Lennard-Jones interaction for three pairs
 * (lanes 0-2 of r).
 *
 * VFtab is read as 12 floats per table point: four coefficients (Y,F,G,H)
 * for Coulomb, then four for dispersion, then four for repulsion. The rows
 * that would belong to lane 3 are replaced by zeros, so lane 3 works with
 * all-zero coefficients.
 *
 *     V  = Y + eps*(F + G*eps + H*eps^2)
 *     FF = F + 2*G*eps + 3*H*eps^2
 *
 * qq*V_coul is added to *vctot and c6*V_disp + c12*V_rep is added to
 * *vvdwtot. The return value is tabscale*(qq*FF_coul + c6*FF_disp + c12*FF_rep).
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_3_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rscaled,frac,frac2;
    __m128       cY,cF,cG,cH;                 /* Coulomb coefficients    */
    __m128       dY,dF,dG,dH;                 /* dispersion coefficients */
    __m128       rY,rF,rG,rH;                 /* repulsion coefficients  */
    __m128       e_coul,e_disp,e_rep;
    __m128       f_coul,f_disp,f_rep;
    __m128i      bin;
    int          ia,ib,ic;
    const float *pa,*pb,*pc;

    /* Split r*tabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,tabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);
    ib = gmx_mm_extract_epi32(bin,1);
    ic = gmx_mm_extract_epi32(bin,2);
    pa = VFtab + 12*ia;
    pb = VFtab + 12*ib;
    pc = VFtab + 12*ic;

    /* Three real rows plus a zero row per interaction, then transpose */
    cY = _mm_load_ps(pa);
    cF = _mm_load_ps(pb);
    cG = _mm_load_ps(pc);
    cH = _mm_setzero_ps();
    dY = _mm_load_ps(pa + 4);
    dF = _mm_load_ps(pb + 4);
    dG = _mm_load_ps(pc + 4);
    dH = _mm_setzero_ps();
    rY = _mm_load_ps(pa + 8);
    rF = _mm_load_ps(pb + 8);
    rG = _mm_load_ps(pc + 8);
    rH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(cY,cF,cG,cH);
    _MM_TRANSPOSE4_PS(dY,dF,dG,dH);
    _MM_TRANSPOSE4_PS(rY,rF,rG,rH);

    /* From here on: xH = H*eps^2, xG = G*eps, xF = F + G*eps + H*eps^2 */
    cH = _mm_mul_ps(cH,frac2);
    cG = _mm_mul_ps(cG,frac);
    cF = _mm_add_ps(cF, _mm_add_ps(cG,cH));
    dH = _mm_mul_ps(dH,frac2);
    dG = _mm_mul_ps(dG,frac);
    dF = _mm_add_ps(dF, _mm_add_ps(dG,dH));
    rH = _mm_mul_ps(rH,frac2);
    rG = _mm_mul_ps(rG,frac);
    rF = _mm_add_ps(rF, _mm_add_ps(rG,rH));

    /* Accumulate potentials */
    e_coul   = _mm_mul_ps(qq, _mm_add_ps(cY, _mm_mul_ps(frac,cF)));
    *vctot   = _mm_add_ps(*vctot,e_coul);

    e_disp   = _mm_mul_ps(c6,  _mm_add_ps(dY, _mm_mul_ps(frac,dF)));
    e_rep    = _mm_mul_ps(c12, _mm_add_ps(rY, _mm_mul_ps(frac,rF)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(e_disp,e_rep));

    /* Derivative terms: F + 2*G*eps + 3*H*eps^2, scaled by the parameters */
    f_coul = _mm_mul_ps(qq,  _mm_add_ps(cF, _mm_add_ps(cG, _mm_add_ps(cH,cH))));
    f_disp = _mm_mul_ps(c6,  _mm_add_ps(dF, _mm_add_ps(dG, _mm_add_ps(dH,dH))));
    f_rep  = _mm_mul_ps(c12, _mm_add_ps(rF, _mm_add_ps(rG, _mm_add_ps(rH,rH))));

    return _mm_mul_ps(_mm_add_ps(f_coul, _mm_add_ps(f_disp,f_rep)), tabscale);
}
2316
2317
2318
2319
2320
/* Tabulated Coulomb interaction for two pairs (lanes 0-1 of r).
 *
 * VFtab is read as four floats (Y,F,G,H) per table point. Each of the first
 * two lanes picks its table point by truncating r*tabscale; the remainder
 * eps is the interpolation variable. The rows that would belong to lanes 2
 * and 3 are replaced by zeros, so those lanes work with all-zero
 * coefficients.
 *
 *     V  = Y + eps*(F + G*eps + H*eps^2)
 *     FF = F + 2*G*eps + 3*H*eps^2
 *
 * qq*V is added to *vctot and qq*FF*tabscale is returned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_2_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rscaled,frac,frac2;
    __m128  tY,tF,tG,tH;
    __m128  geps,heps2,fp,energy,deriv;
    __m128i bin;
    int     ia,ib;

    /* Split r*tabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,tabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);
    ib = gmx_mm_extract_epi32(bin,1);

    /* Two real rows plus two zero rows, transposed into Y/F/G/H registers */
    tY = _mm_load_ps(VFtab + 4*ia);
    tF = _mm_load_ps(VFtab + 4*ib);
    tG = _mm_setzero_ps();
    tH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(tY,tF,tG,tH);

    heps2  = _mm_mul_ps(tH,frac2);
    geps   = _mm_mul_ps(tG,frac);
    fp     = _mm_add_ps(tF, _mm_add_ps(geps,heps2));

    energy = _mm_mul_ps(qq, _mm_add_ps(tY, _mm_mul_ps(frac,fp)));
    *vctot = _mm_add_ps(*vctot,energy);

    deriv  = _mm_mul_ps(qq, _mm_add_ps(fp, _mm_add_ps(geps, _mm_add_ps(heps2,heps2))));

    return _mm_mul_ps(deriv,tabscale);
}
2352
2353
2354
/* Tabulated Lennard-Jones interaction for two pairs (lanes 0-1 of r).
 *
 * For a few cases, like TIP4p waters, there are particles with LJ-only
 * interactions in a loop where the table data might contain both coulomb
 * and LJ. To handle this case, we use an offset value of 0 if the data is
 * an LJ-only table, and 1 if it is actually a mixed coul+lj table.
 *
 * A table point therefore spans 4*(offset+2) floats, and the four dispersion
 * coefficients (Y,F,G,H) begin 4*offset floats into it, immediately followed
 * by the four repulsion coefficients. The rows that would belong to lanes 2
 * and 3 are replaced by zeros, so those lanes work with all-zero
 * coefficients.
 *
 *     V  = Y + eps*(F + G*eps + H*eps^2)
 *     FF = F + 2*G*eps + 3*H*eps^2
 *
 * c6*V_disp + c12*V_rep is added to *vvdwtot and
 * tabscale*(c6*FF_disp + c12*FF_rep) is returned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_2_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rscaled,frac,frac2;
    __m128       dY,dF,dG,dH;                 /* dispersion coefficients */
    __m128       rY,rF,rG,rH;                 /* repulsion coefficients  */
    __m128       e_disp,e_rep,f_disp,f_rep;
    __m128i      bin;
    int          ia,ib;
    int          stride,first;
    const float *pa,*pb;

    /* Split r*tabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,tabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);
    ib = gmx_mm_extract_epi32(bin,1);

    /* offset counts whole 4-float coefficient blocks, so the dispersion
     * data starts at 4*offset. Using the bare offset would read from inside
     * the Coulomb block of a mixed table and hand _mm_load_ps an address
     * that is not a multiple of four floats.
     */
    stride = 4*(offset+2);
    first  = 4*offset;
    pa     = VFtab + stride*ia + first;
    pb     = VFtab + stride*ib + first;

    dY = _mm_load_ps(pa);
    dF = _mm_load_ps(pb);
    dG = _mm_setzero_ps();
    dH = _mm_setzero_ps();
    rY = _mm_load_ps(pa + 4);
    rF = _mm_load_ps(pb + 4);
    rG = _mm_setzero_ps();
    rH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(dY,dF,dG,dH);
    _MM_TRANSPOSE4_PS(rY,rF,rG,rH);

    /* From here on: xH = H*eps^2, xG = G*eps, xF = F + G*eps + H*eps^2 */
    dH = _mm_mul_ps(dH,frac2);
    dG = _mm_mul_ps(dG,frac);
    dF = _mm_add_ps(dF, _mm_add_ps(dG,dH));
    rH = _mm_mul_ps(rH,frac2);
    rG = _mm_mul_ps(rG,frac);
    rF = _mm_add_ps(rF, _mm_add_ps(rG,rH));

    e_disp   = _mm_mul_ps(c6,  _mm_add_ps(dY, _mm_mul_ps(frac,dF)));
    e_rep    = _mm_mul_ps(c12, _mm_add_ps(rY, _mm_mul_ps(frac,rF)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(e_disp,e_rep));

    f_disp = _mm_mul_ps(c6,  _mm_add_ps(dF, _mm_add_ps(dG, _mm_add_ps(dH,dH))));
    f_rep  = _mm_mul_ps(c12, _mm_add_ps(rF, _mm_add_ps(rG, _mm_add_ps(rH,rH))));

    return _mm_mul_ps(_mm_add_ps(f_disp,f_rep), tabscale);
}
2401
2402
/* Tabulated Coulomb + Lennard-Jones interaction for two pairs
 * (lanes 0-1 of r).
 *
 * VFtab is read as 12 floats per table point: four coefficients (Y,F,G,H)
 * for Coulomb, then four for dispersion, then four for repulsion. The rows
 * that would belong to lanes 2 and 3 are replaced by zeros, so those lanes
 * work with all-zero coefficients.
 *
 *     V  = Y + eps*(F + G*eps + H*eps^2)
 *     FF = F + 2*G*eps + 3*H*eps^2
 *
 * qq*V_coul is added to *vctot and c6*V_disp + c12*V_rep is added to
 * *vvdwtot. The return value is tabscale*(qq*FF_coul + c6*FF_disp + c12*FF_rep).
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_2_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rscaled,frac,frac2;
    __m128       cY,cF,cG,cH;                 /* Coulomb coefficients    */
    __m128       dY,dF,dG,dH;                 /* dispersion coefficients */
    __m128       rY,rF,rG,rH;                 /* repulsion coefficients  */
    __m128       e_coul,e_disp,e_rep;
    __m128       f_coul,f_disp,f_rep;
    __m128i      bin;
    int          ia,ib;
    const float *pa,*pb;

    /* Split r*tabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,tabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);
    ib = gmx_mm_extract_epi32(bin,1);
    pa = VFtab + 12*ia;
    pb = VFtab + 12*ib;

    /* Two real rows plus two zero rows per interaction, then transpose */
    cY = _mm_load_ps(pa);
    cF = _mm_load_ps(pb);
    cG = _mm_setzero_ps();
    cH = _mm_setzero_ps();
    dY = _mm_load_ps(pa + 4);
    dF = _mm_load_ps(pb + 4);
    dG = _mm_setzero_ps();
    dH = _mm_setzero_ps();
    rY = _mm_load_ps(pa + 8);
    rF = _mm_load_ps(pb + 8);
    rG = _mm_setzero_ps();
    rH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(cY,cF,cG,cH);
    _MM_TRANSPOSE4_PS(dY,dF,dG,dH);
    _MM_TRANSPOSE4_PS(rY,rF,rG,rH);

    /* From here on: xH = H*eps^2, xG = G*eps, xF = F + G*eps + H*eps^2 */
    cH = _mm_mul_ps(cH,frac2);
    cG = _mm_mul_ps(cG,frac);
    cF = _mm_add_ps(cF, _mm_add_ps(cG,cH));
    dH = _mm_mul_ps(dH,frac2);
    dG = _mm_mul_ps(dG,frac);
    dF = _mm_add_ps(dF, _mm_add_ps(dG,dH));
    rH = _mm_mul_ps(rH,frac2);
    rG = _mm_mul_ps(rG,frac);
    rF = _mm_add_ps(rF, _mm_add_ps(rG,rH));

    /* Accumulate potentials */
    e_coul   = _mm_mul_ps(qq, _mm_add_ps(cY, _mm_mul_ps(frac,cF)));
    *vctot   = _mm_add_ps(*vctot,e_coul);

    e_disp   = _mm_mul_ps(c6,  _mm_add_ps(dY, _mm_mul_ps(frac,dF)));
    e_rep    = _mm_mul_ps(c12, _mm_add_ps(rY, _mm_mul_ps(frac,rF)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(e_disp,e_rep));

    /* Derivative terms: F + 2*G*eps + 3*H*eps^2, scaled by the parameters */
    f_coul = _mm_mul_ps(qq,  _mm_add_ps(cF, _mm_add_ps(cG, _mm_add_ps(cH,cH))));
    f_disp = _mm_mul_ps(c6,  _mm_add_ps(dF, _mm_add_ps(dG, _mm_add_ps(dH,dH))));
    f_rep  = _mm_mul_ps(c12, _mm_add_ps(rF, _mm_add_ps(rG, _mm_add_ps(rH,rH))));

    return _mm_mul_ps(_mm_add_ps(f_coul, _mm_add_ps(f_disp,f_rep)), tabscale);
}
2459
2460
2461
2462
/* Tabulated Coulomb interaction for a single pair (lane 0 of r).
 *
 * VFtab is read as four floats (Y,F,G,H) per table point. Lane 0 picks its
 * table point by truncating r*tabscale; the remainder eps is the
 * interpolation variable. The rows that would belong to lanes 1-3 are
 * replaced by zeros, so those lanes work with all-zero coefficients.
 *
 *     V  = Y + eps*(F + G*eps + H*eps^2)
 *     FF = F + 2*G*eps + 3*H*eps^2
 *
 * qq*V is added to *vctot and qq*FF*tabscale is returned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_1_table_coulomb_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 *vctot)
{
    __m128  rscaled,frac,frac2;
    __m128  tY,tF,tG,tH;
    __m128  geps,heps2,fp,energy,deriv;
    __m128i bin;
    int     ia;

    /* Split r*tabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,tabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);

    /* One real row plus three zero rows, transposed into Y/F/G/H registers */
    tY = _mm_load_ps(VFtab + 4*ia);
    tF = _mm_setzero_ps();
    tG = _mm_setzero_ps();
    tH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(tY,tF,tG,tH);

    heps2  = _mm_mul_ps(tH,frac2);
    geps   = _mm_mul_ps(tG,frac);
    fp     = _mm_add_ps(tF, _mm_add_ps(geps,heps2));

    energy = _mm_mul_ps(qq, _mm_add_ps(tY, _mm_mul_ps(frac,fp)));
    *vctot = _mm_add_ps(*vctot,energy);

    deriv  = _mm_mul_ps(qq, _mm_add_ps(fp, _mm_add_ps(geps, _mm_add_ps(heps2,heps2))));

    return _mm_mul_ps(deriv,tabscale);
}
2493
2494
2495
/* Tabulated Lennard-Jones interaction for a single pair (lane 0 of r).
 *
 * For a few cases, like TIP4p waters, there are particles with LJ-only
 * interactions in a loop where the table data might contain both coulomb
 * and LJ. To handle this case, we use an offset value of 0 if the data is
 * an LJ-only table, and 1 if it is actually a mixed coul+lj table.
 *
 * A table point therefore spans 4*(offset+2) floats, and the four dispersion
 * coefficients (Y,F,G,H) begin 4*offset floats into it, immediately followed
 * by the four repulsion coefficients. The rows that would belong to lanes
 * 1-3 are replaced by zeros, so those lanes work with all-zero coefficients.
 *
 *     V  = Y + eps*(F + G*eps + H*eps^2)
 *     FF = F + 2*G*eps + 3*H*eps^2
 *
 * c6*V_disp + c12*V_rep is added to *vvdwtot and
 * tabscale*(c6*FF_disp + c12*FF_rep) is returned.
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_1_table_lj_ps(__m128 r, __m128 tabscale, float * VFtab, int offset, __m128 c6, __m128 c12, __m128 *vvdwtot)
{
    __m128       rscaled,frac,frac2;
    __m128       dY,dF,dG,dH;                 /* dispersion coefficients */
    __m128       rY,rF,rG,rH;                 /* repulsion coefficients  */
    __m128       e_disp,e_rep,f_disp,f_rep;
    __m128i      bin;
    int          ia;
    int          stride,first;
    const float *pa;

    /* Split r*tabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,tabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);

    /* offset counts whole 4-float coefficient blocks, so the dispersion
     * data starts at 4*offset. Using the bare offset would read from inside
     * the Coulomb block of a mixed table and hand _mm_load_ps an address
     * that is not a multiple of four floats.
     */
    stride = 4*(offset+2);
    first  = 4*offset;
    pa     = VFtab + stride*ia + first;

    dY = _mm_load_ps(pa);
    dF = _mm_setzero_ps();
    dG = _mm_setzero_ps();
    dH = _mm_setzero_ps();
    rY = _mm_load_ps(pa + 4);
    rF = _mm_setzero_ps();
    rG = _mm_setzero_ps();
    rH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(dY,dF,dG,dH);
    _MM_TRANSPOSE4_PS(rY,rF,rG,rH);

    /* From here on: xH = H*eps^2, xG = G*eps, xF = F + G*eps + H*eps^2 */
    dH = _mm_mul_ps(dH,frac2);
    dG = _mm_mul_ps(dG,frac);
    dF = _mm_add_ps(dF, _mm_add_ps(dG,dH));
    rH = _mm_mul_ps(rH,frac2);
    rG = _mm_mul_ps(rG,frac);
    rF = _mm_add_ps(rF, _mm_add_ps(rG,rH));

    e_disp   = _mm_mul_ps(c6,  _mm_add_ps(dY, _mm_mul_ps(frac,dF)));
    e_rep    = _mm_mul_ps(c12, _mm_add_ps(rY, _mm_mul_ps(frac,rF)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(e_disp,e_rep));

    f_disp = _mm_mul_ps(c6,  _mm_add_ps(dF, _mm_add_ps(dG, _mm_add_ps(dH,dH))));
    f_rep  = _mm_mul_ps(c12, _mm_add_ps(rF, _mm_add_ps(rG, _mm_add_ps(rH,rH))));

    return _mm_mul_ps(_mm_add_ps(f_disp,f_rep), tabscale);
}
2541
2542
/* Tabulated Coulomb + Lennard-Jones interaction for a single pair
 * (lane 0 of r).
 *
 * VFtab is read as 12 floats per table point: four coefficients (Y,F,G,H)
 * for Coulomb, then four for dispersion, then four for repulsion. The rows
 * that would belong to lanes 1-3 are replaced by zeros, so those lanes work
 * with all-zero coefficients.
 *
 *     V  = Y + eps*(F + G*eps + H*eps^2)
 *     FF = F + 2*G*eps + 3*H*eps^2
 *
 * qq*V_coul is added to *vctot and c6*V_disp + c12*V_rep is added to
 * *vvdwtot. The return value is tabscale*(qq*FF_coul + c6*FF_disp + c12*FF_rep).
 *
 * Return force should be multiplied by -rinv to get fscal
 */
static inline __m128
gmx_mm_int_1_table_coulomb_and_lj_ps(__m128 r, __m128 tabscale, float * VFtab, __m128 qq, __m128 c6, __m128 c12,
                                     __m128 *vctot, __m128 *vvdwtot)
{
    __m128       rscaled,frac,frac2;
    __m128       cY,cF,cG,cH;                 /* Coulomb coefficients    */
    __m128       dY,dF,dG,dH;                 /* dispersion coefficients */
    __m128       rY,rF,rG,rH;                 /* repulsion coefficients  */
    __m128       e_coul,e_disp,e_rep;
    __m128       f_coul,f_disp,f_rep;
    __m128i      bin;
    int          ia;
    const float *pa;

    /* Split r*tabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,tabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);
    pa = VFtab + 12*ia;

    /* One real row plus three zero rows per interaction, then transpose */
    cY = _mm_load_ps(pa);
    cF = _mm_setzero_ps();
    cG = _mm_setzero_ps();
    cH = _mm_setzero_ps();
    dY = _mm_load_ps(pa + 4);
    dF = _mm_setzero_ps();
    dG = _mm_setzero_ps();
    dH = _mm_setzero_ps();
    rY = _mm_load_ps(pa + 8);
    rF = _mm_setzero_ps();
    rG = _mm_setzero_ps();
    rH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(cY,cF,cG,cH);
    _MM_TRANSPOSE4_PS(dY,dF,dG,dH);
    _MM_TRANSPOSE4_PS(rY,rF,rG,rH);

    /* From here on: xH = H*eps^2, xG = G*eps, xF = F + G*eps + H*eps^2 */
    cH = _mm_mul_ps(cH,frac2);
    cG = _mm_mul_ps(cG,frac);
    cF = _mm_add_ps(cF, _mm_add_ps(cG,cH));
    dH = _mm_mul_ps(dH,frac2);
    dG = _mm_mul_ps(dG,frac);
    dF = _mm_add_ps(dF, _mm_add_ps(dG,dH));
    rH = _mm_mul_ps(rH,frac2);
    rG = _mm_mul_ps(rG,frac);
    rF = _mm_add_ps(rF, _mm_add_ps(rG,rH));

    /* Accumulate potentials */
    e_coul   = _mm_mul_ps(qq, _mm_add_ps(cY, _mm_mul_ps(frac,cF)));
    *vctot   = _mm_add_ps(*vctot,e_coul);

    e_disp   = _mm_mul_ps(c6,  _mm_add_ps(dY, _mm_mul_ps(frac,dF)));
    e_rep    = _mm_mul_ps(c12, _mm_add_ps(rY, _mm_mul_ps(frac,rF)));
    *vvdwtot = _mm_add_ps(*vvdwtot, _mm_add_ps(e_disp,e_rep));

    /* Derivative terms: F + 2*G*eps + 3*H*eps^2, scaled by the parameters */
    f_coul = _mm_mul_ps(qq,  _mm_add_ps(cF, _mm_add_ps(cG, _mm_add_ps(cH,cH))));
    f_disp = _mm_mul_ps(c6,  _mm_add_ps(dF, _mm_add_ps(dG, _mm_add_ps(dH,dH))));
    f_rep  = _mm_mul_ps(c12, _mm_add_ps(rF, _mm_add_ps(rG, _mm_add_ps(rH,rH))));

    return _mm_mul_ps(_mm_add_ps(f_coul, _mm_add_ps(f_disp,f_rep)), tabscale);
}
2598
2599
2600
2601
2602
/* Tabulated generalized-Born term for four pairs at once.
 *
 * The four scalars *isaj1..*isaj4 are gathered into one vector isaj
 * (lane k holds *isaj(k+1)). The product isai*isaj scales both qq and
 * gbtabscale before the lookup. GBtab is read as four floats (Y,F,G,H) per
 * table point, selected per lane by truncating r*gbtabscale; the remainder
 * eps is the interpolation variable.
 *
 *     VV   = Y + eps*(F + G*eps + H*eps^2)
 *     FF   = F + 2*G*eps + 3*H*eps^2
 *     vgb  = qq*VV                      (subtracted from *vgbtot)
 *     ftmp = qq*FF*gbtabscale           (returned)
 *     dvda = 0.5*(vgb + ftmp*r)         (added to *dvdasum)
 *
 * Finally lane k of dvda*isaj*isaj is added to the scalar *dvdaj(k+1).
 * All four dvdaj values are read before any of them is written.
 *
 * Return force should be multiplied by +rinv to get fscal
 */
static inline __m128
gmx_mm_int_4_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2, float *isaj3, float *isaj4,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2, float *dvdaj3, float *dvdaj4,
                        __m128 *vgbtot)
{
    const __m128 one_half = _mm_set1_ps(0.5f);

    __m128  isaj,isaj_hi,isa_prod;
    __m128  rscaled,frac,frac2;
    __m128  tY,tF,tG,tH,fp,pot,deriv;
    __m128  e_gb,force,dvda;
    __m128  old1,old2,old3,old4;
    __m128  upd2,upd3,upd4;
    __m128i bin;
    int     ia,ib,ic,id;

    /* Assemble isaj = { *isaj1, *isaj2, *isaj3, *isaj4 } */
    isaj    = _mm_unpacklo_ps(_mm_load_ss(isaj1), _mm_load_ss(isaj2));
    isaj_hi = _mm_unpacklo_ps(_mm_load_ss(isaj3), _mm_load_ss(isaj4));
    isaj    = _mm_movelh_ps(isaj,isaj_hi);

    isa_prod   = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isa_prod);
    gbtabscale = _mm_mul_ps(isa_prod, gbtabscale);

    /* Split r*gbtabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,gbtabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);
    ib = gmx_mm_extract_epi32(bin,1);
    ic = gmx_mm_extract_epi32(bin,2);
    id = gmx_mm_extract_epi32(bin,3);

    /* One row per pair, transposed into Y/F/G/H registers */
    tY = _mm_load_ps(GBtab + 4*ia);
    tF = _mm_load_ps(GBtab + 4*ib);
    tG = _mm_load_ps(GBtab + 4*ic);
    tH = _mm_load_ps(GBtab + 4*id);
    _MM_TRANSPOSE4_PS(tY,tF,tG,tH);

    tG    = _mm_mul_ps(tG,frac);                              /* G*eps   */
    tH    = _mm_mul_ps(tH,frac2);                             /* H*eps^2 */
    fp    = _mm_add_ps(_mm_add_ps(tF,tG), tH);                /* Fp      */
    pot   = _mm_add_ps(tY, _mm_mul_ps(frac,fp));              /* VV      */
    deriv = _mm_add_ps(_mm_add_ps(fp,tG), _mm_add_ps(tH,tH)); /* FF      */

    /* The energy is deliberately subtracted from the running total */
    e_gb    = _mm_mul_ps(qq,pot);
    *vgbtot = _mm_sub_ps(*vgbtot,e_gb);

    force = _mm_mul_ps(_mm_mul_ps(qq,deriv), gbtabscale);

    dvda     = _mm_mul_ps(one_half, _mm_add_ps(e_gb, _mm_mul_ps(force,r)));
    *dvdasum = _mm_add_ps(*dvdasum,dvda);

    dvda = _mm_mul_ps(_mm_mul_ps(dvda,isaj), isaj);

    /* Scatter: bring lanes 1, 2 and 3 of dvda down to lane 0 and add each
     * lane to its own dvdaj scalar.
     */
    old1 = _mm_load_ss(dvdaj1);
    old2 = _mm_load_ss(dvdaj2);
    old3 = _mm_load_ss(dvdaj3);
    old4 = _mm_load_ss(dvdaj4);
    upd3 = _mm_movehl_ps(_mm_setzero_ps(),dvda);
    upd2 = _mm_shuffle_ps(dvda,dvda,_MM_SHUFFLE(0,0,0,1));
    upd4 = _mm_shuffle_ps(upd3,upd3,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss(dvdaj1, _mm_add_ss(old1,dvda));
    _mm_store_ss(dvdaj2, _mm_add_ss(old2,upd2));
    _mm_store_ss(dvdaj3, _mm_add_ss(old3,upd3));
    _mm_store_ss(dvdaj4, _mm_add_ss(old4,upd4));

    return force;
}
2679
2680
2681
/* Tabulated generalized-Born term for three pairs (lanes 0-2 of r).
 *
 * The three scalars *isaj1..*isaj3 are gathered into one vector isaj; lane 3
 * receives a second copy of *isaj3. The product isai*isaj scales both qq
 * and gbtabscale before the lookup. GBtab is read as four floats (Y,F,G,H)
 * per table point, selected per lane by truncating r*gbtabscale; the row
 * that would belong to lane 3 is replaced by zeros, so lane 3 works with
 * all-zero coefficients.
 *
 *     VV   = Y + eps*(F + G*eps + H*eps^2)
 *     FF   = F + 2*G*eps + 3*H*eps^2
 *     vgb  = qq*VV                      (subtracted from *vgbtot)
 *     ftmp = qq*FF*gbtabscale           (returned)
 *     dvda = 0.5*(vgb + ftmp*r)         (added to *dvdasum)
 *
 * Finally lane k of dvda*isaj*isaj is added to the scalar *dvdaj(k+1) for
 * k = 0..2. All three dvdaj values are read before any of them is written.
 *
 * Return force should be multiplied by +rinv to get fscal
 */
static inline __m128
gmx_mm_int_3_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2, float *isaj3,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2, float *dvdaj3,
                        __m128 *vgbtot)
{
    const __m128 one_half = _mm_set1_ps(0.5f);

    __m128  isaj,isaj_hi,isa_prod;
    __m128  rscaled,frac,frac2;
    __m128  tY,tF,tG,tH,fp,pot,deriv;
    __m128  e_gb,force,dvda;
    __m128  old1,old2,old3;
    __m128  upd2,upd3;
    __m128i bin;
    int     ia,ib,ic;

    /* Assemble isaj = { *isaj1, *isaj2, *isaj3, *isaj3 } */
    isaj    = _mm_unpacklo_ps(_mm_load_ss(isaj1), _mm_load_ss(isaj2));
    isaj_hi = _mm_load_ss(isaj3);
    isaj_hi = _mm_unpacklo_ps(isaj_hi,isaj_hi);
    isaj    = _mm_movelh_ps(isaj,isaj_hi);

    isa_prod   = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isa_prod);
    gbtabscale = _mm_mul_ps(isa_prod, gbtabscale);

    /* Split r*gbtabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,gbtabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);
    ib = gmx_mm_extract_epi32(bin,1);
    ic = gmx_mm_extract_epi32(bin,2);

    /* Three real rows plus a zero row, transposed into Y/F/G/H registers */
    tY = _mm_load_ps(GBtab + 4*ia);
    tF = _mm_load_ps(GBtab + 4*ib);
    tG = _mm_load_ps(GBtab + 4*ic);
    tH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(tY,tF,tG,tH);

    tG    = _mm_mul_ps(tG,frac);                              /* G*eps   */
    tH    = _mm_mul_ps(tH,frac2);                             /* H*eps^2 */
    fp    = _mm_add_ps(_mm_add_ps(tF,tG), tH);                /* Fp      */
    pot   = _mm_add_ps(tY, _mm_mul_ps(frac,fp));              /* VV      */
    deriv = _mm_add_ps(_mm_add_ps(fp,tG), _mm_add_ps(tH,tH)); /* FF      */

    /* The energy is deliberately subtracted from the running total */
    e_gb    = _mm_mul_ps(qq,pot);
    *vgbtot = _mm_sub_ps(*vgbtot,e_gb);

    force = _mm_mul_ps(_mm_mul_ps(qq,deriv), gbtabscale);

    dvda     = _mm_mul_ps(one_half, _mm_add_ps(e_gb, _mm_mul_ps(force,r)));
    *dvdasum = _mm_add_ps(*dvdasum,dvda);

    dvda = _mm_mul_ps(_mm_mul_ps(dvda,isaj), isaj);

    /* Scatter: bring lanes 1 and 2 of dvda down to lane 0 and add each
     * of the first three lanes to its own dvdaj scalar.
     */
    old1 = _mm_load_ss(dvdaj1);
    old2 = _mm_load_ss(dvdaj2);
    old3 = _mm_load_ss(dvdaj3);
    upd3 = _mm_movehl_ps(_mm_setzero_ps(),dvda);
    upd2 = _mm_shuffle_ps(dvda,dvda,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss(dvdaj1, _mm_add_ss(old1,dvda));
    _mm_store_ss(dvdaj2, _mm_add_ss(old2,upd2));
    _mm_store_ss(dvdaj3, _mm_add_ss(old3,upd3));

    return force;
}
2753
2754
2755
2756
/* Tabulated generalized-Born term for two pairs (lanes 0-1 of r).
 *
 * The two scalars *isaj1 and *isaj2 are gathered into lanes 0 and 1 of one
 * vector isaj; lanes 2 and 3 are zero. The product isai*isaj scales both qq
 * and gbtabscale before the lookup. GBtab is read as four floats (Y,F,G,H)
 * per table point, selected per lane by truncating r*gbtabscale; the rows
 * that would belong to lanes 2 and 3 are replaced by zeros, so those lanes
 * work with all-zero coefficients.
 *
 *     VV   = Y + eps*(F + G*eps + H*eps^2)
 *     FF   = F + 2*G*eps + 3*H*eps^2
 *     vgb  = qq*VV                      (subtracted from *vgbtot)
 *     ftmp = qq*FF*gbtabscale           (returned)
 *     dvda = 0.5*(vgb + ftmp*r)         (added to *dvdasum)
 *
 * Finally lanes 0 and 1 of dvda*isaj*isaj are added to *dvdaj1 and *dvdaj2.
 * Both dvdaj values are read before either of them is written.
 *
 * Return force should be multiplied by +rinv to get fscal
 */
static inline __m128
gmx_mm_int_2_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1, float *isaj2,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1, float *dvdaj2,
                        __m128 *vgbtot)
{
    const __m128 one_half = _mm_set1_ps(0.5f);

    __m128  isaj,isa_prod;
    __m128  rscaled,frac,frac2;
    __m128  tY,tF,tG,tH,fp,pot,deriv;
    __m128  e_gb,force,dvda;
    __m128  old1,old2,upd2;
    __m128i bin;
    int     ia,ib;

    /* Assemble isaj = { *isaj1, *isaj2, 0, 0 } */
    isaj = _mm_unpacklo_ps(_mm_load_ss(isaj1), _mm_load_ss(isaj2));

    isa_prod   = _mm_mul_ps(isai,isaj);
    qq         = _mm_mul_ps(qq,isa_prod);
    gbtabscale = _mm_mul_ps(isa_prod, gbtabscale);

    /* Split r*gbtabscale into table bin and fractional part */
    rscaled = _mm_mul_ps(r,gbtabscale);
    bin     = _mm_cvttps_epi32(rscaled);
    frac    = _mm_sub_ps(rscaled, _mm_cvtepi32_ps(bin));
    frac2   = _mm_mul_ps(frac,frac);

    ia = gmx_mm_extract_epi32(bin,0);
    ib = gmx_mm_extract_epi32(bin,1);

    /* Two real rows plus two zero rows, transposed into Y/F/G/H registers */
    tY = _mm_load_ps(GBtab + 4*ia);
    tF = _mm_load_ps(GBtab + 4*ib);
    tG = _mm_setzero_ps();
    tH = _mm_setzero_ps();
    _MM_TRANSPOSE4_PS(tY,tF,tG,tH);

    tG    = _mm_mul_ps(tG,frac);                              /* G*eps   */
    tH    = _mm_mul_ps(tH,frac2);                             /* H*eps^2 */
    fp    = _mm_add_ps(_mm_add_ps(tF,tG), tH);                /* Fp      */
    pot   = _mm_add_ps(tY, _mm_mul_ps(frac,fp));              /* VV      */
    deriv = _mm_add_ps(_mm_add_ps(fp,tG), _mm_add_ps(tH,tH)); /* FF      */

    /* The energy is deliberately subtracted from the running total */
    e_gb    = _mm_mul_ps(qq,pot);
    *vgbtot = _mm_sub_ps(*vgbtot,e_gb);

    force = _mm_mul_ps(_mm_mul_ps(qq,deriv), gbtabscale);

    dvda     = _mm_mul_ps(one_half, _mm_add_ps(e_gb, _mm_mul_ps(force,r)));
    *dvdasum = _mm_add_ps(*dvdasum,dvda);

    dvda = _mm_mul_ps(_mm_mul_ps(dvda,isaj), isaj);

    /* Scatter: bring lane 1 of dvda down to lane 0 and add the first two
     * lanes to their own dvdaj scalars.
     */
    old1 = _mm_load_ss(dvdaj1);
    old2 = _mm_load_ss(dvdaj2);
    upd2 = _mm_shuffle_ps(dvda,dvda,_MM_SHUFFLE(0,0,0,1));

    _mm_store_ss(dvdaj1, _mm_add_ss(old1,dvda));
    _mm_store_ss(dvdaj2, _mm_add_ss(old2,upd2));

    return force;
}
2821
/*
 * Tabulated Generalized-Born interaction for a single j entry, which
 * occupies SIMD lane 0 (lanes 1-3 of the loaded isaj vector are zero).
 *
 * isaj1   : one float is read to build the isaj vector.
 * GBtab   : table holding 4 floats (Y,F,G,H) per point. It is read with
 *           _mm_load_ps, so the accessed entry has to be 16-byte aligned.
 * dvdasum : 0.5*(vgb + ftmp*r) is added to *dvdasum.
 * dvdaj1  : lane 0 of that term, multiplied by isaj*isaj, is added here.
 * vgbtot  : vgb is subtracted from *vgbtot.
 *
 * Return force should be multiplied by +rinv to get fscal.
 */
static inline __m128
gmx_mm_int_1_genborn_ps(__m128 r, __m128 isai,
                        float * isaj1,
                        __m128 gbtabscale, float * GBtab, __m128 qq, __m128 *dvdasum,
                        float *dvdaj1,
                        __m128 *vgbtot)
{
    const __m128 half = _mm_set1_ps(0.5f);

    __m128  isaj,isaprod,qq_scaled,tabscale;
    __m128  rt,eps,eps2;
    __m128  tabY,tabF,tabG,tabH;
    __m128  Geps,Heps2,Fp,VV,FF;
    __m128  vgb,ftmp,dvda,dvda_j,old1;
    __m128i n0;
    int     idx1;

    /* isaj = ( isaj1, 0, 0, 0 ) counted from lane 0 */
    isaj      = _mm_load_ss(isaj1);

    /* Both the charge product and the table scale are multiplied by isai*isaj */
    isaprod   = _mm_mul_ps(isai,isaj);
    qq_scaled = _mm_mul_ps(qq,isaprod);
    tabscale  = _mm_mul_ps(isaprod,gbtabscale);

    /* Table position: truncated integer index n0 and fractional part eps */
    rt        = _mm_mul_ps(r,tabscale);
    n0        = _mm_cvttps_epi32(rt);
    eps       = _mm_sub_ps(rt,_mm_cvtepi32_ps(n0));
    eps2      = _mm_mul_ps(eps,eps);

    /* Only one table row is needed; the remaining rows stay zero */
    idx1      = gmx_mm_extract_epi32(n0,0);
    tabY      = _mm_load_ps(GBtab + 4*idx1);
    tabF      = _mm_setzero_ps();
    tabG      = _mm_setzero_ps();
    tabH      = _mm_setzero_ps();
    /* After the transpose each register holds one coefficient for all lanes */
    _MM_TRANSPOSE4_PS(tabY,tabF,tabG,tabH);

    Geps      = _mm_mul_ps(tabG,eps);
    Heps2     = _mm_mul_ps(tabH,eps2);
    Fp        = _mm_add_ps(_mm_add_ps(tabF,Geps),Heps2);

    /* VV = Y + eps*Fp ;  FF = Fp + Geps + 2*Heps2 */
    VV        = _mm_add_ps(tabY,_mm_mul_ps(eps,Fp));
    FF        = _mm_add_ps(_mm_add_ps(Fp,Geps),_mm_add_ps(Heps2,Heps2));

    vgb       = _mm_mul_ps(qq_scaled,VV);
    /* Subtracting here is intentional - the sign is correct */
    *vgbtot   = _mm_sub_ps(*vgbtot,vgb);

    ftmp      = _mm_mul_ps(_mm_mul_ps(qq_scaled,FF),tabscale);

    dvda      = _mm_mul_ps(half,_mm_add_ps(vgb,_mm_mul_ps(ftmp,r)));
    *dvdasum  = _mm_add_ps(*dvdasum,dvda);

    dvda_j    = _mm_mul_ps(_mm_mul_ps(dvda,isaj),isaj);

    /* Accumulate lane 0 into the single j entry */
    old1      = _mm_load_ss(dvdaj1);
    _mm_store_ss(dvdaj1,_mm_add_ss(old1,dvda_j));

    return ftmp;
}
2880
2881
2882
2883
2884
/*
 * Sum the four lanes of each force component of one i atom and add the
 * resulting (x,y,z) triplet to fptr[0..2] as well as to fshiftptr[0..2].
 *
 * Neither pointer needs any particular alignment: each triplet is accessed
 * as one scalar plus one 64-bit half-register.
 */
static inline void
gmx_mm_update_iforce_1atom_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                              float *fptr,
                              float *fshiftptr)
{
    __m128 fsum;             /* lanes 0..3: x x y z */
    __m128 f_old,fshift_old;
    __m128 f_new,fshift_new;

#ifdef GMX_SSE3
    {
        __m128 xx = _mm_hadd_ps(fix1,fix1);
        __m128 yz = _mm_hadd_ps(fiy1,fiz1);

        fsum = _mm_hadd_ps(xx,yz);
    }
#else
    {
        /* SSE2: transpose with x used twice, then add the four rows */
        __m128 xdup = fix1;

        _MM_TRANSPOSE4_PS(fix1,xdup,fiy1,fiz1);
        fsum = _mm_add_ps(_mm_add_ps(fix1,xdup), _mm_add_ps(fiy1,fiz1));
    }
#endif

    /* Load as ( [0], 0, [1], [2] ) so that lanes 0,2,3 line up with x,y,z */
    f_old      = _mm_loadh_pi(_mm_load_ss(fptr),(__m64 *)(fptr+1));
    fshift_old = _mm_loadh_pi(_mm_load_ss(fshiftptr),(__m64 *)(fshiftptr+1));

    f_new      = _mm_add_ps(f_old,fsum);
    fshift_new = _mm_add_ps(fshift_old,fsum);

    _mm_store_ss(fptr,f_new);
    _mm_storeh_pi((__m64 *)(fptr+1),f_new);
    _mm_store_ss(fshiftptr,fshift_new);
    _mm_storeh_pi((__m64 *)(fshiftptr+1),fshift_new);
}
2917
/*
 * Sum the four lanes of each force component of two i atoms and add the
 * totals to memory:
 *   - fptr[0..5] receives (x1,y1,z1,x2,y2,z2),
 *   - fshiftptr[0..2] receives (x1+x2, y1+y2, z1+z2).
 *
 * All memory accesses are unaligned-safe (loadu/storeu, 64-bit halves and
 * scalars), so neither pointer needs any particular alignment.
 */
static inline void
gmx_mm_update_iforce_2atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 t1,t2,t4;

#ifdef GMX_SSE3
    fix1 = _mm_hadd_ps(fix1,fiy1);
    fiz1 = _mm_hadd_ps(fiz1,fix2);
    fiy2 = _mm_hadd_ps(fiy2,fiz2);

    fix1 = _mm_hadd_ps(fix1,fiz1); /* fix2 fiz1 fiy1 fix1 */
    fiy2 = _mm_hadd_ps(fiy2,fiy2); /*  -    -   fiz2 fiy2 */
#else
    /* SSE2 */
    /* transpose data */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    t1 = _mm_unpacklo_ps(fiy2,fiz2);
    t2 = _mm_unpackhi_ps(fiy2,fiz2);

    fix1 = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    t1   = _mm_add_ps(t1,t2);
    t2   = _mm_movehl_ps(t2,t1);
    fiy2 = _mm_add_ps(t1,t2);      /*  -    -   fiz2 fiy2 */
#endif
    _mm_storeu_ps(fptr,   _mm_add_ps(fix1,_mm_loadu_ps(fptr)  ));
    /* Load fptr[4..5] into a zeroed register. The previous code passed t1
     * as the base register, which is never assigned in the SSE3 branch and
     * was therefore read uninitialized. Only the low half is stored back,
     * so the stored result is unchanged.
     */
    t1 = _mm_loadl_pi(_mm_setzero_ps(),(__m64 *)(fptr+4));
    _mm_storel_pi((__m64 *)(fptr+4), _mm_add_ps(fiy2,t1));

    /* Old shift force laid out as ( [2], 0, [0], [1] ) = z - x y from lane 0 */
    t4 = _mm_load_ss(fshiftptr+2);
    t4 = _mm_loadh_pi(t4,(__m64 *)(fshiftptr));

    t1 = _mm_shuffle_ps(fix1,fiy2,_MM_SHUFFLE(0,0,3,2));   /* fiy2  -   fix2 fiz1 */
    t1 = _mm_shuffle_ps(t1,t1,_MM_SHUFFLE(3,1,0,0));       /* fiy2 fix2  -   fiz1 */
    t2 = _mm_shuffle_ps(fiy2,fix1,_MM_SHUFFLE(1,0,0,1));   /* fiy1 fix1  -   fiz2 */

    t1 = _mm_add_ps(t1,t2);
    t1 = _mm_add_ps(t1,t4); /* y x - z */

    _mm_store_ss(fshiftptr+2,t1);
    _mm_storeh_pi((__m64 *)(fshiftptr),t1);
}
2962
2963
2964
/*
 * Sum the four lanes of each force component of three i atoms and add the
 * totals to memory:
 *   - fptr[0..8] receives (x1,y1,z1, x2,y2,z2, x3,y3,z3),
 *   - fshiftptr[0..2] receives the x, y and z sums over the three atoms.
 *
 * Only unaligned, half-register and scalar accesses are used, so neither
 * pointer needs any particular alignment.
 */
static inline void
gmx_mm_update_iforce_3atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 sum_a;            /* lanes 0..3: x1 y1 z1 x2          */
    __m128 sum_b;            /* lanes 0..3: y2 z2 x3 y3          */
    __m128 sum_c;            /* lane 0: z3, other lanes unused   */
    __m128 shift_old,p1,p2,p3,shift_new;

#ifdef GMX_SSE3
    {
        __m128 z3_pairs = _mm_hadd_ps(fiz3,fiz3);

        sum_a = _mm_hadd_ps(_mm_hadd_ps(fix1,fiy1), _mm_hadd_ps(fiz1,fix2));
        sum_b = _mm_hadd_ps(_mm_hadd_ps(fiy2,fiz2), _mm_hadd_ps(fix3,fiy3));
        sum_c = _mm_hadd_ps(z3_pairs,z3_pairs);
    }
#else
    {
        /* SSE2: transpose the two full groups, reduce z3 by hand */
        __m128 z3_hi,z3_lane1,z3_lane3;

        _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
        _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
        z3_hi    = _mm_movehl_ps(_mm_setzero_ps(),fiz3);              /* lanes 2,3 moved down  */
        z3_lane1 = _mm_shuffle_ps(fiz3,fiz3,_MM_SHUFFLE(0,0,0,1));    /* lane 1 in lane 0      */
        z3_lane3 = _mm_shuffle_ps(z3_hi,z3_hi,_MM_SHUFFLE(0,0,0,1));  /* lane 3 in lane 0      */

        sum_a = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
        sum_b = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
        sum_c = _mm_add_ss(_mm_add_ps(fiz3,z3_lane1), _mm_add_ps(z3_hi,z3_lane3));
    }
#endif

    /* Accumulate the nine per-atom components */
    _mm_storeu_ps(fptr,  _mm_add_ps(sum_a,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(sum_b,_mm_loadu_ps(fptr+4)));
    _mm_store_ss (fptr+8,_mm_add_ss(sum_c,_mm_load_ss(fptr+8) ));

    /* Old shift force laid out as ( [2], 0, [0], [1] ) = z - x y from lane 0 */
    shift_old = _mm_loadh_pi(_mm_load_ss(fshiftptr+2),(__m64 *)(fshiftptr));

    /* Rearrange every atom into the same "z - x y" lane layout (lane 1 is scratch) */
    p1 = _mm_shuffle_ps(sum_c,sum_a,_MM_SHUFFLE(1,0,0,0));   /* fiy1 fix1  -   fiz3 */
    p2 = _mm_shuffle_ps(sum_a,sum_b,_MM_SHUFFLE(3,2,2,2));   /* fiy3 fix3  -   fiz1 */
    p3 = _mm_shuffle_ps(sum_b,sum_a,_MM_SHUFFLE(3,3,0,1));   /* fix2 fix2 fiy2 fiz2 */
    p3 = _mm_shuffle_ps(p3   ,p3   ,_MM_SHUFFLE(1,2,0,0));   /* fiy2 fix2  -   fiz2 */

    shift_new = _mm_add_ps(_mm_add_ps(p1,p2), _mm_add_ps(p3,shift_old)); /* y x - z */

    _mm_store_ss(fshiftptr+2,shift_new);
    _mm_storeh_pi((__m64 *)(fshiftptr),shift_new);
}
3016
3017
/*
 * Sum the four lanes of each force component of four i atoms and add the
 * totals to memory:
 *   - fptr[0..11] receives (x,y,z) for atoms 1 to 4 in that order,
 *   - fshiftptr[0..2] receives the x, y and z sums over the four atoms.
 *
 * Only unaligned, half-register and scalar accesses are used, so neither
 * pointer needs any particular alignment.
 */
static inline void
gmx_mm_update_iforce_4atoms_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                               __m128 fix2, __m128 fiy2, __m128 fiz2,
                               __m128 fix3, __m128 fiy3, __m128 fiz3,
                               __m128 fix4, __m128 fiy4, __m128 fiz4,
                               float *fptr,
                               float *fshiftptr)
{
    __m128 sum_a;            /* lanes 0..3: x1 y1 z1 x2 */
    __m128 sum_b;            /* lanes 0..3: y2 z2 x3 y3 */
    __m128 sum_c;            /* lanes 0..3: z3 x4 y4 z4 */
    __m128 shift_old,p1,p2,p3,p4,contrib,shift_new;

#ifdef GMX_SSE3
    sum_a = _mm_hadd_ps(_mm_hadd_ps(fix1,fiy1), _mm_hadd_ps(fiz1,fix2));
    sum_b = _mm_hadd_ps(_mm_hadd_ps(fiy2,fiz2), _mm_hadd_ps(fix3,fiy3));
    sum_c = _mm_hadd_ps(_mm_hadd_ps(fiz3,fix4), _mm_hadd_ps(fiy4,fiz4));
#else
    /* SSE2: transpose each group of four registers, then add the rows */
    _MM_TRANSPOSE4_PS(fix1,fiy1,fiz1,fix2);
    _MM_TRANSPOSE4_PS(fiy2,fiz2,fix3,fiy3);
    _MM_TRANSPOSE4_PS(fiz3,fix4,fiy4,fiz4);

    sum_a = _mm_add_ps(_mm_add_ps(fix1,fiy1), _mm_add_ps(fiz1,fix2));
    sum_b = _mm_add_ps(_mm_add_ps(fiy2,fiz2), _mm_add_ps(fix3,fiy3));
    sum_c = _mm_add_ps(_mm_add_ps(fiz3,fix4), _mm_add_ps(fiy4,fiz4));
#endif

    /* Accumulate the twelve per-atom components */
    _mm_storeu_ps(fptr,  _mm_add_ps(sum_a,_mm_loadu_ps(fptr)  ));
    _mm_storeu_ps(fptr+4,_mm_add_ps(sum_b,_mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8,_mm_add_ps(sum_c,_mm_loadu_ps(fptr+8)));

    /* Old shift force laid out as ( [2], 0, [0], [1] ) = z - x y from lane 0 */
    shift_old = _mm_loadh_pi(_mm_load_ss(fshiftptr+2),(__m64 *)(fshiftptr));

    /* Four partial vectors in "z - x y" layout; together they hold every
     * x, y and z component exactly once (lane 1 is scratch).
     */
    p1 = _mm_shuffle_ps(sum_a,sum_a,_MM_SHUFFLE(1,0,2,2));   /* fiy1 fix1  -   fiz1 */
    p2 = _mm_shuffle_ps(sum_b,sum_b,_MM_SHUFFLE(3,2,1,1));   /* fiy3 fix3  -   fiz2 */
    p3 = _mm_shuffle_ps(sum_c,sum_c,_MM_SHUFFLE(2,1,0,0));   /* fiy4 fix4  -   fiz3 */
    p4 = _mm_shuffle_ps(sum_a,sum_b,_MM_SHUFFLE(0,0,3,3));   /* fiy2 fiy2 fix2 fix2 */
    p4 = _mm_shuffle_ps(sum_c,p4   ,_MM_SHUFFLE(2,0,3,3));   /* fiy2 fix2  -   fiz4 */

    contrib   = _mm_add_ps(_mm_add_ps(p1,p2), _mm_add_ps(p3,p4)); /* y x - z */
    shift_new = _mm_add_ps(shift_old,contrib);

    _mm_store_ss(fshiftptr+2,shift_new);
    _mm_storeh_pi((__m64 *)(fshiftptr),shift_new);
}
3071
3072
/*
 * Add the sum of the four lanes of pot1 to the float at ptr1.
 */
static inline void
gmx_mm_update_1pot_ps(__m128 pot1, float *ptr1)
{
    __m128 pairs;   /* partial sums of two lanes each */
    __m128 total;   /* lane 0 holds the full sum      */
    __m128 old;

#ifdef GMX_SSE3
    pairs = _mm_hadd_ps(pot1,pot1);
    total = _mm_hadd_ps(pairs,pairs);
#else
    /* SSE2: fold the upper half onto the lower half, then lane 1 onto lane 0 */
    pairs = _mm_add_ps(pot1,_mm_movehl_ps(pot1,pot1));
    total = _mm_add_ps(pairs,_mm_shuffle_ps(pairs,pairs,_MM_SHUFFLE(0,0,0,1)));
#endif

    old = _mm_load_ss(ptr1);
    _mm_store_ss(ptr1,_mm_add_ss(total,old));
}
3086
3087
/*
 * Add the sum of the four lanes of pot1 to *ptr1 and the sum of the four
 * lanes of pot2 to *ptr2. *ptr1 is written before *ptr2 is read.
 */
static inline void
gmx_mm_update_2pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2)
{
    __m128 sum1;   /* lane 0: total of pot1 */
    __m128 sum2;   /* lane 0: total of pot2 */

#ifdef GMX_SSE3
    {
        __m128 pairs = _mm_hadd_ps(pot1,pot2);

        sum1 = _mm_hadd_ps(pairs,pairs);
        sum2 = _mm_shuffle_ps(sum1,sum1,_MM_SHUFFLE(0,0,0,1));
    }
#else
    {
        /* SSE2 */
        __m128 upper   = _mm_movehl_ps(pot2,pot1);    /* 2d 2c 1d 1c */
        __m128 lower   = _mm_movelh_ps(pot1,pot2);    /* 2b 2a 1b 1a */
        __m128 halves  = _mm_add_ps(upper,lower);     /* 2  2  1  1  */
        __m128 odd     = _mm_shuffle_ps(halves,halves,_MM_SHUFFLE(3,3,1,1));

        sum1 = _mm_add_ps(halves,odd);                /* -  2  -  1  */
        sum2 = _mm_movehl_ps(odd,sum1);               /* -  -  -  2  */
    }
#endif

    _mm_store_ss(ptr1,_mm_add_ss(sum1,_mm_load_ss(ptr1)));
    _mm_store_ss(ptr2,_mm_add_ss(sum2,_mm_load_ss(ptr2)));
}
3109
3110
/*
 * Add the lane sum of each of pot1..pot4 to the float at the matching
 * pointer ptr1..ptr4. The four targets are updated in order, each one being
 * read immediately before it is written.
 */
static inline void
gmx_mm_update_4pot_ps(__m128 pot1, float *ptr1, __m128 pot2, float *ptr2, __m128 pot3, float *ptr3, __m128 pot4, float *ptr4)
{
    __m128 totals;   /* lane k holds the total of pot(k+1) */
    __m128 lane1,lane2,lane3;

    /* After the transpose, adding the four rows sums each input across its lanes */
    _MM_TRANSPOSE4_PS(pot1,pot2,pot3,pot4);
    totals = _mm_add_ps(_mm_add_ps(pot1,pot2),_mm_add_ps(pot3,pot4));

    /* Broadcast lanes 1-3 so each total is available in lane 0 */
    lane1  = _mm_shuffle_ps(totals,totals,_MM_SHUFFLE(1,1,1,1));
    lane2  = _mm_shuffle_ps(totals,totals,_MM_SHUFFLE(2,2,2,2));
    lane3  = _mm_shuffle_ps(totals,totals,_MM_SHUFFLE(3,3,3,3));

    _mm_store_ss(ptr1,_mm_add_ss(totals,_mm_load_ss(ptr1)));
    _mm_store_ss(ptr2,_mm_add_ss(lane1 ,_mm_load_ss(ptr2)));
    _mm_store_ss(ptr3,_mm_add_ss(lane2 ,_mm_load_ss(ptr3)));
    _mm_store_ss(ptr4,_mm_add_ss(lane3 ,_mm_load_ss(ptr4)));
}
3126