/* [gromacs/qmmm-gamess-us.git] src/mdlib/genborn_sse2_double.c */
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
5 #include <math.h>
6 #include <string.h>
8 #include "typedefs.h"
9 #include "smalloc.h"
10 #include "genborn.h"
11 #include "vec.h"
12 #include "grompp.h"
13 #include "pdbio.h"
14 #include "names.h"
15 #include "physics.h"
16 #include "domdec.h"
17 #include "partdec.h"
18 #include "network.h"
19 #include "gmx_fatal.h"
20 #include "mtop_util.h"
21 #include "genborn.h"
23 #ifdef GMX_LIB_MPI
24 #include <mpi.h>
25 #endif
26 #ifdef GMX_THREADS
27 #include "tmpi.h"
28 #endif
30 /* Only compile this file if SSE2 intrinsics are available */
31 #if ( (defined(GMX_IA32_SSE2) || defined(GMX_X86_64_SSE2) || defined(GMX_SSE2)) && defined(GMX_DOUBLE) )
32 #include <xmmintrin.h>
33 #include <emmintrin.h>
35 #if (defined (_MSC_VER) || defined(__INTEL_COMPILER))
36 #define gmx_castsi128_pd(a) _mm_castsi128_pd(a)
37 #define gmx_castpd_si128(a) _mm_castpd_si128(a)
38 #elif defined(__GNUC__)
39 #define gmx_castsi128_pd(a) ((__m128d)(a))
40 #define gmx_castpd_si128(a) ((__m128i)(a))
41 #else
42 static __m128d gmx_castsi128_pd(__m128i a) { return *(__m128d *) &a; }
43 static __m128i gmx_castpd_si128(__m128d a) { return *(__m128i *) &a; }
44 #endif
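/*
 * sincos_sse2double() below appears to follow the usual SIMD sincos recipe:
 * the argument is scaled by 2/pi and rounded to find the quadrant, the
 * remainder is recovered with the two-constant (Cody-Waite style) split
 * kc1/kc2, separate minimax polynomials are evaluated for sin and cos on the
 * reduced argument, and the quadrant offsets then select and sign-correct
 * the two results.
 */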
47 static inline void
48 sincos_sse2double(__m128d x, __m128d *sinval, __m128d *cosval)
50 const __m128d two_over_pi = {2.0/M_PI,2.0/M_PI};
51 const __m128d half = {0.5,0.5};
52 const __m128d one = {1.0,1.0};
53 const __m128i izero = _mm_set1_epi32(0);
54 const __m128i ione = _mm_set1_epi32(1);
55 const __m128i itwo = _mm_set1_epi32(2);
56 const __m128i ithree = _mm_set1_epi32(3);
57 const __m128d sincosd_kc1 = {(13176794.0 / 8388608.0),(13176794.0 / 8388608.0)};
58 const __m128d sincosd_kc2 = {7.5497899548918821691639751442098584e-8,7.5497899548918821691639751442098584e-8};
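/* kc1 (13176794/2^23) is a truncated pi/2 and kc2 carries the remaining
 * low-order bits, so q*pi/2 can be subtracted in two steps with extra
 * precision during the argument reduction below. */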
59 const __m128d sincosd_cc0 = {0.00000000206374484196,0.00000000206374484196};
60 const __m128d sincosd_cc1 = {-0.00000027555365134677,-0.00000027555365134677};
61 const __m128d sincosd_cc2 = {0.00002480157946764225,0.00002480157946764225};
62 const __m128d sincosd_cc3 = {-0.00138888888730525966,-0.00138888888730525966};
63 const __m128d sincosd_cc4 = {0.04166666666651986722,0.04166666666651986722};
64 const __m128d sincosd_cc5 = {-0.49999999999999547304,-0.49999999999999547304};
65 const __m128d sincosd_sc0 = {0.00000000015893606014,0.00000000015893606014};
66 const __m128d sincosd_sc1 = {-0.00000002505069049138,-0.00000002505069049138};
67 const __m128d sincosd_sc2 = {0.00000275573131527032,0.00000275573131527032};
68 const __m128d sincosd_sc3 = {-0.00019841269827816117,-0.00019841269827816117};
69 const __m128d sincosd_sc4 = {0.00833333333331908278,0.00833333333331908278};
70 const __m128d sincosd_sc5 = {-0.16666666666666612594,-0.16666666666666612594};
72 __m128d signbit = gmx_castsi128_pd(_mm_set1_epi64x(0x8000000000000000ULL));
73 __m128d tiny = gmx_castsi128_pd(_mm_set1_epi64x(0x3e40000000000000ULL));
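/* signbit selects only the IEEE sign bit; tiny corresponds to 2^-27, the
 * threshold below which the small-angle approximation further down is used. */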
75 __m128d xl,xl2,xl3,qd,absxl,p1,cx,sx,ts,tc,tsn,tcn;
76 __m128i q;
77 __m128i offsetSin,offsetCos;
78 __m128d sinMask,cosMask,isTiny;
79 __m128d ct0,ct1,ct2,ct3,ct4,ct5,ct6,st1,st2,st3,st4,st6;
81 /* Rescale the angle to the range 0..4, and find which quadrant it is in */
82 xl = _mm_mul_pd(x,two_over_pi);
84 /* q=integer part of xl, rounded _away_ from 0.0 */
85 /* Add 0.5 away from 0.0 */
86 xl = _mm_add_pd(xl,_mm_or_pd(_mm_and_pd(xl,signbit),half));
88 q = _mm_cvttpd_epi32(xl);
89 qd = _mm_cvtepi32_pd(q);
90 q = _mm_shuffle_epi32(q,_MM_SHUFFLE(1,1,0,0));
92 /* Compute offset based on quadrant the arg falls in */
93 offsetSin = _mm_and_si128(q,ithree);
94 offsetCos = _mm_add_epi32(offsetSin,ione);
96 /* Remainder in range [-pi/4..pi/4] */
97 p1 = _mm_mul_pd(qd,sincosd_kc1);
98 xl = _mm_mul_pd(qd,sincosd_kc2);
99 p1 = _mm_sub_pd(x,p1);
100 xl = _mm_sub_pd(p1,xl);
102 absxl = _mm_andnot_pd(signbit,xl);
103 isTiny = _mm_cmpgt_pd(tiny,absxl);
105 xl2 = _mm_mul_pd(xl,xl);
106 xl3 = _mm_mul_pd(xl2,xl);
108 ct0 = _mm_mul_pd(xl2,xl2);
109 ct1 = _mm_mul_pd(sincosd_cc0,xl2);
110 ct2 = _mm_mul_pd(sincosd_cc2,xl2);
111 ct3 = _mm_mul_pd(sincosd_cc4,xl2);
112 st1 = _mm_mul_pd(sincosd_sc0,xl2);
113 st2 = _mm_mul_pd(sincosd_sc2,xl2);
114 st3 = _mm_mul_pd(sincosd_sc4,xl2);
115 ct1 = _mm_add_pd(ct1,sincosd_cc1);
116 ct2 = _mm_add_pd(ct2,sincosd_cc3);
117 ct3 = _mm_add_pd(ct3,sincosd_cc5);
118 st1 = _mm_add_pd(st1,sincosd_sc1);
119 st2 = _mm_add_pd(st2,sincosd_sc3);
120 st3 = _mm_add_pd(st3,sincosd_sc5);
122 ct4 = _mm_mul_pd(ct2,ct0);
123 ct4 = _mm_add_pd(ct4,ct3);
125 st4 = _mm_mul_pd(st2,ct0);
126 st4 = _mm_add_pd(st4,st3);
127 ct5 = _mm_mul_pd(ct0,ct0);
129 ct6 = _mm_mul_pd(ct5,ct1);
130 ct6 = _mm_add_pd(ct6,ct4);
132 st6 = _mm_mul_pd(ct5,st1);
133 st6 = _mm_add_pd(st6,st4);
135 cx = _mm_mul_pd(ct6,xl2);
136 cx = _mm_add_pd(cx,one);
138 sx = _mm_mul_pd(st6,xl3);
139 sx = _mm_add_pd(sx,xl);
141 /* Small angle approximation, sin(tiny)=tiny, cos(tiny)=1.0 */
142 sx = _mm_or_pd( _mm_and_pd(isTiny,xl) , _mm_andnot_pd(isTiny,sx) );
143 cx = _mm_or_pd( _mm_and_pd(isTiny,one) , _mm_andnot_pd(isTiny,cx) );
145 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,ione), izero));
146 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,ione), izero));
148 ts = _mm_or_pd( _mm_and_pd(sinMask,sx) , _mm_andnot_pd(sinMask,cx) );
149 tc = _mm_or_pd( _mm_and_pd(cosMask,sx) , _mm_andnot_pd(cosMask,cx) );
152 /* Flip the sign of the result when (offset mod 4) = 2 or 3 */
152 sinMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetSin,itwo), izero));
153 tsn = _mm_xor_pd(signbit,ts);
154 ts = _mm_or_pd( _mm_and_pd(sinMask,ts) , _mm_andnot_pd(sinMask,tsn) );
156 cosMask = gmx_castsi128_pd(_mm_cmpeq_epi32( _mm_and_si128(offsetCos,itwo), izero));
157 tcn = _mm_xor_pd(signbit,tc);
158 tc = _mm_or_pd( _mm_and_pd(cosMask,tc) , _mm_andnot_pd(cosMask,tcn) );
160 *sinval = ts;
161 *cosval = tc;
163 return;
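/*
 * log_pd(): essentially log(x) = (e + log2(m))*ln(2), where e is the unbiased
 * exponent pulled out of the double and m is the mantissa forced into [1,2).
 * log2(m) is approximated by the rational form (m-1)*P(m)/Q(m), and 1/Q(m)
 * is obtained from _mm_rcp_ps() refined with two Newton-Raphson steps.
 */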
166 __m128d log_pd(__m128d x)
168 const __m128i exp_mask = _mm_set_epi32(0x7FF00000,0,0x7FF00000,0);
169 const __m128i exp_bias = _mm_set_epi32(0,1023,0,1023);
171 const __m128d const_loge = _mm_set1_pd(0.69314718055994529);
172 const __m128d const_one = _mm_set1_pd(1.0);
173 const __m128d const_two = _mm_set1_pd(2.0);
175 /* Almost full single precision accuracy (~20 bits worst case) */
176 const __m128d P0 = _mm_set1_pd(6.108179944792157749153050);
177 const __m128d P1 = _mm_set1_pd(52.43691313715523327631139);
178 const __m128d P2 = _mm_set1_pd(71.53664010795613671168440);
179 const __m128d P3 = _mm_set1_pd(18.92097516931559547548485);
180 const __m128d P4 = _mm_set1_pd(0.3504714784635941984522153);
181 const __m128d P5 = _mm_set1_pd(-0.007105890734229368515879);
182 const __m128d Q1 = _mm_set1_pd(17.73314231909420567454406);
183 const __m128d Q2 = _mm_set1_pd(48.82373085428713023213363);
184 const __m128d Q3 = _mm_set1_pd(31.65945943354513166309101);
185 const __m128d Q4 = _mm_set1_pd(4.302477477108162270199051);
187 __m128d xmm0,xmm1,xmm2,xmm3, xmm4;
188 __m128i xmmi,xmmj;
190 xmmi = gmx_castpd_si128(x);
191 xmm1 = _mm_cvtepi32_pd(_mm_shuffle_epi32(_mm_sub_epi64(_mm_srli_epi64(_mm_and_si128(xmmi, exp_mask), 52), exp_bias),_MM_SHUFFLE(3,1,2,0)));
192 xmm0 = _mm_or_pd(gmx_castsi128_pd(_mm_andnot_si128(exp_mask, xmmi)), const_one);
194 xmm2 = _mm_mul_pd(P5,xmm0);
195 xmm2 = _mm_add_pd(xmm2,P4);
196 xmm2 = _mm_mul_pd(xmm2,xmm0);
197 xmm2 = _mm_add_pd(xmm2,P3);
198 xmm2 = _mm_mul_pd(xmm2,xmm0);
199 xmm2 = _mm_add_pd(xmm2,P2);
200 xmm2 = _mm_mul_pd(xmm2,xmm0);
201 xmm2 = _mm_add_pd(xmm2,P1);
202 xmm2 = _mm_mul_pd(xmm2,xmm0);
203 xmm2 = _mm_add_pd(xmm2,P0);
205 xmm3 = _mm_mul_pd(Q4,xmm0);
206 xmm3 = _mm_add_pd(xmm3,Q3);
207 xmm3 = _mm_mul_pd(xmm3,xmm0);
208 xmm3 = _mm_add_pd(xmm3,Q2);
209 xmm3 = _mm_mul_pd(xmm3,xmm0);
210 xmm3 = _mm_add_pd(xmm3,Q1);
211 xmm3 = _mm_mul_pd(xmm3,xmm0);
212 xmm3 = _mm_add_pd(xmm3,const_one);
214 /* xmm4=1.0/xmm3 */
215 xmm4 = _mm_cvtps_pd(_mm_rcp_ps(_mm_cvtpd_ps(xmm3)));
216 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
217 xmm4 = _mm_mul_pd(xmm4,_mm_sub_pd(const_two,_mm_mul_pd(xmm3,xmm4)));
218 xmm2 = _mm_mul_pd(xmm2,xmm4);
220 xmm0 = _mm_sub_pd(xmm0, const_one);
221 xmm0 = _mm_mul_pd(xmm0,xmm2);
223 xmm0 = _mm_add_pd(xmm0,xmm1);
225 return _mm_mul_pd(xmm0, const_loge);
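/*
 * exp_pd(): the argument is converted to base 2 (y = x*log2(e), clamped to
 * roughly [-1023,1025]), split into an integer part n and a remainder in
 * [0,1); 2^n is built by adding the exponent bias (1023) and shifting into
 * the exponent field, 2^remainder comes from the degree-9 polynomial below,
 * and the two factors are multiplied together.
 */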
228 /* This exp() routine provides accuracy of 10E-9 to 10E-11.
229 * The polynomial minimax coefficients are actually accurate to 10E-14,
230  * but we lose some accuracy in the polynomial evaluation. */
232 __m128d exp_pd(__m128d x)
234 const __m128d lim1 = _mm_set1_pd(1025.0); /* 1025.00000e+0d */
235 const __m128d lim2 = _mm_set1_pd(-1022.99999999); /* -1022.99999e+0f */
237 const __m128i base = _mm_set_epi32(0,0,1023,1023);
238 const __m128d half = _mm_set1_pd(0.5);
239 const __m128d log2e = _mm_set1_pd(1.4426950408889634);
241 const __m128d exp_P0 = _mm_set1_pd(1.00000000000001276211229749);
242 const __m128d exp_P1 = _mm_set1_pd(6.931471805598709708635169E-1);
243 const __m128d exp_P2 = _mm_set1_pd(2.402265069564965287455972E-1);
244 const __m128d exp_P3 = _mm_set1_pd(5.550410866868561155599683E-2);
245 const __m128d exp_P4 = _mm_set1_pd(9.618129192067246128919915E-3);
246 const __m128d exp_P5 = _mm_set1_pd(1.333355761760444302342084E-3);
247 const __m128d exp_P6 = _mm_set1_pd(1.540343494807179111289781E-4);
248 const __m128d exp_P7 = _mm_set1_pd(1.525298483865349629325421E-5);
249 const __m128d exp_P8 = _mm_set1_pd(1.325940560934510417818578E-6);
250 const __m128d exp_P9 = _mm_set1_pd(1.015033670529892589443421E-7);
252 __m128d xmm0,xmm1;
253 __m128i xmmi;
255 xmm0 = _mm_mul_pd(x,log2e);
256 xmm0 = _mm_min_pd(xmm0,lim1);
257 xmm0 = _mm_max_pd(xmm0,lim2);
258 xmm1 = _mm_sub_pd(xmm0,half);
260 xmmi = _mm_cvtpd_epi32(xmm1);
261 xmm1 = _mm_cvtepi32_pd(xmmi);
263 xmmi = _mm_add_epi32(xmmi,base);
264 xmmi = _mm_shuffle_epi32(xmmi,_MM_SHUFFLE(3,1,2,0));
265 xmmi = _mm_slli_epi64(xmmi,52);
267 xmm0 = _mm_sub_pd(xmm0,xmm1);
269 xmm1 = _mm_mul_pd(exp_P9,xmm0);
270 xmm1 = _mm_add_pd(xmm1,exp_P8);
271 xmm1 = _mm_mul_pd(xmm1,xmm0);
272 xmm1 = _mm_add_pd(xmm1,exp_P7);
273 xmm1 = _mm_mul_pd(xmm1,xmm0);
274 xmm1 = _mm_add_pd(xmm1,exp_P6);
275 xmm1 = _mm_mul_pd(xmm1,xmm0);
276 xmm1 = _mm_add_pd(xmm1,exp_P5);
277 xmm1 = _mm_mul_pd(xmm1,xmm0);
278 xmm1 = _mm_add_pd(xmm1,exp_P4);
279 xmm1 = _mm_mul_pd(xmm1,xmm0);
280 xmm1 = _mm_add_pd(xmm1,exp_P3);
281 xmm1 = _mm_mul_pd(xmm1,xmm0);
282 xmm1 = _mm_add_pd(xmm1,exp_P2);
283 xmm1 = _mm_mul_pd(xmm1,xmm0);
284 xmm1 = _mm_add_pd(xmm1,exp_P1);
285 xmm1 = _mm_mul_pd(xmm1,xmm0);
286 xmm1 = _mm_add_pd(xmm1,exp_P0);
287 xmm1 = _mm_mul_pd(xmm1,gmx_castsi128_pd(xmmi));
289 return xmm1;
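/* my_invrsq_pd(): 1/sqrt(x) is seeded from the ~12-bit single-precision
 * _mm_rsqrt_ps() estimate and refined with two Newton-Raphson iterations,
 * y' = 0.5*y*(3 - x*y*y). */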
292 static inline __m128d
293 my_invrsq_pd(__m128d x)
295 const __m128d three = {3.0, 3.0};
296 const __m128d half = {0.5, 0.5};
298 __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
299 __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */
301 /* First Newton-Raphson step, accuracy is now 24 bits */
302 __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));
304 /* Return second Newton-Raphson step, accuracy 48 bits */
305 return _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
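/* my_inv_pd(): same approach for 1/x, seeded from _mm_rcp_ps() and refined
 * with two Newton-Raphson iterations, y' = y*(2 - x*y). */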
308 static inline __m128d
309 my_inv_pd(__m128d x)
311 const __m128d two = {2.0, 2.0};
313 __m128 t = _mm_rcp_ps(_mm_cvtpd_ps(x));
314 __m128d t1 = _mm_cvtps_pd(t);
315 __m128d t2 = _mm_mul_pd(t1,_mm_sub_pd(two,_mm_mul_pd(t1,x)));
317 return _mm_mul_pd(t2,_mm_sub_pd(two,_mm_mul_pd(t2,x)));
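/*
 * calc_gb_rad_still_sse2_double(): Born radii using the Still analytical
 * formula (see the STILL_* constants). For each i atom the GB neighbor list
 * is processed two j atoms at a time; each pair adds p4*vsolv_j*ccf/r^4 to
 * the i-atom sum and the symmetric term to gpol_still_work[j], where ccf is
 * a cosine-based switching function of r^2/(rai+raj)^2. Two chain-rule
 * prefactors per pair (ai->aj and aj->ai) are stored in fr->dadx for use in
 * calc_gb_chainrule_sse2_double() further down.
 */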
321 int
322 calc_gb_rad_still_sse2_double(t_commrec *cr, t_forcerec *fr,int natoms, gmx_localtop_t *top,
323 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
325 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23;
326 int at0,at1,nj0,nj1,offset,taj1,taj2;
327 int shift;
329 double factor,gpi_ai,gpi_tmp,gpi2;
330 double shX,shY,shZ;
332 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
333 __m128d t1,t2,t3,rsq11,rinv,rinv2,rinv4,rinv6;
334 __m128d ratio,gpi,rai,raj,vaj,rvdw,mask_cmp;
335 __m128d ccf,dccf,theta,cosq,term,sinq,res,prod;
336 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,vai,prod_ai,icf4,icf6;
338 const __m128d half = {0.5, 0.5};
339 const __m128d three = {3.0, 3.0};
340 const __m128d one = {1.0, 1.0};
341 const __m128d two = {2.0, 2.0};
342 const __m128d zero = {0.0, 0.0};
343 const __m128d four = {4.0, 4.0};
345 const __m128d p5inv = {STILL_P5INV, STILL_P5INV};
346 const __m128d pip5 = {STILL_PIP5, STILL_PIP5};
347 const __m128d p4 = {STILL_P4, STILL_P4};
349 factor = 0.5 * ONE_4PI_EPS0;
350 n = 0;
352 /* Keep the compiler happy */
353 raj = _mm_setzero_pd();
354 vaj = _mm_setzero_pd();
355 jx = _mm_setzero_pd();
356 jy = _mm_setzero_pd();
357 jz = _mm_setzero_pd();
358 xmm2 = _mm_setzero_pd();
359 xmm7 = _mm_setzero_pd();
361 for(i=0;i<born->nr;i++)
363 born->gpol_still_work[i]=0;
366 for(i=0;i<nl->nri;i++)
368 ai = nl->iinr[i];
369 ai3 = ai * 3;
371 nj0 = nl->jindex[i];
372 nj1 = nl->jindex[i+1];
374 /* Load shifts for this list */
375 shift = nl->shift[i];
376 shX = fr->shift_vec[shift][0];
377 shY = fr->shift_vec[shift][1];
378 shZ = fr->shift_vec[shift][2];
380 /* Splat the shifts */
381 sX = _mm_load1_pd(&shX);
382 sY = _mm_load1_pd(&shY);
383 sZ = _mm_load1_pd(&shZ);
385 offset = (nj1-nj0)%2;
387 /* Polarization energy for atom ai */
388 gpi = _mm_setzero_pd();
390 /* Load particle ai coordinates and add shifts */
391 ix = _mm_load1_pd(x+ai3);
392 iy = _mm_load1_pd(x+ai3+1);
393 iz = _mm_load1_pd(x+ai3+2);
395 ix = _mm_add_pd(sX,ix);
396 iy = _mm_add_pd(sY,iy);
397 iz = _mm_add_pd(sZ,iz);
399 /* Load particle ai gb_radius */
400 rai = _mm_set1_pd(top->atomtypes.gb_radius[md->typeA[ai]]);
401 vai = _mm_set1_pd(born->vsolv[ai]);
402 prod_ai = _mm_mul_pd(p4,vai);
404 for(k=nj0;k<nj1-offset;k+=2)
406 aj1 = nl->jjnr[k];
407 aj2 = nl->jjnr[k+1];
409 aj13 = aj1 * 3;
410 aj23 = aj2 * 3;
412 taj1 = md->typeA[aj1];
413 taj2 = md->typeA[aj2];
415 /* Load particle aj1-2 coordinates and compute ai->aj distances */
416 xmm1 = _mm_loadu_pd(x+aj13);
417 xmm2 = _mm_loadu_pd(x+aj23);
418 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
419 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
421 jz = _mm_loadl_pd(jz,x+aj13+2);
422 jz = _mm_loadh_pd(jz,x+aj23+2);
424 dx = _mm_sub_pd(ix,jx);
425 dy = _mm_sub_pd(iy,jy);
426 dz = _mm_sub_pd(iz,jz);
428 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
429 rinv = my_invrsq_pd(rsq11);
431 rinv2 = _mm_mul_pd(rinv,rinv);
432 rinv4 = _mm_mul_pd(rinv2,rinv2);
433 rinv6 = _mm_mul_pd(rinv4,rinv2);
435 vaj = _mm_loadl_pd(vaj,born->vsolv+aj1);
436 vaj = _mm_loadh_pd(vaj,born->vsolv+aj2);
438 raj = _mm_loadl_pd(raj, top->atomtypes.gb_radius+taj1);
439 raj = _mm_loadh_pd(raj, top->atomtypes.gb_radius+taj2);
441 rvdw = _mm_add_pd(rai,raj);
442 rvdw = _mm_mul_pd(rvdw,rvdw);
443 ratio = _mm_div_pd(rsq11,rvdw);
445 mask_cmp = _mm_cmpgt_pd(ratio,p5inv);
447 switch(_mm_movemask_pd(mask_cmp))
449 case 0x3: /* both lanes of the double-precision mask set */
450 ccf = one;
451 dccf = zero;
452 break;
453 default:
454 theta = _mm_mul_pd(ratio,pip5);
455 sincos_sse2double(theta,&sinq,&cosq);
457 term = _mm_sub_pd(one,cosq);
458 term = _mm_mul_pd(half,term);
459 ccf = _mm_mul_pd(term,term);
460 dccf = _mm_mul_pd(two,term);
461 dccf = _mm_mul_pd(dccf,sinq);
462 dccf = _mm_mul_pd(dccf,pip5);
463 dccf = _mm_mul_pd(dccf,ratio);
465 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
466 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
469 prod = _mm_mul_pd(p4,vaj);
470 icf4 = _mm_mul_pd(ccf,rinv4);
471 xmm2 = _mm_mul_pd(icf4,prod);
472 xmm3 = _mm_mul_pd(icf4,prod_ai);
473 gpi = _mm_add_pd(gpi,xmm2);
475 /* Load, add and store atom aj gpol energy */
476 xmm7 = _mm_loadl_pd(xmm7,born->gpol_still_work+aj1);
477 xmm7 = _mm_loadh_pd(xmm7,born->gpol_still_work+aj2);
479 xmm3 = _mm_add_pd(xmm7,xmm3);
481 _mm_storel_pd(born->gpol_still_work+aj1,xmm3);
482 _mm_storeh_pd(born->gpol_still_work+aj2,xmm3);
484 /* Chain rule terms */
485 ccf = _mm_mul_pd(four,ccf);
486 xmm3 = _mm_sub_pd(ccf,dccf);
487 icf6 = _mm_mul_pd(xmm3,rinv6);
488 xmm1 = _mm_mul_pd(icf6,prod);
489 xmm2 = _mm_mul_pd(icf6,prod_ai);
491 /* As with single precision, we need to shuffle the data around, to change the order of
492  * the interactions from ai->aj1, ai->aj2 to ai->aj1, aj1->ai, ai->aj2, aj2->ai etc.,
493  * in order to do 2 instead of 4 store operations. */
495 xmm3 = _mm_unpacklo_pd(xmm1,xmm2);
496 xmm4 = _mm_unpackhi_pd(xmm1,xmm2);
498 _mm_storeu_pd(fr->dadx+n,xmm3);
499 n = n + 2;
500 _mm_storeu_pd(fr->dadx+n,xmm4);
501 n = n + 2;
504 /* Deal with odd elements */
505 if(offset!=0)
507 aj1 = nl->jjnr[k];
508 aj13 = aj1 * 3;
509 taj1 = md->typeA[aj1];
511 jx = _mm_load_sd(x+aj13);
512 jy = _mm_load_sd(x+aj13+1);
513 jz = _mm_load_sd(x+aj13+2);
515 raj = _mm_load_sd(top->atomtypes.gb_radius+taj1);
516 vaj = _mm_load_sd(born->vsolv+aj1);
518 dx = _mm_sub_sd(ix,jx);
519 dy = _mm_sub_pd(iy,jy);
520 dz = _mm_sub_pd(iz,jz);
522 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
523 rinv = my_invrsq_pd(rsq11);
525 rinv2 = _mm_mul_sd(rinv,rinv);
526 rinv4 = _mm_mul_sd(rinv2,rinv2);
527 rinv6 = _mm_mul_sd(rinv4,rinv2);
529 rvdw = _mm_add_sd(rai,raj);
530 rvdw = _mm_mul_sd(rvdw,rvdw);
531 ratio = _mm_div_sd(rsq11,rvdw);
533 mask_cmp = _mm_cmpgt_sd(ratio,p5inv);
535 switch(_mm_movemask_pd(mask_cmp))
537 case 0x3: /* both lanes of the double-precision mask set */
538 ccf = one;
539 dccf = zero;
540 break;
541 default:
542 theta = _mm_mul_sd(ratio,pip5);
543 sincos_sse2double(theta,&sinq,&cosq);
545 term = _mm_sub_sd(one,cosq);
546 term = _mm_mul_sd(half,term);
547 ccf = _mm_mul_sd(term,term);
548 dccf = _mm_mul_sd(two,term);
549 dccf = _mm_mul_sd(dccf,sinq);
550 dccf = _mm_mul_sd(dccf,pip5);
551 dccf = _mm_mul_sd(dccf,ratio);
553 ccf = _mm_or_pd(_mm_and_pd(mask_cmp,one) ,_mm_andnot_pd(mask_cmp,ccf));
554 dccf = _mm_or_pd(_mm_and_pd(mask_cmp,zero) ,_mm_andnot_pd(mask_cmp,dccf));
557 prod = _mm_mul_sd(p4,vaj);
558 icf4 = _mm_mul_sd(ccf,rinv4);
559 xmm2 = _mm_mul_sd(icf4,prod);
560 xmm3 = _mm_mul_sd(icf4,prod_ai);
561 gpi = _mm_add_sd(gpi,xmm2);
563 /* Load, add and store atom aj gpol energy */
564 xmm7 = _mm_load_sd(born->gpol_still_work+aj1);
565 xmm3 = _mm_add_sd(xmm7,xmm3);
566 _mm_store_sd(born->gpol_still_work+aj1,xmm3);
568 /* Chain rule terms */
569 ccf = _mm_mul_sd(four,ccf);
570 xmm3 = _mm_sub_sd(ccf,dccf);
571 icf6 = _mm_mul_sd(xmm3,rinv6);
572 xmm1 = _mm_mul_sd(icf6,prod);
573 xmm2 = _mm_mul_sd(icf6,prod_ai);
575 /* Here we only have ai->aj1 and aj1->ai, so we can store directly */
576 _mm_storel_pd(fr->dadx+n,xmm1);
577 n = n + 1;
578 _mm_storel_pd(fr->dadx+n,xmm2);
579 n = n + 1;
580 } /* End offset */
582 /* Do end processing ... */
583 xmm2 = _mm_unpacklo_pd(xmm2,gpi);
584 gpi = _mm_add_pd(gpi,xmm2);
585 gpi = _mm_shuffle_pd(gpi,gpi,_MM_SHUFFLE2(1,1));
587 /* Load, add and store atom ai polarisation energy */
588 xmm2 = _mm_load_sd(born->gpol_still_work+ai);
589 gpi = _mm_add_sd(gpi,xmm2);
590 _mm_store_sd(born->gpol_still_work+ai,gpi);
593 /* Parallel summations */
594 if(PARTDECOMP(cr))
596 gmx_sum(natoms,born->gpol_still_work, cr);
598 else if(DOMAINDECOMP(cr))
600 dd_atom_sum_real(cr->dd, born->gpol_still_work);
603 /* Compute the radii */
604 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
606 if(born->use[i] != 0)
608 gpi_ai = born->gpol[i] + born->gpol_still_work[i];
609 gpi2 = gpi_ai*gpi_ai;
611 born->bRad[i]=factor*gmx_invsqrt(gpi2);
612 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
616 /* Extra (local) communication required for DD */
617 if(DOMAINDECOMP(cr))
619 dd_atom_spread_real(cr->dd, born->bRad);
620 dd_atom_spread_real(cr->dd, fr->invsqrta);
623 return 0;
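/*
 * calc_gb_rad_hct_sse2_double(): HCT (Hawkins-Cramer-Truhlar) Born radii.
 * Each pair contributes a descreening term bounded by lij = 1/max(rai,r-sk)
 * and uij = 1/(r+sk) to gpol_hct_work for both atoms; the radius is then
 * 1/(1/rai' - sum), with rai' the dielectric-offset radius, clamped from
 * below at the unmodified radius. As above, two chain-rule factors per pair
 * go into fr->dadx.
 */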
626 int
627 calc_gb_rad_hct_sse2_double(t_commrec *cr, t_forcerec *fr, int natoms, gmx_localtop_t *top,
628 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born, t_mdatoms *md)
630 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
631 int p1, p2, shift;
632 double rr,sum,sum_tmp,min_rad,rad,doff;
633 double shX,shY,shZ;
635 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
636 __m128d t1,t2,t3,rsq11,rinv,r,rai;
637 __m128d rai_inv,sk,sk2,lij,dlij,duij;
638 __m128d uij,lij2,uij2,lij3,uij3,diff2;
639 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
640 __m128d mask_cmp,mask_cmp2,mask_cmp3;
641 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9,doffset;
642 __m128d sum_ai,chrule,chrule_ai,tmp_ai;
643 __m128d sk_ai, sk2_ai,raj,raj_inv;
645 const __m128d neg = {-1.0, -1.0};
646 const __m128d zero = {0.0, 0.0};
647 const __m128d eigth = {0.125, 0.125};
648 const __m128d qrtr = {0.25, 0.25};
649 const __m128d half = {0.5, 0.5};
650 const __m128d one = {1.0, 1.0};
651 const __m128d two = {2.0, 2.0};
652 const __m128d three = {3.0, 3.0};
654 /* Keep the compiler happy */
655 tmp_ai = _mm_setzero_pd();
656 tmp = _mm_setzero_pd();
657 xmm7 = _mm_setzero_pd();
658 xmm8 = _mm_setzero_pd();
659 raj_inv = _mm_setzero_pd();
660 raj = _mm_setzero_pd();
661 sk = _mm_setzero_pd();
662 jx = _mm_setzero_pd();
663 jy = _mm_setzero_pd();
664 jz = _mm_setzero_pd();
666 /* Set the dielectric offset */
667 doff = born->gb_doffset;
668 doffset = _mm_load1_pd(&doff);
669 n = 0;
671 for(i=0;i<born->nr;i++)
673 born->gpol_hct_work[i] = 0;
676 for(i=0;i<nl->nri;i++)
678 ai = nl->iinr[i];
679 ai3 = ai * 3;
681 nj0 = nl->jindex[i];
682 nj1 = nl->jindex[i+1];
684 /* Load shifts for this list */
685 shift = nl->shift[i];
686 shX = fr->shift_vec[shift][0];
687 shY = fr->shift_vec[shift][1];
688 shZ = fr->shift_vec[shift][2];
690 /* Splat the shifts */
691 sX = _mm_load1_pd(&shX);
692 sY = _mm_load1_pd(&shY);
693 sZ = _mm_load1_pd(&shZ);
695 offset = (nj1-nj0)%2;
697 /* Load rai */
698 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
699 rai = _mm_load1_pd(&rr);
700 rr = 1.0/rr;
701 rai_inv = _mm_load1_pd(&rr);
703 /* Zero out sums for polarisation energies */
704 sum_ai = _mm_setzero_pd();
706 /* Load ai coordinates and add shifts */
707 ix = _mm_load1_pd(x+ai3);
708 iy = _mm_load1_pd(x+ai3+1);
709 iz = _mm_load1_pd(x+ai3+2);
711 ix = _mm_add_pd(sX,ix);
712 iy = _mm_add_pd(sY,iy);
713 iz = _mm_add_pd(sZ,iz);
715 sk_ai = _mm_load1_pd(born->param+ai);
716 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
718 for(k=nj0;k<nj1-offset;k+=2)
720 aj1 = nl->jjnr[k];
721 aj2 = nl->jjnr[k+1];
723 aj13 = aj1 * 3;
724 aj23 = aj2 * 3;
726 /* Load particle aj1-2 coordinates */
727 xmm1 = _mm_loadu_pd(x+aj13);
728 xmm2 = _mm_loadu_pd(x+aj23);
729 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
730 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
732 jz = _mm_loadl_pd(jz, x+aj13+2);
733 jz = _mm_loadh_pd(jz, x+aj23+2);
735 dx = _mm_sub_pd(ix,jx);
736 dy = _mm_sub_pd(iy,jy);
737 dz = _mm_sub_pd(iz,jz);
739 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
740 rinv = my_invrsq_pd(rsq11);
741 r = _mm_mul_pd(rinv,rsq11);
743 sk = _mm_loadl_pd(sk,born->param+aj1);
744 sk = _mm_loadh_pd(sk,born->param+aj2);
746 /* Load aj1,aj2 raj */
747 p1 = md->typeA[aj1];
748 p2 = md->typeA[aj2];
750 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
751 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
752 raj = _mm_sub_pd(raj,doffset);
754 /* Compute 1.0/raj */
755 raj_inv = my_inv_pd(raj);
757 /* INTERACTION aj->ai STARTS HERE */
758 /* conditional mask for rai<dr+sk */
759 xmm1 = _mm_add_pd(r,sk);
760 mask_cmp = _mm_cmplt_pd(rai,xmm1);
762 /* conditional for rai>dr-sk, ends with mask_cmp2 */
763 xmm2 = _mm_sub_pd(r,sk);
764 xmm3 = my_inv_pd(xmm2);
765 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
767 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
768 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
770 uij = my_inv_pd(xmm1);
771 lij2 = _mm_mul_pd(lij,lij);
772 lij3 = _mm_mul_pd(lij2,lij);
773 uij2 = _mm_mul_pd(uij,uij);
774 uij3 = _mm_mul_pd(uij2,uij);
776 diff2 = _mm_sub_pd(uij2,lij2);
778 lij_inv = my_invrsq_pd(lij2);
779 sk2 = _mm_mul_pd(sk,sk);
780 sk2_inv = _mm_mul_pd(sk2,rinv);
781 prod = _mm_mul_pd(qrtr,sk2_inv);
783 log_term = _mm_mul_pd(uij,lij_inv);
784 log_term = log_pd(log_term);
786 xmm1 = _mm_sub_pd(lij,uij);
787 xmm2 = _mm_mul_pd(qrtr,r);
788 xmm2 = _mm_mul_pd(xmm2,diff2);
789 xmm1 = _mm_add_pd(xmm1,xmm2);
790 xmm2 = _mm_mul_pd(half,rinv);
791 xmm2 = _mm_mul_pd(xmm2,log_term);
792 xmm1 = _mm_add_pd(xmm1,xmm2);
793 xmm9 = _mm_mul_pd(neg,diff2);
794 xmm2 = _mm_mul_pd(xmm9,prod);
795 tmp_ai = _mm_add_pd(xmm1,xmm2);
797 /* conditional for rai<sk-dr */
798 xmm3 = _mm_sub_pd(sk,r);
799 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
801 xmm4 = _mm_sub_pd(rai_inv,lij);
802 xmm4 = _mm_mul_pd(two,xmm4);
803 xmm4 = _mm_add_pd(tmp_ai,xmm4);
805 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
807 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
808 /* ones are used is governed by the mask_cmp mask. */
809 tmp_ai = _mm_mul_pd(half,tmp_ai);
810 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
811 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
813 xmm2 = _mm_mul_pd(half,lij2);
814 xmm3 = _mm_mul_pd(prod,lij3);
815 xmm2 = _mm_add_pd(xmm2,xmm3);
816 xmm3 = _mm_mul_pd(lij,rinv);
817 xmm4 = _mm_mul_pd(lij3,r);
818 xmm3 = _mm_add_pd(xmm3,xmm4);
819 xmm3 = _mm_mul_pd(qrtr,xmm3);
820 t1 = _mm_sub_pd(xmm2,xmm3);
822 xmm2 = _mm_mul_pd(half,uij2);
823 xmm2 = _mm_mul_pd(neg,xmm2);
824 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
825 xmm3 = _mm_mul_pd(xmm3,uij3);
826 xmm2 = _mm_sub_pd(xmm2,xmm3);
827 xmm3 = _mm_mul_pd(uij,rinv);
828 xmm4 = _mm_mul_pd(uij3,r);
829 xmm3 = _mm_add_pd(xmm3,xmm4);
830 xmm3 = _mm_mul_pd(qrtr,xmm3);
831 t2 = _mm_add_pd(xmm2,xmm3);
833 xmm2 = _mm_mul_pd(sk2_inv,rinv);
834 xmm2 = _mm_add_pd(one,xmm2);
835 xmm2 = _mm_mul_pd(eigth,xmm2);
836 xmm2 = _mm_mul_pd(xmm2,xmm9);
837 xmm3 = _mm_mul_pd(log_term, rinv);
838 xmm3 = _mm_mul_pd(xmm3,rinv);
839 xmm3 = _mm_mul_pd(qrtr,xmm3);
840 t3 = _mm_add_pd(xmm2,xmm3);
842 /* chain rule terms */
843 xmm2 = _mm_mul_pd(dlij,t1);
844 xmm2 = _mm_add_pd(xmm2,t2);
845 xmm2 = _mm_add_pd(xmm2,t3);
847 /* temporary storage of chain rule terms, since we also have to compute
848  * the reciprocal (ai->aj) terms before storing them */
849 chrule = _mm_mul_pd(xmm2,rinv);
851 /* INTERACTION ai->aj starts here */
852 /* Conditional mask for raj<dr+sk_ai */
853 xmm1 = _mm_add_pd(r,sk_ai);
854 mask_cmp = _mm_cmplt_pd(raj,xmm1);
856 /* Conditional for raj>dr-sk_ai, ends with mask_cmp2 */
857 xmm2 = _mm_sub_pd(r,sk_ai);
858 xmm3 = my_inv_pd(xmm2);
859 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
861 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
862 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
864 uij = my_inv_pd(xmm1);
865 lij2 = _mm_mul_pd(lij,lij);
866 lij3 = _mm_mul_pd(lij2,lij);
867 uij2 = _mm_mul_pd(uij,uij);
868 uij3 = _mm_mul_pd(uij2,uij);
870 diff2 = _mm_sub_pd(uij2,lij2);
872 lij_inv = my_invrsq_pd(lij2);
874 sk2 = sk2_ai;
875 sk2_inv = _mm_mul_pd(sk2,rinv);
876 prod = _mm_mul_pd(qrtr,sk2_inv);
878 log_term = _mm_mul_pd(uij,lij_inv);
879 log_term = log_pd(log_term);
881 xmm1 = _mm_sub_pd(lij,uij);
882 xmm2 = _mm_mul_pd(qrtr,r);
883 xmm2 = _mm_mul_pd(xmm2,diff2);
884 xmm1 = _mm_add_pd(xmm1,xmm2);
885 xmm2 = _mm_mul_pd(half,rinv);
886 xmm2 = _mm_mul_pd(xmm2,log_term);
887 xmm1 = _mm_add_pd(xmm1,xmm2);
888 xmm9 = _mm_mul_pd(neg,diff2);
889 xmm2 = _mm_mul_pd(xmm9,prod);
890 tmp = _mm_add_pd(xmm1,xmm2);
892 /* conditional for raj<sk_ai-dr */
893 xmm3 = _mm_sub_pd(sk_ai,r);
894 mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
896 xmm4 = _mm_sub_pd(raj_inv,lij);
897 xmm4 = _mm_mul_pd(two,xmm4);
898 xmm4 = _mm_add_pd(tmp,xmm4);
900 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
902 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
903 /* ones are used is governed by the mask_cmp mask. */
904 tmp = _mm_mul_pd(half,tmp);
905 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
907 /* Load, add and store ai->aj pol energy */
908 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
909 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
911 xmm7 = _mm_add_pd(xmm7,tmp);
913 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
914 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
916 /* Start chain rule terms */
917 xmm2 = _mm_mul_pd(half,lij2);
918 xmm3 = _mm_mul_pd(prod,lij3);
919 xmm2 = _mm_add_pd(xmm2,xmm3);
920 xmm3 = _mm_mul_pd(lij,rinv);
921 xmm4 = _mm_mul_pd(lij3,r);
922 xmm3 = _mm_add_pd(xmm3,xmm4);
923 xmm3 = _mm_mul_pd(qrtr,xmm3);
924 t1 = _mm_sub_pd(xmm2,xmm3);
926 xmm2 = _mm_mul_pd(half,uij2);
927 xmm2 = _mm_mul_pd(neg,xmm2);
928 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
929 xmm3 = _mm_mul_pd(xmm3,uij3);
930 xmm2 = _mm_sub_pd(xmm2,xmm3);
931 xmm3 = _mm_mul_pd(uij,rinv);
932 xmm4 = _mm_mul_pd(uij3,r);
933 xmm3 = _mm_add_pd(xmm3,xmm4);
934 xmm3 = _mm_mul_pd(qrtr,xmm3);
935 t2 = _mm_add_pd(xmm2,xmm3);
937 xmm2 = _mm_mul_pd(sk2_inv,rinv);
938 xmm2 = _mm_add_pd(one,xmm2);
939 xmm2 = _mm_mul_pd(eigth,xmm2);
940 xmm2 = _mm_mul_pd(xmm2,xmm9);
941 xmm3 = _mm_mul_pd(log_term, rinv);
942 xmm3 = _mm_mul_pd(xmm3,rinv);
943 xmm3 = _mm_mul_pd(qrtr,xmm3);
944 t3 = _mm_add_pd(xmm2,xmm3);
946 /* chain rule terms */
947 xmm2 = _mm_mul_pd(dlij,t1);
948 xmm2 = _mm_add_pd(xmm2,t2);
949 xmm2 = _mm_add_pd(xmm2,t3);
950 chrule_ai = _mm_mul_pd(xmm2,rinv);
952 /* Store chain rule terms
953  * same unpacking rationale as with Still above */
955 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
956 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
958 _mm_storeu_pd(fr->dadx+n, xmm3);
959 n = n + 2;
960 _mm_storeu_pd(fr->dadx+n, xmm4);
961 n = n + 2;
964 if(offset!=0)
966 aj1 = nl->jjnr[k];
967 aj13 = aj1 * 3;
968 p1 = md->typeA[aj1];
970 jx = _mm_load_sd(x+aj13);
971 jy = _mm_load_sd(x+aj13+1);
972 jz = _mm_load_sd(x+aj13+2);
974 sk = _mm_load_sd(born->param+aj1);
976 dx = _mm_sub_sd(ix,jx);
977 dy = _mm_sub_pd(iy,jy);
978 dz = _mm_sub_pd(iz,jz);
980 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
981 rinv = my_invrsq_pd(rsq11);
982 r = _mm_mul_sd(rinv,rsq11);
984 /* Load raj */
985 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
986 raj = _mm_sub_sd(raj,doffset);
987 raj_inv = my_inv_pd(raj);
989 /* OFFSET INTERACTION aj->ai STARTS HERE */
990 /* conditional mask for rai<dr+sk */
991 xmm1 = _mm_add_sd(r,sk);
992 mask_cmp = _mm_cmplt_sd(rai,xmm1);
994 /* conditional for rai>dr-sk, ends with mask_cmp2 */
995 xmm2 = _mm_sub_sd(r,sk);
996 xmm3 = my_inv_pd(xmm2);
997 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
999 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1000 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1002 uij = my_inv_pd(xmm1);
1003 lij2 = _mm_mul_sd(lij,lij);
1004 lij3 = _mm_mul_sd(lij2,lij);
1005 uij2 = _mm_mul_sd(uij,uij);
1006 uij3 = _mm_mul_sd(uij2,uij);
1008 diff2 = _mm_sub_sd(uij2,lij2);
1010 lij_inv = my_invrsq_pd(lij2);
1011 sk2 = _mm_mul_sd(sk,sk);
1012 sk2_inv = _mm_mul_sd(sk2,rinv);
1013 prod = _mm_mul_sd(qrtr,sk2_inv);
1015 log_term = _mm_mul_pd(uij,lij_inv);
1016 log_term = log_pd(log_term);
1018 xmm1 = _mm_sub_sd(lij,uij);
1019 xmm2 = _mm_mul_sd(qrtr,r);
1020 xmm2 = _mm_mul_sd(xmm2,diff2);
1021 xmm1 = _mm_add_sd(xmm1,xmm2);
1022 xmm2 = _mm_mul_sd(half,rinv);
1023 xmm2 = _mm_mul_sd(xmm2,log_term);
1024 xmm1 = _mm_add_sd(xmm1,xmm2);
1025 xmm9 = _mm_mul_sd(neg,diff2);
1026 xmm2 = _mm_mul_sd(xmm9,prod);
1028 tmp_ai = _mm_add_sd(xmm1,xmm2);
1030 /* conditional for rai<sk-dr */
1031 xmm3 = _mm_sub_sd(sk,r);
1032 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1034 xmm4 = _mm_sub_sd(rai_inv,lij);
1035 xmm4 = _mm_mul_sd(two,xmm4);
1036 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1038 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1040 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1041 /* ones are used is governed by the mask_cmp mask. */
1042 tmp_ai = _mm_mul_pd(half,tmp_ai);
1043 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1044 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1046 xmm2 = _mm_mul_sd(half,lij2);
1047 xmm3 = _mm_mul_sd(prod,lij3);
1048 xmm2 = _mm_add_sd(xmm2,xmm3);
1049 xmm3 = _mm_mul_sd(lij,rinv);
1050 xmm4 = _mm_mul_sd(lij3,r);
1051 xmm3 = _mm_add_sd(xmm3,xmm4);
1052 xmm3 = _mm_mul_sd(qrtr,xmm3);
1053 t1 = _mm_sub_sd(xmm2,xmm3);
1055 xmm2 = _mm_mul_sd(half,uij2);
1056 xmm2 = _mm_mul_sd(neg,xmm2);
1057 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1058 xmm3 = _mm_mul_sd(xmm3,uij3);
1059 xmm2 = _mm_sub_sd(xmm2,xmm3);
1060 xmm3 = _mm_mul_sd(uij,rinv);
1061 xmm4 = _mm_mul_sd(uij3,r);
1062 xmm3 = _mm_add_sd(xmm3,xmm4);
1063 xmm3 = _mm_mul_sd(qrtr,xmm3);
1064 t2 = _mm_add_sd(xmm2,xmm3);
1066 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1067 xmm2 = _mm_add_sd(one,xmm2);
1068 xmm2 = _mm_mul_sd(eigth,xmm2);
1069 xmm2 = _mm_mul_sd(xmm2,xmm9);
1070 xmm3 = _mm_mul_sd(log_term, rinv);
1071 xmm3 = _mm_mul_sd(xmm3,rinv);
1072 xmm3 = _mm_mul_sd(qrtr,xmm3);
1073 t3 = _mm_add_sd(xmm2,xmm3);
1075 /* chain rule terms */
1076 xmm2 = _mm_mul_sd(dlij,t1);
1077 xmm2 = _mm_add_sd(xmm2,t2);
1078 xmm2 = _mm_add_sd(xmm2,t3);
1080 /* temporary storage of chain rule terms, since we also have to compute
1081  * the reciprocal (ai->aj) terms before storing them */
1082 chrule = _mm_mul_sd(xmm2,rinv);
1084 /* OFFSET INTERACTION ai->aj starts here */
1085 /* conditional mask for raj<dr+sk_ai */
1086 xmm1 = _mm_add_sd(r,sk_ai);
1087 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1089 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1090 xmm2 = _mm_sub_sd(r,sk_ai);
1091 xmm3 = my_inv_pd(xmm2);
1092 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1094 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1095 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1097 uij = my_inv_pd(xmm1);
1098 lij2 = _mm_mul_sd(lij,lij);
1099 lij3 = _mm_mul_sd(lij2,lij);
1100 uij2 = _mm_mul_sd(uij,uij);
1101 uij3 = _mm_mul_sd(uij2,uij);
1103 diff2 = _mm_sub_sd(uij2,lij2);
1105 lij_inv = my_invrsq_pd(lij2);
1107 sk2 = sk2_ai;
1108 sk2_inv = _mm_mul_sd(sk2,rinv);
1109 prod = _mm_mul_sd(qrtr,sk2_inv);
1111 log_term = _mm_mul_pd(uij,lij_inv);
1112 log_term = log_pd(log_term);
1114 xmm1 = _mm_sub_sd(lij,uij);
1115 xmm2 = _mm_mul_sd(qrtr,r);
1116 xmm2 = _mm_mul_sd(xmm2,diff2);
1117 xmm1 = _mm_add_sd(xmm1,xmm2);
1118 xmm2 = _mm_mul_sd(half,rinv);
1119 xmm2 = _mm_mul_sd(xmm2,log_term);
1120 xmm1 = _mm_add_sd(xmm1,xmm2);
1121 xmm9 = _mm_mul_sd(neg,diff2);
1122 xmm2 = _mm_mul_sd(xmm9,prod);
1123 tmp = _mm_add_sd(xmm1,xmm2);
1125 /* conditional for raj<sk_ai-dr */
1126 xmm3 = _mm_sub_sd(sk_ai,r);
1127 mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1129 xmm4 = _mm_sub_sd(raj_inv,lij);
1130 xmm4 = _mm_mul_sd(two,xmm4);
1131 xmm4 = _mm_add_sd(tmp,xmm4);
1133 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1135 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1136 /* ones are used is governed by the mask_cmp mask. */
1137 tmp = _mm_mul_pd(half,tmp);
1138 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1140 /* Load, add and store gpol energy */
1141 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1142 xmm7 = _mm_add_sd(xmm7,tmp);
1143 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1145 /* Start chain rule terms, t1 */
1146 xmm2 = _mm_mul_sd(half,lij2);
1147 xmm3 = _mm_mul_sd(prod,lij3);
1148 xmm2 = _mm_add_sd(xmm2,xmm3);
1149 xmm3 = _mm_mul_sd(lij,rinv);
1150 xmm4 = _mm_mul_sd(lij3,r);
1151 xmm3 = _mm_add_sd(xmm3,xmm4);
1152 xmm3 = _mm_mul_sd(qrtr,xmm3);
1153 t1 = _mm_sub_sd(xmm2,xmm3);
1155 xmm2 = _mm_mul_sd(half,uij2);
1156 xmm2 = _mm_mul_sd(neg,xmm2);
1157 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1158 xmm3 = _mm_mul_sd(xmm3,uij3);
1159 xmm2 = _mm_sub_sd(xmm2,xmm3);
1160 xmm3 = _mm_mul_sd(uij,rinv);
1161 xmm4 = _mm_mul_sd(uij3,r);
1162 xmm3 = _mm_add_sd(xmm3,xmm4);
1163 xmm3 = _mm_mul_sd(qrtr,xmm3);
1164 t2 = _mm_add_sd(xmm2,xmm3);
1166 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1167 xmm2 = _mm_add_sd(one,xmm2);
1168 xmm2 = _mm_mul_sd(eigth,xmm2);
1169 xmm2 = _mm_mul_sd(xmm2,xmm9);
1170 xmm3 = _mm_mul_sd(log_term, rinv);
1171 xmm3 = _mm_mul_sd(xmm3,rinv);
1172 xmm3 = _mm_mul_sd(qrtr,xmm3);
1173 t3 = _mm_add_sd(xmm2,xmm3);
1175 /* chain rule terms */
1176 xmm2 = _mm_mul_sd(dlij,t1);
1177 xmm2 = _mm_add_sd(xmm2,t2);
1178 xmm2 = _mm_add_sd(xmm2,t3);
1179 chrule_ai = _mm_mul_sd(xmm2,rinv);
1181 _mm_store_sd(fr->dadx+n, chrule);
1182 n = n + 1;
1183 _mm_store_sd(fr->dadx+n, chrule_ai);
1184 n = n + 1;
1187 /* Do end processing ... */
1188 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1189 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1190 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1192 /* Load, add and store atom ai polarisation energy */
1193 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1194 sum_ai = _mm_add_sd(sum_ai,xmm2);
1195 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1198 /* Parallel summations */
1199 if(PARTDECOMP(cr))
1201 gmx_sum(natoms, born->gpol_hct_work, cr);
1203 else if(DOMAINDECOMP(cr))
1205 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1208 /* Compute the radii */
1209 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1211 if(born->use[i] != 0)
1213 rr = top->atomtypes.gb_radius[md->typeA[i]]-doff;
1214 sum = 1.0/rr - born->gpol_hct_work[i];
1215 min_rad = rr + doff;
1216 rad = 1.0/sum;
1218 born->bRad[i] = rad > min_rad ? rad : min_rad;
1219 fr->invsqrta[i] = gmx_invsqrt(born->bRad[i]);
1223 /* Extra (local) communication required for DD */
1224 if(DOMAINDECOMP(cr))
1226 dd_atom_spread_real(cr->dd, born->bRad);
1227 dd_atom_spread_real(cr->dd, fr->invsqrta);
1230 return 0;
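/*
 * calc_gb_rad_obc_sse2_double(): OBC Born radii. The pairwise descreening
 * loop is the same as in the HCT routine above; the difference is the final
 * step, where the scaled sum s is mapped through
 * tanh(obc_alpha*s - obc_beta*s^2 + obc_gamma*s^3) to give the radius, and
 * born->drobc keeps the corresponding derivative factor for the chain rule.
 */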
1233 int
1234 calc_gb_rad_obc_sse2_double(t_commrec *cr, t_forcerec * fr, int natoms, gmx_localtop_t *top,
1235 const t_atomtypes *atype, double *x, t_nblist *nl, gmx_genborn_t *born,t_mdatoms *md)
1237 int i,k,n,ai,ai3,aj1,aj2,aj13,aj23,nj0,nj1,at0,at1,offset;
1238 int p1,p2,p3,p4,shift;
1239 double rr,sum,sum_tmp,sum2,sum3,min_rad,rad,doff;
1240 double tsum,tchain,rr_inv,rr_inv2,gbr;
1241 double shX,shY,shZ;
1243 __m128d ix,iy,iz,jx,jy,jz,dx,dy,dz,sX,sY,sZ;
1244 __m128d t1,t2,t3,rsq11,rinv,r,rai;
1245 __m128d rai_inv,sk,sk2,lij,dlij,duij;
1246 __m128d uij,lij2,uij2,lij3,uij3,diff2;
1247 __m128d lij_inv,sk2_inv,prod,log_term,tmp,tmp_sum;
1248 __m128d mask_cmp,mask_cmp2,mask_cmp3,doffset,raj,raj_inv;
1249 __m128d xmm1,xmm2,xmm3,xmm4,xmm7,xmm8,xmm9;
1250 __m128d sum_ai,chrule,chrule_ai,tmp_ai,sk_ai,sk2_ai;
1252 const __m128d neg = {-1.0, -1.0};
1253 const __m128d zero = {0.0, 0.0};
1254 const __m128d eigth = {0.125, 0.125};
1255 const __m128d qrtr = {0.25, 0.25};
1256 const __m128d half = {0.5, 0.5};
1257 const __m128d one = {1.0, 1.0};
1258 const __m128d two = {2.0, 2.0};
1259 const __m128d three = {3.0, 3.0};
1261 /* Keep the compiler happy */
1262 tmp_ai = _mm_setzero_pd();
1263 tmp = _mm_setzero_pd();
1264 raj_inv = _mm_setzero_pd();
1265 raj = _mm_setzero_pd();
1266 sk = _mm_setzero_pd();
1267 jx = _mm_setzero_pd();
1268 jy = _mm_setzero_pd();
1269 jz = _mm_setzero_pd();
1270 xmm7 = _mm_setzero_pd();
1271 xmm8 = _mm_setzero_pd();
1273 /* Set the dielectric offset */
1274 doff = born->gb_doffset;
1275 doffset = _mm_load1_pd(&doff);
1277 n = 0;
1279 for(i=0;i<born->nr;i++)
1281 born->gpol_hct_work[i] = 0;
1284 for(i=0;i<nl->nri;i++)
1286 ai = nl->iinr[i];
1287 ai3 = ai * 3;
1289 nj0 = nl->jindex[i];
1290 nj1 = nl->jindex[i+1];
1292 /* Load shifts for this list */
1293 shift = nl->shift[i];
1294 shX = fr->shift_vec[shift][0];
1295 shY = fr->shift_vec[shift][1];
1296 shZ = fr->shift_vec[shift][2];
1298 /* Splat the shifts */
1299 sX = _mm_load1_pd(&shX);
1300 sY = _mm_load1_pd(&shY);
1301 sZ = _mm_load1_pd(&shZ);
1303 offset = (nj1-nj0)%2;
1305 /* Load rai */
1306 rr = top->atomtypes.gb_radius[md->typeA[ai]]-doff;
1307 rai = _mm_load1_pd(&rr);
1308 rr = 1.0/rr;
1309 rai_inv = _mm_load1_pd(&rr);
1311 /* Load ai coordinates and add shifts */
1312 ix = _mm_load1_pd(x+ai3);
1313 iy = _mm_load1_pd(x+ai3+1);
1314 iz = _mm_load1_pd(x+ai3+2);
1316 ix = _mm_add_pd(sX,ix);
1317 iy = _mm_add_pd(sY,iy);
1318 iz = _mm_add_pd(sZ,iz);
1320 /* Zero out sums for polarisation energies */
1321 sum_ai = _mm_setzero_pd();
1323 sk_ai = _mm_load1_pd(born->param+ai);
1324 sk2_ai = _mm_mul_pd(sk_ai,sk_ai);
1326 for(k=nj0;k<nj1-offset;k+=2)
1328 aj1 = nl->jjnr[k];
1329 aj2 = nl->jjnr[k+1];
1331 aj13 = aj1 * 3;
1332 aj23 = aj2 * 3;
1334 /* Load particle aj1-2 coordinates */
1335 xmm1 = _mm_loadu_pd(x+aj13);
1336 xmm2 = _mm_loadu_pd(x+aj23);
1337 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1338 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1340 jz = _mm_loadl_pd(jz, x+aj13+2);
1341 jz = _mm_loadh_pd(jz, x+aj23+2);
1343 dx = _mm_sub_pd(ix,jx);
1344 dy = _mm_sub_pd(iy,jy);
1345 dz = _mm_sub_pd(iz,jz);
1347 rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
1348 rinv = my_invrsq_pd(rsq11);
1349 r = _mm_mul_pd(rinv,rsq11);
1351 /* Load atom aj1,aj2 raj */
1352 p1 = md->typeA[aj1];
1353 p2 = md->typeA[aj2];
1355 raj = _mm_loadl_pd(raj,top->atomtypes.gb_radius+p1);
1356 raj = _mm_loadh_pd(raj,top->atomtypes.gb_radius+p2);
1357 raj = _mm_sub_pd(raj,doffset);
1359 /* Compute 1.0/raj */
1360 raj_inv = my_inv_pd(raj);
1362 sk = _mm_loadl_pd(sk,born->param+aj1);
1363 sk = _mm_loadh_pd(sk,born->param+aj2);
1365 /* INTERACTION aj->ai STARTS HERE */
1366 /* conditional mask for rai<dr+sk */
1367 xmm1 = _mm_add_pd(r,sk);
1368 mask_cmp = _mm_cmplt_pd(rai,xmm1);
1370 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1371 xmm2 = _mm_sub_pd(r,sk);
1372 xmm3 = my_inv_pd(xmm2);
1373 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1375 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1376 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1378 uij = my_inv_pd(xmm1);
1379 lij2 = _mm_mul_pd(lij,lij);
1380 lij3 = _mm_mul_pd(lij2,lij);
1381 uij2 = _mm_mul_pd(uij,uij);
1382 uij3 = _mm_mul_pd(uij2,uij);
1384 diff2 = _mm_sub_pd(uij2,lij2);
1386 lij_inv = my_invrsq_pd(lij2);
1387 sk2 = _mm_mul_pd(sk,sk);
1388 sk2_inv = _mm_mul_pd(sk2,rinv);
1389 prod = _mm_mul_pd(qrtr,sk2_inv);
1391 log_term = _mm_mul_pd(uij,lij_inv);
1392 log_term = log_pd(log_term);
1394 xmm1 = _mm_sub_pd(lij,uij);
1395 xmm2 = _mm_mul_pd(qrtr,r);
1396 xmm2 = _mm_mul_pd(xmm2,diff2);
1397 xmm1 = _mm_add_pd(xmm1,xmm2);
1398 xmm2 = _mm_mul_pd(half,rinv);
1399 xmm2 = _mm_mul_pd(xmm2,log_term);
1400 xmm1 = _mm_add_pd(xmm1,xmm2);
1401 xmm9 = _mm_mul_pd(neg,diff2);
1402 xmm2 = _mm_mul_pd(xmm9,prod);
1403 tmp_ai = _mm_add_pd(xmm1,xmm2);
1405 /* conditional for rai<sk-dr */
1406 xmm3 = _mm_sub_pd(sk,r);
1407 mask_cmp3 = _mm_cmplt_pd(rai,xmm3); /* rai<sk-dr */
1409 xmm4 = _mm_sub_pd(rai_inv,lij);
1410 xmm4 = _mm_mul_pd(two,xmm4);
1411 xmm4 = _mm_add_pd(tmp_ai,xmm4);
1413 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1415 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1416 /* ones are used is governed by the mask_cmp mask. */
1417 tmp_ai = _mm_mul_pd(half,tmp_ai);
1418 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1419 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1421 /* Start the dadx chain rule terms */
1422 xmm2 = _mm_mul_pd(half,lij2);
1423 xmm3 = _mm_mul_pd(prod,lij3);
1424 xmm2 = _mm_add_pd(xmm2,xmm3);
1425 xmm3 = _mm_mul_pd(lij,rinv);
1426 xmm4 = _mm_mul_pd(lij3,r);
1427 xmm3 = _mm_add_pd(xmm3,xmm4);
1428 xmm3 = _mm_mul_pd(qrtr,xmm3);
1429 t1 = _mm_sub_pd(xmm2,xmm3);
1431 xmm2 = _mm_mul_pd(half,uij2);
1432 xmm2 = _mm_mul_pd(neg,xmm2);
1433 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1434 xmm3 = _mm_mul_pd(xmm3,uij3);
1435 xmm2 = _mm_sub_pd(xmm2,xmm3);
1436 xmm3 = _mm_mul_pd(uij,rinv);
1437 xmm4 = _mm_mul_pd(uij3,r);
1438 xmm3 = _mm_add_pd(xmm3,xmm4);
1439 xmm3 = _mm_mul_pd(qrtr,xmm3);
1440 t2 = _mm_add_pd(xmm2,xmm3);
1442 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1443 xmm2 = _mm_add_pd(one,xmm2);
1444 xmm2 = _mm_mul_pd(eigth,xmm2);
1445 xmm2 = _mm_mul_pd(xmm2,xmm9);
1446 xmm3 = _mm_mul_pd(log_term, rinv);
1447 xmm3 = _mm_mul_pd(xmm3,rinv);
1448 xmm3 = _mm_mul_pd(qrtr,xmm3);
1449 t3 = _mm_add_pd(xmm2,xmm3);
1451 /* chain rule terms */
1452 xmm2 = _mm_mul_pd(dlij,t1);
1453 xmm2 = _mm_add_pd(xmm2,t2);
1454 xmm2 = _mm_add_pd(xmm2,t3);
1456 /* temporary storage of chain rule terms, since we also have to compute
1457  * the reciprocal (ai->aj) terms before storing them */
1458 chrule = _mm_mul_pd(xmm2,rinv);
1460 /* INTERACTION ai->aj STARTS HERE */
1461 /* conditional mask for raj<dr+sk_ai */
1462 xmm1 = _mm_add_pd(r,sk_ai);
1463 mask_cmp = _mm_cmplt_pd(raj,xmm1);
1465 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1466 xmm2 = _mm_sub_pd(r,sk_ai);
1467 xmm3 = my_inv_pd(xmm2);
1468 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1470 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1471 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1473 uij = my_inv_pd(xmm1);
1474 lij2 = _mm_mul_pd(lij,lij);
1475 lij3 = _mm_mul_pd(lij2,lij);
1476 uij2 = _mm_mul_pd(uij,uij);
1477 uij3 = _mm_mul_pd(uij2,uij);
1479 diff2 = _mm_sub_pd(uij2,lij2);
1481 lij_inv = my_invrsq_pd(lij2);
1483 sk2 = sk2_ai;
1484 sk2_inv = _mm_mul_pd(sk2,rinv);
1485 prod = _mm_mul_pd(qrtr,sk2_inv);
1487 log_term = _mm_mul_pd(uij,lij_inv);
1488 log_term = log_pd(log_term);
1490 xmm1 = _mm_sub_pd(lij,uij);
1491 xmm2 = _mm_mul_pd(qrtr,r);
1492 xmm2 = _mm_mul_pd(xmm2,diff2);
1493 xmm1 = _mm_add_pd(xmm1,xmm2);
1494 xmm2 = _mm_mul_pd(half,rinv);
1495 xmm2 = _mm_mul_pd(xmm2,log_term);
1496 xmm1 = _mm_add_pd(xmm1,xmm2);
1497 xmm9 = _mm_mul_pd(neg,diff2);
1498 xmm2 = _mm_mul_pd(xmm9,prod);
1499 tmp = _mm_add_pd(xmm1,xmm2);
1501 /* conditional for raj<sk_ai-dr */
1502 xmm3 = _mm_sub_pd(sk_ai,r);
1503 mask_cmp3 = _mm_cmplt_pd(raj,xmm3); /* raj<sk_ai-dr */
1505 xmm4 = _mm_sub_pd(raj_inv,lij);
1506 xmm4 = _mm_mul_pd(two,xmm4);
1507 xmm4 = _mm_add_pd(tmp,xmm4);
1509 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1511 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1512 /* ones are used is governed by the mask_cmp mask. */
1513 tmp = _mm_mul_pd(half,tmp);
1514 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1516 /* Load, add and store ai->aj pol energy */
1517 xmm7 = _mm_loadl_pd(xmm7,born->gpol_hct_work+aj1);
1518 xmm7 = _mm_loadh_pd(xmm7,born->gpol_hct_work+aj2);
1520 xmm7 = _mm_add_pd(xmm7,tmp);
1522 _mm_storel_pd(born->gpol_hct_work+aj1,xmm7);
1523 _mm_storeh_pd(born->gpol_hct_work+aj2,xmm7);
1525 /* Start dadx chain rule terms */
1526 xmm2 = _mm_mul_pd(half,lij2);
1527 xmm3 = _mm_mul_pd(prod,lij3);
1528 xmm2 = _mm_add_pd(xmm2,xmm3);
1529 xmm3 = _mm_mul_pd(lij,rinv);
1530 xmm4 = _mm_mul_pd(lij3,r);
1531 xmm3 = _mm_add_pd(xmm3,xmm4);
1532 xmm3 = _mm_mul_pd(qrtr,xmm3);
1533 t1 = _mm_sub_pd(xmm2,xmm3);
1535 xmm2 = _mm_mul_pd(half,uij2);
1536 xmm2 = _mm_mul_pd(neg,xmm2);
1537 xmm3 = _mm_mul_pd(qrtr,sk2_inv);
1538 xmm3 = _mm_mul_pd(xmm3,uij3);
1539 xmm2 = _mm_sub_pd(xmm2,xmm3);
1540 xmm3 = _mm_mul_pd(uij,rinv);
1541 xmm4 = _mm_mul_pd(uij3,r);
1542 xmm3 = _mm_add_pd(xmm3,xmm4);
1543 xmm3 = _mm_mul_pd(qrtr,xmm3);
1544 t2 = _mm_add_pd(xmm2,xmm3);
1546 xmm2 = _mm_mul_pd(sk2_inv,rinv);
1547 xmm2 = _mm_add_pd(one,xmm2);
1548 xmm2 = _mm_mul_pd(eigth,xmm2);
1549 xmm2 = _mm_mul_pd(xmm2,xmm9);
1550 xmm3 = _mm_mul_pd(log_term, rinv);
1551 xmm3 = _mm_mul_pd(xmm3,rinv);
1552 xmm3 = _mm_mul_pd(qrtr,xmm3);
1553 t3 = _mm_add_pd(xmm2,xmm3);
1555 /* chain rule terms */
1556 xmm2 = _mm_mul_pd(dlij,t1);
1557 xmm2 = _mm_add_pd(xmm2,t2);
1558 xmm2 = _mm_add_pd(xmm2,t3);
1559 chrule_ai = _mm_mul_pd(xmm2,rinv);
1561 /* Store chain rule terms
1562  * same unpacking rationale as with Still above */
1564 xmm3 = _mm_unpacklo_pd(chrule, chrule_ai);
1565 xmm4 = _mm_unpackhi_pd(chrule, chrule_ai);
1567 _mm_storeu_pd(fr->dadx+n, xmm3);
1568 n = n + 2;
1569 _mm_storeu_pd(fr->dadx+n, xmm4);
1570 n = n + 2;
1573 if(offset!=0)
1575 aj1 = nl->jjnr[k];
1576 aj13 = aj1 * 3;
1577 p1 = md->typeA[aj1];
1579 jx = _mm_load_sd(x+aj13);
1580 jy = _mm_load_sd(x+aj13+1);
1581 jz = _mm_load_sd(x+aj13+2);
1583 sk = _mm_load_sd(born->param+aj1);
1585 /* Load raj */
1586 raj = _mm_load_sd(top->atomtypes.gb_radius+p1);
1587 raj = _mm_sub_sd(raj,doffset);
1588 raj_inv = my_inv_pd(raj);
1590 dx = _mm_sub_sd(ix,jx);
1591 dy = _mm_sub_pd(iy,jy);
1592 dz = _mm_sub_pd(iz,jz);
1594 rsq11 = _mm_add_sd( _mm_add_sd( _mm_mul_sd(dx,dx) , _mm_mul_sd(dy,dy) ) , _mm_mul_sd(dz,dz) );
1595 rinv = my_invrsq_pd(rsq11);
1596 r = _mm_mul_sd(rinv,rsq11);
1598 /* OFFSET INTERACTION aj->ai STARTS HERE */
1599 /* conditional mask for rai<dr+sk */
1600 xmm1 = _mm_add_sd(r,sk);
1601 mask_cmp = _mm_cmplt_sd(rai,xmm1);
1603 /* conditional for rai>dr-sk, ends with mask_cmp2 */
1604 xmm2 = _mm_sub_sd(r,sk);
1605 xmm3 = my_inv_pd(xmm2);
1606 mask_cmp2 = _mm_cmpgt_pd(rai,xmm2);
1608 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,rai_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1609 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1611 uij = my_inv_pd(xmm1);
1612 lij2 = _mm_mul_sd(lij,lij);
1613 lij3 = _mm_mul_sd(lij2,lij);
1614 uij2 = _mm_mul_sd(uij,uij);
1615 uij3 = _mm_mul_sd(uij2,uij);
1617 diff2 = _mm_sub_sd(uij2,lij2);
1619 lij_inv = my_invrsq_pd(lij2);
1620 sk2 = _mm_mul_sd(sk,sk);
1621 sk2_inv = _mm_mul_sd(sk2,rinv);
1622 prod = _mm_mul_sd(qrtr,sk2_inv);
1624 log_term = _mm_mul_pd(uij,lij_inv);
1625 log_term = log_pd(log_term);
1627 xmm1 = _mm_sub_sd(lij,uij);
1628 xmm2 = _mm_mul_sd(qrtr,r);
1629 xmm2 = _mm_mul_sd(xmm2,diff2);
1630 xmm1 = _mm_add_sd(xmm1,xmm2);
1631 xmm2 = _mm_mul_sd(half,rinv);
1632 xmm2 = _mm_mul_sd(xmm2,log_term);
1633 xmm1 = _mm_add_sd(xmm1,xmm2);
1634 xmm9 = _mm_mul_sd(neg,diff2);
1635 xmm2 = _mm_mul_sd(xmm9,prod);
1637 tmp_ai = _mm_add_sd(xmm1,xmm2);
1639 /* conditional for rai<sk-dr */
1640 xmm3 = _mm_sub_sd(sk,r);
1641 mask_cmp3 = _mm_cmplt_sd(rai,xmm3); /* rai<sk-dr */
1643 xmm4 = _mm_sub_sd(rai_inv,lij);
1644 xmm4 = _mm_mul_sd(two,xmm4);
1645 xmm4 = _mm_add_sd(tmp_ai,xmm4);
1647 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp_ai)); /*conditional as a mask*/
1649 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1650 /* ones are used is governed by the mask_cmp mask. */
1651 tmp_ai = _mm_mul_pd(half,tmp_ai);
1652 tmp_ai = _mm_or_pd(_mm_and_pd(mask_cmp,tmp_ai) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1653 sum_ai = _mm_add_sd(sum_ai,tmp_ai);
1655 xmm2 = _mm_mul_sd(half,lij2);
1656 xmm3 = _mm_mul_sd(prod,lij3);
1657 xmm2 = _mm_add_sd(xmm2,xmm3);
1658 xmm3 = _mm_mul_sd(lij,rinv);
1659 xmm4 = _mm_mul_sd(lij3,r);
1660 xmm3 = _mm_add_sd(xmm3,xmm4);
1661 xmm3 = _mm_mul_sd(qrtr,xmm3);
1662 t1 = _mm_sub_sd(xmm2,xmm3);
1664 xmm2 = _mm_mul_sd(half,uij2);
1665 xmm2 = _mm_mul_sd(neg,xmm2);
1666 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1667 xmm3 = _mm_mul_sd(xmm3,uij3);
1668 xmm2 = _mm_sub_sd(xmm2,xmm3);
1669 xmm3 = _mm_mul_sd(uij,rinv);
1670 xmm4 = _mm_mul_sd(uij3,r);
1671 xmm3 = _mm_add_sd(xmm3,xmm4);
1672 xmm3 = _mm_mul_sd(qrtr,xmm3);
1673 t2 = _mm_add_sd(xmm2,xmm3);
1675 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1676 xmm2 = _mm_add_sd(one,xmm2);
1677 xmm2 = _mm_mul_sd(eigth,xmm2);
1678 xmm2 = _mm_mul_sd(xmm2,xmm9);
1679 xmm3 = _mm_mul_sd(log_term, rinv);
1680 xmm3 = _mm_mul_sd(xmm3,rinv);
1681 xmm3 = _mm_mul_sd(qrtr,xmm3);
1682 t3 = _mm_add_sd(xmm2,xmm3);
1684 /* chain rule terms */
1685 xmm2 = _mm_mul_sd(dlij,t1);
1686 xmm2 = _mm_add_sd(xmm2,t2);
1687 xmm2 = _mm_add_sd(xmm2,t3);
1688 chrule = _mm_mul_sd(xmm2,rinv);
1690 /* OFFSET INTERACTION ai->aj STARTS HERE */
1691 /* conditional mask for raj<dr+sk_ai */
1692 xmm1 = _mm_add_sd(r,sk_ai);
1693 mask_cmp = _mm_cmplt_sd(raj,xmm1);
1695 /* conditional for raj>dr-sk_ai, ends with mask_cmp2 */
1696 xmm2 = _mm_sub_sd(r,sk_ai);
1697 xmm3 = my_inv_pd(xmm2);
1698 mask_cmp2 = _mm_cmpgt_pd(raj,xmm2);
1700 lij = _mm_or_pd(_mm_and_pd(mask_cmp2,raj_inv) ,_mm_andnot_pd(mask_cmp2,xmm3)); /*conditional as a mask*/
1701 dlij = _mm_or_pd(_mm_and_pd(mask_cmp2,zero) ,_mm_andnot_pd(mask_cmp2,one));
1703 uij = my_inv_pd(xmm1);
1704 lij2 = _mm_mul_sd(lij,lij);
1705 lij3 = _mm_mul_sd(lij2,lij);
1706 uij2 = _mm_mul_sd(uij,uij);
1707 uij3 = _mm_mul_sd(uij2,uij);
1709 diff2 = _mm_sub_sd(uij2,lij2);
1711 lij_inv = my_invrsq_pd(lij2);
1713 sk2 = sk2_ai;
1714 sk2_inv = _mm_mul_sd(sk2,rinv);
1715 prod = _mm_mul_sd(qrtr,sk2_inv);
1717 log_term = _mm_mul_pd(uij,lij_inv);
1718 log_term = log_pd(log_term);
1720 xmm1 = _mm_sub_sd(lij,uij);
1721 xmm2 = _mm_mul_sd(qrtr,r);
1722 xmm2 = _mm_mul_sd(xmm2,diff2);
1723 xmm1 = _mm_add_sd(xmm1,xmm2);
1724 xmm2 = _mm_mul_sd(half,rinv);
1725 xmm2 = _mm_mul_sd(xmm2,log_term);
1726 xmm1 = _mm_add_sd(xmm1,xmm2);
1727 xmm9 = _mm_mul_sd(neg,diff2);
1728 xmm2 = _mm_mul_sd(xmm9,prod);
1729 tmp = _mm_add_sd(xmm1,xmm2);
1731 /* conditional for raj<sk_ai-dr */
1732 xmm3 = _mm_sub_sd(sk_ai,r);
1733 mask_cmp3 = _mm_cmplt_sd(raj,xmm3); /* raj<sk_ai-dr */
1735 xmm4 = _mm_sub_sd(raj_inv,lij);
1736 xmm4 = _mm_mul_sd(two,xmm4);
1737 xmm4 = _mm_add_sd(tmp,xmm4);
1739 tmp = _mm_or_pd(_mm_and_pd(mask_cmp3,xmm4) ,_mm_andnot_pd(mask_cmp3,tmp)); /*conditional as a mask*/
1741 /* the tmp variable now holds two partial results, not all of which are to be used. Which */
1742 /* ones are used is governed by the mask_cmp mask. */
1743 tmp = _mm_mul_pd(half,tmp);
1744 tmp = _mm_or_pd(_mm_and_pd(mask_cmp,tmp) ,_mm_andnot_pd(mask_cmp,zero)); /*conditional as a mask*/
1746 /* Load, add and store gpol energy */
1747 xmm7 = _mm_load_sd(born->gpol_hct_work+aj1);
1748 xmm7 = _mm_add_sd(xmm7,tmp);
1749 _mm_store_sd(born->gpol_hct_work+aj1,xmm7);
1751 /* Start chain rule terms, t1 */
1752 xmm2 = _mm_mul_sd(half,lij2);
1753 xmm3 = _mm_mul_sd(prod,lij3);
1754 xmm2 = _mm_add_sd(xmm2,xmm3);
1755 xmm3 = _mm_mul_sd(lij,rinv);
1756 xmm4 = _mm_mul_sd(lij3,r);
1757 xmm3 = _mm_add_sd(xmm3,xmm4);
1758 xmm3 = _mm_mul_sd(qrtr,xmm3);
1759 t1 = _mm_sub_sd(xmm2,xmm3);
1761 xmm2 = _mm_mul_sd(half,uij2);
1762 xmm2 = _mm_mul_sd(neg,xmm2);
1763 xmm3 = _mm_mul_sd(qrtr,sk2_inv);
1764 xmm3 = _mm_mul_sd(xmm3,uij3);
1765 xmm2 = _mm_sub_sd(xmm2,xmm3);
1766 xmm3 = _mm_mul_sd(uij,rinv);
1767 xmm4 = _mm_mul_sd(uij3,r);
1768 xmm3 = _mm_add_sd(xmm3,xmm4);
1769 xmm3 = _mm_mul_sd(qrtr,xmm3);
1770 t2 = _mm_add_sd(xmm2,xmm3);
1772 xmm2 = _mm_mul_sd(sk2_inv,rinv);
1773 xmm2 = _mm_add_sd(one,xmm2);
1774 xmm2 = _mm_mul_sd(eigth,xmm2);
1775 xmm2 = _mm_mul_sd(xmm2,xmm9);
1776 xmm3 = _mm_mul_sd(log_term, rinv);
1777 xmm3 = _mm_mul_sd(xmm3,rinv);
1778 xmm3 = _mm_mul_sd(qrtr,xmm3);
1779 t3 = _mm_add_sd(xmm2,xmm3);
1781 /* chain rule terms */
1782 xmm2 = _mm_mul_sd(dlij,t1);
1783 xmm2 = _mm_add_sd(xmm2,t2);
1784 xmm2 = _mm_add_sd(xmm2,t3);
1785 chrule_ai = _mm_mul_sd(xmm2,rinv);
1787 _mm_store_sd(fr->dadx+n, chrule);
1788 n = n + 1;
1789 _mm_store_sd(fr->dadx+n, chrule_ai);
1790 n = n + 1;
1793 /* Do end processing ... */
1794 tmp_ai = _mm_unpacklo_pd(tmp_ai,sum_ai);
1795 sum_ai = _mm_add_pd(sum_ai,tmp_ai);
1796 sum_ai = _mm_shuffle_pd(sum_ai,sum_ai,_MM_SHUFFLE2(1,1));
1798 /* Load, add and store atom ai polarisation energy */
1799 xmm2 = _mm_load_sd(born->gpol_hct_work+ai);
1800 sum_ai = _mm_add_sd(sum_ai,xmm2);
1801 _mm_store_sd(born->gpol_hct_work+ai,sum_ai);
1804 /* Parallel summations */
1805 if(PARTDECOMP(cr))
1807 gmx_sum(natoms, born->gpol_hct_work, cr);
1809 else if(DOMAINDECOMP(cr))
1811 dd_atom_sum_real(cr->dd, born->gpol_hct_work);
1814 /* Compute the radii */
1815 for(i=0;i<fr->natoms_force;i++) /* PELA born->nr */
1817 if(born->use[i] != 0)
1819 rr = top->atomtypes.gb_radius[md->typeA[i]];
1820 rr_inv2 = 1.0/rr;
1821 rr = rr-doff;
1822 rr_inv = 1.0/rr;
1823 sum = rr * born->gpol_hct_work[i];
1824 sum2 = sum * sum;
1825 sum3 = sum2 * sum;
1827 tsum = tanh(born->obc_alpha*sum-born->obc_beta*sum2+born->obc_gamma*sum3);
1828 born->bRad[i] = rr_inv - tsum*rr_inv2;
1829 born->bRad[i] = 1.0 / born->bRad[i];
1831 fr->invsqrta[i]=gmx_invsqrt(born->bRad[i]);
1833 tchain = rr * (born->obc_alpha-2*born->obc_beta*sum+3*born->obc_gamma*sum2);
1834 born->drobc[i] = (1.0-tsum*tsum)*tchain*rr_inv2;
1838 /* Extra (local) communication required for DD */
1839 if(DOMAINDECOMP(cr))
1841 dd_atom_spread_real(cr->dd, born->bRad);
1842 dd_atom_spread_real(cr->dd, fr->invsqrta);
1843 dd_atom_spread_real(cr->dd, born->drobc);
1846 return 0;
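/*
 * calc_gb_chainrule_sse2_double(): first converts dvda into the work array rb
 * with an algorithm-specific prefactor (Still/HCT/OBC), then walks the same
 * neighbor list again, consuming the two dadx entries stored per pair, to
 * accumulate the GB chain-rule forces on atom i, the j atoms and the shift
 * vectors.
 */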
1851 int
1852 calc_gb_chainrule_sse2_double(int natoms, t_nblist *nl, double *dadx, double *dvda, double *xd, double *f,
1853 double *fshift, double *shift_vec, int gb_algorithm, gmx_genborn_t *born)
1855 int i,k,n,ai,aj,ai3,aj1,aj2,aj13,aj23,aj4,nj0,nj1,offset;
1856 int shift;
1857 double rbi,shX,shY,shZ;
1858 double *rb;
1860 __m128d ix,iy,iz,jx,jy,jz,fix,fiy,fiz,sX,sY,sZ;
1861 __m128d dx,dy,dz,t1,t2,t3,dva,dvaj,dax,dax_ai,fgb,fgb_ai;
1862 __m128d xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
1864 t1 = t2 = t3 = _mm_setzero_pd();
1865 rb = born->work;
1867 /* Loop to get the proper form for the Born radius term */
1868 if(gb_algorithm==egbSTILL) {
1869 for(i=0;i<natoms;i++)
1871 rbi = born->bRad[i];
1872 rb[i] = (2 * rbi * rbi * dvda[i])/ONE_4PI_EPS0;
1876 else if(gb_algorithm==egbHCT) {
1877 for(i=0;i<natoms;i++)
1879 rbi = born->bRad[i];
1880 rb[i] = rbi * rbi * dvda[i];
1884 else if(gb_algorithm==egbOBC) {
1885 for(i=0;i<natoms;i++)
1887 rbi = born->bRad[i];
1888 rb[i] = rbi * rbi * born->drobc[i] * dvda[i];
1892 n = 0;
1894 for(i=0;i<nl->nri;i++)
1896 ai = nl->iinr[i];
1897 ai3 = ai * 3;
1899 nj0 = nl->jindex[i];
1900 nj1 = nl->jindex[i+1];
1902 /* Load shifts for this list */
1903 shift = 3*nl->shift[i];
1904 shX = shift_vec[shift+0];
1905 shY = shift_vec[shift+1];
1906 shZ = shift_vec[shift+2];
1908 /* Splat the shifts */
1909 sX = _mm_load1_pd(&shX);
1910 sY = _mm_load1_pd(&shY);
1911 sZ = _mm_load1_pd(&shZ);
1913 offset = (nj1-nj0)%2;
1915 /* Load particle ai coordinates and add shifts */
1916 ix = _mm_load1_pd(xd+ai3);
1917 iy = _mm_load1_pd(xd+ai3+1);
1918 iz = _mm_load1_pd(xd+ai3+2);
1920 ix = _mm_add_pd(sX,ix);
1921 iy = _mm_add_pd(sY,iy);
1922 iz = _mm_add_pd(sZ,iz);
1924 /* Load particle ai dvda */
1925 dva = _mm_load1_pd(rb+ai);
1927 fix = _mm_setzero_pd();
1928 fiy = _mm_setzero_pd();
1929 fiz = _mm_setzero_pd();
1931 for(k=nj0;k<nj1-offset;k+=2)
1933 aj1 = nl->jjnr[k];
1934 aj2 = nl->jjnr[k+1];
1936 /* Load dvda_j */
1937 dvaj = _mm_set_pd(rb[aj2],rb[aj1]);
1939 aj13 = aj1 * 3;
1940 aj23 = aj2 * 3;
1942 /* Load particle aj1-2 coordinates */
1943 xmm1 = _mm_loadu_pd(xd+aj13);
1944 xmm2 = _mm_loadu_pd(xd+aj23);
1946 xmm5 = _mm_load_sd(xd+aj13+2);
1947 xmm6 = _mm_load_sd(xd+aj23+2);
1949 jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
1950 jy = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
1951 jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
1953 /* load derivative of born radii w.r.t. coordinates */
1954 xmm7 = _mm_loadu_pd(dadx+n);
1955 n = n + 2;
1956 xmm8 = _mm_loadu_pd(dadx+n);
1957 n = n + 2;
1959 /* Shuffle to get the ai and aj components right */
1960 dax = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(2,0));
1961 dax_ai = _mm_shuffle_pd(xmm7,xmm8,_MM_SHUFFLE2(3,1));
1963 /* distances */
1964 dx = _mm_sub_pd(ix,jx);
1965 dy = _mm_sub_pd(iy,jy);
1966 dz = _mm_sub_pd(iz,jz);
1968 /* scalar force */
1969 fgb = _mm_mul_pd(dva,dax);
1970 fgb_ai = _mm_mul_pd(dvaj,dax_ai);
1971 fgb = _mm_add_pd(fgb,fgb_ai);
1973 /* partial forces */
1974 t1 = _mm_mul_pd(fgb,dx);
1975 t2 = _mm_mul_pd(fgb,dy);
1976 t3 = _mm_mul_pd(fgb,dz);
1978 /* update the i force */
1979 fix = _mm_add_pd(fix,t1);
1980 fiy = _mm_add_pd(fiy,t2);
1981 fiz = _mm_add_pd(fiz,t3);
1983 /* accumulate forces from memory */
1984 xmm1 = _mm_loadu_pd(f+aj13); /* fx1 fy1 */
1985 xmm2 = _mm_loadu_pd(f+aj23); /* fx2 fy2 */
1987 xmm5 = _mm_load1_pd(f+aj13+2); /* fz1 fz1 */
1988 xmm6 = _mm_load1_pd(f+aj23+2); /* fz2 fz2 */
1990 /* transpose */
1991 xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
1992 xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
1993 xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
1995 /* subtract partial forces */
1996 xmm5 = _mm_sub_pd(xmm5, t1);
1997 xmm6 = _mm_sub_pd(xmm6, t2);
1998 xmm7 = _mm_sub_pd(xmm7, t3);
2000 /* transpose back fx and fy */
2001 xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));
2002 xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1));
2004 /* store the force, first fx's and fy's */
2005 _mm_storeu_pd(f+aj13,xmm1);
2006 _mm_storeu_pd(f+aj23,xmm2);
2008 /* then fz */
2009 _mm_storel_pd(f+aj13+2,xmm7);
2010 _mm_storeh_pd(f+aj23+2,xmm7);
2013 if(offset!=0)
2015 aj1 = nl->jjnr[k];
2017 dvaj = _mm_load_sd(rb+aj1);
2019 aj13 = aj1 * 3;
2021 jx = _mm_load_sd(xd+aj13);
2022 jy = _mm_load_sd(xd+aj13+1);
2023 jz = _mm_load_sd(xd+aj13+2);
2025 dax = _mm_load_sd(dadx+n);
2026 n = n + 1;
2027 dax_ai = _mm_load_sd(dadx+n);
2028 n = n + 1;
2030 dx = _mm_sub_sd(ix,jx);
2031 dy = _mm_sub_sd(iy,jy);
2032 dz = _mm_sub_sd(iz,jz);
2034 fgb = _mm_mul_sd(dva,dax);
2035 fgb_ai = _mm_mul_sd(dvaj,dax_ai);
2036 fgb = _mm_add_pd(fgb,fgb_ai);
2038 t1 = _mm_mul_sd(fgb,dx);
2039 t2 = _mm_mul_sd(fgb,dy);
2040 t3 = _mm_mul_sd(fgb,dz);
2042 fix = _mm_add_sd(fix,t1);
2043 fiy = _mm_add_sd(fiy,t2);
2044 fiz = _mm_add_sd(fiz,t3);
2046 /* accumulate forces from memory */
2047 xmm5 = _mm_load_sd(f+aj13);
2048 xmm6 = _mm_load_sd(f+aj13+1);
2049 xmm7 = _mm_load_sd(f+aj13+2);
2051 /* subtract partial forces */
2052 xmm5 = _mm_sub_sd(xmm5, t1);
2053 xmm6 = _mm_sub_sd(xmm6, t2);
2054 xmm7 = _mm_sub_sd(xmm7, t3);
2056 /* store the force */
2057 _mm_store_sd(f+aj13,xmm5);
2058 _mm_store_sd(f+aj13+1,xmm6);
2059 _mm_store_sd(f+aj13+2,xmm7);
2062 /* fix/fiy/fiz now contain two partial terms, all of which should be
2063  * added to the i particle forces */
2065 t1 = _mm_unpacklo_pd(t1,fix);
2066 t2 = _mm_unpacklo_pd(t2,fiy);
2067 t3 = _mm_unpacklo_pd(t3,fiz);
2069 fix = _mm_add_pd(fix,t1);
2070 fiy = _mm_add_pd(fiy,t2);
2071 fiz = _mm_add_pd(fiz,t3);
2073 fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
2074 fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
2075 fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
2077 /* load, add and store i forces */
2078 xmm1 = _mm_load_sd(f+ai3);
2079 xmm2 = _mm_load_sd(f+ai3+1);
2080 xmm3 = _mm_load_sd(f+ai3+2);
2082 fix = _mm_add_sd(fix,xmm1);
2083 fiy = _mm_add_sd(fiy,xmm2);
2084 fiz = _mm_add_sd(fiz,xmm3);
2086 _mm_store_sd(f+ai3,fix);
2087 _mm_store_sd(f+ai3+1,fiy);
2088 _mm_store_sd(f+ai3+2,fiz);
2090 /* load, add and store i shift forces */
2091 xmm1 = _mm_load_sd(fshift+shift);
2092 xmm2 = _mm_load_sd(fshift+shift+1);
2093 xmm3 = _mm_load_sd(fshift+shift+2);
2095 fix = _mm_add_sd(fix,xmm1);
2096 fiy = _mm_add_sd(fiy,xmm2);
2097 fiz = _mm_add_sd(fiz,xmm3);
2099 _mm_store_sd(fshift+shift,fix);
2100 _mm_store_sd(fshift+shift+1,fiy);
2101 _mm_store_sd(fshift+shift+2,fiz);
2105 return 0;
2108 #else
2109 /* keep compiler happy */
2110 int genborn_sse_dummy;
2112 #endif /* SSE2 intrinsics available */