/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "pme-solve.h"

#include <cmath>

#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/math/units.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/simd_math.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/smalloc.h"

#include "pme-internal.h"
#if GMX_SIMD_HAVE_REAL
/* Turn on arbitrary width SIMD intrinsics for PME solve */
#    define PME_SIMD_SOLVE
#endif
using namespace gmx; // TODO: Remove when this file is moved into gmx namespace
struct pme_solve_work_t
{
    /* work data for solve_pme */
    int      nalloc;
    real    *mhx;
    real    *mhy;
    real    *mhz;
    real    *m2;
    real    *denom;
    real    *tmp1;
    real    *tmp2;
    real    *eterm;
    real    *m2inv;

    real     energy_q;
    matrix   vir_q;
    real     energy_lj;
    matrix   vir_lj;
};
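/* Note (added for clarity, not in the original sources): one pme_solve_work_t
 * is allocated per OpenMP thread by pme_init_all_work() below, so each call to
 * solve_pme_yzx()/solve_pme_lj_yzx() fills its own scratch arrays and
 * energy/virial fields without locking; get_pme_ener_vir_q()/_lj() then reduce
 * over the per-thread copies after synchronization.
 */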
static void realloc_work(struct pme_solve_work_t *work, int nkx)
{
    if (nkx > work->nalloc)
    {
        int simd_width, i;

        work->nalloc = nkx;
        srenew(work->mhx, work->nalloc);
        srenew(work->mhy, work->nalloc);
        srenew(work->mhz, work->nalloc);
        srenew(work->m2, work->nalloc);
        /* Allocate an aligned pointer for SIMD operations, including extra
         * elements at the end for padding.
         */
#ifdef PME_SIMD_SOLVE
        simd_width = GMX_SIMD_REAL_WIDTH;
#else
        /* We can use any alignment, apart from 0, so we use 4 */
        simd_width = 4;
#endif
        sfree_aligned(work->denom);
        sfree_aligned(work->tmp1);
        sfree_aligned(work->tmp2);
        sfree_aligned(work->eterm);
        snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
        snew_aligned(work->tmp1,  work->nalloc+simd_width, simd_width*sizeof(real));
        snew_aligned(work->tmp2,  work->nalloc+simd_width, simd_width*sizeof(real));
        snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
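        /* Illustration (added comment, not in the original sources): with
         * GMX_SIMD_REAL_WIDTH == 8 and nkx == 100, each aligned array above
         * holds 100 + 8 reals and starts on an 8*sizeof(real)-byte boundary,
         * so a SIMD loop stepping 8 elements per iteration can load/store the
         * last, partially used vector (elements 96..103) without running past
         * the end of the buffer.
         */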
        srenew(work->m2inv, work->nalloc);

        /* Init all allocated elements of denom to 1 to avoid 1/0 exceptions
         * of simd padded elements.
         */
        for (i = 0; i < work->nalloc+simd_width; i++)
        {
            work->denom[i] = 1;
        }
    }
}
void pme_init_all_work(struct pme_solve_work_t **work, int nthread, int nkx)
{
    int thread;
    /* Use fft5d, order after FFT is y major, z, x minor */

    snew(*work, nthread);
    /* Allocate the work arrays thread local to optimize memory access */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
        try
        {
            realloc_work(&((*work)[thread]), nkx);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
    }
}
static void free_work(struct pme_solve_work_t *work)
{
    if (work)
    {
        sfree(work->mhx);
        sfree(work->mhy);
        sfree(work->mhz);
        sfree(work->m2);
        sfree_aligned(work->denom);
        sfree_aligned(work->tmp1);
        sfree_aligned(work->tmp2);
        sfree_aligned(work->eterm);
        sfree(work->m2inv);
    }
}
void pme_free_all_work(struct pme_solve_work_t **work, int nthread)
{
    if (*work)
    {
        for (int thread = 0; thread < nthread; thread++)
        {
            free_work(&(*work)[thread]);
        }
    }
    sfree(*work);
    *work = nullptr;
}
void get_pme_ener_vir_q(struct pme_solve_work_t *work, int nthread,
                        real *mesh_energy, matrix vir)
{
    /* This function sums output over threads and should therefore
     * only be called after thread synchronization.
     */
    int thread;

    *mesh_energy = work[0].energy_q;
    copy_mat(work[0].vir_q, vir);

    for (thread = 1; thread < nthread; thread++)
    {
        *mesh_energy += work[thread].energy_q;
        m_add(vir, work[thread].vir_q, vir);
    }
}
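/* Usage sketch (added comment; the call site below is an assumption, not taken
 * from this file): after every thread has returned from solve_pme_yzx() and
 * the threads have synchronized, a single thread can reduce the per-thread
 * results with, e.g.,
 *
 *     real   energy;
 *     matrix vir;
 *     get_pme_ener_vir_q(pme->solve_work, nthread, &energy, vir);
 */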
void get_pme_ener_vir_lj(struct pme_solve_work_t *work, int nthread,
                         real *mesh_energy, matrix vir)
{
    /* This function sums output over threads and should therefore
     * only be called after thread synchronization.
     */
    int thread;

    *mesh_energy = work[0].energy_lj;
    copy_mat(work[0].vir_lj, vir);

    for (thread = 1; thread < nthread; thread++)
    {
        *mesh_energy += work[thread].energy_lj;
        m_add(vir, work[thread].vir_lj, vir);
    }
}
#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
{
    SimdReal f_simd(f);
    SimdReal tmp_d1, tmp_r, tmp_e;
    int      kx;

    /* We only need to calculate from start. But since start is 0 or 1
     * and we want to use aligned loads/stores, we always start from 0.
     */
    for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
    {
        tmp_d1 = load(d_aligned+kx);
        tmp_r  = load(r_aligned+kx);
        tmp_r  = gmx::exp(tmp_r);
        tmp_e  = f_simd / tmp_d1;
        tmp_e  = tmp_e * tmp_r;
        store(e_aligned+kx, tmp_e);
    }
}
#else
gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
{
    int kx;

    for (kx = start; kx < end; kx++)
    {
        d[kx] = 1.0/d[kx];
    }

    for (kx = start; kx < end; kx++)
    {
        r[kx] = std::exp(r[kx]);
    }

    for (kx = start; kx < end; kx++)
    {
        e[kx] = f*r[kx]*d[kx];
    }
}
#endif
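/* Added note (not in the original sources): both branches above compute, per
 * element, e[kx] = f * exp(r[kx]) / d[kx]. With the arguments used below
 * (f = elfac, r = tmp1 = -factor*m2, d = denom = m2*bz*by*bsp_mod), this is
 * the Ewald reciprocal-space kernel exp(-pi^2 |m|^2/beta^2) divided by the
 * B-spline-corrected m^2 prefactor, where m is the reciprocal-space vector
 * and beta the Ewald splitting parameter.
 */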
#if defined PME_SIMD_SOLVE
/* Calculate exponentials through SIMD */
gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
{
    SimdReal       tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
    const SimdReal sqr_PI = sqrt(SimdReal(M_PI));
    int            kx;

    for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
    {
        /* We only need to calculate from start. But since start is 0 or 1
         * and we want to use aligned loads/stores, we always start from 0.
         */
        tmp_d = load(d_aligned+kx);
        d_inv = SimdReal(1.0) / tmp_d;
        store(d_aligned+kx, d_inv);
        tmp_r = load(r_aligned+kx);
        tmp_r = gmx::exp(tmp_r);
        store(r_aligned+kx, tmp_r);
        tmp_mk  = load(factor_aligned+kx);
        tmp_fac = sqr_PI * tmp_mk * erfc(tmp_mk);
        store(factor_aligned+kx, tmp_fac);
    }
}
#else
gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
{
    int  kx;
    real mk;

    for (kx = start; kx < end; kx++)
    {
        d[kx] = 1.0/d[kx];
    }

    for (kx = start; kx < end; kx++)
    {
        r[kx] = std::exp(r[kx]);
    }

    for (kx = start; kx < end; kx++)
    {
        mk       = tmp2[kx];
        tmp2[kx] = sqrt(M_PI)*mk*std::erfc(mk);
    }
}
#endif
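/* Added note (not in the original sources): on return, d[] holds the inverted
 * denominator, r[] holds exp(-factor*m2) and tmp2[] holds sqrt(pi)*mk*erfc(mk)
 * with mk = sqrt(factor*m2); these three quantities are combined into the
 * dispersion (LJ-PME) kernel in solve_pme_lj_yzx() below.
 */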
int solve_pme_yzx(struct gmx_pme_t *pme, t_complex *grid, real vol,
                  gmx_bool bEnerVir,
                  int nthread, int thread)
{
    /* do recip sum over local cells in grid */
    /* y major, z middle, x minor or continuous */
    t_complex               *p0;
    int                      kx, ky, kz, maxkx, maxky;
    int                      nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
    real                     mx, my, mz;
    real                     ewaldcoeff = pme->ewaldcoeff_q;
    real                     factor     = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
    real                     ets2, struct2, vfactor, ets2vf;
    real                     d1, d2, energy = 0;
    real                     by, bz;
    real                     virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
    real                     rxx, ryx, ryy, rzx, rzy, rzz;
    struct pme_solve_work_t *work;
    real                    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
    real                     mhxk, mhyk, mhzk, m2k;
    real                     corner_fac;
    ivec                     complex_order;
    ivec                     local_ndata, local_offset, local_size;
    real                     elfac;

    elfac = ONE_4PI_EPS0/pme->epsilon_r;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA],
                                      complex_order,
                                      local_ndata,
                                      local_offset,
                                      local_size);

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    maxkx = (nx+1)/2;
    maxky = (ny+1)/2;

    work  = &pme->solve_work[thread];
    mhx   = work->mhx;
    mhy   = work->mhy;
    mhz   = work->mhz;
    m2    = work->m2;
    denom = work->denom;
    tmp1  = work->tmp1;
    eterm = work->eterm;
    m2inv = work->m2inv;

    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
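    /* Added comment (not in the original sources): the local yz plane of the
     * transposed grid, local_ndata[YY]*local_ndata[ZZ] rows in total, is split
     * evenly over the threads. For example, with 91 rows and nthread = 4,
     * thread 0 gets rows [0,22), thread 1 [22,45), thread 2 [45,68) and
     * thread 3 [68,91), so every row is covered exactly once.
     */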
    for (iyz = iyz0; iyz < iyz1; iyz++)
    {
        iy = iyz/local_ndata[ZZ];
        iz = iyz - iy*local_ndata[ZZ];

        ky = iy + local_offset[YY];

        if (ky < maxky)
        {
            my = ky;
        }
        else
        {
            my = (ky - ny);
        }

        by = M_PI*vol*pme->bsp_mod[YY][ky];

        kz = iz + local_offset[ZZ];

        mz = kz;

        bz = pme->bsp_mod[ZZ][kz];

        /* 0.5 correction for corner points */
        corner_fac = 1;
        if (kz == 0 || kz == (nz+1)/2)
        {
            corner_fac = 0.5;
        }

        p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];

        /* We should skip the k-space point (0,0,0) */
        /* Note that since here x is the minor index, local_offset[XX]=0 */
        if (local_offset[XX] > 0 || ky > 0 || kz > 0)
        {
            kxstart = local_offset[XX];
        }
        else
        {
            kxstart = local_offset[XX] + 1;
            p0++;
        }
        kxend = local_offset[XX] + local_ndata[XX];
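        /* Added comment (not in the original sources): only the yz row that
         * actually contains k = (0,0,0), i.e. ky == 0 and kz == 0 on the rank
         * with local_offset[XX] == 0, starts at kx = 1 and advances p0 by one
         * element, so the singular m = 0 term of the Ewald sum is never
         * touched; every other row processes its full kx range.
         */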
        if (bEnerVir)
        {
            /* More expensive inner loop, especially because of the storage
             * of the mh elements in array's.
             * Because x is the minor grid index, all mh elements
             * depend on kx for triclinic unit cells.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
            }

            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
            }

            for (kx = kxstart; kx < kxend; kx++)
            {
                m2inv[kx] = 1.0/m2[kx];
            }

            calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);

            for (kx = kxstart; kx < kxend; kx++, p0++)
            {
                d1      = p0->re;
                d2      = p0->im;

                p0->re  = d1*eterm[kx];
                p0->im  = d2*eterm[kx];

                struct2 = 2.0*(d1*d1+d2*d2);

                tmp1[kx] = eterm[kx]*struct2;
            }

            for (kx = kxstart; kx < kxend; kx++)
            {
                ets2     = corner_fac*tmp1[kx];
                vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
                energy  += ets2;

                ets2vf   = ets2*vfactor;
                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
                virxy   += ets2vf*mhx[kx]*mhy[kx];
                virxz   += ets2vf*mhx[kx]*mhz[kx];
                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
                viryz   += ets2vf*mhy[kx]*mhz[kx];
                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
            }
        }
        else
        {
            /* We don't need to calculate the energy and the virial.
             * In this case the triclinic overhead is small.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
            }

            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                denom[kx] = m2k*bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
            }

            calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);

            for (kx = kxstart; kx < kxend; kx++, p0++)
            {
                d1      = p0->re;
                d2      = p0->im;

                p0->re  = d1*eterm[kx];
                p0->im  = d2*eterm[kx];
            }
        }
    }
    if (bEnerVir)
    {
        /* Update virial with local values.
         * The virial is symmetric by definition.
         * this virial seems ok for isotropic scaling, but I'm
         * experiencing problems on semiisotropic membranes.
         * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
         */
        work->vir_q[XX][XX] = 0.25*virxx;
        work->vir_q[YY][YY] = 0.25*viryy;
        work->vir_q[ZZ][ZZ] = 0.25*virzz;
        work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25*virxy;
        work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25*virxz;
        work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25*viryz;

        /* This energy should be corrected for a charged system */
        work->energy_q = 0.5*energy;
    }

    /* Return the loop count */
    return local_ndata[YY]*local_ndata[XX];
}
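/* Usage sketch (added comment; the surrounding driver code is an assumption,
 * not part of this file): solve_pme_yzx() is written to be called from an
 * OpenMP parallel region, e.g.
 *
 *     #pragma omp parallel num_threads(nthread)
 *     {
 *         solve_pme_yzx(pme, cfftgrid, vol, bEnerVir,
 *                       nthread, gmx_omp_get_thread_num());
 *     }
 *
 * after which get_pme_ener_vir_q() reduces the per-thread energy and virial.
 * The returned loop count (local_ndata[YY]*local_ndata[XX]) can be used by the
 * caller for work accounting.
 */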
int solve_pme_lj_yzx(struct gmx_pme_t *pme, t_complex **grid, gmx_bool bLB, real vol,
                     gmx_bool bEnerVir, int nthread, int thread)
{
    /* do recip sum over local cells in grid */
    /* y major, z middle, x minor or continuous */
    int                      ig, gcount;
    int                      kx, ky, kz, maxkx, maxky;
    int                      nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
    real                     mx, my, mz;
    real                     ewaldcoeff = pme->ewaldcoeff_lj;
    real                     factor     = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
    real                     ets2, ets2vf;
    real                     eterm, vterm, d1, d2, energy = 0;
    real                     by, bz;
    real                     virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
    real                     rxx, ryx, ryy, rzx, rzy, rzz;
    real                    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
    real                     mhxk, mhyk, mhzk, m2k;
    struct pme_solve_work_t *work;
    real                     corner_fac;
    ivec                     complex_order;
    ivec                     local_ndata, local_offset, local_size;

    nx = pme->nkx;
    ny = pme->nky;
    nz = pme->nkz;

    /* Dimensions should be identical for A/B grid, so we just use A here */
    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A],
                                      complex_order,
                                      local_ndata,
                                      local_offset,
                                      local_size);

    rxx = pme->recipbox[XX][XX];
    ryx = pme->recipbox[YY][XX];
    ryy = pme->recipbox[YY][YY];
    rzx = pme->recipbox[ZZ][XX];
    rzy = pme->recipbox[ZZ][YY];
    rzz = pme->recipbox[ZZ][ZZ];

    maxkx = (nx+1)/2;
    maxky = (ny+1)/2;

    work  = &pme->solve_work[thread];
    mhx   = work->mhx;
    mhy   = work->mhy;
    mhz   = work->mhz;
    m2    = work->m2;
    denom = work->denom;
    tmp1  = work->tmp1;
    tmp2  = work->tmp2;

    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
    for (iyz = iyz0; iyz < iyz1; iyz++)
    {
        iy = iyz/local_ndata[ZZ];
        iz = iyz - iy*local_ndata[ZZ];

        ky = iy + local_offset[YY];

        if (ky < maxky)
        {
            my = ky;
        }
        else
        {
            my = (ky - ny);
        }

        by = 3.0*vol*pme->bsp_mod[YY][ky]
            / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);

        kz = iz + local_offset[ZZ];

        mz = kz;

        bz = pme->bsp_mod[ZZ][kz];

        /* 0.5 correction for corner points */
        corner_fac = 1;
        if (kz == 0 || kz == (nz+1)/2)
        {
            corner_fac = 0.5;
        }

        kxstart = local_offset[XX];
        kxend   = local_offset[XX] + local_ndata[XX];
        if (bEnerVir)
        {
            /* More expensive inner loop, especially because of the
             * storage of the mh elements in array's. Because x is the
             * minor grid index, all mh elements depend on kx for
             * triclinic unit cells.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
                tmp2[kx]  = sqrt(factor*m2k);
            }

            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                mhx[kx]   = mhxk;
                mhy[kx]   = mhyk;
                mhz[kx]   = mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
                tmp2[kx]  = sqrt(factor*m2k);
            }

            calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);

            for (kx = kxstart; kx < kxend; kx++)
            {
                m2k   = factor*m2[kx];
                eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
                          + 2.0*m2k*tmp2[kx]);
                vterm    = 3.0*(-tmp1[kx] + tmp2[kx]);
                tmp1[kx] = eterm*denom[kx];
                tmp2[kx] = vterm*denom[kx];
            }
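            /* Added comment (not in the original sources): with
             * b = sqrt(factor*m2[kx]) = pi*|m|/ewaldcoeff_lj, the loop above
             * combines exp(-b^2) (tmp1) and sqrt(pi)*b*erfc(b) (tmp2) into the
             * r^-6 dispersion kernel
             *   eterm = -((1 - 2*b^2)*exp(-b^2) + 2*sqrt(pi)*b^3*erfc(b)),
             * scaled by the inverted, B-spline-corrected prefactor left in
             * denom by calc_exponentials_lj().
             */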
            if (!bLB)
            {
                t_complex *p0;
                real       struct2;

                p0 = grid[0] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
                for (kx = kxstart; kx < kxend; kx++, p0++)
                {
                    d1      = p0->re;
                    d2      = p0->im;

                    eterm   = tmp1[kx];
                    vterm   = tmp2[kx];
                    p0->re  = d1*eterm;
                    p0->im  = d2*eterm;

                    struct2 = 2.0*(d1*d1+d2*d2);

                    tmp1[kx] = eterm*struct2;
                    tmp2[kx] = vterm*struct2;
                }
            }
            else
            {
                real *struct2 = denom;
                real  str2;

                for (kx = kxstart; kx < kxend; kx++)
                {
                    struct2[kx] = 0.0;
                }
                /* Due to symmetry we only need to calculate 4 of the 7 terms */
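                /* Added comment (not in the original sources): the pairs used
                 * below are grid[0] with grid[6], grid[1] with grid[5],
                 * grid[2] with grid[4] and grid[3] with itself, each weighted
                 * by 2*lb_scale_factor_symm[ig]; this is how the "4 of the 7
                 * terms" symmetry mentioned above is exploited.
                 */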
                for (ig = 0; ig <= 3; ++ig)
                {
                    t_complex *p0, *p1;
                    real       scale;

                    p0    = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
                    p1    = grid[6-ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
                    scale = 2.0*lb_scale_factor_symm[ig];
                    for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
                    {
                        struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
                    }
                }
                for (ig = 0; ig <= 6; ++ig)
                {
                    t_complex *p0;

                    p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
                    for (kx = kxstart; kx < kxend; kx++, p0++)
                    {
                        p0->re *= tmp1[kx];
                        p0->im *= tmp1[kx];
                    }
                }
                for (kx = kxstart; kx < kxend; kx++)
                {
                    eterm    = tmp1[kx];
                    vterm    = tmp2[kx];
                    str2     = struct2[kx];
                    tmp1[kx] = eterm*str2;
                    tmp2[kx] = vterm*str2;
                }
            }

            for (kx = kxstart; kx < kxend; kx++)
            {
                ets2     = corner_fac*tmp1[kx];
                vterm    = 2.0*factor*tmp2[kx];
                energy  += ets2;

                ets2vf   = corner_fac*vterm;
                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
                virxy   += ets2vf*mhx[kx]*mhy[kx];
                virxz   += ets2vf*mhx[kx]*mhz[kx];
                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
                viryz   += ets2vf*mhy[kx]*mhz[kx];
                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
            }
        }
        else
        {
            /* We don't need to calculate the energy and the virial.
             * In this case the triclinic overhead is small.
             */

            /* Two explicit loops to avoid a conditional inside the loop */
            for (kx = kxstart; kx < maxkx; kx++)
            {
                mx = kx;

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
                tmp2[kx]  = sqrt(factor*m2k);
            }

            for (kx = maxkx; kx < kxend; kx++)
            {
                mx = (kx - nx);

                mhxk      = mx * rxx;
                mhyk      = mx * ryx + my * ryy;
                mhzk      = mx * rzx + my * rzy + mz * rzz;
                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
                m2[kx]    = m2k;
                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
                tmp1[kx]  = -factor*m2k;
                tmp2[kx]  = sqrt(factor*m2k);
            }

            calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);

            for (kx = kxstart; kx < kxend; kx++)
            {
                m2k      = factor*m2[kx];
                eterm    = -((1.0 - 2.0*m2k)*tmp1[kx]
                             + 2.0*m2k*tmp2[kx]);
                tmp1[kx] = eterm*denom[kx];
            }
            gcount = (bLB ? 7 : 1);
            for (ig = 0; ig < gcount; ++ig)
            {
                t_complex *p0;

                p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
                for (kx = kxstart; kx < kxend; kx++, p0++)
                {
                    p0->re *= tmp1[kx];
                    p0->im *= tmp1[kx];
                }
            }
        }
    }
    if (bEnerVir)
    {
        work->vir_lj[XX][XX] = 0.25*virxx;
        work->vir_lj[YY][YY] = 0.25*viryy;
        work->vir_lj[ZZ][ZZ] = 0.25*virzz;
        work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
        work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
        work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;

        /* This energy should be corrected for a charged system */
        work->energy_lj = 0.5*energy;
    }

    /* Return the loop count */
    return local_ndata[YY]*local_ndata[XX];
}