/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2013,2014,2015,2016,2017, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#include "pme-solve.h"

#include <cmath>

#include "gromacs/fft/parallel_3dfft.h"
#include "gromacs/math/units.h"
#include "gromacs/math/utilities.h"
#include "gromacs/math/vec.h"
#include "gromacs/simd/simd.h"
#include "gromacs/simd/simd_math.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/smalloc.h"

#include "pme-internal.h"
55 #if GMX_SIMD_HAVE_REAL
56 /* Turn on arbitrary width SIMD intrinsics for PME solve */
57 # define PME_SIMD_SOLVE
60 using namespace gmx
; // TODO: Remove when this file is moved into gmx namespace
62 struct pme_solve_work_t
64 /* work data for solve_pme */
83 static void realloc_work(struct pme_solve_work_t
*work
, int nkx
)
85 if (nkx
> work
->nalloc
)
90 srenew(work
->mhx
, work
->nalloc
);
91 srenew(work
->mhy
, work
->nalloc
);
92 srenew(work
->mhz
, work
->nalloc
);
93 srenew(work
->m2
, work
->nalloc
);
94 /* Allocate an aligned pointer for SIMD operations, including extra
95 * elements at the end for padding.
98 simd_width
= GMX_SIMD_REAL_WIDTH
;
100 /* We can use any alignment, apart from 0, so we use 4 */
103 sfree_aligned(work
->denom
);
104 sfree_aligned(work
->tmp1
);
105 sfree_aligned(work
->tmp2
);
106 sfree_aligned(work
->eterm
);
107 snew_aligned(work
->denom
, work
->nalloc
+simd_width
, simd_width
*sizeof(real
));
108 snew_aligned(work
->tmp1
, work
->nalloc
+simd_width
, simd_width
*sizeof(real
));
109 snew_aligned(work
->tmp2
, work
->nalloc
+simd_width
, simd_width
*sizeof(real
));
110 snew_aligned(work
->eterm
, work
->nalloc
+simd_width
, simd_width
*sizeof(real
));
111 srenew(work
->m2inv
, work
->nalloc
);
113 /* Init all allocated elements of denom to 1 to avoid 1/0 exceptions
114 * of simd padded elements.
116 for (i
= 0; i
< work
->nalloc
+simd_width
; i
++)
123 void pme_init_all_work(struct pme_solve_work_t
**work
, int nthread
, int nkx
)
126 /* Use fft5d, order after FFT is y major, z, x minor */
128 snew(*work
, nthread
);
129 /* Allocate the work arrays thread local to optimize memory access */
130 #pragma omp parallel for num_threads(nthread) schedule(static)
131 for (thread
= 0; thread
< nthread
; thread
++)
135 realloc_work(&((*work
)[thread
]), nkx
);
137 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
;
141 static void free_work(struct pme_solve_work_t
*work
)
149 sfree_aligned(work
->denom
);
150 sfree_aligned(work
->tmp1
);
151 sfree_aligned(work
->tmp2
);
152 sfree_aligned(work
->eterm
);
157 void pme_free_all_work(struct pme_solve_work_t
**work
, int nthread
)
161 for (thread
= 0; thread
< nthread
; thread
++)
163 free_work(&(*work
)[thread
]);
169 void get_pme_ener_vir_q(struct pme_solve_work_t
*work
, int nthread
,
170 real
*mesh_energy
, matrix vir
)
172 /* This function sums output over threads and should therefore
173 * only be called after thread synchronization.
177 *mesh_energy
= work
[0].energy_q
;
178 copy_mat(work
[0].vir_q
, vir
);
180 for (thread
= 1; thread
< nthread
; thread
++)
182 *mesh_energy
+= work
[thread
].energy_q
;
183 m_add(vir
, work
[thread
].vir_q
, vir
);
187 void get_pme_ener_vir_lj(struct pme_solve_work_t
*work
, int nthread
,
188 real
*mesh_energy
, matrix vir
)
190 /* This function sums output over threads and should therefore
191 * only be called after thread synchronization.
195 *mesh_energy
= work
[0].energy_lj
;
196 copy_mat(work
[0].vir_lj
, vir
);
198 for (thread
= 1; thread
< nthread
; thread
++)
200 *mesh_energy
+= work
[thread
].energy_lj
;
201 m_add(vir
, work
[thread
].vir_lj
, vir
);
205 #if defined PME_SIMD_SOLVE
206 /* Calculate exponentials through SIMD */
207 gmx_inline
static void calc_exponentials_q(int gmx_unused start
, int end
, real f
, real
*d_aligned
, real
*r_aligned
, real
*e_aligned
)
211 SimdReal tmp_d1
, tmp_r
, tmp_e
;
214 /* We only need to calculate from start. But since start is 0 or 1
215 * and we want to use aligned loads/stores, we always start from 0.
217 for (kx
= 0; kx
< end
; kx
+= GMX_SIMD_REAL_WIDTH
)
219 tmp_d1
= load(d_aligned
+kx
);
220 tmp_r
= load(r_aligned
+kx
);
221 tmp_r
= gmx::exp(tmp_r
);
222 tmp_e
= f_simd
/ tmp_d1
;
223 tmp_e
= tmp_e
* tmp_r
;
224 store(e_aligned
+kx
, tmp_e
);
229 gmx_inline
static void calc_exponentials_q(int start
, int end
, real f
, real
*d
, real
*r
, real
*e
)
232 for (kx
= start
; kx
< end
; kx
++)
236 for (kx
= start
; kx
< end
; kx
++)
238 r
[kx
] = std::exp(r
[kx
]);
240 for (kx
= start
; kx
< end
; kx
++)
242 e
[kx
] = f
*r
[kx
]*d
[kx
];
247 #if defined PME_SIMD_SOLVE
248 /* Calculate exponentials through SIMD */
249 gmx_inline
static void calc_exponentials_lj(int gmx_unused start
, int end
, real
*r_aligned
, real
*factor_aligned
, real
*d_aligned
)
251 SimdReal tmp_r
, tmp_d
, tmp_fac
, d_inv
, tmp_mk
;
252 const SimdReal sqr_PI
= sqrt(SimdReal(M_PI
));
254 for (kx
= 0; kx
< end
; kx
+= GMX_SIMD_REAL_WIDTH
)
256 /* We only need to calculate from start. But since start is 0 or 1
257 * and we want to use aligned loads/stores, we always start from 0.
259 tmp_d
= load(d_aligned
+kx
);
260 d_inv
= SimdReal(1.0) / tmp_d
;
261 store(d_aligned
+kx
, d_inv
);
262 tmp_r
= load(r_aligned
+kx
);
263 tmp_r
= gmx::exp(tmp_r
);
264 store(r_aligned
+kx
, tmp_r
);
265 tmp_mk
= load(factor_aligned
+kx
);
266 tmp_fac
= sqr_PI
* tmp_mk
* erfc(tmp_mk
);
267 store(factor_aligned
+kx
, tmp_fac
);
271 gmx_inline
static void calc_exponentials_lj(int start
, int end
, real
*r
, real
*tmp2
, real
*d
)
275 for (kx
= start
; kx
< end
; kx
++)
280 for (kx
= start
; kx
< end
; kx
++)
282 r
[kx
] = std::exp(r
[kx
]);
285 for (kx
= start
; kx
< end
; kx
++)
288 tmp2
[kx
] = sqrt(M_PI
)*mk
*std::erfc(mk
);
293 int solve_pme_yzx(struct gmx_pme_t
*pme
, t_complex
*grid
, real vol
,
295 int nthread
, int thread
)
297 /* do recip sum over local cells in grid */
298 /* y major, z middle, x minor or continuous */
300 int kx
, ky
, kz
, maxkx
, maxky
;
301 int nx
, ny
, nz
, iyz0
, iyz1
, iyz
, iy
, iz
, kxstart
, kxend
;
303 real ewaldcoeff
= pme
->ewaldcoeff_q
;
304 real factor
= M_PI
*M_PI
/(ewaldcoeff
*ewaldcoeff
);
305 real ets2
, struct2
, vfactor
, ets2vf
;
306 real d1
, d2
, energy
= 0;
308 real virxx
= 0, virxy
= 0, virxz
= 0, viryy
= 0, viryz
= 0, virzz
= 0;
309 real rxx
, ryx
, ryy
, rzx
, rzy
, rzz
;
310 struct pme_solve_work_t
*work
;
311 real
*mhx
, *mhy
, *mhz
, *m2
, *denom
, *tmp1
, *eterm
, *m2inv
;
312 real mhxk
, mhyk
, mhzk
, m2k
;
315 ivec local_ndata
, local_offset
, local_size
;
318 elfac
= ONE_4PI_EPS0
/pme
->epsilon_r
;
324 /* Dimensions should be identical for A/B grid, so we just use A here */
325 gmx_parallel_3dfft_complex_limits(pme
->pfft_setup
[PME_GRID_QA
],
331 rxx
= pme
->recipbox
[XX
][XX
];
332 ryx
= pme
->recipbox
[YY
][XX
];
333 ryy
= pme
->recipbox
[YY
][YY
];
334 rzx
= pme
->recipbox
[ZZ
][XX
];
335 rzy
= pme
->recipbox
[ZZ
][YY
];
336 rzz
= pme
->recipbox
[ZZ
][ZZ
];
341 work
= &pme
->solve_work
[thread
];
351 iyz0
= local_ndata
[YY
]*local_ndata
[ZZ
]* thread
/nthread
;
352 iyz1
= local_ndata
[YY
]*local_ndata
[ZZ
]*(thread
+1)/nthread
;
354 for (iyz
= iyz0
; iyz
< iyz1
; iyz
++)
356 iy
= iyz
/local_ndata
[ZZ
];
357 iz
= iyz
- iy
*local_ndata
[ZZ
];
359 ky
= iy
+ local_offset
[YY
];
370 by
= M_PI
*vol
*pme
->bsp_mod
[YY
][ky
];
372 kz
= iz
+ local_offset
[ZZ
];
376 bz
= pme
->bsp_mod
[ZZ
][kz
];
378 /* 0.5 correction for corner points */
380 if (kz
== 0 || kz
== (nz
+1)/2)
385 p0
= grid
+ iy
*local_size
[ZZ
]*local_size
[XX
] + iz
*local_size
[XX
];
387 /* We should skip the k-space point (0,0,0) */
388 /* Note that since here x is the minor index, local_offset[XX]=0 */
389 if (local_offset
[XX
] > 0 || ky
> 0 || kz
> 0)
391 kxstart
= local_offset
[XX
];
395 kxstart
= local_offset
[XX
] + 1;
398 kxend
= local_offset
[XX
] + local_ndata
[XX
];
402 /* More expensive inner loop, especially because of the storage
403 * of the mh elements in array's.
404 * Because x is the minor grid index, all mh elements
405 * depend on kx for triclinic unit cells.
408 /* Two explicit loops to avoid a conditional inside the loop */
409 for (kx
= kxstart
; kx
< maxkx
; kx
++)
414 mhyk
= mx
* ryx
+ my
* ryy
;
415 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
416 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
421 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
422 tmp1
[kx
] = -factor
*m2k
;
425 for (kx
= maxkx
; kx
< kxend
; kx
++)
430 mhyk
= mx
* ryx
+ my
* ryy
;
431 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
432 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
437 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
438 tmp1
[kx
] = -factor
*m2k
;
441 for (kx
= kxstart
; kx
< kxend
; kx
++)
443 m2inv
[kx
] = 1.0/m2
[kx
];
446 calc_exponentials_q(kxstart
, kxend
, elfac
, denom
, tmp1
, eterm
);
448 for (kx
= kxstart
; kx
< kxend
; kx
++, p0
++)
453 p0
->re
= d1
*eterm
[kx
];
454 p0
->im
= d2
*eterm
[kx
];
456 struct2
= 2.0*(d1
*d1
+d2
*d2
);
458 tmp1
[kx
] = eterm
[kx
]*struct2
;
461 for (kx
= kxstart
; kx
< kxend
; kx
++)
463 ets2
= corner_fac
*tmp1
[kx
];
464 vfactor
= (factor
*m2
[kx
] + 1.0)*2.0*m2inv
[kx
];
467 ets2vf
= ets2
*vfactor
;
468 virxx
+= ets2vf
*mhx
[kx
]*mhx
[kx
] - ets2
;
469 virxy
+= ets2vf
*mhx
[kx
]*mhy
[kx
];
470 virxz
+= ets2vf
*mhx
[kx
]*mhz
[kx
];
471 viryy
+= ets2vf
*mhy
[kx
]*mhy
[kx
] - ets2
;
472 viryz
+= ets2vf
*mhy
[kx
]*mhz
[kx
];
473 virzz
+= ets2vf
*mhz
[kx
]*mhz
[kx
] - ets2
;
478 /* We don't need to calculate the energy and the virial.
479 * In this case the triclinic overhead is small.
482 /* Two explicit loops to avoid a conditional inside the loop */
484 for (kx
= kxstart
; kx
< maxkx
; kx
++)
489 mhyk
= mx
* ryx
+ my
* ryy
;
490 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
491 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
492 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
493 tmp1
[kx
] = -factor
*m2k
;
496 for (kx
= maxkx
; kx
< kxend
; kx
++)
501 mhyk
= mx
* ryx
+ my
* ryy
;
502 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
503 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
504 denom
[kx
] = m2k
*bz
*by
*pme
->bsp_mod
[XX
][kx
];
505 tmp1
[kx
] = -factor
*m2k
;
508 calc_exponentials_q(kxstart
, kxend
, elfac
, denom
, tmp1
, eterm
);
510 for (kx
= kxstart
; kx
< kxend
; kx
++, p0
++)
515 p0
->re
= d1
*eterm
[kx
];
516 p0
->im
= d2
*eterm
[kx
];
523 /* Update virial with local values.
524 * The virial is symmetric by definition.
525 * this virial seems ok for isotropic scaling, but I'm
526 * experiencing problems on semiisotropic membranes.
527 * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
529 work
->vir_q
[XX
][XX
] = 0.25*virxx
;
530 work
->vir_q
[YY
][YY
] = 0.25*viryy
;
531 work
->vir_q
[ZZ
][ZZ
] = 0.25*virzz
;
532 work
->vir_q
[XX
][YY
] = work
->vir_q
[YY
][XX
] = 0.25*virxy
;
533 work
->vir_q
[XX
][ZZ
] = work
->vir_q
[ZZ
][XX
] = 0.25*virxz
;
534 work
->vir_q
[YY
][ZZ
] = work
->vir_q
[ZZ
][YY
] = 0.25*viryz
;
536 /* This energy should be corrected for a charged system */
537 work
->energy_q
= 0.5*energy
;
540 /* Return the loop count */
541 return local_ndata
[YY
]*local_ndata
[XX
];
544 int solve_pme_lj_yzx(struct gmx_pme_t
*pme
, t_complex
**grid
, gmx_bool bLB
, real vol
,
545 gmx_bool bEnerVir
, int nthread
, int thread
)
547 /* do recip sum over local cells in grid */
548 /* y major, z middle, x minor or continuous */
550 int kx
, ky
, kz
, maxkx
, maxky
;
551 int nx
, ny
, nz
, iy
, iyz0
, iyz1
, iyz
, iz
, kxstart
, kxend
;
553 real ewaldcoeff
= pme
->ewaldcoeff_lj
;
554 real factor
= M_PI
*M_PI
/(ewaldcoeff
*ewaldcoeff
);
556 real eterm
, vterm
, d1
, d2
, energy
= 0;
558 real virxx
= 0, virxy
= 0, virxz
= 0, viryy
= 0, viryz
= 0, virzz
= 0;
559 real rxx
, ryx
, ryy
, rzx
, rzy
, rzz
;
560 real
*mhx
, *mhy
, *mhz
, *m2
, *denom
, *tmp1
, *tmp2
;
561 real mhxk
, mhyk
, mhzk
, m2k
;
562 struct pme_solve_work_t
*work
;
565 ivec local_ndata
, local_offset
, local_size
;
570 /* Dimensions should be identical for A/B grid, so we just use A here */
571 gmx_parallel_3dfft_complex_limits(pme
->pfft_setup
[PME_GRID_C6A
],
576 rxx
= pme
->recipbox
[XX
][XX
];
577 ryx
= pme
->recipbox
[YY
][XX
];
578 ryy
= pme
->recipbox
[YY
][YY
];
579 rzx
= pme
->recipbox
[ZZ
][XX
];
580 rzy
= pme
->recipbox
[ZZ
][YY
];
581 rzz
= pme
->recipbox
[ZZ
][ZZ
];
586 work
= &pme
->solve_work
[thread
];
595 iyz0
= local_ndata
[YY
]*local_ndata
[ZZ
]* thread
/nthread
;
596 iyz1
= local_ndata
[YY
]*local_ndata
[ZZ
]*(thread
+1)/nthread
;
598 for (iyz
= iyz0
; iyz
< iyz1
; iyz
++)
600 iy
= iyz
/local_ndata
[ZZ
];
601 iz
= iyz
- iy
*local_ndata
[ZZ
];
603 ky
= iy
+ local_offset
[YY
];
614 by
= 3.0*vol
*pme
->bsp_mod
[YY
][ky
]
615 / (M_PI
*sqrt(M_PI
)*ewaldcoeff
*ewaldcoeff
*ewaldcoeff
);
617 kz
= iz
+ local_offset
[ZZ
];
621 bz
= pme
->bsp_mod
[ZZ
][kz
];
623 /* 0.5 correction for corner points */
625 if (kz
== 0 || kz
== (nz
+1)/2)
630 kxstart
= local_offset
[XX
];
631 kxend
= local_offset
[XX
] + local_ndata
[XX
];
634 /* More expensive inner loop, especially because of the
635 * storage of the mh elements in array's. Because x is the
636 * minor grid index, all mh elements depend on kx for
637 * triclinic unit cells.
640 /* Two explicit loops to avoid a conditional inside the loop */
641 for (kx
= kxstart
; kx
< maxkx
; kx
++)
646 mhyk
= mx
* ryx
+ my
* ryy
;
647 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
648 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
653 denom
[kx
] = bz
*by
*pme
->bsp_mod
[XX
][kx
];
654 tmp1
[kx
] = -factor
*m2k
;
655 tmp2
[kx
] = sqrt(factor
*m2k
);
658 for (kx
= maxkx
; kx
< kxend
; kx
++)
663 mhyk
= mx
* ryx
+ my
* ryy
;
664 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
665 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
670 denom
[kx
] = bz
*by
*pme
->bsp_mod
[XX
][kx
];
671 tmp1
[kx
] = -factor
*m2k
;
672 tmp2
[kx
] = sqrt(factor
*m2k
);
675 calc_exponentials_lj(kxstart
, kxend
, tmp1
, tmp2
, denom
);
677 for (kx
= kxstart
; kx
< kxend
; kx
++)
680 eterm
= -((1.0 - 2.0*m2k
)*tmp1
[kx
]
682 vterm
= 3.0*(-tmp1
[kx
] + tmp2
[kx
]);
683 tmp1
[kx
] = eterm
*denom
[kx
];
684 tmp2
[kx
] = vterm
*denom
[kx
];
692 p0
= grid
[0] + iy
*local_size
[ZZ
]*local_size
[XX
] + iz
*local_size
[XX
];
693 for (kx
= kxstart
; kx
< kxend
; kx
++, p0
++)
703 struct2
= 2.0*(d1
*d1
+d2
*d2
);
705 tmp1
[kx
] = eterm
*struct2
;
706 tmp2
[kx
] = vterm
*struct2
;
711 real
*struct2
= denom
;
714 for (kx
= kxstart
; kx
< kxend
; kx
++)
718 /* Due to symmetry we only need to calculate 4 of the 7 terms */
719 for (ig
= 0; ig
<= 3; ++ig
)
724 p0
= grid
[ig
] + iy
*local_size
[ZZ
]*local_size
[XX
] + iz
*local_size
[XX
];
725 p1
= grid
[6-ig
] + iy
*local_size
[ZZ
]*local_size
[XX
] + iz
*local_size
[XX
];
726 scale
= 2.0*lb_scale_factor_symm
[ig
];
727 for (kx
= kxstart
; kx
< kxend
; ++kx
, ++p0
, ++p1
)
729 struct2
[kx
] += scale
*(p0
->re
*p1
->re
+ p0
->im
*p1
->im
);
733 for (ig
= 0; ig
<= 6; ++ig
)
737 p0
= grid
[ig
] + iy
*local_size
[ZZ
]*local_size
[XX
] + iz
*local_size
[XX
];
738 for (kx
= kxstart
; kx
< kxend
; kx
++, p0
++)
748 for (kx
= kxstart
; kx
< kxend
; kx
++)
753 tmp1
[kx
] = eterm
*str2
;
754 tmp2
[kx
] = vterm
*str2
;
758 for (kx
= kxstart
; kx
< kxend
; kx
++)
760 ets2
= corner_fac
*tmp1
[kx
];
761 vterm
= 2.0*factor
*tmp2
[kx
];
763 ets2vf
= corner_fac
*vterm
;
764 virxx
+= ets2vf
*mhx
[kx
]*mhx
[kx
] - ets2
;
765 virxy
+= ets2vf
*mhx
[kx
]*mhy
[kx
];
766 virxz
+= ets2vf
*mhx
[kx
]*mhz
[kx
];
767 viryy
+= ets2vf
*mhy
[kx
]*mhy
[kx
] - ets2
;
768 viryz
+= ets2vf
*mhy
[kx
]*mhz
[kx
];
769 virzz
+= ets2vf
*mhz
[kx
]*mhz
[kx
] - ets2
;
774 /* We don't need to calculate the energy and the virial.
775 * In this case the triclinic overhead is small.
778 /* Two explicit loops to avoid a conditional inside the loop */
780 for (kx
= kxstart
; kx
< maxkx
; kx
++)
785 mhyk
= mx
* ryx
+ my
* ryy
;
786 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
787 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
789 denom
[kx
] = bz
*by
*pme
->bsp_mod
[XX
][kx
];
790 tmp1
[kx
] = -factor
*m2k
;
791 tmp2
[kx
] = sqrt(factor
*m2k
);
794 for (kx
= maxkx
; kx
< kxend
; kx
++)
799 mhyk
= mx
* ryx
+ my
* ryy
;
800 mhzk
= mx
* rzx
+ my
* rzy
+ mz
* rzz
;
801 m2k
= mhxk
*mhxk
+ mhyk
*mhyk
+ mhzk
*mhzk
;
803 denom
[kx
] = bz
*by
*pme
->bsp_mod
[XX
][kx
];
804 tmp1
[kx
] = -factor
*m2k
;
805 tmp2
[kx
] = sqrt(factor
*m2k
);
808 calc_exponentials_lj(kxstart
, kxend
, tmp1
, tmp2
, denom
);
810 for (kx
= kxstart
; kx
< kxend
; kx
++)
813 eterm
= -((1.0 - 2.0*m2k
)*tmp1
[kx
]
815 tmp1
[kx
] = eterm
*denom
[kx
];
817 gcount
= (bLB
? 7 : 1);
818 for (ig
= 0; ig
< gcount
; ++ig
)
822 p0
= grid
[ig
] + iy
*local_size
[ZZ
]*local_size
[XX
] + iz
*local_size
[XX
];
823 for (kx
= kxstart
; kx
< kxend
; kx
++, p0
++)
838 work
->vir_lj
[XX
][XX
] = 0.25*virxx
;
839 work
->vir_lj
[YY
][YY
] = 0.25*viryy
;
840 work
->vir_lj
[ZZ
][ZZ
] = 0.25*virzz
;
841 work
->vir_lj
[XX
][YY
] = work
->vir_lj
[YY
][XX
] = 0.25*virxy
;
842 work
->vir_lj
[XX
][ZZ
] = work
->vir_lj
[ZZ
][XX
] = 0.25*virxz
;
843 work
->vir_lj
[YY
][ZZ
] = work
->vir_lj
[ZZ
][YY
] = 0.25*viryz
;
845 /* This energy should be corrected for a charged system */
846 work
->energy_lj
= 0.5*energy
;
848 /* Return the loop count */
849 return local_ndata
[YY
]*local_ndata
[XX
];