/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/* This include file has code between ifdef's to make sure
 * that this performance sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
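
/* Illustrative usage sketch, not part of this file's code: the including
 * source file is expected to declare the locals referenced below (grid,
 * coefficient, thx/thy/thz, i0/j0/k0, pny/pnz, ithx, valx, index_x, ...),
 * select a variant by defining its macro, and then include this file, e.g.
 *
 *     #define PME_SPREAD_SIMD4_ORDER4
 *     #include "pme-simd4.h"    // this file; the name here is illustrative
 *
 * Each variant #undef's its own macro at the end, so the file can be
 * included repeatedly for different variants. The caller-side declarations
 * listed above are an assumption inferred from the code below, not a
 * verbatim copy of the including file.
 */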

#ifdef PME_SPREAD_SIMD4_ORDER4
/* Spread one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 * (A scalar reference version is sketched in the comment after this block.)
 */
{
    Simd4Real ty_S0(thy[0]);
    Simd4Real ty_S1(thy[1]);
    Simd4Real ty_S2(thy[2]);
    Simd4Real ty_S3(thy[3]);
    Simd4Real tz_S;
    Simd4Real vx_S;
    Simd4Real vx_tz_S;
    Simd4Real sum_S0, sum_S1, sum_S2, sum_S3;
    Simd4Real gri_S0, gri_S1, gri_S2, gri_S3;

    /* With order 4 the z-spline is actually aligned */
    tz_S = load4(thz);

    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = coefficient*thx[ithx];

        vx_S    = Simd4Real(valx);

        vx_tz_S = vx_S * tz_S;

        gri_S0  = load4U(grid+index_x+(j0+0)*pnz+k0);
        gri_S1  = load4U(grid+index_x+(j0+1)*pnz+k0);
        gri_S2  = load4U(grid+index_x+(j0+2)*pnz+k0);
        gri_S3  = load4U(grid+index_x+(j0+3)*pnz+k0);

        sum_S0  = fma(vx_tz_S, ty_S0, gri_S0);
        sum_S1  = fma(vx_tz_S, ty_S1, gri_S1);
        sum_S2  = fma(vx_tz_S, ty_S2, gri_S2);
        sum_S3  = fma(vx_tz_S, ty_S3, gri_S3);

        store4U(grid+index_x+(j0+0)*pnz+k0, sum_S0);
        store4U(grid+index_x+(j0+1)*pnz+k0, sum_S1);
        store4U(grid+index_x+(j0+2)*pnz+k0, sum_S2);
        store4U(grid+index_x+(j0+3)*pnz+k0, sum_S3);
    }
}
#undef PME_SPREAD_SIMD4_ORDER4
#endif
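
/* For reference, a scalar sketch of what the SIMD4 block above computes.
 * This is an illustration derived from the code above, not additional
 * GROMACS code; the loop variables ithy and ithz are illustrative names.
 *
 *     for (int ithx = 0; ithx < 4; ithx++)
 *     {
 *         for (int ithy = 0; ithy < 4; ithy++)
 *         {
 *             for (int ithz = 0; ithz < 4; ithz++)
 *             {
 *                 grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + (k0+ithz)] +=
 *                     coefficient*thx[ithx]*thy[ithy]*thz[ithz];
 *             }
 *         }
 *     }
 *
 * The SIMD4 version processes the four z entries of one (x,y) grid line per
 * unaligned load/fma/store, with thy broadcast into ty_S0..ty_S3 and thz
 * kept as a single 4-wide vector.
 */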

#ifdef PME_SPREAD_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 * (The alignment bookkeeping is illustrated in the comment after this block.)
 */
{
    int       offset;
    int       index;
    Simd4Real ty_S0(thy[0]);
    Simd4Real ty_S1(thy[1]);
    Simd4Real ty_S2(thy[2]);
    Simd4Real ty_S3(thy[3]);
    Simd4Real tz_S0;
    Simd4Real tz_S1;
    Simd4Real vx_S;
    Simd4Real vx_tz_S0;
    Simd4Real vx_tz_S1;
    Simd4Real sum_S00, sum_S01, sum_S02, sum_S03;
    Simd4Real sum_S10, sum_S11, sum_S12, sum_S13;
    Simd4Real gri_S00, gri_S01, gri_S02, gri_S03;
    Simd4Real gri_S10, gri_S11, gri_S12, gri_S13;
#if PME_ORDER == 5
    Simd4Real ty_S4(thy[4]);
    Simd4Real sum_S04;
    Simd4Real sum_S14;
    Simd4Real gri_S04;
    Simd4Real gri_S14;
#endif

    /* offset is k0 modulo 4; the aligned accesses below start at k0 - offset */
    offset = k0 & 3;

#ifdef PME_SIMD4_UNALIGNED
    tz_S0 = load4U(thz-offset);
    tz_S1 = load4U(thz-offset+4);
#else
    /* Copy thz to an aligned buffer (unused buffer parts are masked) */
    for (int i = 0; i < PME_ORDER; i++)
    {
        thz_aligned[offset+i] = thz[i];
    }
    tz_S0 = load4(thz_aligned);
    tz_S1 = load4(thz_aligned+4);
#endif
    tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
    tz_S1 = selectByMask(tz_S1, work->mask_S1[offset]);

    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        valx  = coefficient*thx[ithx];

        vx_S  = Simd4Real(valx);

        vx_tz_S0 = vx_S * tz_S0;
        vx_tz_S1 = vx_S * tz_S1;

        gri_S00 = load4(grid+index+0*pnz);
        gri_S01 = load4(grid+index+1*pnz);
        gri_S02 = load4(grid+index+2*pnz);
        gri_S03 = load4(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_S04 = load4(grid+index+4*pnz);
#endif
        gri_S10 = load4(grid+index+0*pnz+4);
        gri_S11 = load4(grid+index+1*pnz+4);
        gri_S12 = load4(grid+index+2*pnz+4);
        gri_S13 = load4(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_S14 = load4(grid+index+4*pnz+4);
#endif

        sum_S00 = fma(vx_tz_S0, ty_S0, gri_S00);
        sum_S01 = fma(vx_tz_S0, ty_S1, gri_S01);
        sum_S02 = fma(vx_tz_S0, ty_S2, gri_S02);
        sum_S03 = fma(vx_tz_S0, ty_S3, gri_S03);
#if PME_ORDER == 5
        sum_S04 = fma(vx_tz_S0, ty_S4, gri_S04);
#endif
        sum_S10 = fma(vx_tz_S1, ty_S0, gri_S10);
        sum_S11 = fma(vx_tz_S1, ty_S1, gri_S11);
        sum_S12 = fma(vx_tz_S1, ty_S2, gri_S12);
        sum_S13 = fma(vx_tz_S1, ty_S3, gri_S13);
#if PME_ORDER == 5
        sum_S14 = fma(vx_tz_S1, ty_S4, gri_S14);
#endif

        store4(grid+index+0*pnz, sum_S00);
        store4(grid+index+1*pnz, sum_S01);
        store4(grid+index+2*pnz, sum_S02);
        store4(grid+index+3*pnz, sum_S03);
#if PME_ORDER == 5
        store4(grid+index+4*pnz, sum_S04);
#endif
        store4(grid+index+0*pnz+4, sum_S10);
        store4(grid+index+1*pnz+4, sum_S11);
        store4(grid+index+2*pnz+4, sum_S12);
        store4(grid+index+3*pnz+4, sum_S13);
#if PME_ORDER == 5
        store4(grid+index+4*pnz+4, sum_S14);
#endif
    }
}
#undef PME_SPREAD_SIMD4_ALIGNED
#endif
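
/* How the aligned variant covers an arbitrary k0: a worked example inferred
 * from the code above (the numbers are an illustration, not additional
 * GROMACS code). With the grid 4-real aligned and pnz a multiple of 4, every
 * index of the form (i0+ithx)*pny*pnz + (j0+ithy)*pnz is 4-aligned, so only
 * k0 can break alignment. Setting offset = k0 & 3 moves the access window
 * back to the aligned index k0 - offset; the two 4-wide loads per grid line
 * then cover lanes k0-offset .. k0-offset+7, which always contains the
 * PME_ORDER z entries (offset <= 3, PME_ORDER <= 5). The z-spline values are
 * shifted by the same offset and the unused lanes are zeroed through
 * work->mask_S0/mask_S1, so the fma leaves those grid cells unchanged.
 *
 * Example for k0 = 6 and PME_ORDER = 5: offset = 6 & 3 = 2, the loads start
 * at index 4, lanes 4..11 are read and written back, and only lanes 6..10
 * receive nonzero spline weights.
 */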