src/gromacs/ewald/pme-simd4.h
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
 * Copyright (c) 2001-2004, The GROMACS development team.
 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */

/* This include file has code between ifdef's to make sure
 * that this performance sensitive code is inlined
 * and to remove conditionals and variable loop bounds at compile time.
 */
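
/* Usage sketch: this file is meant to be #included inside a function body
 * by a source file that first defines the configuration macros (the exact
 * including file is assumed here rather than quoted). For example:
 *
 *     #define PME_SPREAD_SIMD4_ORDER4
 *     #include "pme-simd4.h"
 *
 * or, for the aligned variant:
 *
 *     #define PME_ORDER 4
 *     #define PME_SPREAD_SIMD4_ALIGNED
 *     #include "pme-simd4.h"
 *
 * Each block below #undef's its configuration macros when it is done, so
 * the file can be included several times with different settings.
 */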

#ifdef PME_SPREAD_SIMD4_ORDER4
/* Spread one charge with pme_order=4 with unaligned SIMD4 load+store.
 * This code does not assume any memory alignment for the grid.
 */
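
/* For reference, a scalar sketch of what the SIMD4 block below computes;
 * the variables come from the including scope, and this loop nest is an
 * illustration only, not compiled code:
 *
 *     for (ithx = 0; ithx < 4; ithx++)
 *         for (ithy = 0; ithy < 4; ithy++)
 *             for (ithz = 0; ithz < 4; ithz++)
 *                 grid[(i0+ithx)*pny*pnz + (j0+ithy)*pnz + (k0+ithz)] +=
 *                     coefficient*thx[ithx]*thy[ithy]*thz[ithz];
 *
 * The SIMD4 code vectorizes the inner z-loop (width 4) and unrolls the
 * y-loop into the four ty_S* lanes.
 */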
{
    using namespace gmx;
    Simd4Real ty_S0(thy[0]);
    Simd4Real ty_S1(thy[1]);
    Simd4Real ty_S2(thy[2]);
    Simd4Real ty_S3(thy[3]);
    Simd4Real tz_S;
    Simd4Real vx_S;
    Simd4Real vx_tz_S;
    Simd4Real sum_S0, sum_S1, sum_S2, sum_S3;
    Simd4Real gri_S0, gri_S1, gri_S2, gri_S3;

    /* With order 4 the z-spline is actually aligned */
    tz_S = load4(thz);
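
    /* The grid is a flattened 3D array with padded y/z dimensions pny and
     * pnz: element (x,y,z) lives at grid[x*pny*pnz + y*pnz + z], so the
     * index_x computed below selects one x-slab of the 4x4x4 spread
     * region. */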

    for (ithx = 0; (ithx < 4); ithx++)
    {
        index_x = (i0+ithx)*pny*pnz;
        valx    = coefficient*thx[ithx];

        vx_S    = Simd4Real(valx);

        vx_tz_S = vx_S * tz_S;

        /* For this x-slab: load four z-rows of the grid (one per y),
         * add the contribution vx*ty*tz with fused multiply-adds, and
         * store the rows back; loads/stores are unaligned. */
        gri_S0 = load4U(grid+index_x+(j0+0)*pnz+k0);
        gri_S1 = load4U(grid+index_x+(j0+1)*pnz+k0);
        gri_S2 = load4U(grid+index_x+(j0+2)*pnz+k0);
        gri_S3 = load4U(grid+index_x+(j0+3)*pnz+k0);

        sum_S0 = fma(vx_tz_S, ty_S0, gri_S0);
        sum_S1 = fma(vx_tz_S, ty_S1, gri_S1);
        sum_S2 = fma(vx_tz_S, ty_S2, gri_S2);
        sum_S3 = fma(vx_tz_S, ty_S3, gri_S3);

        store4U(grid+index_x+(j0+0)*pnz+k0, sum_S0);
        store4U(grid+index_x+(j0+1)*pnz+k0, sum_S1);
        store4U(grid+index_x+(j0+2)*pnz+k0, sum_S2);
        store4U(grid+index_x+(j0+3)*pnz+k0, sum_S3);
    }
}
#undef PME_SPREAD_SIMD4_ORDER4
#endif


#ifdef PME_SPREAD_SIMD4_ALIGNED
/* This code assumes that the grid is allocated 4-real aligned
 * and that pnz is a multiple of 4.
 * This code supports pme_order <= 5.
 */
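
/* Worked example of the masked aligned-access scheme used below (k0 = 6
 * and PME_ORDER = 5 are assumed values): offset = k0 & 3 = 2, so the two
 * 4-wide accesses start at the aligned z-index k0 - offset = 4 and cover
 * z = 4..11. The z-spline values are shifted up by offset to window slots
 * 2..6, and the mask pair work->mask_S0[2] / work->mask_S1[2] zeroes the
 * remaining slots (z = 4, 5 and 11), so the masked-off grid cells receive
 * += 0 and are stored back unchanged while every access stays aligned.
 */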
{
    using namespace gmx;
    int       offset;
    int       index;
    Simd4Real ty_S0(thy[0]);
    Simd4Real ty_S1(thy[1]);
    Simd4Real ty_S2(thy[2]);
    Simd4Real ty_S3(thy[3]);
    Simd4Real tz_S0;
    Simd4Real tz_S1;
    Simd4Real vx_S;
    Simd4Real vx_tz_S0;
    Simd4Real vx_tz_S1;
    Simd4Real sum_S00, sum_S01, sum_S02, sum_S03;
    Simd4Real sum_S10, sum_S11, sum_S12, sum_S13;
    Simd4Real gri_S00, gri_S01, gri_S02, gri_S03;
    Simd4Real gri_S10, gri_S11, gri_S12, gri_S13;
#if PME_ORDER == 5
    Simd4Real ty_S4(thy[4]);
    Simd4Real sum_S04;
    Simd4Real sum_S14;
    Simd4Real gri_S04;
    Simd4Real gri_S14;
#endif

    offset = k0 & 3;

#ifdef PME_SIMD4_UNALIGNED
    tz_S0 = load4U(thz-offset);
    tz_S1 = load4U(thz-offset+4);
#else
    {
        int i;
        /* Copy thz to an aligned buffer (unused buffer parts are masked) */
        for (i = 0; i < PME_ORDER; i++)
        {
            thz_aligned[offset+i] = thz[i];
        }
        tz_S0 = load4(thz_aligned);
        tz_S1 = load4(thz_aligned+4);
    }
#endif
    tz_S0 = selectByMask(tz_S0, work->mask_S0[offset]);
    tz_S1 = selectByMask(tz_S1, work->mask_S1[offset]);

    for (ithx = 0; (ithx < PME_ORDER); ithx++)
    {
        index = (i0+ithx)*pny*pnz + j0*pnz + k0 - offset;
        valx  = coefficient*thx[ithx];

        vx_S  = Simd4Real(valx);

        vx_tz_S0 = vx_S * tz_S0;
        vx_tz_S1 = vx_S * tz_S1;

        gri_S00 = load4(grid+index+0*pnz);
        gri_S01 = load4(grid+index+1*pnz);
        gri_S02 = load4(grid+index+2*pnz);
        gri_S03 = load4(grid+index+3*pnz);
#if PME_ORDER == 5
        gri_S04 = load4(grid+index+4*pnz);
#endif
        gri_S10 = load4(grid+index+0*pnz+4);
        gri_S11 = load4(grid+index+1*pnz+4);
        gri_S12 = load4(grid+index+2*pnz+4);
        gri_S13 = load4(grid+index+3*pnz+4);
#if PME_ORDER == 5
        gri_S14 = load4(grid+index+4*pnz+4);
#endif

        sum_S00 = fma(vx_tz_S0, ty_S0, gri_S00);
        sum_S01 = fma(vx_tz_S0, ty_S1, gri_S01);
        sum_S02 = fma(vx_tz_S0, ty_S2, gri_S02);
        sum_S03 = fma(vx_tz_S0, ty_S3, gri_S03);
#if PME_ORDER == 5
        sum_S04 = fma(vx_tz_S0, ty_S4, gri_S04);
#endif
        sum_S10 = fma(vx_tz_S1, ty_S0, gri_S10);
        sum_S11 = fma(vx_tz_S1, ty_S1, gri_S11);
        sum_S12 = fma(vx_tz_S1, ty_S2, gri_S12);
        sum_S13 = fma(vx_tz_S1, ty_S3, gri_S13);
#if PME_ORDER == 5
        sum_S14 = fma(vx_tz_S1, ty_S4, gri_S14);
#endif

        store4(grid+index+0*pnz, sum_S00);
        store4(grid+index+1*pnz, sum_S01);
        store4(grid+index+2*pnz, sum_S02);
        store4(grid+index+3*pnz, sum_S03);
#if PME_ORDER == 5
        store4(grid+index+4*pnz, sum_S04);
#endif
        store4(grid+index+0*pnz+4, sum_S10);
        store4(grid+index+1*pnz+4, sum_S11);
        store4(grid+index+2*pnz+4, sum_S12);
        store4(grid+index+3*pnz+4, sum_S13);
#if PME_ORDER == 5
        store4(grid+index+4*pnz+4, sum_S14);
#endif
    }
}
#undef PME_ORDER
#undef PME_SPREAD_SIMD4_ALIGNED
#endif